Hi All,
Here is another example from the PowerBasic forum, written by Steve Hutchesson.
I have converted his example to C-- and MASM: the procedures were converted to MASM, and the main code to C--.
/*************************************
* New Sphinx Cmm *
* *
* memory-copy-benchmarks *
* *
* from powerbasic Forum *
* *
*************************************/
#pragma option w32c //create Windows console EXE.
#pragma option OS //speed optimization
#pragma option dbg
#pragma option lst
//#pragma option upx-
#Entry main
#includelib win32.lib MSVCRT.lib ole32.lib
// $ will replaced with SphinxC-- main path
#includepath "$\winlib"
#include <windows.h>
#include <MSVCRT.H-->
#pragma option ia
// Tells Sphinx C-- about the MASM copy routines SSEcopy and SSEcopy2,
// which are defined in the MASM section further down in this file.
// Both are declared cdecl and take (source address, destination address, byte length).
extern {
cdecl SSEcopy(dword src,dword dst,dword blen);
cdecl SSEcopy2(dword src,dword dst,dword blen);
}
//start of Masm code
^^
.MODEL flat, c
.nolist
.data
;// pflead is referenced only by the commented-out prefetch lines in SSEcopy2.
pflead dd 0
.code
;// MASM code taken from the link below; originally created by Steve Hutchesson.
;// https://forum.powerbasic.com/forum/user-to-user-discussions/powerbasic-inline-assembler/43459-memory-copy-benchmarks
;-----------------------------------------------------------------------
; SSEcopy(src, dst, blen)
; ABI:   32-bit cdecl, arguments on the stack (MASM PROC frame)
; In:    src  = source address
;        dst  = destination address
;        blen = number of bytes to copy (unsigned)
; Out:   nothing defined
; Clobb: eax, edx, xmm0-xmm3, flags  (ebx, esi, edi are saved by USES)
;
; Copies whole 64 byte blocks with aligned SSE2 loads and non temporal
; stores, then copies any remaining bytes one at a time.
; movdqa and movntdq fault on addresses that are not 16 byte aligned,
; so src and dst must both be 16 byte aligned whenever blen >= 64.
;-----------------------------------------------------------------------
SSEcopy proc c uses ebx esi edi, src:DWORD, dst:DWORD, blen:DWORD
    mov     esi, src
    mov     edi, dst
    mov     ebx, blen
    xor     edx, edx                    ; EDX = byte INDEX, starts at zero
    shr     ebx, 6                      ; EBX = number of whole 64 byte blocks
    jz      lblrem                      ; fewer than 64 bytes: skip the block loop
    align 4
lbl0:
    movdqa  xmm0, [esi+edx]             ; 16 byte aligned reads
    movdqa  xmm1, [esi+edx+16]
    movdqa  xmm2, [esi+edx+32]
    movdqa  xmm3, [esi+edx+48]
    movntdq [edi+edx], xmm0             ; non temporal writes
    movntdq [edi+edx+16], xmm1
    movntdq [edi+edx+32], xmm2
    movntdq [edi+edx+48], xmm3
    add     edx, 64                     ; add block copy size to INDEX
    sub     ebx, 1                      ; decrement block counter
    jnz     lbl0
    sfence                              ; order the weakly ordered non temporal stores
lblrem:
    mov     ebx, blen
    sub     ebx, edx                    ; EBX = bytes left over = blen - bytes copied
    jz      lbl2                        ; nothing left over
    align 4
lbl1:
    movzx   eax, BYTE PTR [esi+edx]     ; copy remainder one byte at a time
    mov     [edi+edx], al
    add     edx, 1                      ; increment the INDEX
    sub     ebx, 1                      ; decrement the remainder counter
    jnz     lbl1
lbl2:
    ret
SSEcopy endp
;//---------------------------------------------------------------------
;// SSEcopy2(src, dst, blen)
;// ABI:   32-bit cdecl, arguments on the stack (MASM PROC frame)
;// In:    src  = source address
;//        dst  = destination address
;//        blen = number of bytes to copy (unsigned)
;// Out:   nothing defined
;// Clobb: eax, edx, xmm0-xmm7, flags  (ebx, esi, edi are saved by USES)
;//
;// Same scheme as SSEcopy but unrolled to 128 bytes per iteration:
;// whole 128 byte blocks go through aligned SSE2 loads and non temporal
;// stores, any remaining bytes are copied one at a time.
;// movdqa and movntdq fault on addresses that are not 16 byte aligned,
;// so src and dst must both be 16 byte aligned whenever blen >= 128.
;//---------------------------------------------------------------------
SSEcopy2 proc c uses ebx esi edi, src:DWORD, dst:DWORD, blen:DWORD
    mov     esi, src
    mov     edi, dst
    mov     ebx, blen
    xor     edx, edx                    ;// EDX = byte INDEX, starts at zero
    shr     ebx, 7                      ;// EBX = number of whole 128 byte blocks
    jz      lblrem                      ;// fewer than 128 bytes: skip the block loop
    align 4
lbl0:
; prefetchnta BYTE PTR [esi+edx+pflead]
;// prefetcht0 BYTE PTR [esi+edx+%pflead]
;// prefetcht1 BYTE PTR [esi+edx+%pflead]
;// prefetcht2 BYTE PTR [esi+edx+%pflead]
    movdqa  xmm0, [esi+edx]             ;// 16 byte aligned reads
    movdqa  xmm1, [esi+edx+16]
    movdqa  xmm2, [esi+edx+32]
    movdqa  xmm3, [esi+edx+48]
    movdqa  xmm4, [esi+edx+64]
    movdqa  xmm5, [esi+edx+80]
    movdqa  xmm6, [esi+edx+96]
    movdqa  xmm7, [esi+edx+112]
    movntdq [edi+edx], xmm0             ;// non temporal writes
    movntdq [edi+edx+16], xmm1
    movntdq [edi+edx+32], xmm2
    movntdq [edi+edx+48], xmm3
    movntdq [edi+edx+64], xmm4
    movntdq [edi+edx+80], xmm5
    movntdq [edi+edx+96], xmm6
    movntdq [edi+edx+112], xmm7
    add     edx, 128                    ;// add block copy size to INDEX
    sub     ebx, 1                      ;// decrement block counter
    jnz     lbl0
    sfence                              ;// order the weakly ordered non temporal stores
lblrem:
    mov     ebx, blen
    sub     ebx, edx                    ;// EBX = bytes left over = blen - bytes copied
    jz      lbl2                        ;// nothing left over
    align 4
lbl1:
    movzx   eax, BYTE PTR [esi+edx]     ;// copy remainder one byte at a time
    mov     [edi+edx], al
    add     edx, 1                      ;// increment the INDEX
    sub     ebx, 1                      ;// decrement the remainder counter
    jnz     lbl1
lbl2:
    ret
SSEcopy2 endp
^^
//End of Masm code
;//***********************************************************************************//
// MEMLEN  : size of each allocation (129 meg). The extra meg leaves room to
//           round both buffer addresses up to a 16 byte boundary.
// COPYLEN : bytes copied per pass (128 meg). It is smaller than MEMLEN so the
//           copy stays inside both allocations after the addresses are rounded up.
// PASSES  : number of timed passes; 100 passes of 128 meg give the 12.8 gig
//           quoted in the result message.
#define MEMLEN 1024*1024*129
#define COPYLEN 1024*1024*128
#define PASSES 100
// Benchmark: time PASSES calls of SSEcopy2 copying COPYLEN bytes between two
// 16 byte aligned buffers and print the elapsed GetTickCount milliseconds.
main()
{
    // hMem/tMem: raw source/destination allocations, kept for GlobalFree.
    // aMem/bMem: the same buffers rounded up to a 16 byte boundary.
    // tc: tick count, cnt: remaining passes.
    dword hMem,tMem,aMem,bMem,tc,cnt;
    hMem = GlobalAlloc(GMEM_FIXED | GMEM_ZEROINIT,MEMLEN); // allocate 129 meg
    tMem = 0;
    if (hMem != 0) {
        tMem = GlobalAlloc(GMEM_FIXED,MEMLEN); // allocate 129 meg
    }
    if (tMem != 0) {
        // ----------------------------------------------
        // align the source to a 16 byte boundary
        // ----------------------------------------------
        MOV esi, hMem
        add esi, 15
        and esi, -16
        mov aMem, esi
        // ----------------------------------------------
        // align the destination the same way: SSEcopy2
        // writes it with movntdq, which needs 16 byte
        // aligned addresses
        // ----------------------------------------------
        MOV esi, tMem
        add esi, 15
        and esi, -16
        mov bMem, esi
        puts("please wait......");
        tc = GetTickCount();
        // The pass counter lives in memory rather than in ECX, because ECX
        // is a caller-saved register under cdecl and may not survive the call.
        cnt = PASSES;
        do {
            SSEcopy2(aMem,bMem,COPYLEN);
            cnt--;
        } while (cnt != 0);
        tc = GetTickCount() - tc;
        printf("XMM copy 12.8 gig memory copy in %d ms\n",tc);
        GlobalFree(tMem);
    }
    else {
        // either allocation failed; tMem is only allocated when hMem succeeded
        puts("memory allocation failed");
    }
    if (hMem != 0) {
        GlobalFree(hMem);
    }
    system("pause");
}