Hi guys,
I'm trying to create a variation of msvcrt's memcmp, but I'm having some problems making it go faster. I created a variation in SSE2, like this:
MASM version:
memcmp_SSE2_Original2 proc near
XMMReg0Dis = byte ptr -58h
XMMReg1Dis = byte ptr -48h
XmmPreserve = dword ptr -8
Reminder = dword ptr -4
pBuffer1 = dword ptr 8
pBuffer2 = dword ptr 0Ch
Lenght = dword ptr 10h
push ebp
mov ebp, esp
sub esp, 4
sub esp, 54h
mov [ebp+XmmPreserve], esp
push esi
push edi
push edx
lea eax, [ebp+XMMReg0Dis]
movdqu xmmword ptr [eax], xmm0
lea eax, [ebp+XMMReg1Dis]
movdqu xmmword ptr [eax], xmm1
mov eax, [ebp+Lenght]
test eax, eax
jz loc_403819
mov edx, eax
mov esi, [ebp+pBuffer1]
mov edi, [ebp+pBuffer2]
and edx, 0FFFFFFF0h
mov eax, edx
xor edx, [ebp+Lenght]
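; eax = length rounded down to a multiple of 16; edx = length mod 16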
test eax, eax
jz loc_4037D1
mov [ebp+Reminder], edx
shr eax, 4
loc_4037A6: ; main loop: compare 16 bytes per pass
movdqu xmm0, xmmword ptr [esi]
movdqu xmm1, xmmword ptr [edi]
pcmpeqd xmm0, xmm1
movmskps edx, xmm0
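; edx = the 4 dword sign bits; 0Fh means all four dwords (16 bytes) matched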
cmp edx, 0Fh
jz short loc_4037C1
xor eax, eax
jmp loc_403819
; ---------------------------------------------------------------------------
loc_4037C1: ; 16 bytes matched, advance both pointers
add esi, 10h
add edi, 10h
dec eax
jnz loc_4037A6
mov edx, [ebp+Reminder]
loc_4037D1: ; handle the remainder (length mod 16)
test edx, edx
jz loc_403814
mov eax, edx
and edx, 0FFFFFFFCh
xor edx, eax
shr eax, 2
jz short loc_403801
mov [ebp+Reminder], edx
loc_4037E8: ; dword compare loop
mov edx, [esi]
cmp edx, [edi]
jz short loc_4037F5
xor eax, eax
jmp loc_403819
; ---------------------------------------------------------------------------
loc_4037F5: ; dwords matched, advance by 4
add esi, 4
add edi, 4
dec eax
jnz short loc_4037E8
mov edx, [ebp+Reminder]
loc_403801: ; byte remainder (0..3)
test edx, edx
jz short loc_403814
loc_403805: ; byte compare loop
mov al, [esi]
cmp al, [edi]
jz short loc_40380F
xor eax, eax
jmp short loc_403819
; ---------------------------------------------------------------------------
loc_40380F: ; bytes matched, advance by 1
inc edi
inc esi
dec edx
jnz short loc_403805
loc_403814: ; all bytes equal, return TRUE
mov eax, 1
loc_403819: ; restore xmm0/xmm1 and return
lea esi, [ebp+XMMReg0Dis]
movdqu xmm0, xmmword ptr [esi]
lea edi, [ebp+XMMReg1Dis]
movdqu xmm1, xmmword ptr [edi]
pop edx
pop edi
pop esi
mov esp, ebp
pop ebp
retn 0Ch
memcmp_SSE2_Original2 endp
called as:
buffer1 db '123456789101112312345678910111231234567891011123hfsjhgsghskgjdsgjgl10111231234567812345p789101112312345678910111231234567891011123hfsjhgsghskgjdsgjgl101112312345678', 0
buffer2 db '123456789101112312345678910111231234567891011123hfsjhgsghskgjdsgjgl10111231234567812345p789101112312345678910111231234567891011123hfsjhgsghskgjdsgjgl101112312345678', 0
push 71
push offset buffer2
push offset buffer1
call memcmp_SSE2
Can someone please benchmark the results for me, and if someone has a faster version (using SSE2 and 32-bit code), please post it here. The goal is to recreate the function that exists in msvcrt: it needs to load two chunks of data of any size and return 0 if they are equal, or -1/1 according to whether buffer1 compares smaller or bigger than buffer2.
My version is fast, but not fast enough in all situations. One way to make it faster is to force the algorithm to work in parallel by splitting the memory block in two, so it processes 32 bytes per pass.
So we may use xmm0 and xmm1 to load the first 16 bytes of each buffer, and xmm2 and xmm3 to load the last 16 bytes; then, on each loop, we advance the front position by 16 and pull the back position in by 16, until we reach the middle of the buffer.
And, of course, handle the remainder bytes if the chunk is not a multiple of 32. I gave it a try, and it is also fast, but surprisingly slow on small chunks of data.
Also, I haven't yet succeeded in making my version return 0, -1 or 1. So far, it returns only TRUE if the memory is equal, or FALSE if it differs.
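One idea I could try for that (a rough, untested sketch, not what my current code does): compare per byte with pcmpeqb instead of per dword, so the mismatch mask directly gives the index of the first differing byte. blocks_equal is just a placeholder label, and it assumes ecx is free to clobber:
movdqu xmm0, xmmword ptr [esi]
movdqu xmm1, xmmword ptr [edi]
pcmpeqb xmm0, xmm1 ; 0FFh in each byte lane where the buffers match
pmovmskb edx, xmm0 ; one bit per byte, 16 bits total
cmp edx, 0FFFFh
jz blocks_equal ; all 16 bytes match, keep looping
not edx ; set bits now mark the differing bytes
bsf edx, edx ; index of the first differing byte
movzx eax, byte ptr [esi+edx]
movzx ecx, byte ptr [edi+edx]
sub eax, ecx ; <0, 0 or >0, the same contract as msvcrt memcmp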
In my tests, it takes around 16 clock cycles, against 38 for the normal msvcrt. I can make it a bit faster by not preserving the registers esi, edi and edx (I reached around 9-11 clock cycles earlier), but this is not desirable, since those registers must be preserved (including the xmm registers).
Note: the RosAsm version is as follows:
Proc memcmp_SSE2:
Arguments @pBuffer1, @pBuffer2, @Lenght
Local @Reminder
Structure @XmmPreserve 80, @XMMReg0Dis 0, @XMMReg1Dis 16, @XMMReg2Dis 32, @XMMReg3Dis 48, @XMMReg4Dis 64
Uses esi, edi, edx
; save the contents of the xmm registers to avoid altering them
lea eax D@XMMReg0Dis | movdqu X$eax XMM0
lea eax D@XMMReg1Dis | movdqu X$eax XMM1
; Step 1 - Check whether the length is zero. If the length is 0, jump over the whole function
mov eax D@Lenght
..Test_If_Not_Zero eax ; eax = 0 ? Lenght = 0, exit
mov edx eax
mov esi D@pBuffer1
mov edi D@pBuffer2
and edx 0-16 | mov eax edx | xor edx D@Lenght ; eax = length rounded down to 16; edx = length mod 16 (the remainder)
; Step 2 - Check whether there are any 16-byte blocks to process
.Test_If_Not_Zero eax ; eax = 0 ? no 16-byte blocks, skip to the remainder handling
mov D@Reminder edx
; the length rounded to a multiple of 16, divided by 16, gives the amount of needed loops
shr eax 4 ; divide by 16. eax is now the count of 16-byte blocks
.Do
movdqu xmm0 X$esi
movdqu xmm1 X$edi
pcmpeqd xmm0 xmm1
movmskps edx xmm0
If edx <> 0F
xor eax eax | jmp L4>>
End_If
add esi 16
add edi 16
dec eax
.Repeat_Until_Zero
mov edx D@Reminder
.Test_End
.Test_If_Not_Zero edx
mov eax edx | and edx 0-4 | xor edx eax ; edx = the byte remainder (remainder mod 4)
shr eax 2 | jz L2> ; eax = how many dwords fit in; no dwords ? jmp over
mov D@Reminder edx
; whatever is not divisible by 4 is left as a byte remainder of 3, 2 or 1
Do
mov edx D$esi
If edx <> D$edi ; if both are different
xor eax eax | jmp L4>>
End_If
add esi 4
add edi 4
dec eax
Repeat_Until_Zero eax
mov edx D@Reminder
L2:
Test_If_Not_Zero edx
Do
mov al B$esi
If al <> B$edi ; if both are different
xor eax eax | jmp L4>
End_If
inc edi | inc esi
dec edx
Repeat_Until_Zero edx
Test_End
.Test_End
mov eax &TRUE
..Test_End
L4:
lea esi D@XMMReg0Dis | movdqu XMM0 X$esi
lea edi D@XMMReg1Dis | movdqu XMM1 X$edi
EndP
How about leveraging the YMM CPU registers? They are twice as big as the XMM ones.
Unroll the loop and use all the xmm regs? Align the start of the loop to 64 bytes?
Use aligned movaps instead? It's one byte smaller, since the SSE2 form uses the 066h prefix + opcode, making it easier to unroll more times?
Quote from: greenozon on September 11, 2024, 03:35:38 PM
How about leveraging the YMM CPU registers? They are twice as big as the XMM ones.
At the moment, I cannot compile for 64 bits, and I haven't yet implemented some specific AVX opcodes in RosAsm. That's why I need to code mainly using SSE2. I'm currently making several updates to RosAsm for the next release, so I can later try to port it to 64 bits as well.
Quote from: daydreamer on September 11, 2024, 04:44:22 PM
Unroll the loop and use all the xmm regs? Align the start of the loop to 64 bytes?
Use aligned movaps instead? It's one byte smaller, since the SSE2 form uses the 066h prefix + opcode, making it easier to unroll more times?
Hi daydreamer. movaps/movups I didn't try yet. Good idea, I'll take a look to see if it makes any difference. About unrolling the loops, I did test unrolling the final loop for the remainder bytes (after the main xmm loops), but surprisingly it was a bit slower - maybe because it resulted in a longer jmp to the end of the function, I suppose.
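For reference, daydreamer's size point, with the encodings as I recall them (register/memory form; worth double-checking in the Intel manual):
movaps xmm0, xmmword ptr [esi] ; 0F 28 06 - 3 bytes, aligned
movups xmm0, xmmword ptr [esi] ; 0F 10 06 - 3 bytes, unaligned-safe
movdqa xmm0, xmmword ptr [esi] ; 66 0F 6F 06 - 4 bytes (66h prefix)
movdqu xmm0, xmmword ptr [esi] ; F3 0F 6F 06 - 4 bytes (F3h prefix)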
By the way, did you guys test the attached file, so I can make sure about the timings?
Quote from: guga on September 11, 2024, 09:00:50 PM
By the way, did you guys test the attached file, so I can make sure about the timings?
here we go!
Intel(R) Core(TM) i7-3610QM CPU @ 2.30GHz (SSE4)
3960 cycles for 100 * CRT memcmp
19566 cycles for 100 * Masm32 ucLen
8961 cycles for 100 * MasmBasic wLen
8307 cycles for 100 * _MbStrLenW
8298 cycles for 100 * _MbStrLenW2
2917 cycles for 100 * memcmp_SSE2 Guga
3925 cycles for 100 * CRT memcmp
19586 cycles for 100 * Masm32 ucLen
9075 cycles for 100 * MasmBasic wLen
8309 cycles for 100 * _MbStrLenW
8316 cycles for 100 * _MbStrLenW2
2898 cycles for 100 * memcmp_SSE2 Guga
3918 cycles for 100 * CRT memcmp
19536 cycles for 100 * Masm32 ucLen
8872 cycles for 100 * MasmBasic wLen
8621 cycles for 100 * _MbStrLenW
8296 cycles for 100 * _MbStrLenW2
2895 cycles for 100 * memcmp_SSE2 Guga
4142 cycles for 100 * CRT memcmp
19533 cycles for 100 * Masm32 ucLen
8919 cycles for 100 * MasmBasic wLen
8324 cycles for 100 * _MbStrLenW
8320 cycles for 100 * _MbStrLenW2
2880 cycles for 100 * memcmp_SSE2 Guga
21 bytes for CRT memcmp
10 bytes for Masm32 ucLen
10 bytes for MasmBasic wLen
66 bytes for _MbStrLenW
66 bytes for _MbStrLenW2
201 bytes for memcmp_SSE2 Guga
0 = eax CRT memcmp
100 = eax Masm32 ucLen
100 = eax MasmBasic wLen
100 = eax _MbStrLenW
100 = eax _MbStrLenW2
1 = eax memcmp_SSE2 Guga
--- ok ---
Great, greenozon. Tks.
Here comes an update. I removed the shr eax 4 before the SSE2 operations, unrolled the loops for the remainder bytes as suggested by daydreamer, and placed the remainder counts in the dl and dh registers to preserve one more register on exit.
The new function is temporarily labeled as "memcmp_SSE2_New2"
MASM syntax:
memcmp_SSE2_New2 proc near
XMMReg0Dis = byte ptr -58h
XMMReg1Dis = byte ptr -48h
XmmPreserve = dword ptr -8
Reminder = dword ptr -4
pBuffer1 = dword ptr 8
pBuffer2 = dword ptr 0Ch
Lenght = dword ptr 10h
push ebp
mov ebp, esp
sub esp, 4
sub esp, 54h
mov [ebp+XmmPreserve], esp
push esi
push edi
push edx
lea eax, [ebp+XMMReg0Dis]
movdqu xmmword ptr [eax], xmm0
lea eax, [ebp+XMMReg1Dis]
movdqu xmmword ptr [eax], xmm1
mov eax, [ebp+Lenght]
test eax, eax
jz loc_4041C9
mov edx, eax
mov esi, [ebp+pBuffer1]
mov edi, [ebp+pBuffer2]
and edx, 0FFFFFFF0h
mov eax, edx
xor edx, [ebp+Lenght]
test eax, eax
jz loc_404150
mov [ebp+Reminder], edx
loc_404123: ; main loop: compare 16 bytes per pass
movdqu xmm0, xmmword ptr [esi]
movdqu xmm1, xmmword ptr [edi]
pcmpeqd xmm0, xmm1
movmskps edx, xmm0
cmp edx, 0Fh
jz short loc_40413E
xor eax, eax
jmp loc_4041C9
; ---------------------------------------------------------------------------
loc_40413E: ; 16 bytes matched, advance both pointers
add esi, 10h
add edi, 10h
add eax, 0FFFFFFF0h
jnz loc_404123
mov edx, [ebp+Reminder]
loc_404150: ; handle the remainder (length mod 16)
test edx, edx
jz loc_4041C4
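; split the remainder: dl = the dword part (0, 4, 8 or 12), dh = the byte part (0..3)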
mov dh, dl
and dl, 0FCh
xor dh, dl
test dl, dl
jz short loc_40419D
mov eax, [esi]
cmp eax, [edi]
jz short loc_40416D
xor eax, eax
jmp short loc_4041C9
; ---------------------------------------------------------------------------
loc_40416D: ; first dword matched
add esi, 4
add edi, 4
add dl, 0FCh
jz short loc_40419D
mov eax, [esi]
cmp eax, [edi]
jz short loc_404182
xor eax, eax
jmp short loc_4041C9
; ---------------------------------------------------------------------------
loc_404182: ; second dword matched
add esi, 4
add edi, 4
add dl, 0FCh
jz short loc_40419D
mov eax, [esi]
cmp eax, [edi]
jz short loc_404197
xor eax, eax
jmp short loc_4041C9
; ---------------------------------------------------------------------------
loc_404197: ; third dword matched
add esi, 4
add edi, 4
loc_40419D: ; byte remainder (0..3) held in dh
test dh, dh
jz short loc_4041C4
mov eax, 0
mov dl, [esi]
cmp dl, [edi]
jnz short loc_4041C9
dec dh
jz short loc_4041C4
mov dl, [esi+1]
cmp dl, [edi+1]
jnz short loc_4041C9
dec dh
jz short loc_4041C4
mov dl, [esi+2]
cmp dl, [edi+2]
jnz short loc_4041C9
loc_4041C4: ; all bytes equal, return TRUE
mov eax, 1
loc_4041C9: ; restore xmm0/xmm1 and return
lea esi, [ebp+XMMReg0Dis]
movdqu xmm0, xmmword ptr [esi]
lea edi, [ebp+XMMReg1Dis]
movdqu xmm1, xmmword ptr [edi]
pop edx
pop edi
pop esi
mov esp, ebp
pop ebp
retn 0Ch
memcmp_SSE2_New2 endp
RosAsm syntax (with comments):
; memcmp_SSE2_New2
Proc memcmp_SSE2_Final:
Arguments @pBuffer1, @pBuffer2, @Lenght
Local @Reminder
Structure @XmmPreserve 80, @XMMReg0Dis 0, @XMMReg1Dis 16, @XMMReg2Dis 32, @XMMReg3Dis 48, @XMMReg4Dis 64
Uses esi, edi, edx
; save the contents of the xmm registers to avoid altering them
lea eax D@XMMReg0Dis | movdqu X$eax XMM0
lea eax D@XMMReg1Dis | movdqu X$eax XMM1
; Step 1 - Check whether the length is zero. If the length is 0, jump over the whole function
mov eax D@Lenght
..Test_If_Not_Zero eax ; eax = 0 ? Lenght = 0, exit
mov edx eax
mov esi D@pBuffer1
mov edi D@pBuffer2
and edx 0-16 | mov eax edx | xor edx D@Lenght ; eax = length rounded down to 16; edx = length mod 16 (the remainder)
; Step 2 - Check whether there are any 16-byte blocks to process
.Test_If_Not_Zero eax ; eax = 0 ? no 16-byte blocks, skip to the remainder handling
mov D@Reminder edx
; We are working with multiples of 16 bytes, so we can get rid of the shr eax 4 and directly subtract 16 on each loop
; This way we gain a bit more speed. Interestingly, add eax 0-16 seems a bit faster than sub eax 16
.Do
movdqu xmm0 X$esi
movdqu xmm1 X$edi
pcmpeqd xmm0 xmm1
movmskps edx xmm0
If edx <> 0F
xor eax eax | jmp L4>>
End_If
add esi 16
add edi 16
add eax 0-16 ; subtract by 16 bytes. Faster then sub
.Repeat_Until_Zero
mov edx D@Reminder
.Test_End
.Test_If_Not_Zero edx
; Now comes the trick. Here we calculate the remainder of our previous routine, stored in edx
; Assuming our initial value was 43, we now have 11 as the remainder, which is 2 dwords (8 bytes) + 3 bytes
; So we have 2 blocks to check: one for the dword comparisons, whose value ranges from 0 to 12 (multiples of 4 only: 0, 4, 8, 12),
; and one for the final bytes, whose value is at most 3 (0, 1, 2, 3)
; We store the dword part in dl and the byte part in dh
; This is faster than using shr eax 2 or similar to divide the dword part by 4 and compute the byte remainder afterwards
mov dh dl
and dl 0-4 ; dl = our dword part: multiples of 4 only, 12 at most
xor dh dl ; dh = our byte part: 0 to 3
Test_If_Not_Zero dl
; if dl = 0, we have no dword remainder, so jump over to the byte remainders
mov eax D$esi | cmp eax D$edi | jz L1> | xor eax eax | jmp L4> | L1: | add esi 4 | add edi 4 | add dl 0-4 | jz L2>
mov eax D$esi | cmp eax D$edi | jz L1> | xor eax eax | jmp L4> | L1: | add esi 4 | add edi 4 | add dl 0-4 | jz L2>
mov eax D$esi | cmp eax D$edi | jz L1> | xor eax eax | jmp L4> | L1: | add esi 4 | add edi 4
Test_End
L2:
; check if we have no byte remainders
Test_If_Not_Zero dh ; dh = our byte computation. So, values are multples of 1 only, on a maximum of 3
mov eax 0 ; surprisingly a bit faster than xor eax eax
mov dl B$esi | cmp dl B$edi | jnz L4> | dec dh | jz L2>
mov dl B$esi+1 | cmp dl B$edi+1 | jnz L4> | dec dh | jz L2>
mov dl B$esi+2 | cmp dl B$edi+2 | jnz L4>
L2:
Test_End
.Test_End
mov eax &TRUE
..Test_End
L4:
lea esi D@XMMReg0Dis | movdqu XMM0 X$esi
lea edi D@XMMReg1Dis | movdqu XMM1 X$edi
EndP
My results:
Quote
AMD Ryzen 5 2400G with Radeon Vega Graphics (SSE4)
4758 cycles for 100 * CRT memcmp
25474 cycles for 100 * Masm32 ucLen
7661 cycles for 100 * MasmBasic wLen
2462 cycles for 100 * memcmp_SSE2_New2 Guga
3102 cycles for 100 * memcmp_SSE2_New Guga
2717 cycles for 100 * memcmp_SSE2 Guga
4700 cycles for 100 * CRT memcmp
25464 cycles for 100 * Masm32 ucLen
7586 cycles for 100 * MasmBasic wLen
2832 cycles for 100 * memcmp_SSE2_New2 Guga
3748 cycles for 100 * memcmp_SSE2_New Guga
3223 cycles for 100 * memcmp_SSE2 Guga
5662 cycles for 100 * CRT memcmp
31167 cycles for 100 * Masm32 ucLen
8993 cycles for 100 * MasmBasic wLen
2857 cycles for 100 * memcmp_SSE2_New2 Guga
3703 cycles for 100 * memcmp_SSE2_New Guga
3284 cycles for 100 * memcmp_SSE2 Guga
5595 cycles for 100 * CRT memcmp
25235 cycles for 100 * Masm32 ucLen
7788 cycles for 100 * MasmBasic wLen
2444 cycles for 100 * memcmp_SSE2_New2 Guga
3106 cycles for 100 * memcmp_SSE2_New Guga
2709 cycles for 100 * memcmp_SSE2 Guga
21 bytes for CRT memcmp
10 bytes for Masm32 ucLen
10 bytes for MasmBasic wLen
261 bytes for memcmp_SSE2_New2 Guga
197 bytes for memcmp_SSE2_New Guga
201 bytes for memcmp_SSE2 Guga
0 = eax CRT memcmp
100 = eax Masm32 ucLen
100 = eax MasmBasic wLen
1 = eax memcmp_SSE2_New2 Guga
1 = eax memcmp_SSE2_New Guga
1 = eax memcmp_SSE2 Guga
--- ok ---
Why do you limit the comparison to 71 bytes? The strings are 166 bytes long...
Hi JJ
In fact, the function itself is not limited. I was mainly testing different byte counts to see whether it also worked on unaligned/aligned chunks, or what happens when the data isn't a multiple of 16 bytes, etc. That's why I used one long sequence of bytes and simply changed the length value while testing (easier than creating new data chains every time I made a modification/update).
You can pass 140, 166, 120 or any other value corresponding to the length of the memory you need compared, and it will also work.
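For example, to compare the whole 166-byte strings from the first post:
push 166 ; Lenght = the full string size this time
push offset buffer2
push offset buffer1
call memcmp_SSE2 ; eax = 1 if all 166 bytes match, 0 otherwise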
Note: this function seems to be fast on small memory data. To make it even faster, a variation could be made for larger memory data (say, 1 MB, 100 KB, etc.), simply by making each SSE2 loop work in parallel.
So, for larger data we can split the chunk to be analyzed into 2 blocks: 16 bytes at the beginning and 16 at the end, always comparing the first 16 and the last 16 at once. If the data is equal in both blocks, we loop again, advancing 16 bytes from the front and stepping back 16 bytes from the end. When the function finds unequal data, it exits. This should be faster for larger memory (a rough sketch of this inner loop is below, after the example).
Ex:
a) Say our data chunk is an even number of 16-byte blocks (16*10 blocks).
Loop 1
esi+0 and esi+(length-16) ---> if either comparison finds a difference, exit the function
Loop 2
esi+16 and esi+(length-32) ---> if either comparison finds a difference, exit the function
....
Loop N
esi+(N-1)*16 and esi+(length-N*16)
etc.
This way, if all 32*N bytes are equal, the loop ends when it reaches the middle of the 16-byte-multiple data; or it ends immediately when a pair of blocks differs.
b) Say our data chunk is an odd number of 16-byte blocks (16*11 blocks).
We compare the first 16-byte block on its own and then process the remaining even number of blocks in pairs:
1 - Compare the first 16 bytes (1 block).
2 - Compare the remaining 10 blocks in parallel (doing 32 bytes at once).
3 - Then we deal with the remainder bytes.
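Here is a rough, untested sketch of that inner loop for the even case, in MASM syntax. It assumes eax holds the length (already a multiple of 32), esi/edi point to the two buffers, ebx/ecx/edx are free to clobber, and the two_ends labels are placeholders. xmm2/xmm3 would also need preserving on entry/exit, which is why the Structure above already reserves the XMMReg2Dis/XMMReg3Dis slots:
lea ecx, [esi+eax-10h] ; back pointer into buffer1
lea edx, [edi+eax-10h] ; back pointer into buffer2
shr eax, 5 ; number of 32-byte passes
two_ends_loop:
movdqu xmm0, xmmword ptr [esi] ; front block of buffer1
movdqu xmm1, xmmword ptr [edi] ; front block of buffer2
movdqu xmm2, xmmword ptr [ecx] ; back block of buffer1
movdqu xmm3, xmmword ptr [edx] ; back block of buffer2
pcmpeqd xmm0, xmm1
pcmpeqd xmm2, xmm3
pand xmm0, xmm2 ; all-ones only where both the front and back dwords matched
movmskps ebx, xmm0
cmp ebx, 0Fh
jnz two_ends_differ ; a dword differed in the front or the back block
add esi, 10h
add edi, 10h
sub ecx, 10h
sub edx, 10h
dec eax
jnz two_ends_loop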