Masm32 SDK description, downloads and other helpful links
Message to All Guests
NB: Posting URL's See here: Posted URL Change
Quote from: guga on September 11, 2024, 09:00:50 PM: Btw, did you guys test the attached file, so I can make sure about the timings?
Intel(R) Core(TM) i7-3610QM CPU @ 2.30GHz (SSE4)
3960 cycles for 100 * CRT memcmp
19566 cycles for 100 * Masm32 ucLen
8961 cycles for 100 * MasmBasic wLen
8307 cycles for 100 * _MbStrLenW
8298 cycles for 100 * _MbStrLenW2
2917 cycles for 100 * memcmp_SSE2 Guga
3925 cycles for 100 * CRT memcmp
19586 cycles for 100 * Masm32 ucLen
9075 cycles for 100 * MasmBasic wLen
8309 cycles for 100 * _MbStrLenW
8316 cycles for 100 * _MbStrLenW2
2898 cycles for 100 * memcmp_SSE2 Guga
3918 cycles for 100 * CRT memcmp
19536 cycles for 100 * Masm32 ucLen
8872 cycles for 100 * MasmBasic wLen
8621 cycles for 100 * _MbStrLenW
8296 cycles for 100 * _MbStrLenW2
2895 cycles for 100 * memcmp_SSE2 Guga
4142 cycles for 100 * CRT memcmp
19533 cycles for 100 * Masm32 ucLen
8919 cycles for 100 * MasmBasic wLen
8324 cycles for 100 * _MbStrLenW
8320 cycles for 100 * _MbStrLenW2
2880 cycles for 100 * memcmp_SSE2 Guga
21 bytes for CRT memcmp
10 bytes for Masm32 ucLen
10 bytes for MasmBasic wLen
66 bytes for _MbStrLenW
66 bytes for _MbStrLenW2
201 bytes for memcmp_SSE2 Guga
0 = eax CRT memcmp
100 = eax Masm32 ucLen
100 = eax MasmBasic wLen
100 = eax _MbStrLenW
100 = eax _MbStrLenW2
1 = eax memcmp_SSE2 Guga
--- ok ---
Quote from: daydreamer on September 11, 2024, 04:44:22 PM: Unroll the loop so that it uses all the XMM registers? Align the start of the loop to 64 bytes?
Use aligned MOVAPS instead? It is one byte smaller than the SSE2 form, which needs a 066h prefix plus the opcode, so it would be easier to unroll more times.
Quote from: greenozon on September 11, 2024, 03:35:38 PM: How about leveraging the YMM CPU registers? They are twice as wide as the XMM ones.
At the moment, I cannot compile for 64 bits, and I have not implemented some specific AVX opcodes in RosAsm either. That's why I need to code mainly using SSE2. I'm currently working on several updates to RosAsm for the next release, so that later I can try to port it to 64 bits as well.
; ---------------------------------------------------------------------------
; memcmp_SSE2_Original2 - equality test of two byte buffers (x86-32, SSE2)
;
; Stack arguments (the callee removes all 12 bytes with "retn 0Ch"):
;   [ebp+8]   pBuffer1  first buffer
;   [ebp+0Ch] pBuffer2  second buffer
;   [ebp+10h] Lenght    number of bytes to compare
;
; Returns in eax:
;   1  every one of the Lenght bytes matched
;   0  a difference was found, or Lenght was 0
;
; Preserved: esi, edi, edx, xmm0, xmm1 (saved on entry, restored on exit).
; The data is read with movdqu, so neither buffer has to be aligned.
;
; The work is split in three passes: 16-byte blocks, then 4-byte words,
; then the remaining 0..3 single bytes.
; ---------------------------------------------------------------------------
memcmp_SSE2_Original2 proc near

SavedXmm0   = byte ptr -58h             ; 16-byte slot holding the caller's xmm0
SavedXmm1   = byte ptr -48h             ; 16-byte slot holding the caller's xmm1
SaveAreaPtr = dword ptr -8              ; receives the address of the save area
TailCount   = dword ptr -4              ; bytes left over for the next pass
pBuffer1    = dword ptr  8
pBuffer2    = dword ptr  0Ch
Lenght      = dword ptr  10h

        push    ebp
        mov     ebp, esp
        sub     esp, 58h                        ; 4 (TailCount) + 54h (pointer + save area)
        mov     [ebp+SaveAreaPtr], esp          ; esp = ebp-58h = start of the save area
        push    esi
        push    edi
        push    edx

        ; Keep the caller's xmm0/xmm1; both are used as scratch below.
        movdqu  xmmword ptr [ebp+SavedXmm0], xmm0
        movdqu  xmmword ptr [ebp+SavedXmm1], xmm1

        mov     eax, [ebp+Lenght]
        test    eax, eax
        jz      cmp_done                        ; nothing to compare: leave with eax = 0

        mov     esi, [ebp+pBuffer1]
        mov     edi, [ebp+pBuffer2]

        mov     edx, eax
        and     edx, 0Fh                        ; edx = Lenght mod 16 (bytes after the blocks)
        shr     eax, 4                          ; eax = number of whole 16-byte blocks
        jz      cmp_tail                        ; shorter than 16 bytes: tail passes only
        mov     [ebp+TailCount], edx            ; edx is reused as scratch inside the loop

cmp_block16:
        movdqu  xmm0, xmmword ptr [esi]
        movdqu  xmm1, xmmword ptr [edi]
        pcmpeqd xmm0, xmm1                      ; each equal dword becomes all ones
        movmskps edx, xmm0                      ; one bit per dword, 0Fh = all four equal
        cmp     edx, 0Fh
        jnz     cmp_mismatch
        add     esi, 10h
        add     edi, 10h
        dec     eax
        jnz     cmp_block16
        mov     edx, [ebp+TailCount]            ; back to the leftover byte count

cmp_tail:
        test    edx, edx
        jz      cmp_equal                       ; no leftover bytes
        mov     eax, edx
        and     edx, 3                          ; edx = bytes left after the dword pass
        shr     eax, 2                          ; eax = number of whole dwords
        jz      cmp_bytes                       ; fewer than 4 bytes left
        mov     [ebp+TailCount], edx            ; edx is reused as scratch inside the loop

cmp_dword:
        mov     edx, [esi]
        cmp     edx, [edi]
        jnz     cmp_mismatch
        add     esi, 4
        add     edi, 4
        dec     eax
        jnz     cmp_dword
        mov     edx, [ebp+TailCount]

cmp_bytes:
        test    edx, edx
        jz      cmp_equal

cmp_byte:
        mov     al, [esi]
        cmp     al, [edi]
        jnz     cmp_mismatch
        inc     edi
        inc     esi
        dec     edx
        jnz     cmp_byte

cmp_equal:
        mov     eax, 1

cmp_done:
        ; eax already holds the result; give the caller its registers back.
        movdqu  xmm0, xmmword ptr [ebp+SavedXmm0]
        movdqu  xmm1, xmmword ptr [ebp+SavedXmm1]
        pop     edx
        pop     edi
        pop     esi
        leave                                   ; mov esp, ebp / pop ebp
        retn    0Ch

cmp_mismatch:
        ; Shared exit for all three passes.
        xor     eax, eax
        jmp     cmp_done

memcmp_SSE2_Original2 endp
; Length handed to the routine: 71 = 4*16 + 1*4 + 3, i.e. whole 16-byte
; blocks, one dword and single leftover bytes are all present.
COMPARE_LEN equ 71

; Two zero-terminated test strings holding the same text.
buffer1     db '123456789101112312345678910111231234567891011123hfsjhgsghskgjdsgjgl10111231234567812345p789101112312345678910111231234567891011123hfsjhgsghskgjdsgjgl101112312345678', 0
buffer2     db '123456789101112312345678910111231234567891011123hfsjhgsghskgjdsgjgl10111231234567812345p789101112312345678910111231234567891011123hfsjhgsghskgjdsgjgl101112312345678', 0

; Arguments are pushed last-to-first: length, second buffer, first buffer.
        push    COMPARE_LEN
        push    offset buffer2
        push    offset buffer1
        call    memcmp_SSE2
Can someone please benchmark the results for me? Also, if someone has a faster version (using SSE2 and 32 bits), please post it here. The goal is for the function to recreate the one that exists in msvcrt. So it needs to take two chunks of data of any size and return 0 if they are equal, or -1 or 1 depending on whether buffer1 is smaller or bigger than buffer2.
; memcmp_SSE2 - tests two byte buffers for equality (RosAsm syntax, 32-bit, SSE2).
;
; Arguments:
;   pBuffer1, pBuffer2  the two buffers to compare (read with unaligned loads)
;   Lenght              number of bytes to compare
;
; Result in eax:
;   &TRUE  all Lenght bytes matched
;   0      a difference was found
;   NOTE(review): when Lenght is 0 the whole body is skipped and eax still holds
;   the 0 loaded from Lenght, so a zero-length compare reports the same value as
;   a mismatch. msvcrt memcmp reports "equal" in that case - confirm this is intended.
;
; xmm0 and xmm1 are saved on entry and restored at L4. esi, edi and edx are listed
; in "Uses" (presumably saved/restored by that macro).
;
; The compare runs in three passes: 16-byte blocks, then dwords, then single bytes.
Proc memcmp_SSE2:
Arguments @pBuffer1, @pBuffer2, @Lenght
Local @Reminder
; 80-byte save area with five 16-byte slots; only the first two are referenced in this proc.
Structure @XmmPreserve 80, @XMMReg0Dis 0, @XMMReg1Dis 16, @XMMReg2Dis 32, @XMMReg3Dis 48, @XMMReg4Dis 64
Uses esi, edi, edx
; Save the contents of xmm0/xmm1 so the caller's values are not altered
lea eax D@XMMReg0Dis | movdqu X$eax XMM0
lea eax D@XMMReg1Dis | movdqu X$eax XMM1
; Step 1 - check the length. If it is 0, skip the whole compare (eax stays 0)
mov eax D@Lenght
..Test_If_Not_Zero eax ; body runs only when Lenght <> 0
mov edx eax
mov esi D@pBuffer1
mov edi D@pBuffer2
and edx 0-16 | mov eax edx | xor edx D@Lenght ; eax = Lenght rounded down to a multiple of 16, edx = Lenght mod 16
; Step 2 - 16-byte pass, only entered when at least one whole 16-byte block exists
.Test_If_Not_Zero eax ; eax = 0 means Lenght < 16: go straight to the leftover bytes
mov D@Reminder edx ; edx is reused as scratch inside the loop, keep the leftover count
; turn the byte count into a block count
shr eax 4 ; divide by 16. eax is now the number of 16-byte blocks to compare
.Do
movdqu xmm0 X$esi ; 16 bytes of buffer 1 (unaligned load)
movdqu xmm1 X$edi ; 16 bytes of buffer 2 (unaligned load)
pcmpeqd xmm0 xmm1 ; each equal dword becomes all ones
movmskps edx xmm0 ; one bit per dword
If edx <> 0F ; anything but all four bits set means at least one dword differs
xor eax eax | jmp L4>> ; mismatch: result 0, go restore the registers
End_If
add esi 16
add edi 16
dec eax
.Repeat_Until_Zero ; presumably loops on the flags left by "dec eax"
mov edx D@Reminder ; back to the leftover byte count (0..15)
.Test_End
; Step 3 - leftover bytes (fewer than 16), skipped entirely when there are none
.Test_If_Not_Zero edx
mov eax edx | and edx 0-4 | xor edx eax ; eax = leftover count, edx = leftover mod 4 (0..3 single bytes)
shr eax 2 | jz L2> ; eax = number of whole dwords. None ? jump over the dword pass
mov D@Reminder edx ; edx is reused as scratch inside the loop
; dword pass: eax holds how many 4-byte compares are needed
Do
mov edx D$esi
If edx <> D$edi ; if both are different
xor eax eax | jmp L4>> ; mismatch: result 0
End_If
add esi 4
add edi 4
dec eax
Repeat_Until_Zero eax
mov edx D@Reminder ; back to the 0..3 single bytes still to compare
L2:
; byte pass: edx = number of single bytes left (0..3)
Test_If_Not_Zero edx
Do
mov al B$esi
If al <> B$edi ; if both are different
xor eax eax | jmp L4> ; mismatch: result 0
End_If
inc edi | inc esi
dec edx
Repeat_Until_Zero edx
Test_End
.Test_End
; Reached only when no pass found a difference
mov eax &TRUE
..Test_End
L4:
; Common exit: eax holds the result, restore the caller's xmm0/xmm1
lea esi D@XMMReg0Dis | movdqu XMM0 X$esi
lea edi D@XMMReg1Dis | movdqu XMM1 X$edi
EndP
Quote from: Biterider on September 08, 2024, 08:47:09 PMI noticed that the line endpoints are always cut vertically.
Quote from: Biterider on September 08, 2024, 08:47:09 PMPerhaps a more flexible approach...
Maybe we can save some cycles ...
Quote from: HSE on September 07, 2024, 11:39:22 PM: To build an application that shows its results in DebugCenter, a couple of includes and a library are used, as with any debug system.