Hi
I need to clear the stack space used for local variables (about 117000 bytes).
How can I do it fast? Could you help me?
I have one idea using SSE2: if esi is 16-byte aligned and edi = esi+16,
we may use movdqa xmm0, [esi] and movdqa [edi], xmm0.
Maybe also movdqa xmm0, [esi] and movdqa xmm1, [esi+16],
then movdqa [edi], xmm0 and movdqa [edi+16], xmm1 if edi = esi+32.
The first 16 or 32 bytes are already 0.
Thanks :t
THIS:
; ---------------------------------------------------------------------------
; CleaningLargeBuffer - fill lBuffer bytes starting at pBuffer with zero.
;
; In:    pBuffer = start of the buffer (any alignment)
;        lBuffer = byte count, treated as unsigned
; Out:   nothing meaningful; eax and xmm0 are clobbered
; Saves: ecx, esi
;
; Large buffers: one unaligned 16-byte store covers the head (including the
; bytes skipped to reach a 16-byte boundary), four unaligned stores cover the
; last 64 bytes, and an aligned loop clears the middle 64 bytes per pass,
; walking from the highest chunk down to offset 0.
; Buffers shorter than 64+16 bytes cannot host that scheme without writing
; outside the buffer, so they are cleared byte by byte instead.
; ---------------------------------------------------------------------------
CleaningLargeBuffer proc pBuffer:DWORD, lBuffer:DWORD
        push    ecx
        push    esi

        mov     esi, pBuffer
        mov     ecx, lBuffer
        pxor    xmm0, xmm0                  ; zero pattern, no store-then-reload needed

        ; After skipping up to 15 alignment bytes at least one full
        ; 64-byte chunk must remain, otherwise "and ecx, -64" yields 0
        ; and the countdown loop would wrap around.
        cmp     ecx, 64 + 16
        jb      _small

        movdqu  [esi], xmm0                 ; head
        movdqu  [esi+ecx-64], xmm0          ; tail: the part "and ecx, -64" drops
        movdqu  [esi+ecx-48], xmm0
        movdqu  [esi+ecx-32], xmm0
        movdqu  [esi+ecx-16], xmm0

        mov     eax, esi
        neg     eax
        and     eax, 15                     ; eax = distance to the next 16-byte boundary
        add     esi, eax                    ; esi is now 16-byte aligned
        sub     ecx, eax
        and     ecx, -64                    ; whole chunks only; at least one here

_chunk: sub     ecx, 64                     ; sets ZF; the movdqa stores leave flags alone
        movdqa  [esi+ecx], xmm0
        movdqa  [esi+ecx+16], xmm0
        movdqa  [esi+ecx+32], xmm0
        movdqa  [esi+ecx+48], xmm0
        jnz     short _chunk                ; last pass (ecx = 0) clears the first chunk
        jmp     _done

_small: test    ecx, ecx
        jz      _done                       ; empty buffer: nothing to write
_byte:  dec     ecx
        mov     byte ptr [esi+ecx], 0       ; mov keeps the ZF produced by dec
        jnz     short _byte

_done:  pop     esi
        pop     ecx
        ret
CleaningLargeBuffer endp
As 177k is a small amount, "rep stosd" would be easily fast enough.
Quote from: hutch-- on December 18, 2016, 02:34:55 PM
"rep stosd" would be easily fast enough.
; Timing demo: fill a 200000-byte buffer with "rep stosd" and print how long it took.
include \masm32\MasmBasic\MasmBasic.inc ; download (http://masm32.com/board/index.php?topic=94.0)
Init
mov ecx, 200000 ; buffer size in bytes
Let edi=New$(ecx) ; MasmBasic macros; presumably allocates ecx bytes and leaves the address in edi - confirm (the code below relies on ecx surviving this line)
mov eax, Mirror$("Ciao") ; fill value is a 4-character text constant, not zero; exact byte order produced by Mirror$ - TODO confirm
shr ecx, 2 ; byte count -> dword count for stosd
NanoTimer() ; MasmBasic timer macro; presumably starts the measurement - confirm
rep stosd ; store eax to [edi], ecx times
Inkey Str$("That took a whopping %i µs", NanoTimer(µs)) ; print the elapsed time; Inkey presumably also waits for a key - confirm
EndOfCode
Output: That took a whopping 10 µs
Thanks Hutch and Jochen
Jochen could you test my procedure, please ?
Sure, here it is:
Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz
rep stosd took 15 µs
Rui's proc took 82 µs\
rep stosd took 28 µs
Rui's proc took 92 µs\
rep stosd took 26 µs
Rui's proc took 84 µs\
rep stosd took 31 µs
Rui's proc took 92 µs\
rep stosd took 31 µs
Rui's proc took 220 µs\
rep stosd took 25 µs
Rui's proc took 100 µs\
rep stosd took 28 µs
Rui's proc took 96 µs\
rep stosd took 31 µs
Rui's proc took 70 µs\
rep stosd took 31 µs
Rui's proc took 95 µs\
rep stosd took 31 µs
Rui's proc took 93 µs\
Project attached.
Very well Jochen, thank you so much ;)
I will write another one that moves 128 bytes in each step.
Jochen, take this one, please.
It now moves 128 bytes in each loop iteration.
Please use it with a buffer of 117 000 bytes or so.
; ---------------------------------------------------------------------------
; CleaningLargeBuffer - fill lBuffer bytes starting at pBuffer with zero,
; 128 bytes per loop pass.
;
; In:    pBuffer = start of the buffer (any alignment)
;        lBuffer = byte count, treated as unsigned
; Out:   nothing meaningful; eax and xmm0 are clobbered
; Saves: ecx, esi
;
; Large buffers: one unaligned 16-byte store covers the head (including the
; bytes skipped to reach a 16-byte boundary), eight unaligned stores cover
; the last 128 bytes, and an aligned loop clears the middle from the highest
; 128-byte chunk down to offset 0.
; Buffers shorter than 128+16 bytes cannot host that scheme without writing
; outside the buffer, so they are cleared byte by byte instead.
; ---------------------------------------------------------------------------
CleaningLargeBuffer proc pBuffer:DWORD, lBuffer:DWORD
        push    ecx
        push    esi

        mov     esi, pBuffer
        mov     ecx, lBuffer
        pxor    xmm0, xmm0                  ; zero pattern, no store-then-reload needed

        ; After skipping up to 15 alignment bytes at least one full
        ; 128-byte chunk must remain, otherwise "and ecx, -128" yields 0
        ; and the countdown loop would wrap around.
        cmp     ecx, 128 + 16
        jb      _small

        movdqu  [esi], xmm0                 ; head

        ; tail: the last 128 bytes, i.e. whatever "and ecx, -128" drops
    clbOfs = -128
    REPEAT 8
        movdqu  [esi+ecx+clbOfs], xmm0
        clbOfs = clbOfs + 16
    ENDM

        mov     eax, esi
        neg     eax
        and     eax, 15                     ; eax = distance to the next 16-byte boundary
        add     esi, eax                    ; esi is now 16-byte aligned
        sub     ecx, eax
        and     ecx, -128                   ; whole chunks only; at least one here

_chunk: sub     ecx, 128                    ; sets ZF; the movdqa stores leave flags alone
    clbOfs = 0
    REPEAT 8
        movdqa  [esi+ecx+clbOfs], xmm0
        clbOfs = clbOfs + 16
    ENDM
        jnz     _chunk                      ; last pass (ecx = 0) clears the first chunk
        jmp     _done

_small: test    ecx, ecx
        jz      _done                       ; empty buffer: nothing to write
_byte:  dec     ecx
        mov     byte ptr [esi+ecx], 0       ; mov keeps the ZF produced by dec
        jnz     _byte

_done:  pop     esi
        pop     ecx
        ret
CleaningLargeBuffer endp
test yourself...
Quote from: jj2007 on December 19, 2016, 11:13:43 AM
test yourself...
Thanks, but I can't see any results because it doesn't stop: it closes the window and exits, Jochen!
Start it from a DOS prompt...
The last is nearly better. Would you mind to use this:
; ---------------------------------------------------------------------------
; CleaningLargeBuffer - fill lBuffer bytes starting at pBuffer with zero,
; 256 bytes per loop pass. Frameless: no ebp frame is built, the arguments
; are read relative to esp and the procedure pops them itself (ret 8).
;
; In:    pBuffer = start of the buffer (any alignment)
;        lBuffer = byte count, treated as unsigned
; Out:   nothing meaningful; eax, ecx and xmm0 are clobbered
; Saves: esi
;
; Large buffers: one unaligned 16-byte store covers the head (including the
; bytes skipped to reach a 16-byte boundary), sixteen unaligned stores cover
; the last 256 bytes, and an aligned loop clears the middle from the highest
; 256-byte chunk down to offset 0.
; Buffers shorter than 256+16 bytes cannot host that scheme without writing
; outside the buffer, so they are cleared byte by byte instead.
; ---------------------------------------------------------------------------
; Argument offsets as seen after "push esi":
; [esp] = saved esi, [esp+4] = return address, then the two arguments.
$StackpBuffer = 8
$StacklBuffer = 12
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
ALIGN 16
CleaningLargeBuffer proc pBuffer:DWORD, lBuffer:DWORD
        push    esi

        mov     esi, [esp+$StackpBuffer]
        mov     ecx, [esp+$StacklBuffer]
        pxor    xmm0, xmm0                  ; zero pattern, no store-then-reload needed

        ; After skipping up to 15 alignment bytes at least one full
        ; 256-byte chunk must remain, otherwise "and ecx, -256" yields 0
        ; and the countdown loop would wrap around.
        cmp     ecx, 256 + 16
        jb      _small

        movdqu  [esi], xmm0                 ; head

        ; tail: the last 256 bytes, i.e. whatever "and ecx, -256" drops
    clbOfs = -256
    REPEAT 16
        movdqu  [esi+ecx+clbOfs], xmm0
        clbOfs = clbOfs + 16
    ENDM

        mov     eax, esi
        neg     eax
        and     eax, 15                     ; eax = distance to the next 16-byte boundary
        add     esi, eax                    ; esi is now 16-byte aligned
        sub     ecx, eax
        and     ecx, -256                   ; whole chunks only; at least one here

_loop0: sub     ecx, 256                    ; sets ZF; the movdqa stores leave flags alone
    clbOfs = 0
    REPEAT 16
        movdqa  [esi+ecx+clbOfs], xmm0
        clbOfs = clbOfs + 16
    ENDM
        jnz     _loop0                      ; last pass (ecx = 0) clears the first chunk

_exit:  pop     esi
        ret     8

_small: test    ecx, ecx
        jz      _exit                       ; empty buffer: nothing to write
_byte:  dec     ecx
        mov     byte ptr [esi+ecx], 0       ; mov keeps the ZF produced by dec
        jnz     _byte
        jmp     _exit
CleaningLargeBuffer endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
Much better :t
Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz
rep stosd took 12 µs
Rui's proc took 25 µs
rep stosd took 18 µs
Rui's proc took 25 µs
rep stosd took 14 µs
Rui's proc took 33 µs
rep stosd took 15 µs
Rui's proc took 20 µs
rep stosd took 7 µs
Rui's proc took 10 µs
rep stosd took 18 µs
Rui's proc took 25 µs
rep stosd took 18 µs
Rui's proc took 17 µs
rep stosd took 18 µs
Rui's proc took 25 µs
rep stosd took 17 µs
Rui's proc took 34 µs
rep stosd took 29 µs
Rui's proc took 21 µs
-- hit any key --
:biggrin: :biggrin:
Too much better ;)
Thank you Jochen
Quote
Intel(R) Pentium(R) 4 CPU 3.40GHz
rep stosd took 33 µs
Rui's proc took 35 µs
rep stosd took 42 µs
Rui's proc took 24 µs
rep stosd took 44 µs
Rui's proc took 26 µs
rep stosd took 21 µs
Rui's proc took 24 µs
rep stosd took 78 µs
Rui's proc took 28 µs
Sorry...
Quote
Intel(R) Pentium(R) 4 CPU 3.40GHz
rep stosd took 33 µs
Rui's proc took 35 µs
rep stosd took 42 µs
Rui's proc took 24 µs
rep stosd took 44 µs
Rui's proc took 26 µs
rep stosd took 21 µs
Rui's proc took 24 µs
rep stosd took 78 µs
Rui's proc took 28 µs
rep stosd took 22 µs
Rui's proc took 22 µs
rep stosd took 22 µs
Rui's proc took 20 µs
rep stosd took 21 µs
Rui's proc took 21 µs
rep stosd took 31 µs
Rui's proc took 23 µs
rep stosd took 23 µs
Rui's proc took 20 µs
-- hit any key --
:biggrin: :biggrin:
Hi Jochen. Here we have the procedure to clean 114 028 bytes in
the proc ResolveExpress. But I aligned the address (see there),
and the total length to clean is a multiple of 512 bytes
(114 176 bytes - the space at the end is not used).
Is it possible to test it?
Thanks ;)
; ---------------------------------------------------------------------------
; CleanAligned512Buffer - zero a 16-byte-aligned buffer in 512-byte chunks.
; Frameless: the arguments are read relative to esp, the procedure pops them
; itself (ret 8).
;
; In:    pBuffer = start of the buffer; MUST be 16-byte aligned, because the
;                  movdqa stores fault on any other address
;        lBuffer = byte count; rounded DOWN to a multiple of 512, so the
;                  caller has to pad the buffer if the remainder matters.
;                  A count below 512 clears nothing.
; Out:   eax = 0; ecx and xmm0 are clobbered
; Saves: esi
; ---------------------------------------------------------------------------
; Argument offsets as seen after "push esi":
; [esp] = saved esi, [esp+4] = return address, then the two arguments.
$StackpBuffer = 8
$StacklBuffer = 12
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
ALIGN 16
CleanAligned512Buffer proc pBuffer:DWORD, lBuffer:DWORD
        push    esi

        mov     esi, [esp+$StackpBuffer]
        mov     ecx, [esp+$StacklBuffer]
        xor     eax, eax                    ; returned as 0
        pxor    xmm0, xmm0                  ; zero pattern, no store-then-reload needed

        and     ecx, -512                   ; whole chunks only
        jz      _exit                       ; no full chunk: the countdown below would wrap around

_loop0: sub     ecx, 512                    ; sets ZF; the movdqa stores leave flags alone
    cabOfs = 0
    REPEAT 32                               ; 32 x 16 bytes = one 512-byte chunk
        movdqa  [esi+ecx+cabOfs], xmm0
        cabOfs = cabOfs + 16
    ENDM
        jnz     _loop0                      ; walks from the highest chunk down to offset 0

_exit:  pop     esi
        ret     8
CleanAligned512Buffer endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
; rcl0_pRclStr equ rcl0_lenstackbuffer + 8
; rcl_pRclStr equ dword ptr [ebp+rcl0_pRclStr]
Quote
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
; ---------------------------------------------------------------------------
; ResolveExpress - reserves rcl0_lenstackbuffer bytes of stack as a work
; area, rounds its start up to a 16-byte boundary and zeroes it with
; CleanAligned512Buffer before the real work starts.
;
; In:    pRclStr (one DWORD argument, removed by "ret 4")
; Out:   carry flag cleared on return (presumably the caller's success
;        indicator - confirm)
; Saves: ebx, esi, edi, ebp
;
; ebp = 16-byte-aligned start of the work area (NOT a conventional frame
; pointer). NOTE(review): because ebp is moved up by 0..15 bytes, the
; argument sits at [ebp + rcl0_lenstackbuffer + 8 - pad]; a fixed
; [ebp + rcl0_lenstackbuffer + 8] is only right when the pad is 0.
; ---------------------------------------------------------------------------
ResolveExpress proc pRclStr:DWORD
        push    ebp
        mov     ecx, rcl0_lenstackbuffer    ; = 114 556 -> useful 114 028

        ; The work area spans many pages. Touch one address per page, from
        ; the top down, before moving esp: the pushes below hit the BOTTOM
        ; of the area first, and Windows only grows the stack when the guard
        ; page directly below the committed part is touched (4096-byte pages
        ; assumed).
        mov     eax, esp
        sub     eax, ecx                    ; eax = future esp (bottom of the area)
        mov     ebp, esp                    ; ebp is already saved, use it as probe pointer
_probe: sub     ebp, 4096
        cmp     ebp, eax
        jb      _claim
        test    dword ptr [ebp], ebp        ; read-only touch of this page
        jmp     _probe

_claim: mov     esp, eax
        mov     ebp, esp

        mov     eax, ebp
        neg     eax
        and     eax, 15                     ; eax = pad up to the next 16-byte boundary
        add     ebp, eax                    ; ebp = aligned start of the work area
        sub     ecx, eax                    ; length that is left after the pad

        push    ebx
        push    esi
        push    edi
        ;
        ; the callee rounds ecx down to a multiple of 512
        invoke  CleanAligned512Buffer, ebp, ecx
        ;mov esi, rcl_pRclStr
        ;--------------
        ; do something
        ;--------------
        pop     edi
        pop     esi
        pop     ebx
        add     esp, rcl0_lenstackbuffer    ; release the work area
        clc                                 ; after the add, which changes the carry flag
        pop     ebp
        ret     4
ResolveExpress endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
Here you are:
Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz
rep stosd took 61 µs
Rui's proc took 32 µs
rep stosd took 18 µs
Rui's proc took 16 µs
rep stosd took 15 µs
Rui's proc took 49 µs
rep stosd took 17 µs
Rui's proc took 16 µs
rep stosd took 18 µs
Rui's proc took 20 µs
rep stosd took 18 µs
Rui's proc took 20 µs
rep stosd took 16 µs
Rui's proc took 17 µs
rep stosd took 41 µs
Rui's proc took 16 µs
rep stosd took 17 µs
Rui's proc took 37 µs
rep stosd took 18 µs
Rui's proc took 20 µs
It doesn't work. It executes an illegal operation and is closed.
Thanks anyway, Jochen. :t
I don't know why, but the buffer address must be 16-byte aligned, as in ResolveExpress.
About the length: if we need 114 028 bytes, we add 528 (512+16) to get 114 556 bytes. In this
way we clean 114 176 = 223 (loops) * 512 bytes (each loop). This is the trick:
the 114 028 bytes are cleaned and the starting address is 16-byte aligned.
Adding 528 bytes is not a problem; we simply have some unused space at the end of the structure.
Could you show me the procedure you use to test it (send me a message)?
Well, I (or you) may write the same thing with loops of 128 or 256 bytes; the basic idea is the same.
For loops of 128 bytes: add 128+16, replace 512 by 128 and remove some movdqa.
For loops of 256 bytes: add 256+16, replace 512 by 256 and remove some movdqa.
EDIT: We may write versions that move from left to right, too.
;
; ---------------------------------------------------------------------------
; CleanAligned256Buffer - zero a 16-byte-aligned buffer in 256-byte chunks.
; Frameless: the arguments are read relative to esp, the procedure pops them
; itself (ret 8).
;
; In:    pBuffer = start of the buffer; MUST be 16-byte aligned, because the
;                  movdqa stores fault on any other address
;        lBuffer = byte count; rounded DOWN to a multiple of 256, so the
;                  caller has to pad the buffer if the remainder matters.
;                  A count below 256 clears nothing.
; Out:   eax = 0; ecx and xmm0 are clobbered
; Saves: esi
; ---------------------------------------------------------------------------
; Argument offsets as seen after "push esi":
; [esp] = saved esi, [esp+4] = return address, then the two arguments.
$StackpBuffer = 8
$StacklBuffer = 12
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
ALIGN 16
CleanAligned256Buffer proc pBuffer:DWORD, lBuffer:DWORD
        push    esi

        mov     esi, [esp+$StackpBuffer]
        mov     ecx, [esp+$StacklBuffer]
        xor     eax, eax                    ; returned as 0
        pxor    xmm0, xmm0                  ; zero pattern, no store-then-reload needed

        and     ecx, -256                   ; whole chunks only
        jz      _exit                       ; no full chunk: the countdown below would wrap around

_loop0: sub     ecx, 256                    ; sets ZF; the movdqa stores leave flags alone
    cabOfs = 0
    REPEAT 16                               ; 16 x 16 bytes = one 256-byte chunk
        movdqa  [esi+ecx+cabOfs], xmm0
        cabOfs = cabOfs + 16
    ENDM
        jnz     _loop0                      ; walks from the highest chunk down to offset 0

_exit:  pop     esi
        ret     8
CleanAligned256Buffer endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
;
; ---------------------------------------------------------------------------
; CleanAligned128Buffer - zero a 16-byte-aligned buffer in 128-byte chunks.
; Frameless: the arguments are read relative to esp, the procedure pops them
; itself (ret 8).
;
; In:    pBuffer = start of the buffer; MUST be 16-byte aligned, because the
;                  movdqa stores fault on any other address
;        lBuffer = byte count; rounded DOWN to a multiple of 128, so the
;                  caller has to pad the buffer if the remainder matters.
;                  A count below 128 clears nothing.
; Out:   eax = 0; ecx and xmm0 are clobbered
; Saves: esi
; ---------------------------------------------------------------------------
; Argument offsets as seen after "push esi":
; [esp] = saved esi, [esp+4] = return address, then the two arguments.
$StackpBuffer = 8
$StacklBuffer = 12
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
ALIGN 16
CleanAligned128Buffer proc pBuffer:DWORD, lBuffer:DWORD
        push    esi

        mov     esi, [esp+$StackpBuffer]
        mov     ecx, [esp+$StacklBuffer]
        xor     eax, eax                    ; returned as 0
        pxor    xmm0, xmm0                  ; zero pattern, no store-then-reload needed

        and     ecx, -128                   ; whole chunks only
        jz      _exit                       ; no full chunk: the countdown below would wrap around

_loop0: sub     ecx, 128                    ; sets ZF; the movdqa stores leave flags alone
    cabOfs = 0
    REPEAT 8                                ; 8 x 16 bytes = one 128-byte chunk
        movdqa  [esi+ecx+cabOfs], xmm0
        cabOfs = cabOfs + 16
    ENDM
        jnz     _loop0                      ; walks from the highest chunk down to offset 0

_exit:  pop     esi
        ret     8
CleanAligned128Buffer endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
Quote from: RuiLoureiro on December 22, 2016, 09:40:35 AM
Could you show me what is the procedure to test it (send me msg)?
The source is included above, Rui. Timings are done with NanoTimer() (http://www.webalice.it/jj2006/MasmBasicQuickReference.htm#Mb1171).
Quote from: jj2007 on December 22, 2016, 12:58:32 PM
Quote from: RuiLoureiro on December 22, 2016, 09:40:35 AM
Could you show me what is the procedure to test it (send me msg)?
The source is included above, Rui. Timings are done with NanoTimer() (http://www.webalice.it/jj2006/MasmBasicQuickReference.htm#Mb1171).
Is this the source (I have no other)? It seems that the edi address is not 16-byte aligned.
I need to find my code to test it, but I have no time right now.
Thanks Jochen :t
Quote
{\rtf1\ansi\ansicpg1252\deff0\deflang1040{\fonttbl{\f0\fswiss\fprq2\fcharset0 System;}{\f1\fswiss\fprq2\fcharset0 Arial;}{\f2\fnil\fcharset0 Tahoma;}}
{\colortbl ;\red0\green0\blue0;\red0\green0\blue255;\red144\green160\blue128;\red255\green0\blue0;\red112\green112\blue112;}
{\*\generator Riched20 5.50.99.2070;}\viewkind4\uc1\pard\sl360\slmult1\tx1000\tx1400\tx4800\tx5800\tx0\cf1\b\v\f0\fs20\lang2057\'b0B$;01Ew01Ew0V23270J376b105m0014a00003Aa whopping45.datale00063ebxFile00007GfaFile00007GfaFile00007GfaFile00007GfaFile00007GfaFile00007GfaFile0007t001Co000P01E5000d01EM000e00SN0902003JSo-b00Su000It00zc004#\v0 include \\masm32\\MasmBasic\\\cf2 MasmBasic.inc\tab\cf3 ; \cf0\b0\protect\v \protect0\v0 download\protect\v (http://masm32.com/board/index.php?topic=94.0)\cf1\b\protect0\v0\par
\pard\sl240\slmult1\tx1000\tx1400\tx4800\tx5800\tx0\lang2055 .code\par
\pard\tx1000\tx1400\tx4800\tx5800\tx0\cf0\b0\lang1040$StackpBuffer = 8\par
$StacklBuffer = 12\par
OPTION PROLOGUE:NONE\par
OPTION EPILOGUE:NONE\par
ALIGN 16\par
; \'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\'ab\par
CleanAligned512Buffer proc pBuffer:DWORD, lBuffer:DWORD\par
push esi\par
;\par
mov esi, [esp+$StackpBuffer]\par
mov ecx, [esp+$StacklBuffer]\par
xor eax, eax\par
\par
;------------------------------------\par
; clean the first 16 bytes\par
;------------------------------------\par
mov [esi+0], eax\par
mov [esi+4], eax\par
mov [esi+8], eax\par
mov [esi+12], eax ; 16 bytes\par
\par
;------------------------------------\par
; load xmm0 with 16 bytes\par
;------------------------------------\par
movdqu xmm0, [esi]\par
\par
;----------------------------------\par
; align address\par
;----------------------------------\par
;mov eax, esi\par
;neg eax\par
;and eax, 15\par
;jz short _aligned0\par
\par
;---------------------------------------\par
;add esi, eax\par
;sub ecx, eax\par
\par
;----------------------------------------\par
_aligned0: and ecx, -512\par
\par
\par
_loop0: sub ecx, 512\par
\par
movdqa [esi+ecx], xmm0\par
movdqa [esi+ecx+16], xmm0\par
movdqa [esi+ecx+32], xmm0\par
movdqa [esi+ecx+48], xmm0\par
movdqa [esi+ecx+64], xmm0\par
movdqa [esi+ecx+80], xmm0\par
movdqa [esi+ecx+96], xmm0\par
\par
movdqa [esi+ecx+112],xmm0\par
movdqa [esi+ecx+128],xmm0\par
movdqa [esi+ecx+144],xmm0\par
movdqa [esi+ecx+160],xmm0\par
movdqa [esi+ecx+176],xmm0\par
movdqa [esi+ecx+192],xmm0\par
\par
movdqa [esi+ecx+208],xmm0\par
movdqa [esi+ecx+224],xmm0\par
movdqa [esi+ecx+240],xmm0\par
movdqa [esi+ecx+256],xmm0\par
movdqa [esi+ecx+272],xmm0\par
movdqa [esi+ecx+288],xmm0\par
\par
movdqa [esi+ecx+304],xmm0\par
movdqa [esi+ecx+320],xmm0\par
movdqa [esi+ecx+336],xmm0\par
movdqa [esi+ecx+352],xmm0\par
movdqa [esi+ecx+368],xmm0\par
movdqa [esi+ecx+384],xmm0\par
\par
movdqa [esi+ecx+400],xmm0\par
movdqa [esi+ecx+416],xmm0\par
movdqa [esi+ecx+432],xmm0\par
movdqa [esi+ecx+448],xmm0\par
movdqa [esi+ecx+464],xmm0\par
movdqa [esi+ecx+480],xmm0\par
movdqa [esi+ecx+496],xmm0 \par
jnz _loop0\par
\par
_exit: pop esi\par
ret 8\par
CleanAligned512Buffer endp\par
OPTION PROLOGUE:PrologueDef\par
OPTION EPILOGUE:EpilogueDef\par
\cf2\b\lang2057 \cf0\b0\protect\v\f1\lang1040\'b0B1\cf2\b\protect0\v0\fs28\lang2055 Init\cf1\f0\fs20\lang1040\par
\cf2 PrintCpu\cf1 0\par
push\cf4 9\cf5\tab ; 10 rounds\par
\cf1 testbytes=117000\par
\cf2 Let\cf1 edi=\cf2 New$\cf1(testbytes)\par
.Repeat\par
\tab mov ecx, testbytes\par
\tab mov eax, \cf2 Mirror$\cf1("Ciao")\par
\tab push\cf4 ecx\par
\cf1\tab push\cf4 edi\par
\cf1\tab shr ecx, 2\par
\tab\cf2 Delay\cf1 100\par
\tab\cf2 NanoTimer\cf1()\par
\tab\cf0\b0\protect\v\'b0B6\cf1\b\protect0\v0\f2\fs28 rep stosd\f0\fs20\par
\tab\cf2 Print\cf1 \cf2 Str$\cf1("rep stosd took %i \'b5s\\n", \cf2 NanoTimer\cf1(\'b5s))\par
\tab pop \cf4 edi\par
\cf1\tab pop \cf4 ecx\par
\cf1\tab\cf2 Delay\cf1 100\par
\tab\cf2 NanoTimer\cf1()\par
\tab invoke \cf0\b0 CleanAligned512Buffer\cf1\b , edi, ecx\par
\tab\cf2 Print\cf1 \cf2 Str$\cf1("Rui's proc took %i \'b5s\\n\\n", \cf2 NanoTimer\cf1(\'b5s))\par
\tab dec \cf2 stack\cf1\par
.Until Sign?\par
Inkey "-- hit any key --"\par
\pard\tx600\tx1000\tx1400\tx1800\tx2200\tx2600\tx3000\tx3400\tx3800\tx4200\tx0\tx0\tx0\tx0\tx0\tx0\tx0\cf2 EndOfCode\cf1\par
\par
}
Quote from: RuiLoureiro on December 23, 2016, 05:31:34 AM
Is this the source
Yes, but you need an editor that understands RTF, like WordPad, MS Word or RichMasm (http://masm32.com/board/index.php?topic=5314.0).