include \masm32\include\masm32rt.inc
;-----------------------------------------------------------------------
; LeaSize — assemble the instruction(s) passed in args and print their
; encoded size in bytes.
; In:    args = any instruction(s) to measure
; Out:   prints "<n> bytes for <args>" to the console
; Clobb: eax (plus whatever the masm32 print/str$ helpers clobber)
;
; The size is taken at assembly time as the difference of the location
; counter ($) before and after emitting args, so no measuring code is
; counted.  (The old runtime push $ / pop edx version also counted the
; 5-byte push imm32 encoding, inflating every result by 5 bytes, and
; clobbered edx.)
;-----------------------------------------------------------------------
LeaSize macro args:VARARG
    local startpos
    startpos = $                        ; location counter before args
    args                                ; emit the instruction(s) under test
    mov eax, $ - startpos               ; assembly-time constant: size of args
    print str$(eax), " bytes for &args", 13, 10
ENDM
.code
start:
; Print the encoded byte size of each LEA addressing form.  Per the
; listing output further down: forms with a base register use the short
; ModRM/SIB encodings (2-4 bytes), while the scale-only forms at the
; end carry a mandatory 32-bit displacement (7 bytes each).
LeaSize lea eax, [eax]
LeaSize lea eax, [eax+eax]
LeaSize lea eax, [eax+2*eax]
LeaSize lea eax, [eax+4*eax]
LeaSize lea eax, [eax+eax+127]
LeaSize lea eax, [eax+2*eax+127]
LeaSize lea eax, [eax+4*eax+127]
LeaSize lea eax, [2*eax]
LeaSize lea eax, [4*eax]
LeaSize lea eax, [8*eax]
inkey "hit any key"                     ; pause so the console output can be read
exit
end start
What's the speed difference? And how does it compare to using add ecx, x in the inner loop versus lea ebx, [ebx+ecx*x] — or better, lea ebx, [buffer+ecx*x] — where x is the type size used to store/load memory, or the type size times the number of unrolls?
It's also the old strength-reduction trick for speeding up an inner loop: replace a multiply on every iteration with a cheaper add. For example, if you would otherwise multiply by 640 each loop, you instead add 640 each loop.
000001FD 8D00 lea eax, [eax]
000001FF 8D0400 lea eax, [eax+eax]
00000202 8D0440 lea eax, [eax+2*eax]
00000205 8D0480 lea eax, [eax+4*eax]
00000208 8D44007F lea eax, [eax+eax+127]
0000020C 8D44407F lea eax, [eax+2*eax+127]
00000210 8D44807F lea eax, [eax+4*eax+127]
00000214 8D044500000000 lea eax, [2*eax]
0000021B 8D048500000000 lea eax, [4*eax]
00000222 8D04C500000000 lea eax, [8*eax]
.nolist
include \masm32\include\masm32rt.inc
;-----------------------------------------------------------------------
; LeaSize — print the byte size of the instruction(s) passed in args.
; NOTE(review): the measurement brackets args between a runtime
; "push $" and "mov eax, $", so the reported size also includes the
; 5-byte push imm32 encoding — every result is 5 bytes too large
; (the program's own output shows 7 bytes for the 2-byte lea eax,[eax]).
; Clobbers eax and edx.  This copy is kept as-is to contrast with
; LeaSizex below, which measures at assembly time and reports the
; true size.
;-----------------------------------------------------------------------
LeaSize macro args:VARARG
push $                                  ; runtime: push address of this push
args                                    ; emit the instruction(s) under test
mov eax, $                              ; $ = offset of this mov (after args)
pop edx                                 ; edx = offset of the push above
sub eax, edx                            ; = size of push (5) + size of args
print str$(eax), " bytes for &args", 13, 10
ENDM
;-----------------------------------------------------------------------
; LeaSizex — print the encoded size in bytes of the instruction(s) in
; args.  The size is computed at assembly time from the location
; counter, so nothing but the reporting code (mov/print) is emitted
; around the measured instructions.
; Clobbers eax (plus whatever the masm32 print/str$ helpers clobber).
;-----------------------------------------------------------------------
LeaSizex macro args:VARARG
    local insn_start
    insn_start = $                      ; location counter before args
    args                                ; emit the instruction(s) under test
    mov eax, $ - insn_start             ; assembly-time constant: size of args
    print str$(eax), " bytes for &args", 13, 10
ENDM
.list
.code
start:
; First pass: sizes via the runtime push/pop macro.  These results are
; inflated by the 5-byte push imm32 (see the 7..12 byte figures in the
; captured output below).
LeaSize lea eax, [eax]
LeaSize lea eax, [eax+eax]
LeaSize lea eax, [eax+2*eax]
LeaSize lea eax, [eax+4*eax]
LeaSize lea eax, [eax+eax+127]
LeaSize lea eax, [eax+2*eax+127]
LeaSize lea eax, [eax+4*eax+127]
LeaSize lea eax, [2*eax]
LeaSize lea eax, [4*eax]
LeaSize lea eax, [8*eax]
inkey "hit any key"
; Bare, unmeasured copies of the same instructions — presumably kept so
; their raw encodings appear in the .list output; TODO confirm intent.
lea eax, [eax]
lea eax, [eax+eax]
lea eax, [eax+2*eax]
lea eax, [eax+4*eax]
lea eax, [eax+eax+127]
lea eax, [eax+2*eax+127]
lea eax, [eax+4*eax+127]
lea eax, [2*eax]
lea eax, [4*eax]
lea eax, [8*eax]
; Second pass: assembly-time measurement — the true sizes (2..7 bytes
; in the captured output below).
LeaSizex lea eax, [eax]
LeaSizex lea eax, [eax+eax]
LeaSizex lea eax, [eax+2*eax]
LeaSizex lea eax, [eax+4*eax]
LeaSizex lea eax, [eax+eax+127]
LeaSizex lea eax, [eax+2*eax+127]
LeaSizex lea eax, [eax+4*eax+127]
LeaSizex lea eax, [2*eax]
LeaSizex lea eax, [4*eax]
LeaSizex lea eax, [8*eax]
inkey "hit any key"                     ; pause so the console output can be read
exit
end start
7 bytes for lea eax,[eax]
8 bytes for lea eax,[eax+eax]
8 bytes for lea eax,[eax+2*eax]
8 bytes for lea eax,[eax+4*eax]
9 bytes for lea eax,[eax+eax+127]
9 bytes for lea eax,[eax+2*eax+127]
9 bytes for lea eax,[eax+4*eax+127]
12 bytes for lea eax,[2*eax]
12 bytes for lea eax,[4*eax]
12 bytes for lea eax,[8*eax]
hit any key
2 bytes for lea eax,[eax]
3 bytes for lea eax,[eax+eax]
3 bytes for lea eax,[eax+2*eax]
3 bytes for lea eax,[eax+4*eax]
4 bytes for lea eax,[eax+eax+127]
4 bytes for lea eax,[eax+2*eax+127]
4 bytes for lea eax,[eax+4*eax+127]
7 bytes for lea eax,[2*eax]
7 bytes for lea eax,[4*eax]
7 bytes for lea eax,[8*eax]
hit any key
And yeah, it's weird that lea eax, [2*eax] takes more bytes than lea eax, [eax+2*eax] — the scaled-index form without a base register has no short encoding, so it must carry a full 32-bit displacement (the 00000000 bytes visible in the listing above).
It is both hardware dependent and irrelevant as anything post 1990 does its prefetch by instruction count, not instruction length. Up to a PIII LEA was fast, on a PIV it was slow and subject to the PIV pipeline length. Core2 and later don't suffer the problem. The world has changed since the days of an 8088.
Agreed — totally irrelevant with the bloat of today. Just a little odd.
Quote from: hutch-- on February 28, 2020, 09:47:52 AM
It is both hardware dependent and irrelevant as anything post 1990 does its prefetch by instruction count, not instruction length. Up to a PIII LEA was fast, on a PIV it was slow and subject to the PIV pipeline length. Core2 and later don't suffer the problem. The world has changed since the days of an 8088.
The advice I read is that with LEA you can end up with a shorter instruction sequence than with a series of arithmetic instructions doing the same calculation.
I read about LEA on the Sandy Bridge CPU in the Intel optimization manual, but with those CPU code names it seems you need to be an expert on which CPUs were released before and after the ones mentioned to know whether it behaves the same on your own CPU.
Actually, just forget to use the .486 processor directive and you are forced to stick to 8086 mnemonics.
The only test that matters is the clock, real time testing will make you wiser. If you write test pieces you can test the speed difference if any between different processor families. Something that will jump up and bite you is optimisation for one family of processor may be a flop on another. Having a few different computers helps you to check the differences. Now with LEA having owned most processors from the i486 through Pentiums, Core2 and currently gen 4 and 5 Haswell hardware, LEA worked well on most EXCEPT the Pentium Northwood and Prescott models where in some cases it was faster to use multiple ADD instructions.
LEA is a very useful mnemonic in many circumstances where you need to perform simple calculations quickly, its faster than bit shifts and rotates when you have overlap of methods to do a calculation and on most processors you can get a lower instruction count by not having to do the things that you had to do on a Pentium.
I know you like to play with later SSE and AVX but you must get the swing of addressing to properly use them and this means getting GOOD at general purpose register coding otherwise you will miss out on the real performance gains that late SSE and AVX/2 offers.
Quote from: hutch-- on February 29, 2020, 06:15:35 PM: "The only test that matters is the clock, real time testing will make you wiser."
Fortunately we have done that already (http://masm32.com/board/index.php?topic=5900.0) :biggrin: