deleted
Results of AMD Ryzen5 3400G
WinMerge is good for comparing result files.
Hopefully we see more results.
deleted
Something I have learnt long ago is that different hardware gives you wide ranges of variation on the same code. I once had an AMD CPU box that had some really fast instructions but equally some really slow ones back then compared with early Intel CPUs of the same era.
Hi Nidud,
11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz (AVX512)
------------------------------------------------
Instr. Operands Bytes Clocks
------------------------------------------------
adc reg64,reg64 3 4
adc reg64,mem64 6 3
adc reg64,imm8 4 3
adc mem64,reg64 6 6
adc mem64,imm8 7 8
add reg64,reg64 3 1
add reg64,mem64 6 2
add reg64,imm8 4 1
add mem64,reg64 6 4
add mem64,imm8 7 4
and reg64,reg64 3 1
and reg64,mem64 6 2
and reg64,imm8 4 1
and mem64,reg64 6 4
and mem64,imm8 7 4
bsf reg64,reg64 4 5
bsf reg64,mem64 7 5
bsr reg64,reg64 4 5
bsr reg64,mem64 7 5
bswap reg32 2 2
bswap reg64 3 5
bt reg64,reg64 4 2
bt reg64,imm8 5 2
bt mem16,reg16 6 22
bt mem16,imm8 6 2
btc reg64,reg64 4 2
btc reg64,imm8 5 2
btc mem16,imm8 6 3
btr reg64,reg64 4 2
btr reg64,imm8 5 2
btr mem16,imm8 6 3
bts reg64,reg64 4 2
bts reg64,imm8 5 2
bts mem16,imm8 6 3
call reg64 2 78
cbw 2 4
cdq 1 5
clc 1 1
cld 1 18
cmp reg64,reg64 3 1
cmp reg64,imm8 4 1
cmp mem64,reg64 6 2
cmp mem64,imm8 7 2
cmpsb 1 18
cmpsw 2 18
cmpsd 1 18
cmpxchg reg64,reg64 4 22
cmpxchg mem64,reg64 7 18
cwd 2 5
cwde 1 5
dec reg8 2 1
dec reg64 3 1
dec mem8 2 5
dec mem64 6 5
div reg64 8 55
enter imm8,imm8 4 36
idiv reg8 5 64
idiv reg16 8 45
idiv reg32 7 39
imul reg8 2 13
imul reg16 3 18
imul reg32 2 19
imul reg64 3 14
imul mem8 2 13
imul mem16 4 18
imul mem32 4 19
imul mem64 6 13
imul reg16,reg16 4 5
imul reg32,reg32 3 5
imul reg64,reg64 4 5
imul reg16,reg16,imm8 4 5
imul reg32,reg32,imm8 3 4
imul reg64,reg64,imm8 4 5
inc reg8 2 1
inc reg64 3 1
inc mem8 2 5
inc mem64 6 5
lahf 1 13
lar reg16,reg16 4 562
lar reg32,reg32 3 559
lea reg64,mem64 6 1
lodsb 1 5
lodsw 2 5
lodsd 1 5
mov reg64,reg64 3 1
mov reg64,mem64 6 2
mov reg64,imm8 7 1
mov mem64,reg64 6 2
mov mem64,imm8 10 2
movsb 1 18
movsw 2 18
movsd 1 18
movsx reg32,reg8 3 1
movsx reg32,mem8 4 2
movsx reg64,reg16 4 1
movsx reg64,mem16 5 2
movzx reg32,reg8 3 1
movzx reg32,mem8 4 2
movzx reg64,reg16 4 1
movzx reg64,mem16 5 2
mul reg8 2 13
mul reg16 3 18
mul reg32 2 19
mul reg64 3 14
mul mem8 2 13
mul mem16 4 18
mul mem32 4 19
mul mem64 6 13
neg reg8 2 1
neg reg64 3 1
neg mem8 2 5
neg mem64 6 4
nop 1 1
not reg8 2 1
not reg64 3 1
not mem32 4 5
not mem64 6 5
or reg8,reg8 2 1
or reg64,reg64 3 1
or reg64,mem64 6 2
or reg64,imm8 4 1
or mem8,reg8 3 5
or mem64,reg64 6 4
or mem64,imm8 7 4
pop reg64 1 2
popfq 4 96
push reg64 1 2
push mem64 2 5
pushfq 1 5
rcl reg8,imm8 2 7
rcl reg64,imm8 3 7
rcl reg8,reg8 2 27
rcl reg64,reg8 3 27
rcl mem8,imm8 2 10
rcl mem64,imm8 6 10
rcr reg8,imm8 2 7
rcr reg64,imm8 3 7
rcr reg8,reg8 2 33
rcr reg64,reg8 3 23
rcr mem8,imm8 2 10
rcr mem64,imm8 6 10
rol reg8,imm8 2 5
rol reg64,imm8 3 5
rol reg8,reg8 2 5
rol reg64,reg8 3 5
rol mem8,imm8 2 5
rol mem64,imm8 6 5
ror reg8,imm8 2 5
ror reg64,imm8 3 5
ror reg8,reg8 2 5
ror reg64,reg8 3 5
ror mem8,imm8 2 5
ror mem64,imm8 6 5
shl reg8,imm8 2 2
shl reg64,imm8 3 2
shl reg8,reg8 2 5
shl reg64,reg8 3 5
shl mem8,imm8 2 5
shl mem64,imm8 6 5
sar reg8,imm8 2 2
sar reg64,imm8 3 2
sar reg8,reg8 2 5
sar reg64,reg8 3 5
sar mem8,imm8 2 5
sar mem64,imm8 6 6
sbb reg8,reg8 2 4
sbb reg64,reg64 3 4
sbb reg8,imm8 3 3
sbb reg64,imm8 4 3
sbb mem8,reg8 3 6
sbb mem64,reg64 6 6
sbb mem8,imm8 3 8
sbb mem64,imm8 7 8
scasb 1 5
scasw 2 5
scasd 1 5
setae reg8 3 2
setae mem8 3 5
setb reg8 3 2
setb mem8 3 5
setg reg8 3 2
setg mem8 3 5
setng reg8 3 2
setng mem8 3 5
shr reg8,imm8 2 2
shr reg64,imm8 3 2
shr reg8,reg8 2 5
shr reg64,reg8 3 5
shr mem8,imm8 2 5
shr mem64,imm8 6 4
shld reg16,reg16,imm8 5 5
shld reg32,reg32,imm8 4 5
shld reg64,reg64,imm8 5 5
shld mem16,reg16,imm8 7 5
shld mem64,reg64,imm8 8 5
shrd reg16,reg16,imm8 5 5
shrd reg32,reg32,imm8 4 5
shrd reg64,reg64,imm8 5 5
shrd mem16,reg16,imm8 7 5
shrd mem64,reg64,imm8 8 5
smsw reg32 3 3108
smsw reg64 4 4187
stc 1 1
std 1 24
stosb 1 5
stosw 2 5
stosd 1 5
str reg16 4 3049
str reg32 3 3934
sub reg8,reg8 2 1
sub reg64,reg64 3 1
sub reg64,imm8 4 1
sub reg64,mem64 6 2
sub mem64,imm8 7 4
sub mem64,reg64 6 4
test reg8,reg8 2 1
test reg64,reg64 3 1
test reg64,imm8 6 1
test reg64,mem64 6 2
test mem64,imm8 10 2
test mem64,reg64 6 2
xchg reg8,reg8 2 5
xchg reg64,reg64 2 5
xchg reg64,mem64 6 83
xchg mem64,reg64 6 83
xlat 1 30
xlatb 1 30
xor reg8,reg8 2 1
xor reg64,reg64 3 1
xor reg64,mem64 6 2
xor reg64,imm8 4 1
xor mem8,reg8 3 5
xor mem64,reg64 6 4
xor mem64,imm8 7 4
\Masm32\Members\Nidud\it>InstructionTiming.exe
Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz (AVX)
------------------------------------------------
Instr. Operands Bytes Clocks
------------------------------------------------
"asmc64" non è riconosciuto come comando interno o esterno,
un programma eseguibile o un file batch.
error reading: bin\mov_reg64_reg64.bin
Interesting stuff. I'd like to see the difference between movsx eax, byte ptr [reg32] and the simple mov al, [reg32]
Just copy asmc64.exe to InstructionTiming-folder for testing.
Hopefully we see other Ryzen series too.
Does anyone have a 12th-gen Intel available?
Quote from: TimoVJL on November 23, 2021, 09:50:05 AM
Just copy asmc64.exe to InstructionTiming-folder for testing.
Yep, that worked. Nidud probably uses environment variables instead of a simple \Masm32\bin\Asmc64.exe :cool:
deleted
Odd results
Instr. Operands Bytes Intel AMD Intel 11th
smsw reg32 3 34 12 3108
smsw reg64 4 33 13 4187
...
str reg16 4 25 13 3049
str reg32 3 26 12 3934
...
Attached Excel macrotable for collecting results.
Quote from: hutch-- on November 23, 2021, 07:42:13 AM
Something I have learnt long ago is that different hardware gives you wide ranges of variation on the same code. I once had an AMD CPU box that had some really fast instructions but equally some really slow ones back then compared with early Intel CPUs of the same era.
Old knowledge like the Pentium U and V execution pipes, also in the help files, is useful in 16-bit DOS, because you know most people will run DOSBox, which emulates an old Pentium, but I am unsure whether movsd/stosd or the FPU doing REAL10 memory moves is best.
Fpu was faster on amd,while Intel developed newer SSE instructions
So I am interested in timings of alternatives like fild/fidiv/fistp, which let you use eax, edx for other purposes, vs
pushes to preserve GP registers, div, pop registers.
Stosq 64 bit version?
The faster register preservation : Movd xmm,eax /Movd eax,xmm vs push eax/pop eax?
Quote from: nidud on November 23, 2021, 12:17:00 PM
Quote from: jj2007 on November 23, 2021, 09:37:59 AM
Interesting stuff. I'd like to see the difference between movsx eax, byte ptr [reg32] and the simple mov al, [reg32]
The test is 64-bit so that won't work.
Oh, really :tongue:
0000000140001232 | 48:BE 5A1D004001000000 | mov rsi,140001D5A | 140001D5A:"Test"
000000014000123C | 8A06 | mov al,[rsi] |
Quote from: jj2007 on November 23, 2021, 11:34:58 AM
Yep, that worked. Nidud probably uses environment variables instead of a simple \Masm32\bin\Asmc64.exe :cool:
Asmc64 is not part of Masm32 so that won't work.
1. Putting Asmc64 in \Masm32\bin\Asmc64.exe works perfectly.
2. Copying AsmC64.exe in every project folder works less perfectly.
3. Environment variables are stone age.
Quote from: jj2007 on November 23, 2021, 09:37:59 AM
I'd like to see the difference between movsx eax, byte ptr [reg32] and the simple mov al, [reg32]
Core i5
1560 ticks for mov al+inc
1201 ticks for movzx+inc
1701 ticks for lodsb
This program was assembled with UAsm64 in 64-bit format
1607 ticks for mov al+inc
1248 ticks for movzx+inc
1778 ticks for lodsb
This program was assembled with UAsm64 in 32-bit format
AMD Ryzen 5 3400G with Radeon Vega Graphics (SSE4)
719 ticks for mov al+inc
594 ticks for movzx+inc
1515 ticks for lodsb
This program was assembled with UAsm64 in 32-bit format.
719 ticks for mov al+inc
640 ticks for movzx+inc
1500 ticks for lodsb
This program was assembled with UAsm64 in 64-bit format.
deleted
Quote from: TimoVJL on November 23, 2021, 11:23:42 PM
AMD Ryzen 5 3400G
As already seen in the other thread, AMD sucks for lodsb :cool:
594 ticks for movzx+inc
1515 ticks for lodsb
Mine is old Zen 2 gen, 12 nm, with integrated graphics.
We are still waiting for Zen 3 gen results.
Ryzen 5 3400G is Zen+.
Attached is Zen 2 (AMD Ryzen 7 3700X)
Thanks.
Quote from: Greenhorn on November 24, 2021, 11:39:01 AM
Ryzen 5 3400G is Zen+.
Yes, Zen+ based (2nd generation)
Instr. Operands Bytes Intel Intel AMD AMD
--------- ----------------- ------ ------ ------ ------
lodsb 1 4 5 4 12 6
lodsw 2 4 5 4 12 6
lodsd 1 4 5 3 12 12
...
xchg reg64,mem64 6 77 83 75 119 68
xchg mem64,reg64 6 77 83 75 119 68
Something missing in the ZIP file.
Intel(R) Core(TM) i7-5820K CPU @ 3.30GHz (AVX2)
------------------------------------------------
Instr. Operands Bytes Clocks
------------------------------------------------
error reading: bin\mov_reg64_reg64.bin
That test needs asmc64.exe too, in path or in same folder.
Quote from: TimoVJL on November 24, 2021, 05:10:16 PM
Thanks.
Quote from: Greenhorn on November 24, 2021, 11:39:01 AM
Ryzen 5 3400G is Zen+.
Yes, Zen+ based (2nd generation)
Sorry, I missed the period after the "2". My eyes get bad.
deleted
You guys worry me, a working executable file should be trivial to make.
Intel(R) Core(TM) i7-5820K CPU @ 3.30GHz (AVX2)
------------------------------------------------
Instr. Operands Bytes Clocks
------------------------------------------------
The system cannot find the path specified.
error reading: bin\mov_reg64_reg64.bin
A:\InstructionTiming>
deleted
deleted
@hutch--
Pre-created binaries with empty asmc64.cmd.
Perhaps now you can have results for excel table.
Hi Timo,
I downloaded the Excel reader you posted, had a look at the table but deleted it after as it was squawking crap with a nag screen. At least I understood what the data was and the CPU comparisons it contained.
Ok,
Attached results in tabular format and tiny viewer for it.
Thanks,
That worked OK.
deleted
deleted
Wrong file I guess
deleted
That worked but what happened to working executables that made sense ?
My antique Haswell i7 clocked at 4 gig.
name op1 op2 op3 imm-value
adc reg64 reg64 0 0
adc reg64 mem64 0 0
adc reg64 imm8 0 1
adc mem64 reg64 0 0
adc mem64 imm8 0 1
add reg64 reg64 0 0
add reg64 mem64 0 0
add reg64 imm8 0 1
add mem64 reg64 0 0
add mem64 imm8 0 1
and reg64 reg64 0 0
and reg64 mem64 0 0
and reg64 imm8 0 1
and mem64 reg64 0 0
and mem64 imm8 0 1
bsf reg64 reg64 0 0
bsf reg64 mem64 0 0
bsr reg64 reg64 0 0
bsr reg64 mem64 0 0
bswap reg32 0 0 0
bswap reg64 0 0 0
bt reg64 reg64 0 0
bt reg64 imm8 0 1
bt mem16 reg16 0 0
bt mem16 imm8 0 1
btc reg64 reg64 0 0
btc reg64 imm8 0 1
btc mem16 imm8 0 1
btr reg64 reg64 0 0
btr reg64 imm8 0 1
btr mem16 imm8 0 1
bts reg64 reg64 0 0
bts reg64 imm8 0 1
bts mem16 imm8 0 1
call reg64 0 0 0 ; *
cbw 0 0 0 0
cdq 0 0 0 0
clc 0 0 0 0
cld 0 0 0 0
;cmovz reg64 reg64 0 0
;cmovnz reg64 reg64 0 0
cmp reg64 reg64 0 0
cmp reg64 imm8 0 1
cmp mem64 reg64 0 0
cmp mem64 imm8 0 1
cmpsb 0 0 0 0
cmpsw 0 0 0 0
cmpsd 0 0 0 0
cmpxchg reg64 reg64 0 0
cmpxchg mem64 reg64 0 0
cwd 0 0 0 0
cwde 0 0 0 0
dec reg8 0 0 0
dec reg64 0 0 0
dec mem8 0 0 0
dec mem64 0 0 0
div reg64 0 0 0 ;*
enter imm8 imm8 0 8 ;*
;idiv reg8 0 0 0 ;*
;idiv reg16 0 0 0 ;*
idiv reg32 0 0 0 ;*
;idiv reg64 0 0 0 ;*
;idiv mem8 0 0 0 ;*
;idiv mem16 0 0 0 ;*
;idiv mem32 0 0 0 ;*
;idiv mem64 0 0 0 ;*
imul reg8 0 0 0
imul reg16 0 0 0
imul reg32 0 0 0
imul reg64 0 0 0
imul mem8 0 0 0
imul mem16 0 0 0
imul mem32 0 0 0
imul mem64 0 0 0
imul reg16 reg16 0 0
imul reg32 reg32 0 0
imul reg64 reg64 0 0
imul reg16 reg16 imm8 2
imul reg32 reg32 imm8 2
imul reg64 reg64 imm8 2
inc reg8 0 0 0
inc reg64 0 0 0
inc mem8 0 0 0
inc mem64 0 0 0
lahf 0 0 0 0
lar reg16 reg16 0 0
lar reg32 reg32 0 0
lea reg64 mem64 0 0
lodsb 0 0 0 0
lodsw 0 0 0 0
lodsd 0 0 0 0
mov reg64 reg64 0 0
mov reg64 mem64 0 0
mov reg64 imm8 0 10
mov mem64 reg64 0 0
mov mem64 imm8 0 0
movsb 0 0 0 0
movsw 0 0 0 0
movsd 0 0 0 0
movsx reg32 reg8 0 0
movsx reg32 mem8 0 0
movsx reg64 reg16 0 0
movsx reg64 mem16 0 0
movzx reg32 reg8 0 0
movzx reg32 mem8 0 0
movzx reg64 reg16 0 0
movzx reg64 mem16 0 0
mul reg8 0 0 0
mul reg16 0 0 0
mul reg32 0 0 0
mul reg64 0 0 0
mul mem8 0 0 0
mul mem16 0 0 0
mul mem32 0 0 0
mul mem64 0 0 0
neg reg8 0 0 0
neg reg64 0 0 0
neg mem8 0 0 0
neg mem64 0 0 0
nop 0 0 0 0
not reg8 0 0 0
not reg64 0 0 0
not mem32 0 0 0
not mem64 0 0 0
or reg8 reg8 0 0
or reg64 reg64 0 0
or reg64 mem64 0 0
or reg64 imm8 0 8
or mem8 reg8 0 0
or mem64 reg64 0 0
or mem64 imm8 0 8
pop reg64 0 0 0
popfq 0 0 0 0
push reg64 0 0 0
push mem64 0 0 0
pushfq 0 0 0 0
rcl reg8 imm8 0 1
rcl reg64 imm8 0 1
rcl reg8 regcl 0 0
rcl reg64 regcl 0 0
rcl mem8 imm8 0 1
rcl mem64 imm8 0 1
rcr reg8 imm8 0 1
rcr reg64 imm8 0 1
rcr reg8 regcl 0 0
rcr reg64 regcl 0 0
rcr mem8 imm8 0 1
rcr mem64 imm8 0 1
rol reg8 imm8 0 1
rol reg64 imm8 0 1
rol reg8 regcl 0 0
rol reg64 regcl 0 0
rol mem8 imm8 0 1
rol mem64 imm8 0 1
ror reg8 imm8 0 1
ror reg64 imm8 0 1
ror reg8 regcl 0 0
ror reg64 regcl 0 0
ror mem8 imm8 0 1
ror mem64 imm8 0 1
shl reg8 imm8 0 1
shl reg64 imm8 0 1
shl reg8 regcl 0 0
shl reg64 regcl 0 0
shl mem8 imm8 0 1
shl mem64 imm8 0 1
sar reg8 imm8 0 1
sar reg64 imm8 0 1
sar reg8 regcl 0 0
sar reg64 regcl 0 0
sar mem8 imm8 0 1
sar mem64 imm8 0 1
sbb reg8 reg8 0 0
sbb reg64 reg64 0 0
sbb reg8 imm8 0 1
sbb reg64 imm8 0 1
sbb mem8 reg8 0 0
sbb mem64 reg64 0 0
sbb mem8 imm8 0 1
sbb mem64 imm8 0 1
scasb 0 0 0 0
scasw 0 0 0 0
scasd 0 0 0 0
setae reg8 0 0 0
setae mem8 0 0 0
setb reg8 0 0 0
setb mem8 0 0 0
setg reg8 0 0 0
setg mem8 0 0 0
setng reg8 0 0 0
setng mem8 0 0 0
shr reg8 imm8 0 1
shr reg64 imm8 0 1
shr reg8 regcl 0 0
shr reg64 regcl 0 0
shr mem8 imm8 0 1
shr mem64 imm8 0 1
shld reg16 reg16 imm8 1
shld reg32 reg32 imm8 1
shld reg64 reg64 imm8 1
shld mem16 reg16 imm8 1
shld mem64 reg64 imm8 1
shrd reg16 reg16 imm8 1
shrd reg32 reg32 imm8 1
shrd reg64 reg64 imm8 1
shrd mem16 reg16 imm8 1
shrd mem64 reg64 imm8 1
smsw reg32 0 0 0
smsw reg64 0 0 0
stc 0 0 0 0
std 0 0 0 0 ; *
stosb 0 0 0 0
stosw 0 0 0 0
stosd 0 0 0 0
str reg16 0 0 0
str reg32 0 0 0
sub reg8 reg8 0 0
sub reg64 reg64 0 0
sub reg64 imm8 0 0
sub reg64 mem64 0 0
sub mem64 imm8 0 0
sub mem64 reg64 0 0
test reg8 reg8 0 0
test reg64 reg64 0 0
test reg64 imm8 0 0
test reg64 mem64 0 0
test mem64 imm8 0 0
test mem64 reg64 0 0
xchg reg8 reg8 0 0
xchg reg64 reg64 0 0
xchg reg64 mem64 0 0
xchg mem64 reg64 0 0
xlat 0 0 0 0 ; *
xlatb 0 0 0 0 ; *
xor reg8 reg8 0 0
xor reg64 reg64 0 0
xor reg64 mem64 0 0
xor reg64 imm8 0 8
xor mem8 reg8 0 0
xor mem64 reg64 0 0
xor mem64 imm8 0 8
movd reg32 reg32 0 0
movd reg64 reg64 0 0
We could collect the results into files for later processing.
InstructionTiming.exe > Result.txt
InstructionTiming.exe simd.txt > ResultSIMD.txt
exit
AMD Ryzen 5 3400G with Radeon Vega Graphics (AVX2)
------------------------------------------------
Instr. Operands Bytes Clocks
------------------------------------------------
addpd reg128,reg128 4 9
addps reg128,reg128 3 9
addsd reg128,reg128 4 9
addss reg128,reg128 4 9
andnpd reg128,reg128 4 1
andnps reg128,reg128 3 1
andpd reg128,reg128 4 3
andps reg128,reg128 3 3
comisd reg128,reg128 4 4
comiss reg128,reg128 3 4
cvtsd2si reg64,reg128 5 4
cvtsi2sd reg128,reg64 5 4
cvtsi2ss reg128,reg32 4 4
cvtss2si reg32,reg128 4 4
divpd reg128,reg128 4 23
divps reg128,reg128 3 29
divsd reg128,reg128 4 23
divss reg128,reg128 4 29
maxpd reg128,reg128 4 3
maxps reg128,reg128 3 3
maxsd reg128,reg128 4 3
maxss reg128,reg128 4 3
minpd reg128,reg128 4 3
minps reg128,reg128 3 3
minsd reg128,reg128 4 3
minss reg128,reg128 4 3
movapd reg128,reg128 4 1
movaps reg128,reg128 3 1
movaps reg128,reg128 3 1
movd reg32,reg128 4 4
movdqa reg128,reg128 4 1
movdqu reg128,reg128 4 1
movhlps reg128,reg128 3 3
movhpd reg128,mem128 7 2
movhps reg128,mem128 6 2
movlhps reg128,reg128 3 3
movlpd reg128,mem128 7 2
movlps reg128,mem128 6 2
movntdq mem64,reg128 7 10
movntpd mem64,reg128 7 10
movntps mem64,reg128 6 11
movq reg64,reg128 5 5
movsd reg128,reg128 4 2
movss reg128,reg128 4 2
movupd reg128,reg128 4 2
movups reg128,reg128 3 2
mulpd reg128,reg128 4 15
mulps reg128,reg128 3 11
mulsd reg128,reg128 4 15
mulss reg128,reg128 4 11
orpd reg128,reg128 4 4
orps reg128,reg128 3 4
sqrtpd reg128,reg128 4 20
sqrtps reg128,reg128 3 19
sqrtsd reg128,reg128 4 19
sqrtss reg128,reg128 4 17
subpd reg128,reg128 4 9
subps reg128,reg128 3 9
subsd reg128,reg128 4 9
subss reg128,reg128 4 9
xorpd reg128,reg128 4 1
xorps reg128,reg128 3 1
Quote from: nidud on December 02, 2021, 01:04:23 PM
It takes input files so drop the simd.txt file on it.
Very smart for me :biggrin:
Intel(R) Core(TM) i3-10100 CPU @ 3.60GHz (AVX2)
------------------------------------------------
Instr. Operands Bytes Clocks
------------------------------------------------
addpd reg128,reg128 4 10
addps reg128,reg128 3 10
addsd reg128,reg128 4 10
addss reg128,reg128 4 10
andnpd reg128,reg128 4 3
andnps reg128,reg128 3 3
andpd reg128,reg128 4 2
andps reg128,reg128 3 3
comisd reg128,reg128 4 3
comiss reg128,reg128 3 3
cvtsd2si reg64,reg128 5 3
cvtsi2sd reg128,reg64 5 3
cvtsi2ss reg128,reg32 4 3
cvtss2si reg32,reg128 4 3
divpd reg128,reg128 4 34
divps reg128,reg128 3 28
divsd reg128,reg128 4 33
divss reg128,reg128 4 28
maxpd reg128,reg128 4 10
maxps reg128,reg128 3 10
maxsd reg128,reg128 4 10
maxss reg128,reg128 4 10
minpd reg128,reg128 4 10
minps reg128,reg128 3 10
minsd reg128,reg128 4 10
minss reg128,reg128 4 10
movapd reg128,reg128 4 1
movaps reg128,reg128 3 1
movaps reg128,reg128 3 1
movd reg32,reg128 4 3
movdqa reg128,reg128 4 1
movdqu reg128,reg128 4 1
movhlps reg128,reg128 3 3
movhpd reg128,mem128 7 3
movhps reg128,mem128 6 3
movlhps reg128,reg128 3 3
movlpd reg128,mem128 7 3
movlps reg128,mem128 6 3
movntdq mem64,reg128 7 5
movntpd mem64,reg128 7 5
movntps mem64,reg128 6 5
movq reg64,reg128 5 3
movsd reg128,reg128 4 3
movss reg128,reg128 4 3
movupd reg128,reg128 4 1
movups reg128,reg128 3 1
mulpd reg128,reg128 4 10
mulps reg128,reg128 3 10
mulsd reg128,reg128 4 10
mulss reg128,reg128 4 10
orpd reg128,reg128 4 3
orps reg128,reg128 3 3
sqrtpd reg128,reg128 4 15
sqrtps reg128,reg128 3 10
sqrtsd reg128,reg128 4 34
sqrtss reg128,reg128 4 31
subpd reg128,reg128 4 10
subps reg128,reg128 3 10
subsd reg128,reg128 4 10
subss reg128,reg128 4 10
xorpd reg128,reg128 4 1
xorps reg128,reg128 3 1
Intel(R) Core(TM) i3-10100 AMD Ryzen 5 3400G 11th Gen Intel(R) Core(TM) i7-11800H
------------------------- -------
Instr. Operands Bytes
------------------------- -------
addpd reg128,reg128 4 10 9 13
addps reg128,reg128 3 10 9 13
addsd reg128,reg128 4 10 9 13
addss reg128,reg128 4 10 9 13
andnpd reg128,reg128 4 3 1 3
andnps reg128,reg128 3 3 1 3
andpd reg128,reg128 4 2 3 3
andps reg128,reg128 3 3 3 3
comisd reg128,reg128 4 3 4 4
comiss reg128,reg128 3 3 4 4
cvtsd2si reg64,reg128 5 3 4 4
cvtsi2sd reg128,reg64 5 3 4 4
cvtsi2ss reg128,reg32 4 3 4 4
cvtss2si reg32,reg128 4 3 4 4
divpd reg128,reg128 4 34 23 43
divps reg128,reg128 3 28 29 36
divsd reg128,reg128 4 33 23 43
divss reg128,reg128 4 28 29 36
maxpd reg128,reg128 4 10 3 13
maxps reg128,reg128 3 10 3 13
maxsd reg128,reg128 4 10 3 13
maxss reg128,reg128 4 10 3 13
minpd reg128,reg128 4 10 3 13
minps reg128,reg128 3 10 3 13
minsd reg128,reg128 4 10 3 13
minss reg128,reg128 4 10 3 13
movapd reg128,reg128 4 1 1 1
movaps reg128,reg128 3 1 1 1
movaps reg128,reg128 3 1 1 1
movd reg32,reg128 4 3 4 4
movdqa reg128,reg128 4 1 1 1
movdqu reg128,reg128 4 1 1 1
movhlps reg128,reg128 3 3 3 4
movhpd reg128,mem128 7 3 2 4
movhps reg128,mem128 6* 3 2 4
movlhps reg128,reg128 3 3 3 4
movlpd reg128,mem128 7 3 2 2
movlps reg128,mem128 6 3 2 2
movntdq mem64,reg128 7 5 10 7
movntpd mem64,reg128 7 5 10 7
movntps mem64,reg128 6 5 11 8
movq reg64,reg128 5 3 5 4
movsd reg128,reg128 4 3 2 3
movss reg128,reg128 4 3 2 3
movupd reg128,reg128 4 1 2 1
movups reg128,reg128 3 1 2 1
mulpd reg128,reg128 4 10 15 13
mulps reg128,reg128 3 10 11 13
mulsd reg128,reg128 4 10 15 13
mulss reg128,reg128 4 10 11 13
orpd reg128,reg128 4 3 4 3
orps reg128,reg128 3 3 4 3
sqrtpd reg128,reg128 4 15 20 20
sqrtps reg128,reg128 3 10 19 13
sqrtsd reg128,reg128 4* 34 19 44
sqrtss reg128,reg128 4* 31 17 39
subpd reg128,reg128 4* 10 9 13
subps reg128,reg128 3* 10 9 13
subsd reg128,reg128 4* 10 9 13
subss reg128,reg128 4 10 9 13
xorpd reg128,reg128 4 1 1 1
xorps reg128,reg128 3 1 1 1
11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz (AVX512)
------------------------------------------------
Instr. Operands Bytes Clocks
------------------------------------------------
adc reg64,reg64 3 4
adc reg64,mem128 6 3
adc reg64,imm8 4 3
adc mem128,reg64 6 6
adc mem128,imm8 7 8
add reg64,reg64 3 2
add reg64,mem128 6 2
add reg64,imm8 4 1
add mem128,reg64 6 4
add mem128,imm8 7 4
and reg64,reg64 3 1
and reg64,mem128 6 2
and reg64,imm8 4 1
and mem128,reg64 6 4
and mem128,imm8 7 4
bsf reg64,reg64 4 5
bsf reg64,mem128 7 5
bsr reg64,reg64 4 5
bsr reg64,mem128 7 5
bswap reg32 2 2
bswap reg64 3 5
bt reg64,reg64 4 2
bt reg64,imm8 5 2
bt mem16,reg16 6 22
bt mem16,imm8 6 2
btc reg64,reg64 4 2
btc reg64,imm8 5 2
btc mem16,imm8 6 3
btr reg64,reg64 4 2
btr reg64,imm8 5 2
btr mem16,imm8 6 3
bts reg64,reg64 4 2
bts reg64,imm8 5 2
bts mem16,imm8 6 3
call reg64 2 78
cbw 2 4
cdq 1 5
clc 1 1
cld 1 18
cmp reg64,reg64 3 1
cmp reg64,imm8 4 1
cmp mem128,reg64 6 2
cmp mem128,imm8 7 2
cmpsb 1 18
cmpsw 2 18
cmpsd 1 18
cmpxchg reg64,reg64 4 22
cmpxchg mem128,reg64 7 18
cwd 2 5
cwde 1 5
dec reg8 2 1
dec reg64 3 1
dec mem8 2 5
dec mem128 6 5
div reg64 8 57
enter imm8,imm8 4 36
idiv reg32 7 40
imul reg8 2 13
imul reg16 3 19
imul reg32 2 19
imul reg64 3 14
imul mem8 2 13
imul mem16 4 18
imul mem32 4 19
imul mem128 6 13
imul reg16,reg16 4 5
imul reg32,reg32 3 5
imul reg64,reg64 4 5
imul reg16,reg16,imm8 4 5
imul reg32,reg32,imm8 3 5
imul reg64,reg64,imm8 4 5
inc reg8 2 1
inc reg64 3 1
inc mem8 2 5
inc mem128 6 5
lahf 1 13
lar reg16,reg16 4 572
lar reg32,reg32 3 568
lea reg64,mem128 6 1
lodsb 1 5
lodsw 2 5
lodsd 1 5
mov reg64,reg64 3 1
mov reg64,mem128 6 2
mov reg64,imm8 7 1
mov mem128,reg64 6 2
mov mem128,imm8 10 2
movsb 1 18
movsw 2 18
movsd 1 18
movsx reg32,reg8 3 1
movsx reg32,mem8 4 2
movsx reg64,reg16 4 1
movsx reg64,mem16 5 2
movzx reg32,reg8 3 1
movzx reg32,mem8 4 2
movzx reg64,reg16 4 1
movzx reg64,mem16 5 2
mul reg8 2 13
mul reg16 3 19
mul reg32 2 19
mul reg64 3 14
mul mem8 2 13
mul mem16 4 18
mul mem32 4 19
mul mem128 6 13
neg reg8 2 1
neg reg64 3 1
neg mem8 2 5
neg mem128 6 4
nop 1 1
not reg8 2 1
not reg64 3 1
not mem32 4 5
not mem128 6 5
or reg8,reg8 2 1
or reg64,reg64 3 1
or reg64,mem128 6 2
or reg64,imm8 4 1
or mem8,reg8 3 5
or mem128,reg64 6 4
or mem128,imm8 7 4
pop reg64 1 2
popfq 4 98
push reg64 1 2
simd
11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz (AVX512)
------------------------------------------------
Instr. Operands Bytes Clocks
------------------------------------------------
addpd reg128,reg128 4 13
addps reg128,reg128 3 13
addsd reg128,reg128 4 13
addss reg128,reg128 4 13
andnpd reg128,reg128 4 3
andnps reg128,reg128 3 3
andpd reg128,reg128 4 3
andps reg128,reg128 3 3
comisd reg128,reg128 4 4
comiss reg128,reg128 3 4
cvtsd2si reg64,reg128 5 4
cvtsi2sd reg128,reg64 5 4
cvtsi2ss reg128,reg32 4 4
cvtss2si reg32,reg128 4 4
divpd reg128,reg128 4 43
divps reg128,reg128 3 36
divsd reg128,reg128 4 43
divss reg128,reg128 4 36
maxpd reg128,reg128 4 13
maxps reg128,reg128 3 13
maxsd reg128,reg128 4 13
maxss reg128,reg128 4 13
minpd reg128,reg128 4 13
minps reg128,reg128 3 13
minsd reg128,reg128 4 13
minss reg128,reg128 4 13
movapd reg128,reg128 4 1
movaps reg128,reg128 3 1
movaps reg128,reg128 3 1
movd reg32,reg128 4 4
movdqa reg128,reg128 4 1
movdqu reg128,reg128 4 1
movhlps reg128,reg128 3 4
movhpd reg128,mem128 7 4
movhps reg128,mem128 6 4
movlhps reg128,reg128 3 4
movlpd reg128,mem128 7 2
movlps reg128,mem128 6 2
movntdq mem64,reg128 7 7
movntpd mem64,reg128 7 7
movntps mem64,reg128 6 8
movq reg64,reg128 5 4
movsd reg128,reg128 4 3
movss reg128,reg128 4 3
movupd reg128,reg128 4 1
movups reg128,reg128 3 1
mulpd reg128,reg128 4 13
mulps reg128,reg128 3 13
mulsd reg128,reg128 4 13
mulss reg128,reg128 4 13
orpd reg128,reg128 4 3
orps reg128,reg128 3 3
sqrtpd reg128,reg128 4 20
sqrtps reg128,reg128 3 13
sqrtsd reg128,reg128 4 44
sqrtss reg128,reg128 4 39
subpd reg128,reg128 4 13
subps reg128,reg128 3 13
subsd reg128,reg128 4 13
subss reg128,reg128 4 13
xorpd reg128,reg128 4 1
xorps reg128,reg128 3 1
deleted
Fantastic Nidud!!
So many instructions I don't have a clue :thumbsup:
Intel(R) Core(TM) i3-10100 CPU @ 3.60GHz (AVX2)
------------------------------------------------------
Instr. Operands Bytes Clocks
------------------------------------------------------
vbroadcasti128 reg256,mem128 8 2
vbroadcastsd reg256,reg128 5 3
vbroadcastss reg128,reg128 5 3
vbroadcastss reg256,reg128 5 3
vcvtsd2si reg32,mem64 7 4
vcvtsd2si reg32,reg128 4 3
vcvtsd2si reg64,mem64 8 4
vcvtsd2si reg64,reg128 5 3
vcvtss2si reg32,mem32 6 4
vcvtss2si reg32,reg128 4 3
vcvtss2si reg64,mem32 7 4
vcvtss2si reg64,reg128 5 4
vcvttsd2si reg32,mem64 7 4
vcvttsd2si reg32,reg128 4 3
vcvttsd2si reg64,mem64 8 4
vcvttss2si reg32,mem32 6 4
vcvttss2si reg32,reg128 4 3
vcvttss2si reg64,mem32 7 4
vcvttss2si reg64,reg128 5 4
vextracti128 mem128,reg256,imm8 9 4
vextracti128 reg128,reg256,imm8 6 3
vinserti128 reg256,reg256,mem128,imm8 9 2
vinserti128 reg256,reg256,reg128,imm8 6 3
vmovd reg32,reg128 4 3
vmovd mem32,reg128 6 4
vmovd mem64,reg128 8 4
vmovd reg64,reg128 5 3
vmovd reg128,reg32 4 3
vmovd reg128,mem32 6 2
vmovd reg128,mem64 8 2
vmovd reg128,reg64 5 3
vmovntdqa reg256,mem256 8 2
vmovq mem64,reg128 7 4
vmovq reg128,mem64 7 2
vmpsadbw reg256,reg256,reg256,imm8 6 7
vmpsadbw reg256,reg256,mem256,imm8 9 7
vpabsb reg256,mem256 8 2
vpabsb reg256,reg256 5 2
vpabsd reg256,mem256 8 2
vpabsd reg256,reg256 5 2
vpabsw reg256,mem256 8 2
vpabsw reg256,reg256 5 2
vpackssdw reg256,reg256,reg256 4 3
vpackssdw reg256,reg256,mem256 7 4
vpacksswb reg256,reg256,reg256 4 3
vpacksswb reg256,reg256,mem256 7 3
vpackusdw reg256,reg256,reg256 5 3
vpackusdw reg256,reg256,mem256 8 3
vpackuswb reg256,reg256,reg256 4 3
vpackuswb reg256,reg256,mem256 7 3
vpaddb reg256,reg256,reg256 4 1
vpaddb reg256,reg256,mem256 7 2
vpaddd reg256,reg256,reg256 4 1
vpaddd reg256,reg256,mem256 7 2
vpaddq reg256,reg256,reg256 4 1
vpaddq reg256,reg256,mem256 7 2
vpaddsb reg256,reg256,reg256 4 2
vpaddsb reg256,reg256,mem256 7 2
vpaddsw reg256,reg256,reg256 4 2
vpaddsw reg256,reg256,mem256 7 2
vpaddusb reg256,reg256,reg256 4 2
vpaddusb reg256,reg256,mem256 7 2
vpaddusw reg256,reg256,reg256 4 2
vpaddusw reg256,reg256,mem256 7 2
vpaddw reg256,reg256,reg256 4 1
vpaddw reg256,reg256,mem256 7 2
vpalignr reg256,reg256,reg256,imm8 6 3
vpalignr reg256,reg256,mem256,imm8 9 4
vpand reg256,reg256,reg256 4 1
vpand reg256,reg256,mem256 7 2
vpandn reg256,reg256,reg256 4 1
vpandn reg256,reg256,mem256 7 2
vpavgb reg256,reg256,reg256 4 2
vpavgb reg256,reg256,mem256 7 2
vpavgw reg256,reg256,reg256 4 2
vpavgw reg256,reg256,mem256 7 2
vpblendd reg128,reg128,mem128,imm8 9 2
vpblendd reg128,reg128,reg128,imm8 6 1
vpblendd reg256,reg256,reg256,imm8 6 1
vpblendd reg256,reg256,mem256,imm8 9 2
vpblendvb reg256,reg256,reg256,reg256 6 4
vpblendvb reg256,reg256,mem256,reg256 9 4
vpblendw reg256,reg256,reg256,imm8 6 3
vpblendw reg256,reg256,mem256,imm8 9 4
vpbroadcastb reg128,mem8 5 3
vpbroadcastb reg128,reg128 5 3
vpbroadcastb reg256,mem8 5 3
vpbroadcastb reg256,reg128 5 4
vpbroadcastd reg128,mem32 7 2
vpbroadcastd reg128,reg128 5 3
vpbroadcastd reg256,mem32 7 2
vpbroadcastd reg256,reg128 5 3
vpbroadcastq reg128,mem64 8 2
vpbroadcastq reg128,reg128 5 3
vpbroadcastq reg256,mem64 8 2
vpbroadcastq reg256,reg128 5 3
vpbroadcastw reg128,mem16 6 3
vpbroadcastw reg128,reg128 5 3
vpbroadcastw reg256,mem16 6 3
vpbroadcastw reg256,reg128 5 4
vpcmpeqb reg256,reg256,reg256 4 2
vpcmpeqb reg256,reg256,mem256 7 2
vpcmpeqd reg256,reg256,reg256 4 2
vpcmpeqd reg256,reg256,mem256 7 2
vpcmpeqq reg256,reg256,reg256 5 2
vpcmpeqq reg256,reg256,mem256 8 2
vpcmpeqw reg256,reg256,reg256 4 2
vpcmpeqw reg256,reg256,mem256 7 2
vpcmpgtb reg256,reg256,reg256 4 1
vpcmpgtb reg256,reg256,mem256 7 2
vpcmpgtd reg256,reg256,reg256 4 1
vpcmpgtd reg256,reg256,mem256 7 2
vpcmpgtq reg256,reg256,reg256 5 1
vpcmpgtq reg256,reg256,mem256 8 4
vpcmpgtw reg256,reg256,reg256 4 1
vpcmpgtw reg256,reg256,mem256 7 2
vperm2i128 reg256,reg256,reg256,imm8 6 3
vperm2i128 reg256,reg256,mem256,imm8 9 3
vpermd reg256,reg256,reg256 5 3
vpermd reg256,reg256,mem256 8 3
vpermpd reg256,reg256,imm8 6 3
vpermpd reg256,mem256,imm8 9 3
vpermps reg256,reg256,reg256 5 3
vpermps reg256,reg256,mem256 8 3
vpermq reg256,reg256,imm8 6 3
vpermq reg256,mem256,imm8 9 3
vpextrb reg32,reg128,imm8 6 3
vpextrb mem8,reg128,imm8 6 4
vpextrd reg32,reg128,imm8 6 3
vpextrd mem32,reg128,imm8 8 4
vpextrq mem64,reg128,imm8 9 4
vpextrq reg64,reg128,imm8 6 3
vpextrw reg32,reg128,imm8 5 3
vpextrw mem16,reg128,imm8 7 4
vphaddd reg256,reg256,reg256 5 7
vphaddd reg256,reg256,mem256 8 7
vphaddsw reg256,reg256,reg256 5 7
vphaddsw reg256,reg256,mem256 8 7
vphaddw reg256,reg256,reg256 5 7
vphaddw reg256,reg256,mem256 8 7
vphsubd reg256,reg256,reg256 5 7
vphsubd reg256,reg256,mem256 8 7
vphsubsw reg256,reg256,reg256 5 7
vphsubsw reg256,reg256,mem256 8 7
vphsubw reg256,reg256,reg256 5 7
vphsubw reg256,reg256,mem256 8 7
vpinsrb reg128,reg128,reg32,imm8 6 7
vpinsrb reg128,reg128,mem8,imm8 6 3
vpinsrd reg128,reg128,reg32,imm8 6 7
vpinsrd reg128,reg128,mem32,imm8 8 3
vpinsrq reg128,reg128,mem64,imm8 9 3
vpinsrq reg128,reg128,reg64,imm8 6 7
vpinsrw reg128,reg128,reg32,imm8 5 7
vpinsrw reg128,reg128,mem16,imm8 6 3
vpmaddubsw reg256,reg256,reg256 5 2
vpmaddubsw reg256,reg256,mem256 8 3
vpmaddwd reg256,reg256,reg256 4 2
vpmaddwd reg256,reg256,mem256 7 2
vpmaskmovd mem128,reg128,reg128 8 5
vpmaskmovd reg128,reg128,mem128 8 2
vpmaskmovd reg256,reg256,mem256 8 2
vpmaskmovd mem256,reg256,reg256 8 4
vpmaskmovq mem128,reg128,reg128 8 4
vpmaskmovq reg128,reg128,mem128 8 2
vpmaskmovq reg256,reg256,mem256 8 2
vpmaskmovq mem256,reg256,reg256 8 4
vpmaxsb reg256,reg256,reg256 5 2
vpmaxsb reg256,reg256,mem256 8 2
vpmaxsd reg256,reg256,reg256 5 2
vpmaxsd reg256,reg256,mem256 8 2
vpmaxsw reg256,reg256,reg256 4 2
vpmaxsw reg256,reg256,mem256 7 2
vpmaxub reg256,reg256,reg256 4 2
vpmaxub reg256,reg256,mem256 7 2
vpmaxud reg256,reg256,reg256 5 2
vpmaxud reg256,reg256,mem256 8 2
vpmaxuw reg256,reg256,reg256 5 2
vpmaxuw reg256,reg256,mem256 8 2
vpminsb reg256,reg256,reg256 5 2
vpminsb reg256,reg256,mem256 8 2
vpminsd reg256,reg256,reg256 5 2
vpminsd reg256,reg256,mem256 8 2
vpminsw reg256,reg256,reg256 4 2
vpminsw reg256,reg256,mem256 7 2
vpminub reg256,reg256,reg256 4 2
vpminub reg256,reg256,mem256 7 2
vpminud reg256,reg256,reg256 5 2
vpminud reg256,reg256,mem256 8 2
vpminuw reg256,reg256,reg256 5 2
vpminuw reg256,reg256,mem256 8 3
vpmovmskb reg32,reg128 4 3
vpmovmskb reg32,reg256 4 3
vpmovmskb reg64,reg128 5 3
vpmovsxbd reg256,mem64 8 4
vpmovsxbd reg256,reg128 5 3
vpmovsxbq reg256,mem32 7 3
vpmovsxbq reg256,reg128 5 3
vpmovsxbw reg256,mem128 8 3
vpmovsxbw reg256,reg128 5 3
vpmovsxdq reg256,mem128 8 3
vpmovsxdq reg256,reg128 5 3
vpmovsxwd reg256,mem128 8 3
vpmovsxwd reg256,reg128 5 3
vpmovsxwq reg256,mem64 8 3
vpmovsxwq reg256,reg128 5 3
vpmovzxbd reg256,mem64 8 3
vpmovzxbd reg256,reg128 5 3
vpmovzxbq reg256,mem32 7 3
vpmovzxbq reg256,reg128 5 3
vpmovzxbw reg256,mem128 8 3
vpmovzxbw reg256,reg128 5 3
vpmovzxdq reg256,mem128 8 3
vpmovzxdq reg256,reg128 5 3
vpmovzxwd reg256,mem128 8 3
vpmovzxwd reg256,reg128 5 3
vpmovzxwq reg256,mem64 8 4
vpmovzxwq reg256,reg128 5 3
vpmuldq reg256,reg256,reg256 5 2
vpmuldq reg256,reg256,mem256 8 2
vpmulhrsw reg256,reg256,reg256 5 2
vpmulhrsw reg256,reg256,mem256 8 2
vpmulhuw reg256,reg256,reg256 4 2
vpmulhuw reg256,reg256,mem256 7 2
vpmulhw reg256,reg256,reg256 4 2
vpmulhw reg256,reg256,mem256 7 2
vpmulld reg256,reg256,reg256 5 4
vpmulld reg256,reg256,mem256 8 4
vpmullw reg256,reg256,reg256 4 2
vpmullw reg256,reg256,mem256 7 2
vpmuludq reg256,reg256,reg256 4 2
vpmuludq reg256,reg256,mem256 7 2
vpor reg256,reg256,reg256 4 1
vpor reg256,reg256,mem256 7 2
vpsadbw reg256,reg256,reg256 4 3
vpsadbw reg256,reg256,mem256 7 3
vpshufb reg256,reg256,reg256 5 3
vpshufb reg256,reg256,mem256 8 3
vpshufd reg256,reg256,imm8 5 3
vpshufd reg256,mem256,imm8 8 3
vpshufhw reg256,reg256,imm8 5 3
vpshufhw reg256,mem256,imm8 8 3
vpshuflw reg256,reg256,imm8 5 3
vpshuflw reg256,mem256,imm8 8 3
vpsignb reg256,reg256,reg256 5 2
vpsignb reg256,reg256,mem256 8 2
vpsignd reg256,reg256,reg256 5 2
vpsignd reg256,reg256,mem256 8 2
vpsignw reg256,reg256,reg256 5 2
vpsignw reg256,reg256,mem256 8 2
vpslld reg256,reg256,imm8 5 2
vpslld reg256,reg256,mem128 7 2
vpslld reg256,reg256,reg128 4 3
vpslldq reg256,reg256,imm8 5 3
vpsllq reg256,reg256,imm8 5 2
vpsllq reg256,reg256,mem128 7 2
vpsllq reg256,reg256,reg128 4 3
vpsllvd reg128,reg128,reg128 5 2
vpsllvd reg128,reg128,mem128 8 2
vpsllvd reg256,reg256,reg256 5 2
vpsllvd reg256,reg256,mem256 8 2
vpsllvq reg128,reg128,reg128 5 2
vpsllvq reg128,reg128,mem128 8 2
vpsllvq reg256,reg256,reg256 5 2
vpsllvq reg256,reg256,mem256 8 2
vpsllw reg256,reg256,imm8 5 2
vpsllw reg256,reg256,mem128 7 2
vpsllw reg256,reg256,reg128 4 3
vpsrad reg256,reg256,imm8 5 2
vpsrad reg256,reg256,mem128 7 2
vpsrad reg256,reg256,reg128 4 3
vpsravd reg128,reg128,reg128 5 2
vpsravd reg128,reg128,mem128 8 2
vpsravd reg256,reg256,reg256 5 2
vpsravd reg256,reg256,mem256 8 2
vpsraw reg256,reg256,imm8 5 2
vpsraw reg256,reg256,mem128 7 2
vpsraw reg256,reg256,reg128 4 3
vpsrld reg256,reg256,imm8 5 2
vpsrld reg256,reg256,mem128 7 2
vpsrld reg256,reg256,reg128 4 3
vpsrldq reg256,reg256,imm8 5 3
vpsrlq reg256,reg256,imm8 5 2
vpsrlq reg256,reg256,mem128 7 2
vpsrlq reg256,reg256,reg128 4 3
vpsrlvd reg128,reg128,reg128 5 2
vpsrlvd reg128,reg128,mem128 8 2
vpsrlvd reg256,reg256,reg256 5 2
vpsrlvd reg256,reg256,mem256 8 2
vpsrlvq reg128,reg128,mem128 8 2
vpsrlvq reg128,reg128,reg128 5 2
vpsrlvq reg256,reg256,reg256 5 2
vpsrlvq reg256,reg256,mem256 8 2
vpsrlw reg256,reg256,imm8 5 2
vpsrlw reg256,reg256,mem128 7 2
vpsrlw reg256,reg256,reg128 4 3
vpsubb reg256,reg256,reg256 4 1
vpsubb reg256,reg256,mem256 7 2
vpsubd reg256,reg256,reg256 4 1
vpsubd reg256,reg256,mem256 7 2
vpsubq reg256,reg256,reg256 4 1
vpsubq reg256,reg256,mem256 7 2
vpsubsb reg256,reg256,reg256 4 1
vpsubsb reg256,reg256,mem256 7 2
vpsubsw reg256,reg256,reg256 4 1
vpsubsw reg256,reg256,mem256 7 2
vpsubusb reg256,reg256,reg256 4 1
vpsubusb reg256,reg256,mem256 7 2
vpsubusw reg256,reg256,reg256 4 1
vpsubusw reg256,reg256,mem256 7 2
vpsubw reg256,reg256,reg256 4 1
vpsubw reg256,reg256,mem256 7 2
vpunpckhbw reg256,reg256,reg256 4 3
vpunpckhbw reg256,reg256,mem256 7 3
vpunpckhdq reg256,reg256,reg256 4 3
vpunpckhdq reg256,reg256,mem256 7 3
vpunpckhqdq reg256,reg256,reg256 4 3
vpunpckhqdq reg256,reg256,mem256 7 3
vpunpckhwd reg256,reg256,reg256 4 3
vpunpckhwd reg256,reg256,mem256 7 3
vpunpcklbw reg256,reg256,reg256 4 3
vpunpcklbw reg256,reg256,mem256 7 3
vpunpckldq reg256,reg256,reg256 4 3
vpunpckldq reg256,reg256,mem256 7 3
vpunpcklqdq reg256,reg256,reg256 4 3
vpunpcklqdq reg256,reg256,mem256 7 3
vpunpcklwd reg256,reg256,reg256 4 3
vpunpcklwd reg256,reg256,mem256 7 3
vpxor reg256,reg256,reg256 4 1
vpxor reg256,reg256,mem256 7 2
Quote from: nidud on December 03, 2021, 06:57:33 AM
New version (https://github.com/nidud/asmc/tree/master/source/test/benchmark/x64/InstructionTiming) (AVX/AVX512).
This has no default input file and uses a new (regular) syntax. Spaces, tabs, immediates, and cl are allowed.
InstructionTiming instruction_list [ output_file ]
AVX
rcl reg64,cl
vcvttss2si reg64, reg128
vextracti128 mem128,reg256,7
vextracti128 reg128,reg256,7
vinserti128 reg256,reg256,mem128,7
:arrow_down:
AMD Ryzen 5 3400G AVX
AMD Ryzen Zen+ (2nd gen) has weak points
AMD Ryzen 3400G Intel(R) Core(TM) i3-10100 11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz (AVX512)
...
vmpsadbw reg256,reg256,reg256,imm8 6 12 7 4
vmpsadbw reg256,reg256,mem256,imm8 9 15 7 5
...
vpcmpgtq reg256,reg256,mem256 8 10 4 4
vpcmpgtw reg256,reg256,reg256 4 3 1 1
vpcmpgtw reg256,reg256,mem256 7 5 2 3
vperm2i128 reg256,reg256,reg256,imm8 6 15 3 4
vperm2i128 reg256,reg256,mem256,imm8 9 19 3 4
vpermd reg256,reg256,reg256 5 18 3 4
vpermd reg256,reg256,mem256 8 17 3 4
vpermpd reg256,reg256,imm8 6 9 3 4
vpermpd reg256,mem256,imm8 9 8 3 4
vpermps reg256,reg256,reg256 5 16 3 4
vpermps reg256,reg256,mem256 8 15 3 4
...
vphaddd reg256,reg256,reg256 5 12 7 5
vphaddd reg256,reg256,mem256 8 16 7 5
vphaddsw reg256,reg256,reg256 5 12 7 5
vphaddsw reg256,reg256,mem256 8 16 7 5
vphaddw reg256,reg256,reg256 5 13 7 5
vphaddw reg256,reg256,mem256 8 16 7 5
vphsubd reg256,reg256,reg256 5 12 7 5
vphsubd reg256,reg256,mem256 8 16 7 5
vphsubsw reg256,reg256,reg256 5 12 7 5
vphsubsw reg256,reg256,mem256 8 16 7 5
vphsubw reg256,reg256,reg256 5 12 7 5
vphsubw reg256,reg256,mem256 8 16 7 5
...
vpmaskmovd mem128,reg128,reg128 8 23 5 4
vpmaskmovd reg128,reg128,mem128 8 24 2 2
vpmaskmovd reg256,reg256,mem256 8 39 2 2
vpmaskmovd mem256,reg256,reg256 8 46 4 4
vpmaskmovq mem128,reg128,reg128 8 16 4 5
vpmaskmovq reg128,reg128,mem128 8 16 2 2
vpmaskmovq reg256,reg256,mem256 8 27 2 2
vpmaskmovq mem256,reg256,reg256 8 25 4 4
...
What are the 3rd gen results?
EDIT: An interesting AMD CPU, just when price drops
AMD Ryzen 7 5700G (https://www.cpubenchmark.net/mobile/cpu.php?cpu=AMD+Ryzen+7+5700G)
My old AMD FPU was faster than Intel's, so testing on newer AMDs is relevant.
This thread seems the most relevant for these thoughts:
1. LEA on Skylake has 3 versions, [rcx+offset], [rcx+rax] and [rcx*2+rax+offset], taking x, 2x and 3x cycles respectively. This was surprising, since I thought there were only "simple" and "complex" address types; apparently there's a "medium" one too.
On Alder Lake, LEA has the same timing for all three.
2. What is faster?
vxorps xmm31, xmm0,xmm0 or vxorps xmm31,xmm31,xmm31 ?
The former might not be as clever as it seems since the zero elimination might not work. I haven't tested it.
These all look like normal complex addressing modes: different opcodes for each, but pretty standard mnemonics. I would be surprised if any instruction like LEA were different from everything before it, as that would make the CPU non-standard x86 or x64.
Quote from: InfiniteLoop on January 08, 2022, 08:06:24 PM
1. LEA on Skylake has 3 versions [rcx+offset] [rcx+rax] [rcx*2+rax+offset] taking x,2x,3x cycles respectively
Timings are not very stable, as it's a very tight loop:
Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz (SSE4)
2 cycles for 100 * lea eax, [edx+123]
0 cycles for 100 * lea eax, somestring[edx+ebx]
0 cycles for 100 * lea eax, somestring[4*edx+ebx]
7 cycles for 100 * lea eax, somestring[8*edx+ebx]
0 cycles for 100 * lea eax, [edx+123]
0 cycles for 100 * lea eax, somestring[edx+ebx]
0 cycles for 100 * lea eax, somestring[4*edx+ebx]
8 cycles for 100 * lea eax, somestring[8*edx+ebx]
1 cycles for 100 * lea eax, [edx+123]
3 cycles for 100 * lea eax, somestring[edx+ebx]
0 cycles for 100 * lea eax, somestring[4*edx+ebx]
8 cycles for 100 * lea eax, somestring[8*edx+ebx]
0 cycles for 100 * lea eax, [edx+123]
0 cycles for 100 * lea eax, somestring[edx+ebx]
2 cycles for 100 * lea eax, somestring[4*edx+ebx]
10 cycles for 100 * lea eax, somestring[8*edx+ebx]
Size is 7 bytes for all, while lea eax, [edx+1234] would be 10 bytes.
AMD Ryzen 5 3400G with Radeon Vega Graphics (SSE4)
5 cycles for 100 * lea eax, [edx+123]
43 cycles for 100 * lea eax, somestring[edx+ebx]
47 cycles for 100 * lea eax, somestring[4*edx+ebx]
47 cycles for 100 * lea eax, somestring[8*edx+ebx]
7 cycles for 100 * lea eax, [edx+123]
35 cycles for 100 * lea eax, somestring[edx+ebx]
57 cycles for 100 * lea eax, somestring[4*edx+ebx]
49 cycles for 100 * lea eax, somestring[8*edx+ebx]
9 cycles for 100 * lea eax, [edx+123]
35 cycles for 100 * lea eax, somestring[edx+ebx]
71 cycles for 100 * lea eax, somestring[4*edx+ebx]
32 cycles for 100 * lea eax, somestring[8*edx+ebx]
6 cycles for 100 * lea eax, [edx+123]
46 cycles for 100 * lea eax, somestring[edx+ebx]
57 cycles for 100 * lea eax, somestring[4*edx+ebx]
43 cycles for 100 * lea eax, somestring[8*edx+ebx]
7 bytes for lea eax, [edx+123]
7 bytes for lea eax, somestring[edx+ebx]
7 bytes for lea eax, somestring[4*edx+ebx]
7 bytes for lea eax, somestring[8*edx+ebx]
Hi,
Two laptops.
Intel(R) Core(TM) i3-4005U CPU @ 1.70GHz (SSE4)
0 cycles for 100 * lea eax, [edx+123]
?? cycles for 100 * lea eax, somestring[edx+ebx]
0 cycles for 100 * lea eax, somestring[4*edx+ebx]
?? cycles for 100 * lea eax, somestring[8*edx+ebx]
0 cycles for 100 * lea eax, [edx+123]
?? cycles for 100 * lea eax, somestring[edx+ebx]
0 cycles for 100 * lea eax, somestring[4*edx+ebx]
?? cycles for 100 * lea eax, somestring[8*edx+ebx]
0 cycles for 100 * lea eax, [edx+123]
?? cycles for 100 * lea eax, somestring[edx+ebx]
0 cycles for 100 * lea eax, somestring[4*edx+ebx]
?? cycles for 100 * lea eax, somestring[8*edx+ebx]
0 cycles for 100 * lea eax, [edx+123]
?? cycles for 100 * lea eax, somestring[edx+ebx]
0 cycles for 100 * lea eax, somestring[4*edx+ebx]
?? cycles for 100 * lea eax, somestring[8*edx+ebx]
7 bytes for lea eax, [edx+123]
7 bytes for lea eax, somestring[edx+ebx]
7 bytes for lea eax, somestring[4*edx+ebx]
7 bytes for lea eax, somestring[8*edx+ebx]
--- ok ---
Intel(R) Core(TM) i3-10110U CPU @ 2.10GHz (SSE4)
14 cycles for 100 * lea eax, [edx+123]
24 cycles for 100 * lea eax, somestring[edx+ebx]
20 cycles for 100 * lea eax, somestring[4*edx+ebx]
30 cycles for 100 * lea eax, somestring[8*edx+ebx]
32 cycles for 100 * lea eax, [edx+123]
2 cycles for 100 * lea eax, somestring[edx+ebx]
23 cycles for 100 * lea eax, somestring[4*edx+ebx]
?? cycles for 100 * lea eax, somestring[8*edx+ebx]
?? cycles for 100 * lea eax, [edx+123]
?? cycles for 100 * lea eax, somestring[edx+ebx]
6 cycles for 100 * lea eax, somestring[4*edx+ebx]
?? cycles for 100 * lea eax, somestring[8*edx+ebx]
3 cycles for 100 * lea eax, [edx+123]
?? cycles for 100 * lea eax, somestring[edx+ebx]
2 cycles for 100 * lea eax, somestring[4*edx+ebx]
?? cycles for 100 * lea eax, somestring[8*edx+ebx]
7 bytes for lea eax, [edx+123]
7 bytes for lea eax, somestring[edx+ebx]
7 bytes for lea eax, somestring[4*edx+ebx]
7 bytes for lea eax, somestring[8*edx+ebx]
--- ok ---
Regards,
Steve
Hi,
11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz (SSE4)
4 cycles for 100 * lea eax, [edx+123]
10 cycles for 100 * lea eax, somestring[edx+ebx]
25 cycles for 100 * lea eax, somestring[4*edx+ebx]
26 cycles for 100 * lea eax, somestring[8*edx+ebx]
4 cycles for 100 * lea eax, [edx+123]
8 cycles for 100 * lea eax, somestring[edx+ebx]
27 cycles for 100 * lea eax, somestring[4*edx+ebx]
27 cycles for 100 * lea eax, somestring[8*edx+ebx]
6 cycles for 100 * lea eax, [edx+123]
10 cycles for 100 * lea eax, somestring[edx+ebx]
30 cycles for 100 * lea eax, somestring[4*edx+ebx]
29 cycles for 100 * lea eax, somestring[8*edx+ebx]
7 cycles for 100 * lea eax, [edx+123]
12 cycles for 100 * lea eax, somestring[edx+ebx]
31 cycles for 100 * lea eax, somestring[4*edx+ebx]
32 cycles for 100 * lea eax, somestring[8*edx+ebx]
7 bytes for lea eax, [edx+123]
7 bytes for lea eax, somestring[edx+ebx]
7 bytes for lea eax, somestring[4*edx+ebx]
7 bytes for lea eax, somestring[8*edx+ebx]
--- ok ---
AMD Ryzen 5 2400G with Radeon Vega Graphics (SSE4)
?? cycles for 100 * lea eax, [edx+123]
37 cycles for 100 * lea eax, somestring[edx+ebx]
76 cycles for 100 * lea eax, somestring[4*edx+ebx]
42 cycles for 100 * lea eax, somestring[8*edx+ebx]
0 cycles for 100 * lea eax, [edx+123]
32 cycles for 100 * lea eax, somestring[edx+ebx]
74 cycles for 100 * lea eax, somestring[4*edx+ebx]
40 cycles for 100 * lea eax, somestring[8*edx+ebx]
2 cycles for 100 * lea eax, [edx+123]
35 cycles for 100 * lea eax, somestring[edx+ebx]
78 cycles for 100 * lea eax, somestring[4*edx+ebx]
41 cycles for 100 * lea eax, somestring[8*edx+ebx]
0 cycles for 100 * lea eax, [edx+123]
33 cycles for 100 * lea eax, somestring[edx+ebx]
68 cycles for 100 * lea eax, somestring[4*edx+ebx]
35 cycles for 100 * lea eax, somestring[8*edx+ebx]
7 bytes for lea eax, [edx+123]
7 bytes for lea eax, somestring[edx+ebx]
7 bytes for lea eax, somestring[4*edx+ebx]
7 bytes for lea eax, somestring[8*edx+ebx]
--- ok ---
AMD Ryzen 5 2400G with Radeon Vega Graphics (AVX2)
------------------------------------------------
Instr. Operands Bytes Clocks
------------------------------------------------
adc reg64,reg64 3 3
adc reg64,mem128 6 2
adc reg64,imm8 4 2
adc mem128,reg64 6 4
adc mem128,imm8 7 4
add reg64,reg64 3 3
add reg64,mem128 6 2
add reg64,imm8 4 1
add mem128,reg64 6 4
add mem128,imm8 7 5
and reg64,reg64 3 1
and reg64,mem128 6 2
and reg64,imm8 4 2
and mem128,reg64 6 4
and mem128,imm8 7 4
bsf reg64,reg64 4 12
bsf reg64,mem128 7 16
bsr reg64,reg64 4 16
bsr reg64,mem128 7 22
bswap reg32 2 2
bswap reg64 3 2
bt reg64,reg64 4 1
bt reg64,imm8 5 1
bt mem16,reg16 6 12
bt mem16,imm8 6 2
btc reg64,reg64 4 3
btc reg64,imm8 5 3
btc mem16,imm8 6 8
btr reg64,reg64 4 3
btr reg64,imm8 5 3
btr mem16,imm8 6 8
bts reg64,reg64 4 3
bts reg64,imm8 5 3
bts mem16,imm8 6 8
call reg64 2 16
cbw 2 4
cdq 1 1
clc 1 2
cld 1 12
cmp reg64,reg64 3 1
cmp reg64,imm8 4 1
cmp mem128,reg64 6 2
cmp mem128,imm8 7 2
cmpsb 1 12
cmpsw 2 12
cmpsd 1 12
cmpxchg reg64,reg64 4 12
cmpxchg mem128,reg64 7 12
cwd 2 3
cwde 1 4
dec reg8 2 1
dec reg64 3 1
dec mem8 2 4
dec mem128 6 4
div reg64 8 56
enter imm8,imm8 4 60
idiv reg32 7 57
imul reg8 2 11
imul reg16 3 13
imul reg32 2 12
imul reg64 3 12
imul mem8 2 12
imul mem16 4 12
imul mem32 4 11
imul mem128 6 11
imul reg16,reg16 4 5
imul reg32,reg32 3 5
imul reg64,reg64 4 5
imul reg16,reg16,imm8 4 7
imul reg32,reg32,imm8 3 4
imul reg64,reg64,imm8 4 4
inc reg8 2 1
inc reg64 3 1
inc mem8 2 4
inc mem128 6 4
lahf 1 8
lar reg16,reg16 4 320
lar reg32,reg32 3 303
lea reg64,mem128 6 1
lodsb 1 12
lodsw 2 12
lodsd 1 12
mov reg64,reg64 3 1
mov reg64,mem128 6 2
mov reg64,imm8 7 1
mov mem128,reg64 6 4
mov mem128,imm8 10 4
movsb 1 12
movsw 2 12
movsd 1 12
movsx reg32,reg8 3 1
movsx reg32,mem8 4 2
movsx reg64,reg16 4 1
movsx reg64,mem16 5 2
movzx reg32,reg8 3 1
movzx reg32,mem8 4 2
movzx reg64,reg16 4 1
movzx reg64,mem16 5 2
mul reg8 2 11
mul reg16 3 13
mul reg32 2 12
mul reg64 3 12
mul mem8 2 11
mul mem16 4 13
mul mem32 4 12
mul mem128 6 12
neg reg8 2 1
neg reg64 3 2
neg mem8 2 4
neg mem128 6 4
nop 1 1
not reg8 2 1
not reg64 3 1
not mem32 4 4
not mem128 6 5
or reg8,reg8 2 1
or reg64,reg64 3 1
or reg64,mem128 6 2
or reg64,imm8 4 1
or mem8,reg8 3 4
or mem128,reg64 6 4
or mem128,imm8 7 4
pop reg64 1 2
popfq 4 55
push reg64 1 4