Something consistent on the 3.3 gig Haswell is that a simple "movsb" copy is faster by a slight amount than a combined "movsq/movsb" algo. The example was originally a test of the register preservation macros but I thought I may as well make something useful out of it as well. I would be interested to see if there is any real difference on different processors.
These are the typical results I get with this test piece.
Warming up . . . .
mcopy = 1407
bcopy = 1313
mcopy = 1375
bcopy = 1344
mcopy = 1375
bcopy = 1313
mcopy = 1406
bcopy = 1359
mcopy = 1391
bcopy = 1328
mcopy = 1375
bcopy = 1313
mcopy = 1375
bcopy = 1312
mcopy = 1422
bcopy = 1312
1390 mcopy average
1324 bcopy average
The test piece source.
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
include \masm32\include64\masm64rt.inc
.code
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; ---------------------------------------------------------------------------
; entry_point
;
; Benchmark driver comparing the two copy routines defined in this file:
;   mcopy2 - "rep movsq" for whole 8-byte units, "rep movsb" for the tail
;   bcopy2 - a single "rep movsb"
;
; Two 1 GiB buffers are allocated. After one untimed warm-up pass, each of
; the 8 rounds times 10 copies with mcopy2 and then 10 copies with bcopy2
; and prints the GetTickCount difference for each. The accumulated totals
; are finally divided by 8 (shr 3) and printed as averages.
;
; Takes no arguments and leaves through ExitProcess:
;   exit code 0 - benchmark completed
;   exit code 1 - one of the buffers could not be allocated
;
; NOTE: the round count (8) and the final "shr 3" are coupled; change both
;       together.
; ---------------------------------------------------------------------------
entry_point proc
    LOCAL pMem1 :QWORD              ; buffer passed as the copy source
    LOCAL pMem2 :QWORD              ; buffer passed as the copy destination
    LOCAL cntr  :QWORD              ; inner counter: copies left in a timed run
    LOCAL tcnt  :QWORD              ; GetTickCount value at the start of a run
    LOCAL ocnt  :QWORD              ; outer counter: rounds left
    LOCAL cnt1  :QWORD              ; running total of mcopy2 tick deltas
    LOCAL cnt2  :QWORD              ; running total of bcopy2 tick deltas

    mov cnt1, 0
    mov cnt2, 0

  ; ---------------------------------------
  ; allocate both buffers and refuse to run with a null pointer, the copy
  ; routines would fault on it
  ; (assumes alloc yields 0 on failure - TODO confirm against the SDK macro)
  ; ---------------------------------------
    mov pMem1, alloc(1024*1024*1024)
    cmp pMem1, 0
    je  no_memory

    mov pMem2, alloc(1024*1024*1024)
    cmp pMem2, 0
    je  free_first

    conout " Warming up . . . .",lf,lf

  ; ---------------------------------------
  ; untimed warm-up: 10 full copies so both buffers have been
  ; touched before any measurement is taken
  ; ---------------------------------------
    mov cntr, 10
  @@:
    rcall bcopy2,pMem1,pMem2,1024*1024*1024
    sub cntr, 1
    jnz @B

    mov ocnt, 8                     ; number of timed rounds

  reloop:
  ; ---------------------------------------
  ; timed run : mcopy2 x 10
  ; ---------------------------------------
    invoke SleepEx,10,0
    ; cpuid is executed only for its side effect (its outputs are ignored),
    ; presumably as a serialising barrier before the timer is read.
    ; It overwrites rax, rbx, rcx and rdx; nothing live is held in them here
    ; and this proc never returns to a caller, so rbx is not restored.
    cpuid

    mov tcnt, rv(GetTickCount)
    mov cntr, 10
  @@:
    rcall mcopy2,pMem1,pMem2,1024*1024*1024
    sub cntr, 1
    jnz @B

    invoke GetTickCount
    sub rax, tcnt                   ; rax = elapsed ticks for this run
    add cnt1, rax                   ; rax is still intact for str$ below
    conout " mcopy = ",str$(rax),lf

  ; ---------------------------------------
  ; timed run : bcopy2 x 10
  ; ---------------------------------------
    invoke SleepEx,10,0
    cpuid                           ; same barrier as above

    mov tcnt, rv(GetTickCount)
    mov cntr, 10
  @@:
    rcall bcopy2,pMem1,pMem2,1024*1024*1024
    sub cntr, 1
    jnz @B

    invoke GetTickCount
    sub rax, tcnt
    add cnt2, rax
    conout " bcopy = ",str$(rax),lf

  ; ---------------------------------------
    sub ocnt, 1
    jnz reloop

  ; ---------------------------------------
  ; totals / 8 rounds = average per round
  ; ---------------------------------------
    shr cnt1, 3
    shr cnt2, 3

    conout lf
    conout " ",str$(cnt1)," mcopy average",lf
    conout " ",str$(cnt2)," bcopy average",lf

    waitkey

    mfree pMem1
    mfree pMem2

    invoke ExitProcess,0

  ; ---------------------------------------
  ; allocation failure: release whatever was obtained, report, exit code 1
  ; ---------------------------------------
  free_first:
    mfree pMem1
  no_memory:
    conout " Memory allocation failed",lf
    waitkey
    invoke ExitProcess,1

    ret                             ; not reached, both paths end in ExitProcess
entry_point endp
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; ---------------------------------------------------------------------------
; mcopy2 - forward memory copy, qword units first, leftover bytes after
;
;   rcx = source address
;   rdx = destination address
;   r8  = byte count
;
; The visible code sets no return value. rcx is consumed as the repeat
; counter; r8 and rdx are left as passed. A byte count of 0 copies nothing.
; The copy always runs upward (cld), so overlapping buffers where the
; destination starts inside the source are not handled.
;
; NOSTACKFRAME / STACKFRAME, sav and rst are SDK macros whose bodies are not
; part of this file; presumably they switch automatic frame generation off
; and back on, and preserve / restore the named register - verify against
; the SDK before changing their order.
; ---------------------------------------------------------------------------
NOSTACKFRAME

mcopy2 proc

  ; rsi and rdi are the implicit operands of the string instructions, so
  ; the caller's values are put aside first
    sav rsi
    sav rdi

    cld                             ; string ops walk upward

    mov rdi, rdx                    ; rdi = destination
    mov rsi, rcx                    ; rsi = source (before rcx is reused)

  ; bulk: byte count / 8 qwords
    mov rcx, r8
    shr rcx, 3
    rep movsq

  ; tail: byte count mod 8 bytes (writing ecx clears the upper half of rcx)
    mov ecx, r8d
    and ecx, 7
    rep movsb

  ; hand the caller's rsi and rdi back
    rst rsi
    rst rdi

    ret

mcopy2 endp

STACKFRAME
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; ---------------------------------------------------------------------------
; bcopy2 - forward memory copy using a single "rep movsb"
;
;   rcx = source address
;   rdx = destination address
;   r8  = byte count
;
; The visible code sets no return value. rcx is consumed as the repeat
; counter; r8 and rdx are left as passed. A byte count of 0 copies nothing.
; The copy always runs upward, so overlapping buffers where the destination
; starts inside the source are not handled.
;
; NOSTACKFRAME / STACKFRAME, sav and rst are SDK macros whose bodies are not
; part of this file; presumably they switch automatic frame generation off
; and back on, and preserve / restore the named register - verify against
; the SDK before changing their order.
; ---------------------------------------------------------------------------
NOSTACKFRAME

bcopy2 proc

  ; --------------
  ; save rsi & rdi
  ; --------------
    sav rsi
    sav rdi

  ; Clear the direction flag exactly as mcopy2 does. The Windows x64 ABI
  ; expects it to be clear on entry, so this is normally a no-op; it is here
  ; so that both benchmarked routines carry the same setup work and both
  ; copy upward even if a caller left the flag set.
    cld

    mov rsi, rcx                    ; rsi = source
    mov rdi, rdx                    ; rdi = destination
    mov rcx, r8                     ; rcx = number of bytes to move
    rep movsb

  ; -----------------
  ; restore rsi & rdi
  ; -----------------
    rst rsi
    rst rdi

    ret

bcopy2 endp

STACKFRAME
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
end
Alright then. When I get home, I'll throw it on the barbeque to test your recipe.
I always like comparing my amd to Intel's offerings.
Core i5:
1601 mcopy average
1708 bcopy average
Hi,
Core i3, notebook, Windows 8.1.
F:\TEMP\TEST>mcopy2
Warming up . . . .
mcopy = 3172
bcopy = 2921
mcopy = 3219
bcopy = 2906
mcopy = 2875
bcopy = 2313
mcopy = 2516
bcopy = 2360
mcopy = 2547
bcopy = 2313
mcopy = 2531
bcopy = 2328
mcopy = 2547
bcopy = 2328
mcopy = 2593
bcopy = 2438
2750 mcopy average
2488 bcopy average
Press any key to continue...
Cheers,
Steve N.
Warming up . . . .
mcopy = 1172
bcopy = 1156
mcopy = 1172
bcopy = 1157
mcopy = 1235
bcopy = 1171
mcopy = 1172
bcopy = 1172
mcopy = 1172
bcopy = 1172
mcopy = 1172
bcopy = 1172
mcopy = 1172
bcopy = 1172
mcopy = 1156
bcopy = 1172
1177 mcopy average
1168 bcopy average
Press any key to continue...
From my AMD Threadripper 1950x
(I assume a lower result is better?)
There is a guy that found the fastest memcpy/memmove on x86/x64 ... EVER, written in C (https://www.codeproject.com/Articles/1110153/Apex-memmove-the-fastest-memcpy-memmove-on-x-x-EVE). :biggrin:
He became completely obsessed with the subject for a few years (probably he is nuts by now, I hope not).
The funny part is that his algo is broken, as I mentioned in the comments to the article, but had a number of useful ideas and I have reused them in some software. Unfortunately I can't publish here.
Here are my results:
I did something special for these tests. The first one is running in SAFE MODE, command prompt interface.
The second one is SAFE MODE, explorer gui.
The third one is full all out explorer gui running normally, and everything else loaded.
SAFE MODE, command prompt interface.
Warming up . . . .
mcopy = 4781
bcopy = 4218
mcopy = 4109
bcopy = 4110
mcopy = 4109
bcopy = 4110
mcopy = 4109
bcopy = 4140
mcopy = 4281
bcopy = 4109
mcopy = 4109
bcopy = 4110
mcopy = 4109
bcopy = 4125
mcopy = 4109
bcopy = 4110
4214 mcopy average
4129 bcopy average
Press any key to continue...
SAFE MODE, explorer gui desktop.
Warming up . . . .
mcopy = 4704
bcopy = 4375
mcopy = 4250
bcopy = 4187
mcopy = 4203
bcopy = 4437
mcopy = 4547
bcopy = 4172
mcopy = 4157
bcopy = 4141
mcopy = 4156
bcopy = 4140
mcopy = 4141
bcopy = 4141
mcopy = 4141
bcopy = 4141
4287 mcopy average
4216 bcopy average
Press any key to continue...
Full on, explorer gui, all processes loaded.
Warming up . . . .
mcopy = 5047
bcopy = 4921
mcopy = 4375
bcopy = 4515
mcopy = 4828
bcopy = 5172
mcopy = 4985
bcopy = 4907
mcopy = 8578
bcopy = 4125
mcopy = 4109
bcopy = 4172
mcopy = 4468
bcopy = 4625
mcopy = 4094
bcopy = 4797
5060 mcopy average
4654 bcopy average
Press any key to continue...
AMD A6-9220e 1.60 Ghz
I'll let you test it for yourselves, to draw your own conclusions.
Seems to me that having everything else running in the background is really affecting performance.
Comparing with the others, my 'puter seems sluggish, even in safe mode cmd interface.. :redface:
I'm going to run another test in Windows 7 based WinPE it will take a few minutes to get that set up ......
This is from Windows 10 based WinPE. The Windows 7 version I have is 32 bits... (I forgot)
Warming up . . . .
mcopy = 4656
bcopy = 4563
mcopy = 4312
bcopy = 4218
mcopy = 4187
bcopy = 4125
mcopy = 4125
bcopy = 4110
mcopy = 4109
bcopy = 4110
mcopy = 4109
bcopy = 4109
mcopy = 4109
bcopy = 4110
mcopy = 4109
bcopy = 4141
4214 mcopy average
4185 bcopy average
Press any key to continue...
Warming up . . . .
mcopy = 2468
bcopy = 2546
mcopy = 2343
bcopy = 2453
mcopy = 2282
bcopy = 2531
mcopy = 2297
bcopy = 2484
mcopy = 2406
bcopy = 2485
mcopy = 2375
bcopy = 2407
mcopy = 2312
bcopy = 2469
mcopy = 2406
bcopy = 2375
2361 mcopy average
2468 bcopy average
Press any key to continue...
I have an i5 in windows 8.1. The Intel is from an old generation i guess... :idea:
Thanks everyone for the testing, it looks like over a different range of hardware that there is not a lot of difference between the two techniques. The test piece for each number is a 1 gig buffer copied 10 times so 10 gig each test so using the simpler "rep movsb" is easily fast enough for most tasks. I have some test pieces done a while ago for SSE and AVX that are a bit faster but not by all that much. Intel has provided special case circuitry for the REP MOVS/STOS instructions as they are so widely used so unless you are bashing away at SSE or AVX and need the data sizes, ordinary memory copy with "rep movsb" will be with us for a while yet.
Yes, it seems that rep movs? has become more competitive over time. With my current CPU, only movdqa beats rep movsd, see below.
Remember the old Code location sensitivity of timings (http://www.masmforum.com/board/index.php?topic=11454.msg87600#msg87600) thread? We still tried to optimise for a Prescott CPU...
Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz (SSE4)
Algo memcpy memcpy_S MemCo1 MemCo2 MemCoC3 MemCoP4 MemCoC2 MemCoL xmemcpy
Description CRT Guga rep movs movdqa lps+hps movdqa movdqa Masm32 Habran
dest-al psllq CeleronM dest-al src-al library
Code size ? 88 59 291 222 200 269 33 104
---------------------------------------------------------------------------------------------
2048, d0s0-0 147 294 209 194 199 204 197 211 297
2048, d1s1-0 412 304 291 238 234 222 202 246 310
2048, d7s7-0 416 306 291 239 231 239 237 288 312
2048, d7s8-1 285 301 281 547 474 226 231 283 305
2048, d7s9-2 293 306 288 545 478 226 236 270 307
2048, d8s7+1 283 298 278 548 421 231 231 281 304
2048, d8s8-0 418 307 281 239 224 228 237 276 301
2048, d7s3+4 289 295 290 556 421 235 231 280 306
2048, d3s7-4 292 306 290 569 475 224 232 287 310
2048, d8s9-1 283 306 277 538 478 221 231 280 304
2048, d9s7+2 291 300 278 538 418 221 237 278 303
2048, d9s8+1 284 306 288 538 421 231 229 283 313
2048, d9s9-0 413 300 286 240 237 240 239 279 308
2048, d15s15 359 301 217 241 230 239 239 280 313
Sum of count 318 302 274 412 352 227 229 273 306
Note the very first value: The CRT beats them all for fully aligned, big copies, using a series of movdqa. But at a price: all xmm regs are trashed, which is, if I remember well, a violation of the ABI.
MASM32 library FTW!
Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz (SSE4)
Algo memcpy memcpy_S MemCo1 MemCo2 MemCoC3 MemCoP4 MemCoC2 MemCoL xmemcpy
Description CRT Guga rep movs movdqa lps+hps movdqa movdqa Masm32 Habran
dest-al psllq CeleronM dest-al src-al library
Code size ? 88 59 291 222 200 269 33 104
---------------------------------------------------------------------------------------------
2048, d0s0-0 143 191 116 194 190 187 188 114 269
2048, d1s1-0 228 217 198 219 221 215 217 172 267
2048, d7s7-0 221 218 205 218 219 218 216 171 266
2048, d7s8-1 185 216 193 540 405 213 221 174 269
2048, d7s9-2 186 216 196 537 395 210 222 172 264
2048, d8s7+1 189 216 162 562 368 211 216 172 263
2048, d8s8-0 231 225 165 238 239 232 251 181 264
2048, d7s3+4 193 227 199 570 359 219 219 172 272
2048, d3s7-4 192 221 201 578 408 211 219 173 265
2048, d8s9-1 198 218 164 537 411 212 220 179 264
2048, d9s7+2 191 220 196 550 366 211 224 193 265
2048, d9s8+1 186 227 193 548 381 226 218 195 264
2048, d9s9-0 219 216 200 316 242 237 229 176 263
2048, d15s15 233 218 207 229 220 229 223 172 271
Sum of count 199 217 185 416 316 216 220 172 266
The spec in win64 is the first 8 are trashable but I don't think win32 had a spec on SSE. I have played with both SSE and AVX for raw copy and they are faster than REP STOS/MOVS but not by all that much. I get the impression that the limit is absolute hardware, not the instruction.
As always something comes up while I'm on my way to work. I'll post the new results when I get home.
8)
Quote from: hutch-- on May 29, 2018, 07:04:05 PMI don't think win32 had a spec on SSE.
You are right, Agner (http://www.agner.org/optimize/calling_conventions.pdf) says xmm regs are all volatile.
Hi,
Some sort of summary.
bcopy / mcopy (rounded)
hutch--
0.95
jj2007 #2
1.07
FORTRANS #3
0.90
johnsa #4
0.99
zedd151 #6
0.98
0.98
0.92
zedd151 #8
0.99
felipe #9
0.99
Cheers,
Steve N.
Warming up . . . .
mcopy = 1042
bcopy = 1076
mcopy = 990
bcopy = 1030
mcopy = 1021
bcopy = 1020
mcopy = 1005
bcopy = 1047
mcopy = 1033
bcopy = 1056
mcopy = 1051
bcopy = 1070
mcopy = 1048
bcopy = 1040
mcopy = 1029
bcopy = 1048
1027 mcopy average
1048 bcopy average
Press any key to continue...
processor : 7
vendor_id : GenuineIntel
cpu family : 6
model : 94
model name : Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz
stepping : 3
microcode : 0xba
cpu MHz : 800.000
cache size : 8192 KB
physical id : 0
siblings : 8
core id : 3
cpu cores : 4
apicid : 7
initial apicid : 7
fpu : yes
fpu_exception : yes
cpuid level : 22
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch ida arat xsaveopt pln pts dtherm invpcid_single retpoline tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap
bogomips : 6816.06
clflush size : 64
cache_alignment : 64
address sizes : 39 bits physical, 48 bits virtual
power management:
from jj's testbed...
AMD A6-9220e RADEON R4, 5 COMPUTE CORES 2C+3G (SSE4)
Algo memcpy memcpy_S MemCo1 MemCo2 MemCoC3 MemCoP4 MemCoC2 MemCoL xmemcpy
Description CRT Guga rep movs movdqa lps+hps movdqa movdqa Masm32 Habran
dest-al psllq CeleronM dest-al src-al library
Code size ? 88 59 291 222 200 269 33 104
---------------------------------------------------------------------------------------------
2048, d0s0-0 132 297 196 191 186 186 175 191 276
2048, d1s1-0 238 251 218 215 207 206 208 241 274
2048, d7s7-0 234 253 216 214 203 202 212 245 275
2048, d7s8-1 236 252 257 406 525 163 175 240 276
2048, d7s9-2 234 253 260 406 526 161 176 243 275
2048, d8s7+1 228 253 244 379 527 164 325 241 275
2048, d8s8-0 234 251 197 216 210 205 215 193 278
2048, d7s3+4 227 251 254 380 527 161 167 245 277
2048, d3s7-4 233 250 257 417 526 164 176 239 274
2048, d8s9-1 226 251 242 400 522 163 176 241 273
2048, d9s7+2 231 249 258 374 524 165 166 243 272
2048, d9s8+1 229 253 259 372 524 166 165 239 271
2048, d9s9-0 233 249 217 213 207 203 209 243 272
2048, d15s15 235 252 213 217 205 206 212 243 274
Sum of count 225 254 234 314 387 179 196 234 274
--- ok ---
This is from my last toy, a 8th generation one:
Coffee Lake
i7 8700K @ 3.70 GHz
mcopy = 938
bcopy = 922
mcopy = 937
bcopy = 906
mcopy = 890
bcopy = 906
mcopy = 953
bcopy = 922
mcopy = 875
bcopy = 875
mcopy = 906
bcopy = 891
mcopy = 891
bcopy = 875
mcopy = 906
bcopy = 906
912 mcopy average
900 bcopy average
And this one does support TSX instructions.
It is not a notebook, but an old school desktop set to slowly replace my 4 year old beloved windows 7 workhorse.
Intel(R) Core(TM) i7-4700MQ CPU @ 2.40GHz
Instructions sets MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, EM64T, VT-x, AES, AVX, AVX2, FMA3
Warming up . . . . Warming up . . . . Warming up . . . .
mcopy = 1375 mcopy = 1296 mcopy = 1375
bcopy = 1266 bcopy = 1172 bcopy = 1187
mcopy = 1328 mcopy = 1312 mcopy = 1359
bcopy = 1234 bcopy = 1172 bcopy = 1187
mcopy = 1344 mcopy = 1266 mcopy = 1297
bcopy = 1188 bcopy = 1157 bcopy = 1172
mcopy = 1297 mcopy = 1250 mcopy = 1313
bcopy = 1203 bcopy = 1140 bcopy = 1156
mcopy = 1266 mcopy = 1250 mcopy = 1250
bcopy = 1172 bcopy = 1140 bcopy = 1141
mcopy = 1297 mcopy = 1234 mcopy = 1297
bcopy = 1172 bcopy = 1140 bcopy = 1125
mcopy = 1281 mcopy = 1219 mcopy = 1281
bcopy = 1172 bcopy = 1140 bcopy = 1234
mcopy = 1281 mcopy = 1250 mcopy = 1390
bcopy = 1172 bcopy = 1140 bcopy = 1234
1308 mcopy average 1259 mcopy average 1320 mcopy average
1197 bcopy average 1150 bcopy average 1179 bcopy average