; This is the next version, primarily testing the increased thread count and how it affects timing. I have stuck with the spinlock to avoid the 64-thread limit and have tested 128 and 256 threads, but it does not get any faster than the 64-thread count. Testing over the thread count range, 4 threads runs at about the speed of my earlier test piece, 8 is a lot faster, and you get incremental gains up to 64 threads.
; Older hardware may not be happy with high thread counts but should handle 16 or 32 threads.
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
include \masm32\include64\masm64rt.inc
.data
    flag dq ?           ; count of worker threads that have finished their block
    bsiz dq ?           ; total size in bytes of the buffer being filled
    blsz dq ?           ; size in bytes of the slice handed to each worker thread

.code
    tcnt equ <64>       ; number of worker threads; alternatives: <8> <16> <32> <64> <128> <256>
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; ---------------------------------------------------------------------
; entry_point
;
; Allocates a 1 GB buffer, splits it into tcnt equal blocks and starts
; one worker thread (Thread) per block to fill it. The main thread polls
; the shared completion counter "flag" until every worker has reported
; in, prints the elapsed time, writes the buffer to "test.pad" and runs
; "ent" on the result.
;
; Register roles (all preserved through USING / SaveRegs / RestoreRegs):
;   r12 = worker threads still to be created
;   r13 = start address of the next worker's block
;   r14 = tick count taken just before the workers are started
;   r15 = base address of the allocated buffer
;
; Failure handling: if the buffer cannot be allocated or a worker thread
; cannot be created, a message is printed and the process exits instead
; of writing through a null pointer or polling forever for a completion
; count that can never be reached.
; ---------------------------------------------------------------------
entry_point proc
    USING r12,r13,r14,r15
    LOCAL hFile :QWORD                          ; not referenced in this procedure
    LOCAL bwrt  :QWORD                          ; receives the savefile() result

    SaveRegs                                    ; save the registers listed in USING

    mov flag, 0                                 ; no worker has finished yet
    mov bsiz, 1024*1024*1024                    ; total buffer size (1 GB)
    mov blsz, 1024*1024*1024/tcnt               ; bytes per worker thread

    conout "Creating random pad",lf

    mov r15, alloc(bsiz)                        ; allocate the whole pad as one block
    test r15, r15
    jz ep_alloc_fail                            ; no buffer, nothing the workers could fill

    mov r14, rvcall(GetTickCount)               ; start the timing

    mov r12, tcnt                               ; threads left to create
    mov r13, r15                                ; first block starts at the buffer base
  @@:
    invoke CreateThread,0,0,ptr$(Thread),r13,0,0
    test rax, rax
    jz ep_thread_fail                           ; a missing worker would stall the poll below forever
    rcall CloseHandle, rax                      ; the handle is not needed, completion is tracked via flag
    add r13, blsz                               ; next worker writes to the following block
    sub r12, 1
    jnz @B

  spinlock:                                     ; poll until every worker has incremented flag
    cmp flag, tcnt
    je ep_timed
    pause                                       ; spin-wait hint between polls
    jmp spinlock

  ep_timed:
    rcall GetTickCount
    sub eax, r14d                               ; tick count is a DWORD: a 32-bit subtract stays correct
                                                ; across wrap-around and zero-extends the result into rax
    conout "Timing = ",str$(rax)," ms",lf       ; display timing results
    conout "Thread Completion Count = ",str$(flag),lf

    conout "Saving file to disk",lf
    mov bwrt, savefile("test.pad",r15,bsiz)     ; write result to disk

    exec "ent test.pad"
    waitkey "Wait for ENT to complete",lf
    jmp ep_exit

  ep_alloc_fail:
    conout "Buffer allocation failed",lf
    jmp ep_exit

  ep_thread_fail:
    ; Workers that were already started may still be running; they end
    ; when the process exits below.
    conout "CreateThread failed",lf

  ep_exit:
    RestoreRegs                                 ; restore the registers listed in USING
    .exit
entry_point endp
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
NOSTACKFRAME
; ---------------------------------------------------------------------
; Thread
;
; Worker thread procedure: fills one block of the buffer with 64-bit
; values from RDRAND, then reports completion.
;
; In:    rcx  = start address of the block to fill (thread parameter)
;        blsz = block size in bytes; the loop counts down in steps of 8
;               and ends on exactly zero, so it must be a non-zero
;               multiple of 8
; Out:   rax  = 0 (thread exit code)
; Clobb: rcx, rdx, flags
;
; The completion counter is shared by every worker, so it is incremented
; with a LOCK prefix; a plain read-modify-write can lose an update when
; two threads finish together, leaving the poller waiting for a count
; that is never reached.
; ---------------------------------------------------------------------
Thread proc
    mov rdx, blsz                   ; rdx = bytes still to write
  @@:
    rdrand rax
    jnc @B                          ; CF = 0: no random value was delivered, draw again
                                    ; (unbounded retry: a failed draw must never reach the pad)
    mov [rcx], rax                  ; store 8 random bytes at the write position
    add rcx, 8
    sub rdx, 8
    jnz @B

    lock add flag, 1                ; atomic increment of the shared completion counter
    xor eax, eax                    ; defined thread exit code instead of the last random value
    ret
Thread endp
STACKFRAME
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
end