Unaligned memory copy test piece.

Started by hutch--, December 06, 2021, 08:34:55 PM


hutch--

Intel Manual

66 0F E7 /r
MOVNTDQ m128, xmm1
A V/V SSE2 Move packed integer values in xmm1 to m128 using nontemporal hint.



LiaoMi

Quote from: hutch-- on December 10, 2021, 10:33:46 PM
Intel Manual

66 0F E7 /r
MOVNTDQ m128, xmm1
A V/V SSE2 Move packed integer values in xmm1 to m128 using nontemporal hint.


Hi Hutch,

exactly :thup:  :thup:  :thup:  :thup: Thanks!

What is the meaning of "non temporal" memory accesses in x86 - https://stackoverflow.com/questions/37070/what-is-the-meaning-of-non-temporal-memory-accesses-in-x86
When are x86 LFENCE, SFENCE and MFENCE instructions required? - https://stackoverflow.com/questions/27595595/when-are-x86-lfence-sfence-and-mfence-instructions-required

The "non temporal" phrase means lacking temporal locality. Caches exploit two kinds of locality - spatial and temporal, and by using a non-temporal instruction you're signaling to the processor that you don't expect the data item be used in the near future.

Notes on "non-temporal" (aka "streaming") stores - https://sites.utexas.edu/jdm4372/2018/01/01/notes-on-non-temporal-aka-streaming-stores/
Optimizing Cache Usage With Nontemporal Accesses - https://vgatherps.github.io/2018-09-02-nontemporal/
void force_nt_store(cache_line *a) {
    __m128i zeros = {0, 0}; // chosen to use zeroing idiom;

    __asm volatile("movntdq %0, (%1)\n\t"
#if BYTES > 16
                   "movntdq %0, 16(%1)\n\t"
#endif
#if BYTES > 32
                   "movntdq %0, 32(%1)\n\t"
#endif
#if BYTES > 48
                   "movntdq %0, 48(%1)"
#endif
                   :
                   : "x" (zeros), "r" (&a->vec_val)
                   : "memory");
}

uint64_t run_timer_loop(void) {

    mfence();
    uint64_t start = rdtscp();

    for (int i = 0; i < 32; i++) {
        force_nt_store(&large_buffer[i]);
    }

    mfence();

    uint64_t end = rdtscp();
    return end - start;                 // elapsed TSC ticks
}
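
The mfence() and rdtscp() wrappers used in run_timer_loop (and large_buffer) are defined elsewhere in the repo; a minimal sketch of how they could look with GCC/Clang intrinsics, my guess rather than the author's exact code:

#include <stdint.h>
#include <x86intrin.h>          // _mm_mfence, __rdtscp

static inline void mfence(void) {
    _mm_mfence();               // MFENCE: full load/store barrier
}

static inline uint64_t rdtscp(void) {
    unsigned int aux;           // receives IA32_TSC_AUX (typically the core id)
    return __rdtscp(&aux);      // RDTSCP waits for earlier instructions to finish
}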


nontemporal_stores
https://github.com/vgatherps/nontemporal_stores/blob/master/basic_write_allocate/test.c

LiaoMi

movntdq + mfence - Example
https://www.felixcloutier.com/x86/mfence
https://www.felixcloutier.com/x86/lfence
https://www.felixcloutier.com/x86/sfence

    .686
    .model  flat,C
    .xmm
    .code

;------------------------------------------------------------------------------
;  VOID *
;  InternalMemCopyMem (
;    IN VOID   *Destination,
;    IN VOID   *Source,
;    IN UINTN  Count
;    );
;------------------------------------------------------------------------------
InternalMemCopyMem  PROC    USES    esi edi
    mov     esi, [esp + 16]             ; esi <- Source
    mov     edi, [esp + 12]             ; edi <- Destination
    mov     edx, [esp + 20]             ; edx <- Count
    lea     eax, [esi + edx - 1]        ; eax <- End of Source
    cmp     esi, edi
    jae     @F
    cmp     eax, edi                    ; Overlapped?
    jae     @CopyBackward               ; Copy backward if overlapped
@@:
    xor     ecx, ecx
    sub     ecx, edi
    and     ecx, 15                     ; ecx + edi aligns on 16-byte boundary
    jz      @F
    cmp     ecx, edx
    cmova   ecx, edx
    sub     edx, ecx                    ; edx <- remaining bytes to copy
    rep     movsb
@@:
    mov     ecx, edx
    and     edx, 15
    shr     ecx, 4                      ; ecx <- # of DQwords to copy
    jz      @CopyBytes
    add     esp, -16
    movdqu  [esp], xmm0                 ; save xmm0
@@:
    movdqu  xmm0, [esi]                 ; esi may not be 16-bytes aligned
    movntdq [edi], xmm0                 ; edi should be 16-bytes aligned
    add     esi, 16
    add     edi, 16
    loop    @B
    mfence
    movdqu  xmm0, [esp]                 ; restore xmm0
    add     esp, 16                     ; stack cleanup
    jmp     @CopyBytes
@CopyBackward:
    mov     esi, eax                    ; esi <- Last byte in Source
    lea     edi, [edi + edx - 1]        ; edi <- Last byte in Destination
    std
@CopyBytes:
    mov     ecx, edx
    rep     movsb
    cld
    mov     eax, [esp + 12]             ; eax <- Destination as return value
    ret
InternalMemCopyMem  ENDP

    END
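
Since the .model flat,C directive gives the procedure the C calling convention, it can be called directly from C after assembling and linking; a small usage sketch (my illustration, assuming UINTN maps to unsigned int in 32-bit code):

#include <stdio.h>

// Prototype implied by the comment block above.
void *InternalMemCopyMem(void *Destination, void *Source, unsigned int Count);

int main(void)
{
    char src[64] = "unaligned memory copy test piece";
    char dst[64];
    // Handles unaligned buffers and overlapping ranges (copies backward if needed).
    InternalMemCopyMem(dst, src, sizeof src);
    printf("%s\n", dst);
    return 0;
}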


    .686
    .model  flat,C
    .xmm
    .code

;------------------------------------------------------------------------------
;  VOID *
;  EFIAPI
;  InternalMemSetMem (
;    IN VOID   *Buffer,
;    IN UINTN  Count,
;    IN UINT8  Value
;    );
;------------------------------------------------------------------------------
InternalMemSetMem   PROC    USES    edi
    mov     edx, [esp + 12]             ; edx <- Count
    mov     edi, [esp + 8]              ; edi <- Buffer
    mov     al, [esp + 16]              ; al <- Value
    xor     ecx, ecx
    sub     ecx, edi
    and     ecx, 15                     ; ecx + edi aligns on 16-byte boundary
    jz      @F
    cmp     ecx, edx
    cmova   ecx, edx
    sub     edx, ecx
    rep     stosb
@@:
    mov     ecx, edx
    and     edx, 15
    shr     ecx, 4                      ; ecx <- # of DQwords to set
    jz      @SetBytes
    mov     ah, al                      ; ax <- Value | (Value << 8)
    add     esp, -16
    movdqu  [esp], xmm0                 ; save xmm0
    movd    xmm0, eax
    pshuflw xmm0, xmm0, 0               ; xmm0[0..63] <- Value repeats 8 times
    movlhps xmm0, xmm0                  ; xmm0 <- Value repeats 16 times
@@:
    movntdq [edi], xmm0                 ; edi should be 16-byte aligned
    add     edi, 16
    loop    @B
    mfence
    movdqu  xmm0, [esp]                 ; restore xmm0
    add     esp, 16                     ; stack cleanup
@SetBytes:
    mov     ecx, edx
    rep     stosb
    mov     eax, [esp + 8]              ; eax <- Buffer as return value
    ret
InternalMemSetMem   ENDP

    END



    .686
    .model  flat,C
    .xmm
    .code

;------------------------------------------------------------------------------
;  VOID *
;  EFIAPI
;  InternalMemZeroMem (
;    IN VOID   *Buffer,
;    IN UINTN  Count
;    );
;------------------------------------------------------------------------------
InternalMemZeroMem  PROC    USES    edi
    mov     edi, [esp + 8]
    mov     edx, [esp + 12]
    xor     ecx, ecx
    sub     ecx, edi
    xor     eax, eax
    and     ecx, 15
    jz      @F
    cmp     ecx, edx
    cmova   ecx, edx
    sub     edx, ecx
    rep     stosb
@@:
    mov     ecx, edx
    and     edx, 15
    shr     ecx, 4
    jz      @ZeroBytes
    pxor    xmm0, xmm0
@@:
    movntdq [edi], xmm0
    add     edi, 16
    loop    @B
    mfence
@ZeroBytes:
    mov     ecx, edx
    rep     stosb
    mov     eax, [esp + 8]
    ret
InternalMemZeroMem  ENDP

    END

LiaoMi

"cpuid" before "rdtsc" - https://newbedev.com/cpuid-before-rdtsc

It's to prevent out-of-order execution. From a link that has now disappeared from the web (but which was fortuitously copied here before it disappeared), this text is from an article entitled "Performance monitoring" by one John Eckerdal:
The Pentium Pro and Pentium II processors support out-of-order execution: instructions may be executed in a different order than you programmed them. This can be a source of errors if not taken care of.
To prevent this, the programmer must serialize the instruction queue. This can be done by inserting a serializing instruction, like the CPUID instruction, before the RDTSC instruction.

Two reasons:

1. As paxdiablo says, when the CPU sees a CPUID opcode it makes sure all the previous instructions are executed, and the CPUID itself completes, before any subsequent instructions execute. Without such an instruction, the CPU execution pipeline may end up reading the TSC before the instruction(s) you'd like to time have run.

2. A significant proportion of machines fail to synchronise the TSC registers across cores. If you want to read it from the horse's mouth, knock yourself out at http://msdn.microsoft.com/en-us/library/ee417693%28VS.85%29.aspx. So, when measuring an interval between TSC readings, unless they're taken on the same core you'll have an effectively random but possibly constant (see below) offset introduced - it can easily be several seconds (yes, seconds) even soon after bootup. This effectively reflects how long the BIOS was running on a single core before kicking off the others, plus - if you have any nasty power-saving options on - increasing drift caused by cores running at different frequencies or shutting down again. So, if you haven't nailed the threads reading TSC registers to the same core, you'll need to build some kind of cross-core delta table and know the core id (which is returned by CPUID) of each TSC sample in order to compensate for this offset. That's another reason you can see CPUID alongside RDTSC, and indeed a reason why, with the newer RDTSCP, many OSes store a core id into the extra TSC_AUX[31:0] data it returns. (Available from Core i7 and Athlon 64 X2 onward, RDTSCP is a much better option in all respects: the OS normally gives you the core id as mentioned, the core id is read atomically with the TSC, and it prevents instruction reordering.)

CPUID is serializing, preventing out-of-order execution of RDTSC.

These days you can safely use LFENCE instead. It's documented as serializing on the instruction stream (but not stores to memory) on Intel CPUs, and now also on AMD after their microcode update for Spectre.

https://hadibrais.wordpress.com/2018/05/14/the-significance-of-the-x86-lfence-instruction/ explains more about LFENCE.

See also https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf for a way to use RDTSCP that keeps CPUID (or LFENCE) out of the timed region:
LFENCE     ; (or CPUID) Don't start the timed region until everything above has executed
RDTSC           ; EDX:EAX = timestamp
mov  ebx, eax   ; low 32 bits of start time

   code under test

RDTSCP     ; built-in one way barrier stops it from running early
LFENCE     ; (or CPUID) still use a barrier after to prevent anything weird
sub  eax, ebx   ; low 32 bits of end-start
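
The same pattern with compiler intrinsics, for anyone timing from C; a sketch, with code_under_test() as a placeholder of my own:

#include <stdint.h>
#include <x86intrin.h>              // _mm_lfence, __rdtsc, __rdtscp

extern void code_under_test(void);  // placeholder for the code being measured

uint64_t time_region(void)
{
    unsigned int aux;
    _mm_lfence();                   // don't start until everything above has executed
    uint64_t start = __rdtsc();

    code_under_test();

    uint64_t end = __rdtscp(&aux);  // one-way barrier: waits for the timed code to finish
    _mm_lfence();                   // keep later instructions out of the timed region
    return end - start;
}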

hutch--

I have generally found that the combination of CPUID and RDTSC stabilises timings and improves the accuracy of benchmarking.

LiaoMi

@Hutch: I didn't know before why it was so  :thumbsup:

movntps + sfence - Example
xorps           macro   XMMReg1, XMMReg2
                db      0FH, 057H, 0C0H + (XMMReg1 * 8) + XMMReg2
                endm

movntps         macro   GeneralReg, Offset, XMMReg
                db      0FH, 02BH, 040H + (XmmReg * 8) + GeneralReg, Offset
                endm

sfence          macro
                db      0FH, 0AEH, 0F8H
                endm

movaps_load     macro   XMMReg, GeneralReg
                db      0FH, 028H, (XMMReg * 8) + 4, (4 * 8) + GeneralReg
                endm

movaps_store    macro   GeneralReg, XMMReg
                db      0FH, 029H, (XMMReg * 8) + 4, (4 * 8) + GeneralReg
                endm

;
; Register Definitions (for instruction macros).
;

rEAX            equ     0
rECX            equ     1
rEDX            equ     2
rEBX            equ     3
rESP            equ     4
rEBP            equ     5
rESI            equ     6
rEDI            equ     7
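
For anyone decoding those hand-assembled macros: the third byte is simply a ModRM byte with mod=01 (8-bit displacement), reg = XMM register, rm = base register (so it assumes the base needs no SIB byte, i.e. it is not esp). A quick C illustration of the bytes the movntps macro emits, my own example:

#include <stdio.h>

// Show the bytes the movntps macro above generates: 0F 2B, ModRM, disp8.
static void emit_movntps(int general_reg, int offset, int xmm_reg)
{
    const char *regs[] = {"eax","ecx","edx","ebx","esp","ebp","esi","edi"};
    unsigned char modrm = 0x40 + xmm_reg * 8 + general_reg;   // mod=01, reg=xmm, rm=base
    printf("movntps [%s+%d], xmm%d -> 0F 2B %02X %02X\n",
           regs[general_reg], offset, xmm_reg, modrm, (unsigned char)offset);
}

int main(void)
{
    emit_movntps(1 /* rECX */, 16, 0);   // prints: movntps [ecx+16], xmm0 -> 0F 2B 41 10
    return 0;
}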


Test Proc

        sti                                     ; reenable context switching
        movaps_store rESP, 0                    ; save xmm0
        mov ecx, Dest
        call XMMZeroPage                        ; zero MEM
        movaps_load  0, rESP                    ; restore xmm

Test ENDP


XMMZeroPage Proc

        xorps   0, 0                            ; zero xmm0 (128 bits)
        mov     eax, SIZE                       ; Number of Iterations

inner:

        movntps rECX, 0,  0                     ; store bytes  0 - 15
        movntps rECX, 16, 0                     ;             16 - 31
        movntps rECX, 32, 0                     ;             32 - 47
        movntps rECX, 48, 0                     ;             48 - 63

        add     ecx, 64                         ; increment base
        dec     eax                             ; decrement loop count
        jnz     short inner

        ; Force all stores to complete before any other
        ; stores from this processor.

        sfence

ifndef SFENCE_IS_NOT_BUSTED

        ; the next uncached write to this processor's apic
        ; may fail unless the store pipes have drained.  sfence by
        ; itself is not enough.   Force drainage now by doing an
        ; interlocked exchange.

        xchg    [esp-4], eax

endif

        ret

XMMZeroPage ENDP


Intel memory ordering, fence instructions, and atomic operations - https://peeterjoot.wordpress.com/2009/12/04/intel-memory-ordering-fence-instructions-and-atomic-operations/
MFENCE and LFENCE micro-architectural implementation (Patent) - https://patents.google.com/patent/US6678810B1/en or https://patentimages.storage.googleapis.com/d4/fd/41/fd35729a18a3cd/US6678810.pdf
MFENCE and LFENCE micro-architectural implementation method and system - https://patents.google.com/patent/US6651151B2/en or https://patentimages.storage.googleapis.com/fe/41/a3/ddea1fb5732c17/US6651151.pdf

Why is (or isn't?) SFENCE + LFENCE equivalent to MFENCE?

x86 fence instructions can be briefly described as follows:
MFENCE prevents any later loads or stores from becoming globally observable before any earlier loads or stores. It drains the store buffer before later loads can execute.
LFENCE blocks instruction dispatch (Intel's terminology) until all earlier instructions retire. This is currently implemented by draining the ROB (ReOrder Buffer) before later instructions can issue into the back-end.
SFENCE only orders stores against other stores, i.e. it prevents NT stores from committing from the store buffer ahead of SFENCE itself. Otherwise SFENCE is just like a plain store that moves through the store buffer. Think of it like a divider on a grocery-store checkout conveyor belt that stops NT stores from being grabbed early. It does not necessarily force the store buffer to be drained before it retires from the ROB, so putting LFENCE after it doesn't add up to MFENCE.
A "serializing instruction" like CPUID (and IRET, etc.) drains everything (ROB, store buffer) before later instructions can issue into the back-end. MFENCE + LFENCE would also do that, but true serializing instructions might also have other effects; I don't know.

Memory Reordering Caught in the Act - https://preshing.com/20120515/memory-reordering-caught-in-the-act/
Does the Intel Memory Model make SFENCE and LFENCE redundant? - https://stackoverflow.com/questions/32705169/does-the-intel-memory-model-make-sfence-and-lfence-redundant/32705560#32705560

hutch--

If I am timing something really critical, I use the API SleepEx() to pause the thread for about 100 ms to try and get the start of a time slice.
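
Something like the following, I assume (my sketch of the idea, not Hutch's actual code):

#include <windows.h>

// Give up the remainder of the current quantum and settle for ~100 ms,
// so the measured code starts near the beginning of a fresh time slice.
static void settle_before_timing(void)
{
    SleepEx(100, FALSE);            // non-alertable sleep
}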

jj2007

Quote from: hutch-- on December 11, 2021, 02:16:41 AM
I have generally found that the combination of CPUID and RDTSC stabilises timings and improves the accuracy of benchmarking.

\Masm32\macros\timers.asm
        xor   eax, eax        ;; Use same CPUID input value for each call
        cpuid                 ;; Flush pipe & wait for pending ops to finish
        rdtsc                 ;; Read Time Stamp Counter


Michael Webster :thumbsup:

nidud

deleted

daydreamer

Quote from: hutch-- on December 10, 2021, 10:33:46 PM
Intel Manual

66 0F E7 /r
MOVNTDQ m128, xmm1
A V/V SSE2 Move packed integer values in xmm1 to m128 using nontemporal hint.

There is old advice that using it for writes to VRAM over PCI Express to the GPU is faster, but I forgot to time the different alternatives when ddraw blends lots of circles; using movaps looked very fast.
Note the 66 prefix: most packed SSE2 instructions are one byte bigger than the SSE versions, so I wonder how many more instructions would fit in a 64-byte cache line with the SSE versions instead?
my non-asm creations
https://masm32.com/board/index.php?topic=6937.msg74303#msg74303
I am an Invoker
"An Invoker is a mage who specializes in the manipulation of raw and elemental energies."
Like SIMD coding

hutch--

magnus,

I think you have missed something here: the mnemonic "movntdq" is designed to be used in conjunction with an instruction like either "movdqa" or "movdqu", where they are used to load memory through the cache and "movntdq" is used to write back to memory, bypassing the cache. The reduction in cache pollution generally yields an improvement in performance.
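
In intrinsics terms the pattern Hutch describes looks roughly like this (my sketch, not his code; it assumes dst is 16-byte aligned and count is a multiple of 16):

#include <emmintrin.h>              // _mm_loadu_si128, _mm_stream_si128, _mm_sfence
#include <stddef.h>

// Cached loads (movdqu) paired with non-temporal stores (movntdq):
// the source streams through the cache, the destination bypasses it.
void nt_copy(void *dst, const void *src, size_t count)
{
    __m128i       *d = (__m128i *)dst;
    const __m128i *s = (const __m128i *)src;
    for (size_t i = 0; i < count / 16; i++) {
        __m128i v = _mm_loadu_si128(&s[i]);   // movdqu: source may be unaligned
        _mm_stream_si128(&d[i], v);           // movntdq: destination must be 16-byte aligned
    }
    _mm_sfence();                             // drain the NT stores (the asm in this thread uses mfence)
}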

jj2007

Quote from: hutch-- on December 12, 2021, 02:05:43 PM
"movntdq" is designed to be used in conjunction with an instruction like either "movdqa" or "movdqu"

I had movaps before, but even with movdqa or movdqu it won't become any faster. Mysterious :rolleyes:

Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz (SSE4)

63929   cycles for 100 * rep movsb
95770   cycles for 100 * rep movsd
209628  cycles for 100 * movlps qword ptr [esi+8*ecx]
120512  cycles for 100 * movaps xmm0, oword ptr [esi]
176887  cycles for 100 * movdqa + movntdq
175955  cycles for 100 * movdqu + movntdq

65768   cycles for 100 * rep movsb
64697   cycles for 100 * rep movsd
206155  cycles for 100 * movlps qword ptr [esi+8*ecx]
122034  cycles for 100 * movaps xmm0, oword ptr [esi]
174827  cycles for 100 * movdqa + movntdq
176240  cycles for 100 * movdqu + movntdq

65109   cycles for 100 * rep movsb
64308   cycles for 100 * rep movsd
208594  cycles for 100 * movlps qword ptr [esi+8*ecx]
120838  cycles for 100 * movaps xmm0, oword ptr [esi]
176082  cycles for 100 * movdqa + movntdq
176391  cycles for 100 * movdqu + movntdq

65057   cycles for 100 * rep movsb
64755   cycles for 100 * rep movsd
206689  cycles for 100 * movlps qword ptr [esi+8*ecx]
121700  cycles for 100 * movaps xmm0, oword ptr [esi]
175600  cycles for 100 * movdqa + movntdq
176981  cycles for 100 * movdqu + movntdq

19      bytes for rep movsb
19      bytes for rep movsd
29      bytes for movlps qword ptr [esi+8*ecx]
34      bytes for movaps xmm0, oword ptr [esi]
36      bytes for movdqa + movntdq
36      bytes for movdqu + movntdq

LiaoMi

Quote from: jj2007 on December 12, 2021, 08:45:19 PM
Quote from: hutch-- on December 12, 2021, 02:05:43 PM
"movntdq" is designed to be used in conjunction with an instruction like either "movdqa" or "movdqu"

I had movaps before, but even with movdqa or movdqu it won't become any faster. Mysterious :rolleyes:


Hi jj2007,

please add two more examples from here http://masm32.com/board/index.php?topic=9691.msg106286#msg106286

"movntdq + mfence"
@@:
    movdqu  xmm0, [esi]                 ; esi may not be 16-bytes aligned
    movntdq [edi], xmm0                 ; edi should be 16-bytes aligned
    add     esi, 16
    add     edi, 16
    loop    @B
    mfence


11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz (SSE4)

17005   cycles for 100 * rep movsb
16922   cycles for 100 * rep movsd
106248  cycles for 100 * movlps qword ptr [esi+8*ecx]
41768   cycles for 100 * movaps xmm0, oword ptr [esi]
56037   cycles for 100 * movdqa + movntdq
55746   cycles for 100 * movdqu + movntdq

16797   cycles for 100 * rep movsb
17090   cycles for 100 * rep movsd
105885  cycles for 100 * movlps qword ptr [esi+8*ecx]
42111   cycles for 100 * movaps xmm0, oword ptr [esi]
56001   cycles for 100 * movdqa + movntdq
56026   cycles for 100 * movdqu + movntdq

17075   cycles for 100 * rep movsb
16702   cycles for 100 * rep movsd
107414  cycles for 100 * movlps qword ptr [esi+8*ecx]
41896   cycles for 100 * movaps xmm0, oword ptr [esi]
56205   cycles for 100 * movdqa + movntdq
56293   cycles for 100 * movdqu + movntdq

16736   cycles for 100 * rep movsb
17064   cycles for 100 * rep movsd
105788  cycles for 100 * movlps qword ptr [esi+8*ecx]
41915   cycles for 100 * movaps xmm0, oword ptr [esi]
56349   cycles for 100 * movdqa + movntdq
56819   cycles for 100 * movdqu + movntdq

19      bytes for rep movsb
19      bytes for rep movsd
29      bytes for movlps qword ptr [esi+8*ecx]
34      bytes for movaps xmm0, oword ptr [esi]
36      bytes for movdqa + movntdq
36      bytes for movdqu + movntdq


--- ok ---

TimoVJL

AMD Ryzen 5 3400G with Radeon Vega Graphics     (SSE4)

62547   cycles for 100 * rep movsb
63471   cycles for 100 * rep movsd
119633  cycles for 100 * movlps qword ptr [esi+8*ecx]
60383   cycles for 100 * movaps xmm0, oword ptr [esi]
120757  cycles for 100 * movdqa + movntdq
115172  cycles for 100 * movdqu + movntdq

63334   cycles for 100 * rep movsb
62718   cycles for 100 * rep movsd
118873  cycles for 100 * movlps qword ptr [esi+8*ecx]
60457   cycles for 100 * movaps xmm0, oword ptr [esi]
112820  cycles for 100 * movdqa + movntdq
116539  cycles for 100 * movdqu + movntdq

62664   cycles for 100 * rep movsb
63786   cycles for 100 * rep movsd
119998  cycles for 100 * movlps qword ptr [esi+8*ecx]
57309   cycles for 100 * movaps xmm0, oword ptr [esi]
118881  cycles for 100 * movdqa + movntdq
112190  cycles for 100 * movdqu + movntdq

63090   cycles for 100 * rep movsb
63073   cycles for 100 * rep movsd
118692  cycles for 100 * movlps qword ptr [esi+8*ecx]
59713   cycles for 100 * movaps xmm0, oword ptr [esi]
117861  cycles for 100 * movdqa + movntdq
117263  cycles for 100 * movdqu + movntdq

19      bytes for rep movsb
19      bytes for rep movsd
29      bytes for movlps qword ptr [esi+8*ecx]
34      bytes for movaps xmm0, oword ptr [esi]
36      bytes for movdqa + movntdq
36      bytes for movdqu + movntdq
May the source be with you

jj2007

Quote from: LiaoMi on December 12, 2021, 11:31:29 PM
please add two more examples from here http://masm32.com/board/index.php?topic=9691.msg106286#msg106286

"movntdq + mfence"
@@:
    movdqu  xmm0, [esi]                 ; esi may not be 16-bytes aligned
    movntdq [edi], xmm0                 ; edi should be 16-bytes aligned
    add     esi, 16
    add     edi, 16
    loop    @B
    mfence

I'm not impressed...
Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz (SSE4)

65904   cycles for 100 * rep movsb
70679   cycles for 100 * rep movsd
207177  cycles for 100 * movlps qword ptr [esi+8*ecx]
121524  cycles for 100 * movaps xmm0, oword ptr [esi]
191206  cycles for 100 * movdqa + movntdq
194912  cycles for 100 * movdqu + movntdq
197640  cycles for 100 * movdqu + movntdq + mfence

66396   cycles for 100 * rep movsb
64295   cycles for 100 * rep movsd
207218  cycles for 100 * movlps qword ptr [esi+8*ecx]
121237  cycles for 100 * movaps xmm0, oword ptr [esi]
192188  cycles for 100 * movdqa + movntdq
193955  cycles for 100 * movdqu + movntdq
195811  cycles for 100 * movdqu + movntdq + mfence

65465   cycles for 100 * rep movsb
ID 10616    1 MB at 14:01:30  12.12.2021 13:25:28 wb    0 = 11553508 /  1 h    0 MB/day  firefox.exe
63888   cycles for 100 * rep movsd
209074  cycles for 100 * movlps qword ptr [esi+8*ecx]
122465  cycles for 100 * movaps xmm0, oword ptr [esi]
190494  cycles for 100 * movdqa + movntdq
192326  cycles for 100 * movdqu + movntdq
198034  cycles for 100 * movdqu + movntdq + mfence

65560   cycles for 100 * rep movsb
65119   cycles for 100 * rep movsd
206794  cycles for 100 * movlps qword ptr [esi+8*ecx]
121545  cycles for 100 * movaps xmm0, oword ptr [esi]
191100  cycles for 100 * movdqa + movntdq
196902  cycles for 100 * movdqu + movntdq
197136  cycles for 100 * movdqu + movntdq + mfence

19      bytes for rep movsb
19      bytes for rep movsd
29      bytes for movlps qword ptr [esi+8*ecx]
34      bytes for movaps xmm0, oword ptr [esi]
36      bytes for movdqa + movntdq
36      bytes for movdqu + movntdq
39      bytes for movdqu + movntdq + mfence