considering I didn't mess up the table:
; every10th_at7 -- 96-byte table of OR masks, 16-byte aligned.
;
; Counting bits MSB-first inside each byte, the set bits sit at positions
; 7, 17, 27, 37, 47, ... (one bit every 10).  The resulting byte pattern
; 01h,00h,40h,10h,04h repeats every 5 bytes (40 bits).
;
; Layout:
;   bytes  0..15  prolog, covers the first 16 bytes of the target buffer
;   bytes 16..55  one 40-byte block for the main loop
;   bytes 56..95  the same 40 bytes again, so that bytes 16..95 form an
;                 80-byte run that can be written in 16-byte pieces
align 16
every10th_at7:
; ---- prolog (bytes 0..15)
        db      01h,00h,40h,10h,04h,01h,00h,40h
        db      10h,04h,01h,00h,40h,10h,04h,01h
; ---- main loop block (bytes 16..55)
        db      00h,40h,10h,04h,01h,00h,40h,10h
        db      04h,01h,00h,40h,10h,04h,01h,00h
        db      40h,10h,04h,01h,00h,40h,10h,04h
        db      01h,00h,40h,10h,04h,01h,00h,40h
        db      10h,04h,01h,00h,40h,10h,04h,01h
; ---- unrolled copy for writes in multiples of 16 bytes (bytes 56..95)
        db      00h,40h,10h,04h,01h,00h,40h,10h
        db      04h,01h,00h,40h,10h,04h,01h,00h
        db      40h,10h,04h,01h,00h,40h,10h,04h
        db      01h,00h,40h,10h,04h,01h,00h,40h
        db      10h,04h,01h,00h,40h,10h,04h,01h
; 64-bit variant: OR the every10th_at7 masks over 416 bytes at _buff1
; (a 16-byte prolog followed by 10 blocks of 40 bytes).
; Writes rax, rdx, rdi, rbx, rbp, rcx (via ecx), rsi and the flags.
        mov     rsi, _buff1                     ; rsi -> buffer

; ---- prolog: table bytes 0..15 onto buffer bytes 0..15
        mov     rax, [every10th_at7]
        or      [rsi], rax
        mov     rdx, [every10th_at7+8]
        or      [rsi+8], rdx
        add     rsi, 16                         ; continue right after the prolog

; ---- loop setup
        mov     ecx, 10                         ; number of 40-byte blocks
        ; The five masks below are table bytes 16..55.  They stay in
        ; registers for the whole loop because the pattern repeats
        ; exactly every 40 bytes.
        mov     rax, [every10th_at7+16]
        mov     rdx, [every10th_at7+24]
        mov     rdi, [every10th_at7+32]
        mov     rbx, [every10th_at7+40]
        mov     rbp, [every10th_at7+48]

; ---- main loop: one 40-byte block per pass
@@:
        or      [rsi], rax
        or      [rsi+8], rdx
        or      [rsi+16], rdi
        or      [rsi+24], rbx
        or      [rsi+32], rbp
        add     rsi, 40                         ; advance to the next block
        dec     ecx
        jnz     @b
Solution 2 illustrates the limitations of "x86 32-bit mode" and "POR":
; 32-bit variant: OR the every10th_at7 masks over 416 bytes at _buff1
; (a 16-byte prolog followed by 5 blocks of 80 bytes).
; _buff1 must be 16-byte aligned: movdqa and por access [esi+...] directly.
; Writes eax, edx, ecx, ebx, edi, esi, xmm0..xmm7 and the flags.
        mov     esi, _buff1                     ; esi -> 16-byte aligned buffer

; ---- prolog: table bytes 0..15 onto buffer bytes 0..15, one dword at a time
        mov     eax, [every10th_at7]
        or      [esi], eax
        mov     edx, [every10th_at7+4]
        or      [esi+4], edx
        mov     ecx, [every10th_at7+8]
        or      [esi+8], ecx
        mov     edi, [every10th_at7+12]
        or      [esi+12], edi
        add     esi, 16                         ; still 16-byte aligned

; ---- loop setup
        mov     ecx, 5                          ; number of 80-byte blocks
        ; Table bytes 16..79 are kept in four xmm registers ...
        movdqa  xmm0, [every10th_at7+16]
        movdqa  xmm1, [every10th_at7+32]
        movdqa  xmm2, [every10th_at7+48]
        movdqa  xmm3, [every10th_at7+64]
        ; ... and table bytes 80..95 in four general-purpose registers.
        mov     eax, [every10th_at7+80]
        mov     edx, [every10th_at7+84]
        mov     ebx, [every10th_at7+88]
        mov     edi, [every10th_at7+92]

; ---- main loop: one 80-byte block per pass
@@:
        ; por cannot write to memory, so the first 64 bytes take a
        ; load / por / store round trip through xmm4..xmm7.
        movdqa  xmm4, [esi]
        movdqa  xmm5, [esi+16]
        movdqa  xmm6, [esi+32]
        movdqa  xmm7, [esi+48]
        por     xmm4, xmm0
        por     xmm5, xmm1
        por     xmm6, xmm2
        por     xmm7, xmm3
        ; The last 16 bytes of the block are OR-ed in place with dword masks.
        or      [esi+64], eax
        or      [esi+68], edx
        or      [esi+72], ebx
        or      [esi+76], edi
        movdqa  [esi], xmm4
        movdqa  [esi+16], xmm5
        movdqa  [esi+32], xmm6
        movdqa  [esi+48], xmm7
        add     esi, 80                         ; a multiple of 16, alignment is kept
        dec     ecx
        jnz     @b
EDIT: this code had incorrect offsets:
; Loads five consecutive qwords from the start of the table
; (offsets 0..32) into rax, rdx, rdi, rbx and rbp.
; NOTE(review): presumably these are the superseded loads that the EDIT
; note refers to -- the 64-bit main loop fills the same registers from
; offsets +16..+48 instead. Confirm before reusing these lines.
mov rax, [every10th_at7]
mov rdx, [every10th_at7+8]
mov rdi, [every10th_at7+16]
mov rbx, [every10th_at7+24]
mov rbp, [every10th_at7+32]