While looking through the Intel Intrinsics Guide I noticed there is no such function:
__m256i Shift(__m256i a, __m128i count)
meaning: shift `a` right or left by `count` bytes while shifting in zeros.
What is the fastest way to implement this?
Also I ran into a serious problem in MASM: why won't a fixed address / label combined with an index register work? It's having none of it — the address MUST be put into a register first or the program crashes.
i.e. `ymmword ptr [label + rax*4]` crashes, whereas `mov reg, OFFSET label` followed by `ymmword ptr [reg + rax*4]` works.
;================================
;1.A
;================================
MyCode SEGMENT READ WRITE EXECUTE ALIGN(4096)
ASM_Mod proc
vmovdqu ymm0, ymmword ptr [rcx]
vextracti128 xmm1, ymm0, 1
mov byte ptr [CodeChange + 5], dl
CodeChange: vpalignr ymm0, ymm1, ymm0, 0
ret
ASM_Mod endp
MyCode ENDS
;================================
;1.B
;================================
CodeStart SEGMENT READ EXECUTE ALIGN(4096)
ASM_Mod proc
vmovdqu ymm0, ymmword ptr [rcx]
vextracti128 xmm1, ymm0, 1
mov byte ptr [CodeChange + 5], dl
mov rax, OFFSET CodeChange
jmp rax
ASM_Mod endp
CodeStart ENDS
MyCode SEGMENT READ WRITE EXECUTE ALIGN(4096)
CodeChange: vpalignr ymm0, ymm1, ymm0, 0
ret
MyCode ENDS
;================================
;2.
;================================
ASM_Switch proc
vmovdqu ymm0, ymmword ptr [rcx]
vextracti128 xmm1, ymm0, 1
mov rcx, OFFSET _J0
lea rax, [rcx + 8*rdx]
jmp rax
_J0:
nop
nop
nop
nop
nop
nop
nop
ret
_J1:
nop
vpalignr ymm0, ymm1, ymm0, 1
ret
_J2:
nop
vpalignr ymm0, ymm1, ymm0, 2
ret
_J3:
nop
vpalignr ymm0, ymm1, ymm0, 3
ret
_J4:
nop
vpalignr ymm0, ymm1, ymm0, 4
ret
_J5:
nop
vpalignr ymm0, ymm1, ymm0, 5
ret
_J6:
nop
vpalignr ymm0, ymm1, ymm0, 6
ret
_J7:
nop
vpalignr ymm0, ymm1, ymm0, 7
ret
_J8:
nop
vpalignr ymm0, ymm1, ymm0, 8
ret
_J9:
nop
vpalignr ymm0, ymm1, ymm0, 9
ret
_J10:
nop
vpalignr ymm0, ymm1, ymm0, 10
ret
_J11:
nop
vpalignr ymm0, ymm1, ymm0, 11
ret
_J12:
nop
vpalignr ymm0, ymm1, ymm0, 12
ret
_J13:
nop
vpalignr ymm0, ymm1, ymm0, 13
ret
_J14:
nop
vpalignr ymm0, ymm1, ymm0, 14
ret
_J15:
nop
vpalignr ymm0, ymm1, ymm0, 15
ret
ASM_Switch endp
;================================
;3.
;================================
;-----------------------------------------------------------------------
; 3. Branchless variant via vpermd + per-dword bit shifts.
; __m256i TestFunc(const __m256i *src /*rcx*/, uint32_t x /*edx*/)
; ABI:  Microsoft x64. Out: ymm0 = *src >> x bytes, zeros shifted in.
; Idea: with d = x/4 and m = x mod 4, each result dword i is
;   (src.dword[i+d] >> 8m) | (src.dword[i+d+1] << (32-8m)),
; then the top x bytes are zeroed with a mask loaded from a sliding
; window over mask_index. vpermd wraps indices mod 8, so out-of-range
; dwords contain garbage that the final byte mask removes.
; Fixes vs. original:
;   * vpbroadcastb -> vpbroadcastd: the vpermd indices are DWORDs; the
;     byte broadcast produced index dwords of b*01010101h and only
;     worked because vpermd masks each index to its low 3 bits.
;   * the two partial results are combined with vpor instead of
;     vpcmpgtb + vpblendvb: the SIGNED byte compare treated result
;     bytes >= 80h as negative and replaced them with zeros (data
;     loss). The partials are bytewise disjoint, so OR is exact.
;   * removed dead "sar ecx,2 / sub r9d,ecx" (ecx is 0..3 there, so
;     the subtrahend was always 0).
; NOTE(review): assumes x = 0..32 — the mask load reads
;   mask_index[x .. x+31], which stays in bounds only for x <= 32.
;-----------------------------------------------------------------------
TestFunc proc
vmovdqu ymm0, ymmword ptr [rcx]         ; ymm0 = 32 source bytes
vmovdqu ymm5, ymmword ptr [permd_index] ; ymm5 = {0,1,2,3,4,5,6,7}
mov ecx, edx
and ecx, 3                              ; ecx = m = x mod 4
mov eax, 32
lea r8d, [8*ecx]                        ; r8d = 8*m  (right-shift bit count)
sub eax, r8d                            ; eax = 32 - 8*m (left-shift bit count)
mov r10d, edx                           ; r10d = x, kept for the mask lookup
sar edx, 2                              ; edx = d = x / 4 (dword shift)
mov r9d, edx
inc r9d                                 ; r9d = d + 1
movd xmm1, edx
movd xmm2, r8d
vpbroadcastd ymm1, xmm1                 ; broadcast d into all 8 dword lanes
vpaddd ymm1, ymm1, ymm5                 ; indices {d, d+1, ..., d+7}
vpermd ymm3, ymm1, ymm0                 ; ymm3.dword[i] = src.dword[(i+d) & 7]
vpsrld ymm3, ymm3, xmm2                 ; low part: >> 8m bits per dword
movd xmm1, r9d
movd xmm2, eax
vpbroadcastd ymm1, xmm1                 ; broadcast d+1
vpaddd ymm1, ymm1, ymm5                 ; indices {d+1, ..., d+8}
vpermd ymm4, ymm1, ymm0                 ; ymm4.dword[i] = src.dword[(i+d+1) & 7]
mov rax, OFFSET mask_index              ; base in a register: RIP-relative can't take an index
vmovdqu ymm1, ymmword ptr [rax + r10]   ; byte mask: 0 in low 32-x bytes, FFh in top x
vpslld ymm4, ymm4, xmm2                 ; high part: << (32-8m); m=0 gives shift 32 => zeros
vxorps xmm5, xmm5, xmm5                 ; ymm5 = 0 (zero source for the final blend)
vpor ymm0, ymm3, ymm4                   ; merge disjoint partials
vpblendvb ymm0, ymm0, ymm5, ymm1        ; zero the top x bytes
ret
permd_index DWORD 0,1,2,3,4,5,6,7       ; never executed: placed after ret
mask_index BYTE 32 dup (0), 32 dup (255)
TestFunc endp