News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests
NB: Posting URL's See here: Posted URL Change

Main Menu

AVX2 __m256i (ymmword) variable byte shift.

Started by InfiniteLoop, February 18, 2021, 06:46:05 AM

Previous topic - Next topic

InfiniteLoop

While looking through the Intel Intrinsics Guide I noticed there is no such function:
__m256i Shift(__m256i a, __m128i count)
meaning to shift a right/left by count number of bytes while shifting in zeros.

What is fastest?
Also, I discovered what looks like a serious bug in MASM. Why won't fixed addresses / labels work? It's having none of it. The address MUST be put into a register first or it crashes.
i.e. I want to write ymmword ptr [label + rax*4], but I have to use mov reg, OFFSET label followed by ymmword ptr [reg + rax*4] instead.


;================================
;1.A
;================================
; Variant 1.A: self-modifying byte shift.
; Loads 32 bytes from [rcx], then patches the immediate operand of the
; vpalignr below with dl before executing it, so the shift count is chosen at
; run time. The segment is declared READ WRITE EXECUTE so that the store into
; the code succeeds.
; presumably Microsoft x64: rcx = pointer to the source, dl = byte count,
; result returned in ymm0 -- confirm against the caller's prototype.
; NOTE(review): the patched instruction is shared state, so this looks unsafe
; if two threads call it concurrently -- confirm.
MyCode SEGMENT READ WRITE EXECUTE ALIGN(4096)
ASM_Mod proc
vmovdqu ymm0, ymmword ptr [rcx] ; ymm0 = 32 source bytes (unaligned load)
vextracti128 xmm1, ymm0, 1 ; xmm1 = upper 128 bits of the source
; vpalignr ymm0,ymm1,ymm0,imm8 encodes as 6 bytes with imm8 last, so
; CodeChange + 5 is the address of the immediate.
mov byte ptr [CodeChange + 5], dl
; vpalignr works per 128-bit lane: the low lane shifts in bytes from xmm1
; (the source's upper half), the high lane shifts in from the upper half of ymm1.
CodeChange: vpalignr ymm0, ymm1, ymm0, 0
ret
ASM_Mod endp
MyCode ENDS

;================================
;1.B
;================================
; Variant 1.B: same self-modifying byte shift as 1.A, but the patched
; instruction lives in its own READ WRITE EXECUTE segment (MyCode) while the
; entry code sits in a READ EXECUTE segment (CodeStart).
; presumably Microsoft x64: rcx = pointer to the source, dl = byte count,
; result returned in ymm0 -- confirm against the caller's prototype.
CodeStart SEGMENT READ EXECUTE ALIGN(4096)
ASM_Mod proc
vmovdqu ymm0, ymmword ptr [rcx] ; ymm0 = 32 source bytes (unaligned load)
vextracti128 xmm1, ymm0, 1 ; xmm1 = upper 128 bits of the source
; The vpalignr at CodeChange is 6 bytes long with its imm8 last, so
; CodeChange + 5 is the address of the immediate being overwritten.
mov byte ptr [CodeChange + 5], dl
mov rax, OFFSET CodeChange ; absolute address of the patched instruction
jmp rax ; tail-jump; the ret after CodeChange returns to our caller
ASM_Mod endp
CodeStart ENDS

; Writable + executable segment holding only the instruction that gets patched.
MyCode SEGMENT READ WRITE EXECUTE ALIGN(4096)
CodeChange: vpalignr ymm0, ymm1, ymm0, 0 ; imm8 (last byte) is rewritten by ASM_Mod
ret
MyCode ENDS

;================================
;2.
;================================
; Variant 2: byte shift via a computed jump into 16 fixed-size cases.
; Loads 32 bytes from [rcx] and jumps to _J0 + 8*rdx; case N executes
; vpalignr with immediate N and returns with the result in ymm0.
; Layout invariant: every case must assemble to exactly 8 bytes
; (nop = 1, vpalignr ymm,ymm,ymm,imm8 = 6, ret = 1; case 0 is 7 nops + ret).
; Do not insert, remove or re-encode instructions between _J0 and _J15
; without re-checking that stride.
; presumably Microsoft x64: rcx = pointer to the source, rdx = byte count --
; confirm against the caller's prototype.
; NOTE(review): there is no range check; rdx > 15 jumps past _J15.
; NOTE(review): all 64 bits of rdx feed the address; if the caller passes a
; 32-bit count, confirm the upper half is zero.
ASM_Switch proc
vmovdqu ymm0, ymmword ptr [rcx] ; ymm0 = 32 source bytes (unaligned load)
vextracti128 xmm1, ymm0, 1 ; xmm1 = upper 128 bits of the source
mov rcx, OFFSET _J0 ; base address of the case table (rcx is reused here)
lea rax, [rcx + 8*rdx] ; target = _J0 + 8 * count
jmp rax
; Case 0: nothing to shift, ymm0 is returned as loaded. Padded to 8 bytes.
_J0:
nop
nop
nop
nop
nop
nop
nop
ret
; Cases 1..15: one padding nop, the shift by N bytes, then return.
_J1:
nop
vpalignr ymm0, ymm1, ymm0, 1
ret
_J2:
nop
vpalignr ymm0, ymm1, ymm0, 2
ret
_J3:
nop
vpalignr ymm0, ymm1, ymm0, 3
ret
_J4:
nop
vpalignr ymm0, ymm1, ymm0, 4
ret
_J5:
nop
vpalignr ymm0, ymm1, ymm0, 5
ret
_J6:
nop
vpalignr ymm0, ymm1, ymm0, 6
ret
_J7:
nop
vpalignr ymm0, ymm1, ymm0, 7
ret
_J8:
nop
vpalignr ymm0, ymm1, ymm0, 8
ret
_J9:
nop
vpalignr ymm0, ymm1, ymm0, 9
ret
_J10:
nop
vpalignr ymm0, ymm1, ymm0, 10
ret
_J11:
nop
vpalignr ymm0, ymm1, ymm0, 11
ret
_J12:
nop
vpalignr ymm0, ymm1, ymm0, 12
ret
_J13:
nop
vpalignr ymm0, ymm1, ymm0, 13
ret
_J14:
nop
vpalignr ymm0, ymm1, ymm0, 14
ret
_J15:
nop
vpalignr ymm0, ymm1, ymm0, 15
ret
ASM_Switch endp
;================================
;3.
;================================
;-----------------------------------------------------------------------
; TestFunc - shift a 256-bit value right by x bytes, shifting in zeros.
;
; Method: split x into whole dwords (x / 4) and leftover bytes (x mod 4).
;   lo = vpermd(src, i + x/4)     >> 8*(x mod 4)        ; bits from dword i + x/4
;   hi = vpermd(src, i + x/4 + 1) << (32 - 8*(x mod 4)) ; bits carried in from the next dword
;   result = (lo | hi) with the top x bytes cleared via the mask_index table.
; vpermd only looks at the low 3 bits of each index, so indices wrap
; modulo 8; every wrapped element lands in the top x bytes and is cleared.
;
; ABI:   presumably Microsoft x64, leaf function, no stack use -- confirm
;        against the caller's prototype.
; In:    rcx = pointer to 32 source bytes (no alignment required)
;        edx = x, byte count, must be 0..32 (larger values read past
;              mask_index)
; Out:   ymm0 = shifted value
; Clobb: rax, rcx, rdx, r8, r9, r10, ymm1-ymm5, flags
;-----------------------------------------------------------------------
TestFunc proc
        vmovdqu ymm0, ymmword ptr [rcx]         ; ymm0 = source
        vmovdqu ymm5, ymmword ptr [permd_index] ; ymm5 = {0,1,...,7}

        mov     r10d, edx                       ; r10 = x (zero-extended), mask offset
        mov     ecx, edx
        and     ecx, 3                          ; ecx = x mod 4
        lea     r8d, [rcx*8]                    ; r8d = bit shift = 8*(x mod 4)
        mov     eax, 32
        sub     eax, r8d                        ; eax = 32 - 8*(x mod 4)
        shr     edx, 2                          ; edx = x / 4 (whole dwords)
        lea     r9d, [rdx + 1]                  ; r9d = x / 4 + 1

        ; lo: dword i <- src[i + x/4] >> bit shift
        vmovd   xmm1, edx
        vmovd   xmm2, r8d
        vpbroadcastd ymm1, xmm1
        vpaddd  ymm1, ymm1, ymm5
        vpermd  ymm3, ymm1, ymm0
        vpsrld  ymm3, ymm3, xmm2

        ; hi: dword i <- src[i + x/4 + 1] << (32 - bit shift)
        ; A count of 32 (x mod 4 == 0) yields all zeros, which is what we want.
        vmovd   xmm1, r9d
        vmovd   xmm2, eax
        vpbroadcastd ymm1, xmm1
        vpaddd  ymm1, ymm1, ymm5
        vpermd  ymm4, ymm1, ymm0
        vpslld  ymm4, ymm4, xmm2

        ; lo has zeros in its top bits, hi has zeros in its low bits, so the
        ; two never overlap and a plain OR merges them. (A signed byte
        ; compare + blend would drop carried-in bytes >= 80h.)
        vpor    ymm0, ymm3, ymm4

        ; mask_index + x = (32 - x) bytes of 00 followed by x bytes of FF;
        ; clear the top x bytes of the result with ~mask & value.
        lea     rax, mask_index
        vmovdqu ymm1, ymmword ptr [rax + r10]
        vpandn  ymm0, ymm1, ymm0
        ret

; Constant tables, kept next to the code that uses them.
permd_index DWORD 0,1,2,3,4,5,6,7
mask_index BYTE 32 dup (0), 32 dup (255)
TestFunc endp

jj2007

You mean this type of addressing?
include \Masm32\MasmBasic\Res\JBasic.inc
_label YMMWORD 1111111111111111h, 2222222222222222h, 33333333333333333h
Init ; OPT_64 1 ; put 0 for 32 bit, 1 for 64 bit assembly
  PrintLine Chr$("This program was assembled with ", @AsmUsed$(1), " in ", jbit$, "-bit format.")
  mov rax, 4  ; 4*8=get the second element
  int 3
  vmovdqu ymm0, ymmword ptr _label[rax*8]
  Inkey "all is fine"
EndOfCode
OPT_DebugL /LARGEADDRESSAWARE:NO


Works fine with UAsm and Polink. Masm gives me "invalid data initializer" in line 2, apparently it doesn't like the YMMWORD.
Note the linker option - it seems necessary.

I've inserted the int 3 so that you can see in the debugger that ymm0 gets correctly loaded with 2222222222222222h.

See also How to load 256 bits to AVX register at once?, post by user MASM.

Note that...
  mov rax, 2 ; element 2
  imul rax, rax, 32
  vmovdqu ymm0, ymmword ptr _label[rax]

... gets wrongly encoded by UAsm as vmovdqu ymm0, ymmword ptr [rax], i.e. no _label.
AsmC handles it correctly, but you should use the /Znk assembly option, otherwise it may complain about the "type" keyword :cool:


hutch--

I don't have time to chase it up but putting the immediate into a register first is Win64, not the assembler. You could do it in Win32 but Win64 may not have the available opcode to use an OFFSET like that.

TouEnMasm


UAsm doesn't seem to have a problem compiling that:
Quote
  00000001400027B7: C5 FE 6F 01        vmovdqu     ymm0,ymmword ptr [rcx]
  00000001400027BB: C4 E3 7D 39 C1 01  vextracti128 xmm1,ymm0,1
  00000001400027C1: 88 15 11 00 00 00  mov         byte ptr [00000001400027D8h],dl
  00000001400027C7: 48 B8 D3 27 00 40  mov         rax,1400027D3h
                    01 00 00 00
  00000001400027D1: FF E0              jmp         rax
  00000001400027D3: C4 E3 75 0F C0 00  vpalignr    ymm0,ymm1,ymm0,0
  00000001400027D9: C3                 ret
  00000001400027DA: 48 83 EC 08        sub         rsp,8
  00000001400027DE: C5 FE 6F 01        vmovdqu     ymm0,ymmword ptr [rcx]


ASM_Mod proc
vmovdqu ymm0, ymmword ptr [rcx]
vextracti128 xmm1, ymm0, 1
mov byte ptr [CodeChange + 5], dl
mov rax, OFFSET CodeChange
jmp rax
ASM_Mod endp
CodeChange: vpalignr ymm0, ymm1, ymm0, 0
ret



Fa is a musical note to play with CL

InfiniteLoop

Quote from: TouEnMasm on February 18, 2021, 07:31:12 PM
there is __ll_lshift and __ll_rshift valid for Intel and AMD processor
Those are epi32 and epi64. There is no variable si256 shift by x bytes.

Quote from: jj2007 on February 18, 2021, 07:24:34 AM
You mean this type of addressing?
No I mean using fixed addresses.
e.g.
rax = some input value
;mov rdx, OFFSET Start ;should not need
;lea rax, [rdx + 8*rax] ;should not need
;jmp rax
jmp [Start+rax*8] ;absolute jmp should work no?
Start:
BYTE 7 DUP(0)
ret
BYTE 7 DUP(0)
ret


hutch--

Hi InfiniteLoop,

Looking at the code you have posted, it looks like you are converting 32 bit code to 64 bit code and the direct OFFSET included in parts of your code that worked fine in Win32 do not work in Win64 as you don't have the opcodes in the hardware to do so. In Win64 you have many more registers and the solution is to use them.

For example you could use r10 and r11 to hold the OFFSET values, which you would only need to load once; then, in your complex addressing mode notation, you use the registers instead of the direct OFFSETs. You have already done this and it is the right way to do it.

jj2007

Offset works for me:
include \Masm32\MasmBasic\Res\JBasic.inc ; ## console demo, builds in 32- or 64-bit mode with UAsm, ML, AsmC ##
_number dq 1234567890123456789
Init ; OPT_64 1 ; put 0 for 32 bit, 1 for 64 bit assembly
  PrintLine Chr$("This program was assembled with ", @AsmUsed$(1), " in ", jbit$, "-bit format.")

  mov rax, offset _number   ; <<<<<<<<<<<<<

  Inkey Str$("The number is %lli", qword ptr [rax])
EndOfCode


Disassembly: movabs rax, 0x1400012a8

Output:
This program was assembled with ml64 in 64-bit format.
The number is 1234567890123456789


See MOVABS opcode in the assembly code at SOF. There is a lot of misleading info on the web; for example, some believe that movabs is GAS-specific.

Note also:
  mov rax, offset _number ; identical result,
  lea rax, _number ; but lea is shorter


Exe with an int 3 before mov rax, offset _number attached (i.e. run with an x64 debugger).

hutch--

 :biggrin:

See if you can build the app with /LARGEADDRESSAWARE using that old mnemonic. If not, you are stuck with Win32 memory limits.

jj2007

include \Masm32\MasmBasic\Res\JBasic.inc ; ## console demo, builds in 32- or 64-bit mode with UAsm, ML, AsmC ##
_number dq 1234567890123456789
Init ; OPT_64 1 ; put 0 for 32 bit, 1 for 64 bit assembly
  PrintLine Chr$("This program was assembled with ", @AsmUsed$(1), " in ", jbit$, "-bit format.")

  mov rax, offset _number   ; <<<<<<<<<<<<<

  xchg rax, rdi
  Print Str$("The address is at %xh\n", rdi)
  Inkey Str$("The number is %lli", qword ptr [rdi])
EndOfCode


Output with /LARGEADDRESSAWARE (my default setting):
This program was assembled with ml64 in 64-bit format.
The address is at 40003018h
The number is 1234567890123456789


With /LARGEADDRESSAWARE:NO:
This program was assembled with ml64 in 64-bit format.
The address is at 403018h
The number is 1234567890123456789

hutch--

What did you miss about "using that old mnemonic" ?

six_L

Say you, Say me, Say the codes together for ever.

jj2007

Quote from: hutch-- on February 20, 2021, 02:26:56 PM
What did you miss about "using that old mnemonic" ?

No idea what you mean, sorry.

InfiniteLoop

This is the fastest I've found. 30% faster than the permd version.
Using intrinsics. Clang compiles it verbatim.
/*
 * Shift a 256-bit vector right by `count` bytes, shifting in zeros.
 *
 * a      value to shift.
 * count  number of bytes, valid range 0..16. Larger counts do not produce
 *        a correct shift (bytes wrap around inside the upper lane).
 *
 * Method: rotate each 128-bit lane right by `count` bytes with a byte
 * shuffle, then, for every byte position that wrapped (index > 15), take
 * the byte from the next-higher lane instead. For the low lane that is the
 * rotated high lane; for the high lane it is zero.
 */
__m256i shift_8_right_2(__m256i a, unsigned int count)
{
    /* Byte index 0..15 within each 128-bit lane. */
    const __m256i lane_index = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
                                               15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    const __m256i index_max = _mm256_set1_epi8(15);

    /* Source byte index for every destination byte: i + count. */
    __m256i src_index = _mm256_add_epi8(lane_index, _mm256_set1_epi8((char)count));

    /* 0xFF where the source byte lies beyond this lane, 0x00 otherwise. */
    __m256i crosses_lane = _mm256_cmpgt_epi8(src_index, index_max);

    /* Wrap into 0..15; this also keeps bit 7 clear so the shuffle never zeroes. */
    src_index = _mm256_and_si256(src_index, index_max);
    __m256i rotated = _mm256_shuffle_epi8(a, src_index);

    /*
     * Low lane <- rotated high lane, high lane <- zero. The zero must be
     * explicit: _mm256_castsi128_si256 leaves the upper 128 bits undefined,
     * and those bits are exactly what gets blended into the high lane.
     */
    __m256i from_above = _mm256_permute2x128_si256(rotated, rotated, 0x81);

    return _mm256_blendv_epi8(rotated, from_above, crosses_lane);
}


LargeAddressAware does nothing anymore. It's a 32-bit thing.


jj2007