Author Topic: AVX2 __m256i (ymmword) variable byte shift.  (Read 2120 times)

InfiniteLoop

  • Regular Member
  • *
  • Posts: 17
AVX2 __m256i (ymmword) variable byte shift.
« on: February 18, 2021, 06:46:05 AM »
While looking through the Intel Intrinsics Guide I noticed there is no such function:
 __m256i Shift(__m256i a, __m128i count)
 meaning to shift a right/left by count number of bytes while shifting in zeros.

What is fastest?
Also I discovered a serious bug in MASM. Why won't fixed addresses / labels work? It's having none of it. It MUST be put into a register first or it crashes.
i.e. ymmword ptr [label + rax*4] instead of mov reg, OFFSET label and ymmword ptr[reg + rax*4]

Code: [Select]
;================================
;1.A
;================================
; ASM_Mod (variant 1.A): byte shift implemented with self-modifying code.
; The procedure lives in a READ WRITE EXECUTE segment so that the store below
; can overwrite the immediate operand of the vpalignr that follows it.
; In:  rcx = address of the 32 bytes to load
;      dl  = value patched into the vpalignr immediate (the byte count)
; Out: ymm0 = result of the patched vpalignr; ymm1 is overwritten.
; NOTE(review): the patched instruction is shared by every caller, so
; concurrent calls would race on it - confirm this is single-threaded only.
MyCode SEGMENT READ WRITE EXECUTE ALIGN(4096)
ASM_Mod proc
vmovdqu ymm0, ymmword ptr [rcx]
; xmm1 = upper 128 bits of the input, used as the "shift in" source
vextracti128 xmm1, ymm0, 1
; vpalignr ymm,ymm,ymm,imm8 is 6 bytes long; its last byte (offset 5) is imm8
mov byte ptr [CodeChange + 5], dl
; the immediate 0 below is only a placeholder, replaced by the store above
CodeChange: vpalignr ymm0, ymm1, ymm0, 0
ret
ASM_Mod endp
MyCode ENDS

;================================
;1.B
;================================
; ASM_Mod (variant 1.B): same self-modifying idea as 1.A, but split in two
; segments: the entry code is READ EXECUTE only, and only the patched
; instruction sits in a READ WRITE EXECUTE segment.
; In:  rcx = address of the 32 bytes to load
;      dl  = value patched into the vpalignr immediate (the byte count)
; Out: ymm0 = result of the patched vpalignr; rax and ymm1 are overwritten.
; NOTE(review): the patched instruction is shared by every caller, so
; concurrent calls would race on it - confirm this is single-threaded only.
CodeStart SEGMENT READ EXECUTE ALIGN(4096)
ASM_Mod proc
vmovdqu ymm0, ymmword ptr [rcx]
; xmm1 = upper 128 bits of the input, used as the "shift in" source
vextracti128 xmm1, ymm0, 1
; overwrite the imm8 (last byte, offset 5) of the vpalignr at CodeChange
mov byte ptr [CodeChange + 5], dl
; the address is loaded into a register first, then reached by an indirect jump
mov rax, OFFSET CodeChange
jmp rax
ASM_Mod endp
CodeStart ENDS

MyCode SEGMENT READ WRITE EXECUTE ALIGN(4096)
; the immediate 0 below is only a placeholder, replaced at run time by ASM_Mod
CodeChange: vpalignr ymm0, ymm1, ymm0, 0
; returns directly to ASM_Mod's caller (ASM_Mod jumped here, it did not call)
ret
MyCode ENDS

;================================
;2.
;================================
; ASM_Switch (variant 2): byte shift via a computed jump into a run of
; fixed-size code stubs, one stub per shift count.
; In:  rcx = address of the 32 bytes to load
;      rdx = stub index (the byte count)
; Out: ymm0 = result of the selected stub; rax, rcx and ymm1 are overwritten.
; Layout invariant: every stub _Jn is exactly 8 bytes (padding nop(s) +
; 6-byte vpalignr + 1-byte ret), so stub n starts at _J0 + 8*n. Changing any
; stub's size breaks the dispatch.
; NOTE(review): only stubs 0..15 exist and rdx is used unchecked (all 64 bits
; enter the address computation) - confirm callers guarantee 0..15.
ASM_Switch proc
vmovdqu ymm0, ymmword ptr [rcx]
; xmm1 = upper 128 bits of the input, used as the "shift in" source
vextracti128 xmm1, ymm0, 1
; rax = _J0 + 8*rdx = address of the stub for this count
mov rcx, OFFSET _J0
lea rax, [rcx + 8*rdx]
jmp rax
; count 0: nothing to do, ymm0 is returned unchanged (7 nops pad the stub to 8 bytes)
_J0:
nop
nop
nop
nop
nop
nop
nop
ret
; counts 1..15: one padding nop, then vpalignr with the count as immediate
_J1:
nop
vpalignr ymm0, ymm1, ymm0, 1
ret
_J2:
nop
vpalignr ymm0, ymm1, ymm0, 2
ret
_J3:
nop
vpalignr ymm0, ymm1, ymm0, 3
ret
_J4:
nop
vpalignr ymm0, ymm1, ymm0, 4
ret
_J5:
nop
vpalignr ymm0, ymm1, ymm0, 5
ret
_J6:
nop
vpalignr ymm0, ymm1, ymm0, 6
ret
_J7:
nop
vpalignr ymm0, ymm1, ymm0, 7
ret
_J8:
nop
vpalignr ymm0, ymm1, ymm0, 8
ret
_J9:
nop
vpalignr ymm0, ymm1, ymm0, 9
ret
_J10:
nop
vpalignr ymm0, ymm1, ymm0, 10
ret
_J11:
nop
vpalignr ymm0, ymm1, ymm0, 11
ret
_J12:
nop
vpalignr ymm0, ymm1, ymm0, 12
ret
_J13:
nop
vpalignr ymm0, ymm1, ymm0, 13
ret
_J14:
nop
vpalignr ymm0, ymm1, ymm0, 14
ret
_J15:
nop
vpalignr ymm0, ymm1, ymm0, 15
ret
ASM_Switch endp
;================================
;3.
;================================
;-----------------------------------------------------------------------
; TestFunc (variant 3): logical right shift of a 256-bit value by a variable
; number of bytes (zeros shifted in), built from dword permutes + bit shifts.
; In:    rcx = address of the 32 source bytes (unaligned load is used)
;        edx = byte count x, expected range 0..32
; Out:   ymm0 = source shifted right by x bytes
; Clobb: rax, rcx, rdx, r8, r9, r10, ymm1-ymm5, flags
;
; With q = x/4 and r = x mod 4, result dword i is
;     (a[q+i] >> 8r) | (a[q+i+1] << (32-8r))
; vpermd wraps indices modulo 8, so the wrapped-around bytes at the top are
; cleared at the end with a byte mask taken from mask_index at offset x.
;
; Fix vs. the previous version: the two halves were merged with
; vpcmpgtb + vpblendvb. vpcmpgtb is a SIGNED compare, so every carried-in
; byte >= 80h counted as "not greater than zero" and was replaced by zero.
; The two halves never overlap, so a plain vpor merges them correctly.
; Also: VEX vmovd instead of legacy movd (no SSE/AVX mixing), dword
; broadcast instead of byte broadcast (same low 3 index bits), the dead
; "sar ecx,2 / sub r9d,ecx" pair (always subtracted 0) is gone, and the final
; clear is a single vpandn instead of a blend against a zeroed register.
;-----------------------------------------------------------------------
TestFunc proc
vmovdqu ymm0, ymmword ptr [rcx] ; ymm0 = a
vmovdqu ymm5, ymmword ptr [permd_index] ; ymm5 = dword indices 0..7
mov r10d, edx ; r10 = x, zero-extended for the mask lookup
mov ecx, edx
and ecx, 3 ; ecx = r = x mod 4
lea r8d, [rcx*8] ; r8d = 8*r, right-shift amount in bits
mov eax, 32
sub eax, r8d ; eax = 32 - 8*r, left-shift amount in bits
sar edx, 2 ; edx = q = x/4
lea r9d, [rdx + 1] ; r9d = q + 1

; low part: a[q+i] >> 8r
vmovd xmm1, edx
vmovd xmm2, r8d
vpbroadcastd ymm1, xmm1
vpaddd ymm1, ymm1, ymm5 ; index i = q + i (vpermd only uses the low 3 bits)
vpermd ymm3, ymm1, ymm0
vpsrld ymm3, ymm3, xmm2

; high part: a[q+i+1] << (32-8r); a shift count of 32 (r = 0) yields all zeros
vmovd xmm1, r9d
vmovd xmm2, eax
vpbroadcastd ymm1, xmm1
vpaddd ymm1, ymm1, ymm5
vpermd ymm4, ymm1, ymm0
vpslld ymm4, ymm4, xmm2

; the halves occupy disjoint bytes of each dword, so OR merges them exactly
vpor ymm0, ymm3, ymm4

; mask byte j is 255 when j >= 32 - x: those bytes came from wrapped indices
lea rax, mask_index
vmovdqu ymm1, ymmword ptr [rax + r10]
vpandn ymm0, ymm1, ymm0 ; ymm0 = (NOT mask) AND merged
ret
; lookup tables, kept directly after the code as in the original
permd_index DWORD 0,1,2,3,4,5,6,7
mask_index BYTE 32 dup (0), 32 dup (255)
TestFunc endp

jj2007

  • Member
  • *****
  • Posts: 11551
  • Assembler is fun ;-)
    • MasmBasic
Re: AVX2 __m256i (ymmword) variable byte shift.
« Reply #1 on: February 18, 2021, 07:24:34 AM »
You mean this type of addressing?
Code: [Select]
include \Masm32\MasmBasic\Res\JBasic.inc
; Demo: load a YMMWORD through "label + scaled index" addressing.
; NOTE(review): the third initializer has 17 hex digits while the other two
; have 16 - looks like a typo; confirm.
_label YMMWORD 1111111111111111h, 2222222222222222h, 33333333333333333h
Init ; OPT_64 1 ; put 0 for 32 bit, 1 for 64 bit assembly
  PrintLine Chr$("This program was assembled with ", @AsmUsed$(1), " in ", jbit$, "-bit format.")
  mov rax, 4  ; index 4, scaled by 8 below = byte offset 32 = the second YMMWORD element
  ; breakpoint: step over the next instruction in a debugger to inspect ymm0
  int 3
  vmovdqu ymm0, ymmword ptr _label[rax*8]
  Inkey "all is fine"
EndOfCode
OPT_DebugL /LARGEADDRESSAWARE:NO

Works fine with UAsm and Polink. Masm gives me "invalid data initializer" in line 2, apparently it doesn't like the YMMWORD.
Note the linker option - it seems necessary.

I've inserted the int 3 so that you can see in the debugger that ymm0 gets correctly loaded with 2222222222222222h.

See also How to load 256 bits to AVX register at once?, post by user MASM.

Note that...
Code: [Select]
  ; Same load, but with the byte offset computed up front instead of a scaled index.
  mov rax, 2 ; element 2
  imul rax, rax, 32 ; rax = byte offset = element index * 32
  vmovdqu ymm0, ymmword ptr _label[rax]
... gets wrongly encoded by UAsm as vmovdqu ymm0, ymmword ptr [rax], i.e. no _label.
AsmC handles it correctly, but you should use the /Znk assembly option, otherwise it may complain about the "type" keyword :cool:


hutch--

  • Administrator
  • Member
  • ******
  • Posts: 8493
  • Mnemonic Driven API Grinder
    • The MASM32 SDK
Re: AVX2 __m256i (ymmword) variable byte shift.
« Reply #3 on: February 18, 2021, 07:51:05 PM »
I don't have time to chase it up but putting the immediate into a register first is Win64, not the assembler. You could do it in Win32 but Win64 may not have the available opcode to use an OFFSET like that.
hutch at movsd dot com
http://www.masm32.com    :biggrin:  :skrewy:

TouEnMasm

  • Member
  • *****
  • Posts: 1805
    • EditMasm
Re: AVX2 __m256i (ymmword) variable byte shift.
« Reply #4 on: February 18, 2021, 08:58:05 PM »

UAsm doesn't seem to have a problem compiling that:
Quote
  00000001400027B7: C5 FE 6F 01        vmovdqu     ymm0,ymmword ptr [rcx]
  00000001400027BB: C4 E3 7D 39 C1 01  vextracti128 xmm1,ymm0,1
  00000001400027C1: 88 15 11 00 00 00  mov         byte ptr [00000001400027D8h],dl
  00000001400027C7: 48 B8 D3 27 00 40  mov         rax,1400027D3h
                    01 00 00 00
  00000001400027D1: FF E0              jmp         rax
  00000001400027D3: C4 E3 75 0F C0 00  vpalignr    ymm0,ymm1,ymm0,0
  00000001400027D9: C3                 ret
  00000001400027DA: 48 83 EC 08        sub         rsp,8
  00000001400027DE: C5 FE 6F 01        vmovdqu     ymm0,ymmword ptr [rcx]

Code: [Select]
; ASM_Mod as assembled for the listing above: the entry code patches the
; vpalignr immediate and then jumps to it through rax.
; In:  rcx = address of the 32 bytes to load, dl = value patched into the immediate
; Out: ymm0; rax and ymm1 are overwritten.
ASM_Mod proc
vmovdqu ymm0, ymmword ptr [rcx]
vextracti128 xmm1, ymm0, 1
; vpalignr ymm,ymm,ymm,imm8 is 6 bytes long; its last byte (offset 5) is imm8
mov byte ptr [CodeChange + 5], dl
mov rax, OFFSET CodeChange
jmp rax
ASM_Mod endp
; patched instruction, placed right after the procedure body
CodeChange: vpalignr ymm0, ymm1, ymm0, 0
ret


Fa is a musical note to play with CL

InfiniteLoop

  • Regular Member
  • *
  • Posts: 17
Re: AVX2 __m256i (ymmword) variable byte shift.
« Reply #5 on: February 20, 2021, 09:53:35 AM »
there is __ll_lshift and __ll_rshift valid for Intel and AMD processor
Those are epi32 and epi64. There is no variable si256 shift by x bytes.

You mean this type of addressing?
No I mean using fixed addresses.
 e.g.
rax = some input value
; Sketch of the desired dispatch: jump to Start + 8*rax without first loading
; the address of Start into a register. Each target is an 8-byte slot
; (7 filler bytes + ret).
;mov rdx, OFFSET Start ;should not need
;lea rax, [rdx + 8*rax] ;should not need
;jmp rax
; NOTE(review): with an index register, jmp with a bracketed operand is
; presumably encoded as a memory-indirect jump, i.e. it would load the target
; address FROM [Start+rax*8] rather than jump TO that address - confirm.
; NOTE(review): in 64-bit code "label + index" cannot be RIP-relative, so it
; needs a 32-bit absolute address for Start - confirm linker settings.
jmp [Start+rax*8] ;absolute jmp should work no?
Start:
BYTE 7 DUP(0)
ret
BYTE 7 DUP(0)
ret


hutch--

  • Administrator
  • Member
  • ******
  • Posts: 8493
  • Mnemonic Driven API Grinder
    • The MASM32 SDK
Re: AVX2 __m256i (ymmword) variable byte shift.
« Reply #6 on: February 20, 2021, 10:16:55 AM »
Hi InfiniteLoop,

Looking at the code you have posted, it looks like you are converting 32 bit code to 64 bit code and the direct OFFSET included in parts of your code that worked fine in Win32 do not work in Win64 as you don't have the opcodes in the hardware to do so. In Win64 you have many more registers and the solution is to use them.

For example, you could use r10 and r11 to hold the OFFSET values, which you would only need to load once; then in your complex addressing mode notation you use the registers instead of the direct OFFSETs. You have already done this and it is the right way to do it.
hutch at movsd dot com
http://www.masm32.com    :biggrin:  :skrewy:

jj2007

  • Member
  • *****
  • Posts: 11551
  • Assembler is fun ;-)
    • MasmBasic
Re: AVX2 __m256i (ymmword) variable byte shift.
« Reply #7 on: February 20, 2021, 11:31:03 AM »
Offset works for me:
Code: [Select]
include \Masm32\MasmBasic\Res\JBasic.inc ; ## console demo, builds in 32- or 64-bit mode with UAsm, ML, AsmC ##
; Demo: "mov reg, offset label" assembles and works in a 64-bit build.
_number dq 1234567890123456789
Init ; OPT_64 1 ; put 0 for 32 bit, 1 for 64 bit assembly
  PrintLine Chr$("This program was assembled with ", @AsmUsed$(1), " in ", jbit$, "-bit format.")

  ; rax = address of _number, loaded as an immediate
  mov rax, offset _number   ; <<<<<<<<<<<<<

  ; read the qword back through the pointer to show the address is valid
  Inkey Str$("The number is %lli", qword ptr [rax])
EndOfCode

Disassembly: movabs rax, 0x1400012a8

Output:
This program was assembled with ml64 in 64-bit format.
The number is 1234567890123456789


See MOVABS opcode in the assembly code at SOF. There is a lot of misleading info on the web; for example, some believe that movabs is GAS-specific.

Note also:
Code: [Select]
  ; Two ways to get the address of _number into rax:
  mov rax, offset _number ; identical result,
  lea rax, _number ; but lea is shorter

Exe with an int 3 before mov rax, offset _number attached (i.e. run with an x64 debugger).

hutch--

  • Administrator
  • Member
  • ******
  • Posts: 8493
  • Mnemonic Driven API Grinder
    • The MASM32 SDK
Re: AVX2 __m256i (ymmword) variable byte shift.
« Reply #8 on: February 20, 2021, 12:37:32 PM »
 :biggrin:

See if you can build the app with /LARGEADDRESSAWARE using that old mnemonic. If not, you are stuck with Win32 memory limits.
hutch at movsd dot com
http://www.masm32.com    :biggrin:  :skrewy:

jj2007

  • Member
  • *****
  • Posts: 11551
  • Assembler is fun ;-)
    • MasmBasic
Re: AVX2 __m256i (ymmword) variable byte shift.
« Reply #9 on: February 20, 2021, 01:06:37 PM »
Code: [Select]
include \Masm32\MasmBasic\Res\JBasic.inc ; ## console demo, builds in 32- or 64-bit mode with UAsm, ML, AsmC ##
; Demo: print the address obtained with "mov reg, offset label" so the effect
; of the linker's /LARGEADDRESSAWARE setting on that address can be compared.
_number dq 1234567890123456789
Init ; OPT_64 1 ; put 0 for 32 bit, 1 for 64 bit assembly
  PrintLine Chr$("This program was assembled with ", @AsmUsed$(1), " in ", jbit$, "-bit format.")

  ; rax = address of _number, loaded as an immediate
  mov rax, offset _number   ; <<<<<<<<<<<<<

  ; keep the address in rdi - presumably so it survives the Print macro; confirm
  xchg rax, rdi
  Print Str$("The address is at %xh\n", rdi)
  Inkey Str$("The number is %lli", qword ptr [rdi])
EndOfCode

Output with /LARGEADDRESSAWARE (my default setting):
This program was assembled with ml64 in 64-bit format.
The address is at 40003018h
The number is 1234567890123456789


With /LARGEADDRESSAWARE:NO:
This program was assembled with ml64 in 64-bit format.
The address is at 403018h
The number is 1234567890123456789

hutch--

  • Administrator
  • Member
  • ******
  • Posts: 8493
  • Mnemonic Driven API Grinder
    • The MASM32 SDK
Re: AVX2 __m256i (ymmword) variable byte shift.
« Reply #10 on: February 20, 2021, 02:26:56 PM »
What did you miss about "using that old mnemonic" ?
hutch at movsd dot com
http://www.masm32.com    :biggrin:  :skrewy:

six_L

  • Member
  • **
  • Posts: 225
Re: AVX2 __m256i (ymmword) variable byte shift.
« Reply #11 on: February 20, 2021, 04:04:56 PM »
about "LARGEADDRESSAWARE"
Say you, Say me, Say the codes together for ever.

jj2007

  • Member
  • *****
  • Posts: 11551
  • Assembler is fun ;-)
    • MasmBasic
Re: AVX2 __m256i (ymmword) variable byte shift.
« Reply #12 on: February 20, 2021, 10:21:29 PM »
What did you miss about "using that old mnemonic" ?

No idea what you mean, sorry.

InfiniteLoop

  • Regular Member
  • *
  • Posts: 17
Re: AVX2 __m256i (ymmword) variable byte shift.
« Reply #13 on: February 22, 2021, 09:40:59 AM »
This is the fastest I've found. 30% faster than the permd version.
Using intrinsics. Clang compiles it verbatim.
Code: [Select]
__m256i shift_8_right_2(__m256i a, unsigned int count)
{
__m256i mask = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__m256i b = _mm256_set1_epi8(count);
__m256i c = _mm256_set1_epi8(15);
__m256i mask_B = _mm256_add_epi8(mask, b);
__m256i mask_A = _mm256_cmpgt_epi8(mask_B, c);
mask_B = _mm256_and_si256(mask_B, c);
__m256i v = _mm256_shuffle_epi8(a, mask_B);
__m256i temp = _mm256_castsi128_si256(_mm256_extracti128_si256(v, 1));
v = _mm256_blendv_epi8(v, temp, mask_A);
return(v);
}

LargeAddressAware does nothing anymore. It's a 32-bit thing.


jj2007

  • Member
  • *****
  • Posts: 11551
  • Assembler is fun ;-)
    • MasmBasic
Re: AVX2 __m256i (ymmword) variable byte shift.
« Reply #14 on: February 22, 2021, 10:53:06 AM »
LargeAddressAware does nothing anymore. Its a 32-bit thing.

No. See reply #9.