While looking through the Intel Intrinsics Guide I noticed there is no such function:
__m256i Shift(__m256i a, __m128i count)
meaning: shift `a` right or left by `count` bytes while shifting in zeros.
What is the fastest way to implement this?
Also I ran into a serious problem in MASM: why won't a fixed address / label combined with an index register work? It's having none of it — the address MUST be put into a register first or the program crashes.
i.e. `ymmword ptr [label + rax*4]` crashes, whereas `mov reg, OFFSET label` followed by `ymmword ptr [reg + rax*4]` works.
;================================
;1.A
;================================
MyCode SEGMENT READ WRITE EXECUTE ALIGN(4096)
ASM_Mod proc
vmovdqu ymm0, ymmword ptr [rcx]
vextracti128 xmm1, ymm0, 1
mov byte ptr [CodeChange + 5], dl
CodeChange: vpalignr ymm0, ymm1, ymm0, 0
ret
ASM_Mod endp
MyCode ENDS
;================================
;1.B
;================================
CodeStart SEGMENT READ EXECUTE ALIGN(4096)
ASM_Mod proc
vmovdqu ymm0, ymmword ptr [rcx]
vextracti128 xmm1, ymm0, 1
mov byte ptr [CodeChange + 5], dl
mov rax, OFFSET CodeChange
jmp rax
ASM_Mod endp
CodeStart ENDS
MyCode SEGMENT READ WRITE EXECUTE ALIGN(4096)
CodeChange: vpalignr ymm0, ymm1, ymm0, 0
ret
MyCode ENDS
;================================
;2.
;================================
ASM_Switch proc
vmovdqu ymm0, ymmword ptr [rcx]
vextracti128 xmm1, ymm0, 1
mov rcx, OFFSET _J0
lea rax, [rcx + 8*rdx]
jmp rax
_J0:
nop
nop
nop
nop
nop
nop
nop
ret
_J1:
nop
vpalignr ymm0, ymm1, ymm0, 1
ret
_J2:
nop
vpalignr ymm0, ymm1, ymm0, 2
ret
_J3:
nop
vpalignr ymm0, ymm1, ymm0, 3
ret
_J4:
nop
vpalignr ymm0, ymm1, ymm0, 4
ret
_J5:
nop
vpalignr ymm0, ymm1, ymm0, 5
ret
_J6:
nop
vpalignr ymm0, ymm1, ymm0, 6
ret
_J7:
nop
vpalignr ymm0, ymm1, ymm0, 7
ret
_J8:
nop
vpalignr ymm0, ymm1, ymm0, 8
ret
_J9:
nop
vpalignr ymm0, ymm1, ymm0, 9
ret
_J10:
nop
vpalignr ymm0, ymm1, ymm0, 10
ret
_J11:
nop
vpalignr ymm0, ymm1, ymm0, 11
ret
_J12:
nop
vpalignr ymm0, ymm1, ymm0, 12
ret
_J13:
nop
vpalignr ymm0, ymm1, ymm0, 13
ret
_J14:
nop
vpalignr ymm0, ymm1, ymm0, 14
ret
_J15:
nop
vpalignr ymm0, ymm1, ymm0, 15
ret
ASM_Switch endp
;================================
;3.
;================================
;-----------------------------------------------------------------------
; 3. Branchless variant via vpermd + per-dword bit shifts.
; __m256i TestFunc(const __m256i *src /*rcx*/, uint32_t x /*edx*/)
; ABI:  Microsoft x64. Out: ymm0 = *src >> x bytes, zeros shifted in.
; Idea: with d = x/4 and m = x mod 4, each result dword i is
;   (src.dword[i+d] >> 8m) | (src.dword[i+d+1] << (32-8m)),
; then the top x bytes are zeroed with a mask loaded from a sliding
; window over mask_index. vpermd wraps indices mod 8, so out-of-range
; dwords contain garbage that the final byte mask removes.
; Fixes vs. original:
;   * vpbroadcastb -> vpbroadcastd: the vpermd indices are DWORDs; the
;     byte broadcast produced index dwords of b*01010101h and only
;     worked because vpermd masks each index to its low 3 bits.
;   * the two partial results are combined with vpor instead of
;     vpcmpgtb + vpblendvb: the SIGNED byte compare treated result
;     bytes >= 80h as negative and replaced them with zeros (data
;     loss). The partials are bytewise disjoint, so OR is exact.
;   * removed dead "sar ecx,2 / sub r9d,ecx" (ecx is 0..3 there, so
;     the subtrahend was always 0).
; NOTE(review): assumes x = 0..32 — the mask load reads
;   mask_index[x .. x+31], which stays in bounds only for x <= 32.
;-----------------------------------------------------------------------
TestFunc proc
vmovdqu ymm0, ymmword ptr [rcx]         ; ymm0 = 32 source bytes
vmovdqu ymm5, ymmword ptr [permd_index] ; ymm5 = {0,1,2,3,4,5,6,7}
mov ecx, edx
and ecx, 3                              ; ecx = m = x mod 4
mov eax, 32
lea r8d, [8*ecx]                        ; r8d = 8*m  (right-shift bit count)
sub eax, r8d                            ; eax = 32 - 8*m (left-shift bit count)
mov r10d, edx                           ; r10d = x, kept for the mask lookup
sar edx, 2                              ; edx = d = x / 4 (dword shift)
mov r9d, edx
inc r9d                                 ; r9d = d + 1
movd xmm1, edx
movd xmm2, r8d
vpbroadcastd ymm1, xmm1                 ; broadcast d into all 8 dword lanes
vpaddd ymm1, ymm1, ymm5                 ; indices {d, d+1, ..., d+7}
vpermd ymm3, ymm1, ymm0                 ; ymm3.dword[i] = src.dword[(i+d) & 7]
vpsrld ymm3, ymm3, xmm2                 ; low part: >> 8m bits per dword
movd xmm1, r9d
movd xmm2, eax
vpbroadcastd ymm1, xmm1                 ; broadcast d+1
vpaddd ymm1, ymm1, ymm5                 ; indices {d+1, ..., d+8}
vpermd ymm4, ymm1, ymm0                 ; ymm4.dword[i] = src.dword[(i+d+1) & 7]
mov rax, OFFSET mask_index              ; base in a register: RIP-relative can't take an index
vmovdqu ymm1, ymmword ptr [rax + r10]   ; byte mask: 0 in low 32-x bytes, FFh in top x
vpslld ymm4, ymm4, xmm2                 ; high part: << (32-8m); m=0 gives shift 32 => zeros
vxorps xmm5, xmm5, xmm5                 ; ymm5 = 0 (zero source for the final blend)
vpor ymm0, ymm3, ymm4                   ; merge disjoint partials
vpblendvb ymm0, ymm0, ymm5, ymm1        ; zero the top x bytes
ret
permd_index DWORD 0,1,2,3,4,5,6,7       ; never executed: placed after ret
mask_index BYTE 32 dup (0), 32 dup (255)
TestFunc endp