8x8 Matrix Transpose using AVX

aw27 · July 09, 2018, 11:07:08 PM

Continuing the matrix transpose craziness, this is an 8x8 matrix transpose using AVX.


includelib Kernel32.lib
ExitProcess proto :DWORD
includelib msvcrt.lib
printf proto :ptr, :vararg

; _MM_SHUFFLE(sel3, sel2, sel1, sel0)
; Macro function that packs four selector values into one immediate:
; sel3 is shifted to bit 6, sel2 to bit 4, sel1 to bit 2, sel0 stays at bit 0.
; The arguments are not masked, so each one is expected to be in 0..3.
_MM_SHUFFLE MACRO sel3, sel2, sel1, sel0
	exitm <(((sel3) shl 6) or ((sel2) shl 4) or ((sel1) shl 2) or (sel0))>
ENDM

; Segment aligned to 32 bytes; both matrices below are accessed with
; 256-bit vmovaps by mat8x8Transpose.
data32 segment align(32) ".data"
; Source matrix: 64 consecutive DWORDs holding the values 1, 2, ..., 64.
invalues LABEL DWORD
invalue = 1
WHILE invalue LE 64
	dd invalue
	invalue = invalue + 1
ENDM
; Destination matrix: 8*8 DWORDs, zero-initialised. It starts 256 bytes
; after invalues, so it is 32-byte aligned as well.
outvalues dd 8*8 dup (0)
data32 ends

.data
; printf format for one matrix row: "row <index>", TAB, eight integers, LF.
msgFmt    db "row %d", 09h, "%d %d %d %d %d %d %d %d", 0Ah, 0
; Banner printed before the source matrix.
msgBefore db "Transposing a 8x8 Matrix", 0Ah, "Before:", 0Ah, 0
; Heading printed before the transposed matrix.
msgAfter  db "After:", 0Ah, 0

.code

;-----------------------------------------------------------------------
; mat8x8Transpose - transpose an 8x8 matrix of 32-bit elements using AVX.
; ABI:   Microsoft x64
; In:    rcx = invalsptr  : 64 DWORDs, row-major; must be 32-byte aligned
;                           (read with vmovaps)
;        rdx = outvalsptr : 64 DWORDs; must be 32-byte aligned
;                           (written with vmovaps)
; Out:   [rdx] receives the transposed matrix.
; Clobb: ymm0-ymm5. xmm6-xmm15 are nonvolatile in the Microsoft x64 ABI
;        and are saved/restored here. vzeroupper clears the upper halves
;        of all ymm registers before returning.
; Stack: XMM_SAVE_BYTES below the entry rsp (save area for xmm6-xmm15).
; Note:  all eight rows are loaded before the first store, so invalsptr
;        and outvalsptr may point to the same buffer.
;-----------------------------------------------------------------------
XMM_SAVE_BYTES equ 10*16		; xmm6..xmm15, 16 bytes each

mat8x8Transpose proc invalsptr :ptr, outvalsptr:ptr
	; Save the nonvolatile XMM registers this routine overwrites.
	; vmovups is used so the save area needs no particular alignment.
	sub rsp, XMM_SAVE_BYTES
	vmovups XMMWORD PTR[rsp+0*16], xmm6
	vmovups XMMWORD PTR[rsp+1*16], xmm7
	vmovups XMMWORD PTR[rsp+2*16], xmm8
	vmovups XMMWORD PTR[rsp+3*16], xmm9
	vmovups XMMWORD PTR[rsp+4*16], xmm10
	vmovups XMMWORD PTR[rsp+5*16], xmm11
	vmovups XMMWORD PTR[rsp+6*16], xmm12
	vmovups XMMWORD PTR[rsp+7*16], xmm13
	vmovups XMMWORD PTR[rsp+8*16], xmm14
	vmovups XMMWORD PTR[rsp+9*16], xmm15

	; Load the eight source rows r0..r7 into ymm0..ymm7.
	vmovaps ymm0, YMMWORD PTR[rcx]
	vmovaps ymm1, YMMWORD PTR[rcx+8*sizeof DWORD]
	vmovaps ymm2, YMMWORD PTR[rcx+16*sizeof DWORD]
	vmovaps ymm3, YMMWORD PTR[rcx+24*sizeof DWORD]
	vmovaps ymm4, YMMWORD PTR[rcx+32*sizeof DWORD]
	vmovaps ymm5, YMMWORD PTR[rcx+40*sizeof DWORD]
	vmovaps ymm6, YMMWORD PTR[rcx+48*sizeof DWORD]
	vmovaps ymm7, YMMWORD PTR[rcx+56*sizeof DWORD]

	; Stage 1: interleave adjacent row pairs inside each 128-bit lane.
	; e.g. ymm8 = r0[0] r1[0] r0[1] r1[1] | r0[4] r1[4] r0[5] r1[5]
	;      ymm9 = r0[2] r1[2] r0[3] r1[3] | r0[6] r1[6] r0[7] r1[7]
	vunpcklps ymm8, ymm0, ymm1
	vunpckhps ymm9, ymm0, ymm1
	vunpcklps ymm10, ymm2, ymm3
	vunpckhps ymm11, ymm2, ymm3
	vunpcklps ymm12, ymm4, ymm5
	vunpckhps ymm13, ymm4, ymm5
	vunpcklps ymm14, ymm6, ymm7
	vunpckhps ymm15, ymm6, ymm7

	; Stage 2: combine pairs of pairs into 4-element column pieces.
	; ymm0..ymm3 hold columns 0..3 (low lane) and 4..7 (high lane) of
	; rows 0-3; ymm4..ymm7 hold the same columns of rows 4-7.
	vshufps ymm0, ymm8, ymm10, _MM_SHUFFLE(1,0,1,0)
	vshufps ymm1, ymm8, ymm10, _MM_SHUFFLE(3,2,3,2)
	vshufps ymm2, ymm9, ymm11, _MM_SHUFFLE(1,0,1,0)
	vshufps ymm3, ymm9, ymm11, _MM_SHUFFLE(3,2,3,2)
	vshufps ymm4, ymm12, ymm14, _MM_SHUFFLE(1,0,1,0)
	vshufps ymm5, ymm12, ymm14, _MM_SHUFFLE(3,2,3,2)
	vshufps ymm6, ymm13, ymm15, _MM_SHUFFLE(1,0,1,0)
	vshufps ymm7, ymm13, ymm15, _MM_SHUFFLE(3,2,3,2)

	; Stage 3: join the matching 128-bit lanes of the two row groups.
	; 20h takes both low lanes  -> output rows 0..3 (ymm8..ymm11)
	; 31h takes both high lanes -> output rows 4..7 (ymm12..ymm15)
	vperm2f128 ymm8, ymm0, ymm4, 20h
	vperm2f128 ymm9, ymm1, ymm5, 20h
	vperm2f128 ymm10, ymm2, ymm6, 20h
	vperm2f128 ymm11, ymm3, ymm7, 20h
	vperm2f128 ymm12, ymm0, ymm4, 31h
	vperm2f128 ymm13, ymm1, ymm5, 31h
	vperm2f128 ymm14, ymm2, ymm6, 31h
	vperm2f128 ymm15, ymm3, ymm7, 31h

	; Store the eight transposed rows.
	vmovaps YMMWORD PTR[rdx], ymm8
	vmovaps YMMWORD PTR[rdx+8*sizeof DWORD], ymm9
	vmovaps YMMWORD PTR[rdx+16*sizeof DWORD], ymm10
	vmovaps YMMWORD PTR[rdx+24*sizeof DWORD], ymm11
	vmovaps YMMWORD PTR[rdx+32*sizeof DWORD], ymm12
	vmovaps YMMWORD PTR[rdx+40*sizeof DWORD], ymm13
	vmovaps YMMWORD PTR[rdx+48*sizeof DWORD], ymm14
	vmovaps YMMWORD PTR[rdx+56*sizeof DWORD], ymm15

	; Restore the caller's xmm6-xmm15 and release the save area.
	vmovups xmm6, XMMWORD PTR[rsp+0*16]
	vmovups xmm7, XMMWORD PTR[rsp+1*16]
	vmovups xmm8, XMMWORD PTR[rsp+2*16]
	vmovups xmm9, XMMWORD PTR[rsp+3*16]
	vmovups xmm10, XMMWORD PTR[rsp+4*16]
	vmovups xmm11, XMMWORD PTR[rsp+5*16]
	vmovups xmm12, XMMWORD PTR[rsp+6*16]
	vmovups xmm13, XMMWORD PTR[rsp+7*16]
	vmovups xmm14, XMMWORD PTR[rsp+8*16]
	vmovups xmm15, XMMWORD PTR[rsp+9*16]
	add rsp, XMM_SAVE_BYTES

	; Clear the upper YMM halves so that legacy-SSE code in the caller
	; (e.g. the C runtime) does not pay AVX/SSE transition penalties.
	vzeroupper
	ret
mat8x8Transpose endp


;-----------------------------------------------------------------------
; printit - print an 8x8 DWORD matrix, one line per row, through printf
;           using the msgFmt format string.
; ABI:   Microsoft x64
; In:    rcx = passedPtr : address of 64 consecutive DWORDs (row-major)
; Regs:  r12 = row index (0..7), rdi = address of the current row.
;        Both are nonvolatile in the Microsoft x64 ABI, so they are
;        saved on entry and restored before returning.
; Stack: 60h bytes = 20h shadow space + 30h for printf arguments 5..10
;        + 10h save area for r12/rdi.
; NOTE(review): the original had no `add rsp` before `ret`, so it
;        presumably relied on the assembler's PROC epilogue to restore
;        rsp; the explicit `add rsp` below keeps the stack balanced
;        either way. Stack alignment at `call printf` is the same as
;        with the original 50h frame (60h differs by a multiple of 16).
;-----------------------------------------------------------------------
printit proc passedPtr:ptr
	sub rsp, 60h
	mov qword ptr [rsp+50h], r12		; preserve caller's r12
	mov qword ptr [rsp+58h], rdi		; preserve caller's rdi
	xor r12d, r12d				; row index = 0
	mov rdi, rcx				; rdi -> first row
@printloop:
	cmp r12, 8
	jae @exit
	; printf(msgFmt, row, e0, e1, e2, ..., e7):
	; the first four arguments go in rcx/rdx/r8/r9, the remaining six
	; in the stack slots just above the 20h-byte shadow space.
	lea rcx, msgFmt
	mov edx, r12d
	mov r8d, dword ptr [rdi]
	mov r9d, dword ptr [rdi+sizeof DWORD]
	mov eax, dword ptr [rdi+2*sizeof DWORD]
	mov dword ptr [rsp+20h], eax
	mov eax, dword ptr [rdi+3*sizeof DWORD]
	mov dword ptr [rsp+28h], eax
	mov eax, dword ptr [rdi+4*sizeof DWORD]
	mov dword ptr [rsp+30h], eax
	mov eax, dword ptr [rdi+5*sizeof DWORD]
	mov dword ptr [rsp+38h], eax
	mov eax, dword ptr [rdi+6*sizeof DWORD]
	mov dword ptr [rsp+40h], eax
	mov eax, dword ptr [rdi+7*sizeof DWORD]
	mov dword ptr [rsp+48h], eax
	call printf
	add rdi, 8*sizeof DWORD			; advance to the next row
	inc r12
	jmp short @printloop
@exit:
	mov r12, qword ptr [rsp+50h]
	mov rdi, qword ptr [rsp+58h]
	add rsp, 60h
	ret
printit endp


; main - program entry point: prints the source matrix, transposes it
; into outvalues, prints the result and terminates through ExitProcess
; (control never returns here, hence no epilogue).
main proc
	sub rsp, 28h			; 20h shadow space + 8 to realign rsp to 16

	; Show the matrix before the transpose.
	lea rcx, msgBefore
	call printf
	lea rcx, invalues
	call printit

	; outvalues = transpose(invalues)
	lea rdx, outvalues
	lea rcx, invalues
	call mat8x8Transpose

	; Show the transposed matrix.
	lea rcx, msgAfter
	call printf
	lea rcx, outvalues
	call printit

	xor ecx, ecx			; exit code 0
	call ExitProcess
main endp

end

Siekmanski · July 09, 2018, 11:16:47 PM

And the saga continues. :t

aw27 · July 10, 2018, 01:44:23 AM

People with Pentium 4 will feel left out, though. :icon_redface:

zedd151 · July 10, 2018, 06:21:38 AM

Something to do when I get home from work

Later --->

:( I was expecting it to tell me how fast it was. :P

Code Select


Transposing a 8x8 Matrix
Before:
row 0 1 2 3 4 5 6 7 8
row 1 9 10 11 12 13 14 15 16
row 2 17 18 19 20 21 22 23 24
row 3 25 26 27 28 29 30 31 32
row 4 33 34 35 36 37 38 39 40
row 5 41 42 43 44 45 46 47 48
row 6 49 50 51 52 53 54 55 56
row 7 57 58 59 60 61 62 63 64
After:
row 0 1 9 17 25 33 41 49 57
row 1 2 10 18 26 34 42 50 58
row 2 3 11 19 27 35 43 51 59
row 3 4 12 20 28 36 44 52 60
row 4 5 13 21 29 37 45 53 61
row 5 6 14 22 30 38 46 54 62
row 6 7 15 23 31 39 47 55 63
row 7 8 16 24 32 40 48 56 64

I guess no performance test today.

aw27 · July 10, 2018, 04:32:45 PM

Hi Zed,
I will have to think about a performance test then. :t

zedd151 · July 10, 2018, 04:52:06 PM

Quote from: AW on July 10, 2018, 04:32:45 PM
Hi Zed ,
I will have to think about a performance test then. :t

No, that's ok.

These matrix transpositions remind me of when I was using a TI-92 Plus calculator. I needed to perform a similar operation
on a data set to change x to y and vice versa. It was surely ugly code (and fairly slow - comparable to bytewise shifting) - as I
didn't know enough about programming in assembler for the TI-92 calc - had to rely on 'TI-Basic'. (Motorola MC68000 if I recall correctly) :icon_mrgreen:

My very first taste (of pseudo-programming at least)

Oops! sorry for the thread hijack, just reminiscing...

aw27 · July 10, 2018, 05:12:17 PM

AhAh, those amazing TI calculators. I had one TI-59, considered by the author of this page the best programmable calculator in the world.

hutch-- · July 10, 2018, 05:37:50 PM

I still own a HP 11C that last time I checked it (10 years ago [may have been 20] ) it was still running on its original batteries. I used to use it back in the 1980s as my office calculator and mainly used it to design motor bike exhaust systems. You had to write it all up, type it in with the buttons and test the results. It would display LCD format RUN - NING until it chomped its way through all of the instructions but it was still a damned sight faster than doing it by hand.

aw27 · July 10, 2018, 07:19:47 PM

Just checking if my 43 year-old Casio FX-15 calculator still works.

It was very expensive; I paid for it in 6 installments.

Siekmanski · July 10, 2018, 07:44:10 PM

Don't remember exactly but it looks like my first pocket calculator (it could calculate sqrt(2) too).

Caché GB · July 10, 2018, 07:55:37 PM

I had an HP 28S Scientific Calculator with HP 82240A Infrared-Thermal-Printer
Ha, now you all too much. Please no one start a "What do we all look like thread".

zedd151 · July 10, 2018, 08:44:14 PM

Damn, look what I started.

Siekmanski · July 14, 2018, 12:36:47 PM

Hi AW,

Found this piece of code when reading the Intel Optimization Reference Manual.

8x8 Matrix Transpose Using VINSRTPS ( example 12-20 )
It can gain 70% speedup relative to relying on VSHUFPS alone.

It's in chapter 12 section 11.1
http://members.home.nl/siekmanski/Intel_Optimization_Reference_Manual_248966-037.pdf

Code Select

; 8x8 transpose of 4-byte elements built from 128-bit half-row loads.
; rcx = source buffer, rdx = destination buffer, r10 = iteration count.
; r8 is loaded with iLineSize but is not referenced in the loop below.
; The loop body does not modify rcx or rdx, so every iteration
; transposes the same matrix again.
; Offsets assume 32-byte rows (8 elements x 4 bytes): row i starts at
; rcx + 32*i, and its second half at rcx + 32*i + 16.
mov rcx, inpBuf
mov rdx, outBuf
mov r8, iLineSize
mov r10, NumOfLoops
loop1:
; ---- First half: source columns 0-3 -> destination rows 0-3 ----
; ymm0 = row0[0..3] | row4[0..3], ymm1 = row1[0..3] | row5[0..3]
vmovaps xmm0, [rcx]
vinsertf128 ymm0, ymm0, [rcx + 128], 1
vmovaps xmm1, [rcx + 32]
vinsertf128 ymm1, ymm1, [rcx + 160], 1
; ymm8 = elements 0,1 of both rows per lane; ymm9 = elements 2,3
vunpcklpd ymm8, ymm0, ymm1
vunpckhpd ymm9, ymm0, ymm1
; ymm2 = row2[0..3] | row6[0..3], ymm3 = row3[0..3] | row7[0..3]
vmovaps xmm2, [rcx+64]
vinsertf128 ymm2, ymm2, [rcx + 192], 1
vmovaps xmm3, [rcx+96]
vinsertf128 ymm3, ymm3, [rcx + 224], 1
vunpcklpd ymm10, ymm2, ymm3
vunpckhpd ymm11, ymm2, ymm3
; 0x88 picks elements 0 and 2 of each source lane (the even column of
; each pair), 0xDD picks elements 1 and 3 (the odd column); each result
; is one complete destination row and is stored immediately.
vshufps ymm4, ymm8, ymm10, 0x88
vmovaps [rdx], ymm4
vshufps ymm5, ymm8, ymm10, 0xDD
vmovaps [rdx+32], ymm5
vshufps ymm6, ymm9, ymm11, 0x88
vmovaps [rdx+64], ymm6
vshufps ymm7, ymm9, ymm11, 0xDD
vmovaps [rdx+96], ymm7
; ---- Second half: source columns 4-7 -> destination rows 4-7 ----
; Same sequence with every source offset advanced by 16 bytes.
; NOTE(review): destination rows 0-3 have already been written at this
; point, so this appears to require non-overlapping source and
; destination buffers - confirm before using it in place.
vmovaps xmm0, [rcx+16]
vinsertf128 ymm0, ymm0, [rcx + 144], 1
vmovaps xmm1, [rcx + 48]
vinsertf128 ymm1, ymm1, [rcx + 176], 1
vunpcklpd ymm8, ymm0, ymm1
vunpckhpd ymm9, ymm0, ymm1
vmovaps xmm2, [rcx+80]
vinsertf128 ymm2, ymm2, [rcx + 208], 1
vmovaps xmm3, [rcx+112]
vinsertf128 ymm3, ymm3, [rcx + 240], 1
vunpcklpd ymm10, ymm2, ymm3
vunpckhpd ymm11, ymm2, ymm3
vshufps ymm4, ymm8, ymm10, 0x88
vmovaps [rdx+128], ymm4
vshufps ymm5, ymm8, ymm10, 0xDD
vmovaps [rdx+160], ymm5
vshufps ymm6, ymm9, ymm11, 0x88
vmovaps [rdx+192], ymm6
vshufps ymm7, ymm9, ymm11, 0xDD
vmovaps [rdx+224], ymm7
; Repeat until the iteration counter reaches zero.
dec r10
jnz loop1

jj2007 · July 14, 2018, 06:10:10 PM

Quote from: zedd151 on July 10, 2018, 08:44:14 PM
Damn, look what I started.

Clicking on "Show unread posts since last visit", there are FOUR matrix transpose threads active right now. And Marinus has just found a brand new VINSRTPS instruction. It's a mad place but I like it :icon_mrgreen:

zedd151 · July 14, 2018, 06:19:35 PM

Quote from: jj2007 on July 14, 2018, 06:10:10 PM

Clicking on "Show unread posts since last visit", there are FOUR matrix transpose threads active right now....

Seriously though, how often would the average person need to transpose a matrix of any size? Those folks really have too much time on their hands, but I still help them out when I'm able to.

The MASM Forum

News:

8x8 Matrix Transpose using AVX

aw27

Siekmanski

aw27

zedd151

aw27

zedd151

aw27

hutch--

aw27

Siekmanski

Caché GB

zedd151

Siekmanski

jj2007

zedd151