News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests
NB: Posting URL's See here: Posted URL Change

Main Menu

Fast Matrix Transposing - All matrix sizes

Started by aw27, April 11, 2017, 12:06:16 AM

Previous topic - Next topic

aw27

Continuing the saga on Matrix Transposing...
This is a solution for transposing matrixes of any size, squared or not. It supports as well small matrixes or matrixes with less than 4 rows or 4 columns.
It was tested with up to 10000 rows x 10000 columns, which means 100 million elements, where the transposing took less than half a second in my computer, fast enough for most requirements.
It works by transposing 4x4 blocks whenever possible, here i used basically the method presented by Siekmanski, what is left is transposed element by element.
Note: Since we are working here with matrixes of any number of rows and columns it may not benefit from 16-byte data alignment, except on the first row - So, I skipped the alignment convinced it will not help much.

x86:

.686
.XMM

    .MODEL FLAT, C
    option casemap:none

.code
   
MatrixTranspose proc public USES ebx esi edi outMat : ptr, inMat : ptr, rows : dword, cols : dword
LOCAL X4x4Required : dword
LOCAL Y4x4Required : dword
LOCAL remainderX : dword
LOCAL remainderY : dword
LOCAL destRowSize : dword
LOCAL sourceRowSize : dword
LOCAL savedDest : dword

mov edi, outMat
mov esi, inMat

; How many cols % 4?
mov eax, cols
mov ebx, 4
mov edx, 0
div ebx
mov X4x4Required, eax
mov remainderX, edx

; How many rows %4?
mov eax, rows
mov ebx, 4
mov edx, 0
div ebx
mov Y4x4Required, eax
mov remainderY, edx

mov eax, rows
shl eax, 2
mov destRowSize, eax

mov eax, cols
shl eax, 2
mov sourceRowSize, eax

mov ebx, 0
.while ebx<Y4x4Required ; rows % 4
; find starting point for source
mov eax, ebx
mul sourceRowSize
shl eax, 2

mov esi, inMat
add esi, eax
mov ecx, esi ; save
; find starting point for destination
mov eax, ebx
shl eax, 4
mov edi, outMat
add edi, eax
mov savedDest, edi ; save
push ebx

mov ebx,0
.while ebx<X4x4Required
mov eax, ebx
shl eax, 4
mov esi, ecx
add esi, eax
movups xmm0, xmmword ptr [esi]
add esi, sourceRowSize
movups xmm1, xmmword ptr [esi]
add esi, sourceRowSize
movups xmm2, xmmword ptr [esi]
add esi, sourceRowSize
movups xmm3, xmmword ptr [esi]

movaps      xmm4,xmm0
movaps      xmm5,xmm2
unpcklps    xmm4,xmm1
unpcklps    xmm5,xmm3
unpckhps    xmm0,xmm1
unpckhps    xmm2,xmm3
movaps      xmm1,xmm4
movaps      xmm6,xmm0
movlhps     xmm4,xmm5
movlhps     xmm6,xmm2
movhlps     xmm5,xmm1
movhlps     xmm2,xmm0

mov eax, destRowSize
shl eax, 2
mul ebx
mov edi, savedDest
add edi, eax

movups xmmword ptr [edi], xmm4
add edi, destRowSize
movups xmmword ptr [edi], xmm5
add edi, destRowSize
movups xmmword ptr [edi], xmm6
add edi, destRowSize
movups xmmword ptr [edi], xmm2

inc ebx
.endw
pop ebx
inc ebx
.endw
; deal with rows not multiple of 4
.if remainderX >=1
mov eax, X4x4Required
shl eax, 4
mov esi, inMat
add esi, eax

mov eax, X4x4Required
shl eax, 2
mul destRowSize
mov edi, outMat
add edi, eax

mov edx, 0
.while edx < remainderX
mov ecx, 0
mov eax, 0
.while ecx<rows
mov ebx, dword ptr [esi+eax]
mov dword ptr [edi+4*ecx], ebx
add eax, sourceRowSize
inc ecx
.endw
add esi, 4
add edi, destRowSize
inc edx
.endw
.endif

; deal with columns not multiple of 4
.if remainderY >=1
mov eax, Y4x4Required
shl eax, 2
mul sourceRowSize
mov esi, inMat
add esi, eax

mov eax, Y4x4Required
shl eax, 4
mov edi, outMat
add edi, eax

mov edx,0
.while edx < remainderY
mov ecx, 0
mov eax, 0
.while ecx<cols
mov ebx, dword ptr [esi+4*ecx]
mov dword ptr [edi+eax], ebx
add eax, destRowSize
inc ecx
.endw
add esi, sourceRowSize
add edi, 4
inc edx
.endw
.endif

ret
MatrixTranspose endp
   
end   


x64:

option casemap:none
option frame:auto
OPTION STACKBASE:RBP

.code

MatrixTranspose proc public FRAME uses xmm6 rbx rsi rdi r12 outMat : ptr, inMat : ptr, rows : qword, cols : qword
LOCAL X4x4Required : qword
LOCAL Y4x4Required : qword
LOCAL remainderX : qword
LOCAL remainderY : qword

mov outMat, rcx
mov inMat, rdx
mov rows, r8
mov cols, r9

mov rdi, rcx
mov rsi, rdx

; How many cols % 4?
mov rax, cols
mov r10, 4
mov rdx, 0
div r10
mov X4x4Required, rax
mov remainderX, rdx

; How many rows %4?
mov rax, rows
mov r10, 4
mov rdx, 0
div r10
mov Y4x4Required, rax
mov remainderY, rdx

mov rax, rows
shl rax, 2
mov r8, rax ; r8 = destination row size in bytes

mov rax, cols
shl rax, 2
mov r12, rax ; r12 = source row size in bytes

mov r11, 0
.while r11<Y4x4Required ; rows % 4
mov r10,0
; find starting point for source
mov rax, r11
mul r12
shl rax, 2

mov rsi, inMat
add rsi, rax
mov rcx, rsi ; save
; find starting point for destination
mov rax, r11
shl rax, 4
mov rdi, outMat
add rdi, rax
mov r9, rdi ; save
.while r10<X4x4Required
mov rax, r10
shl rax, 4
mov rsi, rcx
add rsi, rax
movups xmm0, xmmword ptr [rsi]
add rsi, r12
movups xmm1, xmmword ptr [rsi]
add rsi, r12
movups xmm2, xmmword ptr [rsi]
add rsi, r12
movups xmm3, xmmword ptr [rsi]

movaps      xmm4,xmm0
movaps      xmm5,xmm2
unpcklps    xmm4,xmm1
unpcklps    xmm5,xmm3
unpckhps    xmm0,xmm1
unpckhps    xmm2,xmm3
movaps      xmm1,xmm4
movaps      xmm6,xmm0
movlhps     xmm4,xmm5
movlhps     xmm6,xmm2
movhlps     xmm5,xmm1
movhlps     xmm2,xmm0

mov rax, r8
shl rax, 2
mul r10
mov rdi, r9
add rdi, rax

movups xmmword ptr [rdi], xmm4
add rdi, r8
movups xmmword ptr [rdi], xmm5
add rdi, r8
movups xmmword ptr [rdi], xmm6
add rdi, r8
movups xmmword ptr [rdi], xmm2

inc r10
.endw

inc r11
.endw
; deal with rows not multiple of 4
.if remainderX >=1
mov rax, X4x4Required
shl rax, 4
mov rsi, inMat
add rsi, rax

mov rax, X4x4Required
shl rax, 2
mul r8
mov rdi, outMat
add rdi, rax

mov r10, 0
.while r10 < remainderX
mov rcx, 0
mov rdx, 0
.while rcx<rows
mov ebx, dword ptr [rsi+rdx]
mov dword ptr [rdi+4*rcx], ebx
add rdx, r12
inc rcx
.endw
add rsi, 4
add rdi, r8
inc r10
.endw
.endif

; deal with columns not multiple of 4
.if remainderY >=1
mov rax, Y4x4Required
shl rax, 2
mul r12
mov rsi, inMat
add rsi, rax

mov rax, Y4x4Required
shl rax, 4
mov rdi, outMat
add rdi, rax

mov r10,0
.while r10 < remainderY
mov rcx, 0
mov rdx, 0
.while rcx<cols
mov ebx, dword ptr [rsi+4*rcx]
mov dword ptr [rdi+rdx], ebx
add rdx, r8
inc rcx
.endw
add rsi, r12
add rdi, 4
inc r10
.endw
.endif

ret
MatrixTranspose endp

end


It can be tested with the following C++ console program, which builds random sized matrixes up to 99x99, transposes them and in the end shows what has been done and the time taken for the transposing operation.


#include <Windows.h>
#include <stdio.h>
#include <tchar.h>
#include <stdlib.h>
#include <time.h>

extern "C"
{
void MatrixTranspose(void* outMat, void* inMat, size_t rows, size_t cols);
}


int main()
{
int i, j;
int numRows, numCols;
int *inMat, *outMat;
LARGE_INTEGER frequency;
LARGE_INTEGER t1, t2;
double elapsedTime;

srand(time(NULL));
numRows = rand() % 99 + 1;
numCols = rand() % 99 + 1;
inMat = (int*)malloc(numRows*numCols * sizeof(int));
outMat = (int*)malloc(numRows*numCols * sizeof(int));
for (i = 0;i < numRows;i++)
for (j = 0;j < numCols;j++)
inMat[i*numCols + j] = rand() % 20 + 1;
QueryPerformanceFrequency(&frequency);
QueryPerformanceCounter(&t1);
MatrixTranspose(outMat, inMat, numRows, numCols);
QueryPerformanceCounter(&t2);
elapsedTime = (t2.QuadPart - t1.QuadPart) * 1000.0 / frequency.QuadPart;

printf("Rows=%d Cols=%d\n\n", numRows, numCols);
printf("Transpose - Input matrix\n");
for (int i = 0; i < numRows; i++)
{
for (int j = 0; j < numCols; j++)
printf("%d\t", inMat[i*numCols + j]);
printf("\n");
}
printf("\n");
printf("Transpose - Output matrix\n");
for (int i = 0; i < numCols; i++)
{
for (int j = 0; j < numRows; j++)
printf("%d\t", outMat[i*numRows + j]);
printf("\n");
}
printf("\n");
printf("Elapsed time: %f miliseconds", elapsedTime);
free(inMat);
free(outMat);
getchar(); return 0;
}




Siekmanski

Nice.  :t

Please use the latest 4*4 Transpose Matrix algorithm to gain some more speed:

http://masm32.com/board/index.php?topic=6127.msg65026#msg65026
Creative coders use backward thinking techniques as a strategy.

aw27

Quote from: Siekmanski on April 11, 2017, 01:35:25 AM
Nice.  :t

Please use the latest 4*4 Transpose Matrix algorithm to gain some more speed:

http://masm32.com/board/index.php?topic=6127.msg65026#msg65026

Thanks, I modified it!  :t