Continuing the saga on Matrix Transposing...
This is a solution for transposing matrixes of any size, squared or not. It supports as well small matrixes or matrixes with less than 4 rows or 4 columns.
It was tested with up to 10000 rows x 10000 columns, which means 100 million elements, where the transposing took less than half a second in my computer, fast enough for most requirements.
It works by transposing 4x4 blocks whenever possible, here i used basically the method presented by Siekmanski, what is left is transposed element by element.
Note: Since we are working here with matrixes of any number of rows and columns it may not benefit from 16-byte data alignment, except on the first row - So, I skipped the alignment convinced it will not help much.
x86:
.686
.XMM
.MODEL FLAT, C
option casemap:none
.code
MatrixTranspose proc public USES ebx esi edi outMat : ptr, inMat : ptr, rows : dword, cols : dword
LOCAL X4x4Required : dword
LOCAL Y4x4Required : dword
LOCAL remainderX : dword
LOCAL remainderY : dword
LOCAL destRowSize : dword
LOCAL sourceRowSize : dword
LOCAL savedDest : dword
mov edi, outMat
mov esi, inMat
; How many cols % 4?
mov eax, cols
mov ebx, 4
mov edx, 0
div ebx
mov X4x4Required, eax
mov remainderX, edx
; How many rows %4?
mov eax, rows
mov ebx, 4
mov edx, 0
div ebx
mov Y4x4Required, eax
mov remainderY, edx
mov eax, rows
shl eax, 2
mov destRowSize, eax
mov eax, cols
shl eax, 2
mov sourceRowSize, eax
mov ebx, 0
.while ebx<Y4x4Required ; rows % 4
; find starting point for source
mov eax, ebx
mul sourceRowSize
shl eax, 2
mov esi, inMat
add esi, eax
mov ecx, esi ; save
; find starting point for destination
mov eax, ebx
shl eax, 4
mov edi, outMat
add edi, eax
mov savedDest, edi ; save
push ebx
mov ebx,0
.while ebx<X4x4Required
mov eax, ebx
shl eax, 4
mov esi, ecx
add esi, eax
movups xmm0, xmmword ptr [esi]
add esi, sourceRowSize
movups xmm1, xmmword ptr [esi]
add esi, sourceRowSize
movups xmm2, xmmword ptr [esi]
add esi, sourceRowSize
movups xmm3, xmmword ptr [esi]
movaps xmm4,xmm0
movaps xmm5,xmm2
unpcklps xmm4,xmm1
unpcklps xmm5,xmm3
unpckhps xmm0,xmm1
unpckhps xmm2,xmm3
movaps xmm1,xmm4
movaps xmm6,xmm0
movlhps xmm4,xmm5
movlhps xmm6,xmm2
movhlps xmm5,xmm1
movhlps xmm2,xmm0
mov eax, destRowSize
shl eax, 2
mul ebx
mov edi, savedDest
add edi, eax
movups xmmword ptr [edi], xmm4
add edi, destRowSize
movups xmmword ptr [edi], xmm5
add edi, destRowSize
movups xmmword ptr [edi], xmm6
add edi, destRowSize
movups xmmword ptr [edi], xmm2
inc ebx
.endw
pop ebx
inc ebx
.endw
; deal with rows not multiple of 4
.if remainderX >=1
mov eax, X4x4Required
shl eax, 4
mov esi, inMat
add esi, eax
mov eax, X4x4Required
shl eax, 2
mul destRowSize
mov edi, outMat
add edi, eax
mov edx, 0
.while edx < remainderX
mov ecx, 0
mov eax, 0
.while ecx<rows
mov ebx, dword ptr [esi+eax]
mov dword ptr [edi+4*ecx], ebx
add eax, sourceRowSize
inc ecx
.endw
add esi, 4
add edi, destRowSize
inc edx
.endw
.endif
; deal with columns not multiple of 4
.if remainderY >=1
mov eax, Y4x4Required
shl eax, 2
mul sourceRowSize
mov esi, inMat
add esi, eax
mov eax, Y4x4Required
shl eax, 4
mov edi, outMat
add edi, eax
mov edx,0
.while edx < remainderY
mov ecx, 0
mov eax, 0
.while ecx<cols
mov ebx, dword ptr [esi+4*ecx]
mov dword ptr [edi+eax], ebx
add eax, destRowSize
inc ecx
.endw
add esi, sourceRowSize
add edi, 4
inc edx
.endw
.endif
ret
MatrixTranspose endp
end
x64:
option casemap:none
option frame:auto
OPTION STACKBASE:RBP
.code
MatrixTranspose proc public FRAME uses xmm6 rbx rsi rdi r12 outMat : ptr, inMat : ptr, rows : qword, cols : qword
LOCAL X4x4Required : qword
LOCAL Y4x4Required : qword
LOCAL remainderX : qword
LOCAL remainderY : qword
mov outMat, rcx
mov inMat, rdx
mov rows, r8
mov cols, r9
mov rdi, rcx
mov rsi, rdx
; How many cols % 4?
mov rax, cols
mov r10, 4
mov rdx, 0
div r10
mov X4x4Required, rax
mov remainderX, rdx
; How many rows %4?
mov rax, rows
mov r10, 4
mov rdx, 0
div r10
mov Y4x4Required, rax
mov remainderY, rdx
mov rax, rows
shl rax, 2
mov r8, rax ; r8 = destination row size in bytes
mov rax, cols
shl rax, 2
mov r12, rax ; r12 = source row size in bytes
mov r11, 0
.while r11<Y4x4Required ; rows % 4
mov r10,0
; find starting point for source
mov rax, r11
mul r12
shl rax, 2
mov rsi, inMat
add rsi, rax
mov rcx, rsi ; save
; find starting point for destination
mov rax, r11
shl rax, 4
mov rdi, outMat
add rdi, rax
mov r9, rdi ; save
.while r10<X4x4Required
mov rax, r10
shl rax, 4
mov rsi, rcx
add rsi, rax
movups xmm0, xmmword ptr [rsi]
add rsi, r12
movups xmm1, xmmword ptr [rsi]
add rsi, r12
movups xmm2, xmmword ptr [rsi]
add rsi, r12
movups xmm3, xmmword ptr [rsi]
movaps xmm4,xmm0
movaps xmm5,xmm2
unpcklps xmm4,xmm1
unpcklps xmm5,xmm3
unpckhps xmm0,xmm1
unpckhps xmm2,xmm3
movaps xmm1,xmm4
movaps xmm6,xmm0
movlhps xmm4,xmm5
movlhps xmm6,xmm2
movhlps xmm5,xmm1
movhlps xmm2,xmm0
mov rax, r8
shl rax, 2
mul r10
mov rdi, r9
add rdi, rax
movups xmmword ptr [rdi], xmm4
add rdi, r8
movups xmmword ptr [rdi], xmm5
add rdi, r8
movups xmmword ptr [rdi], xmm6
add rdi, r8
movups xmmword ptr [rdi], xmm2
inc r10
.endw
inc r11
.endw
; deal with rows not multiple of 4
.if remainderX >=1
mov rax, X4x4Required
shl rax, 4
mov rsi, inMat
add rsi, rax
mov rax, X4x4Required
shl rax, 2
mul r8
mov rdi, outMat
add rdi, rax
mov r10, 0
.while r10 < remainderX
mov rcx, 0
mov rdx, 0
.while rcx<rows
mov ebx, dword ptr [rsi+rdx]
mov dword ptr [rdi+4*rcx], ebx
add rdx, r12
inc rcx
.endw
add rsi, 4
add rdi, r8
inc r10
.endw
.endif
; deal with columns not multiple of 4
.if remainderY >=1
mov rax, Y4x4Required
shl rax, 2
mul r12
mov rsi, inMat
add rsi, rax
mov rax, Y4x4Required
shl rax, 4
mov rdi, outMat
add rdi, rax
mov r10,0
.while r10 < remainderY
mov rcx, 0
mov rdx, 0
.while rcx<cols
mov ebx, dword ptr [rsi+4*rcx]
mov dword ptr [rdi+rdx], ebx
add rdx, r8
inc rcx
.endw
add rsi, r12
add rdi, 4
inc r10
.endw
.endif
ret
MatrixTranspose endp
end
It can be tested with the following C++ console program, which builds random sized matrixes up to 99x99, transposes them and in the end shows what has been done and the time taken for the transposing operation.
#include <Windows.h>
#include <stdio.h>
#include <tchar.h>
#include <stdlib.h>
#include <time.h>
extern "C"
{
void MatrixTranspose(void* outMat, void* inMat, size_t rows, size_t cols);
}
int main()
{
int i, j;
int numRows, numCols;
int *inMat, *outMat;
LARGE_INTEGER frequency;
LARGE_INTEGER t1, t2;
double elapsedTime;
srand(time(NULL));
numRows = rand() % 99 + 1;
numCols = rand() % 99 + 1;
inMat = (int*)malloc(numRows*numCols * sizeof(int));
outMat = (int*)malloc(numRows*numCols * sizeof(int));
for (i = 0;i < numRows;i++)
for (j = 0;j < numCols;j++)
inMat[i*numCols + j] = rand() % 20 + 1;
QueryPerformanceFrequency(&frequency);
QueryPerformanceCounter(&t1);
MatrixTranspose(outMat, inMat, numRows, numCols);
QueryPerformanceCounter(&t2);
elapsedTime = (t2.QuadPart - t1.QuadPart) * 1000.0 / frequency.QuadPart;
printf("Rows=%d Cols=%d\n\n", numRows, numCols);
printf("Transpose - Input matrix\n");
for (int i = 0; i < numRows; i++)
{
for (int j = 0; j < numCols; j++)
printf("%d\t", inMat[i*numCols + j]);
printf("\n");
}
printf("\n");
printf("Transpose - Output matrix\n");
for (int i = 0; i < numCols; i++)
{
for (int j = 0; j < numRows; j++)
printf("%d\t", outMat[i*numRows + j]);
printf("\n");
}
printf("\n");
printf("Elapsed time: %f miliseconds", elapsedTime);
free(inMat);
free(outMat);
getchar(); return 0;
}
Nice. :t
Please use the latest 4*4 Transpose Matrix algorithm to gain some more speed:
http://masm32.com/board/index.php?topic=6127.msg65026#msg65026
Quote from: Siekmanski on April 11, 2017, 01:35:25 AM
Nice. :t
Please use the latest 4*4 Transpose Matrix algorithm to gain some more speed:
http://masm32.com/board/index.php?topic=6127.msg65026#msg65026
Thanks, I modified it! :t