Fast Matrix Transposing - All matrix sizes

aw27 · April 11, 2017, 12:06:16 AM

Continuing the saga on Matrix Transposing...
This is a solution for transposing matrixes of any size, squared or not. It supports as well small matrixes or matrixes with less than 4 rows or 4 columns.
It was tested with up to 10000 rows x 10000 columns, which means 100 million elements, where the transposing took less than half a second in my computer, fast enough for most requirements.
It works by transposing 4x4 blocks whenever possible, here i used basically the method presented by Siekmanski, what is left is transposed element by element.
Note: Since we are working here with matrixes of any number of rows and columns it may not benefit from 16-byte data alignment, except on the first row - So, I skipped the alignment convinced it will not help much.

x86:

Code Select


.686
.XMM

    .MODEL FLAT, C
    option casemap:none

.code
    
MatrixTranspose proc public USES ebx esi edi outMat : ptr, inMat : ptr, rows : dword, cols : dword
	LOCAL X4x4Required : dword
	LOCAL Y4x4Required : dword
	LOCAL remainderX : dword
	LOCAL remainderY : dword
	LOCAL destRowSize : dword
	LOCAL sourceRowSize : dword
	LOCAL savedDest : dword

	mov edi, outMat
	mov esi, inMat

	; How many cols % 4?
	mov eax, cols
	mov ebx, 4
	mov edx, 0
	div ebx
	mov X4x4Required, eax
	mov remainderX, edx
	
	; How many rows %4?	
	mov eax, rows
	mov ebx, 4
	mov edx, 0
	div ebx
	mov Y4x4Required, eax
	mov remainderY, edx
	
	mov eax, rows
	shl eax, 2
	mov destRowSize, eax
	
	mov eax, cols
	shl eax, 2
	mov sourceRowSize, eax
	
	mov ebx, 0
	.while ebx<Y4x4Required ; rows % 4
		; find starting point for source
		mov eax, ebx
		mul sourceRowSize
		shl eax, 2
		
		mov esi, inMat		
		add esi, eax
		mov ecx, esi ; save
		; find starting point for destination
		mov eax, ebx
		shl eax, 4
		mov edi, outMat
		add edi, eax 
		mov savedDest, edi ; save
		push ebx
		
		mov ebx,0
		.while ebx<X4x4Required
			mov eax, ebx
			shl eax, 4
			mov esi, ecx
			add esi, eax
			movups xmm0, xmmword ptr [esi]
			add esi, sourceRowSize
			movups xmm1, xmmword ptr [esi]
			add esi, sourceRowSize
			movups xmm2, xmmword ptr [esi]
			add esi, sourceRowSize
			movups xmm3, xmmword ptr [esi]
			
			movaps      xmm4,xmm0
			movaps      xmm5,xmm2
			unpcklps    xmm4,xmm1
			unpcklps    xmm5,xmm3
			unpckhps    xmm0,xmm1
			unpckhps    xmm2,xmm3
			movaps      xmm1,xmm4
			movaps      xmm6,xmm0
			movlhps     xmm4,xmm5
			movlhps     xmm6,xmm2
			movhlps     xmm5,xmm1
			movhlps     xmm2,xmm0

			mov eax, destRowSize
			shl eax, 2
			mul ebx
			mov edi, savedDest			
			add edi, eax
			
			movups xmmword ptr [edi], xmm4
			add edi, destRowSize
			movups xmmword ptr [edi], xmm5
			add edi, destRowSize
			movups xmmword ptr [edi], xmm6
			add edi, destRowSize
			movups xmmword ptr [edi], xmm2

			inc ebx
		.endw
		pop ebx
		inc ebx
	.endw
	; deal with rows not multiple of 4	
	.if remainderX >=1
		mov eax, X4x4Required
		shl eax, 4
		mov esi, inMat
		add esi, eax
			
		mov eax, X4x4Required
		shl eax, 2
		mul destRowSize
		mov edi, outMat			
		add edi, eax
		
		mov edx, 0
		.while edx < remainderX
			mov ecx, 0
			mov eax, 0
			.while ecx<rows
				mov ebx, dword ptr [esi+eax]
				mov dword ptr [edi+4*ecx], ebx
				add eax, sourceRowSize
				inc ecx
			.endw
			add esi, 4
			add edi, destRowSize
			inc edx
		.endw
	.endif
	
	; deal with columns not multiple of 4
	.if remainderY >=1
		mov eax, Y4x4Required
		shl eax, 2
		mul sourceRowSize
		mov esi, inMat
		add esi, eax

		mov eax, Y4x4Required
		shl eax, 4
		mov edi, outMat
		add edi, eax
	
		mov edx,0
		.while edx < remainderY
			mov ecx, 0
			mov eax, 0
			.while ecx<cols
				mov ebx, dword ptr [esi+4*ecx]
				mov dword ptr [edi+eax], ebx
				add eax, destRowSize
				inc ecx
			.endw
			add esi, sourceRowSize
			add edi, 4
			inc edx
		.endw
	.endif
	
	ret
MatrixTranspose endp
    
end

x64:

Code Select


option casemap:none
option frame:auto
OPTION STACKBASE:RBP

.code

MatrixTranspose proc public FRAME uses xmm6 rbx rsi rdi r12 outMat : ptr, inMat : ptr, rows : qword, cols : qword
	LOCAL X4x4Required : qword
	LOCAL Y4x4Required : qword
	LOCAL remainderX : qword
	LOCAL remainderY : qword

	mov outMat, rcx
	mov inMat, rdx
	mov rows, r8
	mov cols, r9

	mov rdi, rcx
	mov rsi, rdx

	; How many cols % 4?
	mov rax, cols
	mov r10, 4
	mov rdx, 0
	div r10
	mov X4x4Required, rax
	mov remainderX, rdx
	
	; How many rows %4?	
	mov rax, rows
	mov r10, 4
	mov rdx, 0
	div r10
	mov Y4x4Required, rax
	mov remainderY, rdx
	
	mov rax, rows
	shl rax, 2
	mov r8, rax ; r8 = destination row size in bytes
	
	mov rax, cols
	shl rax, 2
	mov r12, rax ; r12 = source row size in bytes
	
	mov r11, 0
	.while r11<Y4x4Required ; rows % 4
		mov r10,0
		; find starting point for source
		mov rax, r11
		mul r12
		shl rax, 2
		
		mov rsi, inMat		
		add rsi, rax
		mov rcx, rsi ; save
		; find starting point for destination
		mov rax, r11
		shl rax, 4
		mov rdi, outMat
		add rdi, rax 
		mov r9, rdi ; save
		.while r10<X4x4Required
			mov rax, r10
			shl rax, 4
			mov rsi, rcx
			add rsi, rax
			movups xmm0, xmmword ptr [rsi]
			add rsi, r12
			movups xmm1, xmmword ptr [rsi]
			add rsi, r12
			movups xmm2, xmmword ptr [rsi]
			add rsi, r12
			movups xmm3, xmmword ptr [rsi]

			movaps      xmm4,xmm0
			movaps      xmm5,xmm2
			unpcklps    xmm4,xmm1
			unpcklps    xmm5,xmm3
			unpckhps    xmm0,xmm1
			unpckhps    xmm2,xmm3
			movaps      xmm1,xmm4
			movaps      xmm6,xmm0
			movlhps     xmm4,xmm5
			movlhps     xmm6,xmm2
			movhlps     xmm5,xmm1
			movhlps     xmm2,xmm0

			mov rax, r8
			shl rax, 2
			mul r10
			mov rdi, r9			
			add rdi, rax
			
			movups xmmword ptr [rdi], xmm4
			add rdi, r8
			movups xmmword ptr [rdi], xmm5
			add rdi, r8
			movups xmmword ptr [rdi], xmm6
			add rdi, r8
			movups xmmword ptr [rdi], xmm2

			inc r10
		.endw
		
		inc r11
	.endw
	; deal with rows not multiple of 4	
	.if remainderX >=1
		mov rax, X4x4Required
		shl rax, 4
		mov rsi, inMat
		add rsi, rax
			
		mov rax, X4x4Required
		shl rax, 2
		mul r8
		mov rdi, outMat			
		add rdi, rax
		
		mov r10, 0
		.while r10 < remainderX
			mov rcx, 0
			mov rdx, 0
			.while rcx<rows
				mov ebx, dword ptr [rsi+rdx]
				mov dword ptr [rdi+4*rcx], ebx
				add rdx, r12
				inc rcx
			.endw
			add rsi, 4
			add rdi, r8
			inc r10
		.endw
	.endif
	
	; deal with columns not multiple of 4
	.if remainderY >=1
		mov rax, Y4x4Required
		shl rax, 2
		mul r12
		mov rsi, inMat
		add rsi, rax

		mov rax, Y4x4Required
		shl rax, 4
		mov rdi, outMat
		add rdi, rax
	
		mov r10,0
		.while r10 < remainderY
			mov rcx, 0
			mov rdx, 0
			.while rcx<cols
				mov ebx, dword ptr [rsi+4*rcx]
				mov dword ptr [rdi+rdx], ebx
				add rdx, r8
				inc rcx
			.endw
			add rsi, r12
			add rdi, 4
			inc r10
		.endw
	.endif
	
	ret
MatrixTranspose endp

end

It can be tested with the following C++ console program, which builds random sized matrixes up to 99x99, transposes them and in the end shows what has been done and the time taken for the transposing operation.

Code Select


#include <Windows.h>
#include <stdio.h>
#include <tchar.h>
#include <stdlib.h>
#include <time.h>

extern "C"
{
	void MatrixTranspose(void* outMat, void* inMat, size_t rows, size_t cols);
}


int main()
{
	int i, j;
	int numRows, numCols;
	int *inMat, *outMat;
	LARGE_INTEGER frequency;
	LARGE_INTEGER t1, t2;
	double elapsedTime;

	srand(time(NULL));
	numRows = rand() % 99 + 1;
	numCols = rand() % 99 + 1;
	inMat = (int*)malloc(numRows*numCols * sizeof(int));
	outMat = (int*)malloc(numRows*numCols * sizeof(int));
	for (i = 0;i < numRows;i++)
		for (j = 0;j < numCols;j++)
			inMat[i*numCols + j] = rand() % 20 + 1;
	QueryPerformanceFrequency(&frequency);
	QueryPerformanceCounter(&t1);
	MatrixTranspose(outMat, inMat, numRows, numCols);
	QueryPerformanceCounter(&t2);
	elapsedTime = (t2.QuadPart - t1.QuadPart) * 1000.0 / frequency.QuadPart;

	printf("Rows=%d Cols=%d\n\n", numRows, numCols);
	printf("Transpose - Input matrix\n");
	for (int i = 0; i < numRows; i++)
	{
		for (int j = 0; j < numCols; j++)
			printf("%d\t", inMat[i*numCols + j]);
		printf("\n");
	}
	printf("\n");
	printf("Transpose - Output matrix\n");
	for (int i = 0; i < numCols; i++)
	{
		for (int j = 0; j < numRows; j++)
			printf("%d\t", outMat[i*numRows + j]);
		printf("\n");
	}
	printf("\n");
	printf("Elapsed time: %f miliseconds", elapsedTime);
	free(inMat);
	free(outMat);
	getchar();	return 0;
}

Siekmanski · April 11, 2017, 01:35:25 AM

Nice. :t

Please use the latest 4*4 Transpose Matrix algorithm to gain some more speed:

http://masm32.com/board/index.php?topic=6127.msg65026#msg65026

aw27 · April 11, 2017, 03:12:39 AM

Quote from: Siekmanski on April 11, 2017, 01:35:25 AM
Nice. :t

Please use the latest 4*4 Transpose Matrix algorithm to gain some more speed:

http://masm32.com/board/index.php?topic=6127.msg65026#msg65026

Thanks, I modified it! :t

The MASM Forum

News:

Fast Matrix Transposing - All matrix sizes

aw27

Siekmanski

aw27