Matrix Multiplication optimization.

Started by InfiniteLoop, January 28, 2020, 05:33:58 AM


InfiniteLoop

Hello again,

I am attempting to perform matrix multiplication. I've wasted ages trying to beat Clang and failed horribly. (512ms vs 530ms for 2048*2048 matrices)

I gave up and examined the assembly listing.

The only difference was the inner AVX loop. Theoretically you can only perform two 64-byte reads per cycle, so I don't see where the improvement comes from; surely the CPU already reads those 128 bytes with just two vmovaps?

The rest of the code is negligible, to the point that I can delete it all and the execution times are identical, so it's 100% read-speed limited.

extern "C" void ASM_Product_Matrix(float *dest, float* a, float* b_t, int num_cols, int num_rows);

Clang

Sums:
vmovaps ymm0, ymmword ptr [r8 + 4*rax]
vmovaps ymm1, ymmword ptr [r8 + 4*rax+32]
vmovaps ymm2, ymmword ptr [r8 + 4*rax+64]
vmovaps ymm3, ymmword ptr [r8 + 4*rax+96]
vfmadd231ps ymm4, ymm0, ymmword ptr [r9 + 4*rax]
vfmadd231ps ymm5, ymm1, ymmword ptr [r9 + 4*rax+32]
vfmadd231ps ymm4, ymm2, ymmword ptr [r9 + 4*rax+64]
vfmadd231ps ymm5, ymm3, ymmword ptr [r9 + 4*rax+96]

vmovaps ymm0, ymmword ptr [r8 + 4*rax+128]
vmovaps ymm1, ymmword ptr [r8 + 4*rax+160]
vfmadd231ps ymm4, ymm0, ymmword ptr [r9 + 4*rax+128]
vfmadd231ps ymm5, ymm1, ymmword ptr [r9 + 4*rax+160]

vmovaps ymm2, ymmword ptr [r8 + 4*rax+192]
vmovaps ymm3, ymmword ptr [r8 + 4*rax+224]
vfmadd231ps ymm4, ymm2, ymmword ptr [r9 + 4*rax+192]
vfmadd231ps ymm5, ymm3, ymmword ptr [r9 + 4*rax+224]

add eax, 64
cmp rax, [rsp+32] ;rows of C is columns of A and B
jne Sums



Mine:

xor eax, eax ;Zero eax
SumAVX: ;=================================================
vmovaps ymm0, ymmword ptr [r8 + 4*rax]
vfmadd231ps ymm1, ymm0, ymmword ptr [r9 + 4*rax]

vmovaps ymm2, ymmword ptr [r8 + 4*rax+32]
vfmadd231ps ymm3, ymm2, ymmword ptr [r9 + 4*rax+32]
add eax, 16
cmp eax, ebx
jnz SumAVX



aw27

Is it right that you are doing in 4x7=28 instructions what Clang does in 19?

InfiniteLoop

Clang uses 8 vmovups and 8 FMAs per loop, but otherwise it's converting the C++ verbatim.

This is the fastest I can make it. It computes Matrix C = Matrix A * transpose(Matrix B); the last two parameters are the columns and rows of C.
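The structure the routine below implements is roughly this (a C++ sketch of the same blocking scheme, with my own names; square matrices whose dimension is a multiple of the 32-element block size are assumed):

// Rough C++ equivalent of the blocking scheme in ASM_Product_Matrix_Old:
// C = A * transpose(B), with B already stored transposed (b_t).
// Only the output C is blocked; each element is a full-row dot product.
void product_matrix_blocked(float* c, const float* a, const float* b_t, int n)
{
    const int blocksize = 32;                      // same constant as below
    for (int br = 0; br < n; br += blocksize)      // block row of C
        for (int bc = 0; bc < n; bc += blocksize)  // block column of C
            for (int row = br; row < br + blocksize; ++row)
                for (int col = bc; col < bc + blocksize; ++col)
                {
                    float sum = 0.0f;
                    for (int k = 0; k < n; ++k)    // the SumAVX loop
                        sum += a[row * n + k] * b_t[col * n + k];
                    c[row * n + col] = sum;
                }
}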

;--------------------------------------------------------------------------------
;Matrix Multiply 1.00 (Matrix C, A, B^T, Dimension_X, Dimension_Y)
;--------------------------------------------------------------------------------
ASM_Product_Matrix_Old proc
;Set r10 to number of rows
mov r10, qword ptr [rsp+40]
;Save ymm registers
mov r11, rsp
sub rsp, 112
and rsp, -32
vmovdqa ymmword ptr [rsp], ymm6
vmovdqa ymmword ptr [rsp+32], ymm7
vmovdqa ymmword ptr [rsp+64], ymm8
vzeroall
push rbx
push rbp
push r12
push r13
push r14
push r15
push rdi
push rdx
push r8
push rcx
push r10
mov ebx, r9d
mov r12d, ebx
sal r12d, 2
sub rsp, 16
mov r9, r8
mov r8, rdx
mov r10, rcx

mov dword ptr [rsp+8], 32 ;blocksize
mov ecx, dword ptr [rsp+8]
mov eax, ecx
mul eax ;blocksize * blocksize
mov ecx, eax
mov eax, dword ptr [rsp+16] ;rows
mul ebx ;rows * cols
div ecx ;(rows.cols)/(blocksize.blocksize)
mov [rsp], eax ;blocks
mov eax, ebx
div dword ptr [rsp+8]
mov edi, eax ;block_columns
vxorps xmm4, xmm4, xmm4
vxorps xmm5, xmm5, xmm5
comment @
R8 A address
R9 B address
R10 C address
[rsp+40] A base
[rsp+32] B base
[rsp+24] C base
EBX total_cols
R12 total_cols * 4
[rsp+16] total_rows
[rsp+8] blocksize
[rsp] total_blocks
EDI blocks_per_column
R14 current_block_col
R15 current_block_row
RBP current_block
RAX,RCX,RDX, ESI, R13
Use stack for Matrix Start address
Set to Start of block
Increase rows
Increase columns
@
xor ebp, ebp
Block: ;EBP =================================================
xor r15d, r15d
;Reset Address of C to Start of Block
;((block count / blocks per column) * blocksize * no of columns + remainder edx * blocksize)*4
mov eax, ebp ;block count
div edi ;per col
mov ecx, edx ;ecx remainder
mul dword ptr [rsp+8]
mul ebx
mov r10d, eax ;from rows
;Reset address of A to Start of block.
;Set row to row of C
sal eax, 2
mov r8, [rsp+40]
add r8,  rax
mov eax, dword ptr [rsp+8] ;blocksize
mul ecx
add r10d, eax
;Reset address of B
;Set row to column of C
mul r12d
mov r9, [rsp+32]
add r9, rax
sal r10d, 2 ;multiply by 4
add r10, [rsp+24]
Row: ;R15=================================================
xor r14d, r14d
Col: ;R14=================================================
xor eax, eax ;Zero eax
SumAVX: ;=================================================
vmovaps ymm0, ymmword ptr [r8 + 4*rax]
vmovaps ymm1, ymmword ptr [r8 + 4*rax+32]
vmovaps ymm2, ymmword ptr [r8 + 4*rax+64]
vmovaps ymm3, ymmword ptr [r8 + 4*rax+96]
vfmadd231ps ymm4, ymm0, ymmword ptr [r9 + 4*rax]
vfmadd231ps ymm5, ymm1, ymmword ptr [r9 + 4*rax+32]
vfmadd231ps ymm4, ymm2, ymmword ptr [r9 + 4*rax+64]
vfmadd231ps ymm5, ymm3, ymmword ptr [r9 + 4*rax+96]

vmovaps ymm0, ymmword ptr [r8 + 4*rax+128]
vmovaps ymm1, ymmword ptr [r8 + 4*rax+160]
vfmadd231ps ymm4, ymm0, ymmword ptr [r9 + 4*rax+128]
vfmadd231ps ymm5, ymm1, ymmword ptr [r9 + 4*rax+160]

vmovaps ymm2, ymmword ptr [r8 + 4*rax+192]
vmovaps ymm3, ymmword ptr [r8 + 4*rax+224]
vfmadd231ps ymm4, ymm2, ymmword ptr [r9 + 4*rax+192]
vfmadd231ps ymm5, ymm3, ymmword ptr [r9 + 4*rax+224]
add eax, 64
cmp eax, ebx
jnz SumAVX
;Add FMA values
;Horizontal sum
vaddps ymm0, ymm4, ymm5
vhaddps ymm2, ymm0, ymm0
vhaddps ymm1, ymm2, ymm2
vextractf128 xmm0, ymm1, 1
vaddss xmm0, xmm1, xmm0
vmovss real4 ptr[r10+4*r14], xmm0
;Empty registers.
vxorps xmm4, xmm4, xmm4
vxorps xmm5, xmm5, xmm5
;Increase B row.
mov eax, r12d
add r9, rax
inc r14d
cmp r14d, dword ptr [rsp+8]
jnz Col
;Increase A row.
add r8, rax
;Reset B row.
mul dword ptr [rsp+8]
sub r9, rax
;Increase C row. yay
mov eax, r12d
add r10, rax
inc r15d
cmp r15d, dword ptr [rsp+8]
jnz Row
inc ebp
cmp ebp, dword ptr [rsp]
jnz Block
add rsp, 48
pop rdi
pop r15
pop r14
pop r13
pop r12
pop rbp
pop rbx
vmovdqa ymm6, ymmword ptr [rsp]
vmovdqa ymm7, ymmword ptr [rsp+32]
vmovdqa ymm8, ymmword ptr [rsp+64]
mov rsp, r11
ret
ASM_Product_Matrix_Old endp
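For reference, a minimal way to call it from C++ (a hypothetical test driver, not my benchmark harness): vmovaps needs 32-byte aligned buffers, and the dimensions have to be multiples of the 64-element unroll and the 32-element block size, which 2048 satisfies.

#include <malloc.h>   // _aligned_malloc / _aligned_free (MSVC)

extern "C" void ASM_Product_Matrix(float* dest, float* a, float* b_t,
                                   int num_cols, int num_rows);

int main()
{
    const int n = 2048;
    float* A  = (float*)_aligned_malloc(n * n * sizeof(float), 32);
    float* Bt = (float*)_aligned_malloc(n * n * sizeof(float), 32);
    float* C  = (float*)_aligned_malloc(n * n * sizeof(float), 32);
    for (int i = 0; i < n * n; ++i) { A[i] = 1.0f; Bt[i] = 2.0f; C[i] = 0.0f; }

    ASM_Product_Matrix(C, A, Bt, n, n);   // C = A * transpose(B)

    _aligned_free(A); _aligned_free(Bt); _aligned_free(C);
    return 0;
}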

aw27

Now, I don't know what you want.
Have a look at Agner Fog's Assembly optimization tips and see if they can help you.

daydreamer

Quote from: AW on January 28, 2020, 04:55:10 PM
Now, I don't know what you want.
Have a look at Agner Fog's Assembly optimization tips and see if they can help you.
I have read Agner Fog's material about the number of execution units in modern CPUs that can do different things, and I can see the Clang programmers have too, so what Clang produces is more than just an unrolled loop.
Take a look at that section.
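The idea in intrinsics form (my own illustration, not code from this thread): independent accumulators keep both FMA units busy instead of serializing on one dependency chain. Assumes n is a multiple of 16, 32-byte aligned pointers, and AVX2/FMA enabled at compile time.

#include <immintrin.h>

// Two independent accumulator chains: each vfmadd only depends on its own
// accumulator, so the FMA units don't stall on the ~4-5 cycle FMA latency.
float dot_two_accumulators(const float* a, const float* b, int n)
{
    __m256 acc0 = _mm256_setzero_ps();
    __m256 acc1 = _mm256_setzero_ps();
    for (int i = 0; i < n; i += 16)
    {
        acc0 = _mm256_fmadd_ps(_mm256_load_ps(a + i),     _mm256_load_ps(b + i),     acc0);
        acc1 = _mm256_fmadd_ps(_mm256_load_ps(a + i + 8), _mm256_load_ps(b + i + 8), acc1);
    }
    __m256 acc = _mm256_add_ps(acc0, acc1);
    // Horizontal sum of the 8 lanes.
    __m128 lo = _mm256_castps256_ps128(acc);
    __m128 hi = _mm256_extractf128_ps(acc, 1);
    lo = _mm_add_ps(lo, hi);
    lo = _mm_hadd_ps(lo, lo);
    lo = _mm_hadd_ps(lo, lo);
    return _mm_cvtss_f32(lo);
}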

aw27

2048x2048 matrix multiplication using CUDA (sample in CUDA SDK):


c:\matrixMul.exe -wA=2048 -hA=2048 -wB=2048 -hB=2048
[Matrix Multiply Using CUDA] - Starting...
GPU Device 0: "GeForce GTX 1060 6GB" with compute capability 6.1

MatrixA(2048,2048), MatrixB(2048,2048)
Computing result using CUDA Kernel...
done
Performance= 630.04 GFlop/s, Time= 27.268 msec, Size= 17179869184 Ops, WorkgroupSize= 1024 threads/block
Checking computed result for correctness: Result = PASS

NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.



Average Time: 27.268 ms (average of 50 iterations).


8192x8192 matrix multiplication using CUDA:


c:\matrixMul.exe -wA=8192 -hA=8192 -wB=8192 -hB=8192
[Matrix Multiply Using CUDA] - Starting...
GPU Device 0: "GeForce GTX 1060 6GB" with compute capability 6.1

MatrixA(8192,8192), MatrixB(8192,8192)
Computing result using CUDA Kernel...
done
Performance= 636.52 GFlop/s, Time= 1727.381 msec, Size= 1099511627776 Ops, WorkgroupSize= 1024 threads/block
Checking computed result for correctness: Result = PASS

NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.


Average of 10 iterations.

InfiniteLoop

Edit: According to the Intel MKL documentation, the total number of operations is (2 * rows - 1) * number of elements. Since it takes 125 ms for 2048x2048, that works out to about 137 GFLOPS.
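As a quick sanity check on that figure (my own arithmetic, using the same formula):

#include <cstdio>

int main()
{
    // (2 * rows - 1) * number of elements, for 2048 x 2048 in 0.125 s
    const double ops = (2.0 * 2048 - 1) * 2048 * 2048;   // about 1.72e10
    std::printf("%.1f GFLOPS\n", ops / 0.125 / 1e9);      // prints ~137.4
    return 0;
}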

How is the Intel MKL so fast? The best I can do is 300ms for 2048x2048.

2048x2048 Multiplication using Intel MKL:

#include <iostream>
#include <Windows.h>
#include <chrono>

#include <mkl.h>

int main()
{
    float* A, * B, * C;
    int m = 2048;
    int n = 2048;
    int k = m;

    A = (float*)mkl_malloc(m * n * sizeof(float), 64);
    B = (float*)mkl_malloc(m * n * sizeof(float), 64);
    C = (float*)mkl_malloc(m * n * sizeof(float), 64);

    for (int i = 0; i < m * n; i++)
    {
        float value = (float)rand() / (float)RAND_MAX;
        //float value = 0.0f;
        A[i] = value;
        B[i] = value;
        C[i] = 0.0f;
    }

    A[0] = 1.0f;
    A[1] = 2.0f;
    A[m] = 3.0f;
    A[m + 1] = 4.0f;
    B[0] = 5.0f;
    B[1] = 6.0f;
    B[m] = 7.0f;
    B[m + 1] = 8.0f;

    std::chrono::steady_clock::time_point Start;
    Start = std::chrono::steady_clock::now();

    std::cout << " Test Intel MKL." << std::endl;

    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, m, k, 1.0f, A, k, B, n, 1.0f, C, n);

    std::cout << " Time " << std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - Start).count() << " us " << std::endl;

    for (int j = 0; j < 4; j++)
    {
        for (int i = 0; i < 4; i++)
        {
            std::cout << C[i + j * n] << "  ";
        }
        std::cout << std::endl;
    }

    Sleep(5000);

    mkl_free(A);
    mkl_free(B);
    mkl_free(C);

    return 0;
}


Single-threaded "sequential": ~125ms
Multi-threaded "parallel": < 50ms