News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests
NB: Posting URL's See here: Posted URL Change

Main Menu

Using the GPU

Started by Biterider, April 30, 2019, 07:26:14 PM

Previous topic - Next topic

aw27

This is a simplified version of the log 2 example produced by LiaoMi, under 3 API:
- Runtime API
- Driver API
- Driver API in MASM

Common C/C++


#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>


#define N 32

extern "C" __global__ void log_2(float* inout, int length) {
int xind = blockIdx.x * blockDim.x + threadIdx.x;
if (xind < length) {
float tempFloat = log2(*(inout + xind));
*(inout +  xind) = tempFloat;
}
}

//#define RUNTIMEAPI 1
//#define DRIVERAPI 1
#define MASMAPI 1

extern "C" int logasm2();
extern "C" char PTXFunction;
extern "C" char PTXSourceData;

#if defined(RUNTIMEAPI)
void testruntimeAPi()
{
float hinOut[N];
float* dinOut;
cudaMalloc((void**)& dinOut, N * sizeof(float));

for (int i = 1; i <= N; ++i) {
hinOut[i-1] = (float)i;
}
cudaMemcpy(dinOut, hinOut, N * sizeof(float), cudaMemcpyHostToDevice);
log_2 <<<N, 1 >>> (dinOut, N);
cudaMemcpy(hinOut, dinOut, N * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < N; ++i) {
printf("%d %f\n", i+1, hinOut[i]);
}
cudaFree(dinOut);
getchar();
}
#endif

#if defined(DRIVERAPI)
int testDriverApi()
{
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUfunction cudaFunction;
CUresult result;
CUdeviceptr dinOut;
int count;

float hinOut[N];
unsigned int memSize = sizeof(float) * N;

if (cuInit(0))
return 1;
if (cuDeviceGet(&cuDevice, 0))
return 1;
if (cuCtxCreate(&cuContext, 0, cuDevice))
return 1;
if (cuModuleLoadData(&cuModule, &PTXSourceData))
return 1;
if (cuModuleGetFunction(&cudaFunction, cuModule, &PTXFunction))
return 1;
for (int i = 1; i <= N; ++i) {
hinOut[i - 1] = (float)i;
}
cuMemAlloc(&dinOut, memSize);
cuMemcpyHtoD(dinOut, hinOut, memSize);
count = N;
void *args[] = { &dinOut, &count };

result = cuLaunchKernel(cudaFunction, N, 1, 1, 1, 1, 1, 0, 0, &args[0], 0);
if (result!=CUDA_SUCCESS)
return 1;
cuCtxSynchronize();
cuMemcpyDtoH(hinOut, dinOut, memSize);
for (int i = 0; i < N; ++i) {
printf("%d %f\n", i + 1, hinOut[i]);
}
cuMemFree(dinOut);

getchar();
return 0;
}
#endif

#if defined(MASMAPI)
void testMasmAPI()
{
int err=logasm2();
printf("Error %d (0=NO ERROR)",err);
getchar();
return;
}
#endif

int main()
{
#if RUNTIMEAPI
testruntimeAPi();
#elif DRIVERAPI
testDriverApi();
#elif MASMAPI
testMasmAPI();
#endif
return 0;
}




MASM:


OPTION casemap:none

N equ 32
public PTXFunction
public PTXSourceData

printf proto :ptr, :vararg

cuInit proto :dword
cuDeviceGet proto :ptr, :dword
cuCtxCreate_v2 proto :ptr, :dword, :dword
cuModuleLoadData proto :ptr, :ptr
cuModuleGetFunction proto :ptr, :ptr, :ptr
cuMemAlloc_v2 proto :ptr, :qword
cuMemcpyHtoD_v2 proto :ptr, :ptr, :qword
cuMemcpyDtoH_v2 proto :ptr, :ptr, :qword
cuLaunchKernel proto :ptr, :dword, :dword, :dword, :dword, :dword, :dword, :ptr, :ptr, :ptr
cuCtxSynchronize proto
cuMemFree_v2 proto :ptr
cuCtxGetApiVersion proto :ptr, :ptr

.code

logasm2 proc
LOCAL cuDevice:dword
LOCAL cuContext:ptr
LOCAL cuModule:ptr
LOCAL cudaFunction:ptr
LOCAL dInOut: qword
LOCAL hInOut[N]:real4
LOCAL memSize : qword
LOCAL args[2] : ptr
LOCAL qt : dword;

sub rsp, 58h
and rsp, -16 ; align
mov eax, sizeof REAL4 * N
mov memSize, rax
mov rcx,0
call cuInit
cmp eax, 0
jnz @exit

lea rcx, cuDevice
mov edx,0
call cuDeviceGet
cmp eax, 0
jnz @exit

lea rcx, cuContext
mov edx, 0
mov r8d, cuDevice
call cuCtxCreate_v2
cmp eax, 0
jnz @exit

lea rcx, cuModule
mov rdx, offset PTXSourceData
call cuModuleLoadData
cmp eax, 0
jnz @exit

lea rcx, cudaFunction
mov rdx, cuModule
lea r8, PTXFunction
call cuModuleGetFunction
cmp eax, 0
jnz @exit

mov ecx, N
lea r11, hInOut
fld1
fld1
@@:
fst dword ptr [r11]
fadd st,st(1)
add r11, 4
dec ecx
jnz @B

lea rcx, dInOut
mov rdx, memSize
call cuMemAlloc_v2
cmp eax, 0
jnz @exit

mov rcx, dInOut
lea rdx, hInOut
mov r8, memSize
call cuMemcpyHtoD_v2
cmp eax, 0
jnz @exit

lea rax, dInOut
mov qword ptr args, rax
mov qt, N
lea rax, qt
mov qword ptr args+8, rax

mov rcx, cudaFunction
mov edx ,1
mov r8d, 1
mov r9d, 1
mov dword ptr [rsp+20h], N
mov dword ptr [rsp+28h], 1
mov dword ptr [rsp+30h], 1
mov dword ptr [rsp+38h], 0
mov qword ptr [rsp+40h], 0
lea rax, args
mov qword ptr [rsp+48h], rax
mov qword ptr [rsp+50h], 0
call cuLaunchKernel
cmp eax, 0
jnz @exit

call cuCtxSynchronize
cmp eax, 0
jnz @exit

lea rcx, hInOut
mov rdx, dInOut
mov r8, memSize
call cuMemcpyDtoH_v2
cmp eax, 0
jnz @exit

mov ebx, 1
lea rdi, hInOut
@@:
mov rcx, offset report
mov edx, ebx
fld dword ptr [rdi]
fstp qword ptr [rsp+20h]
mov r8, qword ptr [rsp+20h]
call printf
add rdi, 4
inc ebx
cmp ebx, N
jle @B

mov rcx, dInOut
call cuMemFree_v2
mov eax,0
@exit:
ret
logasm2 endp

.data
report db "%d %f",10,0

; Generated by NVIDIA NVVM Compiler
; Compiler Build ID: CL-25769353
; Cuda compilation tools, release 10.1, V10.1.105
; Based on LLVM 3.4svn
PTXFunction db 'log_2',0
PTXSourceData \
db '.version 6.4 ',10
db '.target sm_30 ',10
db '.address_size 64 ',10
db ' ',10
db ' // .globl log_2 ',10
db ' ',10
db '.visible .entry log_2( ',10
db ' .param .u64 log_2_param_0, ',10
db ' .param .u32 log_2_param_1 ',10
db ') ',10
db '{ ',10
db ' .reg .pred %p<2>; ',10
db ' .reg .f32 %f<3>; ',10
db ' .reg .b32 %r<6>; ',10
db ' .reg .b64 %rd<5>; ',10
db ' ',10
db ' ',10
db ' ld.param.u64 %rd1, [log_2_param_0]; ',10
db ' ld.param.u32 %r2, [log_2_param_1]; ',10
db ' mov.u32 %r3, %ctaid.x; ',10
db ' mov.u32 %r4, %ntid.x; ',10
db ' mov.u32 %r5, %tid.x; ',10
db ' mad.lo.s32 %r1, %r4, %r3, %r5; ',10
db ' setp.ge.s32 %p1, %r1, %r2; ',10
db ' @%p1 bra BB0_2; ',10
db ' ',10
db ' cvta.to.global.u64 %rd2, %rd1; ',10
db ' mul.wide.s32 %rd3, %r1, 4; ',10
db ' add.s64 %rd4, %rd2, %rd3; ',10
db ' ld.global.f32 %f1, [%rd4]; ',10
db ' lg2.approx.ftz.f32 %f2, %f1; ',10
db ' st.global.f32 [%rd4], %f2; ',10
db ' ',10
db 'BB0_2: ',10
db ' ret; ',10
db '} ',10
db 0
end



Output:
1 0.000000
2 1.000000
3 1.584962
4 2.000000
5 2.321928
6 2.584962
7 2.807355
8 3.000000
9 3.169925
10 3.321928
11 3.459432
12 3.584962
13 3.700440
14 3.807355
15 3.906890
16 4.000000
17 4.087463
18 4.169925
19 4.247927
20 4.321928
21 4.392317
22 4.459432
23 4.523562
24 4.584962
25 4.643856
26 4.700439
27 4.754887
28 4.807355
29 4.857981
30 4.906890
31 4.954196
32 5.000000
Error 0 (0=NO ERROR)

TimoVJL

clang backend supports CUDA :thumbsup:
So for textual .ptx, only ptxas / fatbinary stubs are needed.
a minimal ci_include.htypedef struct _uint3
{
    unsigned int x, y, z;
}uint3;

typedef struct _dim3
{
    unsigned int x, y, z;
}dim3;

extern const uint3 threadIdx;
extern const uint3 blockIdx;
extern const dim3 blockDim;
extern const dim3 gridDim;
extern const int warpSize;

#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))

#define bool _Bool
a fake ptxas just copy that temporary .s file#define WIN32_LEAN_AND_MEAN
#include <windows.h>

int __cdecl main(int argc, char **argv)
{
char fname[260];
char *ptx = argv[argc-1];
while(*ptx) ptx++; // find end
char *pend = ptx;
while(*pend != '-') pend--;
ptx = pend;
while(*ptx != '\\') ptx--;
ptx++;
strcpy(fname, ptx);
char *pext = fname + (pend - ptx);
*(long*)pext = *(long*)".ptx"; // add ext
*(pext+4) = 0; // cut name
return !CopyFile(argv[argc-1], fname, FALSE);
}
a commandline used in testsclang.exe -c add.cu -nocudainc -nocudalib --cuda-gpu-arch=sm_30
May the source be with you

LiaoMi

Quote from: TimoVJL on May 24, 2019, 07:50:02 PM
clang backend supports CUDA :thumbsup:

Hi TimoVJL,

need to try, there is a web compiler for Cuda and Cuda LLVM
https://cuda.godbolt.org/
ordinary compiler
https://godbolt.org/

CUDA Compression
A GPU-based LZSS compression algorithm, highly tuned for NVIDIA GPGPUs and for streaming data, leveraging the respective strengths of CPUs and GPUs together - https://github.com/adnanozsoy/CUDA_Compression
Cuda lzss compression https://github.com/abshkbh/cuda-lzss
Algorithms for Compression on GPUs http://www2.imm.dtu.dk/pubdb/views/edoc_download.php/6642/pdf/imm6642.pdf
Compression library using Nvidia's CUDA - https://stackoverflow.com/questions/456829/compression-library-using-nvidias-cuda
Parallel lossless compression using GPUs http://on-demand.gputechconf.com/gtc/2014/presentations/S4459-parallel-lossless-compression-using-gpus.pdf
Breakthrough in CUDA data compression https://www.wave-access.com/public_en/blog/2011/april/22/breakthrough-in-cuda-data-compression.aspx

The results:

BMP, 540 Kb
Full bzip2 compression on CPU - 218 ms
BW Transform on CPU - 171 ms
Full bzip2 compression on GPU - 93 ms [ minus 53% ]
BW Transform on GPU - 46 ms [ minus 73% ]

BMP, 1112 Kb
Full bzip2 compression on CPU - 467 ms
BW Transform on CPU - 343 ms
Full bzip2 compression on GPU - 249 ms [ minus 46% ]
BW Transform on GPU - 140 ms [ minus 59% ]

PDF, 1919 Kb
Full bzip2 compression on CPU - 1513 ms
BW Transform on CPU - 731 ms
Full bzip2 compression on GPU - 1107 ms [ minus 26% ]
BW Transform on GPU - 311 ms [ minus 57% ]

PDF, 3425 Kb
Full bzip2 compression on CPU - 2168 ms
BW Transform on CPU - 793 ms
Full bzip2 compression on GPU - 1856 ms [ minus 14% ]
BW Transform on GPU - 481 ms [ minus 39% ]




Learn How To Do Alphablending with CUDA
https://www.codeproject.com/Articles/41977/Learn-How-To-Do-Alphablending-with-CUDA

TimoVJL

A good link :thumbsup:
Less reasons to download a huge nVidia SDK.
May the source be with you

LiaoMi

Hi,

I contacted the author of the program CudaPAD, with the result that there was an update for the Visual Studio 2019, now the program works on the latest version of both Visual Studio and cuda_10.1.168_425.25_win10, source code can be found here https://github.com/SunsetQuest/CudaPAD

Everything works ...

aw27

Thank you, LiaoMi. Very helpful  :thumbsup:

daydreamer

are there a way to get aviable VRAM I can use for memory allocating a big array? and compare to system ram,on older computer that have been upgraded with better nvidia or not it would probably run much faster
my none asm creations
https://masm32.com/board/index.php?topic=6937.msg74303#msg74303
I am an Invoker
"An Invoker is a mage who specializes in the manipulation of raw and elemental energies."
Like SIMD coding

Biterider

#52
Hi
Now that I have some free time, I've been working on CUDA for a bit. Really amazing.
I was able to integrate the code from LiaoMi into one of my 64-bit demos and it works like a charm.  :thumbsup:
I failed in 32 bits because I need to change the PTX code.  :sad:
So far I found the newest ISA documentation I need to read https://docs.nvidia.com/cuda/pdf/ptx_isa_6.4.pdf

For me, the question remains whether the PTX was the right choice or to use nvcc to create the code ...  :icon_idea:

Biterider

PS: link corrected - thanks to TimoVJL -

TimoVJL

May the source be with you

Biterider

Hi
I found a bunch of CUDA tutorials that go from 1 to 12. You're really good at getting a basic understanding of the GPU architecture, the features, the pitfalls. Each tutorial is accompanied with examples.  :thup:

Here the link to the first one https://www.youtube.com/watch?v=m0nhePeHwFs
The rest is easy to find.

Biterider

LiaoMi

Quote from: Biterider on June 19, 2019, 10:43:39 PM
Hi
I found a bunch of CUDA tutorials that go from 1 to 12. You're really good at getting a basic understanding of the GPU architecture, the features, the pitfalls. Each tutorial is accompanied with examples.  :thup:

Here the link to the first one https://www.youtube.com/watch?v=m0nhePeHwFs
The rest is easy to find.

Biterider

Hi Biterider,

thanks for the interesting video!


Quote from: Biterider on June 18, 2019, 06:35:05 PM
For me, the question remains whether the PTX was the right choice or to use nvcc to create the code ...  :icon_idea:

I watched standalone compilers and emulators for Cuda, they were not perfect, in addition, writing a Cuda translator is a very difficult task. If you try to write your own macros for ptx, this will require a serious study of the internal architecture of the GPU, Nvidia tried to create a special emulator that could help developers, and later they closed the project, since it was too complicated. For me personally, the best solution is the CudaPad application, with dynamic programming of C code.

From the documentation...
1.1.3. Purpose of NVCC
The compilation trajectory involves several splitting, compilation, preprocessing, and merging steps for each CUDA source file. It is the purpose of nvcc, the CUDA compiler driver, to hide the intricate details of CUDA compilation from developers. It accepts a range of conventional compiler options, such as for defining macros and include/library paths, and for steering the compilation process. All non-CUDA compilation steps are forwarded to a C++ host compiler that is supported by nvcc, and nvcc translates its options to appropriate host compiler command line options.

1.2. Supported Host Compilers
A general purpose C++ host compiler is needed by nvcc in the following situations:
During non-CUDA phases (except the run phase), because these phases will be forwarded by nvcc to this compiler.
During CUDA phases, for several preprocessing stages and host code compilation (see also The CUDA Compilation Trajectory).
nvcc assumes that the host compiler is installed with the standard method designed by the compiler provider. If the host compiler installation is non-standard, the user must make sure that the environment is set appropriately and use relevant nvcc compile options.

Figure 1. CUDA Compilation Trajectory

daydreamer

thanks Biterider
I downloaded NVASM many years ago,it was only for the oldest 128bit hardware version of pixelshaders
I tried out nvidias' Cg+ toolkit years ago,similar to C,but often 4 floats simultanously like SSE
also newer C++ you can include code <vector>
is that what you should code in C++ with vectors and compile to different cpus or CUDA and benchmark same code on different hardware?CUDA probably supports vectors in C++???

Biterider,LiaoMi or anyone else,have you tested CUDA with benchmark fibonnacci,primes,PI code?
my none asm creations
https://masm32.com/board/index.php?topic=6937.msg74303#msg74303
I am an Invoker
"An Invoker is a mage who specializes in the manipulation of raw and elemental energies."
Like SIMD coding