Author Topic: Using the GPU  (Read 3608 times)

AW

  • Member
  • *****
  • Posts: 2431
  • Let's Make ASM Great Again!
Re: Using the GPU
« Reply #45 on: May 18, 2019, 05:11:08 AM »
This is a simplified version of the log2 example produced by LiaoMi, using three APIs:
- Runtime API
- Driver API
- Driver API in MASM

Common C/C++

Code: [Select]
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>


#define N 32

extern "C" __global__ void log_2(float* inout, int length) {
    int xind = blockIdx.x * blockDim.x + threadIdx.x;
    if (xind < length) {
        inout[xind] = log2f(inout[xind]);
    }
}

//#define RUNTIMEAPI 1
//#define DRIVERAPI 1
#define MASMAPI 1

extern "C" int logasm2();
extern "C" char PTXFunction;
extern "C" char PTXSourceData;

#if defined(RUNTIMEAPI)
void testruntimeAPi()
{
    float hinOut[N];
    float* dinOut;
    cudaMalloc((void**)&dinOut, N * sizeof(float));

    for (int i = 1; i <= N; ++i) {
        hinOut[i - 1] = (float)i;
    }
    cudaMemcpy(dinOut, hinOut, N * sizeof(float), cudaMemcpyHostToDevice);
    log_2<<<N, 1>>>(dinOut, N);   // N blocks of 1 thread each
    cudaMemcpy(hinOut, dinOut, N * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; ++i) {
        printf("%d %f\n", i + 1, hinOut[i]);
    }
    cudaFree(dinOut);
    getchar();
}
#endif

#if defined(DRIVERAPI)
int testDriverApi()
{
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction cudaFunction;
    CUresult result;
    CUdeviceptr dinOut;
    int count;

    float hinOut[N];
    unsigned int memSize = sizeof(float) * N;

    if (cuInit(0))
        return 1;
    if (cuDeviceGet(&cuDevice, 0))
        return 1;
    if (cuCtxCreate(&cuContext, 0, cuDevice))
        return 1;
    if (cuModuleLoadData(&cuModule, &PTXSourceData))
        return 1;
    if (cuModuleGetFunction(&cudaFunction, cuModule, &PTXFunction))
        return 1;
    for (int i = 1; i <= N; ++i) {
        hinOut[i - 1] = (float)i;
    }
    cuMemAlloc(&dinOut, memSize);
    cuMemcpyHtoD(dinOut, hinOut, memSize);
    count = N;
    void* args[] = { &dinOut, &count };   /* kernelParams: pointers to each argument */

    /* grid = (N,1,1), block = (1,1,1), no shared memory, default stream */
    result = cuLaunchKernel(cudaFunction, N, 1, 1, 1, 1, 1, 0, 0, &args[0], 0);
    if (result != CUDA_SUCCESS)
        return 1;
    cuCtxSynchronize();
    cuMemcpyDtoH(hinOut, dinOut, memSize);
    for (int i = 0; i < N; ++i) {
        printf("%d %f\n", i + 1, hinOut[i]);
    }
    cuMemFree(dinOut);

    getchar();
    return 0;
}
#endif

#if defined(MASMAPI)
void testMasmAPI()
{
    int err = logasm2();
    printf("Error %d (0=NO ERROR)", err);
    getchar();
}
#endif

int main()
{
#if defined(RUNTIMEAPI)
    testruntimeAPi();
#elif defined(DRIVERAPI)
    testDriverApi();
#elif defined(MASMAPI)
    testMasmAPI();
#endif
    return 0;
}



MASM:

Code: [Select]
OPTION casemap:none

N equ 32
public PTXFunction
public PTXSourceData

printf proto :ptr, :vararg

cuInit proto :dword
cuDeviceGet proto :ptr, :dword
cuCtxCreate_v2 proto :ptr, :dword, :dword
cuModuleLoadData proto :ptr, :ptr
cuModuleGetFunction proto :ptr, :ptr, :ptr
cuMemAlloc_v2 proto :ptr, :qword
cuMemcpyHtoD_v2 proto :ptr, :ptr, :qword
cuMemcpyDtoH_v2 proto :ptr, :ptr, :qword
cuLaunchKernel proto :ptr, :dword, :dword, :dword, :dword, :dword, :dword, :ptr, :ptr, :ptr
cuCtxSynchronize proto
cuMemFree_v2 proto :ptr
cuCtxGetApiVersion proto :ptr, :ptr

.code

logasm2 proc uses rbx rdi           ; rbx/rdi are callee-saved in the Win64 ABI
LOCAL cuDevice:dword
LOCAL cuContext:ptr
LOCAL cuModule:ptr
LOCAL cudaFunction:ptr
LOCAL dInOut:qword
LOCAL hInOut[N]:real4
LOCAL memSize:qword
LOCAL args[2]:ptr
LOCAL qt:dword

    sub rsp, 58h                    ; shadow space + stack args for cuLaunchKernel
    and rsp, -16                    ; align
    mov eax, sizeof REAL4 * N
    mov memSize, rax
    mov rcx, 0
    call cuInit
    cmp eax, 0
    jnz @exit

    lea rcx, cuDevice
    mov edx, 0
    call cuDeviceGet
    cmp eax, 0
    jnz @exit

    lea rcx, cuContext
    mov edx, 0
    mov r8d, cuDevice
    call cuCtxCreate_v2
    cmp eax, 0
    jnz @exit

    lea rcx, cuModule
    mov rdx, offset PTXSourceData
    call cuModuleLoadData
    cmp eax, 0
    jnz @exit

    lea rcx, cudaFunction
    mov rdx, cuModule
    lea r8, PTXFunction
    call cuModuleGetFunction
    cmp eax, 0
    jnz @exit

    mov ecx, N                      ; fill hInOut with 1.0 .. N
    lea r11, hInOut
    fld1
    fld1
@@:
    fst dword ptr [r11]
    fadd st, st(1)
    add r11, 4
    dec ecx
    jnz @B

    lea rcx, dInOut
    mov rdx, memSize
    call cuMemAlloc_v2
    cmp eax, 0
    jnz @exit

    mov rcx, dInOut
    lea rdx, hInOut
    mov r8, memSize
    call cuMemcpyHtoD_v2
    cmp eax, 0
    jnz @exit

    lea rax, dInOut                 ; kernelParams: array of pointers to each argument
    mov qword ptr args, rax
    mov qt, N
    lea rax, qt
    mov qword ptr args+8, rax

    mov rcx, cudaFunction
    mov edx, 1                      ; gridDimX (the C driver version uses N here)
    mov r8d, 1                      ; gridDimY
    mov r9d, 1                      ; gridDimZ
    mov dword ptr [rsp+20h], N      ; blockDimX: one block of N threads
    mov dword ptr [rsp+28h], 1      ; blockDimY
    mov dword ptr [rsp+30h], 1      ; blockDimZ
    mov dword ptr [rsp+38h], 0      ; sharedMemBytes
    mov qword ptr [rsp+40h], 0      ; hStream
    lea rax, args
    mov qword ptr [rsp+48h], rax    ; kernelParams
    mov qword ptr [rsp+50h], 0      ; extra
    call cuLaunchKernel
    cmp eax, 0
    jnz @exit

    call cuCtxSynchronize
    cmp eax, 0
    jnz @exit

    lea rcx, hInOut
    mov rdx, dInOut
    mov r8, memSize
    call cuMemcpyDtoH_v2
    cmp eax, 0
    jnz @exit

    mov ebx, 1
    lea rdi, hInOut
@@:
    mov rcx, offset report
    mov edx, ebx
    fld dword ptr [rdi]             ; widen float to double for printf varargs
    fstp qword ptr [rsp+20h]
    mov r8, qword ptr [rsp+20h]
    call printf
    add rdi, 4
    inc ebx
    cmp ebx, N
    jle @B

    mov rcx, dInOut
    call cuMemFree_v2
    mov eax, 0
@exit:
    ret
logasm2 endp

.data
report db "%d %f",10,0

; Generated by NVIDIA NVVM Compiler
; Compiler Build ID: CL-25769353
; Cuda compilation tools, release 10.1, V10.1.105
; Based on LLVM 3.4svn
PTXFunction db 'log_2',0
PTXSourceData \
db '.version 6.4 ',10
db '.target sm_30 ',10
db '.address_size 64 ',10
db ' ',10
db ' // .globl log_2 ',10
db ' ',10
db '.visible .entry log_2( ',10
db ' .param .u64 log_2_param_0, ',10
db ' .param .u32 log_2_param_1 ',10
db ') ',10
db '{ ',10
db ' .reg .pred %p<2>; ',10
db ' .reg .f32 %f<3>; ',10
db ' .reg .b32 %r<6>; ',10
db ' .reg .b64 %rd<5>; ',10
db ' ',10
db ' ',10
db ' ld.param.u64 %rd1, [log_2_param_0]; ',10
db ' ld.param.u32 %r2, [log_2_param_1]; ',10
db ' mov.u32 %r3, %ctaid.x; ',10
db ' mov.u32 %r4, %ntid.x; ',10
db ' mov.u32 %r5, %tid.x; ',10
db ' mad.lo.s32 %r1, %r4, %r3, %r5; ',10
db ' setp.ge.s32 %p1, %r1, %r2; ',10
db ' @%p1 bra BB0_2; ',10
db ' ',10
db ' cvta.to.global.u64 %rd2, %rd1; ',10
db ' mul.wide.s32 %rd3, %r1, 4; ',10
db ' add.s64 %rd4, %rd2, %rd3; ',10
db ' ld.global.f32 %f1, [%rd4]; ',10
db ' lg2.approx.ftz.f32 %f2, %f1; ',10
db ' st.global.f32 [%rd4], %f2; ',10
db ' ',10
db 'BB0_2: ',10
db ' ret; ',10
db '} ',10
db 0
end


Output:
1 0.000000
2 1.000000
3 1.584962
4 2.000000
5 2.321928
6 2.584962
7 2.807355
8 3.000000
9 3.169925
10 3.321928
11 3.459432
12 3.584962
13 3.700440
14 3.807355
15 3.906890
16 4.000000
17 4.087463
18 4.169925
19 4.247927
20 4.321928
21 4.392317
22 4.459432
23 4.523562
24 4.584962
25 4.643856
26 4.700439
27 4.754887
28 4.807355
29 4.857981
30 4.906890
31 4.954196
32 5.000000
Error 0 (0=NO ERROR)

TimoVJL

  • Member
  • ***
  • Posts: 476
Re: Using the GPU
« Reply #46 on: May 24, 2019, 07:50:02 PM »
The clang backend supports CUDA :thumbsup:
So for textual .ptx output, only ptxas / fatbinary stubs are needed.
A minimal ci_include.h:
Code: [Select]
typedef struct _uint3
{
    unsigned int x, y, z;
}uint3;

typedef struct _dim3
{
    unsigned int x, y, z;
}dim3;

extern const uint3 threadIdx;
extern const uint3 blockIdx;
extern const dim3 blockDim;
extern const dim3 gridDim;
extern const int warpSize;

#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))

#define bool _Bool
A fake ptxas that just copies the temporary .s file:
Code: [Select]
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <string.h>

int __cdecl main(int argc, char **argv)
{
    char fname[260];
    char *ptx = argv[argc-1];       // last argument: the temporary .s file
    while (*ptx) ptx++;             // find end of the path
    char *pend = ptx;
    while (*pend != '-') pend--;    // back up to the '-' before the suffix
    ptx = pend;
    while (*ptx != '\\') ptx--;     // back up to the last backslash
    ptx++;
    strcpy(fname, ptx);
    char *pext = fname + (pend - ptx);
    *(long*)pext = *(long*)".ptx";  // overwrite suffix with ".ptx"
    *(pext + 4) = 0;                // terminate the name
    return !CopyFile(argv[argc-1], fname, FALSE);
}
A command line used in tests:
Code: [Select]
clang.exe -c add.cu -nocudainc -nocudalib --cuda-gpu-arch=sm_30
May the source be with you

LiaoMi

  • Member
  • ****
  • Posts: 593
Re: Using the GPU
« Reply #47 on: May 28, 2019, 10:46:49 PM »
clang backend supports CUDA :thumbsup:

Hi TimoVJL,

I need to try that. There is a web compiler for CUDA and CUDA LLVM:
https://cuda.godbolt.org/
and the ordinary compiler:
https://godbolt.org/

CUDA Compression
A GPU-based LZSS compression algorithm, highly tuned for NVIDIA GPGPUs and for streaming data, leveraging the respective strengths of CPUs and GPUs together - https://github.com/adnanozsoy/CUDA_Compression
Cuda lzss compression https://github.com/abshkbh/cuda-lzss
Algorithms for Compression on GPUs http://www2.imm.dtu.dk/pubdb/views/edoc_download.php/6642/pdf/imm6642.pdf
Compression library using Nvidia's CUDA - https://stackoverflow.com/questions/456829/compression-library-using-nvidias-cuda
Parallel lossless compression using GPUs http://on-demand.gputechconf.com/gtc/2014/presentations/S4459-parallel-lossless-compression-using-gpus.pdf
Breakthrough in CUDA data compression https://www.wave-access.com/public_en/blog/2011/april/22/breakthrough-in-cuda-data-compression.aspx

Code: [Select]
The results:

BMP, 540 Kb
Full bzip2 compression on CPU - 218 ms
BW Transform on CPU - 171 ms
Full bzip2 compression on GPU - 93 ms [ minus 53% ]
BW Transform on GPU - 46 ms [ minus 73% ]

BMP, 1112 Kb
Full bzip2 compression on CPU - 467 ms
BW Transform on CPU - 343 ms
Full bzip2 compression on GPU - 249 ms [ minus 46% ]
BW Transform on GPU - 140 ms [ minus 59% ]

PDF, 1919 Kb
Full bzip2 compression on CPU - 1513 ms
BW Transform on CPU - 731 ms
Full bzip2 compression on GPU - 1107 ms [ minus 26% ]
BW Transform on GPU - 311 ms [ minus 57% ]

PDF, 3425 Kb
Full bzip2 compression on CPU - 2168 ms
BW Transform on CPU - 793 ms
Full bzip2 compression on GPU - 1856 ms [ minus 14% ]
BW Transform on GPU - 481 ms [ minus 39% ]



Learn How To Do Alphablending with CUDA
https://www.codeproject.com/Articles/41977/Learn-How-To-Do-Alphablending-with-CUDA

TimoVJL

  • Member
  • ***
  • Posts: 476
Re: Using the GPU
« Reply #48 on: May 29, 2019, 12:49:08 AM »
A good link :thumbsup:
Fewer reasons to download the huge nVidia SDK.
May the source be with you

LiaoMi

  • Member
  • ****
  • Posts: 593
Re: Using the GPU
« Reply #49 on: June 03, 2019, 09:41:45 PM »
Hi,

I contacted the author of CudaPAD, and as a result there is now an update for Visual Studio 2019. The program works with the latest versions of both Visual Studio and cuda_10.1.168_425.25_win10. The source code can be found here: https://github.com/SunsetQuest/CudaPAD

Everything works ...

AW

  • Member
  • *****
  • Posts: 2431
  • Let's Make ASM Great Again!
Re: Using the GPU
« Reply #50 on: June 04, 2019, 04:01:03 AM »
Thank you, LiaoMi. Very helpful  :thumbsup:

daydreamer

  • Member
  • ****
  • Posts: 939
  • watch Chebyshev on the backside of the Moon
Re: Using the GPU
« Reply #51 on: June 13, 2019, 04:25:45 AM »
Is there a way to get the available VRAM I can use for allocating a big array, and compare it to system RAM? On an older computer that has been upgraded with a better Nvidia card, it would probably run much faster.
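For what it's worth — a sketch only, untested here — the CUDA driver API has cuMemGetInfo for exactly this query; it needs a current context, as set up in AW's example above:

```c
#include <cuda.h>
#include <stdio.h>

/* Sketch: report free/total device memory.
   Assumes cuInit/cuCtxCreate have already succeeded. */
int printVram(void)
{
    size_t freeBytes = 0, totalBytes = 0;
    if (cuMemGetInfo(&freeBytes, &totalBytes) != CUDA_SUCCESS)
        return 1;
    printf("VRAM: %zu MB free of %zu MB\n",
           freeBytes >> 20, totalBytes >> 20);
    return 0;
}
```

The runtime API equivalent is cudaMemGetInfo; system RAM could be read with GlobalMemoryStatusEx for the comparison.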
Quote from Flashdance
Nick  :  When you give up your dream, you die
*wears a flameproof asbestos suit*
Gone serverside programming p:  :D

Biterider

  • Member
  • ***
  • Posts: 391
  • ObjAsm32 + ObjAsm64 = ObjAsm
    • ObjAsm
Re: Using the GPU
« Reply #52 on: June 18, 2019, 06:35:05 PM »
Hi
Now that I have some free time, I've been working on CUDA for a bit. Really amazing.
I was able to integrate the code from LiaoMi into one of my 64-bit demos and it works like a charm.  :thumbsup:
I failed in 32-bit because I would need to change the PTX code.  :sad:
So far I have found the newest ISA documentation, which I need to read: https://docs.nvidia.com/cuda/pdf/ptx_isa_6.4.pdf

For me, the question remains whether PTX was the right choice, or whether to use nvcc to create the code ...  :icon_idea:

Biterider

PS: link corrected - thanks to TimoVJL -
« Last Edit: June 18, 2019, 11:10:03 PM by Biterider »

TimoVJL

  • Member
  • ***
  • Posts: 476
Re: Using the GPU
« Reply #53 on: June 18, 2019, 07:56:02 PM »
May the source be with you

Biterider

  • Member
  • ***
  • Posts: 391
  • ObjAsm32 + ObjAsm64 = ObjAsm
    • ObjAsm
Re: Using the GPU
« Reply #54 on: June 19, 2019, 10:43:39 PM »
Hi
I found a bunch of CUDA tutorials that go from 1 to 12. They're really good for getting a basic understanding of the GPU architecture, its features and its pitfalls. Each tutorial is accompanied by examples.  :thup:

Here is the link to the first one: https://www.youtube.com/watch?v=m0nhePeHwFs
The rest is easy to find.

Biterider

LiaoMi

  • Member
  • ****
  • Posts: 593
Re: Using the GPU
« Reply #55 on: June 20, 2019, 07:32:38 PM »
Hi
I found a bunch of CUDA tutorials that go from 1 to 12. They're really good for getting a basic understanding of the GPU architecture, its features and its pitfalls. Each tutorial is accompanied by examples.  :thup:

Here is the link to the first one: https://www.youtube.com/watch?v=m0nhePeHwFs
The rest is easy to find.

Biterider

Hi Biterider,

thanks for the interesting video!


For me, the question remains whether PTX was the right choice, or whether to use nvcc to create the code ...  :icon_idea:

I looked at standalone compilers and emulators for CUDA; they were not perfect, and writing a CUDA translator is a very difficult task. Writing your own macros for PTX would require a serious study of the internal architecture of the GPU. Nvidia tried to create a special emulator that could help developers, but they later closed the project because it was too complicated. For me personally, the best solution is the CudaPAD application, with its interactive compilation of C code.

From the documentation...
1.1.3. Purpose of NVCC
The compilation trajectory involves several splitting, compilation, preprocessing, and merging steps for each CUDA source file. It is the purpose of nvcc, the CUDA compiler driver, to hide the intricate details of CUDA compilation from developers. It accepts a range of conventional compiler options, such as for defining macros and include/library paths, and for steering the compilation process. All non-CUDA compilation steps are forwarded to a C++ host compiler that is supported by nvcc, and nvcc translates its options to appropriate host compiler command line options.

1.2. Supported Host Compilers
A general purpose C++ host compiler is needed by nvcc in the following situations:
During non-CUDA phases (except the run phase), because these phases will be forwarded by nvcc to this compiler.
During CUDA phases, for several preprocessing stages and host code compilation (see also The CUDA Compilation Trajectory).
nvcc assumes that the host compiler is installed with the standard method designed by the compiler provider. If the host compiler installation is non-standard, the user must make sure that the environment is set appropriately and use relevant nvcc compile options.

Figure 1. CUDA Compilation Trajectory
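As a rough sketch of how that trajectory is usually driven from the command line (standard nvcc options; the file names are just examples):

```shell
# Emit textual PTX only, instead of a full fat binary:
nvcc -ptx log2.cu -o log2.ptx
# Keep all intermediate files (preprocessed source, .ptx, .cubin) for inspection:
nvcc --keep -c log2.cu
```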

daydreamer

  • Member
  • ****
  • Posts: 939
  • watch Chebyshev on the backside of the Moon
Re: Using the GPU
« Reply #56 on: June 20, 2019, 09:15:45 PM »
Thanks, Biterider.
I downloaded NVASM many years ago; it was only for the oldest 128-bit hardware version of pixel shaders.
I tried out Nvidia's Cg toolkit years ago; it's similar to C, but often works on 4 floats simultaneously, like SSE.
Also, in newer C++ you can include <vector>.
So should you code in C++ with vectors and compile for different CPUs, or use CUDA, and benchmark the same code on different hardware? CUDA probably supports vectors in C++?

Biterider, LiaoMi, or anyone else: have you tested CUDA with Fibonacci, primes, or PI benchmark code?
Quote from Flashdance
Nick  :  When you give up your dream, you die
*wears a flameproof asbestos suit*
Gone serverside programming p:  :D