Author Topic: Using the GPU  (Read 1767 times)

LiaoMi

  • Member
  • ****
  • Posts: 513
Re: Using the GPU
« Reply #15 on: May 14, 2019, 12:45:26 AM »
The latest version for testing gets this far -
Code: [Select]
lea rbx, [_ cuCtxSynchronize]
call cuCtxSynchronize

Code: [Select]
---------------------------
Error
---------------------------
cuCtxSynchronize   
CUDA_ERROR_LAUNCH_FAILED                 
---------------------------
OK   
---------------------------

Converting the header files turned out not to be so easy: there are places where API variables end up typed as DWORD. Additional adjustment of the translator templates, plus checking against the API, is strongly required  :icon_exclaim:

Code: [Select]
#if defined(_WIN64) || defined(__LP64__)
typedef unsigned long long CUdeviceptr;
#else
typedef unsigned int CUdeviceptr;
#endif

versus the translator's (incorrect) output:

Code: [Select]
IF DEFINED(_WIN64) OR DEFINED(__LP64__)
long TYPEDEF DWORD
ELSE
CUdeviceptr TYPEDEF DWORD
ENDIF
« Last Edit: May 17, 2019, 03:18:21 AM by LiaoMi »

Biterider

  • Member
  • ***
  • Posts: 349
  • ObjAsm32 + ObjAsm64 = ObjAsm
    • ObjAsm
Re: Using the GPU
« Reply #16 on: May 14, 2019, 12:51:12 AM »
Hi LiaoMi
I found the same issue while translating the header files. In the next few days, I'll make some additions to h2incX to better convert these files.
Biterider

AW

  • Member
  • *****
  • Posts: 2103
  • Let's Make ASM Great Again!
Re: Using the GPU
« Reply #17 on: May 15, 2019, 09:19:02 PM »
Quote from: LiaoMi on May 14, 2019, 12:45:26 AM
The latest version for testing gets this far -
Code: [Select]
lea rbx, [_ cuCtxSynchronize]
call cuCtxSynchronize

Code: [Select]
---------------------------
Error
---------------------------
cuCtxSynchronize   
CUDA_ERROR_LAUNCH_FAILED                 
---------------------------
OK   
---------------------------

I produced a very simple example and ran into the same issue  :( . It took me a few hours to realize what the problem might be. I fixed it by calling the v2 versions.  :t
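For anyone hitting the same CUDA_ERROR_LAUNCH_FAILED: every driver-API call returns a CUresult, and since CUDA 6.0 cuGetErrorName/cuGetErrorString can turn it into readable text. A minimal C sketch of that per-call checking, assuming cuda.lib is linked (the CHECK macro is illustrative, not part of the example below):

Code: [Select]
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

/* Abort with a readable message if a driver API call fails. */
#define CHECK(call) do {                                          \
        CUresult rc_ = (call);                                    \
        if (rc_ != CUDA_SUCCESS) {                                \
            const char *name = NULL, *desc = NULL;                \
            cuGetErrorName(rc_, &name);                           \
            cuGetErrorString(rc_, &desc);                         \
            fprintf(stderr, "%s failed: %s (%s)\n", #call,        \
                    name ? name : "?", desc ? desc : "?");        \
            exit(1);                                              \
        }                                                         \
    } while (0)

int main(void)
{
    CUdevice dev;
    CUcontext ctx;

    CHECK(cuInit(0));
    CHECK(cuDeviceGet(&dev, 0));
    CHECK(cuCtxCreate(&ctx, 0, dev));  /* cuda.h resolves this to cuCtxCreate_v2 */
    CHECK(cuCtxSynchronize());
    CHECK(cuCtxDestroy(ctx));
    return 0;
}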

Here is the example: it doubles each element of a 1000-element array. It launches 1024 threads (24 of which do nothing); each thread handles a single element.

It may need to be modified to run on other targets; it was built for sm_30.

Code: [Select]
OPTION casemap:none

N equ 1000

includelib kernel32.lib
ExitProcess proto :dword
includelib msvcrt.lib
printf proto :ptr, :vararg
_getch proto
includelib cuda.lib
cuInit proto :dword
cuDeviceGet proto :ptr, :dword
cuCtxCreate_v2 proto :ptr, :dword, :dword
cuModuleLoadData proto :ptr, :ptr
cuModuleGetFunction proto :ptr, :ptr, :ptr
cuMemAlloc_v2 proto :ptr, :qword
cuMemcpyHtoD_v2 proto :ptr, :ptr, :qword
cuMemcpyDtoH_v2 proto :ptr, :ptr, :qword
cuLaunchKernel proto :ptr, :dword, :dword, :dword, :dword, :dword, :dword, :ptr, :ptr, :ptr
cuCtxSynchronize proto
cuMemFree_v2 proto :ptr

.code

mulby2 proc
LOCAL cuDevice:dword
LOCAL cuContext:ptr
LOCAL cuModule:ptr
LOCAL cudaFunction:ptr
LOCAL _da: qword
LOCAL _db: qword
LOCAL ha[N]:dword
LOCAL hb[N]:dword
LOCAL memSize : qword
LOCAL args[2] : ptr

sub rsp, 58h
and rsp, -16 ; align
mov eax, sizeof DWORD * N
mov memSize, rax
mov rcx,0
call cuInit
cmp eax, 0
jnz @exit

lea rcx, cuDevice
mov edx,0
call cuDeviceGet
cmp eax, 0
jnz @exit

lea rcx, cuContext
mov edx, 0
mov r8d, cuDevice
call cuCtxCreate_v2
cmp eax, 0
jnz @exit

lea rcx, cuModule
mov rdx, offset PTXSourceData
call cuModuleLoadData
cmp eax, 0
jnz @exit

lea rcx, cudaFunction
mov rdx, cuModule
lea r8, PTXFunction
call cuModuleGetFunction
cmp eax, 0
jnz @exit

mov ecx, N
mov r10d,0
lea r11, ha
@@:
mov dword ptr [r11], r10d
add r11, 4
inc r10d
dec ecx
jnz @B

lea rcx, _da
mov rdx, memSize
call cuMemAlloc_v2
cmp eax, 0
jnz @exit
lea rcx, _db
mov rdx, memSize
call cuMemAlloc_v2
cmp eax, 0
jnz @exit

mov rcx, _da
lea rdx, ha
mov r8, memSize
call cuMemcpyHtoD_v2
cmp eax, 0
jnz @exit

lea rax, _da
mov qword ptr args, rax
lea rax, _db
mov qword ptr args+8, rax

mov rcx, cudaFunction
mov edx ,1
mov r8d, 1
mov r9d, 1
mov dword ptr [rsp+20h], 1024
mov dword ptr [rsp+28h], 1
mov dword ptr [rsp+30h], 1
mov dword ptr [rsp+38h], 0
mov qword ptr [rsp+40h], 0
lea rax, args
mov qword ptr [rsp+48h], rax
mov qword ptr [rsp+50h], 0
call cuLaunchKernel
cmp eax, 0
jnz @exit

call cuCtxSynchronize
cmp eax, 0
jnz @exit

lea rcx, hb
mov rdx, _db
mov r8, memSize
call cuMemcpyDtoH_v2
cmp eax, 0
jnz @exit

mov ebx, 0
lea rdi, hb
@@:
mov rcx, offset report
mov edx, dword ptr [rdi]
call printf
add rdi, 4
inc ebx
cmp ebx, N
jl @B

mov rcx, _da
call cuMemFree_v2
mov rcx, _db
call cuMemFree_v2
mov eax,0
@exit:
mov ecx,0
call _getch
call ExitProcess
mulby2 endp

.data
report db "%d",10,0

; Generated by NVIDIA NVVM Compiler
; Compiler Build ID: CL-25769353
; Cuda compilation tools, release 10.1, V10.1.105
; Based on LLVM 3.4svn

PTXFunction db '_Z3addPiS_',0
PTXSourceData \
db '.version 6.4 ',10
db '.target sm_30 ',10
db '.address_size 64 ',10
db ' ',10
db ' // .globl _Z3addPiS_ ',10
db ' ',10
db '.visible .entry _Z3addPiS_( ',10
db ' .param .u64 _Z3addPiS__param_0, ',10
db ' .param .u64 _Z3addPiS__param_1 ',10
db ') ',10
db '{ ',10
db ' .reg .pred %p<2>; ',10
db ' .reg .b32 %r<4>; ',10
db ' .reg .b64 %rd<8>; ',10
db ' ',10
db ' ',10
db ' ld.param.u64 %rd1, [_Z3addPiS__param_0]; ',10
db ' ld.param.u64 %rd2, [_Z3addPiS__param_1]; ',10
db ' mov.u32 %r1, %tid.x; ',10
db ' setp.gt.s32 %p1, %r1, 999; ',10
db ' @%p1 bra BB0_2; ',10
db ' ',10
db ' cvta.to.global.u64 %rd3, %rd1; ',10
db ' mul.wide.s32 %rd4, %r1, 4; ',10
db ' add.s64 %rd5, %rd3, %rd4; ',10
db ' ld.global.u32 %r2, [%rd5]; ',10
db ' shl.b32 %r3, %r2, 1; ',10
db ' cvta.to.global.u64 %rd6, %rd2; ',10
db ' add.s64 %rd7, %rd6, %rd4; ',10
db ' st.global.u32 [%rd7], %r3; ',10
db ' ',10
db 'BB0_2: ',10
db ' ret; ',10
db '} ',10
db  0

end

The PTX was obtained from this:

Code: [Select]
#define N 1000

__global__ void add(int* a, int* b) {
    int i = threadIdx.x;
    if (i < N) {
        b[i] = 2 * a[i];
    }
}
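Presumably the PTX above came from a command along these lines (the file name add.cu is just an assumption):

Code: [Select]
nvcc -ptx -arch=sm_30 add.cu -o add.ptx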

To build:
SET MASM64LIB=\masm32\lib64
\masm32\bin\ml64  -c -Zp8 c64.asm
\masm32\bin\link /ENTRY:mulby2 /SUBSYSTEM:console /LIBPATH:%MASM64LIB% /FIXED /MACHINE:X64 c64.obj
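For anyone mapping the stack writes in the listing (rsp+20h through rsp+50h) to the API: cuLaunchKernel takes eleven parameters, the first four passed in RCX/RDX/R8/R9 and the remaining seven on the stack under the Win64 convention. The driver API prototype for reference:

Code: [Select]
CUresult cuLaunchKernel(CUfunction f,                 /* rcx              */
                        unsigned int gridDimX,        /* edx  = 1         */
                        unsigned int gridDimY,        /* r8d  = 1         */
                        unsigned int gridDimZ,        /* r9d  = 1         */
                        unsigned int blockDimX,       /* [rsp+20h] = 1024 */
                        unsigned int blockDimY,       /* [rsp+28h] = 1    */
                        unsigned int blockDimZ,       /* [rsp+30h] = 1    */
                        unsigned int sharedMemBytes,  /* [rsp+38h] = 0    */
                        CUstream hStream,             /* [rsp+40h] = 0    */
                        void **kernelParams,          /* [rsp+48h] = args */
                        void **extra);                /* [rsp+50h] = 0    */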

LiaoMi

  • Member
  • ****
  • Posts: 513
Re: Using the GPU
« Reply #18 on: May 15, 2019, 10:41:01 PM »
I need some kind of manual to get to grips with the architecture and the API. The full documentation in PDF - the API, the inline functions and more - can be downloaded here: https://docs.nvidia.com/cuda/

Quote from: Biterider on May 14, 2019, 12:51:12 AM
Hi LiaoMi
I found the same issue while translating the header files. In the next few days, I'll make some additions to h2incX to better convert these files.
Biterider

Hi Biterider,

In my own conversion I fixed it with the following steps:
1. Make a copy of all files for backup.
2. Replace all the types described at https://en.cppreference.com/w/cpp/language/types with their __int64-style forms (unsigned long long = __int64, unsigned long = long) - everything in the table; there are many of them.
3. Define the type in the converter as CUdeviceptr = CUdeviceptr, so it passes through untranslated.
4. Delete all places in the original headers where CUdeviceptr = DWORD.
5. Add the correct macro:
Code: [Select]
IF DEFINED(_WIN64) OR DEFINED(__LP64__)
CUdeviceptr TYPEDEF QWORD
ELSE
CUdeviceptr TYPEDEF DWORD
ENDIF
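The QWORD is the crucial part: on a 64-bit build CUdeviceptr is 8 bytes wide, so a DWORD typedef silently truncates every device pointer passed to cuMemAlloc/cuMemcpy*. A quick C check, needing nothing beyond the header:

Code: [Select]
#include <cuda.h>
#include <stdio.h>

int main(void)
{
    /* Prints 8 on _WIN64/__LP64__ builds and 4 on 32-bit ones,
       matching the #if in the original header. */
    printf("sizeof(CUdeviceptr) = %u\n", (unsigned)sizeof(CUdeviceptr));
    return 0;
}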

There may also be other bottlenecks that I did not see. Compiling with my converted file works fine. So far I have done only one file, cuda.inc; the file in the post above was already corrected a couple of days ago.

Quote from: AW on May 15, 2019, 09:19:02 PM
I produced a very simple example and ran into the same issue :( . It took me a few hours to realize what the problem might be. I fixed it by calling the v2 versions. :t
[full example snipped - see Reply #17 above]

Hi AW,

Great job!  :eusa_clap: :t This function tortured me so much that I started to suspect errors in my own code. I will try changing the function and I will try your example! Thank you!

What is the difference between these functions?
Code: [Select]
The new CUDA "v2" API has been supported since v4.0 of the CUDA toolkit (i.e. for quite a while; we are now on 7)..... The old API is deprecated by NVidia, and does not support certain useful functionality such as batched matrix-multiply.

From my understanding, the new CUDA "v2" API is a functional extension of the whole API brought to a new level. Therefore the parameters may differ, which I did not take into account. Some functions are preserved to maintain compatibility, and some exist only in the old form. All these parameters should be described per toolkit version.  ::)
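Concretely, cuda.h remaps the unversioned names with the preprocessor once __CUDA_API_VERSION reaches 3020. The cuCtxCreate line below is quoted from the header later in this thread; the other mappings are my assumption that the same pattern covers the functions used in the example:

Code: [Select]
#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 3020
    #define cuCtxCreate    cuCtxCreate_v2
    #define cuMemAlloc     cuMemAlloc_v2
    #define cuMemcpyHtoD   cuMemcpyHtoD_v2
    #define cuMemcpyDtoH   cuMemcpyDtoH_v2
    #define cuMemFree      cuMemFree_v2
#endif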

All current documentation in one archive - CUDA Toolkit v10.1.168_pdf.zip (100 MB): https://www.solidfiles.com/v/5aYWVB7nx3Qq5
« Last Edit: May 16, 2019, 02:46:45 AM by LiaoMi »

jj2007

  • Member
  • *****
  • Posts: 9518
  • Assembler is fun ;-)
    • MasmBasic
Re: Using the GPU
« Reply #19 on: May 15, 2019, 11:52:56 PM »
I've never seriously considered using CUDA because I thought it was useful for one brand only, Nvidia. Now I am confused by two elements:

1. It seems Nvidia has over 80% of the market
2. There are many manufacturers but they all produce Nvidia cards, apparently: https://graphicscardhub.com/graphics-card-manufacturers-brands/

So what is "Nvidia"? A manufacturer brand, a "graphics OS" of some sorts, a standard?

LiaoMi

  • Member
  • ****
  • Posts: 513
Re: Using the GPU
« Reply #20 on: May 16, 2019, 02:38:53 AM »
Quote from: jj2007 on May 15, 2019, 11:52:56 PM
I've never seriously considered using CUDA because I thought it was useful for one brand only, Nvidia. Now I am confused by two elements:

1. It seems Nvidia has over 80% of the market
2. There are many manufacturers but they all produce Nvidia cards, apparently: https://graphicscardhub.com/graphics-card-manufacturers-brands/

So what is "Nvidia"? A manufacturer brand, a "graphics OS" of some sorts, a standard?

Hi jj2007,

for me it is the market leader in the production of GPUs (graphics processing units).

Intel = CPU
NVIDIA = GPU. NVIDIA, inventor of the GPU, creates interactive graphics on laptops, workstations, mobile devices, notebooks, PCs, and more.

CUDA (initial release June 23, 2007) is a parallel computing platform and application programming interface (API) model created by Nvidia. It allows software developers and engineers to use a CUDA-enabled graphics processing unit (GPU) for general-purpose processing - an approach termed GPGPU (General-Purpose computing on Graphics Processing Units). The CUDA platform is a software layer that gives direct access to the GPU's virtual instruction set and parallel computational elements for the execution of compute kernels.

NVIDIA Major releases and acquisitions - The release of the RIVA TNT in 1998 solidified Nvidia's reputation for developing capable graphics adapters. In late 1999, Nvidia released the GeForce 256 (NV10), most notably introducing on-board transformation and lighting (T&L) to consumer-level 3D hardware. Running at 120 MHz and featuring four pixel pipelines, it implemented advanced video acceleration, motion compensation and hardware sub-picture alpha blending.

Unification is not required for GPU computation; that is probably one of the reasons there is no single computing standard across the graphics interfaces. GPU computing seems to me to be tied more to the programming language than to the technical features of parallelization.

TimoVJL

  • Member
  • ***
  • Posts: 328
Re: Using the GPU
« Reply #21 on: May 16, 2019, 03:17:55 AM »
OpenCL
nvidia OpenCL

GPU_Caps_Viewer

May the source be with you

AW

  • Member
  • *****
  • Posts: 2103
  • Let's Make ASM Great Again!
Re: Using the GPU
« Reply #22 on: May 16, 2019, 03:22:06 AM »
Hi LiaoMi,

I noticed that in the #include file they have
#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 3020
...
 #define cuCtxCreate                         cuCtxCreate_v2

So I ran cuCtxGetApiVersion and found that the version was 3020.

All explained, so easy when we know the answer.  :biggrin:
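For reference, a minimal C sketch of that check (cuCtxGetApiVersion is the real driver API call; error handling is omitted for brevity):

Code: [Select]
#include <cuda.h>
#include <stdio.h>

int main(void)
{
    CUdevice dev;
    CUcontext ctx;
    unsigned int apiVer = 0;

    cuInit(0);
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);         /* resolved to cuCtxCreate_v2 by cuda.h */
    cuCtxGetApiVersion(ctx, &apiVer);  /* reports 3020 here */
    printf("context API version: %u\n", apiVer);
    cuCtxDestroy(ctx);
    return 0;
}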

LiaoMi

  • Member
  • ****
  • Posts: 513
Re: Using the GPU
« Reply #23 on: May 16, 2019, 09:28:13 PM »
Another attractive example!  :P

Code: [Select]
Here is a demonstration of a Neural Network created in a CUDA GPU using FASM.

This program requires Win64 and an NVIDIA CUDA-enabled graphics card. The code is written specifically for an NVIDIA 620 96-core card with a 1.4 GHz clock rate. As the neural network size was chosen as a multiple of 96, the code may have to be modified for cards with more or fewer cores.

The first part of the code constructs a neural network in PC memory and in GPU memory. You can run it in either to compare speed. The second part of the code is the PTX (GPU pseudo-assembly) program thread that is loaded into the NVIDIA GPU as ASCII and run on successive 96-core 'blocks' (one thread per core) until the program has finished.

My timing results show this 96-core card running at a speed equivalent to a dual-core CPU at 3 GHz making full use of 128-bit SIMD.

To put this another way, if this program were run on the latest 3000-core NVIDIA card, it would run 5 times faster than on the latest 16-core AVX (256-bit SIMD) CPU available today.

I was a bit disappointed with the CUDA performance, CPU core against GPU core; it should have run up to 5 times faster. However, this is just a first attempt.

It does not work for me ...  :(
---------------------------
Error
---------------------------
error code:
hex: 0x000e28c0
dec: 927936
---------------------------
OK   
---------------------------

LiaoMi

  • Member
  • ****
  • Posts: 513
Re: Using the GPU
« Reply #24 on: May 16, 2019, 10:06:57 PM »
Quote from: AW on May 16, 2019, 03:22:06 AM
Hi LiaoMi,

I noticed that in the #include file they have
#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 3020
...
 #define cuCtxCreate                         cuCtxCreate_v2

So I ran cuCtxGetApiVersion and found that the version was 3020.

All explained, so easy when we know the answer.  :biggrin:

I need to find the error; something somewhere is wrong. I also get version 3020 from that function. I compared the original binary file and its API imports; only the old APIs are used there. In the example I am trying to translate (Cuda_UASM_v4.zip), all the APIs are similar to your version.

Code: [Select]
---------------------------
Error
---------------------------
cuCtxSynchronize   
CUDA_ERROR_LAUNCH_FAILED                 
---------------------------
OK   
---------------------------

I will try switching to the old API and once again check the function parameters.


Quote from: TimoVJL on May 16, 2019, 03:17:55 AM
OpenCL
nvidia OpenCL

GPU_Caps_Viewer

Hi TimoVJL,

if I understand correctly, the main point of OpenCL is to provide a common set of APIs for all GPU variants; it is almost like Qt for the world of graphics accelerators  :idea:
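As a sketch of that "common API" idea: assuming any vendor's OpenCL SDK is installed, the same few lines of standard OpenCL 1.x host C enumerate every implementation on the machine, whichever brand of GPU is behind it:

Code: [Select]
#include <CL/cl.h>
#include <stdio.h>

int main(void)
{
    cl_platform_id platforms[8];
    cl_uint count = 0;
    char name[256];

    /* One platform per installed implementation: NVIDIA, AMD, Intel, ... */
    clGetPlatformIDs(8, platforms, &count);
    for (cl_uint i = 0; i < count; i++) {
        clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME,
                          sizeof(name), name, NULL);
        printf("platform %u: %s\n", i, name);
    }
    return 0;
}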

TimoVJL

  • Member
  • ***
  • Posts: 328
Re: Using the GPU
« Reply #25 on: May 16, 2019, 10:46:50 PM »
Quote from: LiaoMi on May 16, 2019, 10:06:57 PM
OpenCL
nvidia OpenCL

GPU_Caps_Viewer

Hi TimoVJL,

if I understand correctly, the main point of OpenCL is to provide a common set of APIs for all GPU variants; it is almost like Qt for the world of graphics accelerators  :idea:
Even with Android smart phones ;)

CudaVsOpenCL
May the source be with you

HSE

  • Member
  • *****
  • Posts: 1029
  • <AMD>< 7-32>

AW

  • Member
  • *****
  • Posts: 2103
  • Let's Make ASM Great Again!
Re: Using the GPU
« Reply #27 on: May 17, 2019, 01:18:27 AM »
Cuda is also on Android with the most recent (NVidia) Tegra models.
And you can start developing right now; everything is organized to make the journey as pleasant as possible.  :t
Is OpenCL disappearing from the map?  :shock:



TimoVJL

  • Member
  • ***
  • Posts: 328
Re: Using the GPU
« Reply #28 on: May 17, 2019, 01:46:58 AM »
May the source be with you

AW

  • Member
  • *****
  • Posts: 2103
  • Let's Make ASM Great Again!
Re: Using the GPU
« Reply #29 on: May 17, 2019, 02:03:36 AM »