I am working with the mod % way of testing primes
Going SIMT would be advantage, because spread division to multiple cores,because of lack of many execution units for division
So I start with usual c optimize techniques
Stepping 2 between odd numbers and restrict max prime test to sqrt(number), breaks code so numbers ending with 5 falls thru. Test
Unroll loop with or without SIMD?
That's a hard problem to solve,because uncertainly # of loops,before stop when find its not a prime
If you start off with the assumption that SSE is the technique to use, you are blocking most of the techniques that can be used. The idea is to design what you have to do then select the best technique to perform that task. Also give some thought to multi-threading different numeric ranges in parallel as this will improve your speed.
SSE is useful legacy technology but it remains to be seen if you can write that type of code for the task you have in mind.
Quote from: hutch-- on June 29, 2021, 09:01:04 PMSSE is useful legacy technology
Some recent C compilers still haven't discovered that "legacy" technology :cool:
I tried double scalar mod test version, slow but works
You could do sieve in one thread,but interesting test would be 8 core cpu doing lots of divs,vs fewer cores cpu
Hi daydreamer,
https://stackoverflow.com/questions/17241028/finding-lists-of-prime-numbers-with-simd-sse-avx
https://stackoverflow.com/questions/8045807/do-i-get-a-performance-penalty-when-mixing-simd-instructions-and-multithreading
https://github.com/kozub/PrimalityMillerRabinTest-CUDA_GPU
https://github.com/MichaelBell/PrimesCUDA
https://www.openssl.org/docs/man1.1.0/man3/BN_is_prime_fasttest_ex.html
int BN_is_prime_fasttest_ex ( const BIGNUM * a,
int checks,
BN_CTX * ctx_passed,
int do_trial_division,
BN_GENCB * cb
)
Fast isPrime() for n < 2^32 - https://www.mersenneforum.org/showthread.php?t=12209&page=1
Tables of pseudoprimes and related data - http://www.cecm.sfu.ca/Pseudoprimes/index-2-to-64.html
isPrime64Bit.c
mulmod(unsigned long long, unsigned long long, unsigned long long): # @mulmod(unsigned long long, unsigned long long, unsigned long long)
mov rcx, rdx
mov rax, rdi
mul rsi
div rcx
mov rcx, rdx
mov rax, rcx
ret
powmod(unsigned long long, unsigned long long, unsigned long long): # @powmod(unsigned long long, unsigned long long, unsigned long long)
test rsi, rsi
je .LBB1_1
mov rcx, rdx
mov r8d, 1
jmp .LBB1_3
.LBB1_5: # in Loop: Header=BB1_3 Depth=1
mov rax, rdi
mul rdi
div rcx
mov rdi, rdx
shr rsi
je .LBB1_6
.LBB1_3: # =>This Inner Loop Header: Depth=1
test sil, 1
je .LBB1_5
mov rax, r8
mul rdi
div rcx
mov r8, rdx
jmp .LBB1_5
.LBB1_6:
mov rax, r8
ret
.LBB1_1:
mov eax, 1
ret
miller_rabin(unsigned long long, int*, int): # @miller_rabin(unsigned long long, int*, int)
push rbp
push r15
push r14
push r12
push rbx
lea r8, [rdi - 1]
xor r12d, r12d
mov r11, r8
test r8b, 1
jne .LBB2_3
xor r12d, r12d
mov rax, r8
mov r11, r8
.LBB2_2: # =>This Inner Loop Header: Depth=1
add r12d, 1
shr r11
test al, 2
mov rax, r11
je .LBB2_2
.LBB2_3:
mov r9d, 1
test edx, edx
jle .LBB2_28
test r11, r11
je .LBB2_28
mov r10d, edx
xor r15d, r15d
test r12d, r12d
je .LBB2_8
xor r9d, r9d
jmp .LBB2_17
.LBB2_7: # in Loop: Header=BB2_8 Depth=1
add r15, 1
cmp r15, r10
je .LBB2_29
.LBB2_8: # =>This Loop Header: Depth=1
movsxd r14, dword ptr [rsi + 4*r15]
mov ebx, 1
mov rbp, r14
mov rcx, r11
jmp .LBB2_10
.LBB2_9: # in Loop: Header=BB2_10 Depth=2
mov rax, rbp
mul rbp
div rdi
mov rbp, rdx
shr rcx
je .LBB2_12
.LBB2_10: # Parent Loop BB2_8 Depth=1
test cl, 1
je .LBB2_9
mov rax, rbx
mul rbp
div rdi
mov rbx, rdx
jmp .LBB2_9
.LBB2_12: # in Loop: Header=BB2_8 Depth=1
cmp rbx, 1
je .LBB2_7
cmp rbx, r8
je .LBB2_7
test r14d, r14d
je .LBB2_7
xor r9d, r9d
jmp .LBB2_29
.LBB2_16: # in Loop: Header=BB2_17 Depth=1
add r15, 1
cmp r15, r10
je .LBB2_28
.LBB2_17: # =>This Loop Header: Depth=1
movsxd r14, dword ptr [rsi + 4*r15]
mov ebx, 1
mov rcx, r14
mov rbp, r11
jmp .LBB2_19
.LBB2_18: # in Loop: Header=BB2_19 Depth=2
mov rax, rcx
mul rcx
div rdi
mov rcx, rdx
shr rbp
je .LBB2_21
.LBB2_19: # Parent Loop BB2_17 Depth=1
test bpl, 1
je .LBB2_18
mov rax, rbx
mul rcx
div rdi
mov rbx, rdx
jmp .LBB2_18
.LBB2_21: # in Loop: Header=BB2_17 Depth=1
cmp rbx, 1
je .LBB2_16
mov ecx, r12d
cmp rbx, r8
je .LBB2_16
.LBB2_23: # Parent Loop BB2_17 Depth=1
mov rax, rbx
mul rbx
div rdi
mov rbx, rdx
cmp rbx, 1
je .LBB2_29
cmp rbx, r8
je .LBB2_16
add ecx, -1
jne .LBB2_23
test r14d, r14d
je .LBB2_16
jmp .LBB2_29
.LBB2_28:
mov r9d, 1
.LBB2_29:
mov eax, r9d
pop rbx
pop r12
pop r14
pop r15
pop rbp
ret
isPrime64Bit(unsigned long long): # @isPrime64Bit(unsigned long long)
sub rsp, 24
movabs rax, 1395864371202
mov qword ptr [rsp], rax
mov dword ptr [rsp + 8], 9375
movabs rax, 3141592653589793239
imul rax, rdi
shr rax, 40
and eax, 8188
mov eax, dword ptr [rax + SPRPlookup]
mov dword ptr [rsp + 12], eax
cmp rdi, 13
ja .LBB3_2
mov eax, 1
mov ecx, 8236
bt rcx, rdi
jae .LBB3_2
.LBB3_5:
add rsp, 24
ret
.LBB3_2:
xor eax, eax
cmp rdi, 5
jb .LBB3_5
mov ecx, edi
and ecx, 1
test rcx, rcx
je .LBB3_5
mov rsi, rsp
mov edx, 4
call miller_rabin(unsigned long long, int*, int)
add rsp, 24
ret
main: # @main
push rbp
push r15
push r14
push r13
push r12
push rbx
sub rsp, 24
mov r15, rsi
cmp edi, 2
jge .LBB4_1
mov rsi, qword ptr [r15]
mov edi, offset .L.str
xor eax, eax
call printf
.LBB4_9:
xor eax, eax
add rsp, 24
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
.LBB4_1:
mov r12d, edi
mov ebx, 1
movabs rbp, 1395864371202
movabs r14, 3141592653589793239
jmp .LBB4_2
.LBB4_7: # in Loop: Header=BB4_2 Depth=1
mov edx, offset .L.str.3
.LBB4_8: # in Loop: Header=BB4_2 Depth=1
mov edi, offset .L.str.1
mov rsi, r13
xor eax, eax
call printf
add rbx, 1
cmp r12, rbx
je .LBB4_9
.LBB4_2: # =>This Inner Loop Header: Depth=1
mov rdi, qword ptr [r15 + 8*rbx]
xor esi, esi
mov edx, 10
call strtoull
mov r13, rax
mov qword ptr [rsp], rbp
mov dword ptr [rsp + 8], 9375
imul rax, r14
shr rax, 40
and eax, 8188
mov eax, dword ptr [rax + SPRPlookup]
mov dword ptr [rsp + 12], eax
cmp r13, 13
ja .LBB4_4
mov eax, 8236
bt rax, r13
jae .LBB4_4
mov edx, offset .L.str.2
jmp .LBB4_8
.LBB4_4: # in Loop: Header=BB4_2 Depth=1
cmp r13, 5
jb .LBB4_7
mov eax, r13d
and eax, 1
test rax, rax
je .LBB4_7
mov rdi, r13
mov rsi, rsp
mov edx, 4
call miller_rabin(unsigned long long, int*, int)
mov edx, offset .L.str.2
test eax, eax
jne .LBB4_8
jmp .LBB4_7
SPRPlookup:
.long 99 # 0x63
.long 447 # 0x1bf
.long 426 # 0x1aa
..........
.L.str:
.asciz "Usage: %s <num>...\n"
.L.str.1:
.asciz "%llu: %s\n"
.L.str.2:
.asciz "true"
.L.str.3:
.asciz "false"
Or
isPrime64Bit.c
clang -C -O3 -mavx2 -fno-slp-vectorize -fno-vectorize -mllvm -unroll-count=20
Thanks LiaoMi :thumbsup:
I could maybe try AVX version
seen output is the performance hog,so 1 lowpriority thread taking care of IO and others are x numbers of highpriority threads,x number of cores?would that be good solution?
Quote from: daydreamer on July 06, 2021, 11:30:20 PM
I could maybe try AVX version
seen output is the performance hog,so 1 lowpriority thread taking care of IO and others are x numbers of highpriority threads,x number of cores?would that be good solution?
SSE unrolled version between 0 and 15ms for 65k numbers,for 8800000 numbers,156ms
now as exercise making avx calculator,maybe SIMD ascii2double and double2ascii would be faster ?,well at least its an exercise