News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests

Main Menu

Multiply two QWORDs

Started by jj2007, September 15, 2018, 11:11:11 PM

Previous topic - Next topic

jj2007

Good job, Rui :t
Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz (SSE4)

1192    cycles for 100 * MultQQ
1843    cycles for 100 * Multiply64_64_v3v(Rui)
2168    cycles for 100 * u64_mul (Rui)
1215    cycles for 100 * PCLMULQDQ
2830    cycles for 100 * MultPclmulqdq2
4281    cycles for 100 * doMul (aw27)
963     cycles for 100 * muld (Nidud)
2304    cycles for 100 * _mul128 (Nidud)

913     cycles for 100 * MultQQ
1846    cycles for 100 * Multiply64_64_v3v(Rui)
2163    cycles for 100 * u64_mul (Rui)
1208    cycles for 100 * PCLMULQDQ
2827    cycles for 100 * MultPclmulqdq2
4270    cycles for 100 * doMul (aw27)
964     cycles for 100 * muld (Nidud)
2301    cycles for 100 * _mul128 (Nidud)

912     cycles for 100 * MultQQ
1840    cycles for 100 * Multiply64_64_v3v(Rui)
2166    cycles for 100 * u64_mul (Rui)
1210    cycles for 100 * PCLMULQDQ
2833    cycles for 100 * MultPclmulqdq2
4288    cycles for 100 * doMul (aw27)
968     cycles for 100 * muld (Nidud)
2297    cycles for 100 * _mul128 (Nidud)

909     cycles for 100 * MultQQ
1850    cycles for 100 * Multiply64_64_v3v(Rui)
2173    cycles for 100 * u64_mul (Rui)
1208    cycles for 100 * PCLMULQDQ
2831    cycles for 100 * MultPclmulqdq2
4326    cycles for 100 * doMul (aw27)
957     cycles for 100 * muld (Nidud)
2316    cycles for 100 * _mul128 (Nidud)

62      bytes for MultQQ
194     bytes for Multiply64_64_v3v(Rui)
126     bytes for u64_mul (Rui)
46      bytes for PCLMULQDQ
52      bytes for MultPclmulqdq2
253     bytes for doMul (aw27)
68      bytes for muld (Nidud)
146     bytes for _mul128 (Nidud)

MultQQ                 6760860027809745732
Multiply64_64_v3v(Rui) 6760860027809745732  - high QWORD: 1728378107
u64_mul (Rui)          6760860027809745732  - high QWORD: 1728378107
PCLMULQDQ              7817399311675693060
MultPclmulqdq2         7817399311675693060
doMul (aw27)           6760860027809745732  - high QWORD: 1728378107
muld (Nidud)           6760860027809745732
_mul128 (Nidud)        6760860027809745732

RuiLoureiro


aw27

Quote from: nidud on September 19, 2018, 07:04:13 AM

    mov rax,0x1122334455667788
    mov rcx,0x99aabbccddeeff00
    mul rcx
    printf("%#I64x%I64x\n", rdx, rax)


Amazing serendipity after countless tries and fails! Congratulations!  :t

nidud

#33
deleted

RuiLoureiro

Hi nidud,
            you are using the old code. This is the latest code I posted, tested in reply #30

OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
; ---------------------------------------------------------------------------
; Multiply64by64_v3 - unsigned 64 x 64 -> 128-bit multiply, 32-bit code.
;
;   pX64  pointer to the first operand  (dwords at offsets IdxX0, IdxX1)
;   pY64  pointer to the second operand (dwords at offsets IdxY0, IdxY1)
;   pZ    pointer to the 128-bit result (dwords at offsets IdxZ0 .. IdxZ3)
;
; The Idx* offsets are defined outside this procedure. Index 0 is the least
; significant dword: X0*Y0 lands in Z0/Z1 and every further partial product
; is accumulated one dword higher.
;
; No prologue/epilogue is generated, so arguments are addressed relative to
; esp (two pushes + return address = 12 bytes before the first argument).
; ebx and esi are saved and restored; eax, ecx, edx and the flags are
; modified. The callee removes the three dword arguments (ret 12).
; ---------------------------------------------------------------------------
Multiply64by64_v3       proc     pX64:DWORD, pY64:DWORD, pZ:DWORD
                        push     ebx
                        push     esi

                        mov      ebx, [esp+12]           ; pX64
                        mov      esi, [esp+16]           ; pY64
                        mov      ecx, [esp+20]           ; pZ

                        ; ----------------------
                        ;   X0*Y0  ->  Z1:Z0
                        ; ----------------------
                        mov      eax, [ebx+IdxX0]
                        mul      dword ptr [esi+IdxY0]
                        mov      [ecx+IdxZ0], eax
                        mov      [ecx+IdxZ1], edx

                        ; ----------------------
                        ;   X0*Y1, added at dword 1  ->  Z2:Z1
                        ; ----------------------
                        mov      eax, [ebx+IdxX0]
                        mul      dword ptr [esi+IdxY1]

                        add      eax, [ecx+IdxZ1]
                        mov      [ecx+IdxZ1], eax
                        adc      edx, 0                  ; cannot wrap: high dword of a 32x32 product is at most 0FFFFFFFEh
                        mov      [ecx+IdxZ2], edx

                        ; ----------------------
                        ;   X1*Y0, added at dword 1  ->  carry:Z2:Z1
                        ; ----------------------
                        mov      eax, [ebx+IdxX1]
                        mul      dword ptr [esi+IdxY0]

                        add      eax, [ecx+IdxZ1]
                        mov      [ecx+IdxZ1], eax
                        adc      edx, [ecx+IdxZ2]        ; this addition CAN carry out of dword 2
                        mov      [ecx+IdxZ2], edx        ; MOV leaves CF untouched

                        ; ----------------------
                        ;   X1*Y1, added at dword 2  ->  Z3:Z2
                        ; ----------------------
                        mov      eax, [ebx+IdxX1]        ; last use of pX64; CF still holds the carry
                        mov      ebx, 0                  ; MOV (not XOR) so that CF survives
                        adc      ebx, 0                  ; ebx = carry out of dword 2 (0 or 1)
                        mul      dword ptr [esi+IdxY1]

                        add      eax, [ecx+IdxZ2]
                        mov      [ecx+IdxZ2], eax
                        adc      edx, ebx                ; fold in both carries; cannot wrap, the product fits in 128 bits
                        mov      [ecx+IdxZ3], edx

                        pop      esi
                        pop      ebx
                        ret      12
Multiply64by64_v3       endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

nidud

#35
deleted

aw27

Back into action  :badgrin:
This 32-bit version (*) is the fastest of those that work (i.e. those that produce 128-bit results), and among those that work it is the only one whose author is known (the others have been copied and pasted from some unknown excavated ditch).


.486
.model flat, stdcall

includelib \masm32\lib\kernel32.lib
ExitProcess proto :dword
includelib \masm32\lib\msvcrt.lib
printf proto C :ptr, :vararg

; 128-bit value laid out as four consecutive dwords (16 bytes in total).
; multifast64x64_128 stores the least significant dword of the product in
; q1l and the most significant one in q2h.
twoQwords STRUCT
    q1l DWORD 0         ; offset  0
    q1h DWORD 0         ; offset  4
    q2l DWORD 0         ; offset  8
    q2h DWORD 0         ; offset 12
twoQwords ENDS

.data
; Operands of the demo multiplication and the 128-bit result buffer.
qword1 qword 0F122334455667788h
qword2 qword 0F9AABBCCDDEEFF00h
result twoQwords <>
; printf format: both operands, then the product as high qword followed by
; low qword. The low qword is zero-padded to 16 hex digits; without the
; padding its leading zeros would be dropped and the concatenated 128-bit
; value would be printed wrong.
valuePrint db "val1=0x%llx",10,"val2=0x%llx",10,"val1*val2=0x%llx%016llx",10,0

.code

OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
; ---------------------------------------------------------------------------
; multifast64x64_128 - unsigned 64 x 64 -> 128-bit multiply, 32-bit code.
;
;   arg1, arg2  the two 64-bit factors, passed by value on the stack
;   arg3        pointer to a twoQwords structure that receives the product
;
; No prologue/epilogue is generated. After the two pushes below the stack
; holds: [esp] saved esi, [esp+4] saved ebx, [esp+8] return address, then
; the arguments, which is what the text equates below address.
; ebx and esi are saved and restored; eax, ecx, edx and the flags are
; modified. The callee removes the 20 bytes of arguments (ret 20).
; ---------------------------------------------------------------------------
firstL EQU [esp+12]
firstH EQU [esp+16]
secL EQU [esp+20]
secH EQU [esp+24]
res EQU [esp+28]
multifast64x64_128 proc arg1:qword, arg2 :qword, arg3:ptr
    push ebx
    push esi
    mov ecx, res                        ; ecx -> result structure

    ; low * low: dword 0 is final, the high half is carried into dword 1
    mov eax, secL
    mul dword ptr firstL
    mov [ecx].twoQwords.q1l, eax
    mov ebx, edx

    ; high * low, added at dword 1: ebx = running dword 1, esi = carry into dword 2
    mov eax, secL
    mul dword ptr firstH
    add eax, ebx
    adc edx, 0
    mov ebx, eax
    mov esi, edx

    ; low * high, added at dword 1: dword 1 is final, ebx = second carry into dword 2
    mov eax, secH
    mul dword ptr firstL
    add eax, ebx
    adc edx, 0
    mov [ecx].twoQwords.q1h, eax
    mov ebx, edx

    ; high * high plus both pending carries gives dwords 2 and 3
    mov eax, secH
    mul dword ptr firstH
    add eax, esi
    adc edx, 0
    add eax, ebx
    adc edx, 0
    mov [ecx].twoQwords.q2l, eax
    mov [ecx].twoQwords.q2h, edx

    pop esi
    pop ebx
    ret 20
multifast64x64_128 endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

; Program entry: multiplies qword1 by qword2, prints both operands and the
; 128-bit product, then terminates the process.
main proc
    invoke multifast64x64_128, qword1, qword2, offset result

    ; esi -> result; printf receives the upper qword first, then the lower one
    mov esi, offset result
    invoke printf, offset valuePrint, qword1, qword2, qword ptr [esi].twoQwords.q2l, qword ptr [esi].twoQwords.q1l

    invoke ExitProcess, 0
main endp

end


val1=0xf122334455667788
val2=0xf9aabbccddeeff00
val1*val2=0xeb2b15787630c963479983e499807800


(*) Guaranteed True Masm (TM) code, no BASIC or C imitations.

jj2007

Quote from: AW on September 20, 2018, 04:39:00 PMThis 32-bit version (*) is the fastest of those that work (i.e. those that produce 128-bit results)

I know one shouldn't feed the troll but for others who follow this thread a reminder of the original post:

Quote from: jj2007 on September 15, 2018, 11:11:11 PM
This pops up sometimes with random number generation: How to multiply two QWORDs?

There might be an exotic case where a 128-bit integer is useful, but almost nobody needs that, therefore the algos that produce QWORD output are perfectly valid for this purpose (and they work also for multiplying a dq 18446744073709551615, except PCLMULQDQ).

Even the "distorted" carry-less PCLMULQDQ is valid for random number generation. And of course, mul rcx also produces a qword, but we are talking 32-bit code here.

aw27

Quote
There might be an exotic case where a 128-bit integer is useful, but almost nobody needs that
Everything you can't manage is exotic. One thing I am sure, nobody needs the crapware you drop here.

Quote
And of course, mul rcx also produces a qword,
It does not (do you need a link to learn about mul?), that's why I congratulated nidud. Sort of Colombo egg, he found it by serendipity after many unsuccessful tries.

jj2007

Dear José,

In the interest of everybody, please take your pills regularly :icon14:

Quote from: AW on September 18, 2018, 09:39:05 PM
As always, you are a scam artist.

Quote from: AW on September 19, 2018, 03:39:16 PMAmazing serendipity after countless tries and fails! Congratulations!  :t

Quote from: AW on September 20, 2018, 05:28:59 PM
Everything you can't manage is exotic. One thing I am sure, nobody needs the crapware you drop here.

hutch--

Here is a quick play with mulx. This is 64 bit code but apparently it also works in 32 bit but only with 32 bit variables.

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

    include \masm32\include64\masm64rt.inc

    .code

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

; Demonstrates MULX: multiplies the implicit source rdx by r13 and prints the
; low half (r12), the multiplier (r13) and the high half of the product.
entry_point proc

    USING r12,r13,r14,r15

    SaveRegs

    mov r12, 0
    mov r13, 3
    mov rdx, 2000000000000000000    ; 19 digits

    mulx rdx,r12,r13        ; rdx:r12 = rdx * r13 (high half -> rdx, low half -> r12)

    ; rdx is a volatile register under the Microsoft x64 calling convention,
    ; so the conout/str$ expansions below are free to overwrite it. Park the
    ; high half in r14 (saved by SaveRegs) before any output is produced.
    mov r14, rdx

    conout "  r12 = ",str$(r12),lf
    conout "  r13 = ",str$(r13),lf
    conout "  rdx = ",str$(r14),lf

    waitkey
    RestoreRegs
    .exit

entry_point endp

; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤

    end

; Description
;
; Performs an unsigned multiplication of the implicit source operand (EDX/RDX)
; and the specified source operand (the third operand) and stores the low half
; of the result in the second destination (second operand), the high half
; of the result in the first destination operand (first operand), without reading
; or writing the arithmetic flags.
;
; This enables efficient programming where the software can interleave add with
; carry operations and multiplications.

; If the first and second operand are identical, it will contain the high half
; of the multiplication result.
;
; This instruction is not supported in real mode and virtual-8086 mode. The operand
; size is always 32 bits if not in 64-bit mode. In 64-bit mode operand size 64
; requires VEX.W1. VEX.W1 is ignored in non-64-bit modes. An
; attempt to execute this instruction with VEX.L not equal to 0 will cause #UD.

jj2007

Quote from: hutch-- on September 20, 2018, 06:20:51 PM
Here is a quick play with mulx. This is 64 bit code but apparently it also works in 32 bit but only with 32 bit variables.
Looks interesting, and mulx eax, ebx, ecx builds fine but causes an illegal instruction exception on my trusty old Core i5 :(

aw27

Quote from: jj2007 on September 20, 2018, 07:37:04 PM
Looks interesting,
You mean it's exotic, let me correct:
There might be an exotic case where a mulx is useful, but almost nobody needs that, therefore the algos that use mul are perfectly valid for this purpose

aw27

Using the new mulx instructions on the above 32-bit code we can save at least 4 instructions giving a new total 26 instructions.
The .xmm directive is necessary with this instruction, something a bit exotic, as someone from this forum would say.


.686p
.xmm
.model flat, stdcall

includelib \masm32\lib\kernel32.lib
ExitProcess proto :dword
includelib \masm32\lib\msvcrt.lib
printf proto C :ptr, :vararg

; 128-bit value laid out as four consecutive dwords (16 bytes in total).
; multifast64x64_128 stores the least significant dword of the product in
; q1l and the most significant one in q2h.
twoQwords STRUCT
    q1l DWORD 0         ; offset  0
    q1h DWORD 0         ; offset  4
    q2l DWORD 0         ; offset  8
    q2h DWORD 0         ; offset 12
twoQwords ENDS

.data
; Operands of the demo multiplication and the 128-bit result buffer.
qword1 qword 0F122334455667788h
qword2 qword 0F9AABBCCDDEEFF00h
result twoQwords <>
; printf format: both operands, then the product as high qword followed by
; low qword. The low qword is zero-padded to 16 hex digits; without the
; padding its leading zeros would be dropped and the concatenated 128-bit
; value would be printed wrong.
valuePrint db "val1=0x%llx",10,"val2=0x%llx",10,"val1*val2=0x%llx%016llx",10,0

.code

OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
; ---------------------------------------------------------------------------
; multifast64x64_128 - unsigned 64 x 64 -> 128-bit multiply, 32-bit code,
; MULX variant (the CPU must support the MULX instruction).
;
;   arg1, arg2  the two 64-bit factors, passed by value on the stack
;   arg3        pointer to a twoQwords structure that receives the product
;
; No prologue/epilogue is generated. After the two pushes below the stack
; holds: [esp] saved esi, [esp+4] saved ebx, [esp+8] return address, then
; the arguments, which is what the text equates below address.
; ebx and esi are saved and restored; eax, ecx, edx and the flags are
; modified. The callee removes the 20 bytes of arguments (ret 20).
;
; MULX takes edx as its implicit factor, writes high half -> first operand
; and low half -> second operand, and does not touch the flags.
; ---------------------------------------------------------------------------
firstL EQU [esp+12]
firstH EQU [esp+16]
secL EQU [esp+20]
secH EQU [esp+24]
res EQU [esp+28]
multifast64x64_128 proc arg1:qword, arg2 :qword, arg3:ptr
    push ebx
    push esi

    ; low * low: dword 0 is final, ebx = high half (partial dword 1)
    mov edx, firstL
    mulx ebx, eax, dword ptr secL
    mov ecx, res
    mov (twoQwords PTR [ecx]).q1l, eax

    ; high * low: esi:eax; the low half is added into dword 1
    mov edx, firstH
    mulx esi, eax, dword ptr secL
    add ebx, eax
    adc esi, 0                  ; keep the carry out of dword 1; the following
                                ; "add eax, ebx" would otherwise overwrite CF
                                ; and the carry would be lost

    ; low * high, added at dword 1: dword 1 is final, ebx = carry into dword 2
    mov edx, firstL
    mulx edx, eax, dword ptr secH
    add eax, ebx
    mov (twoQwords PTR [ecx]).q1h, eax      ; MOV leaves CF untouched
    adc edx, 0
    mov ebx, edx

    ; high * high plus both pending carries gives dwords 2 and 3
    mov edx, firstH
    mulx edx, eax, dword ptr secH
    add eax, esi
    adc edx,0
    add eax, ebx
    adc edx,0

    mov (twoQwords PTR [ecx]).q2l, eax
    mov (twoQwords PTR [ecx]).q2h, edx

    pop esi
    pop ebx
    ret 20
multifast64x64_128 endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

; Program entry: multiplies qword1 by qword2, prints both operands and the
; 128-bit product, then terminates the process.
main proc
    invoke multifast64x64_128, qword1, qword2, offset result

    ; esi -> result; printf receives the upper qword first, then the lower one
    mov esi, offset result
    invoke printf, offset valuePrint, qword1, qword2, qword ptr [esi].twoQwords.q2l, qword ptr [esi].twoQwords.q1l

    invoke ExitProcess, 0
main endp

end


Some less young (but trusty) computers will refuse to run the mulx instruction.