News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests
NB: Posting URL's See here: Posted URL Change

Main Menu

Byte shuffling on GPR

Started by alikim, August 30, 2017, 08:24:39 PM

Previous topic - Next topic

alikim

Hello, this is my first assembly program, compiled as part of a C++ Visual Studio project. I'd like to know whether this code can be optimized for speed using only general-purpose registers, without extensions like pshufb. Thank you!

        ; Byte shuffle of a 16-byte state using general-purpose registers only.
        ; In:      rdx = address of the 16-byte state, r8 = address of the 16-byte output
        ; Scratch: rax, rbx, rdx, r9, r11 (rdx is overwritten once both loads are done)
        ; NOTE(review): rbx is callee-saved in the Microsoft x64 convention - the
        ; enclosing procedure (not shown) presumably saves/restores it; confirm.
        mov rax, qword ptr [rdx]         ; load state first 8 bytes
        mov rbx, qword ptr [rdx + 8]     ; load state second 8 bytes

                        ; shuffle
; Registers are drawn most-significant byte first; "0" marks a zeroed byte.
; f-e-d-2 c-b-a-1  rax
; M-L-K-4 I-H-G-3  rbx
;        v
; I-L-a-2 f-H-K-1  rax
; c-e-G-4 M-b-d-3  rbx
; The masked pieces never overlap, so every "add" below acts as a bitwise OR.

mov r9,  00FF000000FF0000h   ; mask: bytes 6 and 2
mov rdx, r9                  ; second copy of the mask, to be applied to rbx
and r9,  rax                 ; 0-e-0-0 0-b-0-0  r9
rol rax, 32                  ; c-b-a-1 f-e-d-2  rax
mov r11,0FF0000000000FF00h   ; mask: bytes 7 and 1
and rdx, rbx                 ; 0-L-0-0 0-H-0-0  rdx
and r11, rax                 ; c-0-0-0 0-0-d-0  r11
add r9,  r11                 ; c-e-0-0 0-b-d-0  r9
mov r11, 0000FF00FF000000h   ; mask: bytes 5 and 3
rol rbx, 32                  ; I-H-G-3 M-L-K-4  rbx
and r11, rbx                 ; 0-0-G-0 M-0-0-0  r11
add r9,  r11                 ; c-e-G-0 M-b-d-0  r9
mov r11,0FF0000000000FF00h   ; mask: bytes 7 and 1 (reloaded; the AND above consumed it)
and r11, rbx                 ; I-0-0-0 0-0-K-0  r11
add rdx, r11                 ; I-L-0-0 0-H-K-0  rdx
mov r11, 0000FF00FF000000h   ; mask: bytes 5 and 3 (reloaded)
and r11, rax                 ; 0-0-a-0 f-0-0-0  r11
add rdx, r11                 ; I-L-a-0 f-H-K-0  rdx
mov r11, 000000FF000000FFh   ; mask: bytes 4 and 0
rol rax, 32                  ; f-e-d-2 c-b-a-1  rax
and rax, r11                 ; 0-0-0-2 0-0-0-1  rax
mov r11, 000000FF000000FFh   ; same mask again; r11 was not modified since the previous load
rol rbx, 32                  ; M-L-K-4 I-H-G-3  rbx
add rax, rdx                 ; I-L-a-2 f-H-K-1  rax
and rbx, r11                 ; 0-0-0-4 0-0-0-3  rbx
add rbx, r9                  ; c-e-G-4 M-b-d-3  rbx

                        mov qword ptr [r8], rax          ; return output
        mov qword ptr [r8 + 8], rbx      ; return output

jj2007

Looks OK, but you can't judge until you run the code in a testbed and try your luck with little modifications.
What's wrong with pshufb? It's not an "extension", it's just a SIMD instruction, and I doubt that you can find a CPU nowadays that cannot understand at least SSE2.

alikim

Thank you. There is nothing wrong with pshufb; I'm studying assembly as a hobby and am just curious how I would shuffle bytes without it.

jj2007

Quote from: alikim on August 30, 2017, 08:44:44 PMI'm studying assembly as a hobby

Welcome to the club :icon14:

From Masm32 Tips, Tricks and Traps
Quote- Last but not least: A few Rules for the Forum

* Use include \masm32\include\masm32rt.inc, not include C:\masm32\include\masm32rt.inc; many people have installed Masm32 on a different drive, and it's a real nuisance having to replace C: with D: to test a code snippet

* To allow others to test your code, do not use environment variables for your paths. Masm32 has a hard-coded path structure, for good reasons.

* Post your complete code. Some believe that older members are eager to construct the missing headers around your snippets, but this belief is FALSE

* Formulate precise questions, provide precise error messages and error lines. You will get quicker and better answers

* Have fun with Masm, and enjoy the friendly atmosphere at the Masm32 Forum!
;)

Siekmanski

alikim, welcome.

Only had a quick look, and spotted this:

        ; Reviewer-annotated copy of the original shuffle; the instructions are unchanged.
        ; "Nth mask" in the notes counts the 64-bit mask constants in order of appearance.
        mov rax, qword ptr [rdx]         ; load state first 8 bytes
        mov rbx, qword ptr [rdx + 8]     ; load state second 8 bytes

                        ; shuffle
; Registers are drawn most-significant byte first; "0" marks a zeroed byte.
; f-e-d-2 c-b-a-1  rax
; M-L-K-4 I-H-G-3  rbx
;        v
; I-L-a-2 f-H-K-1  rax
; c-e-G-4 M-b-d-3  rbx

mov r9,  00FF000000FF0000h
mov rdx, r9 
and r9,  rax                 ; 0-e-0-0 0-b-0-0  r9
rol rax, 32                  ; c-b-a-1 f-e-d-2  rax
mov r11,0FF0000000000FF00h   ; 2nd mask - same value as the 4th mask (use one extra register for the 3rd mask and delete the line that loads the 4th mask)
and rdx, rbx                 ; 0-L-0-0 0-H-0-0  rdx
and r11, rax                 ; c-0-0-0 0-0-d-0  r11
                             ; NOTE(review): this AND overwrites r11, so keeping the 2nd mask
                             ; for later also requires a copy taken before this line
add r9,  r11                 ; c-e-0-0 0-b-d-0  r9
mov r11, 0000FF00FF000000h   ; 3rd mask - use another register for this mask and preserve the 2nd mask (r11) for the 4th mask
rol rbx, 32                  ; I-H-G-3 M-L-K-4  rbx
and r11, rbx                 ; 0-0-G-0 M-0-0-0  r11
add r9,  r11                 ; c-e-G-0 M-b-d-0  r9
mov r11,0FF0000000000FF00h   ; <-- 4th mask
and r11, rbx                 ; I-0-0-0 0-0-K-0  r11
add rdx, r11                 ; I-L-0-0 0-H-K-0  rdx
mov r11, 0000FF00FF000000h
and r11, rax                 ; 0-0-a-0 f-0-0-0  r11
add rdx, r11                 ; I-L-a-0 f-H-K-0  rdx
mov r11, 000000FF000000FFh
rol rax, 32                  ; f-e-d-2 c-b-a-1  rax
and rax, r11                 ; 0-0-0-2 0-0-0-1  rax
mov r11, 000000FF000000FFh   ; <-- delete this line, the value is already loaded in r11
rol rbx, 32                  ; M-L-K-4 I-H-G-3  rbx
add rax, rdx
and rbx, r11                 ; 0-0-0-4 0-0-0-3  rbx
add rbx, r9

                        mov qword ptr [r8], rax          ; return output
        mov qword ptr [r8 + 8], rbx      ; return output
Creative coders use backward thinking techniques as a strategy.

Siekmanski

I'm not on a computer with masm installed, so it may not work ( can't test it )
Shuffled the order of the code a bit.

        mov rax, qword ptr [rdx]         ; load state first 8 bytes
        mov rbx, qword ptr [rdx + 8]     ; load state second 8 bytes

                        ; shuffle
; f-e-d-2 c-b-a-1  rax
; M-L-K-4 I-H-G-3  rbx
;        v
; I-L-a-2 f-H-K-1  rax
; c-e-G-4 M-b-d-3  rbx

mov r9,  00FF000000FF0000h
mov rdx, r9 
and r9,  rax                 ; 0-e-0-0 0-b-0-0  r9
rol rax, 32                  ; c-b-a-1 f-e-d-2  rax
mov r11,0FF0000000000FF00h
mov r8,  r11
and rdx, rbx                 ; 0-L-0-0 0-H-0-0  rdx
and r11, rax                 ; c-0-0-0 0-0-d-0  r11
add r9,  r11                 ; c-e-0-0 0-b-d-0  r9
mov r11, 0000FF00FF000000h
rol rbx, 32                  ; I-H-G-3 M-L-K-4  rbx
and r11, rbx                 ; 0-0-G-0 M-0-0-0  r11
and r8,  rbx                 ; I-0-0-0 0-0-K-0  r11
add r9,  r11                 ; c-e-G-0 M-b-d-0  r9
add rdx, r8                  ; I-L-0-0 0-H-K-0  rdx
mov r11, 0000FF00FF000000h
and r11, rax                 ; 0-0-a-0 f-0-0-0  r11
add rdx, r11                 ; I-L-a-0 f-H-K-0  rdx
rol rax, 32                  ; f-e-d-2 c-b-a-1  rax
rol rbx, 32                  ; M-L-K-4 I-H-G-3  rbx
and rax, r11                 ; 0-0-0-2 0-0-0-1  rax
and rbx, 000000FF000000FFh   ; 0-0-0-4 0-0-0-3  rbx
add rax, rdx
add rbx, r9

                        mov qword ptr [r8], rax          ; return output
        mov qword ptr [r8 + 8], rbx      ; return output
Creative coders use backward thinking techniques as a strategy.

alikim

Thank you for your suggestions!
Using an extra register gave it a significant boost ( I tried code shuffling - that did not do anything).
I only use masm inside VS shell, so the code below is probably not for stand-alone masm. I also added MMX version.

    _DATA SEGMENT
; msk: 16-byte control table loaded by avx_decode and passed to pshufb.
; Output byte i of the shuffle is taken from input byte msk[i].
ALIGN 16
msk db 0,13,10,7, 4,1,14,11, 8,5,2,15, 12,9,6,3
    _DATA ENDS
    _TEXT SEGMENT

    PUBLIC avx_decode                    ; void avx_decode(void *key, void *message, void *output);
    avx_decode PROC                      ; RCX, RDX, R8 - params
                                 ; non volatile RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15

push rbp                         ; save frame pointer
mov rbp, rsp                     ; fix stack pointer
sub rsp, 8 * (4 + 2)             ; allocate shadow register area + 2 QWORDs for stack alignment

movdqu xmm0, xmmword ptr [rdx]         ; load message
movdqu xmm2, xmmword ptr msk           ; load mask

; 1-A-B-C  2-D-E-F
; 3-G-H-I  4-K-L-M
;         v
; 1-K-H-F  2-A-L-I
; 3-D-B-M  4-G-E-C
pshufb xmm0, xmm2

movdqu xmmword ptr [r8], xmm0    ; return output

emms
                   
mov rsp, rbp                     ; epilog. restore stack pointer
pop rbp
ret
    avx_decode ENDP


    PUBLIC asm_decode                    ; void asm_decode(void *key, void *message, void *output);
    asm_decode PROC                      ; RCX, RDX, R8 - params
                                 ; non volatile RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15

push rbp                         ; save frame pointer
mov rbp, rsp                     ; fix stack pointer
sub rsp, 8 * (4 + 2)             ; allocate shadow register area + 2 QWORDs for stack alignment

push rbx
push r12

mov rax, qword ptr [rdx]         ; load state first 8 bytes
mov rbx, qword ptr [rdx + 8]     ; load state second 8 bytes

; f-e-d-2 c-b-a-1  rax
; M-L-K-4 I-H-G-3  rbx
;        v
; I-L-a-2 f-H-K-1  rax
; c-e-G-4 M-b-d-3  rbx
mov r9,  00FF000000FF0000h
mov rdx, r9 
and r9,  rax                 ; 0-e-0-0 0-b-0-0  r9
rol rax, 32                  ; c-b-a-1 f-e-d-2  rax
mov r11,0FF0000000000FF00h
mov r12, r11   
and rdx, rbx                 ; 0-L-0-0 0-H-0-0  rdx
and r11, rax                 ; c-0-0-0 0-0-d-0  r11
add r9,  r11                 ; c-e-0-0 0-b-d-0  r9
mov r11, 0000FF00FF000000h
rol rbx, 32                  ; I-H-G-3 M-L-K-4  rbx
and r11, rbx                 ; 0-0-G-0 M-0-0-0  r11
and r12, rbx                 ; I-0-0-0 0-0-K-0  r12
add r9,  r11                 ; c-e-G-0 M-b-d-0  r9
add rdx, r12                 ; I-L-0-0 0-H-K-0  rdx
mov r11, 0000FF00FF000000h
and r11, rax                 ; 0-0-a-0 f-0-0-0  r11
add rdx, r11                 ; I-L-a-0 f-H-K-0  rdx
mov r11, 000000FF000000FFh
rol rax, 32                  ; f-e-d-2 c-b-a-1  rax
rol rbx, 32                  ; M-L-K-4 I-H-G-3  rbx
and rax, r11                 ; 0-0-0-2 0-0-0-1  rax
add rax, rdx
and rbx, r11                 ; 0-0-0-4 0-0-0-3  rbx
add rbx, r9

mov qword ptr [r8], rax          ; return output
mov qword ptr [r8 + 8], rbx      ; return output

pop r12
pop rbx
                   
mov rsp, rbp                     ; epilog. restore stack pointer
pop rbp
ret
    asm_decode ENDP
    _TEXT ENDS
    END

aw27

#7
This is a solution with output:



; Stand-alone 64-bit MASM test program: shuffles two 8-byte strings with calc
; and prints them before and after.
option casemap :None

; External routines used by main
includelib \masm32\lib64\kernel32.lib
ExitProcess proto :dword
includelib \masm32\lib64\msvcrt.lib
printf proto :ptr, :vararg

.data
data1 db "fed2cba1",0                   ; first input: 8 characters + NUL
data2 db "MLK4IHG3",0                   ; second input: 8 characters + NUL
data3 db 9 dup (0)                      ; receives r8 from calc; 9th byte stays 0 as terminator
data4 db 9 dup (0)                      ; receives rdx from calc; 9th byte stays 0 as terminator
format0 db "Initial: Value1-> %s value2-> %s",13,10,0
format1 db "Final: Value1-> %s value2-> %s",13,10,0

.code

; Procedures below manage the stack by hand; no assembler-generated prologue/epilogue.
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE

calc proc
;-----------------------------------------------------------------------
; calc - shuffle the bytes of two qwords (register-only, custom convention)
; In:    rcx = first qword, rdx = second qword
; Out:   r8  = shuffled first, rdx = shuffled second
; Clobb: rax, rcx, r9, r10, r11, flags
; Stack: none - leaf routine, rsp is not touched
;
; Diagrams list bytes in MEMORY order (leftmost = lowest address = least
; significant byte of the register); "0" marks a zeroed byte.
; f-e-d-2 c-b-a-1  first            I-L-a-2 f-H-K-1  r8
; M-L-K-4 I-H-G-3  second    -->    c-e-G-4 M-b-d-3  rdx
;-----------------------------------------------------------------------
        mov     r8, rcx                 ; save first
        mov     r9, rdx                 ; save second

        ; Resolve 2nd (result in rdx)
        mov     rax, 0FF000000FF00h
        and     rcx, rax                ; rcx = 0-e-0-0 0-b-0-0  (from first)
        mov     r10, r8
        rol     r10, 32                 ; r10 = c-b-a-1 f-e-d-2
        mov     rax, 0FF0000000000FFh
        and     r10, rax                ; r10 = c-0-0-0 0-0-d-0
        mov     r11, r9
        mov     rax, 0FF000000FF000000h
        and     r11, rax                ; r11 = 0-0-0-4 0-0-0-3  (from second)
        rol     rdx, 32                 ; rdx = I-H-G-3 M-L-K-4
        mov     rax, 0FF00FF0000h
        and     rdx, rax                ; rdx = 0-0-G-0 M-0-0-0
        or      rdx, rcx
        or      rdx, r10
        or      rdx, r11                ; rdx = c-e-G-4 M-b-d-3

        ; Resolve 1st (result in r8)
        mov     rcx, r8
        rol     rcx, 32                 ; rcx = c-b-a-1 f-e-d-2
        and     rcx, rax                ; rax is still 0FF00FF0000h: rcx = 0-0-a-0 f-0-0-0
        mov     r10, r9
        mov     rax, 0FF000000FF00h
        and     r10, rax                ; r10 = 0-L-0-0 0-H-0-0
        mov     r11, r9
        rol     r11, 32                 ; r11 = I-H-G-3 M-L-K-4
        mov     rax, 0FF0000000000FFh
        and     r11, rax                ; r11 = I-0-0-0 0-0-K-0
        mov     rax, 0FF000000FF000000h
        and     r8, rax                 ; r8  = 0-0-0-2 0-0-0-1  (from first)
        or      r8, rcx
        or      r8, r10
        or      r8, r11                 ; r8  = I-L-a-2 f-H-K-1
        ret
calc endp

main proc
;-----------------------------------------------------------------------
; main - program entry: shuffle data1/data2 with calc, print the strings
; before and after, then exit with code 0.
; The 28h-byte frame (32 bytes of shadow space + 8 to bring rsp back to a
; 16-byte boundary) is kept for every call, including ExitProcess; since
; ExitProcess does not return, the frame is never released.
;-----------------------------------------------------------------------
        sub     rsp, 28h

        mov     rcx, qword ptr [data1]
        mov     rdx, qword ptr [data2]

        call    calc                    ; r8 = shuffled first, rdx = shuffled second

        mov     qword ptr [data3], r8
        mov     qword ptr [data4], rdx

        lea     rcx, format0
        lea     rdx, data1
        lea     r8, data2
        call    printf                  ; print initial value

        lea     rcx, format1
        lea     rdx, data3
        lea     r8, data4
        call    printf                  ; print final value

        xor     ecx, ecx                ; exit code 0
        call    ExitProcess             ; does not return
main endp

end

aw27

@alikim
Ref your last code:
1) It is not MMX it is SSE
2) You don't need EMMS with SSE
3) You don't need this here:
sub rsp, 8 * (4 + 2)
because it is a leaf function and the stack is already aligned by the push rbp you did earlier. If you had not done push rbp, which is not needed here because you don't have local variables, you should have done only sub rsp, 8.

These are just a few cents. BTW, I have not run your code, so there may be other problems.