General > The Laboratory

Byte shuffling on GPR

(1/2) > >>

alikim:
Hello, this is my first assembly program, compiled as part of a C++ Visual Studio project. I'd like to know whether this code can be optimized for speed using only general-purpose registers, without extensions like pshufb. Thank you!


--- Code: ---         mov rax, qword ptr [rdx]         ; rax = state bytes 0-7, read from [rdx]
        mov rbx, qword ptr [rdx + 8]     ; rbx = state bytes 8-15, read from [rdx + 8]

                        ; shuffle
; Permutes the 16 bytes read from [rdx] and stores the result at [r8],
; using only general-purpose registers.
; Each row below lists one register's eight bytes, most significant byte on
; the left. Byte lanes 4 and 0 of each register (1, 2, 3, 4) stay in place;
; the other six lanes of each register are redistributed across rax and rbx.
; f-e-d-2 c-b-a-1  rax
; M-L-K-4 I-H-G-3  rbx
;        v
; I-L-a-2 f-H-K-1  rax
; c-e-G-4 M-b-d-3  rbx
;
; Registers written: rax, rbx, rdx, r9, r11 (and the flags).
; The masks select disjoint byte lanes, so every `add` below merges pieces
; without carries (it behaves like `or`).
; NOTE(review): rbx is callee-saved under the Microsoft x64 convention; no
; save/restore is visible in this snippet - confirm the enclosing procedure
; preserves it.

mov r9,  00FF000000FF0000h   ; mask for byte lanes 6 and 2
mov rdx, r9                  ; second copy of the mask; overwrites the input pointer (both loads are done)
and r9,  rax                 ; 0-e-0-0 0-b-0-0  r9
rol rax, 32                  ; c-b-a-1 f-e-d-2  rax  (halves swapped)
mov r11,0FF0000000000FF00h   ; mask for byte lanes 7 and 1
and rdx, rbx                 ; 0-L-0-0 0-H-0-0  rdx
and r11, rax                 ; c-0-0-0 0-0-d-0  r11
add r9,  r11                 ; c-e-0-0 0-b-d-0  r9
mov r11, 0000FF00FF000000h   ; mask for byte lanes 5 and 3
rol rbx, 32                  ; I-H-G-3 M-L-K-4  rbx  (halves swapped)
and r11, rbx                 ; 0-0-G-0 M-0-0-0  r11
add r9,  r11                 ; c-e-G-0 M-b-d-0  r9
mov r11,0FF0000000000FF00h   ; lanes 7 and 1 again (the earlier copy was consumed by `and`)
and r11, rbx                 ; I-0-0-0 0-0-K-0  r11
add rdx, r11                 ; I-L-0-0 0-H-K-0  rdx
mov r11, 0000FF00FF000000h   ; lanes 5 and 3 again (the earlier copy was consumed by `and`)
and r11, rax                 ; 0-0-a-0 f-0-0-0  r11
add rdx, r11                 ; I-L-a-0 f-H-K-0  rdx
mov r11, 000000FF000000FFh   ; mask for byte lanes 4 and 0
rol rax, 32                  ; f-e-d-2 c-b-a-1  rax  (original order restored)
and rax, r11                 ; 0-0-0-2 0-0-0-1  rax
mov r11, 000000FF000000FFh   ; NOTE(review): redundant - r11 already holds this value (the `and` above wrote rax, not r11)
rol rbx, 32                  ; M-L-K-4 I-H-G-3  rbx  (original order restored)
add rax, rdx                 ; I-L-a-2 f-H-K-1  rax  (final)
and rbx, r11                 ; 0-0-0-4 0-0-0-3  rbx
add rbx, r9                  ; c-e-G-4 M-b-d-3  rbx  (final)

                        mov qword ptr [r8], rax          ; store output bytes 0-7 at [r8]
        mov qword ptr [r8 + 8], rbx      ; store output bytes 8-15 at [r8 + 8]
--- End code ---

jj2007:
Looks OK, but you can't judge until you run the code in a testbed and try your luck with little modifications.
What's wrong with pshufb? It's not an "extension", it's just a SIMD instruction, and I doubt that you can find a CPU nowadays that cannot understand at least SSE2.

alikim:
Thank you. There is nothing wrong with pshufb; I'm studying assembly as a hobby and am just curious how I would shuffle bytes without it.

jj2007:

--- Quote from: alikim on August 30, 2017, 08:44:44 PM ---I'm studying assembly as a hobby
--- End quote ---

Welcome to the club :icon14:

From Masm32 Tips, Tricks and Traps
--- Quote ---- Last but not least: A few Rules for the Forum

* Use include \masm32\include\masm32rt.inc, not include C:\masm32\include\masm32rt.inc; many people have installed Masm32 on a different drive, and it's a real nuisance having to replace C: with D: to test a code snippet

* To allow others to test your code, do not use environment variables for your paths. Masm32 has a hard-coded path structure, for good reasons.

* Post your complete code. Some believe that older members are eager to construct the missing headers around your snippets, but this belief is FALSE

* Formulate precise questions, provide precise error messages and error lines. You will get quicker and better answers

* Have fun with Masm, and enjoy the friendly atmosphere at the Masm32 Forum!
--- End quote ---
;)

Siekmanski:
alikim, welcome.

Only had a quick look, and spotted this:


--- Code: ---         mov rax, qword ptr [rdx]         ; load state first 8 bytes
        mov rbx, qword ptr [rdx + 8]     ; load state second 8 bytes

                        ; shuffle
; Annotated copy of the shuffle: the instructions are unchanged, and the
; comments mark constant loads that could be shared or removed.
; Each row lists one register's eight bytes, most significant byte on the left.
; f-e-d-2 c-b-a-1  rax
; M-L-K-4 I-H-G-3  rbx
;        v
; I-L-a-2 f-H-K-1  rax
; c-e-G-4 M-b-d-3  rbx

mov r9,  00FF000000FF0000h   ; 1st mask: byte lanes 6 and 2
mov rdx, r9                  ; second copy of the 1st mask
and r9,  rax                 ; 0-e-0-0 0-b-0-0  r9
rol rax, 32                  ; c-b-a-1 f-e-d-2  rax
mov r11,0FF0000000000FF00h   ; 2nd mask; same constant as the 4th mask (load the 3rd mask into a different register so this one can be reused, then delete the 4th load)
and rdx, rbx                 ; 0-L-0-0 0-H-0-0  rdx
; NOTE(review): the next `and` writes its result into r11, which overwrites the
; mask; reusing the constant therefore needs the mask kept in a separate
; register and ANDed through a copy.
and r11, rax                 ; c-0-0-0 0-0-d-0  r11
add r9,  r11                 ; c-e-0-0 0-b-d-0  r9
mov r11, 0000FF00FF000000h   ; 3rd mask: use another register for it, preserving the 2nd mask (r11) for the 4th use
rol rbx, 32                  ; I-H-G-3 M-L-K-4  rbx
and r11, rbx                 ; 0-0-G-0 M-0-0-0  r11
add r9,  r11                 ; c-e-G-0 M-b-d-0  r9
mov r11,0FF0000000000FF00h   ; <-- 4th mask (same constant as the 2nd)
and r11, rbx                 ; I-0-0-0 0-0-K-0  r11
add rdx, r11                 ; I-L-0-0 0-H-K-0  rdx
mov r11, 0000FF00FF000000h   ; same constant as the 3rd mask
and r11, rax                 ; 0-0-a-0 f-0-0-0  r11
add rdx, r11                 ; I-L-a-0 f-H-K-0  rdx
mov r11, 000000FF000000FFh   ; mask for byte lanes 4 and 0
rol rax, 32                  ; f-e-d-2 c-b-a-1  rax
and rax, r11                 ; 0-0-0-2 0-0-0-1  rax
mov r11, 000000FF000000FFh   ; <-- delete this line: r11 still holds this value from the previous load
rol rbx, 32                  ; M-L-K-4 I-H-G-3  rbx
add rax, rdx                 ; I-L-a-2 f-H-K-1  rax  (final)
and rbx, r11                 ; 0-0-0-4 0-0-0-3  rbx
add rbx, r9                  ; c-e-G-4 M-b-d-3  rbx  (final)

                        mov qword ptr [r8], rax          ; return output
        mov qword ptr [r8 + 8], rbx      ; return output

--- End code ---

Navigation

[0] Message Index

[#] Next page

Go to full version