Byte shuffling on GPR

alikim · August 30, 2017, 08:24:39 PM

Hello, this is my first assembly program, compiled as part of a C++ VS project. I'd like to know whether this code can be optimized for speed using only general-purpose registers, without extensions like pshufb. Thank you!

Code Select

		        ; 16-byte shuffle using only general-purpose registers.
		        ; Reads 16 bytes at [rdx], writes the permuted 16 bytes to [r8].
		        ; Scratch registers used: rax, rbx, rdx, r9, r11.
		        ; NOTE(review): rbx is callee-saved in the Microsoft x64 convention and is
		        ; not saved in this fragment - confirm the enclosing procedure preserves it.
		        mov rax, qword ptr [rdx]         ; load state first 8 bytes
		        mov rbx, qword ptr [rdx + 8]     ; load state second 8 bytes		

                        ; shuffle 
			; Notation: one symbol per byte, leftmost = most significant byte
			; of the register, rightmost = least significant byte.
			; f-e-d-2 c-b-a-1  rax
			; M-L-K-4 I-H-G-3  rbx
			;        v
			; I-L-a-2 f-H-K-1  rax
			; c-e-G-4 M-b-d-3  rbx
			;
			; Every mask below keeps two byte lanes; the masked pieces never
			; overlap, so each "add" merges them exactly like an "or" would.

			mov r9,  00FF000000FF0000h 
			mov rdx, r9  ; rdx is free to reuse: both loads through it are done
			and r9,  rax                 ; 0-e-0-0 0-b-0-0  r9
			rol rax, 32                  ; c-b-a-1 f-e-d-2  rax
			mov r11,0FF0000000000FF00h   ; mask keeping bytes 7 and 1
			and rdx, rbx                 ; 0-L-0-0 0-H-0-0  rdx
			and r11, rax                 ; c-0-0-0 0-0-d-0  r11
			add r9,  r11                 ; c-e-0-0 0-b-d-0  r9
			mov r11, 0000FF00FF000000h   ; mask keeping bytes 5 and 3
			rol rbx, 32                  ; I-H-G-3 M-L-K-4  rbx
			and r11, rbx                 ; 0-0-G-0 M-0-0-0  r11
			add r9,  r11                 ; c-e-G-0 M-b-d-0  r9 
			mov r11,0FF0000000000FF00h   ; same mask as before (bytes 7 and 1), reloaded
			and r11, rbx                 ; I-0-0-0 0-0-K-0  r11
			add rdx, r11                 ; I-L-0-0 0-H-K-0  rdx
			mov r11, 0000FF00FF000000h   ; same mask as before (bytes 5 and 3), reloaded
			and r11, rax                 ; 0-0-a-0 f-0-0-0  r11
			add rdx, r11                 ; I-L-a-0 f-H-K-0  rdx
			mov r11, 000000FF000000FFh   ; mask keeping bytes 4 and 0 (the bytes that stay in place)
			rol rax, 32                  ; f-e-d-2 c-b-a-1  rax
			and rax, r11                 ; 0-0-0-2 0-0-0-1  rax
			mov r11, 000000FF000000FFh   ; redundant reload: r11 still holds this value
			rol rbx, 32                  ; M-L-K-4 I-H-G-3  rbx
			add rax, rdx                 ; I-L-a-2 f-H-K-1  rax (final low half)
			and rbx, r11                 ; 0-0-0-4 0-0-0-3  rbx
			add rbx, r9                  ; c-e-G-4 M-b-d-3  rbx (final high half)

                        mov qword ptr [r8], rax          ; return output
		        mov qword ptr [r8 + 8], rbx      ; return output

jj2007 · August 30, 2017, 08:39:04 PM

Looks OK, but you can't judge until you run the code in a testbed and try your luck with little modifications.
What's wrong with pshufb? It's not an "extension", it's just a SIMD instruction, and I doubt that you can find a CPU nowadays that cannot understand at least SSE2.

alikim · August 30, 2017, 08:44:44 PM

Thank you. There is nothing wrong with pshufb; I'm studying assembly as a hobby and am just curious how I would shuffle bytes without it.

jj2007 · August 30, 2017, 08:51:48 PM

Quote from: alikim on August 30, 2017, 08:44:44 PM: I'm studying assembly as a hobby

Welcome to the club :icon14:

From Masm32 Tips, Tricks and Traps

Quote- Last but not least: A few Rules for the Forum

* Use include \masm32\include\masm32rt.inc, not include C:\masm32\include\masm32rt.inc; many people have installed Masm32 on a different drive, and it's a real nuisance having to replace C: with D: to test a code snippet

* To allow others to test your code, do not use environment variables for your paths. Masm32 has a hard-coded path structure, for good reasons.

* Post your complete code. Some believe that older members are eager to construct the missing headers around your snippets, but this belief is FALSE

* Formulate precise questions, provide precise error messages and error lines. You will get quicker and better answers

* Have fun with Masm, and enjoy the friendly atmosphere at the Masm32 Forum!

;)

Siekmanski · August 30, 2017, 09:14:07 PM

alikim, welcome.

Only had a quick look, and spotted this:

Code Select

		        mov rax, qword ptr [rdx]         ; load state first 8 bytes
		        mov rbx, qword ptr [rdx + 8]     ; load state second 8 bytes		

                        ; shuffle 
			; f-e-d-2 c-b-a-1  rax
			; M-L-K-4 I-H-G-3  rbx
			;        v
			; I-L-a-2 f-H-K-1  rax
			; c-e-G-4 M-b-d-3  rbx

			mov r9,  00FF000000FF0000h 
			mov rdx, r9  
			and r9,  rax                 ; 0-e-0-0 0-b-0-0  r9
			rol rax, 32                  ; c-b-a-1 f-e-d-2  rax
			mov r11,0FF0000000000FF00h   ; 2nd mask, identical to the 4th mask below (keep a copy in one extra register and the 4th-mask load can be deleted)
			and rdx, rbx                 ; 0-L-0-0 0-H-0-0  rdx
			and r11, rax                 ; c-0-0-0 0-0-d-0  r11
			add r9,  r11                 ; c-e-0-0 0-b-d-0  r9
			mov r11, 0000FF00FF000000h   ; 3rd mask: load it into another register so the 2nd mask is preserved for reuse as the 4th mask
			rol rbx, 32                  ; I-H-G-3 M-L-K-4  rbx
			and r11, rbx                 ; 0-0-G-0 M-0-0-0  r11
			add r9,  r11                 ; c-e-G-0 M-b-d-0  r9 
			mov r11,0FF0000000000FF00h   ; <-- 4th mask (same value as the 2nd mask)
			and r11, rbx                 ; I-0-0-0 0-0-K-0  r11
			add rdx, r11                 ; I-L-0-0 0-H-K-0  rdx
			mov r11, 0000FF00FF000000h
			and r11, rax                 ; 0-0-a-0 f-0-0-0  r11
			add rdx, r11                 ; I-L-a-0 f-H-K-0  rdx
			mov r11, 000000FF000000FFh
			rol rax, 32                  ; f-e-d-2 c-b-a-1  rax
			and rax, r11                 ; 0-0-0-2 0-0-0-1  rax
			mov r11, 000000FF000000FFh   ; <-- delete this line, r11 already holds this mask
			rol rbx, 32                  ; M-L-K-4 I-H-G-3  rbx
			add rax, rdx
			and rbx, r11                 ; 0-0-0-4 0-0-0-3  rbx
			add rbx, r9

                        mov qword ptr [r8], rax          ; return output
		        mov qword ptr [r8 + 8], rbx      ; return output

Siekmanski · August 30, 2017, 10:14:02 PM

I'm not on a computer with masm installed, so it may not work ( can't test it )
Shuffled the order of the code a bit.

Code Select

		        mov rax, qword ptr [rdx]         ; load state first 8 bytes
		        mov rbx, qword ptr [rdx + 8]     ; load state second 8 bytes		

                        ; shuffle 
			; f-e-d-2 c-b-a-1  rax
			; M-L-K-4 I-H-G-3  rbx
			;        v
			; I-L-a-2 f-H-K-1  rax
			; c-e-G-4 M-b-d-3  rbx

			mov r9,  00FF000000FF0000h 
			mov rdx, r9  
			and r9,  rax                 ; 0-e-0-0 0-b-0-0  r9
			rol rax, 32                  ; c-b-a-1 f-e-d-2  rax
			mov r11,0FF0000000000FF00h
			mov r8,  r11				
			and rdx, rbx                 ; 0-L-0-0 0-H-0-0  rdx
			and r11, rax                 ; c-0-0-0 0-0-d-0  r11
			add r9,  r11                 ; c-e-0-0 0-b-d-0  r9
			mov r11, 0000FF00FF000000h
			rol rbx, 32                  ; I-H-G-3 M-L-K-4  rbx
			and r11, rbx                 ; 0-0-G-0 M-0-0-0  r11
			and r8,  rbx                 ; I-0-0-0 0-0-K-0  r11
			add r9,  r11                 ; c-e-G-0 M-b-d-0  r9 
			add rdx, r8                  ; I-L-0-0 0-H-K-0  rdx
			mov r11, 0000FF00FF000000h
			and r11, rax                 ; 0-0-a-0 f-0-0-0  r11
			add rdx, r11                 ; I-L-a-0 f-H-K-0  rdx
			rol rax, 32                  ; f-e-d-2 c-b-a-1  rax
			rol rbx, 32                  ; M-L-K-4 I-H-G-3  rbx
			and rax, r11                 ; 0-0-0-2 0-0-0-1  rax
			and rbx, 000000FF000000FFh   ; 0-0-0-4 0-0-0-3  rbx
			add rax, rdx
			add rbx, r9

                        mov qword ptr [r8], rax          ; return output
		        mov qword ptr [r8 + 8], rbx      ; return output

alikim · August 31, 2017, 12:36:00 AM

Thank you for your suggestions!
Using an extra register gave it a significant boost ( I tried code shuffling - that did not do anything).
I only use masm inside VS shell, so the code below is probably not for stand-alone masm. I also added MMX version.

Code Select

    _DATA SEGMENT
		ALIGN 16
		msk db 0,13,10,7, 4,1,14,11, 8,5,2,15, 12,9,6,3
    _DATA ENDS
    _TEXT SEGMENT

    PUBLIC avx_decode                    ; void avx_decode(void *key, void *message, void *output);
    avx_decode PROC                      ; RCX, RDX, R8 - params
	                                 ; non volatile RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15

		push rbp                         ; save frame pointer
		mov rbp, rsp                     ; fix stack pointer
		sub rsp, 8 * (4 + 2)             ; allocate shadow register area + 2 QWORDs for stack alignment
		
		movdqu xmm0, xmmword ptr [rdx]         ; load message
		movdqu xmm2, xmmword ptr msk           ; load mask

										 ; 1-A-B-C  2-D-E-F
										 ; 3-G-H-I  4-K-L-M
										 ;         v
										 ; 1-K-H-F  2-A-L-I
										 ; 3-D-B-M  4-G-E-C
		pshufb xmm0, xmm2

		movdqu xmmword ptr [r8], xmm0    ; return output

		emms
                   
		mov rsp, rbp                     ; epilog. restore stack pointer
		pop rbp
		ret
    avx_decode ENDP


    PUBLIC asm_decode                    ; void asm_decode(void *key, void *message, void *output);
    asm_decode PROC                      ; RCX, RDX, R8 - params
	                                 ; non volatile RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15

		push rbp                         ; save frame pointer
		mov rbp, rsp                     ; fix stack pointer
		sub rsp, 8 * (4 + 2)             ; allocate shadow register area + 2 QWORDs for stack alignment

		push rbx
		push r12

		mov rax, qword ptr [rdx]         ; load state first 8 bytes
		mov rbx, qword ptr [rdx + 8]     ; load state second 8 bytes

										 ; f-e-d-2 c-b-a-1  rax
										 ; M-L-K-4 I-H-G-3  rbx
										 ;        v
										 ; I-L-a-2 f-H-K-1  rax
										 ; c-e-G-4 M-b-d-3  rbx
			mov r9,  00FF000000FF0000h 
			mov rdx, r9  
			and r9,  rax                 ; 0-e-0-0 0-b-0-0  r9
			rol rax, 32                  ; c-b-a-1 f-e-d-2  rax
			mov r11,0FF0000000000FF00h
			mov r12, r11   
			and rdx, rbx                 ; 0-L-0-0 0-H-0-0  rdx
			and r11, rax                 ; c-0-0-0 0-0-d-0  r11
			add r9,  r11                 ; c-e-0-0 0-b-d-0  r9
			mov r11, 0000FF00FF000000h
			rol rbx, 32                  ; I-H-G-3 M-L-K-4  rbx
			and r11, rbx                 ; 0-0-G-0 M-0-0-0  r11
			and r12, rbx                 ; I-0-0-0 0-0-K-0  r12
			add r9,  r11                 ; c-e-G-0 M-b-d-0  r9 
			add rdx, r12                 ; I-L-0-0 0-H-K-0  rdx
			mov r11, 0000FF00FF000000h
			and r11, rax                 ; 0-0-a-0 f-0-0-0  r11
			add rdx, r11                 ; I-L-a-0 f-H-K-0  rdx
			mov r11, 000000FF000000FFh
			rol rax, 32                  ; f-e-d-2 c-b-a-1  rax
			rol rbx, 32                  ; M-L-K-4 I-H-G-3  rbx
			and rax, r11                 ; 0-0-0-2 0-0-0-1  rax
			add rax, rdx
			and rbx, r11                 ; 0-0-0-4 0-0-0-3  rbx
			add rbx, r9

		mov qword ptr [r8], rax          ; return output
		mov qword ptr [r8 + 8], rbx      ; return output

		pop r12
		pop rbx
                   
		mov rsp, rbp                     ; epilog. restore stack pointer
		pop rbp
		ret
    asm_decode ENDP
    _TEXT ENDS
    END

aw27 · August 31, 2017, 12:46:46 AM

This is a solution with output:

Code Select


 
option casemap :None 

includelib \masm32\lib64\kernel32.lib
ExitProcess proto :dword
includelib \masm32\lib64\msvcrt.lib
printf proto :ptr, :vararg

.data
data1 db "fed2cba1",0
data2 db "MLK4IHG3",0
data3 db 9 dup (0)
data4 db 9 dup (0)
format0 db "Initial: Value1-> %s value2-> %s",13,10,0
format1 db "Final: Value1-> %s value2-> %s",13,10,0

.code

OPTION PROLOGUE:NONE 
OPTION EPILOGUE:NONE 

; calc - byte shuffle of two qwords using general-purpose registers only.
; In:    rcx = first qword, rdx = second qword
; Out:   r8  = shuffled first qword, rdx = shuffled second qword
;        (register results, not a standard C return value)
; Clobb: rax, rcx, r10, r11, flags; r9 holds a copy of the second input
; Notation in the comments below: symbols are listed in memory (string)
; order, i.e. leftmost = lowest address = least significant byte of the
; register. With the test data used by main: first = f-e-d-2 c-b-a-1,
; second = M-L-K-4 I-H-G-3.
calc proc 
        sub rsp, 8 ; no calls are made below, so this adjustment is not required
	mov r8, rcx ; save first
	mov r9, rdx ; save second
	
	; Resolve 2nd
	mov rax, 0FF000000FF00h 
	and rcx, rax ; rcx=0-e-0-0 0-b-0-0
	mov r10, r8
	rol r10, 32  ; c-b-a-1 f-e-d-2 
	mov rax, 0FF0000000000FFh
	and r10, rax ; r10 = c-0-0-0 0-0-d-0
	mov r11, r9
	mov rax, 0FF000000FF000000h
	and r11, rax ; r11 = 0-0-0-4 0-0-0-3 (taken from the second input)
	rol rdx, 32
	mov rax, 0FF00FF0000h
	and rdx, rax   ; rdx=0-0-G-0 M-0-0-0
	or rdx, rcx
	or rdx, r10
	or rdx, r11 ; rdx=c-e-G-4 M-b-d-3
	
	; Resolve 1st
	mov rcx, r8
	rol rcx, 32 
	mov rax, 0FF00FF0000h
	and rcx, rax ; rcx = 0-0-a-0 f-0-0-0
	mov r10, r9
	mov rax, 0FF000000FF00h
	and r10, rax ; r10 = 0-L-0-0 0-H-0-0
	mov r11, r9
	rol r11, 32
	mov rax, 0FF0000000000FFh
	and r11, rax ; r11 = I-0-0-0 0-0-K-0
	mov rax, 0FF000000FF000000h
	and r8, rax ; r8=0-0-0-2 0-0-0-1 (taken from the first input)
	or r8, rcx
	or r8, r10
	or r8, r11 ; r8=I-L-a-2 f-H-K-1
	add rsp, 8
	ret
calc endp

; main - demo driver: shuffles data1/data2 with calc, prints the values
; before and after, then exits the process with code 0.
; ABI: Microsoft x64. On entry rsp % 16 == 8; the 28h bytes reserved below
; give 32 bytes of shadow space plus 8 bytes that realign rsp to 16. The
; frame is kept until ExitProcess (which does not return) so that call is
; also made with an aligned stack and valid shadow space.
main proc
	sub rsp, 28h            ; shadow space (20h) + 8 for 16-byte alignment
	
	mov rcx, qword ptr [data1]
	mov rdx, qword ptr [data2]
	
	call calc               ; results come back in r8 (first) and rdx (second)
	
	mov qword ptr [data3], r8
	mov qword ptr [data4], rdx

	lea rcx, format0        ; RIP-relative address loads
	lea rdx, data1
	lea r8, data2
	call printf ; print initial value
	
	lea rcx, format1
	lea rdx, data3
	lea r8, data4
	call printf ; print final value
	
	xor ecx, ecx            ; exit code 0
	call ExitProcess
main endp	

end

aw27 · August 31, 2017, 01:17:29 AM

@alikim
Ref your last code:
1) It is not MMX, it is SSE (pshufb on XMM registers is an SSSE3 instruction)
2) You don't need EMMS with SSE
3) You don't need this here:
sub rsp, 8 * (4 + 2)
because it is a leaf function and the stack is already aligned by the push rbp you did earlier. If you had not done push rbp, which is not needed here because you don't have local variables, you should have done only sub rsp, 8.

These are just a few cents. BTW, I have not run your code, so there may be other problems.

The MASM Forum

News:

Byte shuffling on GPR

alikim

jj2007

alikim

jj2007

Siekmanski

Siekmanski

alikim

aw27

aw27