Thank you for your suggestions!
Using an extra register gave it a significant boost (I also tried shuffling the code around, but that did not do anything).
I only use MASM inside the VS shell, so the code below will probably not assemble as-is with stand-alone MASM. I also added an MMX-style SIMD version (it actually uses the SSSE3 `pshufb` instruction on XMM registers).
_DATA SEGMENT
ALIGN 16
; Byte-shuffle control table used as the pshufb mask in avx_decode:
; output byte i is taken from message byte msk[i].
; One row per output dword; every index is < 16, so no output byte is zeroed.
msk     db  0, 13, 10,  7       ; output bytes  0..3
        db  4,  1, 14, 11       ; output bytes  4..7
        db  8,  5,  2, 15       ; output bytes  8..11
        db 12,  9,  6,  3       ; output bytes 12..15
_DATA ENDS
_TEXT SEGMENT
PUBLIC avx_decode
;-----------------------------------------------------------------------
; void avx_decode(void *key, void *message, void *output);
; ABI:   Microsoft x64
; In:    rcx = key      (never read)
;        rdx = message  (16 bytes are read)
;        r8  = output   (16 bytes are written)
; Out:   output[i] = message[msk[i]] for i = 0..15
; Clobb: xmm0, xmm1 (both volatile in this ABI)
; Stack: none. This is a true leaf function: it makes no calls, does not
;        move rsp and touches no non-volatile register, so it needs no
;        frame, no shadow space and no unwind data.
; Note:  despite the name, the only non-baseline instruction is pshufb
;        (SSSE3). No MMX register is used, so no emms is required.
;-----------------------------------------------------------------------
avx_decode PROC
        movdqu  xmm0, xmmword ptr [rdx]     ; xmm0 = 16 message bytes (no alignment assumed)
        movdqu  xmm1, xmmword ptr msk       ; xmm1 = shuffle control table
        ; Bytes in memory order, before and after the shuffle:
        ;   1-A-B-C 2-D-E-F
        ;   3-G-H-I 4-K-L-M
        ;          v
        ;   1-K-H-F 2-A-L-I
        ;   3-D-B-M 4-G-E-C
        pshufb  xmm0, xmm1                  ; xmm0[i] = xmm0[msk[i]]
        movdqu  xmmword ptr [r8], xmm0      ; store the permuted block to output
        ret
avx_decode ENDP
PUBLIC asm_decode
;-----------------------------------------------------------------------
; void asm_decode(void *key, void *message, void *output);
; ABI:   Microsoft x64
; In:    rcx = key      (never read; rcx is reused as scratch)
;        rdx = message  (16 bytes are read)
;        r8  = output   (16 bytes are written)
; Out:   the 16 message bytes, permuted as shown in the diagram below,
;        using general-purpose registers only.
; Clobb: rax, rcx, rdx, r9, r10, r11, flags (all volatile in this ABI)
; Stack: none. True leaf function: no calls, rsp is not moved and no
;        non-volatile register is touched, so no frame, shadow space or
;        unwind data is needed.
; Note:  both message qwords are loaded before anything is stored, so
;        output may be the same buffer as message.
;
; Register roles:
;   rax = message bytes 0..7,  later the finished low output qword
;   rcx = message bytes 8..15, later the finished high output qword
;   rdx = accumulator for the low output qword  (bytes 1,2,3,5,6,7)
;   r9  = accumulator for the high output qword (bytes 1,2,3,5,6,7)
;   r10 = second copy of the current lane mask
;   r11 = lane mask / extracted lanes
;
; Diagrams list bytes from most significant (left) to least significant.
;-----------------------------------------------------------------------
asm_decode PROC
        mov     rax, qword ptr [rdx]        ; load state first 8 bytes
        mov     rcx, qword ptr [rdx + 8]    ; load state second 8 bytes
        ;   f-e-d-2 c-b-a-1   rax
        ;   M-L-K-4 I-H-G-3   rcx
        ;          v
        ;   I-L-a-2 f-H-K-1   rax
        ;   c-e-G-4 M-b-d-3   rcx

        ; --- byte lanes 6 and 2 stay in place but swap qwords ---
        mov     r9, 00FF000000FF0000h       ; mask: byte lanes 6 and 2
        mov     rdx, r9                     ; message pointer no longer needed
        and     r9, rax                     ; 0-e-0-0 0-b-0-0   r9
        rol     rax, 32                     ; c-b-a-1 f-e-d-2   rax (halves swapped)

        ; --- byte lanes 7 and 1, taken from the half-swapped inputs ---
        mov     r11, 0FF0000000000FF00h     ; mask: byte lanes 7 and 1
        mov     r10, r11
        and     rdx, rcx                    ; 0-L-0-0 0-H-0-0   rdx
        and     r11, rax                    ; c-0-0-0 0-0-d-0   r11
        or      r9, r11                     ; c-e-0-0 0-b-d-0   r9  (lanes are disjoint)

        ; --- byte lanes 5 and 3, taken from the half-swapped inputs ---
        mov     r11, 0000FF00FF000000h      ; mask: byte lanes 5 and 3
        rol     rcx, 32                     ; I-H-G-3 M-L-K-4   rcx (halves swapped)
        and     r11, rcx                    ; 0-0-G-0 M-0-0-0   r11
        and     r10, rcx                    ; I-0-0-0 0-0-K-0   r10
        or      r9, r11                     ; c-e-G-0 M-b-d-0   r9
        or      rdx, r10                    ; I-L-0-0 0-H-K-0   rdx
        mov     r11, 0000FF00FF000000h      ; same mask again (r11 was consumed above)
        and     r11, rax                    ; 0-0-a-0 f-0-0-0   r11
        or      rdx, r11                    ; I-L-a-0 f-H-K-0   rdx

        ; --- byte lanes 4 and 0 never move: restore the halves and merge ---
        mov     r11, 000000FF000000FFh      ; mask: byte lanes 4 and 0
        rol     rax, 32                     ; f-e-d-2 c-b-a-1   rax
        rol     rcx, 32                     ; M-L-K-4 I-H-G-3   rcx
        and     rax, r11                    ; 0-0-0-2 0-0-0-1   rax
        or      rax, rdx                    ; I-L-a-2 f-H-K-1   rax
        and     rcx, r11                    ; 0-0-0-4 0-0-0-3   rcx
        or      rcx, r9                     ; c-e-G-4 M-b-d-3   rcx

        mov     qword ptr [r8], rax         ; store output bytes 0..7
        mov     qword ptr [r8 + 8], rcx     ; store output bytes 8..15
        ret
asm_decode ENDP
_TEXT ENDS
END