News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests

Main Menu

Tiny RC4

Started by lorenzo, February 05, 2015, 11:31:11 PM

Previous topic - Next topic

lorenzo

Although RC4 is unsuitable for most applications that require robust security, I like its simplicity and thought it might be interesting to see how much this could be optimized. I've managed 77 bytes but Peter Ferrie implemented version with 69 bytes. Can you do better?

Peter's version -> http://pferrie.host22.com/misc/rc4.htm


; structure used store x, y, s for state and buf for plaintext/ciphertext
RC4_CTX struct
    x   dword ?
    y   dword ?
    s   byte 256 dup (?)
    buf byte 256 dup (?)
RC4_CTX ends

.code

; -----------------------------------------------------------------
; RC4 in 77 bytes using fastcall convention
; ecx holds first parameter, edx second
;
; set the key          : rc4 (0, ctx)
; encrypt/decrypt data : rc4 (data_len, ctx)
; -----------------------------------------------------------------
rc4x proc fastcall len:dword, ctx:dword
    pushad
    mov   esi, edx
    lodsd                        ; load x
    xchg  eax, ebx
    lodsd                        ; load y
    xchg  eax, ebx
    mov   edi, esi               ; edi = s
    cdq                          ; edx = 0
    jecxz init_sbox
    dec   dl                     ; 256
    add   edi, edx               ; edi = buf
; -----------------------------------------------------------------
; PRNG. edi=buf, eax=x, ebx=y, ecx=data_len
; -----------------------------------------------------------------
crypt_data:
    inc   al                     ; x++
    mov   dl, [esi+eax]          ; dl = s[x]
    add   bl, dl                 ; y += dl
    xchg  dl, [esi+ebx]          ; swap s[y], s[x]
    mov   [esi+eax], dl          ; s[x] = s[y]
    add   dl, [esi+ebx]          ; dl = s[x] + s[y]
    mov   dl, [esi+edx]          ; dl = s[ dl ]
    inc   edi                    ; p++
    xor   byte ptr[edi], dl      ; *p ^= (s[ s[x] + s[y] ])   
    loop  crypt_data
    mov   [esi-8], eax           ; store x
    mov   [esi-4], ebx           ; store y
    popad
    ret
; -----------------------------------------------------------------
; KSA. edi=s, eax=0, ebx=j=0, ecx=i=0, edx=0
; -----------------------------------------------------------------
init_sbox:
    stosb                        ; s[i] = i
    inc   al                     ; i++
    jnz   init_sbox
; now edi points to buf with 128-bit key
init_key:
    mov   al, cl                 ; key_idx
    and   al, 15                 ; keys should be 128-bits
    mov   dl, [esi+ecx]          ; s[i]
    add   bl, dl                 ; j += s[i]
    add   bl, [edi+eax]          ; j += key[key_idx % 16]
    xchg  dl, [esi+ebx]          ; s[j] = s[i]
    mov   [esi+ecx], dl          ; s[i] = s[j]
    inc   cl                     ; i++
    jnz   init_key               ;
    popad
    ret
rc4x endp

Gunther

Hi Lorenzo,

is that a kind of a challenge? I think Peter Ferrie's solution is well written and hard to beat. Well, it's only my impression, but please have a look into Peter's vita.

Gunther
You have to know the facts before you can distort them.

lorenzo

Not really challenge to anyone, just thought it would be interesting to see other attempts at it.
The version Peter wrote only works on one block of plaintext or ciphertext so it can be compacted to 69 bytes.
Were it capable of multiple calls, it would add more bytes.

The other thing I noticed was that the XOR of stream is missing so it's really 72 bytes if I'm not mistaken.
I imagine there might be way to reduce it further using combination of carry flag and ecx register but it's a nice solution alright.


_compute_rc4:
pushad
mov esi, [esp + 32 + 4]
mov ecx, [esi]
add esi, 4 + 16
mov edi, esi
xor eax, eax
start_fill:
stosb
inc al
jnz start_fill
xor ebx, ebx
zero_edx:
cdq
loop_start:
pushfd
add dl, [ESI + ebx]
popfd
jb skip_key
mov al, bl
and al, 0xF
add dl, [ESI + eax - 16]
clc
skip_key:
mov al, [ESI + ebx]
xchg al, [ESI + edx] ; eax = K[i]
mov [ESI + ebx], al
inc bl
jb stream_start
jnz loop_start
inc ebx
stc
jb zero_edx
stream_start:
add al, [ESI + edx] ; eax = K[i] + K[j]
mov al, [ESI + eax] ; eax = K[(K[i] + K[j]) % 256]
stosb ; Output the byte.
stc
loop loop_start
stream_end:
popad
ret

dedndave

speed better than size in this case   :t

lorenzo

I agree. Peter's implementation was written for some security challenge (not sure what) where code had to be tiny as possible.