Print Page - Tiny RC4

Title: Tiny RC4
Post by: lorenzo on February 05, 2015, 11:31:11 PM

Although RC4 is unsuitable for most applications that require robust security, I like its simplicity and thought it might be interesting to see how much this could be optimized. I've managed 77 bytes but Peter Ferrie implemented version with 69 bytes. Can you do better?

Peter's version -> http://pferrie.host22.com/misc/rc4.htm

Code Select


; structure used store x, y, s for state and buf for plaintext/ciphertext
RC4_CTX struct
    x   dword ?
    y   dword ?
    s   byte 256 dup (?)
    buf byte 256 dup (?)
RC4_CTX ends

.code

; -----------------------------------------------------------------
; RC4 in 77 bytes using fastcall convention
; ecx holds first parameter, edx second
;
; set the key          : rc4 (0, ctx)
; encrypt/decrypt data : rc4 (data_len, ctx)
; -----------------------------------------------------------------
rc4x proc fastcall len:dword, ctx:dword
    pushad
    mov   esi, edx
    lodsd                        ; load x
    xchg  eax, ebx
    lodsd                        ; load y
    xchg  eax, ebx 
    mov   edi, esi               ; edi = s
    cdq                          ; edx = 0
    jecxz init_sbox
    dec   dl                     ; 256
    add   edi, edx               ; edi = buf
; -----------------------------------------------------------------
; PRNG. edi=buf, eax=x, ebx=y, ecx=data_len 
; -----------------------------------------------------------------
crypt_data:
    inc   al                     ; x++
    mov   dl, [esi+eax]          ; dl = s[x]
    add   bl, dl                 ; y += dl
    xchg  dl, [esi+ebx]          ; swap s[y], s[x]
    mov   [esi+eax], dl          ; s[x] = s[y]
    add   dl, [esi+ebx]          ; dl = s[x] + s[y]
    mov   dl, [esi+edx]          ; dl = s[ dl ]
    inc   edi                    ; p++ 
    xor   byte ptr[edi], dl      ; *p ^= (s[ s[x] + s[y] ])   
    loop  crypt_data
    mov   [esi-8], eax           ; store x
    mov   [esi-4], ebx           ; store y
    popad
    ret
; -----------------------------------------------------------------
; KSA. edi=s, eax=0, ebx=j=0, ecx=i=0, edx=0
; -----------------------------------------------------------------
init_sbox:
    stosb                        ; s[i] = i
    inc   al                     ; i++
    jnz   init_sbox
; now edi points to buf with 128-bit key
init_key:
    mov   al, cl                 ; key_idx
    and   al, 15                 ; keys should be 128-bits
    mov   dl, [esi+ecx]          ; s[i]
    add   bl, dl                 ; j += s[i]
    add   bl, [edi+eax]          ; j += key[key_idx % 16]
    xchg  dl, [esi+ebx]          ; s[j] = s[i]
    mov   [esi+ecx], dl          ; s[i] = s[j]
    inc   cl                     ; i++
    jnz   init_key               ; 
    popad
    ret
rc4x endp

Title: Re: Tiny RC4
Post by: Gunther on February 06, 2015, 06:30:29 AM

Hi Lorenzo,

is that a kind of a challenge? I think Peter Ferrie's solution is well written and hard to beat. Well, it's only my impression, but please have a look into Peter's vita (http://www.microsoft.com/security/portal/mmpc/shared/team-peter.aspx).

Gunther

Title: Re: Tiny RC4
Post by: lorenzo on February 06, 2015, 06:39:24 AM

Not really challenge to anyone, just thought it would be interesting to see other attempts at it.
The version Peter wrote only works on one block of plaintext or ciphertext so it can be compacted to 69 bytes.
Were it capable of multiple calls, it would add more bytes.

The other thing I noticed was that the XOR of stream is missing so it's really 72 bytes if I'm not mistaken.
I imagine there might be way to reduce it further using combination of carry flag and ecx register but it's a nice solution alright.

Code Select


_compute_rc4:
	pushad
	mov	esi, [esp + 32 + 4]
	mov	ecx, [esi]
	add	esi, 4 + 16
	mov	edi, esi
	xor	eax, eax
start_fill:
	stosb
	inc	al
	jnz	start_fill
	xor	ebx, ebx
zero_edx:
	cdq
loop_start:
	pushfd
	add	dl, [ESI + ebx]
	popfd
	jb	skip_key
	mov	al, bl
	and	al, 0xF
	add	dl, [ESI + eax - 16]
	clc
skip_key:
	mov	al, [ESI + ebx]
	xchg	al, [ESI + edx] ; eax = K[i]
	mov	[ESI + ebx], al
	inc	bl
	jb	stream_start
	jnz	loop_start
	inc	ebx
	stc
	jb	zero_edx
stream_start:
	add	al, [ESI + edx] ; eax = K[i] + K[j]
	mov	al, [ESI + eax] ; eax = K[(K[i] + K[j]) % 256]
	stosb	; Output the byte.
	stc
	loop	loop_start
stream_end:
	popad
	ret

Title: Re: Tiny RC4
Post by: dedndave on February 06, 2015, 09:30:18 AM

speed better than size in this case :t

Title: Re: Tiny RC4
Post by: lorenzo on February 08, 2015, 09:12:07 PM

I agree. Peter's implementation was written for some security challenge (not sure what) where code had to be tiny as possible.

The MASM Forum

General => The Laboratory => Topic started by: lorenzo on February 05, 2015, 11:31:11 PM