Although RC4 is unsuitable for most applications that require robust security, I like its simplicity and thought it might be interesting to see how much this could be optimized. I've managed 77 bytes but Peter Ferrie implemented version with 69 bytes. Can you do better?
Peter's version -> http://pferrie.host22.com/misc/rc4.htm
; structure used store x, y, s for state and buf for plaintext/ciphertext
RC4_CTX struct
x dword ?
y dword ?
s byte 256 dup (?)
buf byte 256 dup (?)
RC4_CTX ends
.code
; -----------------------------------------------------------------
; RC4 in 77 bytes using fastcall convention
; ecx holds first parameter, edx second
;
; set the key : rc4 (0, ctx)
; encrypt/decrypt data : rc4 (data_len, ctx)
; -----------------------------------------------------------------
rc4x proc fastcall len:dword, ctx:dword
pushad
mov esi, edx
lodsd ; load x
xchg eax, ebx
lodsd ; load y
xchg eax, ebx
mov edi, esi ; edi = s
cdq ; edx = 0
jecxz init_sbox
dec dl ; 256
add edi, edx ; edi = buf
; -----------------------------------------------------------------
; PRNG. edi=buf, eax=x, ebx=y, ecx=data_len
; -----------------------------------------------------------------
crypt_data:
inc al ; x++
mov dl, [esi+eax] ; dl = s[x]
add bl, dl ; y += dl
xchg dl, [esi+ebx] ; swap s[y], s[x]
mov [esi+eax], dl ; s[x] = s[y]
add dl, [esi+ebx] ; dl = s[x] + s[y]
mov dl, [esi+edx] ; dl = s[ dl ]
inc edi ; p++
xor byte ptr[edi], dl ; *p ^= (s[ s[x] + s[y] ])
loop crypt_data
mov [esi-8], eax ; store x
mov [esi-4], ebx ; store y
popad
ret
; -----------------------------------------------------------------
; KSA. edi=s, eax=0, ebx=j=0, ecx=i=0, edx=0
; -----------------------------------------------------------------
init_sbox:
stosb ; s[i] = i
inc al ; i++
jnz init_sbox
; now edi points to buf with 128-bit key
init_key:
mov al, cl ; key_idx
and al, 15 ; keys should be 128-bits
mov dl, [esi+ecx] ; s[i]
add bl, dl ; j += s[i]
add bl, [edi+eax] ; j += key[key_idx % 16]
xchg dl, [esi+ebx] ; s[j] = s[i]
mov [esi+ecx], dl ; s[i] = s[j]
inc cl ; i++
jnz init_key ;
popad
ret
rc4x endp
Hi Lorenzo,
is that a kind of a challenge? I think Peter Ferrie's solution is well written and hard to beat. Well, it's only my impression, but please have a look into Peter's vita (http://www.microsoft.com/security/portal/mmpc/shared/team-peter.aspx).
Gunther
Not really challenge to anyone, just thought it would be interesting to see other attempts at it.
The version Peter wrote only works on one block of plaintext or ciphertext so it can be compacted to 69 bytes.
Were it capable of multiple calls, it would add more bytes.
The other thing I noticed was that the XOR of stream is missing so it's really 72 bytes if I'm not mistaken.
I imagine there might be way to reduce it further using combination of carry flag and ecx register but it's a nice solution alright.
_compute_rc4:
pushad
mov esi, [esp + 32 + 4]
mov ecx, [esi]
add esi, 4 + 16
mov edi, esi
xor eax, eax
start_fill:
stosb
inc al
jnz start_fill
xor ebx, ebx
zero_edx:
cdq
loop_start:
pushfd
add dl, [ESI + ebx]
popfd
jb skip_key
mov al, bl
and al, 0xF
add dl, [ESI + eax - 16]
clc
skip_key:
mov al, [ESI + ebx]
xchg al, [ESI + edx] ; eax = K[i]
mov [ESI + ebx], al
inc bl
jb stream_start
jnz loop_start
inc ebx
stc
jb zero_edx
stream_start:
add al, [ESI + edx] ; eax = K[i] + K[j]
mov al, [ESI + eax] ; eax = K[(K[i] + K[j]) % 256]
stosb ; Output the byte.
stc
loop loop_start
stream_end:
popad
ret
speed better than size in this case :t
I agree. Peter's implementation was written for some security challenge (not sure what) where code had to be tiny as possible.