Here are three examples that will show you why.
The first is the 64-bit beauty:
; xmemcpy (64-bit): copies `count` bytes from `src` to `dest`, one byte at a time.
; The body never references the named parameters; it works on rcx, rdx and r8
; directly, which matches the Microsoft x64 register assignment for the first
; three integer arguments (assumed to be the target ABI - confirm).
; Returns:  rax = the value rcx held on entry (the destination pointer).
; Modifies: rax (al), rcx, rdx, r8, r9, flags.
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
; remember the destination pointer; rcx is advanced by the loop below
mov r9,rcx
; skip the copy entirely when both pointers hold the same address
.if (rcx != rdx)
; .for (initializers ¦ condition ¦ increments):
;   - no initializer is needed, the registers are already loaded
;   - the loop runs while r8 (bytes remaining) is non-zero
;   - all of the work (load a byte, store it, advance both pointers,
;     count down) sits in the increment clause, so the loop body is empty
.for (¦r8¦al=[rdx],[rcx]=al,rcx++,rdx++,r8--)
.endfor
.endif
; return the original destination pointer
mov rax,r9
ret
xmemcpy ENDP
compiled to this:
000000014004DA78 push rbp
000000014004DA79 mov rbp,rsp
000000014004DA7C mov r9,rcx
000000014004DA7F cmp rcx,rdx
000000014004DA82 je xmemcpy+22h (14004DA9Ah)
000000014004DA84 jmp xmemcpy+1Bh (14004DA93h)
000000014004DA86 mov al,byte ptr [rdx]
000000014004DA88 mov byte ptr [rcx],al
000000014004DA8A inc rcx
000000014004DA8D inc rdx
000000014004DA90 dec r8
000000014004DA93 and r8,r8
000000014004DA96 je xmemcpy+22h (14004DA9Ah)
000000014004DA98 jmp xmemcpy+0Eh (14004DA86h)
000000014004DA9A mov rax,r9
000000014004DA9D leave
000000014004DA9E ret
The second is the same routine, but 32-bit:
; xmemcpy (32-bit): copies `count` bytes from `src` to `dest`, one byte at a time.
; In:       dest  = destination address
;           src   = source address
;           count = number of bytes to copy (0 copies nothing)
; Returns:  eax = dest
; Modifies: eax, ecx, edx, flags. ebx is used as the byte counter but is
;           saved and restored by the USES clause.
xmemcpy PROC USES ebx dest:DWORD,src:DWORD,count:DWORD
    mov ecx,dest                    ; ecx = running destination pointer
    ; skip the copy entirely when both pointers hold the same address
    .if (ecx != src)
        ; .for (initializers ¦ condition ¦ increments):
        ;   - edx = running source pointer, ebx = bytes remaining
        ;   - the loop runs while ebx is non-zero
        ;   - the copy itself lives in the increment clause, so the
        ;     loop body is empty
        .for (edx=src,ebx=count¦ebx¦al=[edx],[ecx]=al,ecx++,edx++,ebx--)
        .endfor
    .endif
    mov eax,dest                    ; reload dest: ecx was advanced by the loop
    ret
xmemcpy ENDP
compiled to this:
00401020 55 push ebp
00401021 8bec mov ebp,esp
00401023 53 push ebx
00401024 8b4d08 mov ecx,dword ptr [ebp+8]
00401027 3b4d0c cmp ecx,dword ptr [ebp+0Ch]
0040102a 7415 je xmemcpy+0x21 (00401041)
0040102c 8b550c mov edx,dword ptr [ebp+0Ch]
0040102f 8b5d10 mov ebx,dword ptr [ebp+10h]
00401032 eb07 jmp xmemcpy+0x1b (0040103b)
00401034 8a02 mov al,byte ptr [edx]
00401036 8801 mov byte ptr [ecx],al
00401038 41 inc ecx
00401039 42 inc edx
0040103a 4b dec ebx
0040103b 23db and ebx,ebx
0040103d 7402 je xmemcpy+0x21 (00401041)
0040103f ebf3 jmp xmemcpy+0x14 (00401034)
00401041 8b4508 mov eax,dword ptr [ebp+8]
00401044 5b pop ebx
00401045 5d pop ebp
00401046 c20c00 ret 0Ch
and here is the C version:
/*
 * xmemcpy - copy `count` bytes from `src` to `dest`, one byte at a time.
 *
 * dest:  destination buffer, at least `count` bytes long.
 * src:   source buffer, at least `count` bytes long.
 * count: number of bytes to copy. Zero or negative values copy nothing;
 *        without the `> 0` guard a negative count would enter the loop and
 *        run far past both buffers before the counter ever reached zero.
 *
 * Returns `dest`. Bytes are copied in ascending address order, so overlapping
 * buffers get memcpy-style (not memmove-style) results.
 */
void* xmemcpy(void *dest, const void *src, int count)
{
    unsigned char *byte_dest = (unsigned char *)dest;
    /* keep the const qualifier: the source is only ever read */
    const unsigned char *byte_src = (const unsigned char *)src;

    /* same address: nothing to do */
    if (byte_dest != byte_src)
    {
        if (count > 0)
        {
            for (;;)
            {
                *byte_dest = *byte_src;
                /* stop before advancing so the pointers never step past the last byte */
                if (!--count) break;
                ++byte_dest;
                ++byte_src;
            }
        }
    }
    return dest;
}
compiled to this:
01271AF0 push ebp
01271AF1 mov ebp,esp
01271AF3 sub esp,0D8h
01271AF9 push ebx
01271AFA push esi
01271AFB push edi
01271AFC lea edi,[ebp-0D8h]
01271B02 mov ecx,36h
01271B07 mov eax,0CCCCCCCCh
01271B0C rep stos dword ptr es:[edi]
01271B0E mov eax,dword ptr [dest]
01271B11 mov dword ptr [byte_dest],eax
01271B14 mov eax,dword ptr [src]
01271B17 mov dword ptr [byte_src],eax
01271B1A mov eax,dword ptr [byte_dest]
01271B1D cmp eax,dword ptr [byte_src]
01271B20 je xmemcpy+63h (01271B53h)
01271B22 cmp dword ptr [count],0
01271B26 je xmemcpy+63h (01271B53h)
01271B28 mov eax,dword ptr [byte_dest]
01271B2B mov ecx,dword ptr [byte_src]
01271B2E mov dl,byte ptr [ecx]
01271B30 mov byte ptr [eax],dl
01271B32 mov eax,dword ptr [count]
01271B35 sub eax,1
01271B38 mov dword ptr [count],eax
01271B3B jne xmemcpy+4Fh (01271B3Fh)
01271B3D jmp xmemcpy+63h (01271B53h)
01271B3F mov eax,dword ptr [byte_dest]
01271B42 add eax,1
01271B45 mov dword ptr [byte_dest],eax
01271B48 mov eax,dword ptr [byte_src]
01271B4B add eax,1
01271B4E mov dword ptr [byte_src],eax
01271B51 jmp xmemcpy+38h (01271B28h)
01271B53 mov eax,dword ptr [dest]
01271B56 pop edi
01271B57 pop esi
01271B58 pop ebx
01271B59 mov esp,ebp
01271B5B pop ebp
01271B5C ret
Do you need more reasons? ;)