Hello
First, sorry for the AT&T syntax.
The code is copied directly from my editor. It works well, but it is not yet packaged as a function, so some things have to change before it can be assembled and called.
It is one round of encryption; to complete the cipher you would run it in a loop (8 more times), and the last part has to be modified to use only SubBytes, ShiftRows and AddRoundKey.
I hope that anyone who copies it credits me as the original author.
# ---------------------------------------------------------------------
# Constant tables for the AES-128 prototype (GAS, AT&T syntax, x86-64).
# sq0_..xrb_ are used as direct memory operands of pand/pxor/pcmpeqb;
# legacy SSE memory operands must be 16-byte aligned, so the table area
# is aligned explicitly instead of relying on where .data happens to
# be placed by the linker.
# ---------------------------------------------------------------------
.section .data
.balign 16
# Forward S-box, 256 one-byte entries. Each .quad packs eight
# consecutive entries little-endian (entry 0 = 0x63 is the lowest byte
# of the first quad).
sbx_: .quad 0xc56f6bf27b777c63,0x76abd7fe2b670130
.quad 0xf04759fa7dc982ca,0xc072a49cafa2d4ad
.quad 0xccf73f362693fdb7,0x1531d871f1e5a534
.quad 0x9a059618c323c704,0x75b227ebe2801207
.quad 0xa05a6e1b1a2c8309,0x842fe329b3d63b52
.quad 0x5bb1fc20ed00d153,0xcf584c4a39becb6a
.quad 0x85334d43fbaaefd0,0xa89f3c507f02f945
.quad 0xf5389d928f40a351,0xd2f3ff1021dab6bc
.quad 0x1744975fec130ccd,0x73195d643d7ea7c4
.quad 0x88902a22dc4f8160,0xdb0b5ede14b8ee46
.quad 0x5c2406490a3a32e0,0x79e4959162acd3c2
.quad 0xa94ed58d6d37c8e7,0x08ae7a65eaf4566c
.quad 0xc6b4a61c2e2578ba,0x8a8bbd4b1f74dde8
.quad 0x0ef6034866b53e70,0x9e1dc186b9573561
.quad 0x948ed9691198f8e1,0xdf2855cee9871e9b
.quad 0x6842e6bf0d89a18c,0x16bb54b00f2d9941
# Byte-lane masks used by _Shiftows: sqN_ keeps byte N of every dword.
sq0_: .quad 0x000000ff000000ff,0x000000ff000000ff
sq1_: .quad 0x0000ff000000ff00,0x0000ff000000ff00
sq2_: .quad 0x00ff000000ff0000,0x00ff000000ff0000
sq3_: .quad 0xff000000ff000000,0xff000000ff000000
# Constants used by _Mixcolumns to double every byte in GF(2^8):
# bts_ = bit 7 of each byte, dlb_ = low 7 bits of each byte,
# xrb_ = 0x1b, XORed into the bytes whose bit 7 was set.
bts_: .quad 0x8080808080808080,0x8080808080808080
dlb_: .quad 0x7f7f7f7f7f7f7f7f,0x7f7f7f7f7f7f7f7f
xrb_: .quad 0x1b1b1b1b1b1b1b1b,0x1b1b1b1b1b1b1b1b
# Key-schedule round constants: ten 32-bit values (01,02,04,...,80,1b,36),
# one consumed per _KeyExpansion iteration.
rcn_: .quad 0x0000000200000001,0x0000000800000004
.quad 0x0000002000000010,0x0000008000000040
.quad 0x000000360000001b
# ---------------------------------------------------------------------
# AES-128 prototype: key expansion followed by the first cipher round.
# Syntax: GAS / AT&T, x86-64, SSE2.
#
# In:   %r11 = address of the 16-byte cipher key
#       %r10 = address of the 16-byte plaintext block
#       (non-standard register convention; see the commented-out movq
#       lines below)
# Out:  %xmm0 = state after AddRoundKey(key 0), SubBytes, ShiftRows,
#       MixColumns and AddRoundKey(key 1)
# Scratch memory, addressed BELOW %rsp (offsets are relative to %rsp
# after the eight pushes):
#       -176(%rsp) .. -1(%rsp)   round keys 0..10, 16 bytes each
#       -208(%rsp)               state before SubBytes
#       -240(%rsp)               state after SubBytes
# NOTE(review): this scratch area is never reserved and is larger than
# the 128-byte System V red zone. The pxor memory operands on it also
# require %rsp to be 16-byte aligned at this point.
# NOTE(review): the fragment never pops the saved registers and has no
# ret/exit; it is a prototype, not a complete program.
# NOTE(review): every movnti result is read back almost immediately, so
# the non-temporal hint is unlikely to help; plain mov would do the same.
# ---------------------------------------------------------------------
.section .text
.globl _start
_start:
    pushq %r11
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15
    pushq %r8
    pushq %r9
    pushq %rax
###movq $key_, %r11
###movq $inp_, %r10
    movq $rcn_, %r12                    # r12 -> next round constant
    # Pull the lookup tables into cache. prefetcht0 is the SSE form and
    # is available on both Intel and AMD; the bare "prefetch" mnemonic
    # is the AMD 3DNow! encoding. One prefetch covers one 64-byte line,
    # so the 256-byte S-box needs four.
    prefetcht0 sbx_
    prefetcht0 sbx_+64
    prefetcht0 sbx_+128
    prefetcht0 sbx_+192
    prefetcht0 sq0_
    prefetcht0 sq0_+64
    prefetcht0 sq0_+128
    movq $-160, %r13                    # r13 = offset of round key 1; counts up to 0
    movdqu (%r11), %xmm0
    movdqu %xmm0, -16(%rsp, %r13, 1)    # round key 0 = cipher key
    movl 12(%r11), %r11d
    movnti %r11d, (%rsp, %r13, 1)       # temp = last word of the previous round key
_KeyExpansion:
    rorl $8, (%rsp, %r13, 1)            # RotWord on the temp word
    movzbl (%rsp, %r13, 1), %r14d       # split temp into its four bytes
    movzbl 1(%rsp, %r13, 1), %r15d
    movzbl 2(%rsp, %r13, 1), %r8d
    movzbl 3(%rsp, %r13, 1), %r9d
    movzbl sbx_(, %r14, 1), %r14d       # SubWord: S-box lookup per byte
    movzbl sbx_(, %r15, 1), %r15d
    movzbl sbx_(, %r8, 1), %r8d
    movzbl sbx_(, %r9, 1), %r9d
    # Reassemble the word with four overlapping dword stores. Each store
    # also zeroes the three bytes after its target; bytes 4..6 of the
    # slot are rewritten by the stores further down.
    movnti %r14d, (%rsp, %r13, 1)
    movnti %r15d, 1(%rsp, %r13, 1)
    movnti %r8d, 2(%rsp, %r13, 1)
    movnti %r9d, 3(%rsp, %r13, 1)
    movl (%rsp, %r13, 1), %eax
    xorl (%r12), %eax                   # ^ round constant
    addq $4, %r12
    xorl -16(%rsp, %r13, 1), %eax       # w[0] = temp ^ previous key w[0]
    movnti %eax, (%rsp, %r13, 1)
    xorl -12(%rsp, %r13, 1), %eax       # w[1] = w[0] ^ previous key w[1]
    movnti %eax, 4(%rsp, %r13, 1)
    xorl -8(%rsp, %r13, 1), %eax        # w[2]
    movnti %eax, 8(%rsp, %r13, 1)
    xorl -4(%rsp, %r13, 1), %eax        # w[3]
    movnti %eax, 12(%rsp, %r13, 1)
    addq $16, %r13
    jz _Cipher                          # round key 10 done: do not seed past it,
                                        # that slot is (%rsp) = the saved %rax
    movnti %eax, (%rsp, %r13, 1)        # seed the next round's temp word (movnti keeps flags)
    jmp _KeyExpansion
_Cipher:
    movdqu (%r10), %xmm0
    pxor -176(%rsp), %xmm0              # AddRoundKey with round key 0
    movdqu %xmm0, -208(%rsp)
    movq $-16, %r13                     # r13 walks the 16 state bytes, 4 per pass
_SubBytes:
    movzbl -192(%rsp, %r13, 1), %r14d   # read from the state at -208(%rsp)
    movzbl -191(%rsp, %r13, 1), %r15d
    movzbl -190(%rsp, %r13, 1), %r8d
    movzbl -189(%rsp, %r13, 1), %r9d
    movzbl sbx_(, %r14, 1), %r14d
    movzbl sbx_(, %r15, 1), %r15d
    movzbl sbx_(, %r8, 1), %r8d
    movzbl sbx_(, %r9, 1), %r9d
    # Write to a separate buffer at -240(%rsp): the overlapping dword
    # stores would destroy not-yet-read bytes if done in place.
    movnti %r14d, -224(%rsp, %r13, 1)
    movnti %r15d, -223(%rsp, %r13, 1)
    movnti %r8d, -222(%rsp, %r13, 1)
    movnti %r9d, -221(%rsp, %r13, 1)
    addq $4, %r13
    jnz _SubBytes
_Shiftows:
    # Each dword is one state column, byte r of a dword is row r.
    # Row r of output column i comes from column (i + r) mod 4.
    movdqu -240(%rsp), %xmm0
    pshufd $0x39, %xmm0, %xmm1          # columns rotated by 1
    pshufd $0x4e, %xmm0, %xmm2          # columns rotated by 2
    pshufd $0x93, %xmm0, %xmm3          # columns rotated by 3
    pand sq1_, %xmm1                    # keep row 1
    pand sq2_, %xmm2                    # keep row 2
    pand sq3_, %xmm3                    # keep row 3
    pand sq0_, %xmm0                    # row 0 is not shifted
    pxor %xmm1, %xmm0
    pxor %xmm2, %xmm0
    pxor %xmm3, %xmm0
_Mixcolumns:
    movdqa %xmm0, %xmm1                 # xmm1..xmm4 = copies of the state a
    movdqa %xmm1, %xmm2
    movdqa %xmm2, %xmm3
    movdqa %xmm3, %xmm4
    pand bts_, %xmm0
    pcmpeqb bts_, %xmm0                 # xmm0 = 0xff where a byte has bit 7 set
    movdqa %xmm0, %xmm5                 # xmm5 = that mask
    pand %xmm0, %xmm1                   # xmm1 = bytes with bit 7 set
    pandn %xmm2, %xmm0                  # xmm0 = bytes with bit 7 clear
    pand dlb_, %xmm1                    # drop bit 7 so the shift cannot carry across bytes
    pslld $1, %xmm0
    pslld $1, %xmm1
    pxor xrb_, %xmm1                    # reduce: ^ 0x1b ...
    pand %xmm5, %xmm1                   # ... only where bit 7 was set
    pxor %xmm1, %xmm0                   # xmm0 = 2*a
    pxor %xmm0, %xmm2                   # xmm2 = 3*a
    movdqa %xmm2, %xmm6                 # xmm6 = 3*a
    psrld $8, %xmm2
    pxor %xmm2, %xmm0                   # rows 0..2: ^ 3*a[r+1]
    pslld $8, %xmm3
    pxor %xmm3, %xmm0                   # rows 1..3: ^ a[r-1]
    pslld $8, %xmm3
    pxor %xmm3, %xmm0                   # rows 2..3: ^ a[r-2]
    psrld $16, %xmm4
    pxor %xmm4, %xmm0                   # rows 0..1: ^ a[r+2]
    psrld $8, %xmm4
    pxor %xmm4, %xmm0                   # row 0: ^ a[3]
    pslld $24, %xmm6
    pxor %xmm6, %xmm0                   # row 3: ^ 3*a[0]
_AddRoundkey:
    pxor -160(%rsp), %xmm0              # round key 1 only; other rounds need other offsets
Thanks.
PS: I am not sure the prefetch instructions are placed properly. The intention is to pull the lookup tables into the L1 cache ahead of time to reduce cache misses, because SubBytes uses the tables heavily.
PS2: If someone knowledgeable in the subject read this, I would like to see your criticism.
Hi cpu2,
how is that to assemble? With gas, I guess. Could you provide the complete environment with assembler and linker switches, please?
Gunther
Hello Gunther.
Simple: I use as (GAS) to assemble, and ld as the linker.
No special options are required, so it is simple.
Terminal:
as aes.S -o aesOBJ.o
aesOBJ.o is object code, now link.
ld aesOBJ.o -o aes
aes is the binary, want to examine the code?
Regards.
hi, can you show example of how to call the code please?
i looked at code in editor and noticed no labels for _exk and _sxm
There's no ret opcode in there neither so it's not clear where each function for aes is (key setup, encrypt, decrypt)
Thank you cpu2. I'll try it tomorrow.
Gunther
Quote from: peter_asm on May 15, 2014, 02:38:48 PM
hi, can you show example of how to call the code please?
i looked at code in editor and noticed no labels for _exk and _sxm
There's no ret opcode in there neither so it's not clear where each function for aes is (key setup, encrypt, decrypt)
This example is key setup and encrypt. I'm about to finish InvMixcomns and you're ready.
My intention was to show a code not 100% functional, just wanted to know the opinion of people.
Key setup 260 OPS, Encrypt about 104 OPS, As they see? This better than code IPXE?
Sorry for the labels.
Quote from: Gunther on May 15, 2014, 07:30:25 PM
Thank you cpu2. I'll try it tomorrow.
Gunther
You're welcome.
Regards.
Quote from: cpu2 on May 16, 2014, 02:59:25 AM
This example is key setup and encrypt. I'm about to finish InvMixcomns and you're ready.
My intention was to show a code not 100% functional, just wanted to know the opinion of people.
Key setup 260 OPS, Encrypt about 104 OPS, As they see? This better than code IPXE?
Sorry for the labels.
No problem, I'm just unfamiliar with AES completely but would be interested in a tiny implementation to learn more.
I'd be interested in seeing how to use AES-NI too although intel has some examples in sdk, wondering if there are better ways to utilize the instructions to reserve space for example.
Hi cpu2,
I assume we're talking about a Unix application, didn't we?
Gunther
Quote from: peter_asm on May 16, 2014, 09:08:19 PM
Quote from: cpu2 on May 16, 2014, 02:59:25 AM
This example is key setup and encrypt. I'm about to finish InvMixcomns and you're ready.
My intention was to show a code not 100% functional, just wanted to know the opinion of people.
Key setup 260 OPS, Encrypt about 104 OPS, As they see? This better than code IPXE?
Sorry for the labels.
No problem, I'm just unfamiliar with AES completely but would be interested in a tiny implementation to learn more.
I'd be interested in seeing how to use AES-NI too although intel has some examples in sdk, wondering if there are better ways to utilize the instructions to reserve space for example.
You may create the application, but for now I have the code in "dirty". As I said earlier, was to know the opinion of you.
Yes, AES-NI is very fast, so my CPU not support AES-NI extension, and also I can not utilize because then I lose portability, and that is not right for my project.
If you need help, say so.
Quote from: Gunther on May 17, 2014, 02:27:46 AM
Hi cpu2,
I assume we're talking about a Unix application, didn't we?
Gunther
No, this code is platform-independent. It is true that it uses Unix (AT&T) syntax and was programmed under Unix; if the syntax has to change for Windows, I think whoever writes the application can rewrite it in the syntax they use.
Regards.
QuoteYes, AES-NI is very fast, so my CPU not support AES-NI extension, and also I can not utilize because then I lose portability, and that is not right for my project.
okay, i implemented basic 128-bit encryption/decryption in CBC mode just to grasp instructions.
I'll post in another thread so as it's slightly unrelated.
Converted your code to MASM syntax but the calling convention doesn't appear to be fastcall, correct?
Could you show parameters to code?
; ---------------------------------------------------------------------
; MASM/JWASM translation of the AES-128 prototype: key expansion plus
; the first cipher round.
;
; In:   r11 = address of the 16-byte key
;       r10 = address of the 16-byte plaintext
; Out:  xmm0 = state after the first round
; Scratch below rsp (relative to rsp after the eight pushes):
;       [rsp-0B0h] .. [rsp-1]   round keys 0..10
;       [rsp-0D0h]              state before SubBytes
;       [rsp-0F0h]              state after SubBytes
; Fixed relative to the first translation:
;   * the last word of each round key is now stored as the next round's
;     temp word (without it every round after the first is wrong), and
;     that store is skipped after round key 10 so [rsp] is not touched;
;   * SubBytes reads bytes +0..+3 (offsets -0C0h..-0BDh) and writes the
;     result to the separate buffer at [rsp-0F0h];
;   * ShiftRows takes its input from that buffer instead of the stale
;     pre-SubBytes state left in xmm0.
; NOTE(review): the scratch area is unreserved stack; the Microsoft x64
; ABI has no red zone. pxor with a memory operand also needs rsp to be
; 16-byte aligned here. There is no epilogue/ret.
; ---------------------------------------------------------------------
.code
_start:
    push r11
    push r12
    push r13
    push r14
    push r15
    push r8
    push r9
    push rax
    lea r12, qword ptr[rcn_]                ; r12 -> next round constant
    prefetch sbx_                           ; NOTE(review): 3DNow! encoding; prefetcht0 is the portable form
    prefetch sq0_
    mov r13, -160                           ; offset of round key 1; counts up to 0
    movdqu xmm0, xmmword ptr [r11]
    movdqu xmmword ptr [rsp+r13-10h], xmm0  ; round key 0 = cipher key
    mov r11d, [r11+0Ch]
    movnti dword ptr [rsp+r13], r11d        ; temp = last key word
_KeyExpansion:
    ror dword ptr [rsp+r13], 8              ; RotWord
    movzx r14, byte ptr [rsp+r13]
    movzx r15, byte ptr [rsp+r13+1]
    movzx r8, byte ptr [rsp+r13+2]
    movzx r9, byte ptr [rsp+r13+3]
    movzx r14, byte ptr sbx_[r14]           ; SubWord
    movzx r15, byte ptr sbx_[r15]
    movzx r8, byte ptr sbx_[r8]
    movzx r9, byte ptr sbx_[r9]
    movnti dword ptr [rsp+r13], r14d        ; overlapping dword stores rebuild the word
    movnti dword ptr [rsp+r13+1], r15d
    movnti dword ptr [rsp+r13+2], r8d
    movnti dword ptr [rsp+r13+3], r9d
    mov eax, [rsp+r13]
    xor eax, [r12]                          ; ^ round constant
    add r12, 4
    xor eax, [rsp+r13-10h]                  ; w[0]
    movnti dword ptr [rsp+r13], eax
    xor eax, [rsp+r13-0Ch]                  ; w[1]
    movnti dword ptr [rsp+r13+4], eax
    xor eax, [rsp+r13-8]                    ; w[2]
    movnti dword ptr [rsp+r13+8], eax
    xor eax, [rsp+r13-4]                    ; w[3]
    movnti dword ptr [rsp+r13+0Ch], eax
    add r13, 16
    jz _Cipher                              ; all ten round keys done
    movnti dword ptr [rsp+r13], eax         ; seed next round's temp word (flags untouched)
    jmp _KeyExpansion
_Cipher:
    movdqu xmm0, xmmword ptr [r10]
    pxor xmm0, xmmword ptr [rsp-0B0h]       ; AddRoundKey with round key 0
    movdqu xmmword ptr [rsp-0D0h], xmm0
    mov r13, -16                            ; walks the 16 state bytes, 4 per pass
_SubBytes:
    movzx r14, byte ptr [rsp+r13-0C0h]
    movzx r15, byte ptr [rsp+r13-0BFh]
    movzx r8, byte ptr [rsp+r13-0BEh]
    movzx r9, byte ptr [rsp+r13-0BDh]
    movzx r14, byte ptr sbx_[r14]
    movzx r15, byte ptr sbx_[r15]
    movzx r8, byte ptr sbx_[r8]
    movzx r9, byte ptr sbx_[r9]
    movnti dword ptr [rsp+r13-0E0h], r14d   ; result buffer starts at rsp-0F0h
    movnti dword ptr [rsp+r13-0DFh], r15d
    movnti dword ptr [rsp+r13-0DEh], r8d
    movnti dword ptr [rsp+r13-0DDh], r9d
    add r13, 4
    jnz short _SubBytes
_Shiftows:
    movdqu xmm0, xmmword ptr [rsp-0F0h]     ; substituted state
    pshufd xmm1, xmm0, 39h                  ; columns rotated by 1
    pshufd xmm2, xmm0, 4Eh                  ; columns rotated by 2
    pshufd xmm3, xmm0, 93h                  ; columns rotated by 3
    pand xmm1, xmmword ptr [sq1_]           ; keep row 1
    pand xmm2, xmmword ptr [sq2_]           ; keep row 2
    pand xmm3, xmmword ptr [sq3_]           ; keep row 3
    pand xmm0, xmmword ptr [sq0_]           ; row 0 is not shifted
    pxor xmm0, xmm1
    pxor xmm0, xmm2
    pxor xmm0, xmm3
_Mixcolumns:
    movdqa xmm1, xmm0                       ; xmm1..xmm4 = copies of the state a
    movdqa xmm2, xmm1
    movdqa xmm3, xmm2
    movdqa xmm4, xmm3
    pand xmm0, xmmword ptr [bts_]
    pcmpeqb xmm0, xmmword ptr [bts_]        ; mask of bytes with bit 7 set
    movdqa xmm5, xmm0
    pand xmm1, xmm0                         ; bytes with bit 7 set
    pandn xmm0, xmm2                        ; bytes with bit 7 clear
    pand xmm1, xmmword ptr [dlb_]           ; drop bit 7 before shifting
    pslld xmm0, 1
    pslld xmm1, 1
    pxor xmm1, xmmword ptr [xrb_]           ; ^ 0x1b ...
    pand xmm1, xmm5                         ; ... only where bit 7 was set
    pxor xmm0, xmm1                         ; xmm0 = 2*a
    pxor xmm2, xmm0                         ; xmm2 = 3*a
    movdqa xmm6, xmm2
    psrld xmm2, 8
    pxor xmm0, xmm2                         ; ^ 3*a[r+1]
    pslld xmm3, 8
    pxor xmm0, xmm3                         ; ^ a[r-1]
    pslld xmm3, 8
    pxor xmm0, xmm3                         ; ^ a[r-2]
    psrld xmm4, 16
    pxor xmm0, xmm4                         ; ^ a[r+2]
    psrld xmm4, 8
    pxor xmm0, xmm4                         ; row 0: ^ a[3]
    pslld xmm6, 24
    pxor xmm0, xmm6                         ; row 3: ^ 3*a[0]
_AddRoundkey:
    pxor xmm0, xmmword ptr [rsp-0A0h]       ; round key 1
; Constant tables for the MASM translation above.
.data
; Forward S-box, 256 bytes, packed four entries per dword (little-endian:
; entry 0 = 63h is the low byte of the first dword).
sbx_ dd 7B777C63h, 0C56F6BF2h, 2B670130h, 76ABD7FEh, 7DC982CAh
dd 0F04759FAh, 0AFA2D4ADh, 0C072A49Ch, 2693FDB7h, 0CCF73F36h
dd 0F1E5A534h, 1531D871h, 0C323C704h, 9A059618h, 0E2801207h
dd 75B227EBh, 1A2C8309h, 0A05A6E1Bh, 0B3D63B52h, 842FE329h
dd 0ED00D153h, 5BB1FC20h, 39BECB6Ah, 0CF584C4Ah, 0FBAAEFD0h
dd 85334D43h, 7F02F945h, 0A89F3C50h, 8F40A351h, 0F5389D92h
dd 21DAB6BCh, 0D2F3FF10h, 0EC130CCDh, 1744975Fh, 3D7EA7C4h
dd 73195D64h, 0DC4F8160h, 88902A22h, 14B8EE46h, 0DB0B5EDEh
dd 0A3A32E0h, 5C240649h, 62ACD3C2h, 79E49591h, 6D37C8E7h
dd 0A94ED58Dh, 0EAF4566Ch, 8AE7A65h, 2E2578BAh, 0C6B4A61Ch
dd 1F74DDE8h, 8A8BBD4Bh, 66B53E70h, 0EF60348h, 0B9573561h
dd 9E1DC186h, 1198F8E1h, 948ED969h, 0E9871E9Bh, 0DF2855CEh
dd 0D89A18Ch, 6842E6BFh, 0F2D9941h, 16BB54B0h
; Byte-lane masks for ShiftRows: sqN_ keeps byte N of every dword.
; NOTE(review): these are used as pand memory operands, which need
; 16-byte alignment; no align directive is present here -- confirm the
; segment alignment guarantees it.
sq0_ dq 0FF000000FFh
dq 0FF000000FFh
sq1_ dd 4 dup(0FF00h)
sq2_ dd 4 dup(0FF0000h)
sq3_ dd 4 dup(0FF000000h)
; MixColumns helpers: bit 7 of each byte, low 7 bits of each byte, and
; the 1Bh value XORed into doubled bytes whose bit 7 was set.
bts_ dd 4 dup(80808080h)
dlb_ dd 4 dup(7F7F7F7Fh)
xrb_ dd 4 dup(1B1B1B1Bh)
; Key-schedule round constants, one dword per round, padded with two zeros.
rcn_ dd 1, 2, 4, 8, 10h, 20h, 40h, 80h, 1Bh, 36h, 2 dup(0)
end
First:
I found an error in SubBytes and ShiftRows; both are now corrected, sorry.
Quote from: peter_asm on May 19, 2014, 12:44:24 AM
Converted your code to MASM syntax but the calling convention doesn't appear to be fastcall, correct?
Could you show parameters to code?
No, it is register-based like fastcall; it just does not follow the standard convention.
See the two mov instructions I commented out: %r11 holds the key and %r10 the plaintext.
That is where you need to pass the parameters; a length parameter is also still missing. Sorry for leaving it this way.
The decrypt part is still to come, and I will correct it.
Regards.
P.S: Great translation, the code is more clean.
; ---------------------------------------------------------------------
; aes_crypt -- AES-128 prototype: key expansion plus the first cipher
; round (MASM/JWASM syntax).
;
; In:   r11 = address of the 16-byte key
;       r10 = address of the 16-byte plaintext
;       (non-standard registers, not the Microsoft x64 argument order)
; Out:  xmm0 = state after the first round
; Saved/restored: r11-r15, r8, r9, rax.
; Clobbers: xmm0-xmm6, flags.
; Scratch below rsp (relative to rsp after the eight pushes):
;       [rsp-0B0h] .. [rsp-1]   round keys 0..10
;       [rsp-0D0h]              state before SubBytes
;       [rsp-0F0h]              state after SubBytes
; Fixed relative to the previous revision:
;   * each round now seeds the next round's temp word with the last
;     word it produced (round keys 2..10 were wrong without it); the
;     seed is skipped after round key 10 so the saved rax at [rsp] is
;     left intact;
;   * round keys are loaded with movdqu before the pxor, so the routine
;     does not fault when rsp is not 16-byte aligned (as in any called
;     function after an even number of pushes);
;   * an epilogue restores the pushed registers and returns instead of
;     running into the table bytes that follow.
; NOTE(review): xmm6 is callee-saved in the Microsoft x64 ABI and is
; not preserved here; the scratch area is unreserved stack and that ABI
; has no red zone.
; ---------------------------------------------------------------------
.code
public aes_crypt
aes_crypt:
    int 3                                   ; debugger breakpoint left in by the poster
    align 16
    push r11
    push r12
    push r13
    push r14
    push r15
    push r8
    push r9
    push rax
    lea r12, [rcn_]                         ; r12 -> next round constant
    prefetch [sbx_]                         ; NOTE(review): 3DNow! encoding; prefetcht0 is the portable form
    prefetch [sq0_]
    mov r13, 0FFFFFFFFFFFFFF60h             ; -160: offset of round key 1, counts up to 0
    movdqu xmm0, xmmword ptr [r11]
    movdqu xmmword ptr [rsp+r13-10h], xmm0  ; round key 0 = cipher key
    mov r11d, [r11+0Ch]
    movnti dword ptr [rsp+r13], r11d        ; temp = last key word
key_expand:
    ror dword ptr [rsp+r13], 8              ; RotWord
    movzx r14, byte ptr [rsp+r13]
    movzx r15, byte ptr [rsp+r13+1]
    movzx r8, byte ptr [rsp+r13+2]
    movzx r9, byte ptr [rsp+r13+3]
    movzx r14, byte ptr sbx_[r14]           ; SubWord
    movzx r15, byte ptr sbx_[r15]
    movzx r8, byte ptr sbx_[r8]
    movzx r9, byte ptr sbx_[r9]
    movnti dword ptr [rsp+r13], r14d        ; overlapping dword stores rebuild the word
    movnti dword ptr [rsp+r13+1], r15d
    movnti dword ptr [rsp+r13+2], r8d
    movnti dword ptr [rsp+r13+3], r9d
    mov eax, [rsp+r13]
    xor eax, [r12]                          ; ^ round constant
    add r12, 4
    xor eax, [rsp+r13-10h]                  ; w[0]
    movnti dword ptr [rsp+r13], eax
    xor eax, [rsp+r13-0Ch]                  ; w[1]
    movnti dword ptr [rsp+r13+4], eax
    xor eax, [rsp+r13-8]                    ; w[2]
    movnti dword ptr [rsp+r13+8], eax
    xor eax, [rsp+r13-4]                    ; w[3]
    movnti dword ptr [rsp+r13+0Ch], eax
    add r13, 10h
    jz key_done                             ; all ten round keys written
    movnti dword ptr [rsp+r13], eax         ; seed next round's temp word (flags untouched)
    jmp key_expand
key_done:
    movdqu xmm0, xmmword ptr [r10]
    movdqu xmm1, xmmword ptr [rsp-0B0h]     ; round key 0 (unaligned-safe load)
    pxor xmm0, xmm1                         ; AddRoundKey
    movdqu xmmword ptr [rsp-0D0h], xmm0
    mov r13, 0FFFFFFFFFFFFFFF0h             ; -16: walks the state, 4 bytes per pass
_SubBytes:
    movzx r14, byte ptr [rsp+r13-0C0h]
    movzx r15, byte ptr [rsp+r13-0BFh]
    movzx r8, byte ptr [rsp+r13-0BEh]
    movzx r9, byte ptr [rsp+r13-0BDh]
    movzx r14, byte ptr sbx_[r14]
    movzx r15, byte ptr sbx_[r15]
    movzx r8, byte ptr sbx_[r8]
    movzx r9, byte ptr sbx_[r9]
    movnti dword ptr [rsp+r13-0E0h], r14d   ; result buffer starts at rsp-0F0h
    movnti dword ptr [rsp+r13-0DFh], r15d
    movnti dword ptr [rsp+r13-0DEh], r8d
    movnti dword ptr [rsp+r13-0DDh], r9d
    add r13, 4
    jnz _SubBytes
    ; ShiftRows: dword = column, byte r of a dword = row r.
    movdqu xmm0, xmmword ptr [rsp-0F0h]
    pshufd xmm1, xmm0, 39h                  ; columns rotated by 1
    pshufd xmm2, xmm0, 4Eh                  ; columns rotated by 2
    pshufd xmm3, xmm0, 93h                  ; columns rotated by 3
    pand xmm1, [sq1_]                       ; keep row 1
    pand xmm2, [sq2_]                       ; keep row 2
    pand xmm3, [sq3_]                       ; keep row 3
    pand xmm0, [sq0_]                       ; row 0 is not shifted
    pxor xmm0, xmm1
    pxor xmm0, xmm2
    pxor xmm0, xmm3
    ; MixColumns
    movdqa xmm1, xmm0                       ; xmm1..xmm4 = copies of the state a
    movdqa xmm2, xmm1
    movdqa xmm3, xmm2
    movdqa xmm4, xmm3
    pand xmm0, [bts_]
    pcmpeqb xmm0, [bts_]                    ; mask of bytes with bit 7 set
    movdqa xmm5, xmm0
    pand xmm1, xmm0                         ; bytes with bit 7 set
    pandn xmm0, xmm2                        ; bytes with bit 7 clear
    pand xmm1, [dlb_]                       ; drop bit 7 before shifting
    pslld xmm0, 1
    pslld xmm1, 1
    pxor xmm1, [xrb_]                       ; ^ 0x1b ...
    pand xmm1, xmm5                         ; ... only where bit 7 was set
    pxor xmm0, xmm1                         ; xmm0 = 2*a
    pxor xmm2, xmm0                         ; xmm2 = 3*a
    movdqa xmm6, xmm2
    psrld xmm2, 8
    pxor xmm0, xmm2                         ; ^ 3*a[r+1]
    pslld xmm3, 8
    pxor xmm0, xmm3                         ; ^ a[r-1]
    pslld xmm3, 8
    pxor xmm0, xmm3                         ; ^ a[r-2]
    psrld xmm4, 10h
    pxor xmm0, xmm4                         ; ^ a[r+2]
    psrld xmm4, 8
    pxor xmm0, xmm4                         ; row 0: ^ a[3]
    pslld xmm6, 18h
    pxor xmm0, xmm6                         ; row 3: ^ 3*a[0]
    movdqu xmm1, xmmword ptr [rsp-0A0h]     ; round key 1 (unaligned-safe load)
    pxor xmm0, xmm1                         ; AddRoundKey
    ; Epilogue: restore in reverse push order and return; without this
    ; execution would continue into the table bytes below.
    pop rax
    pop r9
    pop r8
    pop r15
    pop r14
    pop r13
    pop r12
    pop r11
    ret
; Constant tables for aes_crypt. They follow the code in the same
; segment; both groups are 16-byte aligned because the masks are used
; as direct SSE memory operands.
align 16
; Forward S-box, 256 bytes, eight entries per qword (little-endian:
; entry 0 = 63h is the low byte of the first qword).
sbx_ dq 0C56F6BF27B777C63h, 76ABD7FE2B670130h, 0F04759FA7DC982CAh
dq 0C072A49CAFA2D4ADh, 0CCF73F362693FDB7h, 1531D871F1E5A534h
dq 9A059618C323C704h, 75B227EBE2801207h, 0A05A6E1B1A2C8309h
dq 842FE329B3D63B52h, 5BB1FC20ED00D153h, 0CF584C4A39BECB6Ah
dq 85334D43FBAAEFD0h, 0A89F3C507F02F945h, 0F5389D928F40A351h
dq 0D2F3FF1021DAB6BCh, 1744975FEC130CCDh, 73195D643D7EA7C4h
dq 88902A22DC4F8160h, 0DB0B5EDE14B8EE46h, 5C2406490A3A32E0h
dq 79E4959162ACD3C2h, 0A94ED58D6D37C8E7h, 8AE7A65EAF4566Ch
dq 0C6B4A61C2E2578BAh, 8A8BBD4B1F74DDE8h, 0EF6034866B53E70h
dq 9E1DC186B9573561h, 948ED9691198F8E1h, 0DF2855CEE9871E9Bh
dq 6842E6BF0D89A18Ch, 16BB54B00F2D9941h
align 16
; ShiftRows byte-lane masks: sqN_ keeps byte N of every dword.
sq0_ oword 0FF000000FF000000FF000000FFh
sq1_ oword 0FF000000FF000000FF000000FF00h
sq2_ oword 0FF000000FF000000FF000000FF0000h
sq3_ oword 0FF000000FF000000FF000000FF000000h
; MixColumns helpers: bit 7 of each byte, low 7 bits of each byte, and
; the 1Bh value XORed into doubled bytes whose bit 7 was set.
bts_ oword 80808080808080808080808080808080h
dlb_ oword 7F7F7F7F7F7F7F7F7F7F7F7F7F7F7F7Fh
xrb_ oword 1B1B1B1B1B1B1B1B1B1B1B1B1B1B1B1Bh
; Key-schedule round constants, two 32-bit constants per qword, low first.
rcn_ dq 200000001h, 800000004h, 2000000010h, 8000000040h, 360000001Bh
end
After the first round of key expansion, it looks fine but the second looks wrong (taken from debugger)
00000000`0034fd50 c1 99 9d 74 49 cd b1 c5-6a 6e 88 fc 40 02 fe f9
what i have for key routine is
F2h, C2h, 95h, F2h, 7Ah, 96h, B9h, 43h, 59h, 35h, 80h, 7Ah, 73h, 59h, F6h, 7Fh
Using the following text + key
text db 06bh,0c1h,0beh,0e2h,02eh,040h,09fh,096h,0e9h,03dh,07eh,011h,073h,093h,017h,02ah
db 0aeh,02dh,08ah,057h,01eh,003h,0ach,09ch,09eh,0b7h,06fh,0ach,045h,0afh,08eh,051h
db 030h,0c8h,01ch,046h,0a3h,05ch,0e4h,011h,0e5h,0fbh,0c1h,019h,01ah,00ah,052h,0efh
db 0f6h,09fh,024h,045h,0dfh,04fh,09bh,017h,0adh,02bh,041h,07bh,0e6h,06ch,037h,010h
align 16
key db 02bh,07eh,015h,016h,028h,0aeh,0d2h,0a6h,0abh,0f7h,015h,088h,009h,0cfh,04fh,03ch
What i would recommend if you want people to work with/study your code is:
- Seperate key expansion and encryption functions
- Use fastcall convention if using 64-bit or stdcall for 32-bit
- Switch to INTEL syntax (more accessible for assembly programmers using NASM, FASM, JWASM, MASM)
- Document parameters required by each function and what registers are used (if you don't use fastcall / stdcall)
- Provide an example using assembly or C/C++
Quote from: peter_asm on May 19, 2014, 03:58:43 PM
What i would recommend if you want people to work with/study your code is:
- Seperate key expansion and encryption functions
- Use fastcall convention if using 64-bit or stdcall for 32-bit
- Switch to INTEL syntax (more accessible for assembly programmers using NASM, FASM, JWASM, MASM)
- Document parameters required by each function and what registers are used (if you don't use fastcall / stdcall)
- Provide an example using assembly or C/C++
Good proposals. :t
Gunther
Quote from: peter_asm on May 19, 2014, 03:58:43 PM
After the first round of key expansion, it looks fine but the second looks wrong (taken from debugger)
00000000`0034fd50 c1 99 9d 74 49 cd b1 c5-6a 6e 88 fc 40 02 fe f9
what i have for key routine is
F2h, C2h, 95h, F2h, 7Ah, 96h, B9h, 43h, 59h, 35h, 80h, 7Ah, 73h, 59h, F6h, 7Fh
Yes, sorry, that error is now corrected; it was another place where I forgot a movnti.
Now everything works fine. The only function that can give you problems is AddRoundKey: note that it is written for a single round only.
As I said earlier, this implementation is not meant to be production-ready code like OpenSSL; it is a prototype.
Quote from: peter_asm on May 19, 2014, 03:58:43 PM
What i would recommend if you want people to work with/study your code is:
- Seperate key expansion and encryption functions
- Use fastcall convention if using 64-bit or stdcall for 32-bit
- Switch to INTEL syntax (more accessible for assembly programmers using NASM, FASM, JWASM, MASM)
- Document parameters required by each function and what registers are used (if you don't use fastcall / stdcall)
- Provide an example using assembly or C/C++
InvMixcolumns fails when done well and the code, but you can always do the translations. :icon_mrgreen:
Regards.
P.S: Check the correction, the firts round is working, and generating codes of Key expand are correct.
Another question. Why are you using MOVNTI?
Explain the purpose of your code.
Provide comments for why you use instructions.
Quote from: peter_asm on May 20, 2014, 12:45:59 PM
Another question. Why are you using MOVNTI?
To minimize the cache pollution.
Quote from: peter_asm on May 20, 2014, 12:45:59 PM
Explain the purpose of your code.
This code is a fragment of one of my projects, not intended for an implementation with fastcall and C / C + +.
If that's why you believe it is an ineffective code and I'm not serious, it is a shame.
The objective of this code is test me, and if possible provide faster than some projects code, which I think and got, if it is false please let me know.
Quote from: peter_asm on May 20, 2014, 12:45:59 PM
Provide comments for why you use instructions.
Okay, seeing how strict they are with the syntax and presentation, when ready InvMixcolumns.
Regards.
I haven't given up on this. I'm genuinely very interested in your approach to encrypting with AES but haven't had time lately to test the code again.
If you would consider using INTEL syntax and using stdcall/fastcall convention in addition to providing example, I'm sure many more forum members would provide feedback.
Right now for me it's pain to convert into INTEL syntax, then assemble 2 files before loading into a debugger just to monitor the data because if i run exe it just crashes.
No offense, I think you're on to something and it's worth exploring but why you make it difficult for people to test/use your code is what i'm having difficulty understanding.
Is it because you don't want people to steal it?
Okay, I promise to write comments in the code, and translated into intel syntax, and use the fastcall system.
So the question on movnti was a doubt, I thought it was something like "what are you doing". Although because of the syntax not understand some instructions are not put, because if it is simply better to mov.
And I do not write the code that way to stop people from stealing it; this is simply how I program.
Regards.
P.S: Here a few days, publishes InvMixcolumns, I was busy and could not finish it.
Hi cpu2,
take care, slow down. Don't rush.
Gunther
Quote from: cpu2 on May 22, 2014, 02:29:42 PM
Okay, I promise to write comments in the code, and translated into intel syntax, and use the fastcall system.
So the question on movnti was a doubt, I thought it was something like "what are you doing". Although because of the syntax not understand some instructions are not put, because if it is simply better to mov.
And not put the code that way so people do not steal, this is how I program.
Regards.
P.S: Here a few days, publishes InvMixcolumns, I was busy and could not finish it.
No problem. As Gunther said, take your time and I personally look forward to seeing your results.
The code is interesting and I think it is worth developing further but I don't completely understand AES and wouldn't be much help right now. I'd like to help and I'm sure many others on here would too.
I did try to optimize the AES key generation algorithm for encryption by WiteG.
It isn't optimized for speed. I like your idea and hope it can be realized as it could be very useful.
; setkey(input, output) -- AES-128 key schedule, 32-bit stdcall.
;   [esp+4] input  : address of the 16-byte cipher key
;   [esp+8] output : address of the buffer receiving 11 round keys
;                    (16 + 10*16 bytes are written)
; Uses the external 256-byte table `sbox` through xlatb (al = [ebx+al]).
; All general registers are preserved by pushad/popad.
; Assumes the direction flag is clear, as the string instructions
; advance esi/edi upward.
setkey:
pushad
mov esi, [esp+32+4] ; input
mov edi, [esp+32+8] ; output
lea ebx, [sbox]
; round key 0 is the cipher key itself
mov ecx, 4
rep movsd
mov eax, dword ptr [edi-4] ; eax = last word written so far
mov edx, 1 ; edx = round constant, starts at 01h
mov ecx, 10 ; ten round keys to derive
next_round_key:
; SubWord: four rotate-and-substitute steps bring every byte through al
ror eax, 8
xlatb
ror eax, 8
xlatb
ror eax, 8
xlatb
ror eax, 8
xlatb
ror eax, 8 ; RotWord
xor eax, edx ; add the round constant
; next round constant: double dl, reducing by 1Bh when bit 7 falls out
shl dl, 1
jnc rcon_ready
xor dl, 1Bh
rcon_ready:
; each new word = running value xor the word 16 bytes back
xor eax, dword ptr [edi-16]
stosd
xor eax, dword ptr [edi-16]
stosd
xor eax, dword ptr [edi-16]
stosd
xor eax, dword ptr [edi-16]
stosd
loop next_round_key
popad
ret 2*4
Quote from: Gunther on May 22, 2014, 04:11:43 PM
Hi cpu2,
take care, slow down. Don't rush.
Gunther
Okay. :icon_mrgreen:
Quote from: peter_asm on May 23, 2014, 07:16:22 AM
Quote from: cpu2 on May 22, 2014, 02:29:42 PM
Okay, I promise to write comments in the code, and translated into intel syntax, and use the fastcall system.
So the question on movnti was a doubt, I thought it was something like "what are you doing". Although because of the syntax not understand some instructions are not put, because if it is simply better to mov.
And not put the code that way so people do not steal, this is how I program.
Regards.
P.S: Here a few days, publishes InvMixcolumns, I was busy and could not finish it.
No problem. As Gunther said, take your time and I personally look forward to seeing your results.
The code is interesting and I think it is worth developing further but I don't completely understand AES and wouldn't be much help right now. I'd like to help and I'm sure many others on here would too.
Thanks, did not think that would be so interesting to them.
Quote from: peter_asm on May 23, 2014, 07:16:22 AM
I did try to optimize the AES key generation algorithm for encryption by WiteG.
It isn't optimized for speed. I like your idea and hope it can be realized as it could be very useful.
; setkey(input, output) -- AES-128 key schedule, 32-bit stdcall
; (quoted copy of the routine posted above).
; Uses the external table `sbox` via xlatb; all general registers are
; preserved by pushad/popad; `ret 2*4` removes both arguments.
setkey:
pushad
mov esi, [esp+32+4] ; input
mov edi, [esp+32+8] ; output
lea ebx, [sbox]
; push/pop pair = short encoding of "mov ecx, 4"
push 4
pop ecx
; copy the 16-byte key; eax ends up holding its last dword
load_key:
lodsd
stosd
loop load_key
; edx = round constant, starting at 1
push 1
pop edx
; ecx is 0 after the loop above, so this sets ecx = 10 rounds
mov cl, 10
init_key:
push ecx
mov cl, 4
; substitute all four bytes of eax through the S-box (al = [ebx+al])
swap_bytes:
ror eax, 8
xlatb
loop swap_bytes
pop ecx
; RotWord, then add the round constant
ror eax, 8
xor eax, edx
; next round constant: double dl, xor 1Bh when bit 7 was shifted out
shl dl, 1
jnc no_carry
xor dl, 1Bh
no_carry:
push ecx
mov cl, 4
; each new word = running value xor the word 16 bytes back
xor_dword:
xor eax, dword ptr [edi-16]
stosd
loop xor_dword
pop ecx
loop init_key
popad
ret 2*4
OPS would calculate that a Sandy Bridge, would be about 572 OPS. I assume that everyone will have the carry, if they would 500- 552 OPS.
Mine was a 274 OPS.
But I would not modular reduction in key expand, you saw that I did in Mixcolumns, I guess that is the intention of this
xor.
if you need help, ask.
Regards.
Hi cpu2,
Quote from: cpu2 on May 23, 2014, 09:38:34 AM
if you need help, ask.
okay, so be prepared.
Gunther
Quote from: Gunther on May 23, 2014, 07:51:01 PM
Hi cpu2,
Quote from: cpu2 on May 23, 2014, 09:38:34 AM
if you need help, ask.
okay, so be prepared.
Gunther
Okay. :t
-------
It has been days without any comment from me; I just want to tell you that I am already writing InvMixColumns. I could not do it earlier because of some personal problems.
I found a person who is willing to translate my code to Intel syntax and fastcall; he is a member of a Hispanic forum to which I also belong.
I guess it will be ready in a few days, but to be honest it is the most complicated function.
Regards.
Hi cpu2,
Quote from: cpu2 on May 26, 2014, 05:57:43 PM
I found a person who is willing to translate my code to intel syntax and fastcall, is a member of a Hispanic forum which I also belong.
translating to Intel syntax isn't hard. Compile it with gas, use objdump -d -Mintel myfile.o and you've the Intel syntax.
Gunther
Nice one Gunther
>objdump -d -Mintel --no-show-raw-insn aes.o
aes.o: file format pe-x86-64
Disassembly of section .text:
0000000000000000 <_aes_crypt>:
0: push r11
2: push r12
4: push r13
6: push r14
8: push r15
a: push r8
c: push r9
e: push rax
f: mov r12,0x170
16: prefetch BYTE PTR ds:0x0
1e: prefetch BYTE PTR ds:0x100
26: mov r13,0xffffffffffffff60
2d: movdqu xmm0,XMMWORD PTR [r11]
32: movdqu XMMWORD PTR [rsp+r13*1-0x10],xmm0
39: mov r11d,DWORD PTR [r11+0xc]
3d: movnti QWORD PTR [rsp+r13*1],r11d
any idea how to remove the prefixed addresses?
Quote from: peter_asm on May 27, 2014, 11:19:14 PM
Nice one Gunther
but it did work, didn't it? But what the heck:
I think that prefetch instructions are necessary (http://x86.renejeschke.de/html/file_module_x86_id_252.html).
Gunther
No, I mean the addresses before each mnemonic so i could assemble with JWASM, that's all.
might be possible using cut command.
something like : @echo off & for /f "tokens=2 delims=:" %i in (aes.asm) do echo %i >aes_jwasm.asm
ah, it's okay, would just love an easy way to convert at&t into intel.
My own way was using ida pro disassembler which wasn't all that great.
Hi peter_asm,
cut & paste should be the right way, I think.
Quote from: peter_asm on May 28, 2014, 12:24:07 AM
something like : @echo off & for /f "tokens=2 delims=:" %i in (aes.asm) do echo %i >aes_jwasm.asm
That's to crazy, but could work. :lol: :lol: :lol:
Gunther
Do not worry for the syntax, as I said earlier.
I had personal problems this week, but I finally wrote something. It is a small step: it classifies the bytes so that the modular reduction can be applied to them later.
# Unfinished InvMixColumns helper (GAS, AT&T syntax): builds per-byte
# masks that classify the bytes of %xmm0 by their top bits. The fragment
# ends without using the masks or producing a result.
.section .data
# NOTE(review): these are used as SSE memory operands, which need
# 16-byte alignment; no alignment directive is present.
bt0_: .quad 0x8080808080808080,0x8080808080808080
bt1_: .quad 0x4040404040404040,0x4040404040404040
bt2_: .quad 0x2020202020202020,0x2020202020202020
bd1_: .quad 0x3f3f3f3f3f3f3f3f,0x3f3f3f3f3f3f3f3f
bd2_: .quad 0x7f7f7f7f7f7f7f7f,0x7f7f7f7f7f7f7f7f
.section .text
.globl _start
_start:
# xmm1..xmm4 = copies of the input in xmm0
movdqa %xmm0, %xmm1
movdqa %xmm1, %xmm2
movdqa %xmm2, %xmm3
movdqa %xmm3, %xmm4
# xmm1 = xmm8 = 0xff for every byte that has bit 7 set
pand bt0_, %xmm1
pcmpeqb bt0_, %xmm1
movdqa %xmm1, %xmm8
# xmm2 = xmm5 = the bytes with bit 7 set, others zero
pand %xmm1, %xmm2
movdqa %xmm2, %xmm5
# xmm6 = mask of those bytes whose low 7 bits are all set (byte == 0xff);
# xmm2 = the input bytes selected by that mask
pand bd2_, %xmm2
pcmpeqb bd2_, %xmm2
movdqa %xmm2, %xmm6
pand %xmm0, %xmm2
# xmm7 = mask of bytes with bit 7 set and low 6 bits all set;
# xmm5 = the input bytes selected by that mask
pand bd1_, %xmm5
pcmpeqb bd1_, %xmm5
movdqa %xmm5, %xmm7
pand %xmm0, %xmm5
# xmm1 = bytes with bit 7 clear; xmm9 = mask of those that have bit 6 set
pandn %xmm0, %xmm1
pand bt1_, %xmm1
pcmpeqb bt1_, %xmm1
movdqa %xmm1, %xmm9
# xmm3 = xmm11 = the input bytes selected by xmm9
pand %xmm1, %xmm3
movdqa %xmm3, %xmm11
# NOTE(review): after masking with bd1_ (0x3f) a byte can never equal
# bt1_ (0x40), so this compare always yields zero; comparing against
# bd1_ may have been intended -- confirm.
pand bd1_, %xmm3
pcmpeqb bt1_, %xmm3
# NOTE(review): this overwrites the mask saved in xmm6 above.
movdqa %xmm3, %xmm6
pand %xmm0, %xmm3
# xmm1 = bytes not selected by xmm9; xmm10 = mask of those with bit 5
# set (this still includes bytes that have bit 7 set)
pandn %xmm0, %xmm1
pand bt2_, %xmm1
pcmpeqb bt2_, %xmm1
movdqa %xmm1, %xmm10
# xmm4 = the input bytes selected by xmm10
pand %xmm1, %xmm4
I hope to finish soon.
Regards.
Hi cpu2,
is the AT&T syntax the output from gcc?
Gunther
No, the code is written in vi my favorite editor. Sorry again for the syntax, when everything is in intel and fastcall.
Regards.
vi and at&t syntax....is someone whipping you while you write the code too? :biggrin:
Joking dude, just seems like you're being hard on yourself when it comes to coding.
I use notepad++ on windows but unfortunately doesn't exist on *nix.
Geany on the other hand is pretty good, so long as you have GUI components installed.
I have not installed any GUI system, my system is OpenBSD, and work and I work in an old resolution for VT100 (80x25), but I feel good, of course they used another system to communicate with you, merely for convenience.
Band leaving it all, that you think the code snippet?
Regards.
Quote from: peter_asm on May 31, 2014, 03:06:58 AM
I use notepad++ on windows but unfortunately doesn't exist on *nix.
TEA exists on both platforms. I would recommend it.
Gunther
I'm on it no and left it hanging. I been talking to this person, will soon translated and the encrypt part fastcall.
I'll leave so they can study better, while just the decrypt.
Regards.
The user said and translated the code.
; Constant tables and test vector for the translated AES-128 prototype.
; NOTE(review): the masks are used as direct SSE memory operands, which
; require 16-byte alignment; confirm the data section start is aligned.
.data
; Forward S-box, 256 bytes, eight entries per qword (little-endian:
; entry 0 = 63h is the low byte of the first qword).
sbx_:
dq 0c56f6bf27b777c63h,076abd7fe2b670130h
dq 0f04759fa7dc982cah,0c072a49cafa2d4adh
dq 0ccf73f362693fdb7h,01531d871f1e5a534h
dq 09a059618c323c704h,075b227ebe2801207h
dq 0a05a6e1b1a2c8309h,0842fe329b3d63b52h
dq 05bb1fc20ed00d153h,0cf584c4a39becb6ah
dq 085334d43fbaaefd0h,0a89f3c507f02f945h
dq 0f5389d928f40a351h,0d2f3ff1021dab6bch
dq 01744975fec130ccdh,073195d643d7ea7c4h
dq 088902a22dc4f8160h,0db0b5ede14b8ee46h
dq 05c2406490a3a32e0h,079e4959162acd3c2h
dq 0a94ed58d6d37c8e7h,008ae7a65eaf4566ch
dq 0c6b4a61c2e2578bah,08a8bbd4b1f74dde8h
dq 00ef6034866b53e70h,09e1dc186b9573561h
dq 0948ed9691198f8e1h,0df2855cee9871e9bh
dq 06842e6bf0d89a18ch,016bb54b00f2d9941h
; ShiftRows byte-lane masks: sqN_ keeps byte N of every dword.
sq0_:
dq 0000000ff000000ffh,0000000ff000000ffh
sq1_:
dq 00000ff000000ff00h,00000ff000000ff00h
sq2_:
dq 000ff000000ff0000h,000ff000000ff0000h
sq3_:
dq 0ff000000ff000000h,0ff000000ff000000h
; MixColumns helpers: bit 7 of each byte, low 7 bits of each byte, and
; the 1bh value XORed into doubled bytes whose bit 7 was set.
bts_:
dq 08080808080808080h,08080808080808080h
dlb_:
dq 07f7f7f7f7f7f7f7fh,07f7f7f7f7f7f7f7fh
xrb_:
dq 01b1b1b1b1b1b1b1bh,01b1b1b1b1b1b1b1bh
; Key-schedule round constants, two 32-bit constants per qword, low
; first: 01,02,04,08,10,20,40,80,1b,36. The first line previously
; duplicated xrb_ (1b1b...), which made every round key wrong.
rcn_:
dq 00000000200000001h,00000000800000004h
dq 00000002000000010h,00000008000000040h
dq 0000000360000001bh
; Test input: 16-byte key followed by 16-byte plaintext.
ptr_:
dq 00706050403020100h,00f0e0d0c0b0a0908h
dq 07766554433221100h,0ffeeddccbbaa9988h
; ---------------------------------------------------------------------
; Self-contained AES-128 encryption test (Intel syntax translation).
; Key = first 16 bytes at ptr_, plaintext = next 16 bytes.
; The ciphertext is left in xmm0 and stored at [rsp-0f0h].
;
; Scratch below rsp:
;       [rsp-0b0h] .. [rsp-1]   round keys 0..10
;       [rsp-0d0h]              state before SubBytes
;       [rsp-0f0h]              state after SubBytes / final result
; Registers: r13 = loop offset, r12 -> round constants, r11 = offset of
; the current round key (after _ak1), rbx = remaining full rounds.
; Fixed: the key-expansion loop no longer writes the seed word for a
; non-existent 11th round to [rsp] on its last pass.
; NOTE(review): rbx is used without being saved; the scratch area is
; unreserved stack; pxor with memory operands needs rsp 16-byte aligned;
; there is no exit/ret after the final store.
; ---------------------------------------------------------------------
.code
_start:
    lea r11,ptr_
    prefetch sbx_                       ; NOTE(review): 3DNow! encoding; prefetcht0 is the portable form
    prefetch [sbx_+40h]
    prefetch [sbx_+80h]
    prefetch [sbx_+0c0h]
    prefetch [sq0_]
    prefetch [sq0_+40h]
    prefetch [sq0_+80h]
    mov r13,0ffffffffffffff60h          ; -160: offset of round key 1, counts up to 0
    movdqu xmm0,xmmword [r11]
    movdqu xmmword[rsp+r13-10h],xmm0    ; round key 0 = cipher key
    mov r11d,dword [r11+0ch]
    movnti dword [rsp+r13],r11d         ; temp = last key word
    lea r12,rcn_                        ; r12 -> next round constant
    lea r11,ptr_                        ; r11 was overwritten above; reload
_exk:
    ror dword [rsp+r13],08h             ; RotWord
    movzx r14,byte [rsp+r13]
    movzx r15,byte [rsp+r13+1]
    movzx r8,byte [rsp+r13+2]
    movzx r9,byte [rsp+r13+3]
    movzx r14,byte [r14+sbx_]           ; SubWord
    movzx r15,byte [r15+sbx_]
    movzx r8,byte [r8+sbx_]
    movzx r9,byte [r9+sbx_]
    movnti dword [rsp+r13],r14d         ; overlapping dword stores rebuild the word
    movnti dword [rsp+r13+01h],r15d
    movnti dword [rsp+r13+02h],r8d
    movnti dword [rsp+r13+03h],r9d
    mov eax,[rsp+r13]
    xor eax,[r12]                       ; ^ round constant
    add r12,04h
    xor eax,[rsp+r13-10h]               ; w[0]
    movnti dword [rsp+r13],eax
    xor eax,[rsp+r13-0ch]               ; w[1]
    movnti dword [rsp+r13+04h],eax
    xor eax,[rsp+r13-08h]               ; w[2]
    movnti dword [rsp+r13+08h],eax
    xor eax,[rsp+r13-04h]               ; w[3]
    movnti dword [rsp+r13+0ch],eax
    add r13,10h
    jz _ak1                             ; round key 10 written: nothing left to seed
    movnti dword [rsp+r13],eax          ; seed next round's temp word (flags untouched)
    jmp _exk
_ak1:
    movdqu xmm0,dqword [r11+10h]        ; plaintext
    pxor xmm0,dqword [rsp-0b0h]         ; AddRoundKey with round key 0
    mov r11,0ffffffffffffff60h          ; -160: offset of round key 1
    mov rbx,09h                         ; nine full rounds
_x0:
    mov r13,0fffffffffffffff0h          ; -16: walks the state, 4 bytes per pass
    movdqu dqword [rsp-0d0h],xmm0
_sxm:
    movzx r14,byte [rsp+r13-0c0h]
    movzx r15,byte [rsp+r13-0bfh]
    movzx r8,byte [rsp+r13-0beh]
    movzx r9,byte [rsp+r13-0bdh]
    movzx r14,byte [r14+sbx_]
    movzx r15,byte [r15+sbx_]
    movzx r8,byte [r8+sbx_]
    movzx r9,byte [r9+sbx_]
    movnti dword [rsp+r13-0e0h],r14d    ; result buffer starts at rsp-0f0h
    movnti dword [rsp+r13-0dfh],r15d
    movnti dword [rsp+r13-0deh],r8d
    movnti dword [rsp+r13-0ddh],r9d
    add r13,04h
    jnz _sxm
_shw:
    ; ShiftRows: dword = column, byte r of a dword = row r.
    movdqu xmm0,dqword [rsp-0f0h]
    pshufd xmm1,dqword [rsp-0f0h],39h   ; columns rotated by 1
    pshufd xmm2,dqword [rsp-0f0h],4eh   ; columns rotated by 2
    pshufd xmm3,dqword [rsp-0f0h],93h   ; columns rotated by 3
    pand xmm1,dqword [sq1_]             ; keep row 1
    pand xmm2,dqword [sq2_]             ; keep row 2
    pand xmm3,dqword [sq3_]             ; keep row 3
    pand xmm0,dqword [sq0_]             ; row 0 is not shifted
    pxor xmm0,xmm1
    pxor xmm0,xmm2
    pxor xmm0,xmm3
_mxm:
    movdqa xmm1,xmm0                    ; xmm1..xmm4 = copies of the state a
    movdqa xmm2,xmm1
    movdqa xmm3,xmm2
    movdqa xmm4,xmm3
    pand xmm0,dqword [bts_]
    pcmpeqb xmm0,dqword [bts_]          ; mask of bytes with bit 7 set
    movdqa xmm5,xmm0
    pand xmm1,xmm0                      ; bytes with bit 7 set
    pandn xmm0,xmm2                     ; bytes with bit 7 clear
    pand xmm1,dqword [dlb_]             ; drop bit 7 before shifting
    pslld xmm0,01h
    pslld xmm1,01h
    pxor xmm1, dqword [xrb_]            ; ^ 1bh ...
    pand xmm1,xmm5                      ; ... only where bit 7 was set
    pxor xmm0,xmm1                      ; xmm0 = 2*a
    pxor xmm2,xmm0                      ; xmm2 = 3*a
    movdqa xmm6,xmm2
    psrld xmm2,08h
    pxor xmm0,xmm2                      ; ^ 3*a[r+1]
    pslld xmm3,08h
    pxor xmm0,xmm3                      ; ^ a[r-1]
    pslld xmm3,08h
    pxor xmm0,xmm3                      ; ^ a[r-2]
    psrld xmm4,10h
    pxor xmm0,xmm4                      ; ^ a[r+2]
    psrld xmm4,08h
    pxor xmm0,xmm4                      ; row 0: ^ a[3]
    pslld xmm6,18h
    pxor xmm0,xmm6                      ; row 3: ^ 3*a[0]
_ark:
    pxor xmm0,dqword [rsp+r11]          ; AddRoundKey with the current round key
    add r11,10h                         ; advance to the next round key
    dec rbx
    jnz _x0
    ; Final round: SubBytes, ShiftRows, AddRoundKey (no MixColumns).
    movdqu dqword [rsp-0d0h],xmm0
    mov r13,0fffffffffffffff0h
_ltn:
    movzx r14,byte [rsp+r13-0c0h]
    movzx r15,byte [rsp+r13-0bfh]
    movzx r8,byte [rsp+r13-0beh]
    movzx r9,byte [rsp+r13-0bdh]
    movzx r14,byte [r14+sbx_]
    movzx r15,byte [r15+sbx_]
    movzx r8,byte [r8+sbx_]
    movzx r9,byte [r9+sbx_]
    movnti dword [rsp+r13-0e0h],r14d
    movnti dword [rsp+r13-0dfh],r15d
    movnti dword [rsp+r13-0deh],r8d
    movnti dword [rsp+r13-0ddh],r9d
    add r13,04h
    jnz _ltn
    movdqu xmm0,dqword [rsp-0f0h]
    pshufd xmm1,dqword [rsp-0f0h],39h
    pshufd xmm2,dqword [rsp-0f0h],4eh
    pshufd xmm3,dqword [rsp-0f0h],93h
    pand xmm1,dqword [sq1_]
    pand xmm2,dqword [sq2_]
    pand xmm3,dqword [sq3_]
    pand xmm0,dqword [sq0_]
    pxor xmm0,xmm1
    pxor xmm0,xmm2
    pxor xmm0,xmm3
    pxor xmm0,dqword [rsp+r11]          ; r11 = -16 here: round key 10
    movdqu dqword [rsp-0f0h],xmm0       ; ciphertext
ptr_ is used as a pointer to the key followed by the plaintext; this is only a test, with no calls or anything like that. The result is
0x69c4e0d86a7b0430d8cdb78070b4c55a
and it is stored at [rsp-0f0h].
Regards.
I have not answered here, just come to say that we finished everything, even decrypt.
Encrypt = 66 OPS for 1 round
Decrypt = 171 OPS for 1 round, InvMixcolumns 130 OPS SSE2 :icon_mrgreen:
KeyExpand = 220 OPS.
As seen, sorry the delay.
Thanks
Are you satisfied with the results?
Gunther
Yes. I did what I was told: the functions are now independent and are invoked with call, so the code is no longer chained together as before.
As for cycles, I think it is okay. I cannot share the code because it is personal, but if you have questions about it, just ask.
Thanks.
Hi cpu2,
Quote from: cpu2 on October 13, 2014, 03:50:08 AM
On cycles I think it's okay. I can not share the code as it is personal, if you have questions about the code, say it.
only one basic question: do you use AT&T syntax or have you converted the sources?
Gunther
No, my personal sources is written in AT&T syntax, I like this syntax.
If you have more questions codes, say it.
Thanks.
Hi cpu2,
Quote from: cpu2 on October 13, 2014, 02:59:11 PM
No, my personal sources is written in AT&T syntax, I like this syntax.
why not, I'm familiar with AT&T syntax. It has advantages but drawbacks, too. If you would like to write assembly language programs for the PowerPC, AT&T syntax gives an easy entry.
Gunther