FWIW,
Here's what I came up with. According to my (unstable) timings, it takes about 2000 cycles for the inputs you used for timing (111... and 222...). It takes about 400 more for the other test inputs (where the answer is 0101010101010101010101736973578888
297386792883747746736446685377357724486347246553692855982856
984682746982760763986982798077775597748957736987058639774692
870248639867730348707686676977588882886675865579578755697664
858857), since that pair involves carries. On AMD it takes considerably longer.
Here's the main routine:
; ascii adder by rrr3134159 9/25/2015
; MASM syntax, 32-bit. Adds two ASCII decimal strings four digits (one dword)
; at a time.
include support.inc
; NOTE(review): Timer_Data, InitCycles, get_time and pp are not defined in this
; file; presumably they come from support.inc -- confirm.
Timer_Data
; Size in bytes (= decimal digits) of each fixed-width work buffer.
ASCII_LENGTH = 256 ; must be divisible by 4
; Enforce the requirement above at assembly time: the add loop walks the
; buffers in whole dwords, so a stray remainder would be silently skipped.
.ERRNZ ASCII_LENGTH MOD 4, <ASCII_LENGTH must be divisible by 4>
; Number of dword-sized digit groups per buffer.
ASCII_LENGTH_DWORDS = ASCII_LENGTH / 4
; copy_from_end dest, src, cnt
; Copies cnt+1 bytes from src to dest, starting AT the addresses given and
; walking toward lower addresses (the direction flag is set for the copy).
; In:    dest = address of the last (highest) destination byte
;        src  = address of the last (highest) source byte
;        cnt  = byte count minus one (one extra byte is always copied)
; Clobb: esi, edi, ecx, flags (inc ecx); the direction flag is left clear.
; Arguments are substituted textually, so they must not depend on esi/edi/ecx
; values that the earlier instructions of the expansion overwrite.
copy_from_end MACRO dest, src, cnt
lea esi, src
lea edi, dest
mov ecx, cnt
; one extra byte so the byte at src itself is included in the copy
inc ecx
; DF=1: movsb decrements esi/edi after each byte
std
rep movsb
; restore the conventional forward direction
cld
ENDM
.data
; didn't bother to input these from console ...
; Operands are 0-terminated ASCII decimal strings, most significant digit first.
a1 db "72687256878728728578278273764572634567527634762347624623645268275497274697367264"
db "59726597629769726979767654976479567269726957629764592769247629767629347697676576"
db "876587872876575764578568745687564758756", 0
;a1 db 256 dup (31h), 0 ; these data take about 2000 cycles, 400 less, since no carry
; a1_end is the address of a1's terminating 0; a1_length counts digits only.
a1_end = $ - 1
a1_length = a1_end - a1
; copy_from_end right-aligns the operand into an ASCII_LENGTH-byte buffer; a
; longer operand would be written below the start of that buffer, so reject it
; at assembly time.
.ERRE a1_length LE ASCII_LENGTH, <a1 has more digits than ASCII_LENGTH>
a2 db "10101010101010101010101010101010101010101010101010101010101010101010101010101010"
db "10101010101100110101010101010010100101001010101001010010100100101010010100101001"
db "010100101001010010100101001010010100101001010010010100100101", 0
;a2 db 256 dup (32h), 0
; a2_end / a2_length: same meaning as for a1.
a2_end = $ - 1
a2_length = a2_end - a2
.ERRE a2_length LE ASCII_LENGTH, <a2 has more digits than ASCII_LENGTH>
; Fixed-width work buffers, pre-filled with ASCII '0' (30h) so that a shorter
; operand is implicitly zero-padded on the left. Each buffer is followed by one
; byte that receives / holds the string terminator.
; The add loop accesses these buffers one dword at a time, so keep each of them
; on a 4-byte boundary.
align 4
num1 db ASCII_LENGTH dup(30h)
num1_end = $
db 0
align 4
num2 db ASCII_LENGTH dup(30h)
num2_end = $
db 0
align 4
ans db ASCII_LENGTH dup(30h)
ans_end = $
db 0
; Number of significant digits in ans (filled in after the addition).
align 4
ans_length dd 0
.code
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
; aa -- program entry point.
; Prints both operands, right-aligns them into num1/num2, adds them four ASCII
; digits (one dword) per iteration into ans, then prints the sum without
; leading zeros.
; Register roles inside add_loop:
;   ecx = index of the current dword group (counts down to 0, i.e. from the
;         least significant group to the most significant one)
;   eax = the four digit sums of the current group
;   dl  = decimal carry (0 or 1) passed from one digit to the next
; Clobb: eax, ecx, dl, esi, edi, flags (plus whatever the support macros use).
aa:
pp "****************************************************************\n"
pp "num1 %s\n\n", offset a1
pp "num2 %s\n\n", offset a2
InitCycles ; start timer
copy_from_end num1_end, a1_end, a1_length
copy_from_end num2_end, a2_end, a2_length
; get_time inputting took ; uncomment to separate input timing
; add per dword, and propagate the carry thru the dword
mov ecx, ASCII_LENGTH_DWORDS
dec ecx
mov dl, 0
add_loop:
mov eax, DWORD PTR num1[ecx*4]
add eax, DWORD PTR num2[ecx*4]
; each byte now holds '0'+'0'+d1+d2; drop one '0' so it is '0'+d1+d2 (<= 42h)
sub eax, 30303030h
; The strings are most-significant-digit first, but the dword load is
; little-endian: al holds the MOST significant digit of the group. The carry
; has to run from the highest-address byte toward the lowest, so swap first.
bswap eax
; now al = digit at offset +3 (least significant), ah = digit at offset +2
add al, dl
cmp al, 3ah
jl @f
sub al, 0ah
add ah, 1
@@:
mov dl, 0
cmp ah, 3ah
jl @F
sub ah, 0ah
mov dl, 1
@@:
; back to memory order: al = digit at offset +0, ah = digit at offset +1
bswap eax
add ah, dl
cmp ah, 3ah
jl @f
sub ah, 0ah
add al, 1
@@:
mov dl, 0
cmp al, 3ah
jl @F
sub al, 0ah
mov dl, 1
@@:
; eax is already in memory order here; dl carries into the next (more
; significant) group
mov DWORD PTR ans[ecx*4], eax
dec ecx
jge add_loop
; if dl = 1 the addition overflowed
; (a carry out of the top digit is not stored anywhere -- it is not handled)
; find first non-zero digit to determine length of answer
xor ecx, ecx
@@:
; never skip the last digit, so an all-zero sum still prints a single "0"
cmp ecx, ASCII_LENGTH - 1
jae @F
cmp BYTE PTR ans[ecx], 30h
jne @F
inc ecx
jmp @B
@@:
; ans_length = ASCII_LENGTH - (index of first significant digit)
sub ecx, ASCII_LENGTH
neg ecx
mov ans_length, ecx
; get time, print answer
get_time adding took
; esi -> first significant digit; the byte at ans_end terminates the string
lea esi, ans_end
sub esi, ans_length
pp "\nans %s \n\n", esi
ret
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end aa
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
I really have no idea if this is worth anything; if anyone cares, I can clean it up in various ways.