I prototyped these two routines in 32-bit code, and converting them to 64-bit MASM was very easy. They run OK, but I have a sneaking suspicion that there is a much faster way to do this using either SSSE3 (Supplemental SSE3) or SSE4.
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
include \masm32\include64\masm64rt.inc
.code
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; ---------------------------------------------------------------------------
; entry_point - demo driver for the upper and lower procedures.
;
; Hands one test string to upper, prints it, hands the same string to lower,
; prints it again, then waits for a key and exits.
;
; NOTE(review): mrm, rcall, conout, waitkey and .exit are not defined in this
; file; they are presumably macros from masm64rt.inc. The per-line notes below
; describe them by name only - confirm against the SDK.
; ---------------------------------------------------------------------------
entry_point proc
LOCAL pstr :QWORD
; pstr receives the test text (presumably the address of the literal - confirm
; how mrm treats a quoted operand)
mrm pstr, "This is a test"
; upper takes its string address in rcx; rcall presumably loads pstr there
rcall upper,pstr
; show the string after the in-place upper-case conversion
conout pstr,lf
; same string again, now converted in place by lower
rcall lower,pstr
conout pstr,lf
; presumably pauses until a key is pressed, then terminates the process
waitkey
.exit
entry_point endp
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; ---------------------------------------------------------------------------
; upper - convert a zero-terminated byte string to upper case, in place.
;
; In:     rcx = address of the string
; Out:    rax = address of the string's terminating zero byte
; Alters: rcx (0 on return), rdx (address of the lookup table), flags
;
; Each byte is used as an index into a 256-byte table holding 1 for the
; codes 97..122 ('a'..'z') and 0 for everything else; marked bytes have 32
; subtracted from them.
;
; NOTE(review): NOSTACKFRAME / STACKFRAME come from the SDK include and
; presumably switch the automatic prologue/epilogue off and back on.
; ---------------------------------------------------------------------------
NOSTACKFRAME
upper proc
    mov     rax, rcx                    ; rax = cursor into the string
    lea     rdx, table                  ; rdx = base of the classification table
    jmp     load_char                   ; first byte is examined before any advance

  make_upper:
    sub     BYTE PTR [rax], 32          ; 'a'..'z' -> 'A'..'Z'
  next_char:
    add     rax, 1
  load_char:
    movzx   ecx, BYTE PTR [rax]         ; rcx = current character code (zero-extended)
    cmp     BYTE PTR [rdx+rcx], 0       ; table entries are only ever 0 or 1
    jne     make_upper                  ; marked -> lower-case letter, convert it
    test    ecx, ecx                    ; terminator reached?
    jnz     next_char                   ; no -> keep scanning
    ret

    align 16
  table:
    db  97 dup (0)                      ; codes   0..96
    db  26 dup (1)                      ; codes  97..122  'a'..'z'
    db 133 dup (0)                      ; codes 123..255
upper endp
STACKFRAME
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
; ---------------------------------------------------------------------------
; lower - convert a zero-terminated byte string to lower case, in place.
;
; In:     rcx = address of the string
; Out:    rax = address of the string's terminating zero byte
; Alters: rcx (0 on return), rdx (address of the lookup table), flags
;
; Each byte is used as an index into a 256-byte table holding 1 for the
; codes 65..90 ('A'..'Z') and 0 for everything else; marked bytes have 32
; added to them.
;
; NOTE(review): NOSTACKFRAME / STACKFRAME come from the SDK include and
; presumably switch the automatic prologue/epilogue off and back on.
; ---------------------------------------------------------------------------
NOSTACKFRAME
lower proc
    mov     rax, rcx                    ; rax = cursor into the string
    lea     rdx, table1                 ; rdx = base of the classification table
    jmp     load_char                   ; first byte is examined before any advance

  make_lower:
    add     BYTE PTR [rax], 32          ; 'A'..'Z' -> 'a'..'z'
  next_char:
    add     rax, 1
  load_char:
    movzx   ecx, BYTE PTR [rax]         ; rcx = current character code (zero-extended)
    cmp     BYTE PTR [rdx+rcx], 0       ; table entries are only ever 0 or 1
    jne     make_lower                  ; marked -> upper-case letter, convert it
    test    ecx, ecx                    ; terminator reached?
    jnz     next_char                   ; no -> keep scanning
    ret

    align 16
  table1:
    db  65 dup (0)                      ; codes  0..64
    db  26 dup (1)                      ; codes 65..90   'A'..'Z'
    db 165 dup (0)                      ; codes 91..255
lower endp
STACKFRAME
; ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
end
I've hacked together some timings but cannot see a clear winner on my Core i5. Upper$() and Lower$() use the naive `and al, -33` (clear bit 5) and `or al, 32` (set bit 5) approach, respectively:
This code was assembled with ml64 in 64-bit format
@#%&$+*THIS IS A TEST STRING {123}[456] - 889 ms for Upper$()
@#%&$+*this is a test string {123}[456] - 1108 ms for Lower$()
@#%&$+*THIS IS A TEST STRING {123}[456] - 1123 ms for upper
@#%&$+*this is a test string {123}[456] - 1404 ms for lower
@#%&$+*THIS IS A TEST STRING {123}[456] - 889 ms for Upper$()
@#%&$+*this is a test string {123}[456] - 1108 ms for Lower$()
@#%&$+*THIS IS A TEST STRING {123}[456] - 858 ms for upper
@#%&$+*this is a test string {123}[456] - 1389 ms for lower
@#%&$+*THIS IS A TEST STRING {123}[456] - 889 ms for Upper$()
@#%&$+*this is a test string {123}[456] - 1092 ms for Lower$()
@#%&$+*THIS IS A TEST STRING {123}[456] - 1107 ms for upper
@#%&$+*this is a test string {123}[456] - 1389 ms for lower
@#%&$+*THIS IS A TEST STRING {123}[456] - 889 ms for Upper$()
@#%&$+*this is a test string {123}[456] - 1108 ms for Lower$()
@#%&$+*THIS IS A TEST STRING {123}[456] - 858 ms for upper
@#%&$+*this is a test string {123}[456] - 1419 ms for lower
@#%&$+*THIS IS A TEST STRING {123}[456] - 889 ms for Upper$()
@#%&$+*this is a test string {123}[456] - 1108 ms for Lower$()
@#%&$+*THIS IS A TEST STRING {123}[456] - 1092 ms for upper
@#%&$+*this is a test string {123}[456] - 1404 ms for lower
Same with OPT_64 0 (32-bit build):
This code was assembled with ML in 32-bit format
@#%&$+*THIS IS A TEST STRING {123}[456] - 889 ms for Upper$()
@#%&$+*this is a test string {123}[456] - 1092 ms for Lower$()
@#%&$+*THIS IS A TEST STRING {123}[456] - 874 ms for upper
@#%&$+*this is a test string {123}[456] - 1107 ms for lower
@#%&$+*THIS IS A TEST STRING {123}[456] - 889 ms for Upper$()
@#%&$+*this is a test string {123}[456] - 1092 ms for Lower$()
@#%&$+*THIS IS A TEST STRING {123}[456] - 874 ms for upper
@#%&$+*this is a test string {123}[456] - 874 ms for lower
@#%&$+*THIS IS A TEST STRING {123}[456] - 889 ms for Upper$()
@#%&$+*this is a test string {123}[456] - 1076 ms for Lower$()
@#%&$+*THIS IS A TEST STRING {123}[456] - 843 ms for upper
@#%&$+*this is a test string {123}[456] - 873 ms for lower
I think for a lot of these things that memory read/write speed is the limiting factor.
Maybe. Here are timings for a version (attached) that also uses CharUpperBuff:
This code was assembled with ml64 in 64-bit format
@#%&$+*THIS IS A TEST STRING {123}[456] - 1763 ms for CharUpperBuff
@#%&$+*THIS IS A TEST STRING {123}[456] - 296 ms for Upper$()
@#%&$+*THIS IS A TEST STRING {123}[456] - 468 ms for upper
@#%&$+*this is a test string {123}[456] - 359 ms for Lower$()
@#%&$+*this is a test string {123}[456] - 281 ms for lower
@#%&$+*THIS IS A TEST STRING {123}[456] - 1747 ms for CharUpperBuff
@#%&$+*THIS IS A TEST STRING {123}[456] - 281 ms for Upper$()
@#%&$+*THIS IS A TEST STRING {123}[456] - 452 ms for upper
@#%&$+*this is a test string {123}[456] - 359 ms for Lower$()
@#%&$+*this is a test string {123}[456] - 296 ms for lower
@#%&$+*THIS IS A TEST STRING {123}[456] - 1763 ms for CharUpperBuff
@#%&$+*THIS IS A TEST STRING {123}[456] - 297 ms for Upper$()
@#%&$+*THIS IS A TEST STRING {123}[456] - 452 ms for upper
@#%&$+*this is a test string {123}[456] - 374 ms for Lower$()
@#%&$+*this is a test string {123}[456] - 375 ms for lower
CharUpperBuff is horribly slow, probably because it uses a locale-dependent table. I have tried to think of how to use SIMD instructions here, but it's not so easy because only bytes inside the letter range may be changed. For example:
; Experiment: lower-case 16 bytes at once by OR-ing them with a constant mask.
; rdi = address of srcdest (not referenced again in the lines shown here)
mov rdi, offset srcdest
; xmm0 = 16 bytes loaded from spaces (presumably sixteen 20h bytes - its
; definition is not shown, confirm)
movups xmm0, spaces
; xmm1 = the first 16 bytes of src (unaligned load)
movups xmm1, OWORD ptr src
; OR all 16 bytes with the mask; there is no range test, so bytes that are
; not letters are modified too
orps xmm1, xmm0
; write the 16 result bytes to srcdest
movups OWORD ptr srcdest, xmm1
Output:
@#%&$+*This is A
`#%&$+*this is a
Almost perfect ;)