News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests

Main Menu

String Compare - Updated

Started by guga, January 28, 2013, 12:12:44 PM

Previous topic - Next topic

MichaelW

Quote from: jj2007 on January 29, 2013, 03:19:29 AM
Strange - I get this...

Yes, if I substitute your strings in my test app, then both of these comparisons return zero:

    invoke FastStringCompare, ADDR str1, ADDR str1
    invoke FastStringCompare, ADDR str2, ADDR str2


But if I add a space to the end of both strings, then both comparisons return 1, so the problem appears to be with the length of the strings.

And if I then comment out the align directives, both comparisons return zero. So the code is sensitive to the length and the alignment.

The above applies for my P3 and my P4.
Well Microsoft, here's another nice mess you've gotten us into.

MichaelW

#16
Quote from: guga on January 29, 2013, 09:19:10 AM
Have anyone knows if all compilers align their strings (or others data) always in a 4 byte boundary?

I would guess that mainstream compilers do. The Microsoft compilers appear to maintain a minimum 4-byte alignment for any string longer than the null terminator, but I'm not sure that my test data layout is correct for what I'm trying to determine.

#include <conio.h>
#include <stdio.h>
#include <stdlib.h>

int alignment( void *address )
{
    int rval;
    __asm
    {
      xor eax, eax
      mov ecx, address
      bsf ecx, ecx
      jz done
      mov eax, 1
      Shl eax, cl
    done:
      mov rval, eax
    }
    return rval;
}

char x6[6],x5[5],x4[4],x3[3],x2[2],x1[1],y1[1],y2[2],y3[3],y4[4],y5[5],y6[6];

void main()
{
  char i6[6],i5[5],i4[4],i3[3],i2[2],i1[1],j1[1],j2[2],j3[3],j4[4],j5[5],j6[6];

  printf("%d\n",alignment(i6));
  printf("%d\n",alignment(i5));
  printf("%d\n",alignment(i4));
  printf("%d\n",alignment(i3));
  printf("%d\n",alignment(i2));
  printf("%d\n",alignment(i1));
  printf("%d\n",alignment(j1));
  printf("%d\n",alignment(j2));
  printf("%d\n",alignment(j3));
  printf("%d\n",alignment(j4));
  printf("%d\n",alignment(j5));
  printf("%d\n\n",alignment(j6));

  printf("%d\n",alignment(x6));
  printf("%d\n",alignment(x5));
  printf("%d\n",alignment(x4));
  printf("%d\n",alignment(x3));
  printf("%d\n",alignment(x2));
  printf("%d\n",alignment(x1));
  printf("%d\n",alignment(y1));
  printf("%d\n",alignment(y2));
  printf("%d\n",alignment(y3));
  printf("%d\n",alignment(y4));
  printf("%d\n",alignment(y5));
  printf("%d\n\n",alignment(y6));

  getch();
}


8
64
16
8
4
1
1
4
8
32
4
8

16
32
4
128
4
1
2
8
4
8
8
8


; Listing generated by Microsoft (R) Optimizing Compiler Version 13.10.3077

TITLE test.c
.386P
include listing.inc
if @Version gt 510
.model FLAT
else
_TEXT SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT ENDS
_DATA SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA ENDS
CONST SEGMENT DWORD USE32 PUBLIC 'CONST'
CONST ENDS
_BSS SEGMENT DWORD USE32 PUBLIC 'BSS'
_BSS ENDS
$$SYMBOLS SEGMENT BYTE USE32 'DEBSYM'
$$SYMBOLS ENDS
_TLS SEGMENT DWORD USE32 PUBLIC 'TLS'
_TLS ENDS
FLAT GROUP _DATA, CONST, _BSS
ASSUME CS: FLAT, DS: FLAT, SS: FLAT
endif

INCLUDELIB LIBC
INCLUDELIB OLDNAMES

_DATA SEGMENT
COMM _x6:BYTE:06H
COMM _x5:BYTE:05H
COMM _x4:BYTE:04H
COMM _x3:BYTE:03H
COMM _x2:BYTE:02H
COMM _x1:BYTE
COMM _y1:BYTE
COMM _y2:BYTE:02H
COMM _y3:BYTE:03H
COMM _y4:BYTE:04H
COMM _y5:BYTE:05H
COMM _y6:BYTE:06H
$SG1250 DB '%d', 0aH, 00H
$SG1251 DB '%d', 0aH, 00H
$SG1252 DB '%d', 0aH, 00H
$SG1253 DB '%d', 0aH, 00H
$SG1254 DB '%d', 0aH, 00H
$SG1255 DB '%d', 0aH, 00H
$SG1256 DB '%d', 0aH, 00H
$SG1257 DB '%d', 0aH, 00H
$SG1258 DB '%d', 0aH, 00H
$SG1259 DB '%d', 0aH, 00H
$SG1260 DB '%d', 0aH, 00H
$SG1261 DB '%d', 0aH, 0aH, 00H
ORG $+3
$SG1262 DB '%d', 0aH, 00H
$SG1263 DB '%d', 0aH, 00H
$SG1264 DB '%d', 0aH, 00H
$SG1265 DB '%d', 0aH, 00H
$SG1266 DB '%d', 0aH, 00H
$SG1267 DB '%d', 0aH, 00H
$SG1268 DB '%d', 0aH, 00H
$SG1269 DB '%d', 0aH, 00H
$SG1270 DB '%d', 0aH, 00H
$SG1271 DB '%d', 0aH, 00H
$SG1272 DB '%d', 0aH, 00H
$SG1273 DB '%d', 0aH, 0aH, 00H
_DATA ENDS
PUBLIC _alignment
; Function compile flags: /Odt
_TEXT SEGMENT
_rval$ = -4 ; size = 4
_address$ = 8 ; size = 4
_alignment PROC NEAR
; File c:\program files\microsoft visual c++ toolkit 2003\my\alignment\test.c
; Line 6
push ebp
mov ebp, esp
push ecx
; Line 10
xor eax, eax
; Line 11
mov ecx, DWORD PTR _address$[ebp]
; Line 12
bsf ecx, ecx
; Line 13
je SHORT $done$1223
; Line 14
mov eax, 1
; Line 15
shl eax, cl
$done$1223:
; Line 17
mov DWORD PTR _rval$[ebp], eax
; Line 19
mov eax, DWORD PTR _rval$[ebp]
; Line 20
mov esp, ebp
pop ebp
ret 0
_alignment ENDP
_TEXT ENDS
PUBLIC _main
EXTRN _getch:NEAR
EXTRN _printf:NEAR
; Function compile flags: /Odt
_TEXT SEGMENT
_i2$ = -64 ; size = 2
_j6$ = -60 ; size = 6
_i4$ = -52 ; size = 4
_j1$ = -45 ; size = 1
_i3$ = -44 ; size = 3
_j2$ = -40 ; size = 2
_i5$ = -36 ; size = 5
_j3$ = -28 ; size = 3
_j5$ = -24 ; size = 5
_i1$ = -13 ; size = 1
_i6$ = -12 ; size = 6
_j4$ = -4 ; size = 4
_main PROC NEAR
; Line 25
push ebp
mov ebp, esp
sub esp, 64 ; 00000040H
; Line 28
lea eax, DWORD PTR _i6$[ebp]
push eax
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1250
call _printf
add esp, 8
; Line 29
lea ecx, DWORD PTR _i5$[ebp]
push ecx
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1251
call _printf
add esp, 8
; Line 30
lea edx, DWORD PTR _i4$[ebp]
push edx
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1252
call _printf
add esp, 8
; Line 31
lea eax, DWORD PTR _i3$[ebp]
push eax
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1253
call _printf
add esp, 8
; Line 32
lea ecx, DWORD PTR _i2$[ebp]
push ecx
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1254
call _printf
add esp, 8
; Line 33
lea edx, DWORD PTR _i1$[ebp]
push edx
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1255
call _printf
add esp, 8
; Line 34
lea eax, DWORD PTR _j1$[ebp]
push eax
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1256
call _printf
add esp, 8
; Line 35
lea ecx, DWORD PTR _j2$[ebp]
push ecx
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1257
call _printf
add esp, 8
; Line 36
lea edx, DWORD PTR _j3$[ebp]
push edx
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1258
call _printf
add esp, 8
; Line 37
lea eax, DWORD PTR _j4$[ebp]
push eax
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1259
call _printf
add esp, 8
; Line 38
lea ecx, DWORD PTR _j5$[ebp]
push ecx
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1260
call _printf
add esp, 8
; Line 39
lea edx, DWORD PTR _j6$[ebp]
push edx
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1261
call _printf
add esp, 8
; Line 42
push OFFSET FLAT:_x6
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1262
call _printf
add esp, 8
; Line 43
push OFFSET FLAT:_x5
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1263
call _printf
add esp, 8
; Line 44
push OFFSET FLAT:_x4
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1264
call _printf
add esp, 8
; Line 45
push OFFSET FLAT:_x3
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1265
call _printf
add esp, 8
; Line 46
push OFFSET FLAT:_x2
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1266
call _printf
add esp, 8
; Line 47
push OFFSET FLAT:_x1
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1267
call _printf
add esp, 8
; Line 48
push OFFSET FLAT:_y1
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1268
call _printf
add esp, 8
; Line 49
push OFFSET FLAT:_y2
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1269
call _printf
add esp, 8
; Line 50
push OFFSET FLAT:_y3
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1270
call _printf
add esp, 8
; Line 51
push OFFSET FLAT:_y4
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1271
call _printf
add esp, 8
; Line 52
push OFFSET FLAT:_y5
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1272
call _printf
add esp, 8
; Line 53
push OFFSET FLAT:_y6
call _alignment
add esp, 4
push eax
push OFFSET FLAT:$SG1273
call _printf
add esp, 8
; Line 55
call _getch
; Line 56
xor eax, eax
mov esp, ebp
pop ebp
ret 0
_main ENDP
_TEXT ENDS
END


Edit:

Analyzing this further, and noting in the listing that the compiler had changed the order of the local strings in main, but not the order of the COMM strings in the DATA section, I added this code:

  printf("%d\n",&x6);
  printf("%d\n",&x5);
  printf("%d\n",&x4);
  printf("%d\n",&x3);
  printf("%d\n",&x2);
  printf("%d\n",&x1);
  printf("%d\n",&y1);
  printf("%d\n",&y2);
  printf("%d\n",&y3);
  printf("%d\n",&y4);
  printf("%d\n",&y5);
  printf("%d\n\n",&y6);


And got this output:

4233648
4233632
4233628
4233664
4233668
4233677
4233662
4233624
4233644
4233640
4233672
4233656


While I cannot readily equate the layout of the addresses to the compiler's layout of the local strings, the data has obviously been shuffled. So the question is, who did the shuffling? And the answer from the MASM 6.0 Programmer's Guide, under Selecting Data-Sharing Methods, is:
Quote
As an alternative, you can use the COMM directive instead of PUBLIC and EXTERN. However, communal variables have some limitations. You cannot depend on their location in memory because they are allocated by the linker, and they cannot be initialized.
Well Microsoft, here's another nice mess you've gotten us into.

guga

Ok, guys, i guess i finished with string alignment. Didn´t tested the performance. So feel free to test it.

On the strings i tested the result is accurate. (0 if don´t match, 1 if match) - If this new version is fast i´ll implement the return 0-1 on it.


Proc FastStringCompare:
    Arguments @String1, @String2
    Uses esi, ecx, ebx, edx

    mov esi D@String1
    mov ecx 0
    mov eax 0
    mov ebx D@String2

    mov edx D$esi
    cmp edx D$ebx   ; just to force the zero flags. if we have a match zero flag is settled. But we need to check if the last byte at string1 is zero.

jz L0> ; The 1st 4 bytes are not zero, jmp over

        mov ecx 3
        shr edx 8 | jz L1> ; If string is only 1 byte len. ecx = 3
        inc ecx | jmp L1> ; else, ecx = 4

    L0:
        cmp dh 0 | jz L1> ; the last byte of the 1st string is zero. We reached the end of the search. Contunue computing the remaining bytes at "L1"
        lea ecx D$ecx+4 ; compute the len of the string1
        mov edx D$esi+ecx
        cmp D$ebx+ecx edx
        jz L0<

        jmp L3>> ; strings definitelly dont match

L1:

        lea ebx D$ebx+ecx | mov edx D$ebx-3
        lea esi D$esi+ecx
        cmp edx D$esi-3 ; if both words on string1 and string2 don´ match jmp over. Otherwise, continue analysing the remaining bytes
        jnz L3>>
        ; We are almost there. If ecx = 0, means that the string1 have only 2 byte len. So let´t search for the 2nd byte of string2
;;
    ; so far all bytes are equal. Let´t see if teh string2 is null terminated as it is on string1. Ex: str1) "guga1" | str2) "guga1" Or if it is str1) "guga1" | str2) "gugaM"...
    ; On the dword, we need to check the byte after the dx like: 00XX_0000
    If B$ebx-1 = 0
        inc eax
    End_If

    Note for me, i commented it out, because the above assumption is wrong. In fact, when we reach here we have analysed all the string
    and both are exactly the same.
;;

    ; In mine tests no matter what bytes gets here, because it always means that we fully analysed the strings, and they always matches
    inc eax

L3:

EndP


Tested strings:


    call FastStringCompare {"1", 0}, {"g", 0}
    call FastStringCompare {"g", 0}, {"g", 0}
    call FastStringCompare {"gu", 0}, {"gu", 0}
    call FastStringCompare {"gug", 0}, {"gug", 0}
    call FastStringCompare {"guga", 0}, {"guga", 0}
    call FastStringCompare {"guga1", 0}, {"guga1", 0}
    call FastStringCompare {"guga12", 0}, {"guga12", 0}
    call FastStringCompare {"guga123", 0}, {"guga123", 0}
    call FastStringCompare {"guga1234", 0}, {"guga1234", 0}
    call FastStringCompare {"guga12345", 0}, {"guga12345", 0}
    call FastStringCompare {"guga123456", 0}, {"guga123456", 0}
   
    call FastStringCompare {"1234", 0}, {"5678", 0}
    call FastStringCompare {"1", 0}, {"g", 0}
    call FastStringCompare {"gustavo trigu", 0}, {"gustavo trigueiros e guilherme", 0}
    call FastStringCompare {"g", 0}, {"gu", 0}
    call FastStringCompare {"gu", 0}, {"gu", 0}
    call FastStringCompare {"gu", 0}, {"guu", 0}
    call FastStringCompare {"guuu", 0}, {"guu", 0}
    call FastStringCompare {"guu", 0}, {"guuu", 0}
    call FastStringCompare {"gug", 0}, {"guga", 0}
    call FastStringCompare {"g44564", 0}, {"112345g", 0}
    call FastStringCompare {"gu", 0}, {"g", 0}
    call FastStringCompare {"gu", 0}, {"gu", 0}
    call FastStringCompare {"g", 0}, {"gu", 0}
    call FastStringCompare {"g", 0}, {"g", 0}
    call FastStringCompare {"1", 0}, {"g", 0}

        call FastStringCompare {"g1stavo trigueiros e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gu1tavo trigueiros e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gus1avo trigueiros e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gust1vo trigueiros e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gusta1o trigueiros e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustav1 trigueiros e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo1trigueiros e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo 1rigueiros e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e guilherm1", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e guilher1e", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e guilhe1me", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e guilh1rme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e guil1erme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e gui1herme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e gu1lherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e g1ilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e 1uilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e1guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros 1 guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros1e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiro1 e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueir1s e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}

        call FastStringCompare {"guga", 0}, {"guga1", 0}
        call FastStringCompare {"guga1", 0}, {"guga", 0}
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

guga

The preliminary results i´ve got is
Not sure if the result is accurated because i wasn´t be able to assemble the masm app from JJ2007, so i just disassembled it, inserted my code and reassembled it back. But comparing the older results i guess it is faster then my previous version.

Intel(R) Core(TM) i7 CPU         870  @ 2.93GHz (SE4)
loop overhead is approx. 189/100 cycles


7185    cycles for 100 * crt_strcmp
2060    cycles for 100 * FastStringCompare
2058    cycles for 100 * MasmBasic StringsDiffer
3325    cycles for 100 * STRCMP$

7211    cycles for 100 * crt_strcmp
2057    cycles for 100 * FastStringCompare
2059    cycles for 100 * MasmBasic StringsDiffer
3216    cycles for 100 * STRCMP$

7177    cycles for 100 * crt_strcmp
2058    cycles for 100 * FastStringCompare
2062    cycles for 100 * MasmBasic StringsDiffer
3219    cycles for 100 * STRCMP$

7178    cycles for 100 * crt_strcmp
2057    cycles for 100 * FastStringCompare
2059    cycles for 100 * MasmBasic StringsDiffer
3309    cycles for 100 * STRCMP$

7179    cycles for 100 * crt_strcmp
2208    cycles for 100 * FastStringCompare
2073    cycles for 100 * MasmBasic StringsDiffer
3222    cycles for 100 * STRCMP$

-1      = eax crt_strcmp
0       = eax FastStringCompare
-23     = eax MasmBasic StringsDiffer
-23     = eax STRCMP$

--- ok ---
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

jj2007

Quote from: guga on January 29, 2013, 09:19:10 AMDoes anyone know if all compilers align their strings (or other data) always on a 4 byte boundary ?

Freshly created strings are likely to be 8 byte aligned, due to HeapAlloc. But a library function must be able to handle any alignment, e.g. when you pass a pointer to a substring. There shouldn't be any problem, though, unless you use SSE2. The MasmBasic StringsDiffer() does indeed use SSE2 and has some overhead, which makes it relatively slow for very short strings.

Here are two examples for a somewhat longer string, showing the influence of alignment:

Str1   db "gustavo trigueiros e guilherme alias 'guga' is today the brain behind RosAsm, one of the most powerful assemblers ever developedA", 0
Str2   db "gustavo ...X", 0


align 4 for both strings:
AMD Athlon(tm) Dual Core Processor 4450B (SSE3)
++++++++++++++++++++
2894    cycles for 10 * crt_strcmp
2315    cycles for 10 * FastStringCompare
1186    cycles for 10 * MasmBasic StringsDiffer
1964    cycles for 10 * STRCMP$


align 16 for both strings:
AMD Athlon(tm) Dual Core Processor 4450B (SSE3)
++++++++++++++++++++
2892    cycles for 10 * crt_strcmp
2309    cycles for 10 * FastStringCompare
969     cycles for 10 * MasmBasic StringsDiffer
1954    cycles for 10 * STRCMP$


guga

Hi JJ, thanks for the nice words :)

I built a new version (using alignment) and just made a new improve:

Proc FastStringCompare:
    Arguments @String1, @String2
    Uses esi, ecx, ebx, edx

    mov esi D@String1
    mov ecx 0
    mov eax 0
    mov ebx D@String2

    mov edx D$esi
    cmp edx D$ebx   ; just to force the zero flags. if we have a match zero flag is settled. But we need to check if the last byte at string1 is zero.

jz L0> ; The 1st 4 bytes are not zero, jmp over

        mov ecx 3
        shr edx 8 | jz L1> ; If string is only 1 byte len. ecx = 3
        inc ecx | jmp L1> ; else, ecx = 4

    L0:
        test dh dh | jz L1> ; the last byte of the 1st string is zero. We reached the end of the search. Contunue computing the remaining bytes at "L1"
        lea ecx D$ecx+4 ; compute the len of the string1
        mov edx D$esi+ecx
        cmp D$ebx+ecx edx
        jz L0<

        jmp L3>> ; strings definitelly dont match

L1:

        lea ebx D$ebx+ecx | mov edx D$ebx-3
        lea esi D$esi+ecx
        cmp D$esi-3 edx; if both words on string1 and string2 don´ match jmp over. Otherwise, continue analysing the remaining bytes
        jnz L3>>
        ; We are almost there. If ecx = 0, means that the string1 have only 2 byte len. So let´t search for the 2nd byte of string2
;;
    ; so far all bytes are equal. Let´t see if teh string2 is null terminated as it is on string1. Ex: str1) "guga1" | str2) "guga1" Or if it is str1) "guga1" | str2) "gugaM"...
    ; On the dword, we need to check the byte after the dx like: 00XX_0000
    If B$ebx-1 = 0
        inc eax
    End_If

    Note for me, i commented it out, because the above assumption is wrong. In fact, when we reach here we have analysed all the string
    and both are exactly the same.
;;

    ; In mine tests no matter what bytes gets here, because it always means that we fully analysed the strings, and they always matches
    inc eax


L3:

EndP


Using the same strings as the post above i got this results:
Str1   db "gustavo trigueiros e guilherme alias 'guga' is today the brain behind RosAsm, one of the most powerful assemblers ever developedA", 0
Str2   db "gustavo ...X", 0

Intel(R) Core(TM) i7 CPU         870  @ 2.93GHz (£E4)
loop overhead is approx. 181/100 cycles

2103 cycles for 100 * crt_strcmp
1087 cycles for 100 * FastStringCompare
1774 cycles for 100 * MasmBasic StringsDiffer
1599 cycles for 100 * STRCMP$

2100 cycles for 100 * crt_strcmp
1091 cycles for 100 * FastStringCompare
1768 cycles for 100 * MasmBasic StringsDiffer
1593 cycles for 100 * STRCMP$

2479 cycles for 100 * crt_strcmp
1559 cycles for 100 * FastStringCompare
1849 cycles for 100 * MasmBasic StringsDiffer
1698 cycles for 100 * STRCMP$

2182 cycles for 100 * crt_strcmp
1087 cycles for 100 * FastStringCompare
1781 cycles for 100 * MasmBasic StringsDiffer
1607 cycles for 100 * STRCMP$

2109 cycles for 100 * crt_strcmp
1090 cycles for 100 * FastStringCompare
1774 cycles for 100 * MasmBasic StringsDiffer
1592 cycles for 100 * STRCMP$

1 = eax crt_strcmp
0 = eax FastStringCompare
70 = eax MasmBasic StringsDiffer
70 = eax STRCMP$

--- ok ---


And this results using
Str1   db "gustavo trigueiros e guilhermeA", 0
Str2   db "gustavo trigueiros e guilhermeX", 0


Intel(R) Core(TM) i7 CPU         870  @ 2.93GHz (SE4)
loop overhead is approx. 181/100 cycles


7380 cycles for 100 * crt_strcmp
2048 cycles for 100 * FastStringCompare
2142 cycles for 100 * MasmBasic StringsDiffer
3322 cycles for 100 * STRCMP$

7889 cycles for 100 * crt_strcmp
2052 cycles for 100 * FastStringCompare
2136 cycles for 100 * MasmBasic StringsDiffer
3326 cycles for 100 * STRCMP$

7370 cycles for 100 * crt_strcmp
2361 cycles for 100 * FastStringCompare
2128 cycles for 100 * MasmBasic StringsDiffer
3334 cycles for 100 * STRCMP$

7378 cycles for 100 * crt_strcmp
2052 cycles for 100 * FastStringCompare
2132 cycles for 100 * MasmBasic StringsDiffer
3708 cycles for 100 * STRCMP$

7392 cycles for 100 * crt_strcmp
2061 cycles for 100 * FastStringCompare
2106 cycles for 100 * MasmBasic StringsDiffer
3306 cycles for 100 * STRCMP$

-1 = eax crt_strcmp
0 = eax FastStringCompare
-23 = eax MasmBasic StringsDiffer
-23 = eax STRCMP$

--- ok ---


I´m not sure if it can be even faster, but i´ll give it a try. The important (at least for me) is not only speed, but accuracy in the results.
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

Ficko

Quote
Freshly created strings are likely to be 8 byte aligned, due to HeapAlloc. But a library function must be able to handle any alignment, e.g. when you pass a pointer to a substring.

That's correct. Handling unaligned data is very important.

From Guga:
Quote
Or even worst, what if the string ends on the end of a memory chunck ?

That's right though very unlikely it can happen.
If you request 1 byte trough "HeapAlloc" you will get at least 4 bytes back - if not 8 - but it is not documented by MS as such.
So what I am doing is to wrap the "HeapAlloc" API and add 3 bytes to any heap request if using 32-bit memory access or add 16 bytes if using SSE.

It is a good practice if you use SSE a lot.
Like some of us do.
I use almost exclusivelly SSE for strlen and strcpy because they are so much faster.

Like Dave sad above somewhere it is not allways possible to have both string compared as aligned.
What I am doing it to access at least one string as aligned and the other one by chance. So I get at least 50% alignment on both strings - worst case scenario -



Ficko

And BTW the above test would be more accurate if it would generate a table of random lenght and aligned strings and use that as input. IMO

dedndave

yah - you can't depend on the strings having any particular alignment
probably the best approach is to test all the alignment combinations in each pass of the timing test
then divide the result by the number of tests
that way, you optimize your code for overall performance

        ALIGN   4
szTest0 db 'String',0
szTestA db 'Strinx',0

        ALIGN   4
        db 0
szTest1 db 'String',0
szTestB db 'Strinx',0

        ALIGN   4
        db 0,0
szTest2 db 'String',0
szTestC db 'Strinx',0

        ALIGN   4
        db 0,0,0
szTest3 db 'String',0
szTestD db 'Strinx',0


compilers may try to align strings
but, they often wind up being partials of some previous string
or in a buffer that is not aligned, etc
or - perhaps you are comparing strings, starting at some unaligned offset

strings are byte entities in a dword world   :P
because aligned SSE is probably the fastest way, it's a 16-byte world

guga

On the updated version, the alignment really dont seems to be any problem. I tried to updated it to handle both aligned and unaligned strings, without having to use test, cmp mnemonics for each different alignment.

Also, i just did a small improve on the code and aligned the code itself (lea and some cmp mnemonics) and the result is really impressive. It seems to be faster then JJ2007 SSE version.

Proc FastStringCompare:
    Arguments @String1, @String2
    Uses esi, ecx, ebx, edx

    mov esi D@String1
    mov ebx D@String2
    mov eax 0
    mov ecx 4
    mov edx D$esi
    cmp edx D$ebx   ; just to force the zero flags. if we have a match zero flag is settled. But we need to check if the last byte at string1 is zero.
jz L0> ; The 1st 4 bytes are not zero, jmp over

        mov ecx 3
        shr edx 8 | jz L1> ; If string is only 1 byte len. ecx = 3
        inc ecx | jmp L1> ; else, ecx = 4

    L0:
        test dh dh | jz L1> ; the last byte of the 1st string is zero. We reached the end of the search. Contunue computing the remaining bytes at "L1"
        lea ecx D$ecx+4 ; compute the len of the string1
Align 4 ; trying to align the code to avoid pipeline/stall problems, and get maximum results. In this case, it don´t insert any nop operation. This line can be commented out

        mov edx D$esi+ecx
        cmp D$ebx+ecx edx
        jz L0<

        jmp L3>> ; strings definitelly dont match

L1:
        lea ebx D$ebx+ecx
        Align 4 ; trying to align the code to avoid pipeline/stall problems, and get maximum results. In this case, it don´t insert any nop operation. This line can be commented out
        mov edx D$ebx-3
        lea esi D$esi+ecx
        Align 4 ; trying to align the code to avoid pipeline/stall problems, and get maximum results. In this case, it insert 2 nops operations
        cmp D$esi-3 edx; if both words on string1 and string2 don´ match jmp over. Otherwise, continue analysing the remaining bytes
         Align 4; trying to align the code to avoid pipeline/stall problems, and get maximum results. In this case, it insert 1 nop operations
        jnz L3>>

    ; In mine tests no matter what bytes gets here, because it always means that we fully analysed the strings, and they always matches
    inc eax


L3:

EndP




Results

Intel(R) Core(TM) i7 CPU         870  @ 2.93GHz (SE4)
loop overhead is approx. 189/100 cycles


7327 cycles for 100 * crt_strcmp
1885 cycles for 100 * FastStringCompare
2461 cycles for 100 * MasmBasic StringsDiffer
3291 cycles for 100 * STRCMP$

8410 cycles for 100 * crt_strcmp
1903 cycles for 100 * FastStringCompare
2127 cycles for 100 * MasmBasic StringsDiffer
3279 cycles for 100 * STRCMP$

7286 cycles for 100 * crt_strcmp
1847 cycles for 100 * FastStringCompare
2115 cycles for 100 * MasmBasic StringsDiffer
3278 cycles for 100 * STRCMP$

7304 cycles for 100 * crt_strcmp
1847 cycles for 100 * FastStringCompare
2094 cycles for 100 * MasmBasic StringsDiffer
3302 cycles for 100 * STRCMP$




At night i´ll try to see if this can be improved a little bit more.
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

jj2007

Interesting, guga. Can you post the exe (and source), so that we can test it?

guga

Hi JJ

OK, i´ll post in a couple of minutes (or 1 or 2 hours) before i finish my testings for speed and reliability. So far i fixed a small mistake i made that was generating the previous report. On unaligned strings it was missing 1 single byte during the check causing the report to decrease a few ms, and the code miss 1 byte unaligned strings.

Now, it seems to be ok, i modified the code and it seems to handle all strings (aligned and unaligned). So far, the result is:


Intel(R) Core(TM) i7 CPU         870  @ 2.93GHz (SE4)
loop overhead is approx. 208/100 cycles


7589    cycles for 100 * crt_strcmp
2208    cycles for 100 * FastStringCompare
2539    cycles for 100 * MasmBasic StringsDiffer
2950    cycles for 100 * STRCMP$

7593    cycles for 100 * crt_strcmp
2223    cycles for 100 * FastStringCompare
2544    cycles for 100 * MasmBasic StringsDiffer
2946    cycles for 100 * STRCMP$

7578    cycles for 100 * crt_strcmp
2206    cycles for 100 * FastStringCompare
2545    cycles for 100 * MasmBasic StringsDiffer
2947    cycles for 100 * STRCMP$

7575    cycles for 100 * crt_strcmp
2209    cycles for 100 * FastStringCompare
2545    cycles for 100 * MasmBasic StringsDiffer
2947    cycles for 100 * STRCMP$

7658    cycles for 100 * crt_strcmp
2214    cycles for 100 * FastStringCompare
2544    cycles for 100 * MasmBasic StringsDiffer
2947    cycles for 100 * STRCMP$

0       = eax crt_strcmp
1       = eax FastStringCompare
0       = eax MasmBasic StringsDiffer
0       = eax STRCMP$

--- ok ---


And on my tests it didn´t missed unaligned bytes anylonger. But, i´ll make a couple of more tests before i post the source,k since i´m focusing in accuracy and then speed.

I´ll upload it as soon as possible. Hold on :)
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com