String Compare - Updated - Page 2

Welcome to The MASM Forum.
Log in
Sign up

July 01, 2025, 02:13:26 PM

News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests
NB: Posting URL's See here: Posted URL Change

Main Menu

Home

String Compare - Updated

Started by guga, January 28, 2013, 12:12:44 PM

Previous topic - Next topic

Print

Go Down Pages 1 2

MichaelW

Member
Posts: 1,049
Logged

Re: String Compare

#15

January 29, 2013, 11:16:06 AM

Quote from: jj2007 on January 29, 2013, 03:19:29 AM
Strange - I get this...

Yes, if I substitute your strings in my test app, then both of these comparisons return zero:

Code Select


    invoke FastStringCompare, ADDR str1, ADDR str1
    invoke FastStringCompare, ADDR str2, ADDR str2

But if I add a space to the end of both strings, then both comparisons return 1, so the problem appears to be with the length of the strings.

And if I then comment out the align directives, both comparisons return zero. So the code is sensitive to the length and the alignment.

The above applies for my P3 and my P4.

Well Microsoft, here's another nice mess you've gotten us into.

MichaelW

Member
Posts: 1,049
Logged

Re: String Compare

#16

January 29, 2013, 11:59:49 AM Last Edit: January 30, 2013, 05:51:20 AM by MichaelW

Quote from: guga on January 29, 2013, 09:19:10 AM
Have anyone knows if all compilers align their strings (or others data) always in a 4 byte boundary?

I would guess that mainstream compilers do. The Microsoft compilers appear to maintain a minimum 4-byte alignment for any string longer than the null terminator, but I'm not sure that my test data layout is correct for what I'm trying to determine.

Code Select


#include <conio.h>
#include <stdio.h>
#include <stdlib.h>

int alignment( void *address )
{
    int rval;
    __asm
    {
      xor eax, eax
      mov ecx, address
      bsf ecx, ecx
      jz done
      mov eax, 1
      Shl eax, cl
    done:
      mov rval, eax
    }
    return rval;
}

char x6[6],x5[5],x4[4],x3[3],x2[2],x1[1],y1[1],y2[2],y3[3],y4[4],y5[5],y6[6];

void main()
{
  char i6[6],i5[5],i4[4],i3[3],i2[2],i1[1],j1[1],j2[2],j3[3],j4[4],j5[5],j6[6];

  printf("%d\n",alignment(i6));
  printf("%d\n",alignment(i5));
  printf("%d\n",alignment(i4));
  printf("%d\n",alignment(i3));
  printf("%d\n",alignment(i2));
  printf("%d\n",alignment(i1));
  printf("%d\n",alignment(j1));
  printf("%d\n",alignment(j2));
  printf("%d\n",alignment(j3));
  printf("%d\n",alignment(j4));
  printf("%d\n",alignment(j5));
  printf("%d\n\n",alignment(j6));

  printf("%d\n",alignment(x6));
  printf("%d\n",alignment(x5));
  printf("%d\n",alignment(x4));
  printf("%d\n",alignment(x3));
  printf("%d\n",alignment(x2));
  printf("%d\n",alignment(x1));
  printf("%d\n",alignment(y1));
  printf("%d\n",alignment(y2));
  printf("%d\n",alignment(y3));
  printf("%d\n",alignment(y4));
  printf("%d\n",alignment(y5));
  printf("%d\n\n",alignment(y6));

  getch();
}

Code Select

Code Select


; Listing generated by Microsoft (R) Optimizing Compiler Version 13.10.3077 

	TITLE	test.c
	.386P
include listing.inc
if @Version gt 510
.model FLAT
else
_TEXT	SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT	ENDS
_DATA	SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA	ENDS
CONST	SEGMENT DWORD USE32 PUBLIC 'CONST'
CONST	ENDS
_BSS	SEGMENT DWORD USE32 PUBLIC 'BSS'
_BSS	ENDS
$$SYMBOLS	SEGMENT BYTE USE32 'DEBSYM'
$$SYMBOLS	ENDS
_TLS	SEGMENT DWORD USE32 PUBLIC 'TLS'
_TLS	ENDS
FLAT	GROUP _DATA, CONST, _BSS
	ASSUME	CS: FLAT, DS: FLAT, SS: FLAT
endif

INCLUDELIB LIBC
INCLUDELIB OLDNAMES

_DATA	SEGMENT
COMM	_x6:BYTE:06H
COMM	_x5:BYTE:05H
COMM	_x4:BYTE:04H
COMM	_x3:BYTE:03H
COMM	_x2:BYTE:02H
COMM	_x1:BYTE
COMM	_y1:BYTE
COMM	_y2:BYTE:02H
COMM	_y3:BYTE:03H
COMM	_y4:BYTE:04H
COMM	_y5:BYTE:05H
COMM	_y6:BYTE:06H
$SG1250	DB	'%d', 0aH, 00H
$SG1251	DB	'%d', 0aH, 00H
$SG1252	DB	'%d', 0aH, 00H
$SG1253	DB	'%d', 0aH, 00H
$SG1254	DB	'%d', 0aH, 00H
$SG1255	DB	'%d', 0aH, 00H
$SG1256	DB	'%d', 0aH, 00H
$SG1257	DB	'%d', 0aH, 00H
$SG1258	DB	'%d', 0aH, 00H
$SG1259	DB	'%d', 0aH, 00H
$SG1260	DB	'%d', 0aH, 00H
$SG1261	DB	'%d', 0aH, 0aH, 00H
	ORG $+3
$SG1262	DB	'%d', 0aH, 00H
$SG1263	DB	'%d', 0aH, 00H
$SG1264	DB	'%d', 0aH, 00H
$SG1265	DB	'%d', 0aH, 00H
$SG1266	DB	'%d', 0aH, 00H
$SG1267	DB	'%d', 0aH, 00H
$SG1268	DB	'%d', 0aH, 00H
$SG1269	DB	'%d', 0aH, 00H
$SG1270	DB	'%d', 0aH, 00H
$SG1271	DB	'%d', 0aH, 00H
$SG1272	DB	'%d', 0aH, 00H
$SG1273	DB	'%d', 0aH, 0aH, 00H
_DATA	ENDS
PUBLIC	_alignment
; Function compile flags: /Odt
_TEXT	SEGMENT
_rval$ = -4						; size = 4
_address$ = 8						; size = 4
_alignment PROC NEAR
; File c:\program files\microsoft visual c++ toolkit 2003\my\alignment\test.c
; Line 6
	push	ebp
	mov	ebp, esp
	push	ecx
; Line 10
	xor	eax, eax
; Line 11
	mov	ecx, DWORD PTR _address$[ebp]
; Line 12
	bsf	ecx, ecx
; Line 13
	je	SHORT $done$1223
; Line 14
	mov	eax, 1
; Line 15
	shl	eax, cl
$done$1223:
; Line 17
	mov	DWORD PTR _rval$[ebp], eax
; Line 19
	mov	eax, DWORD PTR _rval$[ebp]
; Line 20
	mov	esp, ebp
	pop	ebp
	ret	0
_alignment ENDP
_TEXT	ENDS
PUBLIC	_main
EXTRN	_getch:NEAR
EXTRN	_printf:NEAR
; Function compile flags: /Odt
_TEXT	SEGMENT
_i2$ = -64						; size = 2
_j6$ = -60						; size = 6
_i4$ = -52						; size = 4
_j1$ = -45						; size = 1
_i3$ = -44						; size = 3
_j2$ = -40						; size = 2
_i5$ = -36						; size = 5
_j3$ = -28						; size = 3
_j5$ = -24						; size = 5
_i1$ = -13						; size = 1
_i6$ = -12						; size = 6
_j4$ = -4						; size = 4
_main	PROC NEAR
; Line 25
	push	ebp
	mov	ebp, esp
	sub	esp, 64					; 00000040H
; Line 28
	lea	eax, DWORD PTR _i6$[ebp]
	push	eax
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1250
	call	_printf
	add	esp, 8
; Line 29
	lea	ecx, DWORD PTR _i5$[ebp]
	push	ecx
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1251
	call	_printf
	add	esp, 8
; Line 30
	lea	edx, DWORD PTR _i4$[ebp]
	push	edx
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1252
	call	_printf
	add	esp, 8
; Line 31
	lea	eax, DWORD PTR _i3$[ebp]
	push	eax
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1253
	call	_printf
	add	esp, 8
; Line 32
	lea	ecx, DWORD PTR _i2$[ebp]
	push	ecx
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1254
	call	_printf
	add	esp, 8
; Line 33
	lea	edx, DWORD PTR _i1$[ebp]
	push	edx
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1255
	call	_printf
	add	esp, 8
; Line 34
	lea	eax, DWORD PTR _j1$[ebp]
	push	eax
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1256
	call	_printf
	add	esp, 8
; Line 35
	lea	ecx, DWORD PTR _j2$[ebp]
	push	ecx
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1257
	call	_printf
	add	esp, 8
; Line 36
	lea	edx, DWORD PTR _j3$[ebp]
	push	edx
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1258
	call	_printf
	add	esp, 8
; Line 37
	lea	eax, DWORD PTR _j4$[ebp]
	push	eax
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1259
	call	_printf
	add	esp, 8
; Line 38
	lea	ecx, DWORD PTR _j5$[ebp]
	push	ecx
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1260
	call	_printf
	add	esp, 8
; Line 39
	lea	edx, DWORD PTR _j6$[ebp]
	push	edx
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1261
	call	_printf
	add	esp, 8
; Line 42
	push	OFFSET FLAT:_x6
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1262
	call	_printf
	add	esp, 8
; Line 43
	push	OFFSET FLAT:_x5
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1263
	call	_printf
	add	esp, 8
; Line 44
	push	OFFSET FLAT:_x4
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1264
	call	_printf
	add	esp, 8
; Line 45
	push	OFFSET FLAT:_x3
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1265
	call	_printf
	add	esp, 8
; Line 46
	push	OFFSET FLAT:_x2
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1266
	call	_printf
	add	esp, 8
; Line 47
	push	OFFSET FLAT:_x1
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1267
	call	_printf
	add	esp, 8
; Line 48
	push	OFFSET FLAT:_y1
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1268
	call	_printf
	add	esp, 8
; Line 49
	push	OFFSET FLAT:_y2
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1269
	call	_printf
	add	esp, 8
; Line 50
	push	OFFSET FLAT:_y3
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1270
	call	_printf
	add	esp, 8
; Line 51
	push	OFFSET FLAT:_y4
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1271
	call	_printf
	add	esp, 8
; Line 52
	push	OFFSET FLAT:_y5
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1272
	call	_printf
	add	esp, 8
; Line 53
	push	OFFSET FLAT:_y6
	call	_alignment
	add	esp, 4
	push	eax
	push	OFFSET FLAT:$SG1273
	call	_printf
	add	esp, 8
; Line 55
	call	_getch
; Line 56
	xor	eax, eax
	mov	esp, ebp
	pop	ebp
	ret	0
_main	ENDP
_TEXT	ENDS
END

Edit:

Analyzing this further, and noting in the listing that the compiler had changed the order of the local strings in main, but not the order of the COMM strings in the DATA section, I added this code:

Code Select


  printf("%d\n",&x6);
  printf("%d\n",&x5);
  printf("%d\n",&x4);
  printf("%d\n",&x3);
  printf("%d\n",&x2);
  printf("%d\n",&x1);
  printf("%d\n",&y1);
  printf("%d\n",&y2);
  printf("%d\n",&y3);
  printf("%d\n",&y4);
  printf("%d\n",&y5);
  printf("%d\n\n",&y6);

And got this output:

Code Select

While I cannot readily equate the layout of the addresses to the compiler's layout of the local strings, the data has obviously been shuffled. So the question is, who did the shuffling? And the answer from the MASM 6.0 Programmer's Guide, under Selecting Data-Sharing Methods, is:

Quote
As an alternative, you can use the COMM directive instead of PUBLIC and EXTERN. However, communal variables have some limitations. You cannot depend on their location in memory because they are allocated by the linker, and they cannot be initialized.

Well Microsoft, here's another nice mess you've gotten us into.

guga

Moderator
Member
Posts: 1,760
Assembly is a state of art.
Location: Brazil
Logged

Re: String Compare

#17

January 29, 2013, 05:10:41 PM

Ok, guys, i guess i finished with string alignment. Didn´t tested the performance. So feel free to test it.

On the strings i tested the result is accurate. (0 if don´t match, 1 if match) - If this new version is fast i´ll implement the return 0-1 on it.

Code Select


Proc FastStringCompare:
    Arguments @String1, @String2
    Uses esi, ecx, ebx, edx

    mov esi D@String1
    mov ecx 0
    mov eax 0
    mov ebx D@String2

    mov edx D$esi
    cmp edx D$ebx   ; just to force the zero flags. if we have a match zero flag is settled. But we need to check if the last byte at string1 is zero.

jz L0> ; The 1st 4 bytes are not zero, jmp over

        mov ecx 3
        shr edx 8 | jz L1> ; If string is only 1 byte len. ecx = 3
        inc ecx | jmp L1> ; else, ecx = 4

    L0:
        cmp dh 0 | jz L1> ; the last byte of the 1st string is zero. We reached the end of the search. Contunue computing the remaining bytes at "L1"
        lea ecx D$ecx+4 ; compute the len of the string1
        mov edx D$esi+ecx
        cmp D$ebx+ecx edx
        jz L0<

        jmp L3>> ; strings definitelly dont match

L1:

        lea ebx D$ebx+ecx | mov edx D$ebx-3
        lea esi D$esi+ecx
        cmp edx D$esi-3 ; if both words on string1 and string2 don´ match jmp over. Otherwise, continue analysing the remaining bytes
        jnz L3>>
        ; We are almost there. If ecx = 0, means that the string1 have only 2 byte len. So let´t search for the 2nd byte of string2
;;
    ; so far all bytes are equal. Let´t see if teh string2 is null terminated as it is on string1. Ex: str1) "guga1" | str2) "guga1" Or if it is str1) "guga1" | str2) "gugaM"...
    ; On the dword, we need to check the byte after the dx like: 00XX_0000
    If B$ebx-1 = 0
        inc eax
    End_If

    Note for me, i commented it out, because the above assumption is wrong. In fact, when we reach here we have analysed all the string
    and both are exactly the same.
;;

    ; In mine tests no matter what bytes gets here, because it always means that we fully analysed the strings, and they always matches
    inc eax

L3:

EndP

Tested strings:

Code Select



    call FastStringCompare {"1", 0}, {"g", 0}
    call FastStringCompare {"g", 0}, {"g", 0}
    call FastStringCompare {"gu", 0}, {"gu", 0}
    call FastStringCompare {"gug", 0}, {"gug", 0}
    call FastStringCompare {"guga", 0}, {"guga", 0}
    call FastStringCompare {"guga1", 0}, {"guga1", 0}
    call FastStringCompare {"guga12", 0}, {"guga12", 0}
    call FastStringCompare {"guga123", 0}, {"guga123", 0}
    call FastStringCompare {"guga1234", 0}, {"guga1234", 0}
    call FastStringCompare {"guga12345", 0}, {"guga12345", 0}
    call FastStringCompare {"guga123456", 0}, {"guga123456", 0}
    
    call FastStringCompare {"1234", 0}, {"5678", 0}
    call FastStringCompare {"1", 0}, {"g", 0}
    call FastStringCompare {"gustavo trigu", 0}, {"gustavo trigueiros e guilherme", 0}
    call FastStringCompare {"g", 0}, {"gu", 0}
    call FastStringCompare {"gu", 0}, {"gu", 0}
    call FastStringCompare {"gu", 0}, {"guu", 0}
    call FastStringCompare {"guuu", 0}, {"guu", 0}
    call FastStringCompare {"guu", 0}, {"guuu", 0}
    call FastStringCompare {"gug", 0}, {"guga", 0}
    call FastStringCompare {"g44564", 0}, {"112345g", 0}
    call FastStringCompare {"gu", 0}, {"g", 0}
    call FastStringCompare {"gu", 0}, {"gu", 0}
    call FastStringCompare {"g", 0}, {"gu", 0}
    call FastStringCompare {"g", 0}, {"g", 0}
    call FastStringCompare {"1", 0}, {"g", 0}

        call FastStringCompare {"g1stavo trigueiros e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gu1tavo trigueiros e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gus1avo trigueiros e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gust1vo trigueiros e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gusta1o trigueiros e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustav1 trigueiros e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo1trigueiros e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo 1rigueiros e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e guilherm1", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e guilher1e", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e guilhe1me", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e guilh1rme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e guil1erme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e gui1herme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e gu1lherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e g1ilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e 1uilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e1guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros 1 guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros1e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiro1 e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueir1s e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}
        call FastStringCompare {"gustavo trigueiros e guilherme", 0}, {"gustavo trigueiros e guilherme", 0}

        call FastStringCompare {"guga", 0}, {"guga1", 0}
        call FastStringCompare {"guga1", 0}, {"guga", 0}

Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

guga

Moderator
Member
Posts: 1,760
Assembly is a state of art.
Location: Brazil
Logged

Re: String Compare - Updated

#18

January 29, 2013, 06:10:01 PM

The preliminary results i´ve got is
Not sure if the result is accurated because i wasn´t be able to assemble the masm app from JJ2007, so i just disassembled it, inserted my code and reassembled it back. But comparing the older results i guess it is faster then my previous version.

Code Select

Intel(R) Core(TM) i7 CPU         870  @ 2.93GHz (SE4)
loop overhead is approx. 189/100 cycles


7185    cycles for 100 * crt_strcmp
2060    cycles for 100 * FastStringCompare
2058    cycles for 100 * MasmBasic StringsDiffer
3325    cycles for 100 * STRCMP$

7211    cycles for 100 * crt_strcmp
2057    cycles for 100 * FastStringCompare
2059    cycles for 100 * MasmBasic StringsDiffer
3216    cycles for 100 * STRCMP$

7177    cycles for 100 * crt_strcmp
2058    cycles for 100 * FastStringCompare
2062    cycles for 100 * MasmBasic StringsDiffer
3219    cycles for 100 * STRCMP$

7178    cycles for 100 * crt_strcmp
2057    cycles for 100 * FastStringCompare
2059    cycles for 100 * MasmBasic StringsDiffer
3309    cycles for 100 * STRCMP$

7179    cycles for 100 * crt_strcmp
2208    cycles for 100 * FastStringCompare
2073    cycles for 100 * MasmBasic StringsDiffer
3222    cycles for 100 * STRCMP$

-1      = eax crt_strcmp
0       = eax FastStringCompare
-23     = eax MasmBasic StringsDiffer
-23     = eax STRCMP$

--- ok ---

Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

jj2007

Member
Posts: 14,905
Assembly is fun ;-)
Location: Italy
Logged

Re: String Compare

#19

January 29, 2013, 06:28:00 PM

Quote from: guga on January 29, 2013, 09:19:10 AMDoes anyone know if all compilers align their strings (or other data) always on a 4 byte boundary ?

Freshly created strings are likely to be 8 byte aligned, due to HeapAlloc. But a library function must be able to handle any alignment, e.g. when you pass a pointer to a substring. There shouldn't be any problem, though, unless you use SSE2. The MasmBasic StringsDiffer() does indeed use SSE2 and has some overhead, which makes it relatively slow for very short strings.

Here are two examples for a somewhat longer string, showing the influence of alignment:

Str1 db "gustavo trigueiros e guilherme alias 'guga' is today the brain behind RosAsm, one of the most powerful assemblers ever developedA", 0
Str2 db "gustavo ...X", 0

align 4 for both strings:
AMD Athlon(tm) Dual Core Processor 4450B (SSE3)
++++++++++++++++++++
2894 cycles for 10 * crt_strcmp
2315 cycles for 10 * FastStringCompare
1186 cycles for 10 * MasmBasic StringsDiffer
1964 cycles for 10 * STRCMP$

align 16 for both strings:
AMD Athlon(tm) Dual Core Processor 4450B (SSE3)
++++++++++++++++++++
2892 cycles for 10 * crt_strcmp
2309 cycles for 10 * FastStringCompare
969 cycles for 10 * MasmBasic StringsDiffer
1954 cycles for 10 * STRCMP$

Masm32 Tips, Tricks and Traps * enable the deb macro

guga

Moderator
Member
Posts: 1,760
Assembly is a state of art.
Location: Brazil
Logged

Re: String Compare - Updated

#20

January 29, 2013, 06:58:55 PM

Hi JJ, thanks for the nice words :)

I built a new version (using alignment) and just made a new improve:

Code Select

Proc FastStringCompare:
    Arguments @String1, @String2
    Uses esi, ecx, ebx, edx

    mov esi D@String1
    mov ecx 0
    mov eax 0
    mov ebx D@String2

    mov edx D$esi
    cmp edx D$ebx   ; just to force the zero flags. if we have a match zero flag is settled. But we need to check if the last byte at string1 is zero.

jz L0> ; The 1st 4 bytes are not zero, jmp over

        mov ecx 3
        shr edx 8 | jz L1> ; If string is only 1 byte len. ecx = 3
        inc ecx | jmp L1> ; else, ecx = 4

    L0:
        test dh dh | jz L1> ; the last byte of the 1st string is zero. We reached the end of the search. Contunue computing the remaining bytes at "L1"
        lea ecx D$ecx+4 ; compute the len of the string1
        mov edx D$esi+ecx
        cmp D$ebx+ecx edx
        jz L0<

        jmp L3>> ; strings definitelly dont match

L1:

        lea ebx D$ebx+ecx | mov edx D$ebx-3
        lea esi D$esi+ecx
        cmp D$esi-3 edx; if both words on string1 and string2 don´ match jmp over. Otherwise, continue analysing the remaining bytes
        jnz L3>>
        ; We are almost there. If ecx = 0, means that the string1 have only 2 byte len. So let´t search for the 2nd byte of string2
;;
    ; so far all bytes are equal. Let´t see if teh string2 is null terminated as it is on string1. Ex: str1) "guga1" | str2) "guga1" Or if it is str1) "guga1" | str2) "gugaM"...
    ; On the dword, we need to check the byte after the dx like: 00XX_0000
    If B$ebx-1 = 0
        inc eax
    End_If

    Note for me, i commented it out, because the above assumption is wrong. In fact, when we reach here we have analysed all the string
    and both are exactly the same.
;;

    ; In mine tests no matter what bytes gets here, because it always means that we fully analysed the strings, and they always matches
    inc eax


L3:

EndP

Using the same strings as the post above i got this results:
Str1 db "gustavo trigueiros e guilherme alias 'guga' is today the brain behind RosAsm, one of the most powerful assemblers ever developedA", 0
Str2 db "gustavo ...X", 0

Code Select

Intel(R) Core(TM) i7 CPU         870  @ 2.93GHz (£E4)
loop overhead is approx. 181/100 cycles

2103	cycles for 100 * crt_strcmp
1087	cycles for 100 * FastStringCompare
1774	cycles for 100 * MasmBasic StringsDiffer
1599	cycles for 100 * STRCMP$

2100	cycles for 100 * crt_strcmp
1091	cycles for 100 * FastStringCompare
1768	cycles for 100 * MasmBasic StringsDiffer
1593	cycles for 100 * STRCMP$

2479	cycles for 100 * crt_strcmp
1559	cycles for 100 * FastStringCompare
1849	cycles for 100 * MasmBasic StringsDiffer
1698	cycles for 100 * STRCMP$

2182	cycles for 100 * crt_strcmp
1087	cycles for 100 * FastStringCompare
1781	cycles for 100 * MasmBasic StringsDiffer
1607	cycles for 100 * STRCMP$

2109	cycles for 100 * crt_strcmp
1090	cycles for 100 * FastStringCompare
1774	cycles for 100 * MasmBasic StringsDiffer
1592	cycles for 100 * STRCMP$

1	= eax crt_strcmp
0	= eax FastStringCompare
70	= eax MasmBasic StringsDiffer
70	= eax STRCMP$

--- ok ---

And this results using
Str1 db "gustavo trigueiros e guilhermeA", 0
Str2 db "gustavo trigueiros e guilhermeX", 0

Code Select


Intel(R) Core(TM) i7 CPU         870  @ 2.93GHz (SE4)
loop overhead is approx. 181/100 cycles


7380	cycles for 100 * crt_strcmp
2048	cycles for 100 * FastStringCompare
2142	cycles for 100 * MasmBasic StringsDiffer
3322	cycles for 100 * STRCMP$

7889	cycles for 100 * crt_strcmp
2052	cycles for 100 * FastStringCompare
2136	cycles for 100 * MasmBasic StringsDiffer
3326	cycles for 100 * STRCMP$

7370	cycles for 100 * crt_strcmp
2361	cycles for 100 * FastStringCompare
2128	cycles for 100 * MasmBasic StringsDiffer
3334	cycles for 100 * STRCMP$

7378	cycles for 100 * crt_strcmp
2052	cycles for 100 * FastStringCompare
2132	cycles for 100 * MasmBasic StringsDiffer
3708	cycles for 100 * STRCMP$

7392	cycles for 100 * crt_strcmp
2061	cycles for 100 * FastStringCompare
2106	cycles for 100 * MasmBasic StringsDiffer
3306	cycles for 100 * STRCMP$

-1	= eax crt_strcmp
0	= eax FastStringCompare
-23	= eax MasmBasic StringsDiffer
-23	= eax STRCMP$

--- ok ---

I´m not sure if it can be even faster, but i´ll give it a try. The important (at least for me) is not only speed, but accuracy in the results.

Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

Ficko

Regular Member
Posts: 46
Logged

Re: String Compare - Updated

#21

January 29, 2013, 07:07:02 PM

Quote
Freshly created strings are likely to be 8 byte aligned, due to HeapAlloc. But a library function must be able to handle any alignment, e.g. when you pass a pointer to a substring.

That's correct. Handling unaligned data is very important.

From Guga:

Quote
Or even worst, what if the string ends on the end of a memory chunck ?

That's right though very unlikely it can happen.
If you request 1 byte trough "HeapAlloc" you will get at least 4 bytes back - if not 8 - but it is not documented by MS as such.
So what I am doing is to wrap the "HeapAlloc" API and add 3 bytes to any heap request if using 32-bit memory access or add 16 bytes if using SSE.

It is a good practice if you use SSE a lot.
Like some of us do.
I use almost exclusivelly SSE for strlen and strcpy because they are so much faster.

Like Dave sad above somewhere it is not allways possible to have both string compared as aligned.
What I am doing it to access at least one string as aligned and the other one by chance. So I get at least 50% alignment on both strings - worst case scenario -

Ficko

Regular Member
Posts: 46
Logged

Re: String Compare - Updated

#22

January 29, 2013, 07:13:43 PM

And BTW the above test would be more accurate if it would generate a table of random lenght and aligned strings and use that as input. IMO

dedndave

Member
Posts: 8,158
Still using Abacus 2.0
Logged

Re: String Compare - Updated

#23

January 29, 2013, 09:01:19 PM

yah - you can't depend on the strings having any particular alignment
probably the best approach is to test all the alignment combinations in each pass of the timing test
then divide the result by the number of tests
that way, you optimize your code for overall performance

Code Select

        ALIGN   4
szTest0 db 'String',0
szTestA db 'Strinx',0

        ALIGN   4
        db 0
szTest1 db 'String',0
szTestB db 'Strinx',0

        ALIGN   4
        db 0,0
szTest2 db 'String',0
szTestC db 'Strinx',0

        ALIGN   4
        db 0,0,0
szTest3 db 'String',0
szTestD db 'Strinx',0

compilers may try to align strings
but, they often wind up being partials of some previous string
or in a buffer that is not aligned, etc
or - perhaps you are comparing strings, starting at some unaligned offset

strings are byte entities in a dword world :P
because aligned SSE is probably the fastest way, it's a 16-byte world

guga

Moderator
Member
Posts: 1,760
Assembly is a state of art.
Location: Brazil
Logged

Re: String Compare - Updated

#24

January 30, 2013, 02:08:48 AM

On the updated version, the alignment really dont seems to be any problem. I tried to updated it to handle both aligned and unaligned strings, without having to use test, cmp mnemonics for each different alignment.

Also, i just did a small improve on the code and aligned the code itself (lea and some cmp mnemonics) and the result is really impressive. It seems to be faster then JJ2007 SSE version.

Code Select

Proc FastStringCompare:
    Arguments @String1, @String2
    Uses esi, ecx, ebx, edx

    mov esi D@String1
    mov ebx D@String2
    mov eax 0
    mov ecx 4
    mov edx D$esi
    cmp edx D$ebx   ; just to force the zero flags. if we have a match zero flag is settled. But we need to check if the last byte at string1 is zero.
jz L0> ; The 1st 4 bytes are not zero, jmp over

        mov ecx 3
        shr edx 8 | jz L1> ; If string is only 1 byte len. ecx = 3
        inc ecx | jmp L1> ; else, ecx = 4

    L0:
        test dh dh | jz L1> ; the last byte of the 1st string is zero. We reached the end of the search. Contunue computing the remaining bytes at "L1"
        lea ecx D$ecx+4 ; compute the len of the string1
Align 4 ; trying to align the code to avoid pipeline/stall problems, and get maximum results. In this case, it don´t insert any nop operation. This line can be commented out

        mov edx D$esi+ecx
        cmp D$ebx+ecx edx
        jz L0<

        jmp L3>> ; strings definitelly dont match

L1:
        lea ebx D$ebx+ecx
        Align 4 ; trying to align the code to avoid pipeline/stall problems, and get maximum results. In this case, it don´t insert any nop operation. This line can be commented out
        mov edx D$ebx-3
        lea esi D$esi+ecx
        Align 4 ; trying to align the code to avoid pipeline/stall problems, and get maximum results. In this case, it insert 2 nops operations
        cmp D$esi-3 edx; if both words on string1 and string2 don´ match jmp over. Otherwise, continue analysing the remaining bytes
         Align 4; trying to align the code to avoid pipeline/stall problems, and get maximum results. In this case, it insert 1 nop operations
        jnz L3>>

    ; In mine tests no matter what bytes gets here, because it always means that we fully analysed the strings, and they always matches
    inc eax


L3:

EndP

Results

Code Select


Intel(R) Core(TM) i7 CPU         870  @ 2.93GHz (SE4)
loop overhead is approx. 189/100 cycles


7327	cycles for 100 * crt_strcmp
1885	cycles for 100 * FastStringCompare
2461	cycles for 100 * MasmBasic StringsDiffer
3291	cycles for 100 * STRCMP$

8410	cycles for 100 * crt_strcmp
1903	cycles for 100 * FastStringCompare
2127	cycles for 100 * MasmBasic StringsDiffer
3279	cycles for 100 * STRCMP$

7286	cycles for 100 * crt_strcmp
1847	cycles for 100 * FastStringCompare
2115	cycles for 100 * MasmBasic StringsDiffer
3278	cycles for 100 * STRCMP$

7304	cycles for 100 * crt_strcmp
1847	cycles for 100 * FastStringCompare
2094	cycles for 100 * MasmBasic StringsDiffer
3302	cycles for 100 * STRCMP$

At night i´ll try to see if this can be improved a little bit more.

Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

jj2007

Member
Posts: 14,905
Assembly is fun ;-)
Location: Italy
Logged

Re: String Compare - Updated

#25

January 30, 2013, 03:54:06 AM

Interesting, guga. Can you post the exe (and source), so that we can test it?

Masm32 Tips, Tricks and Traps * enable the deb macro

guga

Moderator
Member
Posts: 1,760
Assembly is a state of art.
Location: Brazil
Logged

Re: String Compare - Updated

#26

January 30, 2013, 11:08:44 AM

Hi JJ

OK, i´ll post in a couple of minutes (or 1 or 2 hours) before i finish my testings for speed and reliability. So far i fixed a small mistake i made that was generating the previous report. On unaligned strings it was missing 1 single byte during the check causing the report to decrease a few ms, and the code miss 1 byte unaligned strings.

Now, it seems to be ok, i modified the code and it seems to handle all strings (aligned and unaligned). So far, the result is:

Code Select


Intel(R) Core(TM) i7 CPU         870  @ 2.93GHz (SE4)
loop overhead is approx. 208/100 cycles


7589    cycles for 100 * crt_strcmp
2208    cycles for 100 * FastStringCompare
2539    cycles for 100 * MasmBasic StringsDiffer
2950    cycles for 100 * STRCMP$

7593    cycles for 100 * crt_strcmp
2223    cycles for 100 * FastStringCompare
2544    cycles for 100 * MasmBasic StringsDiffer
2946    cycles for 100 * STRCMP$

7578    cycles for 100 * crt_strcmp
2206    cycles for 100 * FastStringCompare
2545    cycles for 100 * MasmBasic StringsDiffer
2947    cycles for 100 * STRCMP$

7575    cycles for 100 * crt_strcmp
2209    cycles for 100 * FastStringCompare
2545    cycles for 100 * MasmBasic StringsDiffer
2947    cycles for 100 * STRCMP$

7658    cycles for 100 * crt_strcmp
2214    cycles for 100 * FastStringCompare
2544    cycles for 100 * MasmBasic StringsDiffer
2947    cycles for 100 * STRCMP$

0       = eax crt_strcmp
1       = eax FastStringCompare
0       = eax MasmBasic StringsDiffer
0       = eax STRCMP$

--- ok ---

And on my tests it didn´t missed unaligned bytes anylonger. But, i´ll make a couple of more tests before i post the source,k since i´m focusing in accuracy and then speed.

I´ll upload it as soon as possible. Hold on :)

Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

Print

Go Up Pages 1 2

User actions

Print