News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests

Main Menu

Fast Sorting algo help

Started by guga, September 23, 2014, 05:10:13 AM

Previous topic - Next topic

guga

Qsort function decoded

It is exactly the same as in msvcrt. I tried to optimize a bit, but the function is a true hell. Can someone help me optimizing and simplifying it ?


[CUTOFF 8]

[lostk: D$ 0 #30
histk: D$ 0 #30]

Proc qsort:
    Arguments @base, @num, @width, @compare
    Local @stkptr, @lo, @hi, @mid
    Uses esi, edi, ebx


    ; * Note: the number of stack entries required is no more than
    ;        1 + log2(size), so 30 is sufficient for any array
    If_Or D@num < 2, D@width = 0
        ExitP
    End_If

    mov D@stkptr 0 ; initialize stack
    mov esi D@num
    mov edi D@width
    mov ebx D@base
    dec esi | imul esi edi | add esi ebx ; esi = ((esi-1)*edi)+ebx
    mov D@lo ebx
    mov D@hi esi
    ; this entry point is for pseudo-recursion calling:
    ; setting lo and hi and jumping to here is like recursion,
    ; but stkptr is  preserved, locals aren't, so we preserve stuff on the stack

@Recursive2:

    ; number of el's to sort
    mov eax esi | sub eax ebx | xor edx edx | div D@width | inc eax ; eax = ((eax-ebx)/edi)+1 ; (((A-C)/B)+1)
    ; on 1st loop, eax = esi. http://www.wolframalpha.com/input/?i=%28%28%28%28%28A-C%29%2FB%29%2B1%29-1%29*B%29%2BC
    ; below a certain size, it is faster to use a O(n^2) sorting method
    ...If eax <= CUTOFF

        call shortsort esi, ebx, D@width, D@compare

    ...Else
        ; First we pick a partititioning element.  The efficiency of the algorithm demands
        ; that we find one that is approximately the median of the values, but also that we select one fast.
        ; Using the first one produces bad performace if the array is already sorted, so we use the middle one,
        ; which would require a very  wierdly arranged array for worst case performance.
        ; Testing shows  that a median-of-three algorithm does not, in general, increase performance.
        shr eax 1 | imul eax D@width | add eax ebx | mov edi eax ; find middle element

        ; swap it to beginning of array
        call Swap ebx, edi, D@width, D@compare
        call Swap ebx, esi, D@width, D@compare
        call Swap edi, esi, D@width, D@compare

        call Routine1 D@hi, D@width, D@compare

        add esi D@width
        cmp edi esi | jae D9>  ; Code077C37107
            Do
                sub esi D@width
                cmp esi edi | jbe D9>  ; Code077C37107
                call D@compare esi, edi
            Loop_Until eax <> 0
        cmp edi esi | jb G3>  ; Code077C3711F

D9:
    Do
        mov eax D@lo | sub esi D@width;edx
        cmp esi eax | jbe G6>  ; Code077C37122
        call D@compare esi, edi
    Loop_Until eax <> 0

G3:
    mov eax D@lo

G6:
    mov edx D@hi
    mov ecx edx | sub ecx ebx
    mov edi esi | sub edi eax
    cmp edi ecx | jl M4>  ; Code077C3715C
    If eax < esi
        mov ecx D@stkptr
        mov D$lostk+ecx*4 eax
        mov D$histk+ecx*4 esi
        inc D@stkptr
    End_If

    mov edi D@width
    cmp ebx edx | jae A7>  ; Code077C37187
    mov esi D@hi | mov edi D@width | mov D@lo ebx
    jmp @Recursive2

M4:

    If ebx < edx
        mov ecx D@stkptr
        mov D$lostk+ecx*4 ebx
        mov D$histk+ecx*4 edx
        inc D@stkptr
    End_If

    cmp eax esi | jae A7>  ; Code077C37187
    mov ebx D@lo | mov edi D@width | mov D@hi esi
    jmp @Recursive2

    ...End_If

A7:

    dec D@stkptr | js E8>  ; Code077C371B0
    mov eax D@stkptr
    mov edx D$lostk+eax*4
    mov eax D$histk+eax*4
    mov D@lo edx
    mov D@hi eax
    mov esi eax ; hi
    mov ebx edx ; low
    jmp @Recursive2

E8:

EndP



Proc shortsort:
    Arguments @lo, @hi, @width, @comp
    Local @pChar, @TmpLo
    Uses edi, ecx, esi, ebx

    mov ecx D@hi
    mov edi D@lo | mov D@TmpLo edi
    ..If D@lo > ecx

        mov edx D@width | lea eax D$ecx+edx | mov D@pChar eax

        .Do

            mov esi D@pChar
            mov ebx ecx
            .If esi <= D@TmpLo
                Do
                    call D@comp esi, ebx
                    If eax >s 0
                        mov ebx esi
                    End_If
                    add esi D@width
                Loop_Until esi > D@TmpLo
            .End_If

            .If_And ebx <> D@TmpLo, D@width <> 0
                mov eax D@TmpLo
                mov ecx ebx | sub ecx D@TmpLo | mov esi D@width
                L1:
                    mov dh B$ecx+eax
                    mov dl B$eax | mov B$ecx+eax dl
                    mov B$eax dh
                    inc eax
                    dec esi | jne L1<
            .End_If
            mov edi D@TmpLo | sub edi D@width | mov D@TmpLo edi
            mov ecx D@hi
        .Loop_Until edi <= D@hi

    ..End_If

EndP



Proc Swap:
    Arguments @Struct1, @Struct2, @Width, @compare
    Local @Distance
    Uses ebx, ecx, edx

    mov ebx D@Struct1 | sub ebx D@Struct2 | jz L1> | mov D@Distance ebx
    call D@compare D@Struct1, D@Struct2
    .If eax >s 0
        mov ebx D@Distance
        mov eax D@Struct2
        mov ecx D@width
        L1:
            mov dh B$eax+ebx
            mov dl B$eax | mov B$eax+ebx dl
            mov B$eax dh
            inc eax
        dec ecx | jne L1<
    .End_If
L1:

EndP




; ebx, edi, esi is altered and is being used in the main function. Do not changed it
Proc Routine1:
    Arguments @hi, @width, @compare
    Local @mid


    ;..While edi > ebx ; jna
@Recursive1:

    cmp edi ebx | jbe L1>

        Do
            add ebx D@width
            cmp ebx edi | jae L1>;G4>  ; Code077C37080
            call D@compare ebx, edi
        Loop_Until eax >s 0 ; jle ; i inverted it accidentally. it must loop only if it is smaller or equal to zero

            ...If edi <= ebx
L1:

                Do
                    mov ecx D@width
                    mov eax D@hi
                    add ebx ecx
                    cmp ebx eax | ja L2>;I8>  ; Code077C37098
                    call D@compare ebx, edi
                Loop_Until eax >s 0
            ...End_If

L2:
           Do
                sub esi D@width
                cmp esi edi | jbe L3>;K7>  ; Code077C370AB
                call D@compare esi, edi
           Loop_Until eax <s= 0

L3:

        cmp ebx esi | ja L7>;A5>  ; Code077C370E5

        If ebx <> esi
            mov eax esi
            mov edx D@width
            mov ecx ebx | sub ecx esi | mov D@mid edx
            L0:
                mov dh B$eax+ecx
                mov dl B$eax | mov B$eax+ecx dl
                mov B$eax dh

                ;mov dl B$eax | mov B$ecx+eax dl
                inc eax
                dec D@mid | jnz L0<
        End_If

        If edi = esi
            mov edi ebx
        End_If

    jmp @Recursive1

   ; ..End_While
L7:

EndP


Note. I really don´t think it is necessary the tables lostk and histk. It don´t seems to be needed once the proper coding reorganization/optimization is done. The C source says that they are used to stack preservation, but, it doesn´t seems to be the case of preserving the stack.

I suceeded to decode on a way that the user´s comparefunction does not need to be on a stdcalling convention anymore.

But, due to the re-entrances of the function, several loops, i have serious doubt that this can be called "fast" in any means. There should be a way to optimize this beast.

Here is the original C source (From WinNT. The actual version on WinXP seems to be a tiny variation of it.


/***
*qsort.c - quicksort algorithm; qsort() library function for sorting arrays
*
* Copyright (c) 1985-1991, Microsoft Corporation. All rights reserved.
*
*Purpose:
* To implement the qsort() routine for sorting arrays.
*
*Revision History:
* 06-22-84  RN author
* 03-25-85  RN added pre-check for elements already in order to
* eliminate worst-case behavior.
* 05-18-86  TC changed to recurse on the smallest piece to avoid
* piece. unneccesary stack usage, and to iterate on
* largest
* 01-09-87  BCM fixed huge-array case where (num-1) * wid computation
* was overflowing (large/compact models only)
* 06-13-89  PHG made more efficient, many more comments, removed
* recursion
* 10-30-89  JCR Added _cdecl to prototypes
* 03-15-90  GJF Replaced _cdecl with _CALLTYPE1 and added #include
* <cruntime.h>. Also, fixed the copyright.
* 04-05-90  GJF Made shortsort() and swap() _CALLTYPE4. Also, added
* #include <search.h>.
* 10-04-90  GJF New-style function declarators.
*       12-28-90  SRW   Added _CRUISER_ conditional around check_stack pragmas
* 01-24-91  SRW Added missing close comment in swap procedure
* 11-19-91  GJF Do the swap one character at a time to avoid alignment
* woes.
*
*******************************************************************************/

#include <cruntime.h>
#include <stdlib.h>
#include <search.h>

/* prototypes for local routines */
static void _CALLTYPE4 shortsort(char *lo, char *hi, unsigned width,
      int (_CALLTYPE1 *comp)(const void *, const void *));
static void _CALLTYPE4 swap(char *p, char *q, unsigned int width);

/* this parameter defines the cutoff between using quick sort and
   insertion sort for arrays; arrays with lengths shorter or equal to the
   below value use insertion sort */

#define CUTOFF 8     /* testing shows that this is good value */


/***
*qsort(base, num, wid, comp) - quicksort function for sorting arrays
*
*Purpose:
* quicksort the array of elements
* side effects:  sorts in place
*
*Entry:
* char *base = pointer to base of array
* unsigned num  = number of elements in the array
* unsigned width = width in bytes of each array element
* int (*comp)() = pointer to function returning analog of strcmp for
* strings, but supplied by user for comparing the array elements.
* it accepts 2 pointers to elements and returns neg if 1<2, 0 if
* 1=2, pos if 1>2.
*
*Exit:
* returns void
*
*Exceptions:
*
*******************************************************************************/

#ifdef _CRUISER_
#pragma check_stack(on) /* lots of locals */
#endif  /* ndef _CRUISER_ */

/* sort the array between lo and hi (inclusive) */

void _CALLTYPE1 qsort (
    void *base,
    unsigned num,
    unsigned width,
    int (_CALLTYPE1 *comp)(const void *, const void *)
    )
{
    char *lo, *hi; /* ends of sub-array currently sorting */
    char *mid; /* points to middle of subarray */
    char *loguy, *higuy; /* traveling pointers for partition step */
    unsigned size; /* size of the sub-array */
    char *lostk[30], *histk[30];
    int stkptr; /* stack for saving sub-array to be processed */

    /* Note: the number of stack entries required is no more than
       1 + log2(size), so 30 is sufficient for any array */

    if (num < 2 || width == 0)
return; /* nothing to do */

    stkptr = 0; /* initialize stack */

    lo = base;
    hi = (char *)base + width * (num-1); /* initialize limits */

    /* this entry point is for pseudo-recursion calling: setting
       lo and hi and jumping to here is like recursion, but stkptr is
       prserved, locals aren't, so we preserve stuff on the stack */
recurse:

    size = (hi - lo) / width + 1; /* number of el's to sort */

    /* below a certain size, it is faster to use a O(n^2) sorting method */
    if (size <= CUTOFF) {
shortsort(lo, hi, width, comp);
    }
    else {
/* First we pick a partititioning element.  The efficiency of the
   algorithm demands that we find one that is approximately the
   median of the values, but also that we select one fast.  Using
   the first one produces bad performace if the array is already
   sorted, so we use the middle one, which would require a very
   wierdly arranged array for worst case performance.  Testing shows
   that a median-of-three algorithm does not, in general, increase
   performance. */

mid = lo + (size / 2) * width;     /* find middle element */
swap(mid, lo, width);     /* swap it to beginning of array */

/* We now wish to partition the array into three pieces, one
   consisiting of elements <= partition element, one of elements
   equal to the parition element, and one of element >= to it. This
   is done below; comments indicate conditions established at every
   step. */

loguy = lo;
higuy = hi + width;

/* Note that higuy decreases and loguy increases on every iteration,
   so loop must terminate. */
for (;;) {
    /* lo <= loguy < hi, lo < higuy <= hi + 1,
       A[i] <= A[lo] for lo <= i <= loguy,
       A[i] >= A[lo] for higuy <= i <= hi */

    do {
loguy += width;
    } while (loguy <= hi && comp(loguy, lo) <= 0);

    /* lo < loguy <= hi+1, A[i] <= A[lo] for lo <= i < loguy,
       either loguy > hi or A[loguy] > A[lo] */

    do {
higuy -= width;
    } while (higuy > lo && comp(higuy, lo) >= 0);

    /* lo-1 <= higuy <= hi, A[i] >= A[lo] for higuy < i <= hi,
       either higuy <= lo or A[higuy] < A[lo] */

    if (higuy < loguy)
break;

    /* if loguy > hi or higuy <= lo, then we would have exited, so
       A[loguy] > A[lo], A[higuy] < A[lo],
       loguy < hi, highy > lo */

    swap(loguy, higuy, width);

    /* A[loguy] < A[lo], A[higuy] > A[lo]; so condition at top
       of loop is re-established */
}

/*     A[i] >= A[lo] for higuy < i <= hi,
       A[i] <= A[lo] for lo <= i < loguy,
       higuy < loguy, lo <= higuy <= hi
   implying:
       A[i] >= A[lo] for loguy <= i <= hi,
       A[i] <= A[lo] for lo <= i <= higuy,
       A[i] = A[lo] for higuy < i < loguy */

swap(lo, higuy, width);     /* put partition element in place */

/* OK, now we have the following:
      A[i] >= A[higuy] for loguy <= i <= hi,
      A[i] <= A[higuy] for lo <= i < higuy
      A[i] = A[lo] for higuy <= i < loguy    */

/* We've finished the partition, now we want to sort the subarrays
   [lo, higuy-1] and [loguy, hi].
   We do the smaller one first to minimize stack usage.
   We only sort arrays of length 2 or more.*/

if ( higuy - 1 - lo >= hi - loguy ) {
    if (lo + width < higuy) {
lostk[stkptr] = lo;
histk[stkptr] = higuy - width;
++stkptr;
    } /* save big recursion for later */

    if (loguy < hi) {
lo = loguy;
goto recurse; /* do small recursion */
    }
}
else {
    if (loguy < hi) {
lostk[stkptr] = loguy;
histk[stkptr] = hi;
++stkptr; /* save big recursion for later */
    }

    if (lo + width < higuy) {
hi = higuy - width;
goto recurse; /* do small recursion */
    }
}
    }

    /* We have sorted the array, except for any pending sorts on the stack.
       Check if there are any, and do them. */

    --stkptr;
    if (stkptr >= 0) {
lo = lostk[stkptr];
hi = histk[stkptr];
goto recurse; /* pop subarray from stack */
    }
    else
return; /* all subarrays done */
}

#ifdef _CRUISER_
#pragma check_stack()     /* revert to command line behaviour */
#endif  /* ndef _CRUISER_ */


/***
*shortsort(hi, lo, width, comp) - insertion sort for sorting short arrays
*
*Purpose:
* sorts the sub-array of elements between lo and hi (inclusive)
* side effects:  sorts in place
* assumes that lo < hi
*
*Entry:
* char *lo = pointer to low element to sort
* char *hi = pointer to high element to sort
* unsigned width = width in bytes of each array element
* int (*comp)() = pointer to function returning analog of strcmp for
* strings, but supplied by user for comparing the array elements.
* it accepts 2 pointers to elements and returns neg if 1<2, 0 if
* 1=2, pos if 1>2.
*
*Exit:
* returns void
*
*Exceptions:
*
*******************************************************************************/

static void _CALLTYPE4 shortsort (
    char *lo,
    char *hi,
    unsigned width,
    int (_CALLTYPE1 *comp)(const void *, const void *)
    )
{
    char *p, *max;

    /* Note: in assertions below, i and j are alway inside original bound of
       array to sort. */

    while (hi > lo) {
/* A[i] <= A[j] for i <= j, j > hi */
max = lo;
for (p = lo+width; p <= hi; p += width) {
    /* A[i] <= A[max] for lo <= i < p */
    if (comp(p, max) > 0) {
max = p;
    }
    /* A[i] <= A[max] for lo <= i <= p */
}

/* A[i] <= A[max] for lo <= i <= hi */

swap(max, hi, width);

/* A[i] <= A[hi] for i <= hi, so A[i] <= A[j] for i <= j, j >= hi */

hi -= width;

/* A[i] <= A[j] for i <= j, j > hi, loop top condition established */
    }
    /* A[i] <= A[j] for i <= j, j > lo, which implies A[i] <= A[j] for i < j,
       so array is sorted */
}


/***
*swap(a, b, width) - swap two elements
*
*Purpose:
* swaps the two array elements of size width
*
*Entry:
* char *a, *b = pointer to two elements to swap
* unsigned width = width in bytes of each array element
*
*Exit:
* returns void
*
*Exceptions:
*
*******************************************************************************/

static void _CALLTYPE4 swap (
    char *a,
    char *b,
    unsigned width
    )
{
    char tmp;

    if ( a != b )
/* Do the swap one character at a time to avoid potential alignment
   problems. */
while ( width-- ) {
    tmp = *a;
    *a++ = *b;
    *b++ = tmp;
}
}
 

Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

jj2007

Quote from: guga on September 25, 2014, 06:21:18 AM
Qsort function decoded

It is exactly the same as in msvcrt. I tried to optimize a bit, but the function is a true hell. Can someone help me optimizing and simplifying it ?

guga,
CRT qsort is pretty fast for a sort routine that provides a user-defined callback. In the old forum, we did some timings, here is an adaption for DWORD size:


Intel(R) Celeron(R) M CPU        420  @ 1.60GHz (MMX, SSE, SSE2, SSE3)
Sorting 2000000 items:
DwRange=-1
517 ms for ArraySort    MasmBasic
430 ms for nrQsortA     Masm32 lib
839 ms for crt_qsort    with callback

DwRange=10000
262 ms for ArraySort    MasmBasic
296 ms for nrQsortA     Masm32 lib
527 ms for crt_qsort    with callback

DwRange=10000
259 ms for ArraySort    MasmBasic
293 ms for nrQsortA     Masm32 lib
516 ms for crt_qsort    with callback


What you can see is that the Masm32 library qsort beats CRT by a factor 2 if elements are in the full DWORD range, i.e. -1. MasmBasic uses a merge sort, which is faster if elements have a lower range, like 0...10000.

So far, so good. But only CRT provides a callback function, which a) makes it slower but b) is what you need when comparing arbitrary structure elements.

So instead of trying to reinvent the wheel, you better concentrate on speeding up the comparison function, because that's the real bottleneck.

MichaelW

I have not considered all of the possibilities, but if you use an unstable algorithm for the secondary sort, it may fail to maintain the relative order of primary keys where the secondary keys are equal. The stability of the primary sort does not matter. Here is a sloppy example, sorting contrived data, that shows the effect:

;==============================================================================
    include \masm32\include\masm32rt.inc
;==============================================================================
    .data
        a1 dd 60000002h,80000001h,40000005h,90000002h,30000005h,50000001h
        a2 dd 60000002h,80000001h,40000005h,90000002h,30000005h,50000001h
    .code
;==============================================================================
;--------------------------------------------------------
; This is a compare procedure for the CRT qsort function
; that compares only the low-order byte.
;--------------------------------------------------------
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
align 4
compare_low proc C p1:DWORD, p2:DWORD
    mov eax, [esp+4]
    mov edx, [esp+8]
    mov eax, [eax]
    mov edx, [edx]
    and eax, 0ffh
    and edx, 0ffh
    sub eax, edx
    ret
compare_low endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
;==============================================================================
;--------------------------------------------------------
; This is a compare procedure for the CRT qsort function
; that compares only the high-order byte.
;--------------------------------------------------------
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
align 4
compare_high proc C p1:DWORD, p2:DWORD
    mov eax, [esp+4]
    mov edx, [esp+8]
    mov eax, [eax]
    mov edx, [edx]
    and eax, 0ff000000h
    and edx, 0ff000000h
    sub eax, edx
    ret
compare_high endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
;==============================================================================
;-----------------------------------------------------------
; This is a quick and dirty ascending dword insertion sort,
; modified to sort based on the low-order byte only.
;-----------------------------------------------------------
isort proc uses ebx pArray:DWORD, count:DWORD
    mov   eax, pArray
    mov   ebx, 1
    .WHILE ebx < count
        mov   edx, [eax+ebx*4]
        mov   ecx, ebx
        push  ebx
        push  edx
        .WHILE ecx > 0
            mov   ebx, [eax+ecx*4-4]
            and   ebx, 0ffh
            and   edx, 0ffh
            .IF ebx > edx
                push  DWORD PTR [eax+ecx*4-4]
                pop   DWORD PTR [eax+ecx*4]
            .ELSE
                .BREAK
            .ENDIF
            dec   ecx
        .ENDW
        pop   edx
        pop   ebx
        mov   [eax+ecx*4], edx
        inc   ebx
    .ENDW
    ret
isort endp
;==============================================================================
start:
;==============================================================================
    invoke crt_qsort, ADDR a1, 6, 4, compare_high
    xor ebx, ebx
    .WHILE ebx < 6
        mov eax, a1[ebx*4]
        printf("%X\n",eax)
        inc ebx
    .ENDW
    printf("\n")
    invoke crt_qsort, ADDR a1, 6, 4, compare_low
    xor ebx, ebx
    .WHILE ebx < 6
        mov eax, a1[ebx*4]
        printf("%X\n",eax)
        inc ebx
    .ENDW
    printf("\n")
    invoke crt_qsort, ADDR a2, 6, 4, compare_high
    xor ebx, ebx
    .WHILE ebx < 6
        mov eax, a2[ebx*4]
        printf("%X\n",eax)
        inc ebx
    .ENDW
    printf("\n")
    invoke isort, ADDR a2, 6
    xor ebx, ebx
    .WHILE ebx < 6
        mov eax, a2[ebx*4]
        printf("%X\n",eax)
        inc ebx
    .ENDW
    printf("\n")

    inkey "Press any key to exit..."
    exit
;==============================================================================
end start


30000005
40000005
50000001
60000002
80000001
90000002

80000001
50000001
60000002
90000002
40000005
30000005

30000005
40000005
50000001
60000002
80000001
90000002

50000001
80000001
60000002
90000002
30000005
40000005


Well Microsoft, here's another nice mess you've gotten us into.

guga

Hi JJ, Many thanks for the timmings and info.

You were right about crt qsort, but, i still needed something faster. So, i adapted Steve´s code to works on a similar way as in his version, which, i presume it is faster then crt and a bit slower then his own version, since i´m using pointers to structure arrays. But, so far, it seems accurated and fast (i´m not using callbacks now)
I had to create some functions to adapt his code, just to make it work, but i can rebuild/merge some functions to gain speed. (And also, remove the SysAllocStringByteLen and replace it with the necessary amount of data to be used as a placeholder for the loops  backup indexes)

This is my initial testings.



Proc nrQsortAEx::
    Arguments @Array, @Count, @StructSize, @RefIndex, @RefSize
    Local @cntr, @bLen, @hMem, @IndexFirst, @IndexLast, @MidPointer
    Uses ebx, esi, edi

    mov D@cntr 0

    ; allocate temporary buffer
    mov eax D@Count | add eax 40 | mov D@bLen eax
    call 'oleaut32.SysAllocStringByteLen' 0, D@bLen
    mov D@hMem eax
    mov edi D@hMem  ; buffer address in EDI

    ; set First and Last reference values (They are indexes to the structure)
    mov D@IndexFirst 0
    mov eax D@Count | dec eax | mov D@IndexLast eax ; Last = count - 1

@outer_loop: B0:

    ; calculate midpoint
    mov eax D@IndexLast | add eax D@IndexFirst | shr eax 1
    call GetPointerFromIndex eax, D@StructSize, D@Array, D@RefIndex | mov D@MidPointer eax

    ; midpoint in EBX
    mov ecx D@IndexFirst
    mov edx D@IndexLast

@inner_loop: D0:
    call GetPointerFromIndex ecx, D@StructSize, D@Array, D@RefIndex ; the decompilation shows this as 8 parameters ?
    call CompareFunc2 eax, D@MidPointer, D@RefIndex, D@RefSize ; 9 > 5, returns 1 . The decompilation shows here as 32 parameters
    On eax >s= 0, jmp @wl2 ; <----- Guga Note: review this macro later. a simple cmp is enough and faster. If eax is signed and bigger or equal to 0, do jmp. Since CompareFunc2 returns 0,1, 0-1, a simple cmp with 0-1 should be enough
    inc ecx
    jmp @inner_loop

@wl2:

    call GetPointerFromIndex edx, D@StructSize, D@Array, D@RefIndex
    call CompareFunc2 eax, D@MidPointer, D@RefIndex, D@RefSize ; if 7 < 5 , returns 1
    On eax <s= 0, jmp @wl2Out ; ok ; <----- Guga Note: review this macro later. a simple cmp is enough and faster. If eax is signed and smaller or equal to 0, do jmp. Since CompareFunc2 returns 0,1, 0-1, a simple cmp with 0-1 should be enough
    dec edx
jmp @wl2

@wl2Out: E6:
    cmp ecx edx | jg @exit_innerx ; If ecx > edx, exit inner loop

        call GetPointerFromIndex ecx, D@StructSize, D@Array, D@RefIndex | mov ebx eax
        call GetPointerFromIndex edx, D@StructSize, D@Array, D@RefIndex
        call SwapStructures ebx, eax, D@StructSize, D@RefIndex
        inc ecx
        dec edx
        cmp ecx edx | jle @inner_loop

@exit_innerx:
    cmp ecx D@IndexLast | jg @iNxt   ; If ecx < Last jump over ; compare only the addresses and not the values (they are indexes)

    mov eax D@cntr
    mov D$edi+eax*4 ecx
    mov ebx D@IndexLast
    mov D$edi+eax*4+4 ebx
    add D@cntr 2

@iNxt:
    ; retrieve last index from edx
    mov D@IndexLast edx  ; Last  = EDX
    cmp edx D@IndexFirst | jg @outer_loop    ; compare Last & First
    cmp D@cntr 0 | je @qsOut

        sub D@cntr 2
        mov eax D@cntr
        mov ebx D$edi+eax*4 | mov D@IndexFirst ebx
        mov ebx D$edi+eax*4+4 | mov D@IndexLast ebx
jmp @outer_loop

@qsOut:
    call 'oleaut32.SysFreeString' D@hMem

EndP



Proc GetPointerFromIndex:
    Arguments @Index, @StructSize, @Array, @RefIndex

    mov eax D@Index
    imul eax D@StructSize | add eax D@Array | add eax D@RefIndex

EndP



Proc CompareFunc2:
    Arguments @Struct1, @Struct2, @RefIndex, @RefSize
    Uses esi, edi, edx, ecx, ebx

    mov ecx D@Struct1 | mov eax ecx | sub eax D@RefIndex | add eax D@RefSize | mov eax D$eax
    mov edx D@Struct2 | mov ebx edx | sub ebx D@RefIndex | add ebx D@RefSize | mov ebx D$ebx

    If eax > ebx ; If size of struct1 is bigger then size of struct2, we truncate at struct2´s size
        mov eax ebx
    End_If

    c_call 'msvcrt.strncmp' D$ecx, D$edx, eax

; To sort an array in decreasing order, reverse the sense of
; greater than and less than in the comparison function :
;
;  neg     eax

L0:

    ; If Strutc1 > Struct2 , return 1
    ; If Strutc1 = Struct2 , return 0
    ; If Strutc1 < Struct2 , return 0-1

EndP



Proc SwapStructures:
    Arguments @Struct1, @Struct2, @StructSize, @RefIndex
    Uses ecx, edx, eax, ebx

    mov ecx D@Struct1 | mov eax ecx | sub eax D@RefIndex
    mov edx D@Struct2 | mov ebx edx | sub ebx D@RefIndex

    mov ecx D@StructSize | dec ecx
        L1:
            mov dh B$eax+ecx
            mov dl B$ebx+ecx | mov B$eax+ecx dl
            mov B$ebx+ecx dh
        dec ecx | jnl L1<
EndP


usage
[ArrayPtr:
ArrayPtr.Sample1.Data1: D$ 10
ArrayPtr.Sample1.Data2: D$ Test1
ArrayPtr.Sample2.Data1: D$ 10
ArrayPtr.Sample2.Data2: D$ Test2
ArrayPtr.Sample3.Data1: D$ 10
ArrayPtr.Sample3.Data2: D$ Test3]

[Test1: B$ 9, 10, 11, 12, 13, 9, 10, 11, 12, 13]
[Test2: B$ 5, 12, 11, 12, 13, 5, 12, 11, 12, 13]
[Test3: B$ 7, 12, 11, 12, 13, 7, 12, 11, 12, 13]

call nrQsortAEx ArrayPtr, 3, 8, 4, 0


Arrayptr = Pointer to the start of the structure array
3 = total amount of Elements of the Array. So, it is the total mount of structures used on the array
8 = The size of each structure
4 = The Relative Position (An equate only) of the structure that is a pointer to the contents you want to parse (test1, test2, test3)
0 = The Relative Position (An equate only) of the structure that is the size of the contents you want to parse (10, in this case)

Notes:
a) One thing that i believe would make it be a  bit faster is remove the sysalloc/sysfre apis.I understood that this memory is used as a placeholder for the indexes that are being jumped and stored inside the loops. But, there should be a way to calculate the maximum amount of indexes used. For instance, a 3 elements of the array, uses only 3 dwords as a placeholder, whereas the function internally when analyzed through Ida, shows me that the functions in between the loops have the arguments "increased" (after passing through "smartdec" decompiler plugin), meaning that the stack is being counted as many times the looping requires. So, one of my functions (SwapStructures) have 4 parameters, but the decompiler assumes it as having 32. So, 8 times, which may means 8 loops at most.
So, if the multiplicand factor is 8, we may not need to add 40 bytes, but 8*n bytes, where "n" = Count.

This can represent a couple of things....

  • or we need only a global variable as a placeholder for edi, containing, 8 dwords only. (I hope that this the case, so, we can no longer need a sysalloc/sysfree)
  • or....the total size of the variable (hmem) is 8(dwords)*Count
  • or, we can have a limitation of the variable size which i suppose it is 256 dwords, representing 255 bytes (chars 0 to FF) plus 1 for padding. (I hope that this can also be the case, so, we can no longer need a sysalloc/sysfree)
  • or on a worst scenario, we may have a total maximum value of 256*256 dwords as a maximum variable len to be used as the placeholder.
  • or we can build the variable len as the same way as in the crt qsort that uses only 30 dwords as a placeholder for each stack backup. SInce it have 2 stacks (lo and hi) we may need a total of 60 dwords only as a placeholder. If this is the case, we need only to use a local structure whose size is 60*4 bytes long as a placeholder, avoiding the usage of a global variable

b) I´ll need to merge/rebuild the GetPointerFromIndex to be used inline to make it a bit faster.
c) SwapStructures function can be replaced by a faster memory copy algorithm
d) CompareFunc2 function, may be inline, but i´ll need to rework on the strncmp function, replacing the msvcrt api with a built'in function, which i can later make also inline.
e) CompareFunc2 function needs some routine to check when the size of the contents varies. Right now, it is truncating when the contents have different sizes (which is normal as i testes on the regular msvcrt qsort function), but i´ll teste more to see if truncating is ok, or i need to revert the results when sizes of functions are different, but the initial bytes of 2 contents matches.
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

guga

1st tests on removing the sysalloc are working fine. When i create an array formed by 1000 structures, only 18 dwords are used as a placeholder for edi "stack" backup. And the function is real fast even if i´m using that amount of structures to be parsed (each structure have a size of 24 bytes). So, the "placeholder" seems to be growing, but on a very tiny ratio. I´ll try to calculate the ratio of this growth to see what limit can be used.

So far, it seems to be a logaritm ratio.

; 10 structures uses 6 dwords (tested)
; 100 uses 10 dwords = 6*2 (10^2) (tested)
; 1000 uses 18 dwords  = 6*3 (10^3) (tested)
; 10000 uses 24 = 6*4 (10^4) ? (not tested)
; 100000 uses 30 = 6*5 (10^5) ?? (not tested)
; 1000000 uses 36 = 6*6 (10^6) ?? (not tested)
; 10000000 uses 42 = 6*7 (10^7) ?? (not tested)

Once i suceed to make it be faster rewritting those functions, i´ll try to insert the function on a wrking app and will create a script with those amount of structures to test the placeholder ratio. If the assumption that the logaritm ratio is correct, then we can use a limit of 60 dwords only that should be way more then enough for a real life application. (Afterall 60 dwords may represent that it can use and sort 10000000000 = 10^10 structures = 10 Gb ???....Damn, i hope it is correct :icon_mrgreen: :icon_mrgreen:)


Btw...Steve could do this to test....Remove the sysalloc api, and replace it with a local variable/structure of 60 dwords as a placeholder for edi. Can someone test on his version it to see if the assumption is correct ?
This limit may also work on his version, since the version i made, i biased on his and didn´t changed the main logic of the code.

I´m not sure about the masm syntax, but it may be something like:

remove this
nrQsortA proc Arr:DWORD,count:DWORD

(...)

    mov eax, count
    add eax, 40
    mov bLen, eax
    invoke SysAllocStringByteLen,0,bLen
    mov hMem, eax

    mov edi, hMem             ; buffer address in EDI
(...)
    invoke SysFreeString,hMem


and insert on the top of the function this
    LOCAL hMem  :DWORD (60) <---- 60 dwords
(...)
mov edi, hMem             ; buffer address in EDI >---- or a "lea edi, hMem". I don´t remember the masm syntax for the structure/pointer. I mean, edi is a pointer to hMem
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

jj2007

Quote from: guga on September 26, 2014, 08:55:10 AM
    call GetPointerFromIndex edx, D@StructSize, D@Array, D@RefIndex
  ..

Proc GetPointerFromIndex:
    Arguments @Index, @StructSize, @Array, @RefIndex

    mov eax D@Index
    imul eax D@StructSize | add eax D@Array | add eax D@RefIndex

EndP

You should definitely inline this one - your code will be shorter and faster.
Note that imul itself has shorter and faster variants:

   mov eax, someindex  ; 8 bytes
   imul eax, somesize

is (marginally) slower and longer than

   imul eax, someindex, somesize  ; 6 bytes

guga

Hi JJ, i´m currently working on it

Btw, i´m reading your thread about szcmpi and found your version of replacestring that uses MbStrCmp (stringreplace), which seems to be extremelly fast.
http://masm32.com/board/index.php?topic=214.0
http://masm32.com/board/index.php?topic=94.0

For what i saw, MbStrCmp uses SSE2 mnemonics, and a case sensitive flag, but it is for null terminated strings. Do you have a function as fast as this, that works for predefined size of strings ? (When i talk about strings, they are not necessarily be "strings", but any data chunk with a predefined fixed len). Also, that works for different size of the chuncks/strings

Something similar to strncmp, which for VS2010, i decoded as:



; Same size, though :(. Need adapt it to work with different sizes of each string to be compared

Proc strncmp:
    Arguments @First, @Last, @Count
    Local @nCounter
    Uses ebx, esi, edi, ebx, edx

    xor eax eax
    On D@Count = 0, ExitP
    xor ebx ebx
    mov esi D@First | mov edi D@Last
    mov edx D@Count

    ...If edx >= 4

        sub edx 4
        ..While ebx < edx

            movzx eax B$esi
            movzx ecx B$edi

            If_Or eax = 0, eax <> ecx
                sub eax ecx
                ExitP
            End_If
           
            inc esi | movzx eax B$esi
            inc edi | movzx ecx B$edi

            If_Or eax = 0, eax <> ecx
                sub eax ecx
                ExitP
            End_If

            inc esi | movzx eax B$esi
            inc edi | movzx ecx B$edi

            If_Or eax = 0, eax <> ecx
                sub eax ecx
                ExitP
            End_If

            inc esi | movzx eax B$esi
            inc edi | movzx ecx B$edi

            If_Or eax = 0, eax <> ecx
                sub eax ecx
                ExitP
            End_If

            add ebx 4
            sub edx 4

        ..End_While

    ...End_If

    ; residual loop
    mov edx D@Count
    ..While ebx < edx

        inc esi | movzx eax B$esi
        inc edi | movzx ecx B$edi

        If_Or eax = 0, eax <> ecx
            sub eax ecx
            ExitP
        End_If

        inc ebx

    ..End_While

    xor eax eax

EndP
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

guga

Hi guys, can someone test this variation of nrQsortA ?

I guess, i was right about the logarithm proportion of the used memory. So, i removed the SysAlloc Apis, and tested on a 52 element Array. It works and is faster then the original masm version. On my tests it is about 20% faster.

The source code is:


[hMem: D$ 0 #60]

Proc nrQsortAPlus:
    Arguments @Array, @Count
    Local @cntr, @bLen, @hMem, @First, @Last, @temp
    Uses ebx, esi, edi

    mov esi D@Array
    mov D@cntr 0

    mov edi hMem; buffer address in EDI

    ; set First and Last reference values
    mov D@First 0
    mov eax D@Count | dec eax | mov D@Last eax ; Last = count - 1

@outer_loop: B0:
    ; calculate midpoint
    mov eax D@Last | add eax D@First | shr eax 1

    ; midpoint in EBX
    mov ebx D$esi+eax*4 | mov D@temp ebx

    mov ecx D@First
    mov edx D@Last

@inner_loop: D0:
    cmp D$esi+ecx*4 ebx | jge @wl2
    inc ecx
    jmp @inner_loop

@wl2:

    cmp D$esi+edx*4 ebx | jle @wl2Out
    dec edx
jmp @wl2

@wl2Out: E6:
    cmp ecx edx | jg @exit_innerx ; If ecx > edx, exit inner loop

        mov eax D$esi+ecx*4
        mov ebx D$esi+edx*4 ; swap elements
        mov D$esi+ecx*4 ebx
        mov D$esi+edx*4 eax

        mov ebx D@temp  ; restore EBX
        inc ecx
        dec edx
        cmp ecx edx | jle @inner_loop

@exit_innerx:
    cmp ecx D@Last | jg @iNxt   ; If ecx < Last jump over
    mov eax D@cntr
    mov D$edi+eax*4 ecx
    mov ebx D@Last
    mov D$edi+eax*4+04 ebx
    add D@cntr 2

@iNxt:
    mov ebx D@temp  ; restore EBX
    mov D@Last edx  ; Last  = EDX
    cmp edx D@First | jg @outer_loop    ; compare Last & First
    cmp D@cntr 0 | je @qsOut
        sub D@cntr 2
        mov eax D@cntr
        mov ebx D$edi+eax*4
        mov D@First ebx
        mov ebx D$edi+eax*4+04
        mov D@Last ebx
        mov ebx D@temp  ; restore EBX
jmp @outer_loop

@qsOut:

EndP



Example of usage:



[DArrTest1: D$ 9
DArrTest2: D$ 5
DArrTest3: D$ 7
DArrTest4: D$ 10
DArrTest5: D$ 12
DArrTest6: D$ 35
DArrTest7: D$ 48
DArrTest8: D$ 158
DArrTest9: D$ 63
DArrTest10: D$ 147
DArrTest11: D$ 19
DArrTest12: D$ 20
DArrTest13: D$ 18
DArrTest14: D$ 16
DArrTest15: D$ 15
DArrTest16: D$ 14
DArrTest17: D$ 135
DArrTest18: D$ 555
DArrTest19: D$ 540
DArrTest20: D$ 58
DArrTest21: D$ 254
DArrTest22: D$ 245
DArrTest23: D$ 325
DArrTest24: D$ 1259
DArrTest25: D$ 801
DArrTest26: D$ 759
DArrTest27: D$ 659
DArrTest28: D$ 1
DArrTest29: D$ 1259
DArrTest30: D$ 14589
DArrTest31: D$ 359
DArrTest32: D$ 333
DArrTest33: D$ 350
DArrTest34: D$ 312
DArrTest35: D$ 246
DArrTest36: D$ 298
DArrTest37: D$ 290
DArrTest38: D$ 275
DArrTest39: D$ 571
DArrTest40: D$ 1456
DArrTest41: D$ 1546
DArrTest42: D$ 2458
DArrTest43: D$ 2597
DArrTest44: D$ 3257
DArrTest45: D$ 124
DArrTest46: D$ 177
DArrTest47: D$ 176
DArrTest48: D$ 175
DArrTest49: D$ 888
DArrTest50: D$ 801
DArrTest51: D$ 974
DArrTest52: D$ 902]

    call nrQsortA DArrTest1, 52



But, still, it is slower then i thought. So i decided to make some tests on three partition quicksoft algoritm, dualpivot quicksort, and another one that i´m currently working on that seems to be 80-90% faster then the regular nrQsortA. (Well, at least on my preliminary tests).

Once i´m done, i´ll open another thread for testing this new algorithms.


On those tests i´m making i´m having these results using the values of the array i posted above

QuoteIntel(R) Core(TM) i7 CPU 870 @ 2.93GHz
Results 8 pass average
timing nrQsortA 10047 ms <---- Regular Masm lib
timing nrQsortAPlus 8078 ms <---- Guga variation of Regular Masm lib
timing sedgesort Plus 1822 ms <--------------------- AWESOME !!!!!! . I hope this is really true and accurate. !!! Need more tests
timing insortD 1041 ms <--------------------- EXTRAORDINARY !!!!!!. I hope this is really true and accurate. !!! Need more tests
timing insortF 1748 ms <----- Playing with Floats :)
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com