Qsort function decoded
It is exactly the same as in msvcrt. I tried to optimize it a bit, but the function is a true hell. Can someone help me optimize and simplify it?
; CUTOFF: sub-arrays of at most this many elements are handed to shortsort
; instead of being partitioned (see the "If eax <= CUTOFF" test in qsort).
[CUTOFF 8]
; lostk / histk: two tables declared with a repeat count of 30. qsort indexes
; both with its @stkptr local to remember the low and high bounds of
; sub-arrays that are still waiting to be sorted.
; NOTE(review): these are module-level data rather than locals, so every
; call of qsort shares them -- confirm that two sorts can never overlap.
[lostk: D$ 0 #30
histk: D$ 0 #30]
; qsort: sorts in place the array of @num elements, each @width bytes wide,
; that starts at @base. @compare is called with two element pointers and its
; result in eax is tested against zero to decide the ordering.
;
; Register roles inside this procedure:
;   ebx = low bound, then ascending scan pointer (advanced by Routine1)
;   esi = high bound, then descending scan pointer
;   edi = pointer to the partitioning element (set by "mov edi eax" below)
; Sub-arrays still waiting to be sorted are stored in the lostk/histk tables,
; indexed by @stkptr.
; NOTE(review): the @mid local is declared but never referenced here.
; NOTE(review): every "call D@compare" pushes two arguments; whether they are
; released before EndP depends on the callback's calling convention -- confirm.
Proc qsort:
Arguments @base, @num, @width, @compare
Local @stkptr, @lo, @hi, @mid
Uses esi, edi, ebx
; * Note: the number of stack entries required is no more than
; 1 + log2(size), so 30 is sufficient for any array
; Nothing to do for fewer than two elements or a zero element width.
If_Or D@num < 2, D@width = 0
ExitP
End_If
mov D@stkptr 0 ; initialize stack
mov esi D@num
mov edi D@width
mov ebx D@base
dec esi | imul esi edi | add esi ebx ; esi = ((esi-1)*edi)+ebx
mov D@lo ebx
mov D@hi esi
; This entry point is for pseudo-recursion: setting ebx/esi (and @lo/@hi)
; and jumping here acts like a recursive call. @stkptr is preserved across
; the jump; the pending bounds live in lostk/histk.
@Recursive2:
; number of elements to sort
mov eax esi | sub eax ebx | xor edx edx | div D@width | inc eax ; eax = ((eax-ebx)/edi)+1 ; (((A-C)/B)+1)
; on 1st loop, eax = esi. http://www.wolframalpha.com/input/?i=%28%28%28%28%28A-C%29%2FB%29%2B1%29-1%29*B%29%2BC
; Small sub-arrays go to shortsort. Note the argument order: the high bound
; (esi) is passed first and the low bound (ebx) second.
...If eax <= CUTOFF
call shortsort esi, ebx, D@width, D@compare
...Else
; Pick the partitioning element: edi = ebx + (count / 2) * width, i.e. the
; middle element of the sub-array.
shr eax 1 | imul eax D@width | add eax ebx | mov edi eax ; find middle element
; The three conditional swaps below each exchange their two elements when
; compare returns > 0, so afterwards the elements at ebx (low), edi (middle)
; and esi (high) are in non-decreasing order according to @compare.
call Swap ebx, edi, D@width, D@compare
call Swap ebx, esi, D@width, D@compare
call Swap edi, esi, D@width, D@compare
; Routine1 runs the partition loop directly on ebx / esi / edi and returns
; once ebx has moved past esi.
call Routine1 D@hi, D@width, D@compare
; Step esi back up by one element, then walk it downwards past every element
; for which compare(esi, edi) returns 0.
add esi D@width
cmp edi esi | jae D9> ; Code077C37107
; Case edi < esi: scan down while esi stays above edi.
Do
sub esi D@width
cmp esi edi | jbe D9> ; Code077C37107
call D@compare esi, edi
Loop_Until eax <> 0
cmp edi esi | jb G3> ; Code077C3711F
; Case esi has reached or passed edi: keep scanning down while esi stays
; above @lo.
D9:
Do
mov eax D@lo | sub esi D@width;edx
cmp esi eax | jbe G6> ; Code077C37122
call D@compare esi, edi
Loop_Until eax <> 0
G3:
mov eax D@lo
; Here eax = @lo and edx = @hi. The two remaining pieces are [eax, esi] and
; [ebx, edx]. ecx = edx - ebx and edi = esi - eax are their byte extents.
G6:
mov edx D@hi
mov ecx edx | sub ecx ebx
mov edi esi | sub edi eax
cmp edi ecx | jl M4> ; Code077C3715C
; Lower piece is not the smaller one: save it (when eax < esi) and continue
; with the upper piece [ebx, @hi].
If eax < esi
mov ecx D@stkptr
mov D$lostk+ecx*4 eax
mov D$histk+ecx*4 esi
inc D@stkptr
End_If
; NOTE(review): edi is loaded with @width here and below, but it is
; overwritten by "mov edi eax" before any later read on the @Recursive2 path.
mov edi D@width
cmp ebx edx | jae A7> ; Code077C37187
mov esi D@hi | mov edi D@width | mov D@lo ebx
jmp @Recursive2
; Upper piece is the larger one: save it (when ebx < edx) and continue with
; the lower piece [@lo, esi].
M4:
If ebx < edx
mov ecx D@stkptr
mov D$lostk+ecx*4 ebx
mov D$histk+ecx*4 edx
inc D@stkptr
End_If
cmp eax esi | jae A7> ; Code077C37187
mov ebx D@lo | mov edi D@width | mov D@hi esi
jmp @Recursive2
...End_If
; Current sub-array is done. Pop the next saved pair of bounds, or leave when
; @stkptr goes negative (nothing left to sort).
A7:
dec D@stkptr | js E8> ; Code077C371B0
mov eax D@stkptr
mov edx D$lostk+eax*4
mov eax D$histk+eax*4
mov D@lo edx
mov D@hi eax
mov esi eax ; hi
mov ebx edx ; low
jmp @Recursive2
E8:
EndP
; shortsort: sorts a short sub-array in place by repeatedly finding the
; largest remaining element and exchanging it with the last one.
;
; IMPORTANT: the argument names are the reverse of their contents. The only
; call site in this file (in qsort) passes the upper bound first and the
; lower bound second, so:
;   @lo    = pointer to the LAST element of the sub-array
;   @hi    = pointer to the FIRST element of the sub-array
;   @width = element size in bytes
;   @comp  = compare callback, called with two element pointers
; Locals:
;   @TmpLo = current end of the unsorted part (starts at @lo, moves down)
;   @pChar = first scan candidate, i.e. @hi + @width
Proc shortsort:
Arguments @lo, @hi, @width, @comp
Local @pChar, @TmpLo
Uses edi, ecx, esi, ebx
mov ecx D@hi
mov edi D@lo | mov D@TmpLo edi
; Only enter the loop when the end pointer is above the start pointer.
..If D@lo > ecx
mov edx D@width | lea eax D$ecx+edx | mov D@pChar eax
.Do
; ebx = pointer to the largest element seen so far (starts at the first
; element), esi = scan pointer (starts at the second element).
mov esi D@pChar
mov ebx ecx
.If esi <= D@TmpLo
Do
call D@comp esi, ebx
; A positive (signed) result means the element at esi is larger.
If eax >s 0
mov ebx esi
End_If
add esi D@width
Loop_Until esi > D@TmpLo
.End_If
; Exchange the largest element with the one at @TmpLo, byte by byte, unless
; it is already there. eax walks the @TmpLo element, ecx is the byte distance
; from it to the largest element, esi counts the remaining bytes.
.If_And ebx <> D@TmpLo, D@width <> 0
mov eax D@TmpLo
mov ecx ebx | sub ecx D@TmpLo | mov esi D@width
L1:
mov dh B$ecx+eax
mov dl B$eax | mov B$ecx+eax dl
mov B$eax dh
inc eax
dec esi | jne L1<
.End_If
; Shrink the unsorted part by one element. ecx is reloaded with the start
; pointer because the callback and the exchange loop may have changed it.
mov edi D@TmpLo | sub edi D@width | mov D@TmpLo edi
mov ecx D@hi
.Loop_Until edi <= D@hi
..End_If
EndP
; Swap: conditionally exchanges two array elements.
;   @Struct1, @Struct2 = pointers to the two elements
;   @Width             = element size in bytes
;   @compare           = callback, called as compare(@Struct1, @Struct2)
; When both pointers are equal nothing is done and the callback is not
; called. Otherwise the callback is invoked and, if its result is greater
; than zero (signed), the @Width bytes at the two addresses are exchanged one
; byte at a time. eax is not preserved.
;
; The early-out uses its own exit label (L9). It used to be "jz L1>", which
; resolves to the nearest following L1 -- the byte-loop label -- and would
; have entered the loop with eax and ecx uninitialised.
Proc Swap:
Arguments @Struct1, @Struct2, @Width, @compare
Local @Distance
Uses ebx, ecx, edx
; ebx = byte distance from @Struct2 to @Struct1; zero means same element.
mov ebx D@Struct1 | sub ebx D@Struct2 | jz L9> | mov D@Distance ebx
call D@compare D@Struct1, D@Struct2
.If eax >s 0
; eax walks @Struct2, eax+ebx is the matching byte of @Struct1, ecx counts
; the bytes left to exchange.
mov ebx D@Distance
mov eax D@Struct2
mov ecx D@Width
L1:
mov dh B$eax+ebx
mov dl B$eax | mov B$eax+ebx dl
mov B$eax dh
inc eax
dec ecx | jne L1<
.End_If
L9:
EndP
; Routine1: the partition loop of qsort, split out as its own procedure.
; It works directly on the caller's registers and changes them, so they must
; not be saved/restored here:
;   ebx = ascending scan pointer
;   esi = descending scan pointer
;   edi = pointer to the partitioning element
; Arguments:
;   @hi      = upper bound of the sub-array being partitioned
;   @width   = element size in bytes
;   @compare = callback, called with two element pointers
; The @mid local is only used as the byte counter of the exchange loop.
; The procedure returns once ebx has moved above esi. eax, ecx and edx are
; also modified.
Proc Routine1:
Arguments @hi, @width, @compare
Local @mid
;..While edi > ebx ; jna
@Recursive1:
; While ebx is still below the partitioning element: advance ebx until it
; reaches edi or compare(ebx, edi) returns a positive value.
cmp edi ebx | jbe L1>
Do
add ebx D@width
cmp ebx edi | jae L1>;G4> ; Code077C37080
call D@compare ebx, edi
Loop_Until eax >s 0 ; jle ; i inverted it accidentally. it must loop only if it is smaller or equal to zero
; Once ebx is at or above edi: keep advancing until ebx passes @hi or
; compare(ebx, edi) returns a positive value. L1 is also entered directly by
; the two jumps above, bypassing the "...If" test.
...If edi <= ebx
L1:
Do
mov ecx D@width
mov eax D@hi
add ebx ecx
cmp ebx eax | ja L2>;I8> ; Code077C37098
call D@compare ebx, edi
Loop_Until eax >s 0
...End_If
; Move esi down until it reaches edi or compare(esi, edi) returns a value
; that is zero or negative.
L2:
Do
sub esi D@width
cmp esi edi | jbe L3>;K7> ; Code077C370AB
call D@compare esi, edi
Loop_Until eax <s= 0
; The scan pointers have crossed: partitioning is finished.
L3:
cmp ebx esi | ja L7>;A5> ; Code077C370E5
; Exchange the elements at ebx and esi byte by byte. eax walks the esi
; element, ecx is the byte distance up to the ebx element, @mid counts the
; bytes left (edx cannot be the counter because dl/dh carry the data).
If ebx <> esi
mov eax esi
mov edx D@width
mov ecx ebx | sub ecx esi | mov D@mid edx
L0:
mov dh B$eax+ecx
mov dl B$eax | mov B$eax+ecx dl
mov B$eax dh
;mov dl B$eax | mov B$ecx+eax dl
inc eax
dec D@mid | jnz L0<
End_If
; If the partitioning element was the one at esi, the exchange moved it to
; ebx, so edi follows it.
If edi = esi
mov edi ebx
End_If
jmp @Recursive1
; ..End_While
L7:
EndP
Note: I really don't think the lostk and histk tables are necessary. They don't seem to be needed once the code is properly reorganized and optimized. The C source says that they are used for stack preservation, but preserving the stack doesn't seem to be what is going on here.
I succeeded in decoding it in such a way that the user's compare function no longer needs to use the stdcall calling convention.
But, due to the re-entrances of the function and its several loops, I seriously doubt that this can be called "fast" by any means. There should be a way to optimize this beast.
Here is the original C source (from WinNT; the actual version on WinXP seems to be a tiny variation of it):
/***
*qsort.c - quicksort algorithm; qsort() library function for sorting arrays
*
* Copyright (c) 1985-1991, Microsoft Corporation. All rights reserved.
*
*Purpose:
* To implement the qsort() routine for sorting arrays.
*
*Revision History:
* 06-22-84 RN author
* 03-25-85 RN added pre-check for elements already in order to
* eliminate worst-case behavior.
* 05-18-86 TC changed to recurse on the smallest piece to avoid
* unnecessary stack usage, and to iterate on largest
* piece.
* 01-09-87 BCM fixed huge-array case where (num-1) * wid computation
* was overflowing (large/compact models only)
* 06-13-89 PHG made more efficient, many more comments, removed
* recursion
* 10-30-89 JCR Added _cdecl to prototypes
* 03-15-90 GJF Replaced _cdecl with _CALLTYPE1 and added #include
* <cruntime.h>. Also, fixed the copyright.
* 04-05-90 GJF Made shortsort() and swap() _CALLTYPE4. Also, added
* #include <search.h>.
* 10-04-90 GJF New-style function declarators.
* 12-28-90 SRW Added _CRUISER_ conditional around check_stack pragmas
* 01-24-91 SRW Added missing close comment in swap procedure
* 11-19-91 GJF Do the swap one character at a time to avoid alignment
* woes.
*
*******************************************************************************/
#include <cruntime.h>
#include <stdlib.h>
#include <search.h>
/* prototypes for local routines: shortsort() sorts a short sub-array between
lo and hi (inclusive); swap() exchanges two elements of the given width */
static void _CALLTYPE4 shortsort(char *lo, char *hi, unsigned width,
int (_CALLTYPE1 *comp)(const void *, const void *));
static void _CALLTYPE4 swap(char *p, char *q, unsigned int width);
/* this parameter defines the cutoff between using quick sort and
shortsort() for arrays; arrays with lengths shorter or equal to the
below value are sorted by shortsort(), which repeatedly moves the largest
remaining element to the end of the range */
#define CUTOFF 8 /* testing shows that this is good value */
/***
*qsort(base, num, wid, comp) - quicksort function for sorting arrays
*
*Purpose:
* quicksort the array of elements
* side effects: sorts in place
*
*Entry:
* char *base = pointer to base of array
* unsigned num = number of elements in the array
* unsigned width = width in bytes of each array element
* int (*comp)() = pointer to function returning analog of strcmp for
* strings, but supplied by user for comparing the array elements.
* it accepts 2 pointers to elements and returns neg if 1<2, 0 if
* 1=2, pos if 1>2.
*
*Exit:
* returns void
*
*Exceptions:
*
*******************************************************************************/
#ifdef _CRUISER_
#pragma check_stack(on) /* lots of locals */
#endif /* ndef _CRUISER_ */
/* sort the array between lo and hi (inclusive) */
void _CALLTYPE1 qsort (
    void *base,
    unsigned num,
    unsigned width,
    int (_CALLTYPE1 *comp)(const void *, const void *)
    )
{
    char *first, *last;         /* inclusive bounds of the current sub-array */
    char *pivot;                /* middle element chosen for partitioning */
    char *up, *down;            /* scan pointers used while partitioning */
    unsigned count;             /* number of elements in the sub-array */
    char *saved_first[30];      /* bounds of sub-arrays postponed for later; */
    char *saved_last[30];       /* 1 + log2(num) entries are enough */
    int depth = 0;              /* number of postponed sub-arrays */

    /* zero or one element, or zero-sized elements: already "sorted" */
    if (width == 0 || num < 2)
        return;

    first = (char *)base;
    last = first + width * (num - 1);

    /* Every pass of this loop sorts, or partitions, [first, last].
       "continue" plays the role of a recursive call on new bounds; the
       explicit stack above keeps the pieces that are postponed. */
    for (;;) {
        count = (last - first) / width + 1;

        if (count <= CUTOFF) {
            /* short range: the simple O(n^2) routine is faster */
            shortsort(first, last, width, comp);
        }
        else {
            /* Use the middle element as partitioning value: the first
               element would behave badly on already-sorted input. Park it
               at the front of the range while partitioning. */
            pivot = first + (count / 2) * width;
            swap(pivot, first, width);

            up = first;
            down = last + width;

            /* up only ever increases and down only ever decreases, so the
               loop terminates. At the top of each pass:
               A[i] <= A[first] for first <= i <= up,
               A[i] >= A[first] for down <= i <= last. */
            for (;;) {
                /* advance up to the first element greater than the
                   partitioning value, or just past the end */
                up += width;
                while (up <= last && comp(up, first) <= 0)
                    up += width;

                /* retreat down to the first element smaller than the
                   partitioning value, or to the start */
                down -= width;
                while (down > first && comp(down, first) >= 0)
                    down -= width;

                if (down < up)
                    break;

                /* both pointers stopped on misplaced elements: exchange */
                swap(up, down, width);
            }

            /* move the partitioning element into its final slot */
            swap(first, down, width);

            /* Now A[i] <= A[down] for first <= i < down and
               A[i] >= A[down] for up <= i <= last. What remains is
               [first, down - width] and [up, last]; pieces shorter than
               two elements are skipped. The larger piece is postponed and
               the smaller one handled first, to bound the stack depth. */
            if (down - 1 - first >= last - up) {
                if (first + width < down) {
                    saved_first[depth] = first;
                    saved_last[depth] = down - width;
                    ++depth;
                }
                if (up < last) {
                    first = up;
                    continue;
                }
            }
            else {
                if (up < last) {
                    saved_first[depth] = up;
                    saved_last[depth] = last;
                    ++depth;
                }
                if (first + width < down) {
                    last = down - width;
                    continue;
                }
            }
        }

        /* current range is finished: resume a postponed one, if any */
        if (--depth < 0)
            return;
        first = saved_first[depth];
        last = saved_last[depth];
    }
}
#ifdef _CRUISER_
#pragma check_stack() /* revert to command line behaviour */
#endif /* ndef _CRUISER_ */
/***
*shortsort(lo, hi, width, comp) - insertion sort for sorting short arrays
*
*Purpose:
* sorts the sub-array of elements between lo and hi (inclusive)
* side effects: sorts in place
* assumes that lo < hi
*
*Entry:
* char *lo = pointer to low element to sort
* char *hi = pointer to high element to sort
* unsigned width = width in bytes of each array element
* int (*comp)() = pointer to function returning analog of strcmp for
* strings, but supplied by user for comparing the array elements.
* it accepts 2 pointers to elements and returns neg if 1<2, 0 if
* 1=2, pos if 1>2.
*
*Exit:
* returns void
*
*Exceptions:
*
*******************************************************************************/
static void _CALLTYPE4 shortsort (
    char *lo,
    char *hi,
    unsigned width,
    int (_CALLTYPE1 *comp)(const void *, const void *)
    )
{
    char *scan, *largest;

    /* Each pass finds the largest element of [lo, hi], moves it to hi and
       then shrinks the range by one element from the top. Everything above
       hi is therefore already in its final, sorted position. */
    for ( ; hi > lo; hi -= width) {
        largest = lo;

        /* after looking at scan, A[largest] is the maximum of [lo, scan] */
        scan = lo + width;
        while (scan <= hi) {
            if (comp(scan, largest) > 0)
                largest = scan;
            scan += width;
        }

        /* put the maximum at the end of the unsorted range */
        swap(largest, hi, width);
    }
}
/***
*swap(a, b, width) - swap two elements
*
*Purpose:
* swaps the two array elements of size width
*
*Entry:
* char *a, *b = pointer to two elements to swap
* unsigned width = width in bytes of each array element
*
*Exit:
* returns void
*
*Exceptions:
*
*******************************************************************************/
static void _CALLTYPE4 swap (
    char *a,
    char *b,
    unsigned width
    )
{
    unsigned i;
    char held;

    /* same element: nothing to exchange */
    if (a == b)
        return;

    /* exchange one character at a time so that element alignment never
       matters */
    for (i = 0; i < width; ++i) {
        held = a[i];
        a[i] = b[i];
        b[i] = held;
    }
}