I tried to post this in the PB new forum but something is broken in the login and I cannot log into the PB forum so I am posting it here. It is a comparison between a normal stack frame algorithm to count set bits and a stripped down version that uses a FASTCALL format to reduce the overhead. The algorithm is identical in both versions, the difference being the removal of stack overhead and passing the argument directly in the EAX register.
' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
#include "\basic\include\win32api.inc"
' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
FUNCTION PBmain as LONG
#REGISTER NONE
LOCAL var as DWORD
! mov edi, 8
looplbl:
' -----------------------------
GetTickCount
! push eax
! mov esi, 500000000
lbl1:
bitcount1(&H11111111) ' call the function
! mov var, eax ' save the return value in EAX
! sub esi, 1
! jnz lbl1
GetTickCount
! pop ecx
! sub eax, ecx
! mov var, eax
SleepEx 100,0
StdOut format$(var)+" ms normal stack frame"
' -----------------------------
GetTickCount
! push eax
! mov esi, 500000000
lbl2:
! mov eax, &H11111111
! call bitcount0
! mov var, eax ' save the return value in EAX
! sub esi, 1
! jnz lbl2
GetTickCount
! pop ecx
! sub eax, ecx
! mov var, eax
SleepEx 100,0
StdOut format$(var)+" ms FASTPROC"
' -----------------------------
! sub edi, 1
! jnz looplbl
StdOut $CRLF+"Press any key to exit"
waitkey$
End FUNCTION
' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
FASTPROC bitcount0
PREFIX "!"
; -------------------------------
; argument passed in EAX register
; -------------------------------
mov edx,eax
shr eax,1
and eax,&h055555555
sub edx,eax
mov eax,edx
shr edx,2
and eax,&h033333333
and edx,&h033333333
add eax,edx
mov edx,eax
shr eax,4
add eax,edx
and eax,&h00f0f0f0f
imul eax,&h01010101
shr eax,24
ret
END PREFIX
END FASTPROC
' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
FASTPROC bitcount1 ( BYVAL v&) AS LONG 'counts bits of a long
!MOV EDX,v&
!SHR v&,1
!AND v&,&h055555555
!SUB EDX,v&
!MOV v&,EDX
!SHR EDX,2
!AND v&,&h033333333
!AND EDX,&h033333333
!ADD v&,EDX
!MOV EDX,v&
!SHR v&,4
!ADD v&,EDX
!AND v&,&h00F0F0F0F
!IMUL v&,&h01010101
!SHR v&,24
'!mov v&,eax
END FASTPROC=v&
' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
This is the resulting timings on my new i7.
1235 ms normal stack frame
937 ms FASTPROC
1281 ms normal stack frame
922 ms FASTPROC
1235 ms normal stack frame
921 ms FASTPROC
1188 ms normal stack frame
954 ms FASTPROC
1266 ms normal stack frame
953 ms FASTPROC
1219 ms normal stack frame
953 ms FASTPROC
1281 ms normal stack frame
938 ms FASTPROC
1219 ms normal stack frame
953 ms FASTPROC
Press any key to exit
Core i5 on Win7-64:1731 ms normal stack frame
1357 ms FASTPROC
1747 ms normal stack frame
1342 ms FASTPROC
1732 ms normal stack frame
1357 ms FASTPROC
1794 ms normal stack frame
1341 ms FASTPROC
1732 ms normal stack frame
1373 ms FASTPROC
1810 ms normal stack frame
1357 ms FASTPROC
1762 ms normal stack frame
1372 ms FASTPROC
1762 ms normal stack frame
1357 ms FASTPROC
Intel(R) Core(TM) i7-4930K CPU @ 3.40GHz Win8.1 64
1328 ms normal stack frame
1188 ms FASTPROC
1328 ms normal stack frame
1203 ms FASTPROC
1343 ms normal stack frame
1203 ms FASTPROC
1343 ms normal stack frame
1187 ms FASTPROC
1343 ms normal stack frame
1187 ms FASTPROC
1343 ms normal stack frame
1203 ms FASTPROC
1343 ms normal stack frame
1203 ms FASTPROC
1344 ms normal stack frame
1188 ms FASTPROC
AMD
2421 ms normal stack frame
1891 ms FASTPROC
2454 ms normal stack frame
1859 ms FASTPROC
2485 ms normal stack frame
1859 ms FASTPROC
2484 ms normal stack frame
2000 ms FASTPROC
2625 ms normal stack frame
2000 ms FASTPROC
2844 ms normal stack frame
1875 ms FASTPROC
2453 ms normal stack frame
1844 ms FASTPROC
2438 ms normal stack frame
1906 ms FASTPROC
Core i7-4800MQ @2.7Ghz on Win8.1, 64
1157 ms normal stack frame
907 ms FASTPROC
1157 ms normal stack frame
891 ms FASTPROC
1172 ms normal stack frame
891 ms FASTPROC
1141 ms normal stack frame
875 ms FASTPROC
1172 ms normal stack frame
891 ms FASTPROC
1172 ms normal stack frame
891 ms FASTPROC
1188 ms normal stack frame
906 ms FASTPROC
1172 ms normal stack frame
891 ms FASTPROC
Intel(R) Pentium(R) Dual CPU E2160 @ 1.80GHz, under wine
5132 ms normal stack frame
3501 ms FASTPROC
3597 ms normal stack frame
2416 ms FASTPROC
3601 ms normal stack frame
2415 ms FASTPROC
3601 ms normal stack frame
2414 ms FASTPROC
3600 ms normal stack frame
2414 ms FASTPROC
3599 ms normal stack frame
2414 ms FASTPROC
3601 ms normal stack frame
2414 ms FASTPROC
3600 ms normal stack frame
2414 ms FASTPROC
P-III Win 2000
F:\TEMP>bmark
13659 ms normal stack frame
9104 ms FASTPROC
13629 ms normal stack frame
9644 ms FASTPROC
14261 ms normal stack frame
9484 ms FASTPROC
18196 ms normal stack frame
10946 ms FASTPROC
16804 ms normal stack frame
13149 ms FASTPROC
16504 ms normal stack frame
9103 ms FASTPROC
13619 ms normal stack frame
9103 ms FASTPROC
13650 ms normal stack frame
9784 ms FASTPROC
Press any key to exit
i3 Win 8.1
2391 ms normal stack frame
1797 ms FASTPROC
2485 ms normal stack frame
1828 ms FASTPROC
2406 ms normal stack frame
1797 ms FASTPROC
2422 ms normal stack frame
1797 ms FASTPROC
2422 ms normal stack frame
1797 ms FASTPROC
2390 ms normal stack frame
1782 ms FASTPROC
2391 ms normal stack frame
1781 ms FASTPROC
1781 ms FASTPROC
Press any key to exit
Edit: Win 98 Not Supported.