This code eliminates the C++ stuff, adds some function test code (currently commented out), adds a test of the CRT function, adds a naked function, and removes the prologue and epilogue from the external assembly procedure.
//=============================================================================
#include <windows.h>
#include <conio.h>
#include <stdio.h>
#include "counter_c.c"
//=============================================================================
// Use these for safety on single-core systems.
#define PP HIGH_PRIORITY_CLASS
#define TP THREAD_PRIORITY_NORMAL
// Use these on multi-core systems.
//#define PP REALTIME_PRIORITY_CLASS
//#define TP THREAD_PRIORITY_TIME_CRITICAL
#define LOOPS 10000000
//=============================================================================
// Plain C reference implementation used as the baseline in the timing runs.
// Values in the range 97..122 are shifted down by 32; every other value is
// returned unchanged.
int c_toupper(int c)
{
    enum { RANGE_FIRST = 97, RANGE_LAST = 122, CASE_OFFSET = 32 };

    if (c >= RANGE_FIRST && c <= RANGE_LAST) {
        return c - CASE_OFFSET;
    }
    return c;
}
// Inline-assembly variant of c_toupper: the same 97..122 range test and
// subtract-32 adjustment, written in an __asm block.
// There is no return statement; the result is whatever the block leaves in
// EAX (presumably relying on the compiler's convention of returning values
// in EAX -- confirm against the compiler's inline-assembler documentation).
// NOTE(review): declared to return char while the sibling variants return
// int -- looks unintentional; confirm before relying on results above 127.
char ia_toupper(int c)
{
__asm
{
// Load the argument into EAX.
mov eax, c
// Above 122 (unsigned compare): leave the value unchanged.
cmp eax, 122
ja end
// Below 97 (unsigned compare): leave the value unchanged.
cmp eax, 97
jb end
// In range 97..122: subtract 32.
sub eax, 32
end:
}
}
// Naked variant of the same conversion: the whole function body is the
// __asm block below, including its own ret instruction.
// The argument is read straight from the stack at [esp+4] (presumably the
// first stack argument just above the return address -- this assumes the
// caller passes c on the stack; confirm the calling convention in use).
// The result is left in EAX.
__declspec(naked) int nk_toupper(int c)
{
__asm
{
// Fetch the argument directly from the stack.
mov eax, [esp+4]
// Above 122 (unsigned compare): return the value unchanged.
cmp eax, 122
ja end
// Below 97 (unsigned compare): return the value unchanged.
cmp eax, 97
jb end
// In range 97..122: subtract 32.
sub eax, 32
end:
// Explicit return; nothing is popped here, so the caller is assumed to
// clean up the argument.
ret
}
}
//----------------------------------------------------------------------------
// This is necessary to prevent the optimizer from breaking the counter code.
//----------------------------------------------------------------------------
#pragma optimize("",off)

// Defined outside this file (the external assembly procedure).
int asm_toupper(int c);

// Benchmark driver.
//
// Runs four passes; each pass times an empty section and then each toupper
// variant (CRT toupper, c_toupper, ia_toupper, nk_toupper, asm_toupper) and
// prints the value of counter_cycles after every section.
//
// counter_begin, counter_end and counter_cycles come from counter_c.c, which
// is not shown here; presumably they repeat the enclosed statements LOOPS
// times at the PP/TP priorities and leave a cycle count in counter_cycles --
// confirm against counter_c.c. Note that counter_end is written without a
// trailing semicolon throughout, so it is expected to supply its own.
//
// Each timed section calls the function under test with three inputs:
// 95 (below the 97..122 range), 110 (inside it) and 125 (above it), so all
// three paths through the range check are exercised.
//
// Returns 0 after a key has been pressed.
int main(void)
{
    int i, c;

    // Disabled output check: prints each variant's result side by side.
    /*
    for(i=0;i<200;i++)
    {
    c = rand() >> 8;
    printf("%c",toupper(c));
    printf("%c",c_toupper(c));
    printf("%c",ia_toupper(c));
    printf("%c",nk_toupper(c));
    printf("%c",asm_toupper(c));
    printf("\n");
    }
    */

    // Restrict the process to the first processor (affinity mask 1).
    SetProcessAffinityMask(GetCurrentProcess(),1);

    // Wait five seconds before measuring (presumably to let the system
    // settle after program start -- confirm).
    Sleep(5000);

    for(i=0;i<4;i++)
    {
        // Empty section: baseline with nothing between begin and end.
        counter_begin(1,LOOPS,PP,TP);
        counter_end(1)
        printf( "%d cycles, empty\n", counter_cycles );

        // CRT toupper.
        counter_begin(2,LOOPS,PP,TP);
        c = toupper(95);
        c = toupper(110);
        c = toupper(125);
        counter_end(2)
        printf( "%d cycles, toupper\n", counter_cycles );

        // Plain C version.
        counter_begin(3,LOOPS,PP,TP);
        c = c_toupper(95);
        c = c_toupper(110);
        c = c_toupper(125);
        counter_end(3)
        printf( "%d cycles, c_toupper\n", counter_cycles );

        // Inline-assembly version.
        counter_begin(4,LOOPS,PP,TP);
        c = ia_toupper(95);
        c = ia_toupper(110);
        c = ia_toupper(125);
        counter_end(4)
        printf( "%d cycles, ia_toupper\n", counter_cycles );

        // Naked-function version.
        counter_begin(5,LOOPS,PP,TP);
        c = nk_toupper(95);
        c = nk_toupper(110);
        c = nk_toupper(125);
        counter_end(5)
        printf( "%d cycles, nk_toupper\n", counter_cycles );

        // External assembly procedure.
        counter_begin(6,LOOPS,PP,TP);
        c = asm_toupper(95);
        c = asm_toupper(110);
        c = asm_toupper(125);
        counter_end(6)
        printf( "%d cycles, asm_toupper\n\n", counter_cycles );
    }

    // Keep the console window open until a key is pressed.
    getch();

    // main returns int in a hosted C program; report success explicitly.
    return 0;
}
#pragma optimize("",on)
Results on my P3:
0 cycles, empty
49 cycles, toupper
20 cycles, c_toupper
26 cycles, ia_toupper
20 cycles, nk_toupper
20 cycles, asm_toupper
0 cycles, empty
49 cycles, toupper
20 cycles, c_toupper
27 cycles, ia_toupper
20 cycles, nk_toupper
20 cycles, asm_toupper
0 cycles, empty
49 cycles, toupper
20 cycles, c_toupper
27 cycles, ia_toupper
20 cycles, nk_toupper
20 cycles, asm_toupper
0 cycles, empty
49 cycles, toupper
20 cycles, c_toupper
27 cycles, ia_toupper
20 cycles, nk_toupper
20 cycles, asm_toupper