News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests
NB: Posting URL's See here: Posted URL Change

Main Menu

Cycle-count macros for the Microsoft C compilers

Started by MichaelW, June 10, 2012, 12:44:40 PM

Previous topic - Next topic

MichaelW

These macros are substantially different than the similar macros that I posted to this sub-forum on the previous MASM Forum. Where the previous macros captured the lowest cycle count that occurred in any single loop through the block of code, these macros average the cycle count over all of the loops. The main reason for the change was that while my previous macros worked well for the AMD processors and for the Intel processors that preceded and followed the Pentium IV, they were almost useless for the Pentium IV (or any of the NetBurst architecture processors AFAIK), returning cycle counts that were typically a multiple of 4 and with a wide run-to-run variation. In my test of these macros running on a Pentium IV, under Windows XP, and assuming a sufficiently high loop count and a high enough priority, the run-to-run variation was typically no more than a few cycles.


//=============================================================================
// These two macros, which as coded are Microsoft C compiler specific, provide
// a convenient method of measuring the processor clock-cycle count for a block
// of code. The macros must be called in pairs, and the block of code, or a
// call to a procedure containing the block of code, must be placed between the
// counter_begin and counter_end macro calls. The average per-loop cycle count,
// corrected for the loop overhead, is returned in the global variable
// counter_cycles.
//
// I provided access to the process priority class and the thread priority to
// make it possible to operate at the highest possible priority by using the
// combination of REALTIME_PRIORITY_CLASS and THREAD_PRIORITY_TIME_CRITICAL.
// On a multi-core system (or even a P4 with HT) running Windows XP, doing so
// appears to be reasonably safe, even if the code being timed triggers an
// exception. But note the Microsoft warnings about operating at such a high
// priority for more than a very brief interval.
//
// The loops and the cycle-count calculations are done entirely in assembly
// to avoid problems with compiler optimizations breaking the code.
//
// I could not find any way to create unique labels for each macro invocation
// with the preprocessor, so I added a parameter to each of the macros so the
// programmer can supply a unique symbol that is then appended to the labels.
//=============================================================================

__int64 counter_cycles;

int     _counter_loopcount_;
int     _counter_loopcounter_;
DWORD   _process_priority_class_;
int     _thread_priority_;

#define counter_begin( n, loop_count, process_priority, thread_priority ) \
    _counter_loopcount_ = loop_count;                                     \
    _process_priority_class_ = GetPriorityClass(GetCurrentProcess());     \
    _thread_priority_ = GetThreadPriority(GetCurrentThread());           \
    SetPriorityClass(GetCurrentProcess(), process_priority);              \
    SetThreadPriority(GetCurrentThread(), thread_priority);              \
    _counter_loopcounter_ = _counter_loopcount_;                          \
    __asm{ xor eax, eax }                                                 \
    __asm{ cpuid }                                                        \
    __asm{ rdtsc }                                                        \
    __asm{ push edx }                                                     \
    __asm{ push eax }                                                     \
    __asm{ xor eax, eax }                                                 \
    __asm{ cpuid }                                                        \
    __asm{ align 16 }                                                     \
  R##n:                                                                   \
    __asm{ sub _counter_loopcounter_, 1 }                                 \
    __asm{ jnz R##n }                                                     \
    __asm{ xor eax, eax }                                                 \
    __asm{ cpuid }                                                        \
    __asm{ rdtsc }                                                        \
    __asm{ pop ecx }                                                      \
    __asm{ sub eax, ecx }                                                 \
    __asm{ pop ecx }                                                      \
    __asm{ sbb edx, ecx }                                                 \
    __asm{ push edx }                                                     \
    __asm{ push eax }                                                     \
    __asm{ xor eax, eax }                                                 \
    __asm{ cpuid }                                                        \
    __asm{ rdtsc }                                                        \
    __asm{ push edx }                                                     \
    __asm{ push eax }                                                     \
    _counter_loopcounter_ = _counter_loopcount_;                          \
    __asm{ xor eax, eax }                                                 \
    __asm{ cpuid }                                                        \
    __asm{ align 16 }                                                     \
  T##n:

#define counter_end( n )                                                  \
    __asm{ sub _counter_loopcounter_, 1 }                                 \
    __asm{ jnz T##n }                                                     \
    __asm{ xor eax, eax }                                                 \
    __asm{ cpuid }                                                        \
    __asm{ rdtsc }                                                        \
    __asm{ pop ecx }                                                      \
    __asm{ sub eax, ecx }                                                 \
    __asm{ pop ecx }                                                      \
    __asm{ sbb edx, ecx }                                                 \
    __asm{ pop ecx }                                                      \
    __asm{ sub eax, ecx }                                                 \
    __asm{ pop ecx }                                                      \
    __asm{ sbb edx, ecx }                                                 \
    __asm{ mov dword ptr [counter_cycles], eax }                          \
    __asm{ mov dword ptr [counter_cycles+4], edx }                        \
    SetPriorityClass(GetCurrentProcess(),_process_priority_class_);       \
    SetThreadPriority(GetCurrentThread(),_thread_priority_);             \
    counter_cycles /= _counter_loopcount_;


//=============================================================================
#include <windows.h>
#include <conio.h>
#include <stdio.h>
#include "counter_c.c"
//=============================================================================

//----------------------------------------------------------------------------
// Classic binary divide-and-conquer popcount.
// This is popcount_2() from
// http://en.wikipedia.org/wiki/Hamming_weight
//----------------------------------------------------------------------------

__inline int popcount_2( UINT32 x )
{
    UINT32 m1 = 0x55555555;
    UINT32 m2 = 0x33333333;
    UINT32 m4 = 0x0f0f0f0f;
    x -= (x >> 1) & m1;
    x = (x & m2) + ((x >> 2) & m2);
    x = (x + (x >> 4)) & m4;
    x += x >>  8;
    return (x + (x >> 16)) & 0x3f;
}

//----------------------------------------------------------------------------
// Popcount using multiply.
// This is popcount_3() from
// http://en.wikipedia.org/wiki/Hamming_weight
//----------------------------------------------------------------------------

__inline int popcount_mult( UINT32 x )
{
    UINT32 m1 = 0x55555555;
    UINT32 m2 = 0x33333333;
    UINT32 m4 = 0x0f0f0f0f;
    UINT32 h01 = 0x01010101;
    x -= (x >> 1) & m1;   /* put count of each 2 bits into those 2 bits */
    x = (x & m2) + ((x >> 2) & m2);   /* put count of each 4 bits in */
    x = (x + (x >> 4)) & m4;  /* put count of each 8 bits in */
    return (x * h01) >> 24;  /* returns left 8 bits of x + (x<<8) + ... */
}

void main( void )
{
    int i,r;

    SetProcessAffinityMask( GetCurrentProcess(), 1);

    printf( "%d\t%d\n", popcount_2(1), popcount_mult(1) );
    printf( "%d\t%d\n", popcount_2(3), popcount_mult(3) );
    printf( "%d\t%d\n", popcount_2(7), popcount_mult(7) );
    printf( "%d\t%d\n\n", popcount_2(8), popcount_mult(8) );

    Sleep(5000);

    for(  i=0 ; i<4 ; i++ )
    {
    counter_begin( 1, 10000000, REALTIME_PRIORITY_CLASS, THREAD_PRIORITY_TIME_CRITICAL );
    counter_end( 1 )
    printf( "%d cycles\n", counter_cycles );

    counter_begin( 2, 10000000, REALTIME_PRIORITY_CLASS, THREAD_PRIORITY_TIME_CRITICAL );
        r = popcount_2(0x3);
    counter_end( 2 )
    printf( "%d cycles\n", counter_cycles );

    counter_begin( 3, 10000000, REALTIME_PRIORITY_CLASS, THREAD_PRIORITY_TIME_CRITICAL );
        r = popcount_mult(0x3);
    counter_end( 3 )
    printf( "%d cycles\n\n", counter_cycles );
    }
    getch();
}


Compiled with VC Toolkit 2003 compiler, with /O2 /G6, and running on a P3:

1       1
2       2
3       3
1       1

0 cycles
16 cycles
12 cycles

0 cycles
16 cycles
12 cycles

0 cycles
16 cycles
12 cycles

0 cycles
16 cycles
12 cycles


And typical results for the same binary running on a P4 (Northwood):

1       1
2       2
3       3
1       1

2 cycles
9 cycles
4 cycles

0 cycles
9 cycles
4 cycles

0 cycles
9 cycles
4 cycles

0 cycles
9 cycles
4 cycles


Edit: Corrected problems in the GetThreadPriority and SetThreadPriority calls where I specified GetCurrentProcess instead of GetCurrentThread.
Well Microsoft, here's another nice mess you've gotten us into.

Greenhorn

Good work, Michael.  :t
If I have more time in the near future I will test it more intensive.

Agner has also an interesting clock counter for VC/MASM, also for x64 ...
Test programs for measuring clock cycles and performance monitoring

Greenhorn
Kole Feut un Nordenwind gift en krusen Büdel un en lütten Pint.

Gunther

Hi Michael,

rock solid work. Thank you.

Gunther
You have to know the facts before you can distort them.