GetPixel slow on recent versions of Windows?

MichaelW · July 21, 2016, 03:56:47 AM

Saw this thread on the FreeBASIC forum, so I created a MASM app to test GetPixel on my notebook, originally running Windows 8.1, but now running 10.

Code Select


;==============================================================================
; Build as a console app so printf will work.
;==============================================================================
  ; ----------------------------------------------------------------------
  ; The essential differences between these macros and the previous macros
  ; are that these save and restore the original priorities, and provide
  ; a way to control the thread priority. Control of the thread priority
  ; allows timing code at the highest possible priority by combining
  ; REALTIME_PRIORITY_CLASS with THREAD_PRIORITY_TIME_CRITICAL.
  ; ----------------------------------------------------------------------

    counter_begin MACRO loopcount:REQ, process_priority:REQ, thread_priority
        LOCAL label

        IFNDEF __counter__qword__count__
          .data
          ALIGN 8             ;; Optimal alignment for QWORD
            __counter__qword__count__  dq 0
            __counter__loop__count__   dd 0
            __counter__loop__counter__ dd 0
            __process_priority_class__ dd 0
            __thread_priority__        dd 0
            __current_process__        dd 0
            __current_thread__         dd 0
          .code
        ENDIF

        mov __counter__loop__count__, loopcount
        invoke GetCurrentProcess
        mov __current_process__, eax
        invoke GetPriorityClass, __current_process__
        mov __process_priority_class__, eax
        invoke SetPriorityClass, __current_process__, process_priority
        IFNB <thread_priority>
            invoke GetCurrentThread
            mov __current_thread__, eax
            invoke GetThreadPriority, __current_thread__
            mov __thread_priority__, eax
            invoke SetThreadPriority, __current_thread__, thread_priority
        ENDIF
        xor eax, eax          ;; Use same CPUID input value for each call
        cpuid                 ;; Flush pipe & wait for pending ops to finish
        rdtsc                 ;; Read Time Stamp Counter

        push edx              ;; Preserve high-order 32 bits of start count
        push eax              ;; Preserve low-order 32 bits of start count
        mov   __counter__loop__counter__, loopcount
        xor eax, eax
        cpuid                 ;; Make sure loop setup instructions finish
      ALIGN 16                ;; Optimal loop alignment for P6
      @@:                     ;; Start an empty reference loop
        sub __counter__loop__counter__, 1
        jnz @B

        xor eax, eax
        cpuid                 ;; Make sure loop instructions finish
        rdtsc                 ;; Read end count
        pop ecx               ;; Recover low-order 32 bits of start count
        sub eax, ecx          ;; Low-order 32 bits of overhead count in EAX
        pop ecx               ;; Recover high-order 32 bits of start count
        sbb edx, ecx          ;; High-order 32 bits of overhead count in EDX
        push edx              ;; Preserve high-order 32 bits of overhead count
        push eax              ;; Preserve low-order 32 bits of overhead count

        xor eax, eax
        cpuid
        rdtsc
        push edx              ;; Preserve high-order 32 bits of start count
        push eax              ;; Preserve low-order 32 bits of start count
        mov   __counter__loop__counter__, loopcount
        xor eax, eax
        cpuid                 ;; Make sure loop setup instructions finish
      ALIGN 16                ;; Optimal loop alignment for P6
      label:                  ;; Start test loop
        __counter__loop__label__ equ <label>
    ENDM

    counter_end MACRO
        LOCAL lbl
        sub __counter__loop__counter__, 1
        jnz  __counter__loop__label__

        xor eax, eax
        cpuid                 ;; Make sure loop instructions finish
        rdtsc                 ;; Read end count
        pop ecx               ;; Recover low-order 32 bits of start count
        sub eax, ecx          ;; Low-order 32 bits of test count in EAX
        pop ecx               ;; Recover high-order 32 bits of start count
        sbb edx, ecx          ;; High-order 32 bits of test count in EDX
        pop ecx               ;; Recover low-order 32 bits of overhead count
        sub eax, ecx          ;; Low-order 32 bits of adjusted count in EAX
        pop ecx               ;; Recover high-order 32 bits of overhead count
        sbb edx, ecx          ;; High-order 32 bits of adjusted count in EDX

        mov DWORD PTR __counter__qword__count__, eax
        mov DWORD PTR __counter__qword__count__ + 4, edx

        invoke SetPriorityClass,__current_process__,__process_priority_class__
        IFNB <thread_priority>
            invoke SetThreadPriority, __current_thread__, __thread_priority__
        ENDIF

        finit
        fild __counter__qword__count__
        fild __counter__loop__count__
        fdiv
        fistp __counter__qword__count__

        mov eax, DWORD PTR __counter__qword__count__
    ENDM
    
;==============================================================================

include \masm32\include\masm32rt.inc
.686

;==============================================================================

.data
    hdc     dd 0            ; DC handle stored by DialogProc on WM_INITDIALOG
.code

;==============================================================================

;------------------------------------------------------------------------------
; DialogProc - dialog procedure for the test dialog.
;
;   WM_INITDIALOG   get the dialog's DC and keep it in hdc.
;   WM_LBUTTONDOWN  time 10000 calls of GetPixel(hdc, 0, 0) and print the
;                   resulting cycle count with printf.
;   WM_COMMAND      on IDCANCEL, release the DC and end the dialog.
;   WM_CLOSE        release the DC and end the dialog.
;
; Always returns 0 in EAX.
;
; EBX is listed in USES because counter_begin/counter_end execute CPUID,
; which overwrites EBX, and a window/dialog callback must hand EBX back to
; the system unchanged.
;------------------------------------------------------------------------------
DialogProc proc uses ebx hwndDlg:HWND,uMsg:UINT,wParam:WPARAM,lParam:LPARAM
    SWITCH uMsg
        CASE WM_INITDIALOG
            invoke GetDC, hwndDlg
            mov hdc, eax
        CASE WM_LBUTTONDOWN
            ; Timed region: counter_end leaves the cycle count in EAX.
            counter_begin 10000,REALTIME_PRIORITY_CLASS,THREAD_PRIORITY_TIME_CRITICAL
                invoke GetPixel, hdc, 0, 0
            counter_end
            printf("%d cycles\n", eax)
        CASE WM_COMMAND
            ; NOTE(review): this compares the whole wParam against IDCANCEL,
            ; not just its low word - confirm that is intended.
            SWITCH wParam
                CASE IDCANCEL
                    invoke ReleaseDC, hwndDlg, hdc
                    invoke EndDialog, hwndDlg, 0
            ENDSW
        CASE WM_CLOSE
            invoke ReleaseDC, hwndDlg, hdc
            invoke EndDialog, hwndDlg, 0
    ENDSW
    return 0
DialogProc endp

;==============================================================================
start:
;==============================================================================

    ; Restrict the process to the CPU selected by affinity mask 1 (bit 0).
    ; Presumably this keeps the RDTSC readings on a single core - confirm.
    ; The earlier stand-alone GetCurrentProcess call was dropped: its result
    ; was discarded and rv() already performs the call here.
    invoke SetProcessAffinityMask, rv(GetCurrentProcess), 1

    ; Build the in-memory dialog template for the test window.
    Dialog "Test", \
           "MS Sans Serif",8, \
            WS_OVERLAPPEDWINDOW or DS_CENTER, \
            0, \
            0,0,400,300, \
            1024

    ; Run the dialog modally with DialogProc as its procedure.
    CallModalDialog rv(GetModuleHandle, NULL), 0, DialogProc, NULL

    exit
;==============================================================================
end start

I don't have any other system to test ATM, but these are typical results on my laptop, with a 2.16 GHz Celeron:

Code Select


287777 cycles
286302 cycles
287172 cycles
287883 cycles
287064 cycles
285928 cycles
274108 cycles
280846 cycles

HSE · July 21, 2016, 09:57:13 AM

Code Select

183764 cycles
183989 cycles
182425 cycles
181967 cycles
182475 cycles
182457 cycles
182825 cycles
181607 cycles
181547 cycles

AMD A6 2.1 GHz Win7-32

I have a theory: the "worker thing" implemented by M$ adds a new layer, and some functions now go through an intermediary. We will need to learn how to access the lower level to regain speed in the new APIs (I'm just trying to understand the old "subclass thing", so don't count on me). It's at least as true as the Zen-Siemanski extraterrestrials.

FORTRANS · July 22, 2016, 05:29:40 AM

Hi,

Code Select

Desktop, Windows 2000, did not run.

Laptop, Pentium M, windows XP.

A:\>getpixel
3297 cycles
3262 cycles
3240 cycles
3243 cycles
3271 cycles
3284 cycles
3283 cycles
3262 cycles
3246 cycles
3263 cycles
3277 cycles
3268 cycles
3258 cycles
3294 cycles
3265 cycles
3290 cycles

Laptop, Core i3, Windows 8.1.  (launched from Explorer, ran unevenly.)

157757 cycles
208074 cycles
161934 cycles
168868 cycles
176620 cycles
162780 cycles
159121 cycles
162836 cycles

HTH,

Steve N.

jj2007 · July 22, 2016, 10:26:53 AM

Yeah, these Core i3 cpus are damn slow 8)

HSE · July 23, 2016, 11:13:41 AM

Code Select

13676 cycles
13726 cycles
13688 cycles
13652 cycles
13646 cycles
13662 cycles
13673 cycles
13786 cycles
13775 cycles

XP - AMD phenom 8650 2.31 GHz (3 core)

It's something more than cpus.

jj2007 · July 23, 2016, 11:58:00 AM

Quote from: HSE on July 23, 2016, 11:13:41 AM: "It's something more than cpus."

Of course. It seems that recent Windows versions lock bitmaps to please the C# garbage collector. See e.g. https://msdn.microsoft.com/en-us/library/5ey6h79d.aspx

http://www.codeproject.com/Articles/406045/Why-the-use-of-GetPixel-and-SetPixel-is-so-ineffic

Quote: "access to the pixel is not a simple reference to a memory area. Each getting or setting of color is associated with the invocation of a .NET Framework method, which is a wrapper for a native function contained in gdiplus.dll. This call is through the mechanism of P/Invoke (Platform Invocation), which is used to communicate from managed code to unmanaged API (an API outside of the .NET Framework). So for a bitmap of 1000x1000 pixels, there will be 1 million calls to the GetPixel method that besides the validation of parameters uses the native GdipBitmapGetPixel function."

Some propose GdipBitmapLockBits (here), but I suspect GetDIBits would be even faster. Check Hans Passant's answer, though :P

Siekmanski · July 23, 2016, 02:01:28 PM

SetPixel and GetPixel have to lock and unlock the bitmap area for every single pixel.
With one GdipBitmapLockBits and one GdipBitmapUnlockBits you can lock and unlock the whole bitmap and access the pixels directly in memory. ( I have used this method in my Texture loader )
Direct3d is faster ( as in my webcam proggy )

HSE · July 24, 2016, 12:59:00 AM

What a mess! There are 2 extra levels now.
Apparently GDI is not optional in new M$ OS.

Thanks for the explanation!

mineiro · July 24, 2016, 08:08:41 AM

Intel Core i3-3110M @2.40ghz, windows 7 32 bits.

Code Select

12655 cycles
12132 cycles
14401 cycles
12729 cycles
12635 cycles
13706 cycles
11855 cycles
12286 cycles
9597 cycles
14024 cycles
12911 cycles
14420 cycles
11540 cycles
12299 cycles
13232 cycles
12149 cycles
14549 cycles
12399 cycles
12366 cycles
13764 cycles
14421 cycles
12918 cycles
11452 cycles
11560 cycles

The MASM Forum

News:

GetPixel slow on recent versions of Windows?

MichaelW

HSE

FORTRANS

jj2007

HSE

jj2007

Siekmanski

HSE

mineiro