News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests
NB: Posting URL's See here: Posted URL Change

Main Menu

BITs extraction from a byte

Started by frktons, February 02, 2013, 02:35:06 AM

Previous topic - Next topic

dedndave

that code i posted has the answer
that's why i posted it   :biggrin:

MichaelW

On a P3, the 14-16 instructions of simple integer code required execute in about 3 cycles.

;==============================================================================
include \masm32\include\masm32rt.inc
.686
include \masm32\macros\timers.asm
;==============================================================================
.data
    val1        dd 0
    val2        dd 0
    val3        dd 0
    val4        dd 0
    bit_stream  db 11100100b
.code
;=============================================================================x=

Bits2 MACRO
    movzx eax,byte ptr bit_stream
    mov   ecx,eax
    mov   edx,eax
    shr   ecx,2
    shr   eax,4
    and   dl,3
    mov   ah,al
    and   cl,3
    shr   ah,2
    mov byte ptr val1,dl
    and   al,3
    mov byte ptr val2,cl
    mov byte ptr val4,ah
    mov byte ptr val3,al
ENDM

Xbits MACRO
    movzx eax, bit_stream
    movzx edx, al
    and edx, 3
    mov val1, edx
    shr eax, 2
    movzx edx, al
    and edx, 3
    mov val2, edx
    shr eax, 2
    movzx edx, al
    and edx, 3
    mov val3, edx
    shr eax, 2
    movzx edx, al
    and edx, 3
    mov val4, edx
ENDM

;==============================================================================
start:
;==============================================================================

    Bits2
    printf("%Xh\n",val1)
    printf("%Xh\n",val2)
    printf("%Xh\n",val3)
    printf("%Xh\n",val4)
    Xbits
    printf("%Xh\n",val1)
    printf("%Xh\n",val2)
    printf("%Xh\n",val3)
    printf("%Xh\n\n",val4)

    invoke GetCurrentProcess
    invoke SetProcessAffinityMask, eax, 1

    invoke Sleep, 5000

    REPEAT 3
        counter_begin 1000000, HIGH_PRIORITY_CLASS
            Bits2
        counter_end
        printf("%d cycles\n", eax)
        counter_begin 1000000, HIGH_PRIORITY_CLASS
            Xbits
        counter_end
        printf("%d cycles\n\n", eax)
    ENDM

    inkey
    exit
;==============================================================================
end start

Well Microsoft, here's another nice mess you've gotten us into.

dedndave

i get about 10 or 11 cycles for my Bits2 proc on a P4
you aren't going to get much faster because you will have at least 10 SSE instructions

Michael's code takes 13 cycles on my machine

frktons

Quote from: dedndave on February 02, 2013, 06:38:17 AM
that code i posted has the answer
that's why i posted it   :biggrin:

The answer I need is all about add/sub, this is one of the viable way I can complete
the SSE2 routine. If you post something interesting, that requires me quite a lot of
time to grasp, I need to postpone it and try by myself.   :P

Quote from: dedndave on February 02, 2013, 06:50:19 AM
i get about 10 or 11 cycles for my Bits2 proc on a P4
you aren't going to get much faster because you will have at least 10 SSE instructions

Michael's code takes 13 cycles on my machine

On my machine PIV 3.2 GHz dual core INTEL I have
Quote
15 cycles
14 cycles

15 cycles
14 cycles

15 cycles
14 cycles

I'm going to do this bit twiddling for about 100,000 bytes, so on those numbers
maybe SSE2 would be of some help.
There are only two days a year when you can't do anything: one is called yesterday, the other is called tomorrow, so today is the right day to love, believe, do and, above all, live.

Dalai Lama

dedndave

i'll let you make the 3 look-up tables (256 bytes each table)
Bits3   PROC

    movzx eax,byte ptr bit_stream
    movzx ecx,byte ptr look_up_table1[eax]
    movzx edx,byte ptr look_up_table2[eax]
    mov   ch,look_up_table3[eax]
    mov byte ptr val3,dl
    and   al,3
    mov byte ptr val2,cl
    mov byte ptr val4,ch
    mov byte ptr val1,al
    ret

Bits3   ENDP

i am guessing 8 cycles on a P4   :P

frktons

Quote from: MichaelW on February 02, 2013, 06:41:07 AM
On a P3, the 14-16 instructions of simple integer code required execute in about 3 cycles.

;==============================================================================
include \masm32\include\masm32rt.inc
.686
include \masm32\macros\timers.asm
;==============================================================================
.data
    val1        dd 0
    val2        dd 0
    val3        dd 0
    val4        dd 0
    bit_stream  db 11100100b
.code
;=============================================================================x=

Bits2 MACRO
    movzx eax,byte ptr bit_stream
    mov   ecx,eax
    mov   edx,eax
    shr   ecx,2
    shr   eax,4
    and   dl,3
    mov   ah,al
    and   cl,3
    shr   ah,2
    mov byte ptr val1,dl
    and   al,3
    mov byte ptr val2,cl
    mov byte ptr val4,ah
    mov byte ptr val3,al
ENDM

Xbits MACRO
    movzx eax, bit_stream
    movzx edx, al
    and edx, 3
    mov val1, edx
    shr eax, 2
    movzx edx, al
    and edx, 3
    mov val2, edx
    shr eax, 2
    movzx edx, al
    and edx, 3
    mov val3, edx
    shr eax, 2
    movzx edx, al
    and edx, 3
    mov val4, edx
ENDM

;==============================================================================
start:
;==============================================================================

    Bits2
    printf("%Xh\n",val1)
    printf("%Xh\n",val2)
    printf("%Xh\n",val3)
    printf("%Xh\n",val4)
    Xbits
    printf("%Xh\n",val1)
    printf("%Xh\n",val2)
    printf("%Xh\n",val3)
    printf("%Xh\n\n",val4)

    invoke GetCurrentProcess
    invoke SetProcessAffinityMask, eax, 1

    invoke Sleep, 5000

    REPEAT 3
        counter_begin 1000000, HIGH_PRIORITY_CLASS
            Bits2
        counter_end
        printf("%d cycles\n", eax)
        counter_begin 1000000, HIGH_PRIORITY_CLASS
            Xbits
        counter_end
        printf("%d cycles\n\n", eax)
    ENDM

    inkey
    exit
;==============================================================================
end start



Thanks Michael, very helpful.
There are only two days a year when you can't do anything: one is called yesterday, the other is called tomorrow, so today is the right day to love, believe, do and, above all, live.

Dalai Lama

frktons

Quote from: dedndave on February 02, 2013, 07:04:10 AM
i'll let you make the 3 look-up tables (256 bytes each table)
Bits3   PROC

    movzx eax,byte ptr bit_stream
    movzx ecx,byte ptr look_up_table1[eax]
    movzx edx,byte ptr look_up_table2[eax]
    mov   ch,look_up_table3[eax]
    mov byte ptr val3,dl
    and   al,3
    mov byte ptr val2,cl
    mov byte ptr val4,ch
    mov byte ptr val1,al
    ret

Bits3   ENDP

i am guessing 8 cycles on a P4   :P

I like look up tables. Probably is one of the best solution around. :t
There are only two days a year when you can't do anything: one is called yesterday, the other is called tomorrow, so today is the right day to love, believe, do and, above all, live.

Dalai Lama

dedndave

i measure 9 cycles on a P4 (Bits3)
you may be able to move things around a little and get it down to 8

also...
if you are doing a bunch of these, you won't be using a PROC
that should save a couple clock cycles

frktons

Quote from: dedndave on February 02, 2013, 07:36:38 AM
i measure 9 cycles on a P4 (Bits3)
you may be able to move things around a little and get it down to 8

also...
if you are doing a bunch of these, you won't be using a PROC
that should save a couple clock cycles

Dave, I don't understand why you need 3 look-up tables. I think one is enough.
A 256 element look-up table of owords, containing for each element one of the
possible 256 value of al already unpacked, and without any hassle, you use the value in al
as the index to the element that contains the unpacked oword that directly can
be used as it is.

More or less:

mov al, AByte
movaps xmm0, oword ptr look_up_table[eax]
movaps oword ptr val1, xmm0


More or less 1-2 cycles.

I foresee this solution is going to be the fastest one.
There are only two days a year when you can't do anything: one is called yesterday, the other is called tomorrow, so today is the right day to love, believe, do and, above all, live.

Dalai Lama

dedndave

you could do it that way, too
it would be dwords though - not owords
        movzx eax,byte ptr bit_stream
        mov   edx,look_up_table[eax]

;bang - you got 4 bytes


maybe a better way...
        movzx eax,byte ptr bit_stream
        movzx edx,word ptr look_up_table[eax]
        movzx ecx,word ptr look_up_table[eax+2]


with the second method, you don't have to shift or bswap the upper bytes down
the desired bytes are in DL, DH, CL, CH

dedndave

        movzx eax,byte ptr bit_stream
        movzx edx,word ptr look_up_table[eax]
        movzx ecx,word ptr look_up_table[eax+2]
        mov byte ptr val1,dl
        mov byte ptr val2,dh
        mov byte ptr val3,cl
        mov byte ptr val4,ch

won't be too much faster than Bits3, though
you are getting down to a few clock cycles, where you just can't make it faster

dedndave

oh - i see what you are saying - yes, that would be fast   :t
movzx  al, byte ptr bit_stream
shl    eax, 4
movaps xmm0, oword ptr look_up_table[eax]
movaps oword ptr val1, xmm0

or, if it will let you use 16 as a multiplier...
movzx  al, byte ptr bit_stream
movaps xmm0, oword ptr look_up_table[16*eax]
movaps oword ptr val1, xmm0

4 kb table

frktons

These are the performances:
Quote
0h
1h
2h
3h
0h
1h
2h
3h
0h
1h
2h
3h

15 cycles
13 cycles
5 cycles

15 cycles
14 cycles
5 cycles

15 cycles
14 cycles
4 cycles

Press any key to continue ...

It is about 3:1 faster, with the simulation I did:

;==============================================================================
; FourInOne5.asm - MichaelW test version
;
;==============================================================================
include \masm32\include\masm32rt.inc
.686
.xmm
include \masm32\macros\timers.asm
;==============================================================================
.data
    val1        dd 0
    val2        dd 0
    val3        dd 0
    val4        dd 0
    bit_stream  db 11100100b
    align 16
    unpacked    dd 00000000H, 00000001H, 00000002H, 00000003H
.code
;==============================================================================

Bits2 MACRO

    movzx eax,byte ptr bit_stream
    mov   ecx,eax
    mov   edx,eax
    shr   ecx,2
    shr   eax,4
    and   dl,3
    mov   ah,al
    and   cl,3
    shr   ah,2
    mov   byte ptr val1,dl
    and   al,3
    mov   byte ptr val2,cl
    mov   byte ptr val4,ah
    mov   byte ptr val3,al
   
ENDM

Xbits MACRO
    movzx eax,  bit_stream
    movzx edx,  al
    and   edx,  3
    mov   val1, edx
    shr   eax,  2
    movzx edx,  al
    and   edx,  3
    mov   val2, edx
    shr   eax,  2
    movzx edx,  al
    and   edx,  3
    mov   val3, edx
    shr   eax,  2
    movzx edx,  al
    and   edx,  3
    mov   val4, edx
ENDM

;==============================================================================
start:
;==============================================================================

    Bits2
    printf("%Xh\n",val1)
    printf("%Xh\n",val2)
    printf("%Xh\n",val3)
    printf("%Xh\n",val4)
    Xbits
    printf("%Xh\n",val1)
    printf("%Xh\n",val2)
    printf("%Xh\n",val3)
    printf("%Xh\n",val4)

    lea   eax, unpacked
    movaps  xmm0, oword ptr [eax]
    lea   edi, val1
    movaps oword ptr [edi], xmm0

    printf("%Xh\n",val1)
    printf("%Xh\n",val2)
    printf("%Xh\n",val3)
    printf("%Xh\n\n",val4)

    invoke GetCurrentProcess
    invoke SetProcessAffinityMask, eax, 1

    invoke Sleep, 5000

    REPEAT 3
        counter_begin 1000000, HIGH_PRIORITY_CLASS
            Bits2
        counter_end
        printf("%d cycles\n", eax)
        counter_begin 1000000, HIGH_PRIORITY_CLASS
            Xbits
        counter_end
        printf("%d cycles\n", eax)       
        counter_begin 1000000, HIGH_PRIORITY_CLASS
            lea   eax, unpacked
            movaps  xmm0, oword ptr [eax]
            lea   edi, val1
            movaps oword ptr [edi], xmm0
        counter_end
        printf("%d cycles\n\n", eax)
    ENDM

    inkey
    exit
;==============================================================================
end start


I just used one byte, so didn't generate the look-up/conversion table.
It is only to have an idea.


There are only two days a year when you can't do anything: one is called yesterday, the other is called tomorrow, so today is the right day to love, believe, do and, above all, live.

Dalai Lama

dedndave

you don't have to populate the table in order to test the speed

look_up_table db 4096 dup(?)

just use an empty table
the values that are in the table make little difference, if any, on the speed

movzx  eax, byte ptr bit_stream
shl    eax, 4
movaps xmm0, oword ptr look_up_table[eax]
movaps oword ptr val1, xmm0

dedndave

i get 1 clock cycle on my P4 - lol
(it's probably 2 cycles)
that's with no call/ret overhead