Hi all.
I need a faster way to do the following:
- extract 2 bits at a time from a byte, from LSB to MSB.
- move their values into 4 dwords.
I've created a sample program to show what I mean. I'm not satisfied
with it; I think it could be done faster and shorter.
Any idea? Thanks.
Frank
;==============================================================================
include \masm32\include\masm32rt.inc
;==============================================================================
.nolist
.686
.xmm
;==============================================================================
.data
align 4
val1 dd 0
val2 dd 0
val3 dd 0
val4 dd 0
align 4
Ptrval1 dd val1
bit_stream db 11100100B
.code
;==============================================================================
Main proc near
mov al, bit_stream        ; load the packed byte
mov cl, al                ; keep a copy in CL
and al, 00000011B         ; bits 0-1
mov byte ptr val1, al
mov al, cl
and al, 00001100B         ; bits 2-3
shr al, 2
mov byte ptr val2, al
mov al, cl
and al, 00110000B         ; bits 4-5
shr al, 4
mov byte ptr val3, al
mov al, cl
and al, 11000000B         ; bits 6-7
rol al, 2                 ; rotate them down to bits 0-1
mov byte ptr val4, al
print " First value "
print str$(val1), 13, 10
print " Second value "
print str$(val2), 13, 10
print " Third value "
print str$(val3), 13, 10
print " Fourth value "
print str$(val4), 13, 10
ret
Main endp
;==============================================================================
start:
;==============================================================================
call Main
inkey
exit
;==============================================================================
end start
Output:
Quote
First value 0
Second value 1
Third value 2
Fourth value 3
Press any key to continue ...
hiya Frank
you might do something like this
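; assumes the stream byte is already in CL, as in Frank's code above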
mov edx,offset Val1
mov al,cl
and al,3
shr ecx,2
mov [edx],al
mov al,cl
add edx,4
and al,3
shr ecx,2
mov [edx],al
mov al,cl
add edx,4
and al,3
shr ecx,2
mov [edx],al
add edx,4
and cl,3
mov [edx],cl
another variation
mov edx,offset Val1
mov al,cl
and al,3
shr ecx,2
mov [edx],al
mov al,cl
and al,3
shr ecx,2
mov [edx+4],al
mov al,cl
and al,3
shr ecx,2
mov [edx+8],al
and cl,3
mov [edx+12],cl
this may be faster
mov al,cl
and al,3
shr ecx,2
mov byte ptr Val1,al
mov dl,cl
and dl,3
shr ecx,2
mov byte ptr Val2,dl
mov al,cl
and al,3
shr ecx,2
mov byte ptr Val3,al
and cl,3
mov byte ptr Val4,cl
Quote from: dedndave on February 02, 2013, 03:28:00 AM
this may be faster
mov al,cl
and al,3
shr ecx,2
mov byte ptr Val1,al
mov dl,cl
and dl,3
shr ecx,2
mov byte ptr Val2,dl
mov al,cl
and al,3
shr ecx,2
mov byte ptr Val3,al
and cl,3
mov byte ptr Val4,cl
Hi Dave.
I think you're using the same logic as mine, with some improvements [I hope].
Do you know if
mov [edx],al
works? Shouldn't the two operands be
of the same size, like
mov byte ptr [edx],al
?
EDX is not the operand
the memory location at [EDX] is the operand
so, the assembler knows the size because AL is used - it can't be anything but a byte
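A tiny illustration of that (my lines, not Dave's): the size override is only needed when neither operand pins the size down.
mov [edx], al           ; byte store - AL sets the size
mov [edx], eax          ; dword store - EAX sets the size
mov byte ptr [edx], 3   ; an immediate alone is ambiguous, so the override is required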
Quote from: dedndave on February 02, 2013, 04:12:29 AM
EDX is not the operand
the memory location at [EDX] is the operand
so, the assembler knows the size because AL is used - it can't be anything but a byte
OK Dave. :t
Looking for a faster XMM/SSE2/3 version.
even though direct addressing takes a few more bytes of code, i think it's the fastest
for example...
mov byte ptr Val3,al
yah - SSE is going to be faster, i am sure
Quote from: dedndave on February 02, 2013, 04:15:47 AM
even though direct addressing takes a few more bytes of code, i think it's the fastest
for example...
mov byte ptr Val3,al
Thanks Dave.
here is kind of a "Henry Ford" approach
we push and pop ebx, but reduce dependencies
Bits1 PROC
movzx eax,byte ptr bit_stream
push ebx
mov ecx,eax
mov edx,eax
mov ebx,eax
shr ecx,2
shr edx,4
shr ebx,6
and al,3
and cl,3
mov byte ptr val1,al
mov byte ptr val2,cl
and bl,3
and dl,3
mov byte ptr val4,bl
mov byte ptr val3,dl
pop ebx
ret
Bits1 ENDP
Quote from: dedndave on February 02, 2013, 04:48:03 AM
here is kind of a "Henry Ford" approach
we push and pop ebx, but reduce dependencies
Bits1 PROC
movzx eax,byte ptr bit_stream
push ebx
mov ecx,eax
mov edx,eax
mov ebx,eax
shr ecx,2
shr edx,4
shr ebx,6
and al,3
and cl,3
mov byte ptr val1,al
mov byte ptr val2,cl
and bl,3
and dl,3
mov byte ptr val4,bl
mov byte ptr val3,dl
pop ebx
ret
Bits1 ENDP
And here is a kind of frktons approach:
xor eax, eax
xor ebx, ebx
xor ecx, ecx
xor edx, edx
mov al, bit_stream
ror eax, 8                ; move the byte up to bits 31-24
shld edx, eax, 2          ; bits 7-6 -> EDX (val4)
shl eax, 2
shld ecx, eax, 2          ; bits 5-4 -> ECX (val3)
shl eax, 2
shld ebx, eax, 2          ; bits 3-2 -> EBX (val2)
shl eax, 2
rol eax, 2                ; bits 1-0 -> EAX (val1)
mov val1, eax
mov val2, ebx
mov val3, ecx
mov val4, edx
Which is faster?
Still looking for an SSE3 solution.
on the Henry Ford - no need to AND BL,3 (after SHR EBX,6 only the low 2 bits of BL can be set) - you can remove that line of code
i am working on one that doesn't push/pop ebx
then, i will let you write the test code :P
Bits2 PROC
movzx eax,byte ptr bit_stream
mov ecx,eax
mov edx,eax
shr ecx,2                 ; ECX = byte shr 2
shr eax,4                 ; EAX = byte shr 4
and dl,3                  ; DL = bits 0-1
mov ah,al                 ; copy byte shr 4 into AH
and cl,3                  ; CL = bits 2-3
shr ah,2                  ; AH = bits 6-7
mov byte ptr val1,dl
and al,3                  ; AL = bits 4-5
mov byte ptr val2,cl
mov byte ptr val4,ah
mov byte ptr val3,al
ret
Bits2 ENDP
I'm halfway to an SSE2 solution. Still something to do, though.
.data
align 16
maskxx dd 000000C0H, 00000030H, 0000000CH, 00000003H
align 16
val1 dd 0
val2 dd 0
val3 dd 0
val4 dd 0
align 4
bit_stream db 11100100B
.code
.....
lea esi, maskxx
movaps xmm1, oword ptr [esi]
lea edi, val1
movzx eax, byte ptr bit_stream
movd xmm0, eax
pshufd xmm0, xmm0, 0
pand xmm0, xmm1
; Here I have to solve the problem to shift 2nd dword in xmm0 2 bits right
; 3rd dword 4 bits right
; 4th dword 6 bits right
movaps oword ptr [edi], xmm0
Any idea? Maybe PSRLD xmm0, xmm2 ?
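One SSE2-only way around it (a sketch of mine, not tested here - mulquad and mask03 are made-up names): PSRLD with a register count shifts all four dwords by the same amount, so instead of a per-lane right shift you can multiply each lane by a different power of two with PMULLW (a per-lane left shift, safe because the products stay under 16 bits) and then do one PSRLD by 6 for every lane. Note that with maskxx ordered C0/30/0C/03 the high pair would land in val1, so the multipliers below are ordered for val1 = bits 0-1. The store needs val1 16-byte aligned.
.data
align 16
mulquad dd 64, 16, 4, 1          ; lane i gets byte * 2^(6-2i)
mask03  dd 3, 3, 3, 3
.code
movzx eax, byte ptr bit_stream
movd xmm0, eax
pshufd xmm0, xmm0, 0             ; broadcast the byte into all 4 dwords
movaps xmm1, oword ptr mulquad
movaps xmm2, oword ptr mask03
pmullw xmm0, xmm1                ; per-lane left shift by 6/4/2/0
psrld xmm0, 6                    ; one uniform right shift for all lanes
pand xmm0, xmm2                  ; keep only the 2-bit field
movaps oword ptr val1, xmm0      ; val1 = bits 0-1 ... val4 = bits 6-7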
that is the trick, huh - lol
here is a bit-swap routine that runs very fast (reverses bits in a dword)
developed by BitRake and Nexo, updated by Drizz
mov eax,dwInputVal
mov edx,00001111000011110000111100001111b
and edx,eax
and eax,11110000111100001111000011110000b
shl edx,4
shr eax,4
or eax,edx
mov edx,00110011001100110011001100110011b
and edx,eax
and eax,11001100110011001100110011001100b
shr eax,2
lea eax,[edx*4+eax] ;shl edx,2 ;or eax,edx
mov edx,01010101010101010101010101010101b
and edx,eax
and eax,10101010101010101010101010101010b
shr eax,1
lea eax,[edx*2+eax] ;shl edx,1 ;or eax,edx
bswap eax
now, i know that's not what you want to do
however, it demonstrates a concept that you may find useful
understand how it works - then you may be able to use the same concept in SSE code
Quote from: dedndave on February 02, 2013, 06:10:29 AM
that is the trick, huh - lol
here is a bit-swap routine that runs very fast (reverses bits in a dword)
developed by BitRake and Nexo, updated by Drizz
now, i know that's not what you want to do
however, it demonstrates a concept that you may find useful
understand how it works - then you may be able to use the same concept in SSE code
Thanks Dave, I'll try to understand what it does later.
Now, to finish the SSE2 task, I have to understand how to shift bits
using a different approach, like adding/subtracting a value
to the byte in question.
If I have
bit_mask db 11100100B
and I want to shift it right 6 bits, I can do:
shr bit_mask, 6
If I want to get the same result using sub/add or something like that,
what kind of logic do I have to use?
that code i posted has the answer
that's why i posted it :biggrin:
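To spell the hint out (my reading of it, not Dave's words): a left shift by n is a multiply by 2^n, which is why lea eax,[edx*4+eax] can replace shl edx,2 / or eax,edx when the two bit sets don't overlap. SSE2 has no per-dword variable right shift, but it does have per-lane multiplies, so each lane can be shifted left by its own power of two and then all lanes shifted right by one common count - the PMULLW sketch above does exactly that. Worked on the sample byte:
; 0E4h * 4 = 390h,  390h shr 6 = 0Eh,  0Eh and 3 = 2  -> the bits 4-5 pair of 11100100b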
On a P3, the 14-16 simple integer instructions required execute in about 3 cycles.
;==============================================================================
include \masm32\include\masm32rt.inc
.686
include \masm32\macros\timers.asm
;==============================================================================
.data
val1 dd 0
val2 dd 0
val3 dd 0
val4 dd 0
bit_stream db 11100100b
.code
;==============================================================================
Bits2 MACRO
movzx eax,byte ptr bit_stream
mov ecx,eax
mov edx,eax
shr ecx,2
shr eax,4
and dl,3
mov ah,al
and cl,3
shr ah,2
mov byte ptr val1,dl
and al,3
mov byte ptr val2,cl
mov byte ptr val4,ah
mov byte ptr val3,al
ENDM
Xbits MACRO
movzx eax, bit_stream
movzx edx, al
and edx, 3
mov val1, edx
shr eax, 2
movzx edx, al
and edx, 3
mov val2, edx
shr eax, 2
movzx edx, al
and edx, 3
mov val3, edx
shr eax, 2
movzx edx, al
and edx, 3
mov val4, edx
ENDM
;==============================================================================
start:
;==============================================================================
Bits2
printf("%Xh\n",val1)
printf("%Xh\n",val2)
printf("%Xh\n",val3)
printf("%Xh\n",val4)
Xbits
printf("%Xh\n",val1)
printf("%Xh\n",val2)
printf("%Xh\n",val3)
printf("%Xh\n\n",val4)
invoke GetCurrentProcess
invoke SetProcessAffinityMask, eax, 1
invoke Sleep, 5000
REPEAT 3
counter_begin 1000000, HIGH_PRIORITY_CLASS
Bits2
counter_end
printf("%d cycles\n", eax)
counter_begin 1000000, HIGH_PRIORITY_CLASS
Xbits
counter_end
printf("%d cycles\n\n", eax)
ENDM
inkey
exit
;==============================================================================
end start
i get about 10 or 11 cycles for my Bits2 proc on a P4
you aren't going to get much faster because you will have at least 10 SSE instructions
Michael's code takes 13 cycles on my machine
Quote from: dedndave on February 02, 2013, 06:38:17 AM
that code i posted has the answer
that's why i posted it :biggrin:
The answer I need is all about add/sub; this is one of the viable ways I can complete
the SSE2 routine. If you post something interesting that takes me quite a lot of
time to grasp, I have to postpone it and try by myself. :P
Quote from: dedndave on February 02, 2013, 06:50:19 AM
i get about 10 or 11 cycles for my Bits2 proc on a P4
you aren't going to get much faster because you will have at least 10 SSE instructions
Michael's code takes 13 cycles on my machine
On my machine, a dual-core Intel PIV at 3.2 GHz, I get:
Quote
15 cycles
14 cycles
15 cycles
14 cycles
15 cycles
14 cycles
I'm going to do this bit twiddling for about 100,000 bytes, so at that scale
maybe SSE2 would be of some help.
i'll let you make the 3 look-up tables (256 bytes each table)
Bits3 PROC
movzx eax,byte ptr bit_stream
movzx ecx,byte ptr look_up_table1[eax]
movzx edx,byte ptr look_up_table2[eax]
mov ch,look_up_table3[eax]
mov byte ptr val3,dl
and al,3
mov byte ptr val2,cl
mov byte ptr val4,ch
mov byte ptr val1,al
ret
Bits3 ENDP
i am guessing 8 cycles on a P4 :P
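For reference, a minimal one-time fill for those three tables (my sketch - the InitTables name is made up; the contents just follow what Bits3 expects: table1[i] = (i shr 2) and 3, table2[i] = (i shr 4) and 3, table3[i] = i shr 6):
.data?
look_up_table1 db 256 dup(?)
look_up_table2 db 256 dup(?)
look_up_table3 db 256 dup(?)
.code
InitTables PROC
xor ecx, ecx
@@:
mov eax, ecx
shr eax, 2
and al, 3
mov look_up_table1[ecx], al
mov eax, ecx
shr eax, 4
and al, 3
mov look_up_table2[ecx], al
mov eax, ecx
shr eax, 6                       ; the top pair is already 0..3
mov look_up_table3[ecx], al
inc ecx
cmp ecx, 256
jb @B
ret
InitTables ENDP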
Quote from: MichaelW on February 02, 2013, 06:41:07 AM
On a P3, the 14-16 simple integer instructions required execute in about 3 cycles.
Thanks Michael, very helpful.
Quote from: dedndave on February 02, 2013, 07:04:10 AM
i'll let you make the 3 look-up tables (256 bytes each table)
Bits3 PROC
movzx eax,byte ptr bit_stream
movzx ecx,byte ptr look_up_table1[eax]
movzx edx,byte ptr look_up_table2[eax]
mov ch,look_up_table3[eax]
mov byte ptr val3,dl
and al,3
mov byte ptr val2,cl
mov byte ptr val4,ch
mov byte ptr val1,al
ret
Bits3 ENDP
i am guessing 8 cycles on a P4 :P
I like look-up tables. Probably one of the best solutions around. :t
i measure 9 cycles on a P4 (Bits3)
you may be able to move things around a little and get it down to 8
also...
if you are doing a bunch of these, you won't be using a PROC
that should save a couple clock cycles
Quote from: dedndave on February 02, 2013, 07:36:38 AM
i measure 9 cycles on a P4 (Bits3)
you may be able to move things around a little and get it down to 8
also...
if you are doing a bunch of these, you won't be using a PROC
that should save a couple clock cycles
Dave, I don't understand why you need 3 look-up tables. I think one is enough:
a 256-element look-up table of owords, containing for each element one of the
256 possible values of AL already unpacked. Without any hassle, you use the value
in AL as the index to the element that contains the unpacked oword, which can
be used directly as it is.
More or less:
mov al, AByte
movaps xmm0, oword ptr look_up_table[eax]
movaps oword ptr val1, xmm0
More or less 1-2 cycles.
I foresee this solution is going to be the fastest one.
you could do it that way, too
it would be dwords though - not owords
movzx eax,byte ptr bit_stream
mov edx,look_up_table[eax*4]     ;dword entries - scale the index by 4
;bang - you got 4 bytes
maybe a better way...
movzx eax,byte ptr bit_stream
movzx edx,word ptr look_up_table[eax*4]
movzx ecx,word ptr look_up_table[eax*4+2]
with the second method, you don't have to shift or bswap the upper bytes down
the desired bytes are in DL, DH, CL, CH
movzx eax,byte ptr bit_stream
movzx edx,word ptr look_up_table[eax*4]
movzx ecx,word ptr look_up_table[eax*4+2]
mov byte ptr val1,dl
mov byte ptr val2,dh
mov byte ptr val3,cl
mov byte ptr val4,ch
won't be too much faster than Bits3, though
you are getting down to a few clock cycles, where you just can't make it faster
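A fill sketch for the 1 kb dword table those loads assume (my code and my reading of the layout: entry i packs the four pairs as bytes, low pair in the low byte, which is what puts them in DL, DH, CL, CH after the two word loads; I call it look_up_dw here only to keep it apart from the 4 kb oword table that comes up below - Dave's snippets just call it look_up_table):
.data?
align 4
look_up_dw dd 256 dup(?)         ; one dword per byte value
.code
xor ecx, ecx
@@:
mov eax, ecx
shr eax, 6                       ; bits 6-7 -> byte 3 (CH)
shl eax, 8
mov edx, ecx
shr edx, 4
and edx, 3                       ; bits 4-5 -> byte 2 (CL)
or eax, edx
shl eax, 8
mov edx, ecx
shr edx, 2
and edx, 3                       ; bits 2-3 -> byte 1 (DH)
or eax, edx
shl eax, 8
mov edx, ecx
and edx, 3                       ; bits 0-1 -> byte 0 (DL)
or eax, edx
mov look_up_dw[ecx*4], eax
inc ecx
cmp ecx, 256
jb @B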
oh - i see what you are saying - yes, that would be fast :t
movzx eax, byte ptr bit_stream
shl eax, 4
movaps xmm0, oword ptr look_up_table[eax]
movaps oword ptr val1, xmm0
or, if it will let you use 16 as a multiplier...
movzx eax, byte ptr bit_stream
movaps xmm0, oword ptr look_up_table[16*eax]
movaps oword ptr val1, xmm0
4 kb table
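And a one-time fill sketch for that 4 kb table (mine again - the FillOwordTable name is made up; entry i holds the four dwords val1..val4 for byte value i; the address scale only goes up to 8, so the SHL EAX,4 form is the one that is sure to assemble):
.data?
align 16
look_up_table dd 4*256 dup(?)    ; 256 entries of 16 bytes = 4 kb
.code
FillOwordTable PROC
xor ecx, ecx                     ; ECX = byte value 0..255
lea edx, look_up_table
@@:
mov eax, ecx
and eax, 3
mov [edx], eax                   ; dword 0 = bits 0-1
mov eax, ecx
shr eax, 2
and eax, 3
mov [edx+4], eax                 ; dword 1 = bits 2-3
mov eax, ecx
shr eax, 4
and eax, 3
mov [edx+8], eax                 ; dword 2 = bits 4-5
mov eax, ecx
shr eax, 6
mov [edx+12], eax                ; dword 3 = bits 6-7
add edx, 16
inc ecx
cmp ecx, 256
jb @B
ret
FillOwordTable ENDP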
These are the results:
Quote
0h
1h
2h
3h
0h
1h
2h
3h
0h
1h
2h
3h
15 cycles
13 cycles
5 cycles
15 cycles
14 cycles
5 cycles
15 cycles
14 cycles
4 cycles
Press any key to continue ...
It is about 3:1 faster in the simulation I did:
;==============================================================================
; FourInOne5.asm - MichaelW test version
;
;==============================================================================
include \masm32\include\masm32rt.inc
.686
.xmm
include \masm32\macros\timers.asm
;==============================================================================
.data
align 16                        ; the movaps store below writes 16 bytes at val1
val1 dd 0
val2 dd 0
val3 dd 0
val4 dd 0
bit_stream db 11100100b
align 16
unpacked dd 00000000H, 00000001H, 00000002H, 00000003H
.code
;==============================================================================
Bits2 MACRO
movzx eax,byte ptr bit_stream
mov ecx,eax
mov edx,eax
shr ecx,2
shr eax,4
and dl,3
mov ah,al
and cl,3
shr ah,2
mov byte ptr val1,dl
and al,3
mov byte ptr val2,cl
mov byte ptr val4,ah
mov byte ptr val3,al
ENDM
Xbits MACRO
movzx eax, bit_stream
movzx edx, al
and edx, 3
mov val1, edx
shr eax, 2
movzx edx, al
and edx, 3
mov val2, edx
shr eax, 2
movzx edx, al
and edx, 3
mov val3, edx
shr eax, 2
movzx edx, al
and edx, 3
mov val4, edx
ENDM
;==============================================================================
start:
;==============================================================================
Bits2
printf("%Xh\n",val1)
printf("%Xh\n",val2)
printf("%Xh\n",val3)
printf("%Xh\n",val4)
Xbits
printf("%Xh\n",val1)
printf("%Xh\n",val2)
printf("%Xh\n",val3)
printf("%Xh\n",val4)
lea eax, unpacked
movaps xmm0, oword ptr [eax]
lea edi, val1
movaps oword ptr [edi], xmm0
printf("%Xh\n",val1)
printf("%Xh\n",val2)
printf("%Xh\n",val3)
printf("%Xh\n\n",val4)
invoke GetCurrentProcess
invoke SetProcessAffinityMask, eax, 1
invoke Sleep, 5000
REPEAT 3
counter_begin 1000000, HIGH_PRIORITY_CLASS
Bits2
counter_end
printf("%d cycles\n", eax)
counter_begin 1000000, HIGH_PRIORITY_CLASS
Xbits
counter_end
printf("%d cycles\n", eax)
counter_begin 1000000, HIGH_PRIORITY_CLASS
lea eax, unpacked
movaps xmm0, oword ptr [eax]
lea edi, val1
movaps oword ptr [edi], xmm0
counter_end
printf("%d cycles\n\n", eax)
ENDM
inkey
exit
;==============================================================================
end start
I just used one byte, so I didn't generate the look-up/conversion table.
It's only to get an idea.
you don't have to populate the table in order to test the speed
look_up_table db 4096 dup(?)
just use an empty table
the values that are in the table make little difference, if any, on the speed
movzx eax, byte ptr bit_stream
shl eax, 4
movaps xmm0, oword ptr look_up_table[eax]
movaps oword ptr val1, xmm0
i get 1 clock cycle on my P4 - lol
(it's probably 2 cycles)
that's with no call/ret overhead
Quote from: dedndave on February 02, 2013, 09:06:50 AM
i get 1 clock cycle on my P4 - lol
(it's probably 2 cycles)
that's with no call/ret overhead
That was the target. We got there. Thanks master for your inspiration. :t
I tried it on my Core Duo, because Kaspersky is giving me problems on
my PIV and I cannot measure the test there - it always reads 0.
I need to test it with some more data to get an idea of the performance.
Let's do some modifications to the test...
For 12 bytes, adding the overhead of the unrolled code,
I get these results on the Core Duo PC.
Quote
96 cycles
113 cycles
45 cycles
Still a couple of times faster than other solutions anyway.
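For the real 100,000-byte job the per-byte lookup just goes in a loop - a minimal sketch (mine; src_buffer, dst_buffer and src_len are made-up names, and it assumes the filled, 16-byte aligned 4 kb look_up_table plus a destination with room for 16 bytes per source byte):
mov esi, offset src_buffer
mov edi, offset dst_buffer
mov ecx, src_len
@@:
movzx eax, byte ptr [esi]
shl eax, 4                              ; 16 bytes per table entry
movaps xmm0, oword ptr look_up_table[eax]
movaps oword ptr [edi], xmm0            ; 4 unpacked dwords per source byte
inc esi
add edi, 16
dec ecx
jnz @B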
prescott w/htt
Quote
139 cycles
164 cycles
62 cycles
163 cycles
143 cycles
55 cycles
163 cycles
142 cycles
61 cycles
deleted