BGR888 To BGR565 SSE2

adeyblue · January 13, 2015, 10:08:40 AM

I've just done this for a PlayStation emulator graphics plugin I'm updating, but it seems like there are too many shifts and a lot of shuffling for what it does. I'm using C and VC2013's SSE intrinsics, so this isn't handwritten assembly, but apart from the interleaving it's pretty much 1:1. /FAs won't output my comments, so I've spliced them into the first round. The whole thing looks monstrous, but it's just the same thing unrolled four times.

So yeah, any suggestions / improvements? I've attached the C intrinsic version (renamed to zip) in case anybody wants/needs it

Code Select


.data
align 16
; Constant masks for SSENarrowBGRToBGR565.
;
; The routine arranges its working register as two 8-byte halves, each
; holding two whole 3-byte pixels followed by two don't-care bytes:
;
;   byte offset : 0  1  2  3  4  5  6  7 | 8  9  10 11 12 13 14 15
;   contents    : c0 c1 c2 c0 c1 c2 -  - | c0 c1 c2 c0 c1 c2 -  -
;
; (c0/c1/c2 = first/second/third byte of a pixel in memory.)
;
; The `byte` directive emits values lowest address first, so every list
; below is written in memory order (offset 0 first). This is the reverse
; of the argument order taken by the _mm_set_epi8 intrinsic.

; 1Fh over every c2 byte (offsets 2, 5, 10, 13). The code shifts this mask
; down two bytes to obtain the matching c0 mask (offsets 0, 3, 8, 11).
fiveBitMask     byte 00,00,1fh,00,00,1fh,00,00,00,00,1fh,00,00,1fh,00,00

; 0FFh over every c1 (green) byte: offsets 1, 4, 9, 12.
greenByteMask   byte 00,0ffh,00,00,0ffh,00,00,00,00,0ffh,00,00,0ffh,00,00,00

; 07E0h (the 6-bit green field of a 5:6:5 word) over each output word,
; which the routine builds at offsets 0-1, 3-4, 8-9 and 11-12.
greenSixBitMask byte 0e0h,07h,00,0e0h,07h,00,00,00,0e0h,07h,00,0e0h,07h,00,00,00
 
;-----------------------------------------------------------------------
; SSENarrowBGRToBGR565
;
; Converts packed 24-bit pixels to 16-bit 5:6:5 pixels using SSE2.
; For each pixel, with c0/c1/c2 = its first/second/third byte in memory:
;     out16 = (c0 >> 3) | ((c1 >> 2) << 5) | ((c2 >> 3) << 11)
;
; Syntax: MASM (Intel operand order), 32-bit code, custom register ABI.
;
; In:    eax = 24-bit pixels to convert; must be 16-byte aligned (movdqa)
;        ecx = number of 48-byte / 16-pixel 'packs' to process (0 = none)
;        edi = 16-bit output pixels; must be 16-byte aligned (movntdq)
; Out:   eax advanced by 48 bytes per pack, edi by 32 bytes per pack,
;        ecx = 0
; Clobb: xmm0-xmm7, flags
;
; Mask layout required by this code (byte offsets in memory):
;     fiveBitMask     : 1Fh  at 2, 5, 10, 13
;     greenByteMask   : 0FFh at 1, 4, 9, 12
;     greenSixBitMask : 07E0h words at 0-1, 3-4, 8-9, 11-12
;
; Each pack is handled as four rounds of four pixels. The rounds are
; interleaved by the compiler, so instructions belonging to the next
; round appear before the current round has finished.
;-----------------------------------------------------------------------
.code
SSENarrowBGRToBGR565 PROC
     test    ecx, ecx
     jz      Finished                   ; nothing to convert
     ; Loop-entry register state (re-established at the bottom of the loop):
     ;   xmm6 = fiveBitMask (c2 mask)   xmm7 = fiveBitMask >> 2 bytes (c0 mask)
     ;   xmm1 = greenByteMask           xmm5 = greenSixBitMask
     movdqa  xmm6, XMMWORD PTR fiveBitMask
     movdqa  xmm1, XMMWORD PTR greenByteMask
     movdqa  xmm7, xmm6
     movdqa  xmm5, XMMWORD PTR greenSixBitMask
     psrldq  xmm7, 2

NextPack:
     ; ================= round 1: pixels 0-3 (source bytes 0..11) =========
     ; xmm2 = source bytes 0..15, i.e. five and a third pixels
     movdqa  xmm2, xmmword ptr [eax]
     movdqa  xmm3, xmm1
     movdqa  xmm0, xmm2
     ; keep source bytes 12..15 (start of pixel 4) for round 2
     pshufd  xmm4, xmm2, 0FFh
     ; xmm0 byte i = source byte i-2, so pixel 2 starts on the high qword
     pslldq  xmm0, 2
     ; low qword from xmm2, high qword from xmm0:
     ;   xmm2 = [p0 p0 p0 p1 p1 p1 - - | p2 p2 p2 p3 p3 p3 - -]
     shufps  xmm2, xmm0, 0E4h
     movdqa  xmm0, xmm6
     ; xmm1 = pixels with the c1 (green) bytes cleared
     pandn   xmm1, xmm2
     ; xmm0 = c0 mask (fiveBitMask moved down two bytes)
     psrldq  xmm0, 2
     ; xmm3 = only the c1 (green) bytes
     pand    xmm3, xmm2
     ; divide c0/c2 by 8; bits shifted in from the neighbouring byte are
     ; discarded by the 5-bit masks below
     psrlq   xmm1, 3
     movdqa  xmm2, xmm6
     ; green: >>2 for the division plus 3 more to land on bits 5..10 of the
     ; output word
     psrlq   xmm3, 5

     ; xmm2 = c2 >> 3, still sitting in the pixel's third byte
     pand    xmm2, xmm1
     ; (round 2) second 16 source bytes
     movdqa  xmm6, xmmword ptr [eax+10h]
     ; xmm0 = c0 >> 3, already at bits 0..4 of the output word
     pand    xmm0, xmm1
     ; move c2 down to bits 11..15 of the output word
     psrlq   xmm2, 5
     por     xmm2, xmm0
     movdqa  xmm0, xmm5

     ; keep the six green bits and merge them in. xmm2 now holds the four
     ; finished 16-bit pixels at byte offsets 0-1, 3-4, 8-9 and 11-12,
     ; with every other byte zero
     pand    xmm0, xmm3
     por     xmm2, xmm0
     ; xmm0 low qword = [dword 2, dword 0] -> pixels 2 and 0 in words 0 and 2
     pshufd  xmm0, xmm2, 22h
     ; pixels 1 and 3 straddle a dword boundary; one byte up aligns them to
     ; the low word of dwords 1 and 3
     pslldq  xmm2, 1
     ; xmm5 low words = [px0, px2, px0, px2]
     pshuflw xmm5, xmm0, 22h
     ; xmm0 low qword = [dword 3, dword 1] -> pixels 3 and 1
     pshufd  xmm0, xmm2, 77h
     movdqa  xmm2, xmm6
     ; (round 2) make room for the four bytes saved in xmm4
     pslldq  xmm2, 4
     ; xmm0 low words = [px1, px3, px1, px3]
     pshuflw xmm0, xmm0, 22h
     ; (round 2) xmm2 = source bytes 12..27
     movss   xmm2, xmm4
     movdqa  xmm4, XMMWORD PTR greenByteMask
     ; interleave: xmm5 low qword = [px0, px1, px2, px3]
     punpcklwd xmm5, xmm0

     ; ================= round 2: pixels 4-7 (source bytes 12..23) ========
     movdqa  xmm1, xmm4
     movdqa  xmm0, xmm2
     ; (round 3) keep source bytes 24..31 in the low qword of xmm6
     psrldq  xmm6, 8
     pslldq  xmm0, 2
     movdqa  xmm3, xmm4
     shufps  xmm2, xmm0, 0E4h
     movdqa  xmm0, XMMWORD PTR fiveBitMask
     pandn   xmm1, xmm2
     pand    xmm3, xmm2
     psrlq   xmm1, 3
     movdqa  xmm2, xmm0
     psrlq   xmm3, 5
     pand    xmm2, xmm1
     psrldq  xmm0, 2
     pand    xmm0, xmm1
     psrlq   xmm2, 5
     por     xmm2, xmm0
     movdqa  xmm0, XMMWORD PTR greenSixBitMask
     pand    xmm0, xmm3
     movdqa  xmm3, xmm4
     por     xmm2, xmm0
     pshufd  xmm0, xmm2, 22h
     pslldq  xmm2, 1
     pshuflw xmm1, xmm0, 22h
     pshufd  xmm0, xmm2, 77h
     pshuflw xmm0, xmm0, 22h
     punpcklwd xmm1, xmm0
     ; xmm5 = [px0..px3 | px4..px7]
     shufps  xmm5, xmm1, 44h
     movdqa  xmm1, xmm4
     ; non-temporal store of the first eight output pixels
     movntdq xmmword ptr [edi], xmm5

     ; ================= round 3: pixels 8-11 (source bytes 24..35) =======
     ; third 16 source bytes (32..47)
     movdqa  xmm5, xmmword ptr [eax+20h]
     movdqa  xmm2, xmm5
     ; (round 4) xmm5 = source bytes 36..47
     psrldq  xmm5, 4
     pslldq  xmm2, 8
     ; xmm2 = source bytes 24..39
     movsd   xmm2, xmm6
     movdqa  xmm6, XMMWORD PTR fiveBitMask
     movdqa  xmm0, xmm2
     pslldq  xmm0, 2
     shufps  xmm2, xmm0, 0E4h
     movdqa  xmm0, xmm6
     pandn   xmm1, xmm2
     psrldq  xmm0, 2
     pand    xmm3, xmm2
     psrlq   xmm1, 3
     movdqa  xmm2, xmm6
     psrlq   xmm3, 5
     pand    xmm2, xmm1
     pand    xmm0, xmm1
     psrlq   xmm2, 5
     por     xmm2, xmm0
     movdqa  xmm0, XMMWORD PTR greenSixBitMask
     ; step to the second 16-byte output slot of this pack
     add     edi, 10h
     movdqa  xmm1, XMMWORD PTR greenByteMask
     pand    xmm0, xmm3
     por     xmm2, xmm0
     movdqa  xmm3, xmm1
     pshufd  xmm0, xmm2, 22h
     ; All three 16-byte loads are done: advance by the full 48 bytes read.
     ; (Advancing by less would re-read input and leave eax misaligned for
     ; the movdqa loads of the next iteration.)
     add     eax, 30h
     pshuflw xmm4, xmm0, 22h
     pslldq  xmm2, 1
     pshufd  xmm0, xmm2, 77h
     movdqa  xmm2, xmm6
     pshuflw xmm0, xmm0, 22h
     ; xmm4 low qword = [px8, px9, px10, px11]
     punpcklwd xmm4, xmm0

     ; ================= round 4: pixels 12-15 (source bytes 36..47) ======
     movdqa  xmm0, xmm5
     pslldq  xmm0, 2
     shufps  xmm5, xmm0, 0E4h
     ; c0 mask precomputed before the loop
     movdqa  xmm0, xmm7
     pandn   xmm1, xmm5
     pand    xmm3, xmm5
     ; restores the loop-entry value of xmm5
     movdqa  xmm5, XMMWORD PTR greenSixBitMask
     psrlq   xmm1, 3
     psrlq   xmm3, 5
     pand    xmm2, xmm1
     pand    xmm0, xmm1
     psrlq   xmm2, 5
     por     xmm2, xmm0
     movdqa  xmm0, xmm5
     pand    xmm0, xmm3
     por     xmm2, xmm0
     pshufd  xmm0, xmm2, 22h
     pshuflw xmm1, xmm0, 22h
     pslldq  xmm2, 1
     pshufd  xmm0, xmm2, 77h
     pshuflw xmm0, xmm0, 22h
     punpcklwd xmm1, xmm0
     ; xmm4 = [px8..px11 | px12..px15]
     shufps  xmm4, xmm1, 44h
     ; restores the loop-entry value of xmm1
     movdqa  xmm1, XMMWORD PTR greenByteMask
     ; non-temporal store of the last eight output pixels
     movntdq xmmword ptr [edi], xmm4
     ; Step past the second output slot so the next pack does not overwrite
     ; it. Must come before dec: add would destroy the flags jnz tests.
     add     edi, 10h
     dec     ecx
     jnz     NextPack

Finished:
     ; make the non-temporal stores globally visible before returning
     sfence
     ret
SSENarrowBGRToBGR565 ENDP

public CNarrowBGRToBGR565
;-----------------------------------------------------------------------
; CNarrowBGRToBGR565
;
; Scalar conversion of packed 24-bit pixels to 16-bit 5:6:5 pixels.
; For each pixel, with c0/c1/c2 = its first/second/third byte in memory:
;     out16 = (c0 >> 3) | ((c1 >> 2) << 5) | ((c2 >> 3) << 11)
;
; Syntax: MASM (Intel operand order), 32-bit code, custom register ABI.
;
; In:    edi = 24-bit Input Pixels
;        eax = 16-bit Output Pixels
;        ebx = number of pixels (0 = nothing to do)
; Out:   edi advanced by 3 bytes per pixel, eax by 2 bytes per pixel,
;        ebx = 0
; Clobb: ecx, edx, esi, flags (ebx, esi and edi are NOT preserved)
;-----------------------------------------------------------------------
CNarrowBGRToBGR565 PROC
     test    ebx, ebx
     jz      AllDone                    ; a zero count must not wrap to 2^32

NextPixel:
     ; Read exactly the three bytes of this pixel. A single dword load
     ; would touch one byte beyond the last pixel of the buffer.
     movzx   edx, word ptr [edi]        ; edx = c1:c0
     movzx   esi, byte ptr [edi+2]      ; esi = c2
     add     edi, 3

     mov     ecx, edx
     and     esi, 0F8h                  ; top five bits of c2
     and     ecx, 0FC00h                ; top six bits of c1 (at bits 10..15)
     shl     esi, 8                     ; c2 -> bits 11..15
     shr     ecx, 5                     ; c1 -> bits 5..10
     and     edx, 0F8h                  ; top five bits of c0
     or      esi, ecx
     shr     edx, 3                     ; c0 -> bits 0..4
     or      esi, edx

     mov     [eax], si                  ; store the 16-bit pixel
     add     eax, 2
     dec     ebx
     jnz     short NextPixel

AllDone:
     ret
CNarrowBGRToBGR565 ENDP

fearless · January 13, 2015, 12:07:42 PM

This looks like something I might be interested in. I assume this is for DXT1 and DXT5 textures. I'm looking at writing something to handle the PowerVR PVR texture files, which can use these two pixel formats. Do you have an example of usage for this function, and also for the reverse function (BGR565 to BGR888), by any chance?

adeyblue · January 14, 2015, 08:17:23 AM

Nah, this is a plain software renderer, no textures in sight. I don't have the reverse, though it should simply be a case of putting the intrinsics bottom to top and using the reciprocal ops.

In C, using this should just be like this. I don't know the format of DXT textures so you'll have to skip any non texture data :

Code Select


// Example: convert a 512x512 image from 24-bit to 16-bit pixels.
// textureSize is a pixel count; the buffers below scale it by 3 and 2 bytes.
ULONG textureSize = 512 * 512;
// The routine takes its length in ecx as a count of 48-byte (16-pixel) packs.
ULONG numPacks = (textureSize * 3) / 48;
// Both buffers are 16-byte aligned: the routine loads with movdqa and
// stores with movntdq.
void* p888Data = _aligned_malloc(textureSize * 3, 16);
void* p565Data = _aligned_malloc(textureSize * 2, 16);
ReadFile(hTexture, p888Data, ...);
__asm
{
   // Save the three registers used to pass arguments.
   push ecx;
   push edi;
   push eax;
   // ecx = pack count, edi = 16-bit destination, eax = 24-bit source.
   mov ecx, numPacks;
   mov edi, p565Data;
   mov eax, p888Data;
   call SSENarrowBGRToBGR565;
   // Restore in reverse order of the pushes.
   pop eax;
   pop edi;
   pop ecx;
}

If you mean the code in the C file, it won't be directly applicable since it assumes a 1024-pixel-wide source; you also probably wouldn't need the x and y params, which specify where in pSrc to start reading from.

The MASM Forum

News:

BGR888 To BGR565 SSE2

adeyblue

fearless

adeyblue