I've just done this for a PlayStation emulator graphics plugin I'm updating, but it seems like there are too many shifts and a lot of shuffling for what it does. I'm using C and VC2013's SSE intrinsics, so this isn't handwritten assembly, but apart from the interleaving it's pretty much 1:1. /FAs won't output my comments, so I've spliced them into the first round. The whole thing looks monstrous, but it's just the same thing unrolled four times.
So, any suggestions or improvements? I've attached the C intrinsic version (renamed to .zip) in case anybody wants or needs it.
.data
align 16
; 16-byte constant masks used by SSENarrowBGRToBGR565.
;
; The BYTE directive emits its initialisers in ascending address order, so the
; first value listed is byte 0 (least significant) of the XMM register after a
; movdqa load. Note that this is the opposite of _mm_set_epi8's argument order.
;
; The routine arranges each group of four 24-bit pixels so that their bytes sit
; at register offsets 0-2, 3-5, 8-10 and 11-13 (offsets 6-7 and 14-15 are
; don't-care). The masks below are laid out for that arrangement.

; Low five bits of byte 2 of each pixel (offsets 2, 5, 10, 13). The routine
; also uses this mask shifted down by two bytes to select byte 0 of each pixel.
fiveBitMask byte 00,00,1fh,00,00,1fh,00,00,00,00,1fh,00,00,1fh,00,00

; The whole green byte (byte 1) of each pixel: offsets 1, 4, 9, 12.
greenByteMask byte 00,0ffh,00,00,0ffh,00,00,00,00,0ffh,00,00,0ffh,00,00,00

; 07E0h (bits 5-10, the 565 green field) positioned over each 16-bit output
; pixel, which at that stage starts at offsets 0, 3, 8 and 11.
greenSixBitMask byte 0e0h,07h,00,0e0h,07h,00,00,00,0e0h,07h,00,0e0h,07h,00,00,00
; eax = 24-bit pixels to convert
; ecx = number of 48 byte 16-pixel 'packs' to process
; edi = 16-bit output pixels
.code
SSENarrowBGRToBGR565 PROC
;-----------------------------------------------------------------------
; Narrows packed 24-bit pixels to packed 16-bit 5:6:5 pixels with SSE2.
;
; Syntax: MASM, 32-bit x86. Register-based (non-standard) calling convention.
; In:    eax = source, 24-bit pixels; read with movdqa, so it must be
;              16-byte aligned
;        ecx = number of 48-byte (16-pixel) packs to convert; 0 is allowed
;        edi = destination, 16-bit pixels; written with movntdq, so it must
;              be 16-byte aligned
; Out:   32 bytes written to the destination per pack
; Clobb: eax, ecx, edi, xmm0-xmm7, flags
;
; Per pixel: byte 0 >> 3 lands in bits 0-4, byte 1 (green) >> 2 in bits 5-10
; and byte 2 >> 3 in bits 11-15 of the output word.
;
; Each pack is handled as four rounds of four pixels (12 source bytes each).
; The compiler has interleaved the rounds, so the set-up for a round is mixed
; into the tail of the previous one.
;
; Mask registers live at the top of the loop:
;   xmm6 = fiveBitMask, xmm7 = fiveBitMask >> 2 bytes,
;   xmm1 = greenByteMask, xmm5 = greenSixBitMask
; xmm5 and xmm6 are reused as scratch inside the loop and reloaded from
; memory before the back edge.
;-----------------------------------------------------------------------
    test ecx, ecx
    jz loc_1005F047                     ; nothing to convert
    movdqa xmm6, XMMWORD PTR fiveBitMask
    movdqa xmm1, XMMWORD PTR greenByteMask
    movdqa xmm7, xmm6
    movdqa xmm5, XMMWORD PTR greenSixBitMask
    psrldq xmm7, 2                      ; byte-0 variant of the five-bit mask

loc_1005EE14:
    ; ---- Round 1: pixels 0-3 (source bytes 0-11) ---------------------
    ; A 16-byte load holds five and a third pixels; only the first four
    ; are consumed per round.
    movdqa xmm2, xmmword ptr [eax]
    movdqa xmm3, xmm1
    movdqa xmm0, xmm2
    ; Keep source bytes 12-15 (top dword, broadcast) for round 2.
    pshufd xmm4, xmm2, 0FFh
    ; Shift a copy up two bytes so pixel 2 starts at offset 8 instead of
    ; straddling the 64-bit boundary.
    pslldq xmm0, 2
    ; Low qword from the load, high qword from the shifted copy:
    ; pixels now sit at offsets 0-2, 3-5, 8-10, 11-13.
    shufps xmm2, xmm0, 0E4h
    movdqa xmm0, xmm6
    ; xmm1 = pixels with the green bytes cleared
    pandn xmm1, xmm2
    ; five-bit mask moved down two bytes to cover byte 0 of each pixel
    psrldq xmm0, 2
    ; xmm3 = green bytes only (they need a different shift)
    pand xmm3, xmm2
    ; divide the five-bit channels by 8
    psrlq xmm1, 3
    movdqa xmm2, xmm6
    ; green: >>2 for the division plus >>3 to reach its output position
    psrlq xmm3, 5
    ; isolate the byte-2 channel
    pand xmm2, xmm1
    movdqa xmm6, xmmword ptr [eax+10h]  ; second 16 source bytes (round 2/3)
    ; isolate the byte-0 channel; it is already in its final position
    pand xmm0, xmm1
    ; move the byte-2 channel down to bits 11-15 of the output word
    psrlq xmm2, 5
    por xmm2, xmm0
    movdqa xmm0, xmm5
    ; trim the shifted greens to the six-bit field and merge them in.
    ; xmm2 now holds four 16-bit results at offsets 0, 3, 8 and 11.
    pand xmm0, xmm3
    por xmm2, xmm0
    ; Gather pixels 0 and 2 into the low qword ...
    pshufd xmm0, xmm2, 22h
    ; ... pixels 1 and 3 straddle dword boundaries, so shift up one byte ...
    pslldq xmm2, 1
    ; ... low words become pixel 0, pixel 2 ...
    pshuflw xmm5, xmm0, 22h
    ; ... then gather pixels 1 and 3 the same way.
    pshufd xmm0, xmm2, 77h
    movdqa xmm2, xmm6                   ; (round 2 set-up)
    pslldq xmm2, 4                      ; (round 2 set-up)
    pshuflw xmm0, xmm0, 22h             ; low words = pixel 1, pixel 3
    movss xmm2, xmm4                    ; (round 2) xmm2 = source bytes 12-27
    movdqa xmm4, XMMWORD PTR greenByteMask
    ; Interleave: low qword of xmm5 = pixels 0, 1, 2, 3
    punpcklwd xmm5, xmm0

    ; ---- Round 2: pixels 4-7 (source bytes 12-23) --------------------
    movdqa xmm1, xmm4
    movdqa xmm0, xmm2
    psrldq xmm6, 8                      ; keep source bytes 24-31 for round 3
    pslldq xmm0, 2
    movdqa xmm3, xmm4
    shufps xmm2, xmm0, 0E4h
    movdqa xmm0, XMMWORD PTR fiveBitMask
    pandn xmm1, xmm2
    pand xmm3, xmm2
    psrlq xmm1, 3
    movdqa xmm2, xmm0
    psrlq xmm3, 5
    pand xmm2, xmm1
    psrldq xmm0, 2
    pand xmm0, xmm1
    psrlq xmm2, 5
    por xmm2, xmm0
    movdqa xmm0, XMMWORD PTR greenSixBitMask
    pand xmm0, xmm3
    movdqa xmm3, xmm4                   ; green byte mask for round 3
    por xmm2, xmm0
    pshufd xmm0, xmm2, 22h
    pslldq xmm2, 1
    pshuflw xmm1, xmm0, 22h
    pshufd xmm0, xmm2, 77h
    pshuflw xmm0, xmm0, 22h
    punpcklwd xmm1, xmm0                ; low qword = pixels 4-7
    shufps xmm5, xmm1, 44h              ; xmm5 = pixels 0-7
    movdqa xmm1, xmm4                   ; green byte mask for round 3
    movntdq xmmword ptr [edi], xmm5     ; first 16 output bytes

    ; ---- Round 3: pixels 8-11 (source bytes 24-35) -------------------
    movdqa xmm5, xmmword ptr [eax+20h]  ; third 16 source bytes
    movdqa xmm2, xmm5
    psrldq xmm5, 4                      ; xmm5 = source bytes 36-47 (round 4)
    pslldq xmm2, 8
    movsd xmm2, xmm6                    ; xmm2 = source bytes 24-39
    movdqa xmm6, XMMWORD PTR fiveBitMask
    movdqa xmm0, xmm2
    pslldq xmm0, 2
    shufps xmm2, xmm0, 0E4h
    movdqa xmm0, xmm6
    pandn xmm1, xmm2
    psrldq xmm0, 2
    pand xmm3, xmm2
    psrlq xmm1, 3
    movdqa xmm2, xmm6
    psrlq xmm3, 5
    pand xmm2, xmm1
    pand xmm0, xmm1
    psrlq xmm2, 5
    por xmm2, xmm0
    movdqa xmm0, XMMWORD PTR greenSixBitMask
    add edi, 10h                        ; step past the first 16 output bytes
    movdqa xmm1, XMMWORD PTR greenByteMask
    pand xmm0, xmm3
    por xmm2, xmm0
    movdqa xmm3, xmm1
    pshufd xmm0, xmm2, 22h
    ; All three loads of this pack are done, so advance the source by the
    ; full 48 bytes consumed per iteration.
    add eax, 30h
    pshuflw xmm4, xmm0, 22h
    pslldq xmm2, 1
    pshufd xmm0, xmm2, 77h
    movdqa xmm2, xmm6
    pshuflw xmm0, xmm0, 22h
    punpcklwd xmm4, xmm0                ; low qword = pixels 8-11

    ; ---- Round 4: pixels 12-15 (source bytes 36-47) ------------------
    movdqa xmm0, xmm5
    pslldq xmm0, 2
    shufps xmm5, xmm0, 0E4h
    movdqa xmm0, xmm7                   ; pre-shifted byte-0 mask
    pandn xmm1, xmm5
    pand xmm3, xmm5
    movdqa xmm5, XMMWORD PTR greenSixBitMask ; restored for the next iteration
    psrlq xmm1, 3
    psrlq xmm3, 5
    pand xmm2, xmm1
    pand xmm0, xmm1
    psrlq xmm2, 5
    por xmm2, xmm0
    movdqa xmm0, xmm5
    pand xmm0, xmm3
    por xmm2, xmm0
    pshufd xmm0, xmm2, 22h
    pshuflw xmm1, xmm0, 22h
    pslldq xmm2, 1
    pshufd xmm0, xmm2, 77h
    pshuflw xmm0, xmm0, 22h
    punpcklwd xmm1, xmm0                ; low qword = pixels 12-15
    shufps xmm4, xmm1, 44h              ; xmm4 = pixels 8-15
    movdqa xmm1, XMMWORD PTR greenByteMask ; restored for the next iteration
    movntdq xmmword ptr [edi], xmm4     ; second 16 output bytes
    ; Step past the second store so the next pack does not overwrite it.
    ; Must precede dec, whose flags the jnz consumes.
    add edi, 10h
    dec ecx
    jnz loc_1005EE14

loc_1005F047:
    sfence                              ; order the non-temporal stores
    ret
SSENarrowBGRToBGR565 ENDP
public CNarrowBGRToBGR565
CNarrowBGRToBGR565 PROC
;-----------------------------------------------------------------------
; Scalar conversion of packed 24-bit pixels to packed 16-bit 5:6:5 pixels,
; one pixel per iteration.
;
; Syntax: MASM, 32-bit x86. Register-based (non-standard) calling convention.
; In:    edi = 24-bit input pixels
;        eax = 16-bit output pixels
;        ebx = number of pixels; 0 is allowed and converts nothing
; Clobb: eax, ebx, ecx, edx, esi, edi, flags
;
; NOTE: every pixel is fetched with a 4-byte load although a pixel is only
; 3 bytes, so the load for the final pixel touches one byte beyond it. The
; input buffer needs at least one readable byte of slack after the last pixel.
;-----------------------------------------------------------------------
    test ebx, ebx
    jz short CNarrow_done               ; without this, dec/jnz would wrap on 0
loc_100654E0:
    mov edx, [edi]                      ; edx = byte0 | byte1<<8 | byte2<<16 | next<<24
    lea eax, [eax+2]                    ; advance output (lea leaves flags alone)
    mov esi, edx
    lea edi, [edi+3]                    ; advance input by one pixel
    shr esi, 10h                        ; esi = byte2 | next<<8
    mov ecx, edx
    ; Keep the top five bits of byte 2. The mask also lets the following
    ; pixel's byte through in bits 8-15; the two left shifts below push it
    ; above bit 15 and only the low word is stored, so it never reaches
    ; the output.
    and esi, 0FFF8h
    shr ecx, 8
    shl esi, 5
    and ecx, 0FCh                       ; top six bits of byte 1 (green)
    or esi, ecx
    shr edx, 3
    shl esi, 3                          ; byte 2 -> bits 11-15, green -> bits 5-10
    and edx, 1Fh                        ; top five bits of byte 0 -> bits 0-4
    or esi, edx
    mov [eax-2], si                     ; eax was already advanced above
    dec ebx
    jnz short loc_100654E0
CNarrow_done:
    ret
CNarrowBGRToBGR565 ENDP
This looks like something I might be interested in. I assume this is for DXT1 and DXT5 textures. I'm looking at writing something to handle PowerVR PVR texture files, which can use these two pixel formats. Do you have an example of usage for this function, and also for the reverse function (BGR565 to BGR888), by any chance?
Nah, this is a plain software renderer, no textures in sight. I don't have the reverse, though it should simply be a case of putting the intrinsics bottom to top and using the reciprocal ops.
In C, using this should look something like the following. I don't know the format of DXT textures, so you'll have to skip any non-texture data:
// Example: convert a 512x512 24-bit image to 16-bit with SSENarrowBGRToBGR565.
ULONG textureSize = 512 * 512;
// The routine counts work in 48-byte (16-pixel) packs.
ULONG numPacks = (textureSize * 3) / 48;
// Both buffers are requested with 16-byte alignment; the routine accesses
// them with movdqa / movntdq.
void* p888Data = _aligned_malloc(textureSize * 3, 16);
void* p565Data = _aligned_malloc(textureSize * 2, 16);
ReadFile(hTexture, p888Data, ...);
// The routine takes its arguments in registers (ecx = pack count,
// edi = 16-bit output, eax = 24-bit input), so save those three registers
// around the call and restore them in reverse order afterwards.
__asm
{
push ecx;
push edi;
push eax;
mov ecx, numPacks;
mov edi, p565Data;
mov eax, p888Data;
call SSENarrowBGRToBGR565;
pop eax;
pop edi;
pop ecx;
}
If you mean the code in the C file, it won't be directly applicable since it assumes a 1024-pixel-wide source; you also probably wouldn't need the x and y params, which specify where in pSrc to start reading from.