I'll post what I'm using right now (mind you, this is a long function). The x86 version was created from an inline-assembly version found in a .cpp file and is known to work; the goal is to translate it to x64 and tidy it up a bit. It deals with YUV color-space conversion, in case you were wondering. There are three other conversion functions, but they are very similar.
x86:
conv1_YV12_SSE2 proc public srcpY:dword,srcpU:dword,srcpV:dword,src_pitchR:dword,src_pitchY2:dword,src_pitchUV:dword,dstpY:dword,dstpU:dword,dstpV:dword,dst_pitchR:dword,dst_pitchY2:dword,dst_pitchUV:dword,width_:dword,height:dword,loopctr:dword,fact_YU:oword,fact_YV:oword,fact_UU:oword,fact_UV:oword,fact_VV:oword,fact_VU:oword
mov esi, srcpY
mov edi, dstpY
mov eax, srcpU
movd mm0, dstpU ; out of GPRs on x86, so dstpU is parked in mm0
mov ecx, srcpV
mov edx, dstpV
pxor xmm7, xmm7 ; all 0's
movdqa xmm6, oword ptr Q64 ; 64's words
align 16
xloop:
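; each pass covers 16 Y pixels on two adjacent rows, which share the
; same 8 U and 8 V samples (YV12 is 4:2:0)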
movq xmm0, qword ptr[eax]
movq xmm1, qword ptr[ecx]
punpcklbw xmm0, xmm7 ; unpack to words, 0U0U0U0U
punpcklbw xmm1, xmm7 ; unpack to words, 0V0V0V0V
psubw xmm0, oword ptr Q128 ; adj for 128 chroma offset
psubw xmm1, oword ptr Q128 ; adj for 128 chroma offset
pmullw xmm0, xmm6 ; *64, for rounding later
pmullw xmm1, xmm6 ; *64, for rounding later
movdqa xmm2, xmm0 ; copy so mm0 is stored for later
movdqa xmm3, xmm1 ; copy so mm1 is stored for later
pmulhw xmm2, fact_YU ; YU factor (U term in adjusted Y)
pmulhw xmm3, fact_YV ; YV factor (V term in adjusted Y)
paddw xmm2, xmm3 ; total adjusted amount to add to Y
movdqa xmm4, [esi]
movdqa xmm3, xmm2 ; make copy
punpcklwd xmm2, xmm2 ; duplicate words 0-3: each U/V term covers 2 Y pixels
punpckhwd xmm3, xmm3 ; duplicate words 4-7
movdqa xmm5, xmm4 ; make copy of it
punpcklbw xmm4, xmm7 ; 0Y0Y0Y0Y
punpckhbw xmm5, xmm7 ; 0Y0Y0Y0Y
pmullw xmm4, xmm6 ; *64
pmullw xmm5, xmm6 ; *64
paddw xmm4, xmm2 ; add uv adjustment
paddw xmm5, xmm3 ; add uv adjustment
paddw xmm4, oword ptr Q32 ; bump up 32 for rounding
paddw xmm5, oword ptr Q32 ; bump up 32 for rounding
psraw xmm4, 6 ; /64
psraw xmm5, 6 ; /64
packuswb xmm4, xmm5 ; pack back to 16 bytes, saturate to 0-255
movdqa [edi], xmm4
add esi, src_pitchR
add edi, dst_pitchR
movdqa xmm4, [esi]
movdqa xmm5, xmm4 ; make copy of it
punpcklbw xmm4, xmm7 ; 0Y0Y0Y0Y
punpckhbw xmm5, xmm7 ; 0Y0Y0Y0Y
pmullw xmm4, xmm6 ; *64
pmullw xmm5, xmm6 ; *64
paddw xmm4, xmm2 ; add uv adjustment
paddw xmm5, xmm3 ; add uv adjustment
paddw xmm4, oword ptr Q32 ; bump up 32 for rounding
paddw xmm5, oword ptr Q32 ; bump up 32 for rounding
psraw xmm4, 6 ; /64
psraw xmm5, 6 ; /64
packuswb xmm4, xmm5 ; pack back to 16 bytes, saturate to 0-255
movdqa [edi], xmm4
sub esi, src_pitchR ; restore curr line
sub edi, dst_pitchR ; restore curr line
movdqa xmm2, xmm0 ; mov back stored U words
movdqa xmm3, xmm1 ; mov back stored V words
paddw xmm2, xmm2 ; adjust for /2 scale in fact_UU
pmulhw xmm2, fact_UU ; UU factor (U term in adjusted U)
pmulhw xmm3, fact_UV ; UV factor (V term in adjusted U)
psubsw xmm2, xmm3 ; this is new U
movd mm1, eax ; park the srcpU pointer in mm1
paddw xmm2, oword ptr Q8224 ; bias up by 64*128 + 32
psraw xmm2, 6 ; /64
movd eax, mm0 ; eax = dstpU
packuswb xmm2, xmm7 ; pack 8 words down to 8 bytes
movq qword ptr[eax], xmm2 ; store adjusted U
movdqa xmm2, xmm0 ; mov back stored U words
movdqa xmm3, xmm1 ; mov back stored V words
movd eax, mm1 ; restore the srcpU pointer
paddw xmm3, xmm3 ; adjust for /2 scale in fact_VV
pmulhw xmm2, fact_VU ; VU factor (U term in adjusted V)
pmulhw xmm3, fact_VV ; VV factor (V term in adjusted V)
psubsw xmm3, xmm2 ; 1st term negative, this is new V
paddw xmm3, oword ptr Q8224 ; bias up by 64*128 + 32
psraw xmm3, 6 ; /64
packuswb xmm3, xmm7 ; pack 8 words down to 8 bytes
movq qword ptr[edx], xmm3 ; store adjusted V
add esi, 16 ; bump ptrs
add edi, 16
add eax, 8
add ecx, 8
paddd mm0, Q8 ; advance dstpU by 8
add edx, 8
dec loopctr ; decrease counter
jnz xloop ; loop
sub height, 2
jz return
mov eax, width_
mov esi, srcpY
shr eax, 4
mov edi, dstpY
mov loopctr, eax
mov eax, srcpU
mov edx, dstpV
mov ecx, srcpV
add esi, src_pitchY2
add edi, dst_pitchY2
add eax, src_pitchUV
add ecx, src_pitchUV
add edx, dst_pitchUV
mov srcpY, esi
mov dstpY, edi
mov dstpV, edx
mov srcpU, eax
mov srcpV, ecx
movd mm1, eax ; park the updated srcpU
mov eax, dstpU
add eax, dst_pitchUV
mov dstpU, eax
movd mm0, eax ; mm0 = updated dstpU
movd eax, mm1 ; restore srcpU
jmp xloop
return:
ret
conv1_YV12_SSE2 endp
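For anyone following the offsets in the x64 equates below: in the Microsoft x64 calling convention the first four arguments arrive in rcx/rdx/r8/r9, the caller reserves a 32-byte shadow area, and the remaining arguments sit above it on the stack. A minimal sketch of the frame after the prolog's push rbp / mov rbp,rsp, assuming the same argument order as the x86 proc:
; [rbp+0]       saved rbp
; [rbp+8]       return address
; [rbp+16..40]  shadow space (home slots for rcx, rdx, r8, r9)
; [rbp+48]      src_pitchY2 (5th argument), later args every 8 bytes up
; rcx = srcpY, rdx = srcpU, r8 = srcpV, r9d = src_pitchR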
x64:
conv1_YV12_SSE2 proc public frame
src_pitchY2 equ dword ptr [rbp+48]
src_pitchUV equ dword ptr [rbp+56]
dstpY equ qword ptr [rbp+64]
dstpU equ qword ptr [rbp+72]
dstpV equ qword ptr [rbp+80]
dst_pitchR equ dword ptr [rbp+88]
dst_pitchY2 equ dword ptr [rbp+96]
dst_pitchUV equ dword ptr [rbp+104]
width_ equ dword ptr [rbp+112]
height equ dword ptr [rbp+120]
loopctr equ dword ptr [rbp+128]
; NOTE: these six slots are only 8 bytes apart, which matches the
; Microsoft x64 ABI rule (assuming a C/C++ caller) that arguments
; larger than 8 bytes are passed by reference -- each slot holds a
; pointer to a 16-byte-aligned oword, dereferenced inside the loop
fact_YU equ qword ptr [rbp+136]
fact_YV equ qword ptr [rbp+144]
fact_UU equ qword ptr [rbp+152]
fact_UV equ qword ptr [rbp+160]
fact_VV equ qword ptr [rbp+168]
fact_VU equ qword ptr [rbp+176]
push rbp
.pushreg rbp
mov rbp,rsp
push rsi
.pushreg rsi
push rdi
.pushreg rdi
push r12
.pushreg r12
push r13
.pushreg r13
push r14
.pushreg r14
push r15
.pushreg r15
sub rsp,96
.allocstack 96
movdqu oword ptr[rsp],xmm6
.savexmm128 xmm6,0
movdqu oword ptr[rsp+16],xmm7
.savexmm128 xmm7,16
movdqu oword ptr[rsp+32],xmm8
.savexmm128 xmm8,32
movdqu oword ptr[rsp+48],xmm9
.savexmm128 xmm9,48
movdqu oword ptr[rsp+64],xmm10
.savexmm128 xmm10,64
movdqu oword ptr[rsp+80],xmm11
.savexmm128 xmm11,80
.endprolog
mov rsi, rcx ; srcpY (arg 1)
mov rdi, dstpY
mov rax, rdx ; srcpU (arg 2)
mov rcx, r8 ; srcpV (arg 3) -- grab it before r8 is reused below
mov r8, dstpU
mov rdx, dstpV
movsxd r9, r9d ; src_pitchR (arg 4)
movsxd r10, dst_pitchR
movsxd r11, src_pitchY2
movsxd r12, dst_pitchY2
movsxd r13, src_pitchUV
movsxd r14, dst_pitchUV
; r15 is kept free as a scratch register for the fact_* pointer loads
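; build the word constants in registers instead of loading them from
; memory: pcmpeqb sets every bit, psrlw 15 leaves 0x0001 in each word,
; and psllw scales that up to the wanted power of two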
; 0x0020, Q32
pcmpeqb xmm8, xmm8
psrlw xmm8, 15
psllw xmm8, 5
; 0x0040, Q64
pcmpeqb xmm9, xmm9
psrlw xmm9, 15
psllw xmm9, 6
; 0x0080, Q128
pcmpeqb xmm10, xmm10
psrlw xmm10, 15
psllw xmm10, 7
; 0x2020, Q8224
pcmpeqb xmm11, xmm11
psrlw xmm11, 15
psllw xmm11, 13
por xmm11, xmm8
pxor xmm7, xmm7 ; all 0's
movdqa xmm6, xmm9 ; keep the Q64 words in xmm6 so the loop body matches the x86 code
xloop:
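; same structure as the x86 loop: two Y rows per iteration, sharing one
; row of U and V samples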
movq xmm0, qword ptr[rax] ; 8 U samples, as in the x86 version
movq xmm1, qword ptr[rcx] ; 8 V samples (rax/rcx advance by 8, so an aligned movdqa would fault)
punpcklbw xmm0, xmm7 ; unpack to words, 0U0U0U0U
punpcklbw xmm1, xmm7 ; unpack to words, 0V0V0V0V
psubw xmm0, xmm10 ; adj for 128 chroma offset
psubw xmm1, xmm10 ; adj for 128 chroma offset
pmullw xmm0, xmm6 ; *64, for rounding later
pmullw xmm1, xmm6 ; *64, for rounding later
movdqa xmm2, xmm0 ; copy so xmm0 is stored for later
movdqa xmm3, xmm1 ; copy so xmm1 is stored for later
mov r15, fact_YU ; the oword factors arrive by reference (see equates)
pmulhw xmm2, oword ptr[r15] ; YU factor (U term in adjusted Y)
mov r15, fact_YV
pmulhw xmm3, oword ptr[r15] ; YV factor (V term in adjusted Y)
paddw xmm2, xmm3 ; total adjusted amount to add to Y
movdqa xmm4, [rsi]
movdqa xmm3, xmm2 ; make copy
punpcklwd xmm2, xmm2 ; duplicate words 0-3: each U/V term covers 2 Y pixels
punpckhwd xmm3, xmm3 ; duplicate words 4-7
movdqa xmm5, xmm4 ; make copy of it
punpcklbw xmm4, xmm7 ; 0Y0Y0Y0Y
punpckhbw xmm5, xmm7 ; 0Y0Y0Y0Y
pmullw xmm4, xmm6 ; *64
pmullw xmm5, xmm6 ; *64
paddw xmm4, xmm2 ; add uv adjustment
paddw xmm5, xmm3 ; add uv adjustment
paddw xmm4, xmm8 ; bump up 32 for rounding
paddw xmm5, xmm8 ; bump up 32 for rounding
psraw xmm4, 6 ; /64
psraw xmm5, 6 ; /64
packuswb xmm4, xmm5 ; pack back to 16 bytes, saturate to 0-255
movdqa [rdi], xmm4
add rsi, r9
add rdi, r10
movdqa xmm4, [rsi]
movdqa xmm5, xmm4 ; make copy of it
punpcklbw xmm4, xmm7 ; 0Y0Y0Y0Y
punpckhbw xmm5, xmm7 ; 0Y0Y0Y0Y
pmullw xmm4, xmm6 ; *64
pmullw xmm5, xmm6 ; *64
paddw xmm4, xmm2 ; add uv adjustment
paddw xmm5, xmm3 ; add uv adjustment
paddw xmm4, xmm8 ; bump up 32 for rounding
paddw xmm5, xmm8 ; bump up 32 for rounding
psraw xmm4, 6 ; /64
psraw xmm5, 6 ; /64
packuswb xmm4, xmm5 ; pack back to 16 bytes, saturate to 0-255
movdqa [rdi], xmm4
sub rsi, r9 ; restore curr line
sub rdi, r10 ; restore curr line (dst_pitchR, matching the add above)
movdqa xmm2, xmm0 ; mov back stored U words
movdqa xmm3, xmm1 ; mov back stored V words
paddw xmm2, xmm2 ; adjust for /2 scale in fact_UU
mov r15, fact_UU
pmulhw xmm2, oword ptr[r15] ; UU factor (U term in adjusted U)
mov r15, fact_UV
pmulhw xmm3, oword ptr[r15] ; UV factor (V term in adjusted U)
psubsw xmm2, xmm3 ; this is new U
paddw xmm2, xmm11 ; bias up by 64*128 + 32
psraw xmm2, 6 ; /64
packuswb xmm2, xmm7 ; pack 8 words down to 8 bytes
movq qword ptr[r8], xmm2 ; store adjusted U (8 bytes, like the x86 movq; a 16-byte movdqa would zero the next 8 pixels)
movdqa xmm2, xmm0 ; mov back stored U words
movdqa xmm3, xmm1 ; mov back stored V words
paddw xmm3, xmm3 ; adjust for /2 scale in fact_VV
mov r15, fact_VU
pmulhw xmm2, oword ptr[r15] ; VU factor (U term in adjusted V)
mov r15, fact_VV
pmulhw xmm3, oword ptr[r15] ; VV factor (V term in adjusted V)
psubsw xmm3, xmm2 ; 1st term negative, this is new V
paddw xmm3, xmm11 ; bias up by 64*128 + 32
psraw xmm3, 6 ; /64
packuswb xmm3, xmm7 ; pack 8 words down to 8 bytes
movq qword ptr[rdx], xmm3 ; store adjusted V (8 bytes, like the x86 movq)
add rsi, 16 ; bump ptrs
add rdi, 16
add rax, 8
add rcx, 8
add r8, 8
add rdx, 8
dec loopctr ; decrease counter
jnz xloop ; loop
sub height, 2
jz return
mov r15d, width_ ; recompute the per-row counter each pass
shr r15d, 4
mov loopctr, r15d
shl r15, 4 ; r15 = width: rewind the Y pointers to the row start
sub rsi, r15
sub rdi, r15
shr r15, 1 ; the chroma pointers advanced by width/2
sub rax, r15
sub rcx, r15
sub r8, r15
sub rdx, r15
add rsi, r11 ; then step to the next row pair, as the x86 version does
add rdi, r12
add rax, r13
add rcx, r13
add r8, r14
add rdx, r14
jmp xloop
return:
add rsp,96
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
ret
conv1_YV12_SSE2 endp
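One caveat that applies to both versions: width_ is assumed to be a multiple of 16, and the Y planes and their pitches 16-byte aligned, since the movdqa loads and stores fault otherwise; the chroma rows are only touched with 8-byte movq accesses. And because the x64 build reads the six factors through pointers, a C/C++ caller's __m128i arguments land in 16-byte-aligned temporaries automatically, while an asm caller would have to provide aligned owords and pass their addresses.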