Working on some macros. It would be nice to get input on whether there is a faster/better way to load a tile from an image, better ways to compare, etc.
Suggestions for more macros are welcome. Names ending in ALL use the ***PS mnemonics; names ending in ALD use the SSE2 double-precision ***PD mnemonics.
Also PROCs.
"aim for the stars when it comes to colorization speed"
1: It would be nice to test against a single MULPS.
2: It would be nice to test unroll factors between 2 and 8 on many CPUs, to understand how many unrolled SSE instructions different CPUs can chew through simultaneously.
; #########################################################################
; PUSHSTATE - store the x87/MMX/SSE state with FXSAVE into the 512-byte
; area at [ebx], then advance ebx past that area.
; In:    ebx = address of the save area (FXSAVE needs 16-byte alignment)
; Out:   ebx = ebx + 512
; Clobb: flags (from the add)
PUSHSTATE MACRO
        fxsave  [ebx]
        add     ebx,512                 ; step to the next free save slot
ENDM
; POPSTATE - step ebx back by 512 bytes and reload the x87/MMX/SSE state
; from the area it then points at with FXRSTOR.
; In:    ebx = address just past a 512-byte FXSAVE image
; Out:   ebx = ebx - 512
; Clobb: flags (from the sub), all x87/MMX/SSE state
POPSTATE MACRO
        sub     ebx,512                 ; back up to the start of the saved image
        fxrstor [ebx]
ENDM
; IF2D adr,greyvalue - range test of a scalar double against a table interval.
;
; Computes, relative to the table entry at adr:
;   xmm1 = greyvalue  - [adr]        (offset of the grey value above the entry)
;   xmm0 = [adr+16]   - [adr]        (width of the interval to the next entry)
; and then compares width against offset with COMISD.
;
; Flags after the macro (COMISD xmm0,xmm1):
;   CF=1 (jb)  -> width < offset : greyvalue lies above the upper entry
;   CF=0 (jae) -> greyvalue is not above the upper entry
; Only the upper bound is tested; a greyvalue below [adr] leaves xmm1
; negative and must be checked separately by the caller if it matters.
;
; Fix: the original compared xmm0 (a difference) with the absolute greyvalue
; and never used xmm1, so the subtraction of the table base was lost.
;
; NOTE(review): the +16 step assumes consecutive table entries are 16 bytes
; apart; for tightly packed doubles it would be +8 - confirm against the table.
; Clobb: xmm0, xmm1, flags
IF2D MACRO adr,greyvalue
        movsd   xmm1,greyvalue
        subsd   xmm1,adr                ; xmm1 = grey - lower entry
        movsd   xmm0,adr+16             ; next entry in the table
        subsd   xmm0,adr                ; xmm0 = upper entry - lower entry
        comisd  xmm0,xmm1               ; CF=1 when grey is above the upper entry
ENDM
; MOVTILE adr,pitch - load an 8-row tile (16 bytes per row) into xmm0..xmm7.
;
; Row n of the tile is read from adr + n*pitch into xmmN, so a rectangular
; block of the image sits in registers for colour-space conversion and
; neighbour checks.
;
; In:    adr   = memory operand naming the first row (its address is taken with LEA)
;        pitch = distance in bytes between rows; may be an immediate, a memory
;                variable or a register other than ebx
; Out:   xmm0..xmm7 = rows 0..7
; Clobb: eax (= pitch on exit), ebx (= adr + 3*pitch on exit), flags
; Every row address must be 16-byte aligned because MOVAPS is used.
;
; Fix: the original added the macro argument `pitch` directly as an address
; displacement ([ebx+eax*4+pitch]), which is only correct when pitch is an
; immediate constant, and wrote `lea eax,eax+eax*2` without brackets. Here
; pitch is used only through eax, and the base pointer is stepped instead.
MOVTILE MACRO adr,pitch
        lea     ebx,adr                 ; ebx = address of row 0
        mov     eax,pitch               ; eax = row stride
        movaps  xmm0,[ebx]              ; row 0
        movaps  xmm1,[ebx+eax]          ; row 1
        movaps  xmm2,[ebx+eax*2]        ; row 2
        movaps  xmm4,[ebx+eax*4]        ; row 4
        add     ebx,eax                 ; ebx = row 1
        movaps  xmm3,[ebx+eax*2]        ; row 3 = row 1 + 2*pitch
        movaps  xmm5,[ebx+eax*4]        ; row 5 = row 1 + 4*pitch
        add     ebx,eax                 ; ebx = row 2
        movaps  xmm6,[ebx+eax*4]        ; row 6 = row 2 + 4*pitch
        add     ebx,eax                 ; ebx = row 3
        movaps  xmm7,[ebx+eax*4]        ; row 7 = row 3 + 4*pitch
ENDM
; MOVALL var - copy the same 16-byte operand into each of xmm0..xmm7 with
; MOVAPS (a memory operand must be 16-byte aligned).
; Clobb: xmm0..xmm7
MOVALL MACRO var
    FOR reg,<xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        movaps  reg,var
    ENDM
ENDM
; MOVUALL var - copy the same 16-byte operand into each of xmm0..xmm7 with
; MOVUPS, so a memory operand does not have to be aligned.
; Clobb: xmm0..xmm7
MOVUALL MACRO var
    FOR reg,<xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        movups  reg,var
    ENDM
ENDM
; ADDALL vectconst - packed-single add of vectconst to each of xmm0..xmm7
; (xmmN = xmmN + vectconst).
; Clobb: xmm0..xmm7
ADDALL MACRO vectconst
    FOR reg,<xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        addps   reg,vectconst
    ENDM
ENDM
; MULALL vectconst - packed-single multiply of each of xmm0..xmm7 by
; vectconst (xmmN = xmmN * vectconst).
; Clobb: xmm0..xmm7
MULALL MACRO vectconst
    FOR reg,<xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        mulps   reg,vectconst
    ENDM
ENDM
; SUBALL var - packed-single subtract of var from each of xmm0..xmm7
; (xmmN = xmmN - var).
; Clobb: xmm0..xmm7
SUBALL MACRO var
    FOR reg,<xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        subps   reg,var
    ENDM
ENDM
; RCPALL var - write the approximate packed-single reciprocal of var into
; each of xmm0..xmm7 (xmmN = rcpps(var)); previous register contents are
; overwritten, not used as input.
; NOTE(review): if an in-place reciprocal of each register was intended,
; this would need `rcpps xmmN,xmmN` instead - confirm with the callers.
; Clobb: xmm0..xmm7
RCPALL MACRO var
    FOR reg,<xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        rcpps   reg,var
    ENDM
ENDM
; DIVALL var - packed-single divide of each of xmm0..xmm7 by var
; (xmmN = xmmN / var).
; Clobb: xmm0..xmm7
DIVALL MACRO var
    FOR reg,<xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        divps   reg,var
    ENDM
ENDM
; SQRTALL var - write the packed-single square root of var into each of
; xmm0..xmm7 (xmmN = sqrtps(var)); previous register contents are
; overwritten, not used as input.
; Clobb: xmm0..xmm7
SQRTALL MACRO var
    FOR reg,<xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        sqrtps  reg,var
    ENDM
ENDM
; RSQRTALL var - write the approximate packed-single reciprocal square root
; of var into each of xmm0..xmm7 (xmmN = rsqrtps(var)); previous register
; contents are overwritten, not used as input.
; Clobb: xmm0..xmm7
RSQRTALL MACRO var
    FOR reg,<xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        rsqrtps reg,var
    ENDM
ENDM
; XXALL - square every packed-single lane of xmm0..xmm7 in place
; (xmmN = xmmN * xmmN).
; Clobb: xmm0..xmm7
XXALL MACRO
    FOR reg,<xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        mulps   reg,reg
    ENDM
ENDM
;***PD macros
; ADDALD vectconst - packed-double add of vectconst to each of xmm0..xmm7
; (xmmN = xmmN + vectconst).
; Clobb: xmm0..xmm7
ADDALD MACRO vectconst
    FOR reg,<xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        addpd   reg,vectconst
    ENDM
ENDM
; MULALD vectconst - packed-double multiply of each of xmm0..xmm7 by
; vectconst (xmmN = xmmN * vectconst).
; Clobb: xmm0..xmm7
MULALD MACRO vectconst
    FOR reg,<xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        mulpd   reg,vectconst
    ENDM
ENDM
; SUBALD var - packed-double subtract of var from each of xmm0..xmm7
; (xmmN = xmmN - var).
; Clobb: xmm0..xmm7
SUBALD MACRO var
    FOR reg,<xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        subpd   reg,var
    ENDM
ENDM
; RCPALD var - write the packed-double reciprocal of var into each of
; xmm0..xmm7 (xmmN = 1.0 / var); previous register contents are overwritten.
;
; Fix: SSE2 has no RCPPD instruction (RCPPS/RCPSS exist only for single
; precision), so the original body could not assemble. The reciprocal is
; computed once with DIVPD and then copied to the other registers. Unlike
; RCPPS this is a full-precision divide, not an approximation.
;
; In:    var = xmm register or 16-byte-aligned memory operand
; Clobb: xmm0..xmm7
RCPALD MACRO var
        movapd  xmm0,var                ; take var first so it may be any of xmm0..xmm7
        pcmpeqw xmm1,xmm1               ; all bits set
        psllq   xmm1,54                 ; FFC0000000000000h in each qword
        psrlq   xmm1,2                  ; 3FF0000000000000h = 1.0 in each qword
        divpd   xmm1,xmm0               ; xmm1 = 1.0 / var
        movapd  xmm0,xmm1
    FOR reg,<xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        movapd  reg,xmm1
    ENDM
ENDM
; DIVALD var - packed-double divide of each of xmm0..xmm7 by var
; (xmmN = xmmN / var).
; Clobb: xmm0..xmm7
DIVALD MACRO var
    FOR reg,<xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        divpd   reg,var
    ENDM
ENDM
; SQRTALD var - write the packed-double square root of var into each of
; xmm0..xmm7 (xmmN = sqrtpd(var)); previous register contents are
; overwritten, not used as input.
; Clobb: xmm0..xmm7
SQRTALD MACRO var
    FOR reg,<xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        sqrtpd  reg,var
    ENDM
ENDM
; RSQRTALD var - write the packed-double reciprocal square root of var into
; each of xmm0..xmm7 (xmmN = 1.0 / sqrt(var)); previous register contents
; are overwritten.
;
; Fix: SSE2 has no RSQRTPD instruction (RSQRTPS/RSQRTSS exist only for
; single precision), so the original body could not assemble. The value is
; computed once with SQRTPD followed by DIVPD and then copied to the other
; registers. Unlike RSQRTPS this is full precision, not an approximation.
;
; In:    var = xmm register or 16-byte-aligned memory operand
; Clobb: xmm0..xmm7
RSQRTALD MACRO var
        sqrtpd  xmm0,var                ; read var first so it may be any of xmm0..xmm7
        pcmpeqw xmm1,xmm1               ; all bits set
        psllq   xmm1,54                 ; FFC0000000000000h in each qword
        psrlq   xmm1,2                  ; 3FF0000000000000h = 1.0 in each qword
        divpd   xmm1,xmm0               ; xmm1 = 1.0 / sqrt(var)
        movapd  xmm0,xmm1
    FOR reg,<xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        movapd  reg,xmm1
    ENDM
ENDM
; XXALD - square every packed-double lane of xmm0..xmm7 in place
; (xmmN = xmmN * xmmN).
; Clobb: xmm0..xmm7
XXALD MACRO
    FOR reg,<xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        mulpd   reg,reg
    ENDM
ENDM
And a simple PROC:
; HSLtoRGB - xmm0 = (HLUT[index] + grey * saturation) * light, packed single.
; In:    ebx = index into HLUT (the hue table); each entry is read as 16 bytes
;        grey, saturation, light = 16-byte operands defined outside this PROC
; Out:   xmm0 = result
; Clobb: ebx (doubled), xmm1 (grey * saturation), xmm2 (light), flags
PROC HSLtoRGB
        movaps  xmm1,grey
        movaps  xmm2,light
        add     ebx,ebx                 ; index*2, so ebx*8 below addresses index*16
        mulps   xmm1,saturation         ; xmm1 = grey * saturation
        movaps  xmm0,[HLUT+ebx*8]       ; hue table entry
        addps   xmm0,xmm1
        mulps   xmm0,xmm2
        ret
ENDP
Here is what I dug up.
The newest part is what I wrote about compares and MAX**, MIN**.
Thanks :t
I am working on several color conversion procs that work in a similar unrolled way to the *ALL macros, and I want to combine them with tile compares.
This also gives you the bonus of checking neighbouring pixels.
Thank you so much for your efforts in this regard daydreamer. Very much appreciated, I do love your
coding style, it reminds me of mine. :t Just without the bugs as AW will be first to tell you (never mind,
this is an ongoing joke between AW and myself, we point out each others flaws and get mortified etc.,
then help each other out - things get lost in translation and it all starts up again. Almost like marriage)
Quote from: Raistlin on March 18, 2019, 04:43:43 PM
Thank you so much for your efforts in this regard daydreamer. Very much appreciated, I do love your
coding style, it reminds me of mine. :t Just without the bugs as AW will be first to tell you (never mind,
this is an ongoing joke between AW and myself, we point out each others flaws and get mortified etc.,
then help each other out - things get lost in translation and it all starts up again. Almost like marriage)
thanks Raistlin
I thought the part of conditional SSE would be interesting
https://www.cs.uaf.edu/2009/fall/cs301/lecture/11_13_sse_intrinsics.html (https://www.cs.uaf.edu/2009/fall/cs301/lecture/11_13_sse_intrinsics.html)
There is no part of conditional SSE.
Quote from: daydreamer on July 12, 2019, 05:39:55 AM
sse_intrinsics
Intrinsics means "a C++ compiler thing". :biggrin:
experiment with byte shuffle colors
//input =red 0-255
/*
 * Byte-shuffle experiment: spread the low byte of `color` into a packed
 * colour value using PSHUFB (SSSE3, MSVC x86 inline assembly).
 *
 * The control word 0x0303 selects, for result bytes 0..3, source bytes
 * 3, 3, 0, 0. With `color` in 0-255 source byte 3 is zero, so the value
 * returned is color << 16.
 *
 * Other control words noted by the author:
 *   cyan 0x03, green 0x030003, blue 0x0303, white 0x03000000,
 *   magenta 0x03000300, yellow 0x03030000;
 *   black: set bit 7 in every control byte, which zeroes the bytes.
 */
int blue(int color) {
	unsigned int shuffleCtl = 0x0303;
	_asm {
		movd xmm0, color
		movd xmm1, shuffleCtl
		pshufb xmm0, xmm1
		movd color, xmm0
	}
	return color & 0x00ffffff; /* drop result byte 3 */
}
Inputs: brightness 0-255 (color) and a color selector 0-7 (color2).
/*
 * Byte-shuffle experiment: turn a brightness into one of eight colours
 * using PSHUFB (SSSE3, MSVC x86 inline assembly).
 *
 * color  - brightness, expected in 0-255 (only then is source byte 3 zero,
 *          which the control words rely on to produce zero channels)
 * color2 - colour selector 0-7:
 *          black, red, green, blue, yellow, magenta, cyan, white
 *
 * Returns the shuffled value with result byte 3 masked off.
 *
 * Fix: the original indexed the stack table with an unchecked color2, so a
 * negative or large selector read arbitrary stack memory. The selector is
 * now reduced to 0-7 with a mask, and the control word is picked in C and
 * handed to the asm block the same way blue() does, which also removes the
 * ebx address arithmetic. The table is unsigned so 0xffffffff fits its type.
 */
int colorI(int color, int color2) {
	/* black,red,green,blue,yellow,magenta,cyan,white */
	static const unsigned int SH1[8] = {
		0xffffffff, /* black: bit 7 set in every control byte zeroes it */
		0x030300,   /* red */
		0x030003,   /* green */
		0x0303,     /* blue */
		0x03030000, /* yellow */
		0x03000300, /* magenta */
		0x03,       /* cyan */
		0x03000000  /* white */
	};
	unsigned int shuffleCtl = SH1[color2 & 7]; /* keep the index inside the table */
	_asm {
		movd xmm1, shuffleCtl
		movd xmm0, color
		pshufb xmm0, xmm1
		movd color, xmm0
	}
	return (color & 0xffffff);
}