Print Page - My SSE macros+ tutorial

Title: My SSE macros+ tutorial
Post by: daydreamer on February 10, 2019, 08:32:07 AM

I'm working on some macros. It would be nice to get input on whether there is a faster/better way to load a tile from an image, better ways to compare, etc.
Suggestions for more macros are welcome. Names ending in ALL use the ***PS mnemonics; names ending in ALD use the SSE2 double-precision ***PD mnemonics.
also PROC's
"aim for the stars when it comes to colorization speed"
1:it would be nice to test against one MULPS
2: it would be nice to test between 2x and 8x unrolling on many CPUs, to understand how many unrolled SSE instructions can be chewed simultaneously on different CPUs

Code Select

; #########################################################################
    PUSHSTATE MACRO
    FXSAVE [ebx]
    add ebx,512
    ENDM
    POPSTATE MACRO
    sub ebx,512
    FXRSTOR [ebx]
    
    ENDM
    IF2D MACRO adr,greyvalue
    movsd xmm1,greyvalue
    subsd xmm1,adr
    movsd xmm0,adr+16 ;get grey +1 in table
    subsd xmm0,adr   ;-grey in table to easier compare if grey value is between two values
    COMISD xmm0,greyvalue ;if less than jmp 
    ENDM 
    
    
    MOVTILE MACRO adr,pitch ;fetch a tile from image to all XMM regs,pitch
    ;mov a rectangular tile so its easy to do many colorspace conversions while in XMM regs
    ;and later easier to check neighbouring pixels 
    lea ebx,adr
    mov eax,pitch
    movaps xmm0,[ebx]
    movaps xmm1,[ebx+eax]
    movaps xmm2,[ebx+eax*2]
    movaps xmm4,[ebx+eax*4]
    movaps xmm5,[ebx+eax*4+pitch] ;pitch*5
    lea eax,eax+eax*2
    movaps xmm3,[ebx+eax] ;pitch*3
    movaps xmm6,[ebx+eax*2] ;pitch*6
    movaps xmm7,[ebx+eax*2+pitch] ;pitch*7
    ENDM
    
    MOVALL MACRO var
    movaps xmm0,var
    movaps xmm1,var
    movaps xmm2,var
    movaps xmm3,var
    movaps xmm4,var
    movaps xmm5,var
    movaps xmm6,var
    movaps xmm7,var
    ENDM
    MOVUALL MACRO var
    movups xmm0,var
    movups xmm1,var
    movups xmm2,var
    movups xmm3,var
    movups xmm4,var
    movups xmm5,var
    movups xmm6,var
    movups xmm7,var
    ENDM
    ADDALL MACRO vectconst
    addps XMM0,vectconst
    addps XMM1,vectconst
    addps XMM2,vectconst
    addps XMM3,vectconst
    addps XMM4,vectconst
    addps XMM5,vectconst
    addps XMM6,vectconst
    addps XMM7,vectconst

    ENDM
    MULALL MACRO vectconst
    mulps XMM0,vectconst
    mulps XMM1,vectconst
    mulps XMM2,vectconst
    mulps XMM3,vectconst
    mulps XMM4,vectconst
    mulps XMM5,vectconst
    mulps XMM6,vectconst
    mulps XMM7,vectconst
    ENDM
    SUBALL MACRO var
    subps xmm0,var
    subps xmm1,var
    subps xmm2,var
    subps xmm3,var
    subps xmm4,var
    subps xmm5,var
    subps xmm6,var
    subps xmm7,var
    ENDM
    RCPALL MACRO var
    rcpps xmm0,var
    rcpps xmm1,var
    rcpps xmm2,var
    rcpps xmm3,var
    rcpps xmm4,var
    rcpps xmm5,var
    rcpps xmm6,var
    rcpps xmm7,var
    ENDM
    DIVALL MACRO var
    divps xmm0,var
    divps xmm1,var
    divps xmm2,var
    divps xmm3,var
    divps xmm4,var
    divps xmm5,var
    divps xmm6,var
    divps xmm7,var
    ENDM
    SQRTALL MACRO var
    sqrtps xmm0,var
    sqrtps xmm1,var
    sqrtps xmm2,var
    sqrtps xmm3,var
    sqrtps xmm4,var
    sqrtps xmm5,var
    sqrtps xmm6,var
    sqrtps xmm7,var
    ENDM
    RSQRTALL MACRO var
    rsqrtps xmm0,var
    rsqrtps xmm1,var
    rsqrtps xmm2,var
    rsqrtps xmm3,var
    rsqrtps xmm4,var
    rsqrtps xmm5,var
    rsqrtps xmm6,var
    rsqrtps xmm7,var
    ENDM
    XXALL MACRO ;x*x macro
    mulps xmm0,xmm0
    mulps xmm1,xmm1
    mulps xmm2,xmm2
    mulps xmm3,xmm3
    mulps xmm4,xmm4
    mulps xmm5,xmm5
    mulps xmm6,xmm6
    mulps xmm7,xmm7
    ENDM
;***PD macros
; ADDALD vectconst - packed-double add of vectconst to each of xmm0..xmm7,
; in register order.
ADDALD MACRO vectconst
    FOR xreg,<xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
        addpd   xreg,vectconst
    ENDM

    ENDM
    MULALD MACRO vectconst
    mulpd XMM0,vectconst
    mulpd XMM1,vectconst
    mulpd XMM2,vectconst
    mulpd XMM3,vectconst
    mulpd XMM4,vectconst
    mulpd XMM5,vectconst
    mulpd XMM6,vectconst
    mulpd XMM7,vectconst
    ENDM
    SUBALD MACRO var
    subpd xmm0,var
    subpd xmm1,var
    subpd xmm2,var
    subpd xmm3,var
    subpd xmm4,var
    subpd xmm5,var
    subpd xmm6,var
    subpd xmm7,var
    ENDM
    RCPALD MACRO var
    rcppd xmm0,var
    rcppd xmm1,var
    rcppd xmm2,var
    rcppd xmm3,var
    rcppd xmm4,var
    rcppd xmm5,var
    rcppd xmm6,var
    rcppd xmm7,var
    ENDM
    DIVALD MACRO var
    divpd xmm0,var
    divpd xmm1,var
    divpd xmm2,var
    divpd xmm3,var
    divpd xmm4,var
    divpd xmm5,var
    divpd xmm6,var
    divpd xmm7,var
    ENDM
    SQRTALD MACRO var
    sqrtpd xmm0,var
    sqrtpd xmm1,var
    sqrtpd xmm2,var
    sqrtpd xmm3,var
    sqrtpd xmm4,var
    sqrtpd xmm5,var
    sqrtpd xmm6,var
    sqrtpd xmm7,var
    ENDM
    RSQRTALD MACRO var
    rsqrtpd xmm0,var
    rsqrtpd xmm1,var
    rsqrtpd xmm2,var
    rsqrtpd xmm3,var
    rsqrtpd xmm4,var
    rsqrtpd xmm5,var
    rsqrtpd xmm6,var
    rsqrtpd xmm7,var
    ENDM
    XXALD MACRO ;x*x macro
    mulpd xmm0,xmm0
    mulpd xmm1,xmm1
    mulpd xmm2,xmm2
    mulpd xmm3,xmm3
    mulpd xmm4,xmm4
    mulpd xmm5,xmm5
    mulpd xmm6,xmm6
    mulpd xmm7,xmm7
    ENDM

and some simple PROC

Code Select


;-----------------------------------------------------------------------
; HSLtoRGB
; Computes xmm0 = (HLUT[entry] + grey * saturation) * light, packed single.
; In:    ebx = hue table index; the entry is read from HLUT + ebx*16
;        grey, saturation, light = 16-byte operands defined elsewhere
;        (memory operands of movaps/mulps must be 16-byte aligned)
; Out:   xmm0 = result
; Clobb: ebx (doubled on return), xmm1, xmm2, flags
; MASM syntax puts the procedure name before PROC and before ENDP.
;-----------------------------------------------------------------------
HSLtoRGB PROC
    add ebx,ebx                 ; index*2, so that *8 below gives 16-byte entries
    movaps xmm0,[HLUT+ebx*8]    ;hlut=huetable
    movaps xmm1,grey
    mulps xmm1,saturation       ; xmm1 = grey * saturation
    movaps xmm2,light
    addps xmm0,xmm1             ; xmm0 = hue entry + grey * saturation
    mulps xmm0,xmm2             ; scale by light
    ret
HSLtoRGB ENDP

Title: Re: My SSE macros
Post by: daydreamer on February 16, 2019, 08:01:16 PM

here is what I dug up
The newest is what I wrote about compares and MAX**, MIN**

Title: Re: My SSE macros
Post by: Siekmanski on February 16, 2019, 08:25:16 PM

Thanks :t

Title: Re: My SSE macros+ tutorial
Post by: daydreamer on February 22, 2019, 05:23:38 AM

I am working on several color conversion procs that work unrolled, similar to the *ALL macros, and that I want to combine with tile compares
This also give you bonus of checking neighbouring pixels

Title: Re: My SSE macros+ tutorial
Post by: Raistlin on March 18, 2019, 04:43:43 PM

Thank you so much for your efforts in this regard daydreamer. Very much appreciated, I do love your
coding style, it reminds me of mine. :t Just without the bugs as AW will be first to tell you (never mind,
this is an ongoing joke between AW and myself, we point out each others flaws and get mortified etc.,
then help each other out - things get lost in translation and it all starts up again. Almost like marriage)

Title: Re: My SSE macros+ tutorial
Post by: daydreamer on March 19, 2019, 07:05:09 AM

Quote from: Raistlin on March 18, 2019, 04:43:43 PM
Thank you so much for your efforts in this regard daydreamer. Very much appreciated, I do love your
coding style, it reminds me of mine. :t Just without the bugs as AW will be first to tell you (never mind,
this is an ongoing joke between AW and myself, we point out each others flaws and get mortified etc.,
then help each other out - things get lost in translation and it all starts up again. Almost like marriage)

thanks Raistlin

Title: Re: My SSE macros+ tutorial
Post by: daydreamer on July 12, 2019, 05:39:55 AM

I thought the part of conditional SSE would be interesting
https://www.cs.uaf.edu/2009/fall/cs301/lecture/11_13_sse_intrinsics.html (https://www.cs.uaf.edu/2009/fall/cs301/lecture/11_13_sse_intrinsics.html)

Title: Re: My SSE macros+ tutorial
Post by: jj2007 on July 12, 2019, 07:21:34 AM

There is no part of conditional SSE.

Title: Re: My SSE macros+ tutorial
Post by: HSE on July 12, 2019, 08:42:53 AM

Quote from: daydreamer on July 12, 2019, 05:39:55 AM
sse_intrinsics

Intrinsics means "a C⁺⁺ compiler thing". :biggrin:

Title: Re: My SSE macros+ tutorial
Post by: daydreamer on August 09, 2019, 10:41:37 PM

experiment with byte shuffle colors

Code Select


//input = brightness 0-255 in the low byte of color (the upper bytes must be zero)
//returns the brightness moved to bits 16-23, with bits 0-15 cleared
int blue(int color) {
	// pshufb control, one selector byte per output byte, lowest byte first:
	//   0x00 copies source byte 0 (the brightness),
	//   0x03 copies source byte 3 (zero for a 0-255 input).
	// 0x0303 therefore clears output bytes 0 and 1 and puts the brightness in byte 2.
	unsigned int SH = 0x0303; //cyan 0x03;
	//green 0x030003;//blue 0x0303;//white 0x03000000; //magenta 0x03000300;//yellow 0x03030000;//black=just set all bit 7 in bytes and it zeros bytes
	_asm {
		movd xmm1, SH
		movd xmm0, color
		pshufb xmm0, xmm1
		movd color, xmm0
	}
	// output byte 3 also receives the brightness; mask it off
	return (color & 0xffffff);
	
}

input
0-255 brightness (color) and 0-7 choose color

Code Select

// color  = brightness 0-255 in the low byte (the upper bytes must be zero)
// color2 = colour selector 0-7:
//          black,red,green,blue,yellow,magenta,cyan,white
// returns the brightness replicated into the byte positions chosen by the
// selected pshufb mask, limited to the low 24 bits
int colorI(int color, int color2) {
	// One pshufb control per colour; selector byte 0x00 copies the brightness,
	// 0x03 copies source byte 3 (zero for a 0-255 input), and a byte with
	// bit 7 set (0xff) zeroes the output byte.
	//black,red,green,blue,yellow,magenta,cyan,white
	static const unsigned int SH1[8] = { 0xffffffff,0x030300,0x030003,0x0303,0x03030000,0x03000300,0x03,0x03000000 };
	//cyan 0x03;//red0x030300;
	//green 0x030003;//blue 0x0303;//white 0x03000000; //magenta 0x03000300;//yellow 0x03030000;

	// keep the selector inside the 8-entry table instead of reading past it
	unsigned int mask = SH1[color2 & 7];

		_asm {
			movd xmm1, mask
			movd xmm0, color
			pshufb xmm0, xmm1
			movd color, xmm0
		}
	return (color & 0xffffff);
}

The MASM Forum

General => The Laboratory => Topic started by: daydreamer on February 10, 2019, 08:32:07 AM