News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests

Main Menu

How to pass an integer input parameter to shufps

Started by KradMoonRa, March 30, 2018, 04:03:07 AM

Previous topic - Next topic

Siekmanski

You can get rid of all the compare instructions by using a 256 offsets entry table and jump directly to the code.

.data

Imm8Jump    dd offset Imm8_0,offset Imm8_1,offset Imm8_2,........... Imm8_253,offset Imm8_254,offset Imm8_255

.code

    movzx eax,reg2
    jmp         [Imm8Jump+eax*4]


Imm8_0:
    shufps reg0,reg1,0
    ret
Imm8_1:
    shufps reg0,reg1,1
    ret
Imm8_2:
    shufps reg0,reg1,2
    ret

-------
-------
-------

Imm8_253:
    shufps reg0,reg1,253
    ret
Imm8_254:
    shufps reg0,reg1,254
    ret
Imm8_255:
    shufps reg0,reg1,255
    ret

Creative coders use backward thinking techniques as a strategy.

KradMoonRa

Hi @Siekmanski  :biggrin:

Thank you — after some good hours I managed to get it compiled.
I tried to put the label data outside of the code, but uasm complains about unknown label and offset symbols.
I tried to replace the ret with a jump to an end label, but then the object header was full of jmp errors.
Some errors about decisions in the .text code still persist in the object header, but it compiles and I think it's not a big deal.
Some work remains to port the relocatable addresses to 64 bits; working on it.

And finally:  :greenclp:

_TEXT segment
align 16
uXm_xmm_shuffle_ps proto UX_VECCALL (xmmword) ;InXmm_A:xmmword, InXmm_B:xmmword, _Imm8:dword

align 16
;-----------------------------------------------------------------------
; uXm_xmm_shuffle_ps - runtime-variable SHUFPS via a 256-entry jump table.
; In:  xmm0 = InXmm_A, xmm1 = InXmm_B, third argument = _Imm8 (low byte used).
; Out: xmm0 = shufps(InXmm_A, InXmm_B, _Imm8).
; The SHUFPS selector is an immediate baked into the instruction encoding,
; so one shufps per possible value (shps_0 .. shps_255) is emitted and the
; dispatcher jumps to the matching copy; each stub rets straight to caller.
; NOTE(review): UX_VECCALL, dparam3 and rparam3 are project macros defined
; elsewhere - the exact calling convention is not visible here.
;-----------------------------------------------------------------------
uXm_xmm_shuffle_ps proc UX_VECCALL (xmmword) ;InXmm_A:xmmword, InXmm_B:xmmword, _Imm8:dword

ifndef __X64__
xor ecx, ecx                 ; NOTE(review): redundant - movzx below fully writes ecx
xor eax, eax                 ; NOTE(review): redundant - lea below fully writes eax
movzx ecx, byte ptr [dparam3]   ; ecx = imm8 selector (0..255)
lea eax, [shpsjmptable]         ; eax = jump-table base
mov ecx, [eax+ecx*4]            ; fetch 32-bit target address (dd entries)
mov eax, ecx                    ; NOTE(review): could "jmp [eax+ecx*4]" directly
jmp eax                         ; dispatch to shps_<imm8>
else
xor rcx, rcx                 ; NOTE(review): redundant zeroing (see 32-bit path)
xor rax, rax
movzx rcx, byte ptr [rparam3]   ; rcx = imm8 selector (0..255)
lea rax, [shpsjmptable]         ; rax = jump-table base (RIP-relative in x64)
mov ecx, [rax+rcx*4]            ; BUG(review): x64 table entries are dq (8 bytes);
                                ; this should be "mov rcx, [rax+rcx*8]" - as written
                                ; it uses a *4 scale and truncates the pointer to 32 bits
mov rax, rcx
jmp rax                         ; dispatch to shps_<imm8>
endif

; pointer width / data directive differ between the 32- and 64-bit builds
ifndef __X64__
shpsword textequ <dword>
shpsiword textequ <dd>
else
shpsword textequ <qword>
shpsiword textequ <dq>
endif

; 256-entry table of stub addresses, shps_0 .. shps_255 (entries elided).
; uasm complains when the source line is too long, and the table only
; assembles when declared here inside the proc.
shpsjmptable label shpsword
shpsiword offset shps_0, .....
shpsiword offset shps_51, .....
shpsiword offset shps_101, .....
shpsiword offset shps_151, .....
shpsiword offset shps_201, .....
shpsiword offset shps_251, ....

; one shufps per imm8 value; each stub returns directly to the caller
shps_0 label shpsword
shufps xmm0, xmm1, 0
ret
shps_1 label shpsword
shufps xmm0, xmm1, 1
ret
.................
.................
.................
.................
.................
.................
shps_254 label shpsword
shufps xmm0, xmm1, 254
ret
shps_255 label shpsword
shufps xmm0, xmm1, 255
ret

uXm_xmm_shuffle_ps endp
_TEXT ends

The uasmlib

Siekmanski

 :t
Cool that it works but, do you really need 7 instructions for the jump table execution?
In Masm it works with only 2 instructions.
The both xor instructions are useless.
Creative coders use backward thinking techniques as a strategy.

jj2007

Quote from: Siekmanski on April 02, 2018, 03:56:07 AM
In Masm it works with only 2 instructions.

One should be sufficient: In 64-bit land, the first 4 args are passed in registers 8)

KradMoonRa

#19
Hi,

Reading some spec from intel manuals, interesting some opcodes can be pushed and used.

If someone knows, I'm near something, can be done?

header file:

extern "CC" {
extern __uXm128 uXm_mm_shuffle_ps(__uXm128 InXmm_A, __uXm128 InXmm_B, unsigned int _Imm8);
}


asm file:

ifndef __X64__
   dparam3 textequ <esp+16*2+4>
ifdef WINDOWS
   dparam3 textequ <r8d>
else
   dparam3 textequ <ecx>
endif

;-----------------------------------------------------------------------
; uXm_mm_shuffle_ps - attempt to run SHUFPS with a runtime imm8 by
; spilling the three parameters to the stack and hand-emitting opcode bytes.
; BUG(review): "db 0fh, 0c6h, 3h" is an incomplete/wrong encoding.
; SHUFPS is 0F C6 /r ib: ModRM 03h selects the MEMORY operand [ebx]/[rbx]
; (reg-reg xmm0,xmm1 would be ModRM 0C1h), and the mandatory imm8 byte is
; missing, so the first byte of the NEXT instruction is consumed as the
; imm8. The imm8 is part of the instruction encoding and cannot be patched
; in from a register this way - hence the "unexpected results" noted below.
; NOTE(review): 16*2+4 = 36 bytes leaves the stack unaligned for aligned
; SSE accesses, and "add esp/rsp" right before "mov esp,ebp / mov rsp,rbp"
; is redundant - the mov already restores the stack pointer.
;-----------------------------------------------------------------------
align 16
uXm_mm_shuffle_ps proc UX_VECCALL (xmmword) ;InXmm_A:xmmword, InXmm_B:xmmword, _Imm8:dword

ifndef __X64__
push ebp
mov ebp, esp
sub esp, 16*2+4 ; allocate space on stack
movups [ebp-16], xmm0 ; push xmm param 1
movups [ebp-16*2], xmm1 ; push xmm param 2
mov [ebp-16*2+4], dparam3 ; push param 3
db 0fh, 0c6h, 3h ; shufps imm encoding  <- BUG: see header comment (no imm8, wrong ModRM)
mov dparam3, [ebp-16*2+4] ; pop param 3
movups xmm1, [ebp-16*2] ; pop xmm param 2
movups xmm0, [ebp-16] ; pop xmm param 1
add esp, 16*2+4 ; deallocate space on stack (redundant before mov esp,ebp)
mov esp, ebp
pop ebp
else
push rbp
mov rbp, rsp
sub rsp, 16*2+4 ; allocate space on stack
movups [rbp-16], xmm0 ; push xmm param 1
movups [rbp-16*2], xmm1 ; push xmm param 2
mov [rbp-16*2+4], dparam3 ; push param 3 (NOTE(review): dparam3 in the x64 path - rparam3 intended?)
db 0fh, 0c6h, 3h ; shufps imm encoding  <- BUG: see header comment (no imm8, wrong ModRM)
mov dparam3, [rbp-16*2+4] ; pop param 3
movups xmm1, [rbp-16*2] ; pop xmm param 2
movups xmm0, [rbp-16] ; pop xmm param 1
add rsp, 16*2+4 ; deallocate space on stack (redundant before mov rsp,rbp)
mov rsp, rbp
pop rbp
endif

ret
uXm_mm_shuffle_ps endp


EDIT: this produces unexpected results — it can't be done like this. The byte sequence always assembles to the same opcode, so a fixed imm is required.
The uasmlib

Siekmanski

I'm not exactly sure what your goal is?
Maybe I'm missing something?
Using so many instructions to execute 1 shufps instruction which really slows down the code execution a lot....
Why not set the imm8 with a macro construction?

; uXm_xmm_shufps - compile-time wrapper: expands to a single SHUFPS with
; the imm8 baked into the instruction, avoiding any runtime dispatch.
;   reg0  - destination xmm register (receives the shuffle result)
;   reg1  - source xmm register
;   _imm8 - constant selector, encoded into the instruction as the imm8
uXm_xmm_shufps macro reg0, reg1, _imm8

    shufps  reg0,reg1,_imm8

endm
Creative coders use backward thinking techniques as a strategy.

KradMoonRa

Hi Siekmanski,

actually I have done it

cc header file:

/*******************************************************/
/* MACRO for use uXm_mm_shuffle_****_ps(). */
/* Argument fp3 is a digit[0123] that represents the fp*/
/* from argument "b" of uXm_mm_shuffle_****_ps that will be     */
/* placed in fp3 of result. fp2 is the same for fp2 in */
/* result. fp1 is a digit[0123] that represents the fp */
/* from argument "a" of uXm_mm_shuffle_****_ps that will be     */
/* places in fp1 of result. fp0 is the same for fp0 of */
/* result                                              */
/* const __uXm128 temp = uXm_MM_SHUFFLE_IMR_PS(InXmm_A, InXmm_B, 0, 1, 2, 3); */
/*******************************************************/
#define uXm_MM_SHUFFLE_IM_PS(VA,VB,fp3,fp2,fp1,fp0) uXm_mm_shuffle_##fp3##fp2##fp1##fp0##_ps(VA,VB)
#define uXm_MM_SHUFFLE_IMR_PS(VA,VB,fp0,fp1,fp2,fp3) uXm_mm_shuffle_##fp3##fp2##fp1##fp0##_ps(VA,VB)


asm file:

; One proc per imm8 value: the SHUFPS selector is part of the instruction
; encoding, so each fixed shuffle gets its own tiny function. The C header
; macro uXm_MM_SHUFFLE_IM_PS pastes the fp3..fp0 digits into the proc name;
; the suffix digits appear to map to imm8 = fp3*64 + fp2*16 + fp1*4 + fp0
; (0010 -> 4 below) - TODO confirm against the full generated set.
; In: xmm0 = InXmm_A, xmm1 = InXmm_B. Out: xmm0 = shuffled result.
align 16
uXm_mm_shuffle_0000_ps proc UX_VECCALL (xmmword) ;InXmm_A:xmmword, InXmm_B:xmmword
shufps xmm0, xmm1, 0 ; imm8 = 0 -> selectors (0,0,0,0)
ret
uXm_mm_shuffle_0000_ps endp

align 16
uXm_mm_shuffle_0001_ps proc UX_VECCALL (xmmword) ;InXmm_A:xmmword, InXmm_B:xmmword
shufps xmm0, xmm1, 1 ; imm8 = 1 -> selectors (0,0,0,1)
ret
uXm_mm_shuffle_0001_ps endp

align 16
uXm_mm_shuffle_0002_ps proc UX_VECCALL (xmmword) ;InXmm_A:xmmword, InXmm_B:xmmword
shufps xmm0, xmm1, 2 ; imm8 = 2 -> selectors (0,0,0,2)
ret
uXm_mm_shuffle_0002_ps endp

align 16
uXm_mm_shuffle_0003_ps proc UX_VECCALL (xmmword) ;InXmm_A:xmmword, InXmm_B:xmmword
shufps xmm0, xmm1, 3 ; imm8 = 3 -> selectors (0,0,0,3)
ret
uXm_mm_shuffle_0003_ps endp

align 16
uXm_mm_shuffle_0010_ps proc UX_VECCALL (xmmword) ;InXmm_A:xmmword, InXmm_B:xmmword
shufps xmm0, xmm1, 4 ; imm8 = 4 -> selectors (0,0,1,0)
ret
uXm_mm_shuffle_0010_ps endp
.............
.............


But I'm really searching for how to take the imm from the C function and use it with the asm imm; I now know I can't mix the two. The imm is part of the instruction opcode, but how can I select it from the C function's imm argument in a simple, clear manner?

I have to do the same for the m128 bit shifting too, since the shift count is also an imm (part of the opcode).


/*******************************************************/
/* MACRO for use uXm_mm_slli_si128_*(). */
/* result                                              */
/* const __uXm128i temp = uXm_MM_SLLI_SI128_IM(InXmm_A, 3); */
/*******************************************************/
#define uXm_MM_SLLI_SI128_IM(VA,IMM) uXm_mm_slli_si128_##IMM##(VA)



; uXm_mm_slli_si128_0 - byte-shift-left by 0 (identity): PSLLDQ's shift
; count is an imm8 in the instruction encoding, so each count gets its own
; proc, selected from C by the uXm_MM_SLLI_SI128_IM token-pasting macro.
; In: xmm0 = InXmm_A. Out: xmm0 unchanged (shift by 0 bytes).
align 16
uXm_mm_slli_si128_0 proc UX_VECCALL (xmmword) ;Inxmm_A:xmmword
pslldq xmm0, 0 ; shift left by 0 bytes - returns xmm0 unchanged
ret
uXm_mm_slli_si128_0 endp


The macros expand to the named function declared in the header as extern.

I'm excited to new and different programming approaches.

Love to know how to pass the value from the c function as an opcode to the shufps/pslldq/psllq... etc.

The uasmlib

Siekmanski

Sadly, I don't speak CC language.
Does it have inline ASM support inside a MACRO construction?
Because IMHO it is a waste to use a function for 1 instruction.
Creative coders use backward thinking techniques as a strategy.

KradMoonRa

#23
Unfortunately VC compiler for 64 bits, it can't compile inline assembly.
Yes, one instruction is really compact and fast code. When called from CC, the jmp to the code really hurts the speed. But — Murphy's law permitting — it can still be faster than the CC function counterpart, or not, depending on how the function is used.
I believe the jmp is the best option for now, until I learn how to build a runtime-callable machine-code sequence with the imm converted to a byte, something like:


;0Fh, 0C6h:    shufleps
;0C1h:         xmm0 to xmm1
;3h:           r/r and not r/m
;0FFh:         255 "shuffle4(3,3,3,3)"
;0C3h:         retn

;-----------------------------------------------------------------------
; __shuffleps - hand-encoded "shufps xmm0, xmm1, 0FFh" followed by "retn".
; Encoding (Intel SDM, SHUFPS = 0F C6 /r ib):
;   0Fh 0C6h   opcode
;   0C1h       ModRM: mod=11 (reg/reg), reg=000 (xmm0), rm=001 (xmm1)
;   0FFh       imm8 = 255 -> "shuffle4(3,3,3,3)"
;   0C3h       retn
; BUG FIX: the original emitted a stray 03h between the ModRM and the
; imm8; the CPU then took 03h as the imm8 and decoded the trailing
; 0FFh,0C3h as "inc ebx" instead of "retn", so execution fell off the
; end of the routine - the "unexpected results" reported.
; To patch the shuffle selector at runtime, overwrite the 0FFh byte
; (byte index 3 of this sequence) with the desired imm8 before calling.
;-----------------------------------------------------------------------
__shuffleps proc
byte 0Fh, 0C6h, 0C1h, 0FFh, 0C3h
__shuffleps endp



The uasmlib

habran

Hi KradMoonRa :biggrin:
The best way to learn assembly programming is to write functions in C and then look through the disassembly and try to shorten and/or speed it up.
Such altered disassembly can be used to create functions in assembly.
You can play with optimisation settings and see what difference the C compiler produces.
You can play with intrinsics as well.

Cod-Father

KradMoonRa

Hi habran,

Yes, I have been disassembling the functions from C and CC — it's really fun to follow along and try to reproduce them in asm.

But I'm finding that some of the undercover secrets of asm and machine code are the way to do it the right way.
The uasmlib

KradMoonRa

#26
Hi,  :biggrin:

Finally I got the jmp address in 64bits working as the 32bits version, still 3 instructions to overcome 64bits linker /LARGEADDRESSAWARE.


;-----------------------------------------------------------------------
; _uXm_m128_cvtelts_f32 - extract float element InInt_BSel (0..3) of
; xmm0 into lane 0 and return it as a real4 in xmm0.
; Dispatches through a 4-entry jump table to a fixed-shuffle stub, since
; the SHUFPS selector must be an immediate in the encoding.
; NOTE(review): the range check is commented out, so InInt_BSel > 3 reads
; past the 4-entry table and jumps to a garbage address.
; NOTE(review): the x64 path clobbers rbx, which is callee-saved in both
; the Microsoft x64 and SysV ABIs - it should be saved/restored, or a
; volatile register (e.g. rax) used instead.
; NOTE(review): rparam2, _uXm_mm_shuffler4 and the jump-table entries for
; m128cvteltsf32jmptable are defined elsewhere in the project.
;-----------------------------------------------------------------------
_uXm_m128_cvtelts_f32 proc UX_VECCALL (real4) ;InXmm_A:xmmword, InInt_BSel:dword

;.if(rparam2 > 3)
; ret
;.else

ifndef __X64__
movzx eax, byte ptr [rparam2] ; eax = element selector (low byte only)
;mov rbx, dword ptr [rbx+rparam2*4]
jmp dword ptr [m128cvteltsf32jmptable+eax*4] ; direct indexed jump (dd table)
else
;movzx rax, byte ptr [rparam2]
lea rbx, qword ptr [m128cvteltsf32jmptable] ; RIP-relative base - works under /LARGEADDRESSAWARE
mov rbx, qword ptr [rbx+rparam2*8] ; load 64-bit stub address (dq table)
jmp rbx
endif

; pointer width / data directive differ between the 32- and 64-bit builds
ifndef __X64__
m128cvteltsf32word textequ <dword>
m128cvteltsf32iword textequ <dd>
else
m128cvteltsf32word textequ <qword>
m128cvteltsf32iword textequ <dq>
endif

; dispatch targets: broadcast element N to all lanes, then movss keeps lane 0
m128cvteltsf32_0 label m128cvteltsf32word
movss xmm0, xmm0 ; element 0 already in lane 0 (reg-reg movss to itself: effectively a no-op)
ret
m128cvteltsf32_1 label m128cvteltsf32word
shufps xmm0, xmm0, _uXm_mm_shuffler4(1,1,1,1) ; replicate element 1 into every lane
movss xmm0, xmm0
ret
m128cvteltsf32_2 label m128cvteltsf32word
shufps xmm0, xmm0, _uXm_mm_shuffler4(2,2,2,2) ; replicate element 2 into every lane
movss xmm0, xmm0
ret
m128cvteltsf32_3 label m128cvteltsf32word
shufps xmm0, xmm0, _uXm_mm_shuffler4(3,3,3,3) ; replicate element 3 into every lane
movss xmm0, xmm0
ret
;.endif

_uXm_m128_cvtelts_f32 endp


All the code attached.
The uasmlib