What is the best way to test whether YMM0 equals YMMWORD PTR [reg]? (Same question for testing whether YMM0 equals 0.)
The general idea is:
I have a vector that contains n elements; each element is 32 bytes long (1 record = 32 bytes = one YMMWORD).
I want to filter this vector: whenever two items are the same, both of them should be deleted.
Example:
; Example layout: a 32-byte header (four qwords, the first one holding the
; element count) followed by n uninitialized 32-byte records.
n=2000000
.DATA
Vec_Info DQ n,0,0,0 ; header label spelled the way the filter code references it
Vec YMMWORD n DUP(?)
Vec={5,50,5,6,2,2,8,17,33,5,6,10,30,2,1,2,7,13,9,2}
must be:
Vec={50,8,17,33,5,10,30,1,7,13,9,2}
I am looking for the fastest way, because a vector may contain two million items.
VPCMPEQQ? (https://en.wikipedia.org/wiki/AVX-512#Legacy_instructions_upgraded_with_EVEX_encoded_versions)
My first try :biggrin: is below. It is slow :sad:, so I need better performance (or another approach).
; ---------------------------------------------------------------------------
; First attempt: in-place removal of equal pairs from a vector of 32-byte
; records.
;   * The qword at [rsi] (rsi taken from Vec_Info) is used as the element count
;     and is decreased by two for every pair that is removed.
;   * rdx walks the "first compared" element, rcx walks the elements after it.
;   * rdi tracks the last element still in use; when a pair is found, the freed
;     slots are refilled from the tail, so the original element order is not
;     preserved.
;   * aindex / bindex are memory variables declared outside this fragment.
; NOTE(review): in MASM-style syntax `mov reg, Vec_Info` loads the qword stored
; at Vec_Info rather than its address (the later filter01 uses `lea`) - confirm
; how Vec_Info is declared for this fragment.
; ---------------------------------------------------------------------------
RecordLength=32
mov rsi,Vec_Info
mov rax,qword ptr [rsi] ; rax = element count
mov aindex,rax ; aindex = number of elements (starts as the full count)
cmp rax,1
jg @F001 ; continue only when there is more than one element (signed compare)
ret ; zero or one element: nothing to filter
@F001:
mov rax,qword ptr [rsi]
mov rbx,RecordLength
mul rbx ; rdx:rax = count * RecordLength (rdx is overwritten here and reloaded below)
mov rdi,rax
add rdi,Vec_Info ; rdi = Vec_Info + count*RecordLength, used below as the last element
mov rdx,Vec_Info ; rdx = base; advanced to the first compared element at Filter__01_Start
Filter__01_Start:
add rdx, RecordLength ; step to the next first compared element
Filter__01_Start_Without_Sh: ; re-entry point that keeps rdx (slot was refilled from the tail)
vmovups ymm0, YMMWORD PTR [rdx] ; ymm0 = first compared element
;--------------------------------------------
mov rcx,aindex
dec rcx
mov bindex,rcx ; bindex = aindex - 1 = number of elements still to compare against
mov rcx,rdx ; rcx = cursor for the second compared element
Filter__02_Start:
add rcx, RecordLength ; next candidate
vmovupd ymm1, ymm0 ; work on a copy so ymm0 keeps the first compared element
vpxor ymm1, YMMWORD PTR [rcx] ; XOR with the candidate: an all-zero result means equal
vptest ymm1,ymm1 ; ZF = 1 only when ymm1 is all zero
jnz Filter__02_NoEqu ; not equal -> try the next candidate
; ---- equal pair found -------------------------------------------------------
sub qword ptr [rsi] ,2 ; the stored element count drops by two
.if aindex==2 ; the pair was the only two elements left
jmp Filter__01_ExitDo ; nothing to move, leave the loop
.endif
.if aindex==3 ; three elements left: exactly one survives
.if rcx==rdi ; the match is the last element, so the survivor is the one before it
sub rdi,RecordLength
.endif ; otherwise the survivor is the last element
vmovups ymm1,YMMWORD PTR [rdi]
vmovups YMMWORD PTR [rdx],ymm1 ; move the survivor into the first compared slot
jmp Filter__01_ExitDo ; and leave the loop
.endif
.if rcx==rdi ; the match is the last element
sub rdi,RecordLength ; refill the first compared slot from the element before the last
vmovups ymm1,YMMWORD PTR [rdi]
sub rdi,RecordLength ; tail shrinks by two in total
vmovups YMMWORD PTR [rdx],ymm1
sub aIndex,2
jmp Filter__01_Start_Without_Sh ; re-test the refilled slot
.endif
.if bindex==2 ; the match is the element just before the last
vmovups ymm1,YMMWORD PTR [rdi] ; refill the first compared slot from the last element
sub rdi,RecordLength *2 ; tail shrinks by two
vmovups YMMWORD PTR [rdx],ymm1
sub aIndex,2
jmp Filter__01_Start_Without_Sh ; re-test the refilled slot
.endif ; general case: both freed slots are refilled from the tail
vmovups ymm1,YMMWORD PTR [rdi] ; last element -> first compared slot
sub rdi,RecordLength
vmovups YMMWORD PTR [rdx],ymm1
vmovups ymm1,YMMWORD PTR [rdi] ; element before it -> second compared slot
sub rdi,RecordLength
vmovups YMMWORD PTR [rcx],ymm1
sub aIndex,2
sub bIndex,2 ; bindex is recomputed from aindex right after the jump below
jmp Filter__01_Start_Without_Sh ; re-test the refilled slot
Filter__02_NoEqu:
dec bIndex
jnz Filter__02_Start ; more candidates left for this first compared element
;--------------------------------------------
dec aIndex ; no match for this element: keep it and move on
cmp aIndex,1
jg Filter__01_Start ; stop when at most one element remains
Filter__01_ExitDo:
for test
; Test fixture: a 32-byte header (four qwords, count first) followed by
; twenty records.
n = 20
        .DATA
Vec_Info        DQ      n, 0, 0, 0
Vec             YMMWORD 5, 50, 5, 6, 2, 2, 8, 17, 33, 5
                YMMWORD 6, 10, 30, 2, 1, 2, 7, 13, 9, 2
This version is much faster than the first one; it filters by copying the records that are kept.
;-----------------------------------------------------------------------
; filter01 - remove equal pairs of 32-byte records, compacting in place.
;
; Data:  [Vec_Info]            qword record count (read on entry, rewritten)
;        [Vec_Info + RLength]  first record; records are RLength bytes apart.
;        RLength is defined outside this procedure; the compare below covers
;        four qwords, i.e. it assumes 32-byte records.
;
; For every record, the records after it are scanned for an exact 32-byte
; match. On the first match the later record is overwritten with zeros and
; the current record is dropped. Records without a match are copied to the
; next output slot and counted. An all-zero record is treated as "already
; deleted" and is skipped, so genuine all-zero records are never kept.
;
; Out:   [Vec_Info] = number of kept records, stored contiguously from
;        Vec_Info + RLength. When the count on entry is <= 1 (signed) nothing
;        is modified.
; Clobb: rax, rbx, rcx, rsi, rdi, r8, r12, r13, r14, r15, ymm1, ymm7, flags.
;        NOTE(review): several of these are callee-saved in the Win64 / SysV
;        ABIs and are not preserved here - confirm what callers expect.
;-----------------------------------------------------------------------
filter01 proc
        lea     rsi, Vec_Info
        mov     rax, qword ptr [rsi]            ; rax = record count
        cmp     rax, 1
        jg      @F                              ; at least two records are needed to form a pair
        ret
@@:
        vpxor   xmm7, xmm7, xmm7                ; ymm7 = 0 (VEX-encoded write clears the upper half)
        mov     r12, rax                        ; r12 = records left to visit, current one included
        mov     rbx, rsi                        ; rbx = read cursor, advanced before each use
        lea     rdi, Vec_Info                   ; rdi = write cursor, advanced before each store
        mov     rcx, rdi                        ; rcx -> count field
        mov     qword ptr [rcx], 0              ; the count is rebuilt while copying

filter01_Start:
        add     rbx, RLength                    ; current record
        cmp     qword ptr [rbx], 0              ; all four qwords zero => deleted marker
        jne     filter01_Loop02
        cmp     qword ptr [rbx+8], 0
        jne     filter01_Loop02
        cmp     qword ptr [rbx+16], 0
        jne     filter01_Loop02
        cmp     qword ptr [rbx+24], 0
        jne     filter01_Loop02
        jmp     filter01_Next                   ; marker: skip without copying

filter01_Loop02:
        mov     r8, qword ptr [rbx]             ; cache the first qword for the quick reject
        mov     r15, rbx                        ; r15 = candidate cursor
        mov     r14, r12
        dec     r14                             ; r14 = records after the current one
        jz      filter01_Copy                   ; last record: nothing to compare with, keep it

filter01_Start02:
        add     r15, RLength                    ; next candidate (always inside the vector)
        cmp     r8, qword ptr [r15]
        jne     filter01_Next02
        mov     r13, qword ptr [rbx+8]          ; 2nd qword
        cmp     r13, qword ptr [r15+8]
        jne     filter01_Next02
        mov     r13, qword ptr [rbx+16]         ; 3rd qword
        cmp     r13, qword ptr [r15+16]
        jne     filter01_Next02
        mov     r13, qword ptr [rbx+24]         ; 4th qword
        cmp     r13, qword ptr [r15+24]
        jne     filter01_Next02
        ; equal pair: mark the later record as deleted and drop the current one
        vmovupd ymmword ptr [r15], ymm7
        jmp     filter01_Next

filter01_Next02:
        dec     r14
        jnz     filter01_Start02

filter01_Copy:
        ; no partner found: keep the current record
        add     rdi, RLength
        inc     qword ptr [rcx]                 ; same width as the qword count field
        vmovupd ymm1, ymmword ptr [rbx]
        vmovupd ymmword ptr [rdi], ymm1

filter01_Next:
        dec     r12
        jnz     filter01_Start
filter01_End:
        ret
filter01 endp
Hello sir mabdelouahab;
This is a simple attempt for Linux x86-64, using gtk3 or gtk2 (glib).
Hello mineiro, thanks, that is a good approach.
The attached file is a comparison of all the previous methods for filtering a table of 128 elements. It can be executed on Windows and Linux:
Windows : buildWin.bat {micro second}
mabdelouahabFilter01 nbELEMENT= { 78} time ns= { 63}
mabdelouahabFilter02 nbELEMENT= { 78} time ns= { 144}
mineiro_ymmx_cmp_02 nbELEMENT= {128} time ns= { 98}
linux : build.sh {nano second}
mabdelouahabFilter01 nbELEMENT= { 78} time ns= { 8631}
mabdelouahabFilter02 nbELEMENT= { 78} time ns= { 16383}
mineiro_ymmx_cmp_02 nbELEMENT= {128} time ns= { 10960}
hello sir mabdelouahab;
Nice job. My measurements follow:
mabdelouahabFilter01 nbELEMENT= { 78} time ns= { 19357}
mabdelouahabFilter02 nbELEMENT= { 78} time ns= { 40501}
mineiro_ymmx_cmp_02 nbELEMENT= {128} time ns= { 35468}
With the Windows version I get an error in the mineiro_ymmx_cmp_02 procedure when running it under Wine; I think it needs to be rewritten to be Windows compatible. I will do that later.
$ wine64 filter.exe
mabdelouahabFilter01 nbELEMENT= { 78} time ns= { 43}
mabdelouahabFilter02 nbELEMENT= { 78} time ns= { 84}
wine: Unhandled page fault on read access to FFFFFFFFFFFFFFFF at address 00000001400012C3 (thread 0009), starting debugger...
1400012c3: c4 c1 fd 10 0a vmovupd ymm1,YMMWORD PTR [r10]
Today is a busy work day; I will do that by the end of this week. At a quick glance I noticed g_malloc0; it is just a supposition, but maybe malloc + RtlZeroMemory could replace it. I am forgetting the Windows functions and need to review them.