News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests

Main Menu

YMMx comparison

Started by mabdelouahab, September 26, 2020, 09:42:21 AM

Previous topic - Next topic

mabdelouahab

What is the best way to test whether YMM0 equals YMMWORD ptr [reg]? (Same question for testing whether YMM0 equals 0.)

mabdelouahab

The general idea is:
I have a vector containing n elements; each element is 32 bytes long (1 record = 32 bytes {YMMWORD}).
I want to filter this vector: whenever two items are the same, both should be deleted.
Example:

; Problem-size example: a 4-qword header followed by n uninitialised records.
n=2000000
.DATA
; NOTE(review): the later snippets in this thread reference Vec_Info; Vec_tInfo looks like a typo - confirm.
Vec_tInfo DQ n,0,0,0   ; first qword = n, remaining three qwords = 0
Vec YMMWORD n DUP(?)   ; n records of one YMMWORD each, left uninitialised

Vec={5,50,5,6,2,2,8,17,33,5,6,10,30,2,1,2,7,13,9,2}
must be:
Vec={50,8,17,33,5,10,30,1,7,13,9,2}

I am looking for the fastest way, because a vector may contain two million items.

jj2007


mabdelouahab


My first try  :biggrin: . It needs better performance (or another approach), because it is slow  :sad:

   ;-----------------------------------------------------------------------
   ; First-try filter: remove pairs of identical 32-byte records in place.
   ; For each record (outer loop) every later record is compared with it
   ; (inner loop). On a match both records are removed by overwriting them
   ; with records taken from the tail, so the order of the survivors is
   ; not preserved. The count in the header qword is reduced by 2 per match.
   ;
   ; Register / variable roles as used below:
   ;   rsi    = value of Vec_Info, dereferenced as pointer to the count qword
   ;            (NOTE(review): presumably Vec_Info holds the header address
   ;            here, e.g. a pointer variable or parameter - confirm)
   ;   rdx    = address of the current (first compared) record
   ;   rcx    = address of the record being compared against it
   ;   rdi    = address of the current last record
   ;   ymm0   = copy of the current record, ymm1 = scratch
   ;   aindex = number of records from rdx up to rdi, both included
   ;   bindex = inner-loop iterations still to do for the current record
   ; aindex/bindex (also spelled aIndex/bIndex) are defined outside this
   ; fragment.
   ;-----------------------------------------------------------------------
   RecordLength=32
      mov rsi,Vec_Info
      mov rax,qword ptr [rsi]
      mov aindex,rax         ; aindex = number of records
      cmp rax,1
      jg @F001                           ; continue only when the count is > 1 (signed compare)
         ret
      @F001:
      mov rax,qword ptr [rsi]
      mov rbx,RecordLength
      mul   rbx              ; rdx:rax = count * RecordLength (rdx is reloaded below)
      mov rdi,rax
      add rdi,Vec_Info                        ; rdi = Vec_Info + count*RecordLength = address of the last record
      mov rdx,Vec_Info                        ; rdx = header address; advanced to the first record on loop entry
      Filter__01_Start:
         add    rdx,   RecordLength   ; advance to the next current record
         Filter__01_Start_Without_Sh: ; re-entry point: re-examine the same slot after a tail record was moved into it
         vmovups ymm0, YMMWORD PTR [rdx]         ; ymm0 = current record, compared with every following record
         ;--------------------------------------------                              
         mov rcx,aindex      ; rcx = records from the current one to the last, inclusive
         dec  rcx
         mov bindex,rcx      ; bindex = number of records that follow the current one
         mov rcx,rdx         ; rcx starts at the current record; pre-incremented in the loop
         Filter__02_Start:
            add    rcx,   RecordLength   ; next candidate record
            vmovupd   ymm1, ymm0            ; ymm1 = copy of the current record
            vpxor      ymm1, YMMWORD PTR [rcx]   ; ymm1 = current XOR candidate (all zero when identical)
            vptest    ymm1,ymm1   ; ZF = 1 only when ymm1 is all zero
            jnz      Filter__02_NoEqu         ; not identical: try the next candidate
                  ;case equal:**********************************************
                  sub    qword ptr [rsi]      ,2   ; header count -= 2 (both matching records are dropped)
                  .if aindex==2               ; only the matching pair remains =======
                     jmp    Filter__01_ExitDo   ; nothing to move; leave the loop
                  .endif
                  .if aindex==3               ; exactly three records remain: the pair plus one survivor ====
                     .if rcx==rdi   ;   the matching candidate is the last record
                        sub rdi,RecordLength   ; then the survivor is the penultimate record
                     .endif         ;   otherwise the survivor is the last record
                     vmovups   ymm1,YMMWORD PTR [rdi]
                     vmovups YMMWORD PTR [rdx],ymm1   ; move the survivor into the current slot
                     jmp    Filter__01_ExitDo   ; and leave the loop
                  .endif
                  .if rcx==rdi            ;   the matching candidate is the last record ====================
                     sub    rdi,RecordLength   ; step to the penultimate record
                     vmovups   ymm1,YMMWORD PTR [rdi]
                     sub    rdi,RecordLength   ; the tail shrinks by two records in total
                     vmovups YMMWORD PTR [rdx],ymm1   ; penultimate record replaces the current record
                     sub   aIndex,2
                     jmp   Filter__01_Start_Without_Sh   ; re-examine the refilled slot
                  .endif
                  .if bindex==2            ;   two inner iterations left: the matching candidate is the penultimate record ====
                     vmovups   ymm1,YMMWORD PTR [rdi] ; last record replaces the current record
                     sub    rdi,RecordLength *2 ; tail shrinks by two (moved last record + dropped candidate)
                     vmovups YMMWORD PTR [rdx],ymm1
                     sub      aIndex,2
                     jmp   Filter__01_Start_Without_Sh   ; re-examine the refilled slot
                  .endif                  ;   general case ======================================================================
                  vmovups   ymm1,YMMWORD PTR [rdi]   ; last record replaces the current record
                  sub    rdi,RecordLength
                  vmovups YMMWORD PTR [rdx],ymm1
                  vmovups   ymm1,YMMWORD PTR [rdi]   ; new last record replaces the matching candidate
                  sub    rdi,RecordLength
                  vmovups YMMWORD PTR [rcx],ymm1
                  sub   aIndex,2
                  sub   bIndex,2   ; bindex is recomputed from aindex right after the jump below
                  jmp   Filter__01_Start_Without_Sh   ; re-examine the refilled slot
         Filter__02_NoEqu:
         dec    bIndex
         jnz    Filter__02_Start   ; more candidates to compare
         ;--------------------------------------------                              
      dec    aIndex           ; current record had no match: one fewer record ahead
      cmp    aIndex,1
      jg      Filter__01_Start   ; continue while more than one record remains
      Filter__01_ExitDo:

mabdelouahab

for test
; Test data: 4-qword header followed by 20 initialised records.
n=20
.DATA
Vec_Info DQ n,0,0,0   ; first qword = record count, remaining three qwords = 0
Vec YMMWORD 5,50,5,6,2,2,8,17,33,5,6,10,30,2,1,2,7,13,9,2   ; one YMMWORD record per listed value

mabdelouahab


This is much faster than the first, but it filters by copying

   filter01 proc
      lea rsi,Vec_Info
      mov rax,qword ptr [rsi]
      cmp rax,1
      jg @F                           ; if there are one elem
         ret
      @@:
      vpxor xmm7,xmm7,xmm7;==0
      mov r12,rax
      inc r12
      mov rbx,rsi
      lea rdi,Vec_Info
      mov rcx,rdi
      mov qword ptr [rcx],0   
      filter01_Start:
         add rbx,   RLength
         cmp qword ptr [rbx],0      ;   if element == 0
         jne   filter01_Loop02         
          cmp qword ptr [rbx+8],0                         
          jne filter01_Loop02      
           cmp qword ptr [rbx+16],0                        
           jne filter01_Loop02      
            cmp qword ptr [rbx+24],0                        
            jne filter01_Loop02                   
               jmp filter01_Next;      V1==0  ; ==0 =>pass to the next             
            filter01_Loop02:      
                  mov r8 , qword ptr [rbx]                  
                  mov r15,rbx
                  mov r14,r12
                  dec r14
                  jz filter01_Next
                  filter01_Start02:
                     add r15,   RLength
                     cmp r8,qword ptr [r15]               
                     jne   filter01_Next02
                        mov r13 ,qword ptr [rbx+8]; 2nd qword
                        cmp r13 ,qword ptr [r15+8]
                        jne   filter01_Next02
                           mov r13 ,qword ptr [rbx+16]; 3 qword
                           cmp r13 ,qword ptr [r15+16]
                           jne   filter01_Next02
                              mov r13 ,qword ptr [rbx+24]; 4 qword
                              cmp r13 ,qword ptr [r15+24]
                              jne   filter01_Next02
                                 ;case equ ----------------------------
                                 vmovupd ymmword ptr [r15],ymm7      ;   set = 0
                                 jmp filter01_Next
               filter01_Next02:
               dec r14
               jnz filter01_Start02
            ;-------------------------- copy
            add rdi,RLength
            inc dword ptr [rcx]
            vmovupd ymm1,ymmword ptr [rbx]
            vmovupd ymmword ptr [rdi],ymm1
            ;--------------------------
         filter01_Next:
         dec r12
         jnz   filter01_Start
         
      filter01_End:
         
      ret
   filter01 endp

mineiro

Hello sir  mabdelouahab;
this is a simple try, linux x86-64, gtk3 or gtk2 (glib).
I'd rather be this ambulant metamorphosis than to have that old opinion about everything

mabdelouahab

Hello mineiro, thanks, that is a good way.
The attached file is a comparison of all the previous methods for filtering a table of 128 elements. It can be executed on Windows and Linux:

Windows : buildWin.bat {micro second}
QuotemabdelouahabFilter01    nbELEMENT= { 78} time ns= {    63}
mabdelouahabFilter02    nbELEMENT= { 78} time ns= {   144}
mineiro_ymmx_cmp_02     nbELEMENT= {128} time ns= {    98}
linux        : build.sh {nano second}
QuotemabdelouahabFilter01    nbELEMENT= { 78} time ns= {  8631}
mabdelouahabFilter02    nbELEMENT= { 78} time ns= { 16383}
mineiro_ymmx_cmp_02    nbELEMENT= {128} time ns= { 10960}

mineiro

hello sir  mabdelouahab;
Nice job. Follow measures here:

mabdelouahabFilter01 nbELEMENT= { 78} time ns= { 19357}
mabdelouahabFilter02 nbELEMENT= { 78} time ns= { 40501}
mineiro_ymmx_cmp_02 nbELEMENT= {128} time ns= { 35468}

With the Windows version I'm getting an error in the mineiro_ymmx_cmp_02 procedure when executing under Wine; I think it needs to be rewritten to be Windows compatible. I will do that later.

$ wine64 filter.exe
mabdelouahabFilter01 nbELEMENT= { 78} time ns= {    43}
mabdelouahabFilter02 nbELEMENT= { 78} time ns= {    84}
wine: Unhandled page fault on read access to FFFFFFFFFFFFFFFF at address 00000001400012C3 (thread 0009), starting debugger...

1400012c3:   c4 c1 fd 10 0a          vmovupd ymm1,YMMWORD PTR [r10]
Today is a hard work day; I will do that by the end of this week. At a quick look I noticed g_malloc0; just a supposition, but maybe malloc + RtlZeroMemory would do. I'm forgetting the Windows functions and need to review them.
I'd rather be this ambulant metamorphosis than to have that old opinion about everything