I'm sure you can convert this to floating point
This certainly also has some AVX potential, but only if many sequential dwords are already in their correct positions.
; ---------------------------------------------------------------------------
; In-place ascending sort of the dwords in _buff3, compared as SIGNED 32-bit.
;
; Syntax : flat assembler (fasm), x86-64. Needs SSE4.1 (pinsrd/pminsd/pmaxsd).
; In     : _buff3      buffer, must be 16-byte aligned (movdqa is used on it)
;          _buff3_len  number of dwords (assemble-time constant)
;          The byte length is rounded down to a multiple of 16, so up to three
;          trailing dwords are left untouched.
; Out    : falls through to .done once a pass finds every adjacent pair in
;          order; jumps to `error` when fewer than 32 usable bytes exist.
; Clobb  : rax, rbx, rcx, rdx, rsi, rbp, xmm0-xmm6, flags
;          (rbx and rbp are callee-saved in the common 64-bit ABIs; save them
;          if this fragment is placed inside a function).
;
; Register roles inside the loop:
;   rsi = address of the current 16-byte chunk (elements e0..e3; e4 = [rsi+16])
;   ecx = bytes still to walk before the last chunk is reached
;   ebx = 16 while every chunk seen in this pass was in order, 0 afterwards
;   edx = byte length of the leading run of chunks known to be in order
;   ebp = byte offset of the last chunk (rounded length - 16)
;
; Method: each chunk is checked with one packed compare. A chunk that is out
; of order gets a single bubble sweep from e4 down to e0, which leaves the
; smallest of the five values in e0. The next pass restarts one chunk before
; the first chunk that needed work, because that sweep may have changed the
; boundary element the preceding chunk was checked against.
;
; NOTE: the min/max instructions must be the signed forms so they agree with
; pcmpgtd (a signed compare). With unsigned min/max a pair such as
; (1, 80000000h) is reported as out of order but never swapped, and the pass
; loop would never reach .done.
; ---------------------------------------------------------------------------
        mov     rsi, _buff3             ; 16-byte aligned buffer
        mov     ecx, (_buff3_len*4)     ; length in bytes
        and     ecx, not 15             ; whole 16-byte chunks only
        cmp     ecx, 32                 ; need at least two chunks (32 bytes)
        jb      error
        sub     ecx, 16                 ; ecx = offset of the last chunk
        xor     edx, edx                ; nothing verified yet
        mov     ebx, 16
        mov     ebp, ecx
        align 16
.one_pass:
        movdqa  xmm0, [rsi]             ; 3 2 1 0
        pshufd  xmm1, xmm0, 111001b     ; _ 3 2 1
        pinsrd  xmm1, [rsi+16], 3       ; 4 3 2 1
        movdqa  xmm2, xmm0              ; keep e0..e3 for the sweep below
        pcmpgtd xmm0, xmm1              ; lane i = -1 if e[i] > e[i+1] (signed)
        pmovmskb eax, xmm0
        test    eax, eax                ; pmovmskb zero-extends, so eax == ax
        jz      @f                      ; all four pairs already in order

        ; Bubble sweep from the highest address down: e4,e3 -> e2 -> e1 -> e0.
        ; Only the low dword of each register is meaningful here.
        pshufd  xmm6, xmm1, 3           ; low dword = e4
        pshufd  xmm4, xmm2, 3           ; low dword = e3
        pshufd  xmm3, xmm2, 2           ; low dword = e2
        movdqa  xmm5, xmm6
        pmaxsd  xmm6, xmm4              ; max(e3, e4)
        pminsd  xmm4, xmm5              ; min(e3, e4) carried downwards
        movdqa  xmm5, xmm3
        movd    [rsi+16], xmm6
        pminsd  xmm3, xmm4              ; running minimum incl. e2
        pmaxsd  xmm5, xmm4
        movdqa  xmm4, xmm1              ; low dword = e1
        movd    [rsi+12], xmm5
        pmaxsd  xmm4, xmm3
        pminsd  xmm1, xmm3              ; running minimum incl. e1
        movdqa  xmm3, xmm2              ; low dword = e0
        movd    [rsi+8], xmm4
        pminsd  xmm2, xmm1              ; smallest of the five values
        pmaxsd  xmm3, xmm1
        movd    [rsi], xmm2
        movd    [rsi+4], xmm3
        xor     ebx, ebx                ; stop counting verified bytes this pass
@@:
        add     rsi, 16
        add     edx, ebx                ; number of bytes that are in place
        sub     ecx, 16
        jnz     .one_pass

        ; Last 16-byte chunk in the array: there is no e4, so only the three
        ; pairs inside the chunk are checked (low 12 mask bits = lanes 0..2).
        movdqa  xmm0, [rsi]
        pshufd  xmm1, xmm0, 111001b     ; 0 3 2 1 (top lane is ignored)
        movdqa  xmm2, xmm0
        pcmpgtd xmm0, xmm1
        pmovmskb eax, xmm0
        test    eax, 4095
        jz      @f
        pshufd  xmm3, xmm2, 2           ; low dword = e2
        pshufd  xmm4, xmm2, 3           ; low dword = e3
        movdqa  xmm5, xmm3
        pminsd  xmm3, xmm4              ; min(e2, e3)
        pmaxsd  xmm5, xmm4
        movdqa  xmm4, xmm1              ; low dword = e1
        movd    [rsi+12], xmm5
        pmaxsd  xmm4, xmm3
        pminsd  xmm1, xmm3
        movdqa  xmm3, xmm2              ; low dword = e0
        movd    [rsi+8], xmm4
        pminsd  xmm2, xmm1
        pmaxsd  xmm3, xmm1
        movd    [rsi], xmm2
        movd    [rsi+4], xmm3
        xor     ebx, ebx
@@:
        add     edx, ebx
        xor     eax, eax
        sub     edx, 16                 ; step back one chunk for the restart...
        cmovc   edx, eax                ; ...but never before the buffer start
        mov     ecx, ebp
        cmp     edx, ebp                ; equal only if every chunk was in order
        jz      .done
        mov     rsi, _buff3             ; reload the base as a full immediate
        add     rsi, rdx                ; (upper half of rdx is zero: 32-bit ops)
        sub     ecx, edx                ; bytes from restart point to last chunk
        mov     ebx, 16
        jmp     .one_pass
.done:
[/code]