Another approach, shorter and probably faster (untested). It requires AVX2 because it uses the VPGATHERDD instruction (Gather Packed Dword Values Using Signed Dword Indices).
; ************ TRANSPOSING CODE START ***********************
; Transposes an 8x8 matrix of DWORDs from invalues into outvalues using
; AVX2 gathers. Each of the 8 passes gathers eight DWORDs at
; [r8 + index] and stores them as the next 32-byte row of outvalues;
; r8 then moves one DWORD further into invalues.
;
; Syntax: MASM (Intel operand order), x86-64.
;
; Register roles:
;   r8   = base address used by the gather (starts at invalues)
;   r9   = address of the destination row being written
;   ecx  = passes still to run (counts 8 down to 0)
;   ymm0 = gather indices, loaded once from Index
;   ymm1 = gathered row
;   ymm2 = gather mask, reloaded from _mask on every pass
; Clobbers: rcx, r8, r9, ymm0-ymm2, flags.
;
; NOTE(review): Index and _mask are defined outside this snippet. The
; gather uses a scale of 1, so the indices act as byte offsets; Index
; presumably holds 0, 32, 64, ..., 224 (the start of each source row),
; which makes each pass pick up one source column. _mask presumably has
; the top bit of every DWORD set so that all eight elements are loaded.
; Confirm both against the data definitions.
;
; vmovdqa faults on a misaligned address, so Index, _mask and outvalues
; must be 32-byte aligned.
        lea     r8, invalues                    ; r8 = &invalues[0]
        mov     ecx, 8                          ; 8 passes (upper half of rcx is zeroed)
        lea     r9, outvalues                   ; r9 = &outvalues[0]
        vmovdqa ymm0, YMMWORD PTR Index         ; indices are loop-invariant: load once
@another:
        vmovdqa ymm2, YMMWORD PTR _mask         ; the gather clears its mask register, so reload it
        vpgatherdd ymm1, [r8+ymm0*1], ymm2      ; ymm1[i] = DWORD at r8 + ymm0[i]
        vmovdqa YMMWORD PTR [r9], ymm1          ; write one destination row
        add     r8, sizeof DWORD                ; step the gather base by one element
        add     r9, 8*sizeof DWORD              ; step to the next destination row
        dec     ecx                             ; dec/jnz replaces LOOP, which is microcoded and
        jnz     @another                        ; slow on many Intel cores; this pair modifies flags
; ************ TRANSPOSING CODE END ************************
Transposing an 8x8 Matrix
Before:
row 0:   1   2   3   4   5   6   7   8
row 1:   9  10  11  12  13  14  15  16
row 2:  17  18  19  20  21  22  23  24
row 3:  25  26  27  28  29  30  31  32
row 4:  33  34  35  36  37  38  39  40
row 5:  41  42  43  44  45  46  47  48
row 6:  49  50  51  52  53  54  55  56
row 7:  57  58  59  60  61  62  63  64
After:
row 0:   1   9  17  25  33  41  49  57
row 1:   2  10  18  26  34  42  50  58
row 2:   3  11  19  27  35  43  51  59
row 3:   4  12  20  28  36  44  52  60
row 4:   5  13  21  29  37  45  53  61
row 5:   6  14  22  30  38  46  54  62
row 6:   7  15  23  31  39  47  55  63
row 7:   8  16  24  32  40  48  56  64