Maybe I've found a solution. I'll try it and test its performance.
If it works, it will be a first step toward translating the code the SSE way.
; Gather the low byte of four consecutive dwords read through ebx and store
; them as one packed dword through eax, using MMX byte interleaves.
; Notation below: sN = dword at [ebx + 4*N]; sN.bK = byte K of sN (b0 = lowest).
; NOTE(review): only part of the loop is visible here -- the pointer advance,
; the loop counter/back-branch and the closing emms are not shown; confirm they
; exist after this fragment (emms is required before any later x87 FPU code).
; NOTE(review): ebx is callee-saved in the common 32-bit conventions; confirm
; the surrounding procedure preserves it.
; eax = address of Dest (output pointer)
mov eax, offset Dest
; ebx = address of Source (input pointer)
mov ebx, offset Source
; Anonymous label; presumably the top of the conversion loop (back-branch not visible).
@@:
; Load four source dwords; movd zero-fills the upper 32 bits of each MMX register.
; mm0 = s0
movd mm0, dword ptr [ebx]
; mm1 = s1
movd mm1, dword ptr [ebx + 4]
; mm2 = s2
movd mm2, dword ptr [ebx + 8]
; mm3 = s3
movd mm3, dword ptr [ebx + 12]
; mm0 bytes (low -> high) = s0.b0 s2.b0 s0.b1 s2.b1 s0.b2 s2.b2 s0.b3 s2.b3
punpcklbw mm0, mm2
; mm1 bytes (low -> high) = s1.b0 s3.b0 s1.b1 s3.b1 s1.b2 s3.b2 s1.b3 s3.b3
punpcklbw mm1, mm3
; mm0 bytes (low -> high) = s0.b0 s1.b0 s2.b0 s3.b0 s0.b1 s1.b1 s2.b1 s3.b1
punpcklbw mm0, mm1
; Store the low dword only: [eax] = s0.b0 s1.b0 s2.b0 s3.b0 (upper half of mm0 is discarded).
movd dword ptr [eax], mm0