Just as an addition, one might try this x86 solution. If I'm not mistaken, it also sets the flags the way the CMP instruction does:
;-----------------------------------------------------------------------
; cmp128 ow0, ow1            (32-bit x86, MASM syntax)
;
; Compares two 128-bit integers held in memory as four DWORDs each,
; least-significant DWORD at the lowest address, by computing ow0 - ow1
; with a SUB/SBB chain.  Afterwards CF, SF, OF and ZF describe the whole
; 128-bit subtraction, so the unsigned (jb/ja/...), signed (jl/jg/...)
; and equality (je/jne) conditions can all be used directly after the
; macro.  PF and AF only reflect the last SBB and are not meaningful.
;
; In:    ow0, ow1 = memory operands acceptable to LEA.
;        ow1 must not be addressed through esi: esi is overwritten
;        with the address of ow0 before ow1 is evaluated.
; Out:   flags as described above
;        ebx:edx:ecx:eax = ow0 - ow1 (mod 2^128)
; Clobb: eax, ecx, edx, ebx, esi, edi
; Stack: 4 bytes are used temporarily on one path (pushfd/popfd)
;-----------------------------------------------------------------------
cmp128 macro ow0,ow1
    LOCAL @NE1,@NE2,@NE3,@end
    lea esi,ow0                     ; esi -> first operand
    lea edi,ow1                     ; edi -> second operand
    mov eax,[esi+0*DWORD]           ; load ow0, lowest DWORD in eax
    mov ecx,[esi+1*DWORD]
    mov edx,[esi+2*DWORD]
    mov ebx,[esi+3*DWORD]           ; highest DWORD in ebx

    ; Every path below executes the same SUB/SBB/SBB/SBB chain.  The
    ; branches only record whether any of the three low result DWORDs
    ; is non-zero, because ZF after the last SBB covers ebx alone.
    sub eax,[edi+0*DWORD]
    jnz @NE1
    sbb ecx,[edi+1*DWORD]
    jnz @NE2
    sbb edx,[edi+2*DWORD]
    jnz @NE3
    sbb ebx,[edi+3*DWORD]
    ; Low 96 bits of the difference are zero (and produced no borrow),
    ; so the flags of this last SBB are already exact, ZF included.
    jmp @end

@NE1: sbb ecx,[edi+1*DWORD]
@NE2: sbb edx,[edi+2*DWORD]
@NE3: sbb ebx,[edi+3*DWORD]
    ; The difference is known to be non-zero here.  CF, SF and OF from
    ; the last SBB are correct; ZF is correct only if ebx is non-zero.
    jnz @end
    ; High DWORD of the difference is zero, so ZF is wrongly set.
    ; Clear ZF (EFLAGS bit 6) and nothing else: rebuilding the flags
    ; with a fresh CMP would lose CF/OF and misreport cases such as a
    ; borrow out of the top DWORD (unsigned "below") or signed overflow.
    pushfd
    and dword ptr [esp],NOT 40h
    popfd
@end:
endm