Converting inline code to standalone asm file

Elegant · June 21, 2015, 03:50:26 AM

Hi, I'm trying to convert one function from inline assembly in VS to being in a standalone asm file. I'm having quite some trouble with one part of the function however. The inline function is:

Code Select


//This line is defined in the header for onesByte.
__declspec(align(16)) const __int64 onesByte[2] = { 0x0101010101010101, 0x0101010101010101 };

void checkOscillation5_SSE2(const uint8_t *p2p, const uint8_t *p1p, const uint8_t *s1p,
		const uint8_t *n1p, const uint8_t *n2p, uint8_t *dstp, int stride, int width,
		int height, int thresh)
{
	__int64 thresh128[2];
	thresh128[0] = ((thresh-1)<<8)+(thresh-1);
	thresh128[0] += (thresh128[0]<<48)+(thresh128[0]<<32)+(thresh128[0]<<16);
	thresh128[1] = thresh128[0];
	__asm
	{
		mov eax,p2p
		mov ebx,p1p
		mov edx,s1p
		mov edi,n1p
		mov esi,n2p
		pxor xmm6,xmm6
		movdqu xmm7,thresh128
yloop:
		xor ecx,ecx
		align 16
xloop:
		movdqa xmm0,[eax+ecx]
		movdqa xmm2,[ebx+ecx]
		movdqa xmm1,xmm0
		movdqa xmm3,xmm2
		pminub xmm0,[edx+ecx]
		pmaxub xmm1,[edx+ecx]
		pminub xmm2,[edi+ecx]
		pmaxub xmm3,[edi+ecx]
		pminub xmm0,[esi+ecx]
		pmaxub xmm1,[esi+ecx]
		movdqa xmm4,xmm3
		movdqa xmm5,xmm1
		psubusb xmm4,xmm2
		psubusb xmm5,xmm0
		psubusb xmm4,xmm7
		psubusb xmm5,xmm7
		psubusb xmm2,onesByte
		psubusb xmm0,onesByte
		psubusb xmm1,xmm2
		psubusb xmm3,xmm0
		pcmpeqb xmm1,xmm6
		pcmpeqb xmm3,xmm6
		pcmpeqb xmm4,xmm6
		pcmpeqb xmm5,xmm6
		mov eax,dstp
		por xmm1,xmm3
		pand xmm4,xmm5
		pand xmm1,xmm4
		movdqa [eax+ecx],xmm1
		add ecx,16
		mov eax,p2p
		cmp ecx,width
		jl xloop
		mov eax,stride
		add ebx,stride
		add p2p,eax
		add edx,stride
		mov eax,stride
		add edi,stride
		add dstp,eax
		add esi,stride
		mov eax,p2p
		dec height
		jnz yloop
	}
}

After a bit work I've come to the following for the standalone file:

Code Select


.xmm
.model flat,c

.data

align 16

onesByte qword 2 dup(0101010101010101h)

.code

checkOscillation5_SSE2 proc p2p:dword,p1p:dword,s1p:dword,n1p:dword,n2p:dword,dstp:dword,stride:dword,width_:dword,height:dword,thresh:dword

	public checkOscillation5_SSE2

	mov eax,p2p
	mov ebx,p1p
	mov edx,s1p
	mov edi,n1p
	mov esi,n2p
	pxor xmm6,xmm6
	dec thresh
	movd xmm7,thresh
	punpcklbw xmm7,xmm7
	punpcklwd xmm7,xmm7
	punpckldq xmm7,xmm7
	punpcklqdq xmm7,xmm7
yloop:
	xor ecx,ecx
	align 16
xloop:
	movdqa xmm0,[eax+ecx]
	movdqa xmm2,[ebx+ecx]
	movdqa xmm1,xmm0
	movdqa xmm3,xmm2
	pminub xmm0,[edx+ecx]
	pmaxub xmm1,[edx+ecx]
	pminub xmm2,[edi+ecx]
	pmaxub xmm3,[edi+ecx]
	pminub xmm0,[esi+ecx]
	pmaxub xmm1,[esi+ecx]
	movdqa xmm4,xmm3
	movdqa xmm5,xmm1
	psubusb xmm4,xmm2
	psubusb xmm5,xmm0
	psubusb xmm4,xmm7
	psubusb xmm5,xmm7
	psubusb xmm2,oword ptr onesByte
	psubusb xmm0,oword ptr onesByte
	psubusb xmm1,xmm2
	psubusb xmm3,xmm0
	pcmpeqb xmm1,xmm6
	pcmpeqb xmm3,xmm6
	pcmpeqb xmm4,xmm6
	pcmpeqb xmm5,xmm6
	mov eax,dstp
	por xmm1,xmm3
	pand xmm4,xmm5
	pand xmm1,xmm4
	movdqa [eax+ecx],xmm1
	add ecx,16
	mov eax,p2p
	cmp ecx,width_
	jl xloop
	mov eax,stride
	add ebx,stride
	add p2p,eax
	add edx,stride
	mov eax,stride
	add edi,stride
	add dstp,eax
	add esi,stride
	mov eax,p2p
	dec height
	jnz yloop

	ret
	
checkOscillation5_SSE2 endp

From my current knowledge I've realized that the 2 lines of code dealing with onesByte in the standalone file seems to be causing the issue but I'm unable to determine what needs to be changed in order to subtract onesByte for the calculations. This next section is the original function in C++ for reference.

Code Select


const uint8_t *p1p = p1->GetPtr(b);
const int stride = p1->GetPitch(b);
const int width = p1->GetWidth(b);
const int height = p1->GetHeight(b);
const uint8_t *p2p = p2->GetPtr(b);
const uint8_t *s1p = s1->GetPtr(b);
const uint8_t *n1p = n1->GetPtr(b);
const uint8_t *n2p = n2->GetPtr(b);
uint8_t *dstp = dst->GetPtr(b);
const int thresh = b == 0 ? othreshL : othreshC;

if (cpu&CPUF_SSE2)
	checkOscillation5_SSE2(p2p, p1p, s1p, n1p, n2p, dstp, stride, width, height, thresh);
else
{
	for (int y = 0; y < height; ++y)
	{
		for (int x = 0; x < width; ++x)
		{
			const int min31 = min3(p2p[x], s1p[x], n2p[x]);
			const int max31 = max3(p2p[x], s1p[x], n2p[x]);
			const int min22 = min(p1p[x], n1p[x]);
			const int max22 = max(p1p[x], n1p[x]);
			if (((min31 > max22) || max22 == 0 || (max31 < min22) || max31 == 0) &&
				max31-min31 < thresh && max22-min22 < thresh)
				dstp[x] = 0xFF;
			else dstp[x] = 0;
		}

		p2p += stride;
		p1p += stride;
		s1p += stride;
		n1p += stride;
		n2p += stride;
		dstp += stride;
	}
}

Any help in what exactly I'm missing for this port would be greatly appreciated. This is the only function I have left that gave me trouble. I also tested using all other functions in their C++ forms to rule them out as the culprit.

dedndave · June 21, 2015, 05:35:14 AM

you should probably add a processor at the beginning

Code Select

        .686p
        .MMX
        .XMM
        .MODEL  Flat,C
        OPTION  CaseMap:None

the compiler probably handles register preservation for you
in ASM, you should do it explicitly for EBX, ESI, EDI
the assembler will handle EBP and the stack frame, though

Code Select

checkOscillation5_SSE2 proc USES EBX ESI EDI p2p:dword,p1p:dword,s1p:dword,n1p:dword,n2p:dword,dstp:dword,stride:dword,width_:dword,height:dword,thresh:dword

in ASM, PROC's are always public, by default - no need to declare it

rrr314159 · June 21, 2015, 08:40:33 AM

Hi Elegant,

There's nothing wrong the onesbyte statements as they stand, but the only thing that occurs to me, if xmm2 or xmm0 get to 0 psubusb won't work, they will remain 0 not become -1. I can't tell if that is Ok with your logic or not (haven't figured out what the function is doing), but I'm supposing you're implementing ++y and ++x here? Because u then subtract them from xmm1 and xmm3, so in effect you're incrementing xmm1 and xmm3 by 1. If these guesses are true then maybe would be better to paddusb onesbyte to xmm1 and xmm3 instead of the double subtraction.

Of course if the original inline function works it's hard to see why this would be a problem. In that case I would look at the thresh computations (punpcklbw xmm7,xmm7 etc) but didn't because u don't think that's where the problem is.

One way to make sense of it: the original thresh comps in C were correct, but the asm translation is not. With correct thresh xmm2 and xmm0 were never negative, but with incorrect comps they are; that's why u think the prob is when you psubusb onesbyte, but actually the prob is with thresh comps

Sorry for the guessing. If you want a definite answer, just post a simpler function!

Elegant · June 21, 2015, 01:12:30 PM

@dedndave Alright, I'll add that in right and now and remember it for future projects. Unfortunately, this doesn't solve the issue at hand.

@rrr314159 Yes that's the intent. It's a helper function for checking oscillation on a combed/dot crawled frame in video editing. I'm trying to update the code to more modern version and get a bit more FPS out of the filter and to then port it to create a x64 version but fixing this procedure (my last x86 procedure) comes first. I've double checked the value of thresh and all seems well, to be safe I restored the C++ version and passed it through, same result.

Really strange that this is accepted and handled nicely when inline but now it's like it's not even the same function.

rrr314159 · June 21, 2015, 03:40:28 PM

I assume that xmm0 and xmm2 aren't becoming negative as I suggested, so there's nothing wrong with the psubusb statements.

Looking elsewhere, I see one inconsistency. Width passed to inline asm routine must be multiplied by 16, right?, from the width passed to C++ routine - since it's compared to ecx, which is incremented by 16 not 1. But in the standalone, width is called width_ instead. So, is it also multiplied by 16?

Another thought, you should write a testbed to build and run the code. It should call the subroutine and print out inputs and outputs (apparently dstp array is the only output). A lot of work if u haven't done it already but it makes it easy to troubleshoot

If u have done that already, post runs from inline and standalone - it might help. Another possibility, leave this one alone and move on to 64 bit, maybe it'll come out in the wash (often works for me). Or, find a site where they're better at this sort of thing than we are

Elegant · June 21, 2015, 04:21:45 PM

Both "widths" are the same and are being multiplied. The variable name "width" cannot be used in standalone files as it conflicts with the operator by the same name so for simplicity it was renamed "width_". The width is also used in the other procedures in the same manner. I think I will have to come up with a testbed and see what's going on for this one function as you say.

In this case, I don't think moving on to the x64 in this case is going to provide to much insight as I can remove onesByte and use the below with the extra xmm it provides.

Code Select


pcmpeqb xmm9, xmm9
psrlw xmm9, 15
movdqa xmm8, xmm9
psllw xmm8, 8
por xmm9, xmm8
...
psubusb xmm2, xmm9
psubusb xmm0, xmm9

rrr314159 · June 21, 2015, 05:50:26 PM

Re 64 bit, that's a perfect example of what I'm talking about. Go on to 64-bit and find out if it actually works using a register in that way - would definitely nail down the problem to onesbyte. Maybe, just off the top of my head, it's being overwritten by dstp (but not when running inline); who knows; xmm9 approach might reveal something. But I wouldn't be surprised if xmm9 fails the same way as onesbyte (because I can't see anything wrong with it). (Don't see anything wrong elsewhere either.) Either way it's a vital clue. Or, simply save and restore xmm3 (for instance) and try the different technique with that - take 1/2 hour. Alternative is to sit there and stare at the code

- or, take the time and write a testbed prog

BTW u need to use a debugger if don't already, Olly is good or Windows Debug

jj2007 · June 21, 2015, 07:15:15 PM

Quote from: rrr314159 on June 21, 2015, 05:50:26 PMtake the time and write a testbed prog

BTW u need to use a debugger if don't already, Olly is good or Windows Debug

I can echo that ;-)

Alternative is a macro that outputs reg values to the console without changing them. For the standalone asm proc I could offer deb:

Code Select

deb 4, "reg contents", eax, ecx, x:xmm0, x:xmm1, x:xmm2
Output:

Code Select

reg contents
eax             4277048
ecx             0
x:xmm0          FFFB7EFD E0000018 FF880018 FF940000
x:xmm1          00000000 0018FFA0 00000000 00000000
x:xmm2          0018FFEC 00000000 38788916 77551985

Doing the same in C/C++ is certainly possible. For example, you could move the xmm regs to a RECT and use printf() to display the left, top etc elements. One problem is that the 64-bit Windows versions destroy the contents of xmm regs when calling a (32-bit) Windows API, so you'll have to save them with fxsave and get them back with fxrstor.

P.S.: Tested with Pelles C and MSVC:

Code Select

#include <stdio.h>
#include <windows.h>
#pragma warn(disable:2216 2007 2018 2118)   	// retval never used etc

void XmmPrint(int index) {
  RECT rc;
  __declspec(align(16)) byte xms[512];
  __asm fxsave xms;	// save our precious xmm regs
  __asm pushad;
  switch(index) {
  case 0:
	_asm movups rc, xmm0;
	break;
  case 1:
	_asm movups rc, xmm1;
	break;
  case 2:
	_asm movups rc, xmm2;
	break;
  case 3:
	_asm movups rc, xmm3;
	break;
  case 4:
	_asm movups rc, xmm4;
	break;
  case 5:
	_asm movups rc, xmm5;
	break;
  case 6:
	_asm movups rc, xmm6;
	break;
  case 7:
	_asm movups rc, xmm7;
	break;
  }
//   printf() trashes xmm regs...
  printf("xmm%i = %x %x %x %x \n", index, rc.bottom, rc.right, rc.top, rc.left);
//   ... but we knew that before ;-)
  _asm fxrstor xms;
  __asm popad;
  return; 
}

char* c="This is a string";	// one more for testing

int main(int argc, char* argv[]) {
  long long qh=0x4444444433333333, ql=0x2222222211111111;
  _asm movlps xmm0, ql;
  _asm movhps xmm0, qh;
  _asm mov edx, c;
  _asm movups xmm1, [edx];
  XmmPrint(0);
  XmmPrint(1);
  _getch();
}

Output:

Code Select

xmm0 = 44444444 33333333 22222222 11111111
xmm1 = 676e6972 74732061 20736920 73696854

Elegant · June 22, 2015, 12:27:19 PM

Man was that frustrating... While I did finish the testbed (I kind of cheated a bit). I noticed that it was returning the same values. Clearly it was coming elsewhere, so I removed a few deprecated variables in .data and it all worked. That'll teach me to leave stuff lying around... Thanks again for the help everyone!

Now onward to the x64 version :P

rrr314159 · June 22, 2015, 02:46:57 PM

Hopefully it will also teach u to move on to the next step, as I said, when getting nowhere looking for a bug in the code. Often there is no bug

The MASM Forum

News:

Converting inline code to standalone asm file

Elegant

dedndave

rrr314159

Elegant

rrr314159

Elegant

rrr314159

jj2007

Elegant

rrr314159