News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests

Main Menu

Converting inline code to standalone asm file

Started by Elegant, June 21, 2015, 03:50:26 AM

Previous topic - Next topic

Elegant

Hi, I'm trying to convert one function from inline assembly in VS to being in a standalone asm file. I'm having quite some trouble with one part of the function however. The inline function is:


//This line is defined in the header for onesByte.
__declspec(align(16)) const __int64 onesByte[2] = { 0x0101010101010101, 0x0101010101010101 };

void checkOscillation5_SSE2(const uint8_t *p2p, const uint8_t *p1p, const uint8_t *s1p,
const uint8_t *n1p, const uint8_t *n2p, uint8_t *dstp, int stride, int width,
int height, int thresh)
{
__int64 thresh128[2];
thresh128[0] = ((thresh-1)<<8)+(thresh-1);
thresh128[0] += (thresh128[0]<<48)+(thresh128[0]<<32)+(thresh128[0]<<16);
thresh128[1] = thresh128[0];
__asm
{
mov eax,p2p
mov ebx,p1p
mov edx,s1p
mov edi,n1p
mov esi,n2p
pxor xmm6,xmm6
movdqu xmm7,thresh128
yloop:
xor ecx,ecx
align 16
xloop:
movdqa xmm0,[eax+ecx]
movdqa xmm2,[ebx+ecx]
movdqa xmm1,xmm0
movdqa xmm3,xmm2
pminub xmm0,[edx+ecx]
pmaxub xmm1,[edx+ecx]
pminub xmm2,[edi+ecx]
pmaxub xmm3,[edi+ecx]
pminub xmm0,[esi+ecx]
pmaxub xmm1,[esi+ecx]
movdqa xmm4,xmm3
movdqa xmm5,xmm1
psubusb xmm4,xmm2
psubusb xmm5,xmm0
psubusb xmm4,xmm7
psubusb xmm5,xmm7
psubusb xmm2,onesByte
psubusb xmm0,onesByte
psubusb xmm1,xmm2
psubusb xmm3,xmm0
pcmpeqb xmm1,xmm6
pcmpeqb xmm3,xmm6
pcmpeqb xmm4,xmm6
pcmpeqb xmm5,xmm6
mov eax,dstp
por xmm1,xmm3
pand xmm4,xmm5
pand xmm1,xmm4
movdqa [eax+ecx],xmm1
add ecx,16
mov eax,p2p
cmp ecx,width
jl xloop
mov eax,stride
add ebx,stride
add p2p,eax
add edx,stride
mov eax,stride
add edi,stride
add dstp,eax
add esi,stride
mov eax,p2p
dec height
jnz yloop
}
}


After a bit work I've come to the following for the standalone file:


.xmm
.model flat,c

.data

align 16

onesByte qword 2 dup(0101010101010101h)

.code

checkOscillation5_SSE2 proc p2p:dword,p1p:dword,s1p:dword,n1p:dword,n2p:dword,dstp:dword,stride:dword,width_:dword,height:dword,thresh:dword

public checkOscillation5_SSE2

mov eax,p2p
mov ebx,p1p
mov edx,s1p
mov edi,n1p
mov esi,n2p
pxor xmm6,xmm6
dec thresh
movd xmm7,thresh
punpcklbw xmm7,xmm7
punpcklwd xmm7,xmm7
punpckldq xmm7,xmm7
punpcklqdq xmm7,xmm7
yloop:
xor ecx,ecx
align 16
xloop:
movdqa xmm0,[eax+ecx]
movdqa xmm2,[ebx+ecx]
movdqa xmm1,xmm0
movdqa xmm3,xmm2
pminub xmm0,[edx+ecx]
pmaxub xmm1,[edx+ecx]
pminub xmm2,[edi+ecx]
pmaxub xmm3,[edi+ecx]
pminub xmm0,[esi+ecx]
pmaxub xmm1,[esi+ecx]
movdqa xmm4,xmm3
movdqa xmm5,xmm1
psubusb xmm4,xmm2
psubusb xmm5,xmm0
psubusb xmm4,xmm7
psubusb xmm5,xmm7
psubusb xmm2,oword ptr onesByte
psubusb xmm0,oword ptr onesByte
psubusb xmm1,xmm2
psubusb xmm3,xmm0
pcmpeqb xmm1,xmm6
pcmpeqb xmm3,xmm6
pcmpeqb xmm4,xmm6
pcmpeqb xmm5,xmm6
mov eax,dstp
por xmm1,xmm3
pand xmm4,xmm5
pand xmm1,xmm4
movdqa [eax+ecx],xmm1
add ecx,16
mov eax,p2p
cmp ecx,width_
jl xloop
mov eax,stride
add ebx,stride
add p2p,eax
add edx,stride
mov eax,stride
add edi,stride
add dstp,eax
add esi,stride
mov eax,p2p
dec height
jnz yloop

ret

checkOscillation5_SSE2 endp


From my current knowledge I've realized that the 2 lines of code dealing with onesByte in the standalone file seems to be causing the issue but I'm unable to determine what needs to be changed in order to subtract onesByte for the calculations. This next section is the original function in C++ for reference.


const uint8_t *p1p = p1->GetPtr(b);
const int stride = p1->GetPitch(b);
const int width = p1->GetWidth(b);
const int height = p1->GetHeight(b);
const uint8_t *p2p = p2->GetPtr(b);
const uint8_t *s1p = s1->GetPtr(b);
const uint8_t *n1p = n1->GetPtr(b);
const uint8_t *n2p = n2->GetPtr(b);
uint8_t *dstp = dst->GetPtr(b);
const int thresh = b == 0 ? othreshL : othreshC;

if (cpu&CPUF_SSE2)
checkOscillation5_SSE2(p2p, p1p, s1p, n1p, n2p, dstp, stride, width, height, thresh);
else
{
for (int y = 0; y < height; ++y)
{
for (int x = 0; x < width; ++x)
{
const int min31 = min3(p2p[x], s1p[x], n2p[x]);
const int max31 = max3(p2p[x], s1p[x], n2p[x]);
const int min22 = min(p1p[x], n1p[x]);
const int max22 = max(p1p[x], n1p[x]);
if (((min31 > max22) || max22 == 0 || (max31 < min22) || max31 == 0) &&
max31-min31 < thresh && max22-min22 < thresh)
dstp[x] = 0xFF;
else dstp[x] = 0;
}

p2p += stride;
p1p += stride;
s1p += stride;
n1p += stride;
n2p += stride;
dstp += stride;
}
}


Any help in what exactly I'm missing for this port would be greatly appreciated. This is the only function I have left that gave me trouble. I also tested using all other functions in their C++ forms to rule them out as the culprit.

dedndave

you should probably add a processor at the beginning
        .686p
        .MMX
        .XMM
        .MODEL  Flat,C
        OPTION  CaseMap:None


the compiler probably handles register preservation for you
in ASM, you should do it explicitly for EBX, ESI, EDI
the assembler will handle EBP and the stack frame, though
checkOscillation5_SSE2 proc USES EBX ESI EDI p2p:dword,p1p:dword,s1p:dword,n1p:dword,n2p:dword,dstp:dword,stride:dword,width_:dword,height:dword,thresh:dword

in ASM, PROC's are always public, by default - no need to declare it

rrr314159

Hi Elegant,

There's nothing wrong the onesbyte statements as they stand, but the only thing that occurs to me, if xmm2 or xmm0 get to 0 psubusb won't work, they will remain 0 not become -1. I can't tell if that is Ok with your logic or not (haven't figured out what the function is doing), but I'm supposing you're implementing ++y and ++x here? Because u then subtract them from xmm1 and xmm3, so in effect you're incrementing xmm1 and xmm3 by 1. If these guesses are true then maybe would be better to paddusb onesbyte to xmm1 and xmm3 instead of the double subtraction.

Of course if the original inline function works it's hard to see why this would be a problem. In that case I would look at the thresh computations (punpcklbw xmm7,xmm7 etc) but didn't because u don't think that's where the problem is.

One way to make sense of it: the original thresh comps in C were correct, but the asm translation is not. With correct thresh xmm2 and xmm0 were never negative, but with incorrect comps they are; that's why u think the prob is when you psubusb onesbyte, but actually the prob is with thresh comps

Sorry for the guessing. If you want a definite answer, just post a simpler function! :biggrin:
I am NaN ;)

Elegant

@dedndave Alright, I'll add that in right and now and remember it for future projects. Unfortunately, this doesn't solve the issue at hand.

@rrr314159 Yes that's the intent. It's a helper function for checking oscillation on a combed/dot crawled frame in video editing. I'm trying to update the code to more modern version and get a bit more FPS out of the filter and to then port it to create a x64 version but fixing this procedure (my last x86 procedure) comes first. I've double checked the value of thresh and all seems well, to be safe I restored the C++ version and passed it through, same result.

Really strange that this is accepted and handled nicely when inline but now it's like it's not even the same function.

rrr314159

I assume that xmm0 and xmm2 aren't becoming negative as I suggested, so there's nothing wrong with the psubusb statements.

Looking elsewhere, I see one inconsistency. Width passed to inline asm routine must be multiplied by 16, right?, from the width passed to C++ routine - since it's compared to ecx, which is incremented by 16 not 1. But in the standalone, width is called width_ instead. So, is it also multiplied by 16?

Another thought, you should write a testbed to build and run the code. It should call the subroutine and print out inputs and outputs (apparently dstp array is the only output). A lot of work if u haven't done it already but it makes it easy to troubleshoot

If u have done that already, post runs from inline and standalone - it might help. Another possibility, leave this one alone and move on to 64 bit, maybe it'll come out in the wash (often works for me). Or, find a site where they're better at this sort of thing than we are :biggrin:
I am NaN ;)

Elegant

Both "widths" are the same and are being multiplied. The variable name "width" cannot be used in standalone files as it conflicts with the operator by the same name so for simplicity it was renamed "width_". The width is also used in the other procedures in the same manner. I think I will have to come up with a testbed and see what's going on for this one function as you say.

In this case, I don't think moving on to the x64 in this case is going to provide to much insight as I can remove onesByte and use the below with the extra xmm it provides.


pcmpeqb xmm9, xmm9
psrlw xmm9, 15
movdqa xmm8, xmm9
psllw xmm8, 8
por xmm9, xmm8
...
psubusb xmm2, xmm9
psubusb xmm0, xmm9

rrr314159

Re 64 bit, that's a perfect example of what I'm talking about. Go on to 64-bit and find out if it actually works using a register in that way - would definitely nail down the problem to onesbyte. Maybe, just off the top of my head, it's being overwritten by dstp (but not when running inline); who knows; xmm9 approach might reveal something. But I wouldn't be surprised if xmm9 fails the same way as onesbyte (because I can't see anything wrong with it). (Don't see anything wrong elsewhere either.) Either way it's a vital clue. Or, simply save and restore xmm3 (for instance) and try the different technique with that - take 1/2 hour. Alternative is to sit there and stare at the code :biggrin: - or, take the time and write a testbed prog

BTW u need to use a debugger if don't already, Olly is good or Windows Debug
I am NaN ;)

jj2007

#7
Quote from: rrr314159 on June 21, 2015, 05:50:26 PMtake the time and write a testbed prog

BTW u need to use a debugger if don't already, Olly is good or Windows Debug

I can echo that ;-)

Alternative is a macro that outputs reg values to the console without changing them. For the standalone asm proc I could offer deb:
deb 4, "reg contents", eax, ecx, x:xmm0, x:xmm1, x:xmm2
Output:
reg contents
eax             4277048
ecx             0
x:xmm0          FFFB7EFD E0000018 FF880018 FF940000
x:xmm1          00000000 0018FFA0 00000000 00000000
x:xmm2          0018FFEC 00000000 38788916 77551985


Doing the same in C/C++ is certainly possible. For example, you could move the xmm regs to a RECT and use printf() to display the left, top etc elements. One problem is that the 64-bit Windows versions destroy the contents of xmm regs when calling a (32-bit) Windows API, so you'll have to save them with fxsave and get them back with fxrstor.

P.S.: Tested with Pelles C and MSVC:
#include <stdio.h>
#include <windows.h>
#pragma warn(disable:2216 2007 2018 2118)    // retval never used etc

void XmmPrint(int index) {
  RECT rc;
  __declspec(align(16)) byte xms[512];
  __asm fxsave xms; // save our precious xmm regs
  __asm pushad;
  switch(index) {
  case 0:
_asm movups rc, xmm0;
break;
  case 1:
_asm movups rc, xmm1;
break;
  case 2:
_asm movups rc, xmm2;
break;
  case 3:
_asm movups rc, xmm3;
break;
  case 4:
_asm movups rc, xmm4;
break;
  case 5:
_asm movups rc, xmm5;
break;
  case 6:
_asm movups rc, xmm6;
break;
  case 7:
_asm movups rc, xmm7;
break;
  }
//   printf() trashes xmm regs...
  printf("xmm%i = %x %x %x %x \n", index, rc.bottom, rc.right, rc.top, rc.left);
//   ... but we knew that before ;-)
  _asm fxrstor xms;
  __asm popad;
  return;
}

char* c="This is a string"; // one more for testing

int main(int argc, char* argv[]) {
  long long qh=0x4444444433333333, ql=0x2222222211111111;
  _asm movlps xmm0, ql;
  _asm movhps xmm0, qh;
  _asm mov edx, c;
  _asm movups xmm1, [edx];
  XmmPrint(0);
  XmmPrint(1);
  _getch();
}


Output:
xmm0 = 44444444 33333333 22222222 11111111
xmm1 = 676e6972 74732061 20736920 73696854

Elegant

Man was that frustrating... While I did finish the testbed (I kind of cheated a bit). I noticed that it was returning the same values. Clearly it was coming elsewhere, so I removed a few deprecated variables in .data and it all worked. That'll teach me to leave stuff lying around... Thanks again for the help everyone!

Now onward to the x64 version :P

rrr314159

Hopefully it will also teach u to move on to the next step, as I said, when getting nowhere looking for a bug in the code. Often there is no bug
I am NaN ;)