News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests

Main Menu

Fast Matrix Flip

Started by guga, April 07, 2017, 03:56:33 AM

Previous topic - Next topic

guga

Hi Marinus

Thank you :) It works like a gem here :t

QuoteHere are the sources as promised released under the SHARE & ENJOY license.
:greensml: :greensml: :greensml: :greensml:

One thing only: i´m not sure if using DirectX is faster than the direct manipulation of the pixels itself as we were doing before. The functions will be used for video manipulation. For that i´m using VirtualDub, making those functions part of a plugin to test the speed. I believe that VDub can handle D3d9 but i don´t know how to set it up. What i know from VDub is that the functions related to DirectX manipulation are these:



// Forward declaration of the acceleration engine this context attaches to.
class VDFilterAccelEngine;

// IVDXAContext implementation: the 3D-acceleration context handed to video
// filters. Resources (textures, render targets, fragment programs) are
// exposed to filter code as opaque uint32 handles; the top 16 bits of each
// handle encode the resource type (see the kHT* constants below) and the
// backing entries live in mHandles.
class VDFilterAccelContext : public IVDXAContext {
public:
VDFilterAccelContext();
~VDFilterAccelContext();

// COM-style lifetime management (backed by mRefCount).
int VDXAPIENTRY AddRef();
int VDXAPIENTRY Release();
void *VDXAPIENTRY AsInterface(uint32 iid);

bool Init(VDFilterAccelEngine& eng);
void Shutdown();

bool Restore();

// Wrap host-owned surfaces/textures in filter-visible handles.
uint32 RegisterRenderTarget(IVDTSurface *surf, uint32 rw, uint32 rh, uint32 bw, uint32 bh);
uint32 RegisterTexture(IVDTTexture2D *tex, uint32 imageW, uint32 imageH);

// Resource creation/destruction API exposed to filters.
uint32 VDXAPIENTRY CreateTexture2D(uint32 width, uint32 height, uint32 mipCount, VDXAFormat format, bool wrap, const VDXAInitData2D *initData);
uint32 VDXAPIENTRY CreateRenderTexture(uint32 width, uint32 height, uint32 borderWidth, uint32 borderHeight, VDXAFormat format, bool wrap);
uint32 VDXAPIENTRY CreateFragmentProgram(VDXAProgramFormat programFormat, const void *data, uint32 length);
void VDXAPIENTRY DestroyObject(uint32 handle);

void VDXAPIENTRY GetTextureDesc(uint32 handle, VDXATextureDesc& desc);

// Render-state setup and draw calls used by RunAccel-style filter code.
void VDXAPIENTRY SetTextureMatrix(uint32 coordIndex, uint32 textureHandle, float xoffset, float yoffset, const float uvMatrix[12]);
void VDXAPIENTRY SetTextureMatrixDual(uint32 coordIndex, uint32 textureHandle, float xoffset, float yoffset, float xoffset2, float yoffset2);
void VDXAPIENTRY SetSampler(uint32 samplerIndex, uint32 textureHandle, VDXAFilterMode filter);
void VDXAPIENTRY SetFragmentProgramConstF(uint32 startIndex, uint32 count, const float *data);
void VDXAPIENTRY DrawRect(uint32 renderTargetHandle, uint32 fragmentProgram, const VDXRect *destRect);
void VDXAPIENTRY FillRects(uint32 renderTargetHandle, uint32 rectCount, const VDXRect *rects, uint32 colorARGB);

protected:
// Handle-type tags stored in the top 16 bits of every public handle.
enum {
kHTFragmentProgram = 0x00010000,
kHTRenderTarget = 0x00020000,
kHTTexture = 0x00030000,
kHTRenderTexture = 0x00040000,
kHTTypeMask = 0xFFFF0000
};

// Bookkeeping for one handle: the wrapped resource plus its dimensions.
struct HandleEntry {
uint32 mFullHandle;
IVDTResource *mpObject;

uint32 mImageW;
uint32 mImageH;
uint32 mSurfaceW;
uint32 mSurfaceH;
uint32 mRenderBorderW;
uint32 mRenderBorderH;
bool mbWrap;
};

uint32 AllocHandle(IVDTResource *obj, uint32 handleType);
HandleEntry *AllocHandleEntry(uint32 handleType);

// Translate a public handle back to its resource/entry; handleType is the
// expected kHT* tag (presumably checked against the handle -- see kHTTypeMask).
IVDTResource *DecodeHandle(uint32 handle, uint32 handleType) const;
const HandleEntry *DecodeHandleEntry(uint32 handle, uint32 handleType) const;

void ReportLogicError(const char *msg);

IVDTContext *mpParent;
VDFilterAccelEngine *mpEngine;

typedef vdfastvector<HandleEntry> Handles;
Handles mHandles;
uint32 mNextFreeHandle;	// next handle slot to hand out -- TODO confirm free-list vs. counter

bool mbErrorState;	// NOTE(review): presumably set by ReportLogicError -- confirm

float mUVTransforms[8][12];	// per-coordinate-index UV matrices (see SetTextureMatrix)

VDAtomicInt mRefCount;	// reference count for AddRef/Release
};

#endif // f_VD2_FILTERACCELCONTEXT_H



the classes VDXAPIENTRY seems to be used to setup D3d9 dll, but i have no idea how to make it work yet.

There is an example of an internal plugin inside VDub itself called invert which, as the name suggests, inverts the colors of a video. The plugin itself is a bit fast, but i didn´t compare it with other versions because it uses another way to set up and initialize the plugin engine that is a bit harder to understand. (Damn C++ classes)  :greensml: :greensml: :greensml:


The full plugin is written like this:


namespace {
#ifdef _M_IX86
// x86 version: bitwise-NOTs every 32-bit pixel of a w x h rectangle.
// Two pixels are inverted per inner-loop iteration; an odd trailing
// pixel per row is handled separately at 'zero:'.
void __declspec(naked) VDInvertRect32(uint32 *data, long w, long h, ptrdiff_t pitch) {
__asm {
// naked function: save/restore callee-saved registers ourselves
push ebp
push edi
push esi
push ebx

mov edi,[esp+4+16]   // edi = data
mov edx,[esp+8+16]   // edx = w
mov ecx,[esp+12+16]  // ecx = h (row counter)
mov esi,[esp+16+16]  // esi = pitch (bytes between scanline starts)
mov eax,edx
xor edx,-1           // edx = ~w ...
shl eax,2            // eax = 4*w (row byte width)
inc edx              // ... edx = -w
add edi,eax          // edi -> just past the end of the row
test edx,1           // low bit of -w == low bit of w (parity test)
jz yloop
sub edi,4            // odd width: back up one pixel so pair addressing lines up
yloop:
mov ebp,edx
inc ebp
sar ebp,1            // ebp = -(w/2): pair count, incremented toward zero
jz zero
xloop:
// invert two pixels per iteration
mov eax,[edi+ebp*8  ]
mov ebx,[edi+ebp*8+4]
xor eax,-1           // xor with -1 == bitwise NOT
xor ebx,-1
mov [edi+ebp*8  ],eax
mov [edi+ebp*8+4],ebx
inc ebp
jne xloop
zero:
test edx,1
jz notodd
not dword ptr [edi]  // odd width: invert the leftover pixel of this row
notodd:
add edi,esi          // step to the next scanline
dec ecx
jne yloop

pop ebx
pop esi
pop edi
pop ebp
ret
};
}
#else
// Portable fallback: invert pixels row by row, honoring the pitch
// (which may include padding between scanlines).
void VDInvertRect32(uint32 *data, long w, long h, ptrdiff_t pitch) {
pitch -= 4*w;        // bytes to skip from the end of one row to the start of the next

do {
long wt = w;
do {
*data = ~*data;
++data;
} while(--wt);

data = (uint32 *)((char *)data + pitch);
} while(--h);
}
#endif
}

///////////////////////////////////////////////////////////////////////////////

// "invert" video filter: negates every pixel of each frame, either on
// the CPU (Run) or on the video accelerator via a fragment program
// (StartAccel/RunAccel/StopAccel).
class VDVideoFilterInvert : public VDXVideoFilter {
public:
VDVideoFilterInvert();

// Negotiates supported pixel formats/flags with the host.
uint32 GetParams();
// CPU path: inverts fa->src in place via VDInvertRect32.
void Run();

// Accelerated path: create, use, and destroy the fragment program.
void StartAccel(IVDXAContext *vdxa);
void RunAccel(IVDXAContext *vdxa);
void StopAccel(IVDXAContext *vdxa);

protected:
uint32 mAccelFP;	// fragment program handle; 0 when not created
};

// Start with no accelerated fragment program; StartAccel() creates one
// on demand.
VDVideoFilterInvert::VDVideoFilterInvert() : mAccelFP(0) {
}

// Tell the host which formats we accept and how buffers should be set up.
// XRGB8888 runs on the CPU in place; the VDXA formats run accelerated and
// need swapped src/dst buffers. Anything else is rejected.
uint32 VDVideoFilterInvert::GetParams() {
	const VDXPixmapLayout& srcLayout = *fa->src.mpPixmapLayout;
	VDXPixmapLayout& dstLayout = *fa->dst.mpPixmapLayout;
	const sint32 fmt = srcLayout.format;

	if (fmt == nsVDXPixmap::kPixFormat_XRGB8888) {
		// In-place CPU transform: destination shares the source pitch.
		dstLayout.pitch = srcLayout.pitch;
		return FILTERPARAM_SUPPORTS_ALTFORMATS | FILTERPARAM_PURE_TRANSFORM;
	}

	if (fmt == nsVDXPixmap::kPixFormat_VDXA_RGB || fmt == nsVDXPixmap::kPixFormat_VDXA_YUV)
		return FILTERPARAM_SWAP_BUFFERS | FILTERPARAM_SUPPORTS_ALTFORMATS | FILTERPARAM_PURE_TRANSFORM;

	return FILTERPARAM_NOT_SUPPORTED;
}

// CPU path: flip every bit of every 32-bit pixel of the source frame,
// in place, honoring the frame's pitch.
void VDVideoFilterInvert::Run() {
	VDInvertRect32(fa->src.data, fa->src.w, fa->src.h, fa->src.pitch);
}

// Accelerated path setup: compile the prebuilt D3D9 ps_2_0 byte code
// (kVDFilterInvertPS) into a fragment program; released in StopAccel().
void VDVideoFilterInvert::StartAccel(IVDXAContext *vdxa) {
	mAccelFP = vdxa->CreateFragmentProgram(kVDXAPF_D3D9ByteCodePS20,
		kVDFilterInvertPS, sizeof(kVDFilterInvertPS));
}

// Accelerated path per frame: bind the source texture with an identity
// transform and point sampling, then draw one full-target rect through
// the invert fragment program into the destination.
void VDVideoFilterInvert::RunAccel(IVDXAContext *vdxa) {
	const uint32 srcTex = fa->src.mVDXAHandle;

	vdxa->SetTextureMatrix(0, srcTex, 0, 0, NULL);
	vdxa->SetSampler(0, srcTex, kVDXAFilt_Point);
	vdxa->DrawRect(fa->dst.mVDXAHandle, mAccelFP, NULL);
}

// Accelerated path teardown: destroy the fragment program if StartAccel()
// created one, and reset the handle so a later StopAccel() is a no-op.
void VDVideoFilterInvert::StopAccel(IVDXAContext *vdxa) {
	if (!mAccelFP)
		return;

	vdxa->DestroyObject(mAccelFP);
	mAccelFP = 0;
}

///////////////////////////////////////////////////////////////////////////////

// Registration record for the "invert" filter, exported to the host.
extern const VDXFilterDefinition g_VDVFInvert = VDXVideoFilterDefinition<VDVideoFilterInvert>(
NULL,
"invert",
"Inverts the colors in the image.\n\n[Assembly optimized]");

#ifdef _MSC_VER
#pragma warning(disable: 4505) // warning C4505: 'VDXVideoFilter::[thunk]: __thiscall VDXVideoFilter::`vcall'{48,{flat}}' }'' : unreferenced local function has been removed
#endif



I´ll try to port this to assembly to make the proper tests on DX video manipulation on VDub, but i´m not sure if i´ll succeed, or if it is in fact faster than the direct pixel manipulation as we were doing before. (I believe it cannot be faster, because we need to take into account all the internal functions used to access DirectX itself.)

One good thing is that i finally succeeded to make Vdub change the Layout with the others functions. Now it is missing only to see if it will work with Matrix_transpose function :)

Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

avcaballero

Quote from: Siekmanski on April 25, 2017, 10:26:41 PM
Hi caballero,

And it doesn't save the MatrixImage.png i assume?
Can you trace down where exactly in the code the error occurs?
MatrixImage.png is created but with 0 bytes. We have already seen that my computer is abit odd for gdip, hence don't worry. Nevertheless, here are some captures from debugging. The flow stops when execute "GdipSaveImageToFile" with "F8", maybe here.

Regards

Siekmanski

Hi caballero,

Phewww...., so we can blame GDIplus for the error.

Hi guga,

I don't know what VDub is, is it freeware ?
It seems to use directX9 in some way.

I suppose your goal is to manipulate color values and color positions in an image, am i right ?
This is a perfect job for the video device instead of only the CPU.

For example transposing a 720 by 480 32bit image via CPU:

1 - setup your program code.
2 - load the image data to system memory
3 - the calculation loop:

  a - read an array of 1382400 bytes
  b - calculate the new positions in the array ( the transpose matrix routine )
  c - write 1382400 bytes
  d - write 1382400 bytes to the correct position in video memory ( this is slow )
  e - present the new image to the screen

For example transposing a 720 by 480 32bit image via GPU (via Direct3D9):

1 - setup your program code.
2 - load the image data to video memory
3 - the calculation loop:
 
  a - calculate the new 4 * X,Y positions for the screen positions, and the new 4 * X,Y for the image corner coordinates.
  b - write the new calculated 64 bytes to the video device ( the transpose matrix routine )
  c - present the new image to the screen.

I think you will agree, that the second method is much much much faster.
No transfers between system and video memory, and only 16 values to calculate for the whole transpose matrix routine.

0.005 milliseconds for transposing a 32bit 720 by 480 pixels image, try to beat that with CPU coding.

For fast image manipulations try to avoid data transfers between system and video memory because they are slow.
So, better do all the image manipulations etc. by using the video device itself if possible.
Creative coders use backward thinking techniques as a strategy.

guga

Vdudb is free and opensource (I´m talking on the sense the sources are released, disregarding about the license itself  :icon_mrgreen:).

It is a simple and very powerful video editor tool, although a bit hard to configure the plugins. It was originally made more than a decade ago, but it is used as an alternative to professional video editors such as Sony Vegas, for example.

http://www.virtualdub.org
https://sourceforge.net/projects/virtualdub/?source=top3_dlp_t5

For example, there is a university in Russia that make incredible plugins for it, such as a subtitle remover, motion estimation, noise remover, TV commercial detection, video stabilizer, etc etc. Their plugins can also be found here: http://www.compression.ru/video/video_codecs.htm

Other places of people who made plugins for it (with the source or not) can be found here:

http://www.guthspot.se/video/deshaker.htm
http://avisynth.nl/users/fizick/fizick.html
https://forum.doom9.org/
https://forum.videohelp.com/threads/281594-Avisynth-Virtualdub


Some tutorials on youtube explain several kinds of plugins as well. One of those that i like is:
https://www.youtube.com/watch?v=6QRJZpOrX0s

QuoteI suppose your goal is to manipulate color values and color positions in an image, am i right ?

Yeah...it is for image and video manipulation. I´m currently trying to understand and create those matrices functions in order to create a plugin (or app/dll) that is a variation of a PHash algorithm that is used to compare images (either from video or pure image). PHash algo is a sort of image signature and the field of application is huge....similar to what google and youtube uses for image searching tool (Dunno if google uses a sort of Phash algorithm, but, it probably do) or to be used in object removal of a video or a image, face detection, motion estimation, tracking, image/video reconstruction, etc etc....Also, Phash can be used for audio recognition too.

Rebuilding Phash is the 1st step that i can test to create a plugin for scene detection on videos. Currently i made a plugin for Vdub that can be able to detect scenes from a video. The only problem is that the accuracy is limited to hard cut scenes, but for transition (fades, etc) the algorithm i´m using fails. Basically a scene can be detected comparing the difference of 2 frames. The difference is achieved calculating the Minimum Standard deviation of the Light/Luma values from one frame and the other. So, we compute the STD on each frame, and simply subtract one from another (with xor to Potentiate the differences and not a simple sub). 2 frames are different completelly between each other when the difference of the minimum STD between them is positive, but....when we deal with soft cut scenes is where the algo fails and is where i´ll try to use Phash on it. Phash uses matrix manipulation internally achieved from Cimg library that it uses for loading the images to be compared.

I believe that Phash can be used as a replacement for the scene detection algorithm i´m currently creating. the advantage of using Phash is that we may not be limited to scene detection. A wide range of things can be done with this algorithm (for video and image processing and also for audio)

the phash can be found here. http://phash.org  But...as i said before it uses a crappy Cimg library and, at the end, it is incredible slow compared to what we are doing. it is impossible to use the current version of Phash to identify a full video, for example. It would take hours to complete, where as if we simplify the algo you could process it completelly in a matter of minutes, and also use whatever other image library you wish. On this way you are not forced to use CImg all the time, but you can use any other that you want.

Quote0.005 milliseconds for transposing a 32bit 720 by 480 pixels image, try to beat that with CPU coding.

Yes..that´s fast, but i wonder if the performance is the same when using it for videos. If it is faster then what we are doing, then it´s, ok..to we use :)  But...i didnp´t measured it. The timmings i had for your previous algo were in nanoseconds. (273 nanosecs, in average was the timming i´ve got on your previous function. About 0,000273 miliseconds for that function.)

I don´t know how to measure separately the matrix manipulation for DX to see if it beats your previous work or not, but, probably DX function used to manipulate the matrix can´t beat your previous work. I mean, if you take the function on DX responsable for matrix manipulation (transpose, flip etc) isolated and compare to the function you did, i doubt DX is faster then yours.
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

guga

#49
I´m not sure if it became clear, but, basically what i´m trying to do is:

a) the user access the pixel data with whatever method he wants (DX, LoadBitmap, GDIPlus, etc etc)

Once he get the pointers to the image pixels and know the width, height (and perhaps pitch, on case of videos) he simply do this:

b) uses the pointers to create the phash of a image to be compared with the other he already loaded

the problem relies on Phash that load the image with that crappy CImg library which uses matrix transposition internally and react according to the width and height. But...in fact, we don´t need that PHash load the image, we only needs the minimum necessary (the true algorithm) used to create the hash and a few functions to manipulate the matrixes (the pixels we previously loaded) in order to we create a convolution function to retrieve the hash.  So, how the image will be loaded to pass the pixel data pointer to Phash algo  is up to the user.

Sure, once we create the matrix manipulation functions, they can be used elsewhere with Phash or not, but since we need a minimum of matrix manipulation for Phash, it worth creating them.

Phash works like this:


int ph_dct_imagehash(const char* file,ulong64 &hash){

    if (!file){
        return -1;
    }
    CImg<uint8_t> src;
    try {
        src.load(file);
    } catch (CImgIOException ex){
        return -1;
    }
    CImg<float> meanfilter(7,7,1,1,1);
    CImg<float> img;
    if (src.spectrum() == 3){
        img = src.RGBtoYCbCr().channel(0).get_convolve(meanfilter);
    } else if (src.spectrum() == 4){
        int width = img.width();
        int height = img.height();
        int depth = img.depth();
        img = src.crop(0,0,0,0,width-1,height-1,depth-1,2).RGBtoYCbCr().channel(0).get_convolve(meanfilter);
    } else {
        img = src.channel(0).get_convolve(meanfilter);
    }

    img.resize(32,32);
    CImg<float> *C  = ph_dct_matrix(32);
    CImg<float> Ctransp = C->get_transpose();

    CImg<float> dctImage = (*C)*img*Ctransp;

    CImg<float> subsec = dctImage.crop(1,1,8,8).unroll('x');;

    float median = subsec.median();
    ulong64 one = 0x0000000000000001;
    hash = 0x0000000000000000;
    for (int i=0;i< 64;i++){
        float current = subsec(i);
        if (current > median)
            hash |= one;
        one = one << 1;
    }

    delete C;

    return 0;
}


All of the CImg crap we actually don´t need. All we need from it is the minimum matrix manipulation functions and convolution to work directly on the pixel data we already got. (because we already got the pixel data with whatever other method we choose). And, to make things a bit easier, we actually don´t even need RGBtoYCbCr because PHash uses only Luma (Y) and it is up to the user choose whatever method he wants to retrieve the Luma values. Probably all we need is the pointers to the pixel data already converted to Luma.

Now..imagine this new PHash being used as a video plugin  :icon_cool:... It can do amazing things on a faster and more reliable way.
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

Siekmanski

Don't know for sure if i understand it well.

1. You need an image loaded and encoded to raw image data in memory ?
2. PHash ( don't know what it is, or does ) the raw data ?
3. Where comes the matrix and convolution stuff ?

You told me it is for video editing and making plugins to create video effects right ?
Am i right that you need to fetch each picture from a movie, let the effect doing its work and save it back in the movie ?

I have a little trouble understanding everything you want to achieve.
Creative coders use backward thinking techniques as a strategy.

guga

Let´s do like Jack the ripper and go in parts :)  :icon_mrgreen:

About VDub functionality:

Quote1. You need an image loaded and encoded to raw image data in memory ?

Yes..i needed to know the pointer to the pixels of the image that was loaded (with whatever method, api etc...DX, GdiPlus...etc)
This is the easier part, since Vdub (that is used to edit videos) get me access to the pixels on each frame directly. So i can know exactly the width, height, pitch of a image that belongs to a certain frame of a video.

This is done on a structure called VFBitmap which i ported to Asm onto:

; RosAsm port of VirtualDub's VFBitmap structure (one frame's image data).
[VFBitmap:
VFBitmap.pVBitmapFunctions: D$ 0 ; Pointer inside VDub (vtable). An array of offsets; points to deprecated VBitmap functions.
VFBitmap.data: D$ 0    ; Pixel32 (data of the image). Pointer to start of _bottom-most_ scanline of plane 0.
VFBitmap.palette: D$ 0 ; Pointer to palette (reserved - set to NULL).
VFBitmap.depth: D$ 0   ; Image depth. Same as biBitCount in BITMAPINFOHEADER, but this is a dword.
VFBitmap.w: D$ 0       ; The width of the bitmap, in pixels. Same as in BITMAPINFOHEADER.
VFBitmap.h: D$ 0       ; The height of the bitmap, in pixels. Same as in BITMAPINFOHEADER.
VFBitmap.pitch: D$ 0   ; Distance, in bytes, from the start of one scanline in plane 0 to the next. (Bitmaps can be stored top-down as well as bottom-up. The pitch value is positive if the image is stored bottom-up in memory and negative if the image is stored top-down.)
VFBitmap.modulo: D$ 0  ; Distance, in bytes, from the end of one scanline in plane 0 to the start of the next. (Positive or zero if the image is stored bottom-up, negative if top-down. Zero means bottom-up with no padding between scanlines. For a 32-bit bitmap, modulo = pitch - 4*width.)
VFBitmap.size: D$ 0    ; The size, in bytes, of the image. Size of plane 0, including padding. Same as in BITMAPINFOHEADER.
VFBitmap.offset: D$ 0  ; Offset from beginning of buffer to beginning of plane 0.
VFBitmap.dwFlags: D$ 0 ; Set in paramProc if the filter requires a Win32 GDI display context for a bitmap.
                        ; (Deprecated as of API V12 - do not use) NEEDS_HDC  = 0x00000001L,
VFBitmap.hdc: D$ 0]    ; A handle to a device context.


So, the member "data" from VFBitmap structure points to the start of the pixels in memory; (In general, they are in RGB8888 format, which is easy to convert to RGBQUAD - I already done this part)

This part of the code to retrieve the pixels in memory are already done (in case with Vdub that loaded the video and granted me access to each frame containing the pixels to be manipulated)

This is the easier part.

In Vdub Images are passed in and out of video filters through the VFBitmap structure. Each VFBitmap represents an image as follows:

The image is stored as a series of sequential scanlines, where each scanline consists of a series of pixels.

Since the video filter system works with 32-bit bitmaps, scanlines are guaranteed to be aligned to 32-bit boundaries. No further alignment is guaranteed. In particular, filter code must not assume that scanlines are 16-byte aligned for SSE code.

It is important to note that there may be padding between scanlines. The pitch field indicates the true spacing in bytes, and should be used to step from one scanline to the next

All of this is how VDub access and handle the image data in memory. This part, in general, i already did. But, i´m having problems only to understand the scanline stuff, because when manipulating the images from Matrix_Transpose, for example, the width and height of the resultant image was weird as you saw on the image i posted earlier. But...i think i found how to make it work properly. It seems that i didn´t configured properly the way the layout can be displayed (I´m currently working on it to see if i can fix the transposing mode)

Why i´m doing this with Vdub ? because with VDub i´ll then use the Phash algo to identify scenes from a video i´ll load on it.

Now....about Phash.

Quote2 - PHash ( don't know what it is, or does ) the raw data ?

Yes...but....Phash as it is written is bloated because it loads the image for you and do all the fancy stuff with the convolution and matrix manipulation using a library called Cimg.  The main problem is that, it is insanelly slow for video processing (and also for images, btw), although it is incredible accurate.

I´m posting it here a small example of Phash being used. The source code is embedded (RosAsm file), but it is simply this:


; Similarity threshold: hashes closer than 50% are considered different.
[Float_Half: R$ 0.5]

; Outputs of the two pHash calls and the derived similarity score.
[ImgHash1: R$ 0]
[ImgHash2: R$ 0]
[Similarity_Result: R$ 0]
[Float_64: R$ 64.0]           ; a pHash is 64 bits, so max hamming distance is 64
[Float_Thrsehold: R$ 0.85]    ; (sic) alternate threshold, unused below

Main:

; Hash both images, then get the hamming distance between the hashes.
C_call 'pHash.ph_dct_imagehash' {B$ "Img1.jpg", 0}, ImgHash1
C_call 'pHash.ph_dct_imagehash' {B$ "Img2.jpg", 0}, ImgHash2
C_call 'pHash.ph_hamming_distance' ImgHash1, ImgHash2
mov D$Similarity_Result eax
; similarity = 1 - distance/64
fld1 | fild F$Similarity_Result | fdiv R$Float_64 | fsubp ST1 ST0 | fstp R$Similarity_Result

; Report a match when similarity >= 0.5.
Fpu_If R$Similarity_Result >= R$Float_Half
    call 'USER32.MessageBoxA' 0, {B$ 'Images are similar', 0}, {B$ 'PHash test', 0}, &MB_YESNO
Fpu_End_If

call 'Kernel32.ExitProcess' 0


The functionality and example is explained here:
http://cloudinary.com/blog/how_to_automatically_identify_similar_images_using_phash

and here explain in more details the technical functionality.
http://www.hackerfactor.com/blog/?/archives/432-Looks-Like-It.html


Quote3. Where comes the matrix and convolution stuff ?
The matrix and convolution comes from PHash itself. Internally it creates a matrix to be used as a mask for later build the convolution, in order to make the hashes for each image that is being compared.

The matrix and convolution functions used in PHash (That is open source, i.e, we have access to the source code to read it and learn how it works exactly) are available freely at phash.org.

http://www.phash.org/releases/win/pHash-0.9.4.zip

And here are the technical aspects of Phash too and the guide of usage:

http://www.phash.org/docs/design.html
http://www.phash.org/docs/howto.html


The major problem is that... to work, pHash uses a bloated Cimg library to load the image and create the matrix and convolutions routines in order to the algo can produce the hash for each image. (The part of the code i posted in the other post on this thread)

How does the comparison work? The final comparison is kinda easy to understand. After having the hashes, all you need is to compare the hash found on one image with the hash found on another. If the difference is above 50%, then we have a similarity (the bigger the value, the more similar the images are). Below 50%, the images are definitely different.

How to solve the speed problem in order to use phash on videos (or regular images or audio) on a fast and more reliable way ? Creating our own set of matrix and convolution functions to work on the pixels that was already loaded in memory, instead having to create several functions to load the image or using external bloated libraries to do it for us.

PHash as it is without we fix it, is simply not usefull for video detection, because it is slow as hell, despite it´s high level of accuracy. So, the goal is recreate Phash using our own set of matrix and convolution functions, instead being forced to use bloated Libraries that does a terrible job internally resulting on a algo impossible to be used for video manipulation or even  image manipulation in general. We need to recreate a phash that don´t load a image, don´t uses bloated libraries. We need one that simply take the pixel data from a image previously loaded in memory and compute the hash of it.

So, the matrix and convolutions functions we needs basically to manipulate directly the pixel data of a certain image (Which was previously loaded no matter in what method/api used), and we need only to feed the functions only with the pixel pointer, height and width (and perhaps pitch/scanline, since it seems to be necessary sometimes for vdub). We are not using the matrix and convolution functions to load the image, we are using them to manipulate the pixel data already loaded in memory, so the method chosen to load the image is not what matters, since the important is we have access to the pixel and we manipulate them directly.

Since the images can be on any size, the matrix manipulation and convolution functions needs to work on any size (squared or not), because we can have images that have different width and height.


QuoteYou told me it is for video editing and making plugins to create video effects right ?
yes :)

QuoteAm i right that you need to fetch each picture from a movie, let the effect doing its work and save it back in the movie ?
yes :) :)

But, all of the functions responsible for saving the video back, loading it etc, are already done by the main app (VDub). Basically it is a plugin that take the pixel data loaded by Vdub and manipulate the pixels directly. In case, it is a plugin i´m creating to identify the scenes of a video using a algorithm called Phash that uses matrixes functions and convolution to find the hash on each image/frame.

And, since for creating the phash algo, we will need to build the matrices and convolutions functions, those functions can be also used later on others plugins or apps that direct manipulate pixels from memory.
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

Siekmanski

QuoteLet´s do like Jack the ripper and go in parts :)  :icon_mrgreen:
:lol:

The easiest and most flexible way to load and get access to the image data and even control the format you want no matter the pixel format of the original image, is this small piece of GDIplus code.

; MASM snippet: load an image with GDI+ and lock its pixels in a chosen
; format, so the raw bitmap data can be processed via BitmapData.Scan0.
.const

;the formats you may need
PixelFormat1bppIndexed          equ 30101h
PixelFormat4bppIndexed          equ 30402h
PixelFormat8bppIndexed          equ 30803h
PixelFormat16bppGreyScale       equ 101004h
PixelFormat16bppRGB555          equ 21005h
PixelFormat16bppRGB565          equ 21006h
PixelFormat16bppARGB1555        equ 61007h
PixelFormat24bppRGB             equ 21808h
PixelFormat32bppRGB             equ 22009h
PixelFormat32bppARGB            equ 26200Ah
PixelFormat32bppPARGB           equ 0E200Bh

; Mirrors GDI+'s BitmapData: filled in by GdipBitmapLockBits.
BitmapData struct                         
    dwWidth      dd ?
    dwHeight     dd ?   
    Stride       dd ?   ; bytes per scanline, including padding
    PixelFormat  dd ?   
    Scan0        dd ?   ; pointer to the raw bitmap data
    Reserved     dd ?
BitmapData ends

.data?
GDIplusBitmapData BitmapData <?>
pImage dd ?
GdiplusToken dd ?

.code
    ; start GDI+, load the file, and lock the bits as 32bpp ARGB
    invoke  GdiplusStartup,offset GdiplusToken,offset GdiplusInput,NULL
    invoke  GdipCreateBitmapFromFile,offset FilenameW,addr pImage
    invoke  GdipBitmapLockBits,pImage,NULL,1,PixelFormat32bppARGB,offset GDIplusBitmapData

; do your stuff here on the bitmap data
    mov     esi,GDIplusBitmapData.Scan0     ; pointer to the bitmap data
    mov     ecx,GDIplusBitmapData.dwHeight
    mov     edx,GDIplusBitmapData.dwWidth
    add     esi,GDIplusBitmapData.Stride    ; jump to the next scan line etc.

; close everything
    invoke  GdipBitmapUnlockBits,pImage,offset GDIplusBitmapData
    invoke  GdipDisposeImage,pImage
    invoke  GdiplusShutdown,GdiplusToken


I think you still have a lot to do to get that PHash routine working the way you describe.  :t
Creative coders use backward thinking techniques as a strategy.

guga

#53
HI marinus

Thanks for the code. I´ll give it a try and see how it works with Vdub. A few questions about gdiplus: since Vdub already retrieves the pixel data, how can we use gdiplus on the bitmap data rather than its filename? Can GdipCreateBitmapFromScan0 (returning GpStatus) do it?

For now, i´m asking this out of curiosity, because i don´t believe we need a routine to work with GdiPlus right now, since Vdub already retrieves the pixel data and format for us to work with. That is a good thing, because it is one less step to do :)  Maybe we can use this GdiPlus code only for testing purposes while i´m also testing the main matrix functions on vdub to see if everything works ok

QuoteI think you still have a lot to do to get that PHash routine working the way you describe.  :t
Yep :)  :bgrin: :bgrin:

That´s why i´m trying to do it in steps. The first ones are basically these:
a) Create some manipulation matrix functions to work in whatever size
b) create the convolution function.

Once those 2 steps are done, i can then start analysing the internal routines of PHash itself. I believe that there will be needed a few more steps after the convolution is done. Perhaps i´ll need to create only 4 or 5 functions in order to retrieve Phash after the convolution is done, but 1st i need to see if the matrix and convolution functions are ok and have the same result as the ones produced by the internal Cimg routines inside Phash itself .

One of the routines used by Phash i already succeeded to convert and port: the hamming_distance function. Although i haven´t tested it yet for speed or optimization, because the important thing at this stage of development is to make everything work first :)


So we can start with your older functions to start testing. Some of them i converted but unsure if will work as expected on Vdub on all layout formats (Switching Height x width, for example)

The 1st one we can test is Matrix_FlipX (then later we can work on Matrix_FlipY and Matrix_FlipXY). I remade it without the stride using the examples that you, jochen and Aw provided. It works, but...i´m not sure if it will work in all cases, because it is not using the stride/pitch. How can we add the stride/pitch to it ?


; Matrix_FlipX — horizontally mirrors a 32-bpp image, row by row.
; Input : @Input  = source buffer, @Output = destination buffer (distinct),
;         @Width/@Height in pixels. Rows are assumed tightly packed
;         (stride == Width*4); no pitch/stride parameter is supported.
; Works 4 pixels at a time: loads an XMM word, reverses the 4 dwords with
; pshufd order 27 (0,1,2,3 -> 3,2,1,0) and stores them mirrored from the
; right edge of the destination row, then handles the 1..3 leftover pixels.
; Clobbers eax (esi/edi/ebx/ecx/edx are preserved via Uses).
; NOTE(review): the remainder path always reads a full 16 bytes (movdqu)
; even when fewer than 4 pixels remain — on the last row this can read past
; the end of the source buffer; confirm the buffer is padded or over-allocated.
Proc Matrix_FlipX:
    Arguments @Input, @Output, @Width, @Height
    Local @MaxXPos, @MaxYPos, @Remainder, @AdjustSmallSize, @NextScanLine
    Uses esi, edi, ebx, ecx, edx

    mov esi D@Input
    mov edi D@Output

    ; How many xmm moves per row are required;
    mov eax D@Width
    mov edx eax
    mov ebx eax                         ; NOTE(review): this copy of Width in ebx is never used (ebx reloaded below)
    and edx 3 | mov D@Remainder edx     ; Remainder = Width mod 4 (leftover pixels per row)
    shr eax 2 | mov D@MaxXPos eax       ; MaxXPos = Width / 4 (full XMM stores per row)

    mov eax D@Height | mov D@MaxYPos eax    ; row counter

    ; Compute how far the first destination store sits from the row end:
    ; 16 bytes back when at least one full XMM store happens, else 4.
    ; AdjustSmallSize (12) rebiases the remainder stores in that case.
    mov D@AdjustSmallSize 0
    mov ebx 4 ; case of less than 4 columns
    If D@MaxXPos > 0
        mov ebx 16 ; subtract enough to fill one xmm register
        mov D@AdjustSmallSize 12
    End_if

    ; NextScanLine = Width*4 bytes; edi -> last XMM-store slot of row 0.
    mov eax D@Width | shl eax 2 | mov D@NextScanLine eax |  sub eax ebx
    mov edi D@Output | add edi eax

L1:
        mov ebx D@MaxXPos
        ; source points to the start of every row
        mov eax edi                     ; eax walks the destination row right-to-left
        mov ecx esi                     ; ecx walks the source row left-to-right
        test ebx ebx | Jz L4>           ; fewer than 4 columns: skip the XMM loop

        L0:
            movdqu xmm0 X$ecx           ; load 4 source pixels
            pshufd xmm0 xmm0 27         ; reverse the 4 dwords (27 = 00_01_10_11b)
            movdqu X$eax xmm0           ; store mirrored at the right end
            sub eax 16
            add ecx 16
            dec ebx | jg L0<

L4:
        ; Store the 1..3 leftover pixels individually, shifting xmm0 right
        ; 4 bytes at a time to expose the next pixel in the low dword.
        mov ebx D@remainder
        test ebx ebx | jz L3>
            mov edx D@AdjustSmallSize
            movdqu XMM0 X$ecx | movd D$eax+edx XMM0
            dec ebx | jz L3> | PSRLDQ xmm0 4 | movd D$eax+edx-4 XMM0
            dec ebx | jz L3> | PSRLDQ xmm0 4 | movd D$eax+edx-8 XMM0
        L3:

         add edi D@NextScanLine         ; advance both pointers one packed row
         add esi D@NextScanLine

        dec D@MaxYPos | jg L1<          ; next row

EndP


I used the pitch to copy the buffer on another function i´m using Vdub. This one uses the pitch information. This was done after Matrix_X worked. It is this:


; CopyImageBuffer — copies a 32-bpp image of @Width x @Height pixels from
; @Input to @Output, honouring the row pitch: both buffers are laid out with
; @Pitch bytes per scan line, and only the first Width*4 bytes of each row
; are copied (any padding bytes past the pixels are left untouched).
; Clobbers esi only through Uses-saved registers; all listed registers are
; preserved on exit.
Proc CopyImageBuffer:
    Arguments @Input, @Output, @Width, @Height, @Pitch
    Local @CurYPos
    Uses eax, ebx, ecx, edx, edi

    ; ebx = byte offset of the current row inside both buffers.
    xor ebx ebx
    mov eax D@Height | mov D@CurYPos eax
    .Do
        ; Point ecx at the source row and eax at the destination row.
        mov ecx D@Input | add ecx ebx
        mov eax D@Output | add eax ebx
        mov edx D@Width
        Do
            ; Move one pixel (dword) through edi.
            mov edi D$ecx | mov D$eax edi
            add ecx 4 | add eax 4
            dec edx
        Loop_Until edx = 0
        ; Step to the next scan line using the pitch, not Width*4.
        add ebx D@Pitch
        dec D@CurYPos
    .Loop_Until D@CurYPos = 0

EndP


But..i wonder, if using Pitch is really necessary to be inserted inside the main loop. (or, if it is really necessary at all, btw)

Also...maybe, once the routines are ok (with or without the pitch) we can see if it could be faster using the same Input as the output. This way we can make variations of the matrix functions where the Input and the output are the same, so we can test the speed. Ex: we can create a sort of Matrix_FlipXEx containing only 3 arguments: Input, Width, Height (or also the pitch if needed), where the input will be used also as the output. This is to prevent the need of using another function to copy the contents of the Output to another buffer.

On this way we may have at the end only 6 major functions to manipulate the matrix, instead of 3.
    Matrix_FlipX - > input and output buffers are distincts
    Matrix_FlipXEx - > input is used to output
    Matrix_FlipY - > input and output buffers are distincts
    Matrix_FlipYEx - > input is used to output
    Matrix_FlipXY - > input and output buffers are distincts
    Matrix_FlipXYEx - > input is used to output
Coding in Assembly requires a mix of:
80% of brain, passion, intuition, creativity
10% of programming skills
10% of alcoholic levels in your blood.

My Code Sites:
http://rosasm.freeforums.org
http://winasm.tripod.com

Siekmanski

I posted the code because you didn't want to use the bloated Cimg library.  :biggrin:

I'm getting the grasp now of your intentions.

1. you need 2 images. ( size may differ )
2. load them to system memory as raw bitmap data.
3. apply a matrix function on both images.
4. apply a convolution. ( this part is still fuzzy to me, what mask etc. )
5. create a PHash value for both images and compare the 2 values.
6. let it run as fast as possible.

Are those the steps to be done and are they in the correct order?

The best way is to come up with a good strategy, that is to work backwards and design the best algorithms for an optimal situation.
This way you can control, how the data needs to be organized when loading the images.

For example:
- What input is needed to make the PHash routine happy.
- Is it faster to transpose a complete image in video memory and copy it back to system memory,
  or calculate it in system memory?
- Is it faster to get rid of the difference between stride/pitch and the actual bitmap width?
- Is it faster to add zeros to the horizontal bitmap lines that are not multiples of 4 pixels?
- Try to do as much inner-loop coding on 1 cache line ( < 64 byte ) and align that code for fast execution.
- Align the data for fast reading and writing.
- Thus create the best situation to perform the fastest code possible.

In fact what i meant with working backwards, you don't need to add extra code to adjust things to make it work.
Your code is prepared for the next step.
Creative coders use backward thinking techniques as a strategy.