Author Topic: NewSphinxCmm Examples  (Read 1025 times)

Emil_halim

  • Member
  • **
  • Posts: 99
NewSphinxCmm Examples
« on: May 15, 2017, 02:36:38 AM »
Hi all,

Here is another example that shows how to mix C-- and Masm code.

NoteWell:
=======
1-  NewSphinxC extends the extern keyword so that it accepts a block of external declarations.

Code: [Select]
extern {
  cdecl GetStrLen();   // declares external function
  cdecl Ten();             // declares external function
  byte  buffer;            // declares external data
  byte  hw;                // declares external data
}
   

2- Use the EXTERNDEF keyword inside the Masm block to declare data or functions that are defined outside the Masm code block and that you intend to use inside it.

3- The Masm code block starts with the .model directive; its 'c' term selects the calling convention.

4- MasmPlgIn allows some C-- features inside a Masm code block.
Code: [Select]
EAX := -1
EAX++ 

here the demo code
Code: [Select]
/*************************************
*         New Sphinx Cmm             * 
*                                    *
*           masm  test2              *
*                                    *
*         by Emil_halim              *         
*                                    *
*************************************/
#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization

#pragma option dbg
#pragma option lst

#Entry  main
#includelib  win32.lib MSVCRT.lib

// $ will replaced with SphinxC-- main path
#includepath "$\winlib" 
 
#include <windows.h> 
#include <MSVCRT.H-->

// Tell Cmm about the code and data symbols that are defined in the masm
// section further down (between the ^^ markers).
//-------------------------------------------
extern {
  cdecl StCpy();        // masm FASTPROC: string copy, called as StCpy(dest, src)
  cdecl GetStrLen();    // masm proc: length of a NUL-terminated string
  cdecl Ten();          // masm routine: returns 10 + cmm_val
  byte  buffer;         // masm .data? destination buffer
  byte  hw;             // masm .data source string
}

// declare some Cmm variables
// cmm_buffer receives a copy of cmm_hw through StCpy(), which does no bounds
// checking, so it must hold the whole string plus its NUL terminator
// ("Hello World from cmm" needs 21 bytes; 32 leaves some headroom).
cmm_buffer: $DB 32 dup 0
char* cmm_hw = "Hello World from cmm";

// read by the masm routine Ten() through EXTERNDEF
int cmm_val = 100;

// masm code start here
^^
  .MODEL flat, c   
  .nolist
 
   EXTERNDEF  cmm_val:SDWORD

.data?
; destination of StCpy(#buffer,#hw): StCpy does no bounds checking, so this
; buffer must hold all 22 bytes of hw (21 characters + NUL); 32 leaves headroom
buffer db 32 dup(?) ; destination

.data
hw db "Hello World from masm", 0 ; source

.code

// Gets the length of a string (not including the NULL terminator).
// _str points at the string; the length is returned in EAX.
// ECX is used as scratch and is not preserved.
GetStrLen Proc ,_str:PTR
MOV ECX,_str // Move source pointer to ECX
EAX := -1 // C-- style assignment: start at -1 so the loop can increment before it tests
  next_char:
EAX++      // C-- style increment: EAX is now the index of the byte tested next
CMP byte PTR[ECX+EAX],0
JNE next_char // If BYTE is not equal to NULL process next .
RET // Returns string length in EAX
GetStrLen EndP

// Ten: takes no arguments and returns 10 + cmm_val in EAX.
// cmm_val is the Cmm variable made visible by the EXTERNDEF line of this masm block.
Ten:
   mov EAX,10
   add EAX,cmm_val
   ret
   
// StCpy(dest, src): copies the NUL-terminated string at src, terminator
// included, to dest. There is no length argument and no bounds check, so
// dest must be large enough for the whole string.
// Arguments are read straight from the stack; ESI and EDI are saved and
// restored, AL is 0 on return.
// NOTE(review): lodsb/stosb walk forward only if the direction flag is clear - assumed, not set here.
FASTPROC StCpy
  push esi        
  push edi
  mov edi, [esp+8+4] // dest from stack (4 bytes each): +8 skips the two pushes, +4 the return address
  mov esi, [esp+8+8] // src from stack
  .Repeat
  lodsb              // AL = byte at [ESI], ESI advances
  stosb              // byte at [EDI] = AL, EDI advances
  .Until al==0       // stop once the terminator itself has been copied
  pop edi
  pop esi
  ret   
END FASTPROC   

^^
// masm code end here


main()
{
   dword len, ten;

   // string length computed by the masm procedure
   len = GetStrLen("CmmPro Is the best");
   printf("string length = %d \n" , len);

   // copy a Cmm string into a Cmm buffer with the masm copy routine
   StCpy(#cmm_buffer,cmm_hw);
   puts(#cmm_buffer);

   // same copy, this time between two objects that live in the masm block
   StCpy(#buffer,#hw);
   puts(#buffer);

   // masm code reading the Cmm variable cmm_val
   ten = Ten();
   printf("Ten = %d\n",ten);

   system("pause");
}


Emil_halim

  • Member
  • **
  • Posts: 99
Re: NewSphinxCmm Examples
« Reply #1 on: May 15, 2017, 02:38:29 AM »
Hi All,

Here is another Example that mix SphinxC and HJWasm.

Some rules you have to take care of:

1- A Masm block of code must start with the term '^^' and end with the same term.
2- The Masm block is extracted from the SphinxC source, assembled separately and later linked with alink.
3- This whole process is carried out by MasmPlgIn; you do not have to use the cpu directive.
4- MasmPlgIn specifies the path of the include directory.
5- Take care of the function calling convention; a mismatch may crash your code.


the next masm code was taken from http://www.masmforum.com/board/index.php?topic=14696.105
it determines the cpu of computer.

Code: [Select]
/*************************************
*         New Sphinx Cmm             * 
*                                    *
*           masm test                *
*                                    *
*         by Emil_halim              *         
*                                    *
*************************************/

#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization

#pragma option dbg
#pragma option lst

#Entry  main
#includelib  win32.lib MSVCRT.lib ole32.lib

// $ will replaced with SphinxC-- main path
#includepath "$\winlib" 
 
#include <windows.h> 
#include <MSVCRT.H-->

// tells SphinxC about masm ShowCpu function
extern stdcall ShowCpu(dword v);

^^ //start of Masm code
.nolist
include masm32rt.inc

include m32lib/dwtoa.asm
include m32lib/stdout.asm

.code
;//  masm code taken from the next link
;//  http://www.masmforum.com/board/index.php?topic=14696.105
;//
;//  ShowCpu(mode) - stdcall, one DWORD argument. The parameter is commented
;//  out of the PROC line and is reached through ESP-relative addressing, which
;//  is why the routine ends with an explicit "ret 4".
;//  Returns in EAX the number of tested SSE feature bits that are set (0..4).
;//  When mode is 1 it also prints the CPU brand string and that number.
;//  Every other register is restored by popad.
ShowCpu proc stdcall ; mode:DWORD
COMMENT @ Usage:
  push 0, call ShowCpu ; simple, no printing, just returns SSE level
  push 1, call ShowCpu ; prints the brand string and returns SSE level @
 
  pushad          ; saves 8 registers = 32 bytes; the saved EAX sits at [esp+28]
  sub esp, 80     ; create a buffer for the brand string
  mov edi, esp ; point edi to it
  xor ebp, ebp ; ebp = leaf index 0..2
  .Repeat
  lea eax, [ebp+80000002h]
db 0Fh, 0A2h ; cpuid 80000002h-80000004h
stosd        ; store eax, ebx, ecx, edx in that order: 16 bytes per leaf
mov eax, ebx
stosd
mov eax, ecx
stosd
mov eax, edx
stosd
inc ebp
  .Until ebp>=3
  push 1
  pop eax
  db 0Fh, 0A2h ; cpuid 1
  xor ebx, ebx ; CpuSSE
  xor esi, esi ; add zero plus the carry flag
  bt edx, 25 ; edx bit 25, SSE1
  adc ebx, esi
  bt edx, 26 ; edx bit 26, SSE2
  adc ebx, esi
  bt ecx, esi ; ecx bit 0, SSE3
  adc ebx, esi
  bt ecx, 9 ; ecx bit 9, SSE4 - NOTE(review): Intel documents CPUID.1:ECX bit 9 as SSSE3, confirm the label
  adc ebx, esi
  dec dword ptr [esp+4+32+80] ; dec mode in stack: 80 = buffer, 32 = pushad, 4 = return address
  .if Zero?
mov edi, esp ; restore pointer to brand string
  .Repeat
.Break .if byte ptr [edi]!=32 ; mode was 1, so show a string but skip leading blanks
inc edi
.Until 0
.if byte ptr [edi]<32
print chr$("pre-P4")
.else
print edi ; CpuBrand
.endif
.if ebx
print chr$(32, 40, "SSE") ; info on SSE level, 40=(
print str$(ebx), 41, 13, 10 ; 41=)
.endif
  .endif
  add esp, 80 ; discard brand buffer (after printing!)
  mov [esp+32-4], ebx ; move ebx into eax stack position - returns eax to main for further use
  popad
  ret 4
ShowCpu endp

^^ //End of Masm code


;//***********************************************************************************//


// Entry point: runs the masm ShowCpu routine once in printing mode, then
// waits for a key so the console window stays open.
main()
{
   ShowCpu(1); // print brand string and SSE level
     
   system("pause");   
 
}

Emil_halim

  • Member
  • **
  • Posts: 99
Re: memory-copy-benchmarks
« Reply #2 on: May 17, 2017, 04:20:02 AM »
Hi All,

Here is another Example from PowerBasic Forum written by Steve Hutchesson,

I have converted his example to C-- & Masm: the procedures were converted to Masm and the main code to C--.

Code: [Select]
/*************************************
*         New Sphinx Cmm             * 
*                                    *
*     memory-copy-benchmarks         *
*                                    *
*      from powerbasic Forum         *         
*                                    *
*************************************/


#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization

#pragma option dbg
#pragma option lst

//#pragma option upx-

#Entry  main
#includelib  win32.lib MSVCRT.lib ole32.lib

// $ will replaced with SphinxC-- main path
#includepath "$\winlib" 
 
#include <windows.h> 
#include <MSVCRT.H-->

#pragma option ia

// tells SphinxC about the two masm copy routines defined in the masm block below
extern {
   cdecl SSEcopy(dword src,dword dst,dword blen);    // copies in 64 byte blocks
   cdecl SSEcopy2(dword src,dword dst,dword blen);   // copies in 128 byte blocks
}
//start of Masm code
^^
  .MODEL flat, c   
  .nolist
.data

pflead dd 0

.code
;//  masm code taken from the next link and created by Steve Hutchesson.
;//  https://forum.powerbasic.com/forum/user-to-user-discussions/powerbasic-inline-assembler/43459-memory-copy-benchmarks

; SSEcopy(src, dst, blen) - C calling convention
;   Copies blen bytes from src to dst: whole 64 byte blocks go through
;   XMM0-XMM3 (aligned loads, non temporal stores), any remaining bytes are
;   copied one at a time.
;   src and dst must both be 16 byte aligned whenever blen >= 64, because
;   movdqa and movntdq fault on unaligned addresses.
;   ESI, EDI and EBX are preserved (USES) as the C convention requires;
;   EAX, EDX, XMM0-XMM3 and the flags are clobbered.
SSEcopy proc c uses esi edi ebx, src:DWORD,dst:DWORD,blen:DWORD

     mov esi, src
     mov edi, dst
     mov ebx, blen
     xor edx, edx                      ; zero EDX and use as INDEX
     shr ebx, 6                        ; int divide ebx by 64 = number of whole blocks
     jz lbl_tail                       ; less than one block: do not enter the block loop
                                       ; (it would wrap the counter and run 2^32 times)

  align 4
  lbl0:
     movdqa xmm0, [esi+edx]            ; 16 byte aligned reads
     movdqa xmm1, [esi+edx+16]
     movdqa xmm2, [esi+edx+32]
     movdqa xmm3, [esi+edx+48]

     movntdq [edi+edx], xmm0           ; non temporal writes
     movntdq [edi+edx+16], xmm1
     movntdq [edi+edx+32], xmm2
     movntdq [edi+edx+48], xmm3

     add edx, 64                       ; add block copy size to INDEX

     sub ebx, 1                        ; decrement loop counter
     jnz lbl0

  lbl_tail:
     mov ebx, blen                     ; test for remainder
     sub ebx, edx                      ; EBX = blen - bytes already copied (never negative)
     jz lbl2

  align 4
  lbl1:
     movzx eax, BYTE PTR [esi+edx]     ; copy remainder
     mov [edi+edx], al
     add edx, 1                        ; increment the INDEX
     sub ebx, 1                        ; decrement the loop counter
     jnz lbl1

  lbl2:
    ret
SSEcopy endp

;// SSEcopy2(src, dst, blen) - C calling convention
;//   Copies blen bytes from src to dst: whole 128 byte blocks go through
;//   XMM0-XMM7 (aligned loads, non temporal stores), any remaining bytes are
;//   copied one at a time.
;//   src and dst must both be 16 byte aligned whenever blen >= 128, because
;//   movdqa and movntdq fault on unaligned addresses.
;//   ESI, EDI and EBX are preserved (USES) as the C convention requires;
;//   EAX, EDX, XMM0-XMM7 and the flags are clobbered.
SSEcopy2 proc c uses esi edi ebx, src:DWORD,dst:DWORD,blen:DWORD

     mov esi, src
     mov edi, dst
     mov ebx, blen
     xor edx, edx                      ;// zero EDX and use as INDEX
     shr ebx, 7                        ;// int divide ebx by 128 = number of whole blocks
     jz lbl_tail                       ;// less than one block: do not enter the block loop
                                       ;// (it would wrap the counter and run 2^32 times)

  align 4
  lbl0:
    ; prefetchnta BYTE PTR [esi+edx+pflead]
    ;//  prefetcht0 BYTE PTR [esi+edx+%pflead]
    ;//  prefetcht1 BYTE PTR [esi+edx+%pflead]
    ;// prefetcht2 BYTE PTR [esi+edx+%pflead]

     movdqa xmm0, [esi+edx]            ;// 16 byte aligned reads
     movdqa xmm1, [esi+edx+16]
     movdqa xmm2, [esi+edx+32]
     movdqa xmm3, [esi+edx+48]

     movdqa xmm4, [esi+edx+64]
     movdqa xmm5, [esi+edx+80]
     movdqa xmm6, [esi+edx+96]
     movdqa xmm7, [esi+edx+112]

     movntdq [edi+edx], xmm0           ;// non temporal writes
     movntdq [edi+edx+16], xmm1
     movntdq [edi+edx+32], xmm2
     movntdq [edi+edx+48], xmm3

     movntdq [edi+edx+64], xmm4
     movntdq [edi+edx+80], xmm5
     movntdq [edi+edx+96], xmm6
     movntdq [edi+edx+112], xmm7

     add edx, 128                      ;// add block copy size to INDEX

     sub ebx, 1                        ;// decrement loop counter
     jnz lbl0

  lbl_tail:
     mov ebx, blen                     ;// test for remainder
     sub ebx, edx                      ;// EBX = blen - bytes already copied (never negative)
     jz lbl2

  align 4
  lbl1:
     movzx eax, BYTE PTR [esi+edx]     ;// copy remainder
     mov [edi+edx], al
     add edx, 1                        ;// increment the INDEX
     sub ebx, 1                        ;// decrement the loop counter
     jnz lbl1

  lbl2:
    ret
SSEcopy2 endp

^^
//End of Masm code


;//***********************************************************************************//

#define MEMLEN  1024*1024*129

// Benchmark driver: copies a MEMLEN byte buffer 100 times with SSEcopy2 and
// prints the elapsed GetTickCount() time.
main()
{
   dword hMem,tMem,aMem,dMem,tc,pass;

   // SSEcopy2 uses movdqa loads and movntdq stores, so BOTH buffers must start
   // on a 16 byte boundary. Each block is over-allocated by 16 bytes so that
   // rounding its start address up still leaves MEMLEN usable bytes.
   hMem = GlobalAlloc(GMEM_FIXED | GMEM_ZEROINIT,MEMLEN+16); // source, zero filled
   tMem = GlobalAlloc(GMEM_FIXED,MEMLEN+16);                 // destination

   if(hMem == 0)
   {
      puts("not enough memory for the source buffer");
      ExitProcess(1);
   }
   if(tMem == 0)
   {
      puts("not enough memory for the destination buffer");
      GlobalFree(hMem);
      ExitProcess(1);
   }

  // ----------------------------------------------
  // align both start addresses to a 16 byte boundary
  // ----------------------------------------------
      mov eax, hMem
      add eax, 15
      and eax, -16
      mov aMem, eax

      mov eax, tMem
      add eax, 15
      and eax, -16
      mov dMem, eax

      puts("please wait......");

      tc = GetTickCount();

      // the counter is a memory variable: ECX is a volatile register under the
      // C calling convention and must not be kept live across the call
      pass = 100;
      do
      {
         SSEcopy2(aMem,dMem,MEMLEN);
         pass--;
      } while( pass != 0 );

      tc = GetTickCount() - tc;

      printf("XMM copy 12.8 gig memory copy in %d ms\n",tc);

   // free with the handles returned by GlobalAlloc, not the aligned addresses
   GlobalFree(hMem);
   GlobalFree(tMem);
   system("pause");

}



Emil_halim

  • Member
  • **
  • Posts: 99
shuffle array
« Reply #3 on: June 17, 2017, 01:25:02 AM »
Hi All,

This is pure SphinxC-- code.

shuffle array  created by hutch and converted by me.

http://masm32.com/board/index.php?PHPSESSID=a641fd02d84ff11e8eb1ce754b88cbda&topic=5367.0

Code: [Select]
/*************************************
*         New Sphinx  Cmm            * 
*                                    *
*          shuffle array             *
*                                    *
*         from Masm Forum            *         
*                                    *
*************************************/



#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization
#jumptomain NEAR

#parsecommandline TRUE

#pragma option dbg
#pragma option lst

#pragma option upx-

#Entry __startupproc
#includelib  win32.lib MSVCRT.lib ole32.lib

// $ will replaced with SphinxC-- main path
#includepath "$\winlib" 
 
#include <windows.h> 
#include <MSVCRT.H-->

#pragma option ia


dword nrandom_seed = 0;

// nrandom(rbase): advances the global nrandom_seed and returns, in EAX, the
// new seed modulo rbase (so a value in 0 .. rbase-1).
// rbase must not be 0: the final "div rbase" would raise a divide error.
// Clobbers ECX and EDX.
// NOTE(review): the constants 16807 / 127773 / 2836 match the Park-Miller
// "minimal standard" generator computed with Schrage's method - presumably
// that is the intent.
nrandom Proc c rbase:DWORD
    mov eax, nrandom_seed

  // ****************************************
  // if bit 31 of the seed is set, add 7FFFFFFFh to it (wraps modulo 2^32)
    test eax, 80000000h
    jz  nxt
    add eax, 7fffffffh
  nxt:   
  // ****************************************

    xor edx, edx            // EDX:EAX = seed, ready for the unsigned divide
    mov ecx, 127773
    div ecx                 // EAX = seed / 127773, EDX = seed % 127773
    mov ecx, eax            // ECX = quotient
    mov eax, 16807
    mul edx                 // EAX = 16807 * remainder (low 32 bits)
    mov edx, ecx            // EDX = quotient
    mov ecx, eax            // ECX = 16807 * remainder
    mov eax, 2836
    mul edx                 // EAX = 2836 * quotient (low 32 bits)
    sub ecx, eax            // ECX = new seed
    xor edx, edx
    mov eax, ecx
    mov nrandom_seed, ecx   // remember the new seed for the next call
    div rbase               // EDX = new seed % rbase
    mov eax, edx            // return the remainder
    ret 
nrandom endp

// shuffle_array(arr, cnt): makes one pass over an array of cnt DWORDs at arr,
// swapping element i (i = 0 .. cnt-1) with an element picked by nrandom(cnt).
// cnt must be at least 1: nrandom divides by cnt, and the loop counter is
// decremented before it is tested.
// EBX, ESI and EDI are saved and restored; EAX, ECX and EDX are clobbered.
shuffle_array proc arr:DWORD,cnt:DWORD

    LOCAL lcnt  :DWORD

    lcnt = cnt;             // copy cnt to lcnt

    push ebx,esi,edi

    mov esi, arr            // ESI and EDI both point at the start of the array
    mov edi, arr
    xor ebx, ebx            // EBX = index of the element being placed

  @@:
    invoke nrandom,cnt      // get the random number within "cnt" range
    mov ecx, [esi+ebx*4]    // get the incremental pointer
    mov edx, [edi+eax*4]    // get the random pointer
    mov [esi+ebx*4], edx    // write random pointer back to incremental location
    mov [edi+eax*4], ecx    // write incremental pointer back to random location
    add ebx, 1              // increment the original pointer
    sub lcnt, 1             // decrement the loop counter
    jnz @B

    pop edi,esi,ebx

    ret

shuffle_array endp


// ltok(src, pArray): splits the NUL-terminated text at src into lines IN PLACE
// (line ends are overwritten with 0) and builds an array with one pointer per
// non-blank line. The array is allocated with GlobalAlloc and its address is
// stored at the address passed in pArray.
// ESI, EDI, ECX and EDX are modified and not restored.
ltok Proc c  src : DWORD, pArray : DWORD

    dword pTxt, pmem , bcnt;

  // ---------------------------------------------------------------
  // tokenise lines in a text source writing an array of pointers
  // to the address of "pArray" and returning the line count in EAX.
  //
  // The address written to the variable "pArray" should be released
  // within the same scope as the variable with a call to GlobalFree()
  // when the pointer array is no longer required.
  //
  // EXAMPLE
  // cnt = ltok(ptxt,ByVal VarPtr(harr))    ' tokenise source lines
  // dim tline(cnt) as ASCIIZ PTR at harr   ' treat it as an ASCIIZ PTR array.
  // .....
  // GlobalFree harr                        ' deallocate memory from "ltok"
  // ---------------------------------------------------------------
    pTxt =  src;

     mov edi, 1                      // set counter to 1 in case of no trailing CRLF

     mov esi, pTxt
     sub esi, 1
  // ----------------
  // count line feeds
  // ----------------
  @@:
     add esi, 1
     movzx edx, BYTE PTR [esi]
     test edx, edx                   // test for terminator
     jz @F
     cmp edx, 10                     // test for line feed
     jne @B
     add edi, 1                      // lf count in EDI
     jmp @B
  @@:
  // --------------------
  // multiply result by 4
  // --------------------
     add edi, edi
     add edi, edi
     mov bcnt, edi                   // bytes needed: one DWORD pointer per possible line

     pmem = GlobalAlloc(GMEM_FIXED | GMEM_ZEROINIT,bcnt);   // result is used without a failure check

     mov edi, pmem                   // copy allocated memory address into EDI
     mov esi, pTxt
     xor eax, eax                    // zero arg counter
     sub esi, 1
     jmp Ftrim

  // ---------------------------------

  Terminate:
     mov BYTE PTR [esi], 0           // terminate end of current line

  Ftrim:                             // scan to find next acceptable character
     add esi, 1
     movzx edx, BYTE PTR [esi]       // zero extend byte
     test edx, edx                   // test for zero terminator
     jz Lout
     cmp edx, 32
     jbe Ftrim                       // scan again for 32 or less

  // ---------------------------------
     mov [edi], esi                  // write current location to pointer
     add edi, 4                      // set next pointer location
     add eax, 1                      // increment arg count return value
  // ---------------------------------

  Ttrim:                             // scan to find the next CR or LF
     add esi, 1
     movzx edx, BYTE PTR [esi]       // zero extend byte
     cmp edx, 13
     jg Ttrim                        //short loop on normal case

     je Terminate
     cmp edx, 10                     // extra test for ascii 10
     je Terminate
     test edx, edx
     jnz Ttrim                       // loop back if not zero, IE TAB.

  // ---------------------------------

  Lout:
     mov esi, pArray                 // load passed handle address into ESI
     mov ecx, pmem                   // local memory handle into ECX
     mov [esi], ecx                  // store local array handle at address of passed handle
     // NOTE(review): this decrement makes the function return ONE LESS than the
     // number of pointers stored (and 0FFFFFFFFh when no line was found), which
     // contradicts "returning the line count" above - confirm whether it is intended.
     dec eax
     ret                             // return the line count
ltok endp



// FASTPROC 
// str_len: returns in EAX the length of the NUL-terminated string whose
// address is the single stack argument. The routine removes that argument
// itself (ret 4) and modifies no register other than EAX.
.code

str_len:

    mov eax, [esp+4]        // EAX = start of the string
    sub eax, 1              // back up one byte so the loop can pre-increment
  lbl:
    add eax, 1
    cmp BYTE PTR [eax], 0
    jne lbl

    sub eax, [esp+4]        // address of the terminator - start = length

    ret 4
.data


// Reports whether szFilePath names something GetFileAttributes can query.
// Returns TRUE when the call succeeds and FALSE when it returns 0xffffffff.
BOOL Exist(char *szFilePath)
{
  // failure value of GetFileAttributes: the path could not be queried
  if (GetFileAttributes(szFilePath) == 0xffffffff) return FALSE;
  return TRUE;
}

// Prints the two-line usage text of the tool to the console.
help()
{
    puts( "\nSHFLARR : shuffle text file lines to random order\n" );
    puts( "Syntax  : shflarr inputfile outputfile" );
}

// load_file(fname): reads the whole file into a malloc'ed buffer and returns
// its address; the buffer is always terminated with a 0 byte so that it can be
// scanned as a string. The caller releases it with free().
// Returns NULL when the file cannot be opened, is empty, cannot be read
// completely, or when memory cannot be allocated.
dword load_file(dword fname)
{
    unsigned long size;
    dword filehandle;
    dword buf;

    filehandle=fopen(fname,"rb");
    if(filehandle==0)return NULL;

    // obtain file size:
    fseek (filehandle , 0 , SEEK_END);
    size = ftell (filehandle);
    rewind (filehandle);
    if(size==0){
        fclose(filehandle);
        return NULL;
    }

    // one extra byte for the terminator
    buf=malloc(size+1);
    if(buf==0){
        fclose(filehandle);
        return NULL;
    }

    if(fread(buf,1,size,filehandle)!= size)
    {
        fclose(filehandle);
        free(buf);                  // do not leak the buffer on a short read
        return NULL;
    }
    fclose(filehandle);

    // terminate the text: the tokeniser scans until it meets a 0 byte
    mov eax, buf
    add eax, size
    mov BYTE PTR [eax], 0

    return buf;
}

//***********************************************************************************//

// Entry point of the line shuffler: shflarr inputfile outputfile
// Loads the input file, tokenises it into an array of line pointers, shuffles
// that array 100 times and writes the lines to the output file, one per line
// followed by CR LF.
main()
{
 
   char *ifile,*ofile;
   dword psrc,lcnt,parr,pstr,hFil;

   ifile = PARAMSTR(1);
   ofile = PARAMSTR(2);
   
   if(!Exist(ifile))
    {
        puts("Cannot find input file");
        help();
        @EXIT(0);
    }
   
   if(ofile==NULL)
    {
       puts("No output file specified");
       help();
       @EXIT(0);
    }
   
   
  // ----------------------------------------------
  // seed the random algo with a near unique number
  // ----------------------------------------------
    nrandom_seed = GetTickCount();
   
    psrc = load_file(ifile);
    if(psrc == NULL)
    {
        // load_file also returns NULL for an empty file
        puts("Cannot read input file (empty or unreadable)");
        @EXIT(0);
    }
   
  // -------------------------------
  // tokenise file into memory array
  // -------------------------------
    lcnt = ltok(psrc,#parr); 

    // ltok returns its token count minus one, so both 0 and 0xFFFFFFFF mean
    // there is nothing to shuffle; going on would divide by zero in nrandom
    // or run the loops below about 2^32 times
    if (( lcnt == 0 ) || ( lcnt == 0xFFFFFFFF ))
    {
        puts("Nothing to shuffle");
        GlobalFree( parr );
        free(psrc);
        @EXIT(0);
    }
   
    printf("Name of input file   = %s\n",ifile);
    printf("Text file line count = %d\n", lcnt);
    printf("Array shuffle count  = 100\n");
    printf("Name of output file  = %s\n",ofile);

   
  // -----------------
  // shuffle 100 times
  // -----------------
      mov esi, 100
  @@:
    shuffle_array(parr,lcnt);
      sub esi, 1
      jnz @B
  // -----------------

  // --------------------
  // write result to disk
  // --------------------
    hFil = fopen(ofile,"wb");
    if(hFil == 0)
    {
        puts("Cannot create output file");
        GlobalFree( parr );
        free(psrc);
        @EXIT(0);
    }

      mov esi, lcnt
      mov ebx, parr
  @@:
      mov eax, [ebx]
      mov pstr, eax

    fwrite(pstr,1,str_len(pstr),hFil);    // the line of text
    fwrite("\r\n",1,2,hFil);              // crlf: exactly the two bytes CR and LF
   
      add ebx, 4
      sub esi, 1
      jnz @B

    fclose(hFil);
  // --------------------

  // ------------------------------------------
  // free the memory allocated in the tokeniser
  // ------------------------------------------
    GlobalFree( parr );
    free(psrc);
}


« Last Edit: June 18, 2017, 04:50:43 AM by Emil_halim »

hutch--

  • Administrator
  • Member
  • ******
  • Posts: 4886
  • Mnemonic Driven API Grinder
    • The MASM32 SDK
Re: NewSphinxCmm Examples
« Reply #4 on: June 17, 2017, 01:28:58 AM »
 :biggrin:
hutch at movsd dot com
http://www.masm32.com    :biggrin:  :biggrin:

Emil_halim

  • Member
  • **
  • Posts: 99
sample multithread program
« Reply #5 on: June 19, 2017, 02:56:27 AM »
Hi All,

Example from msdn microsoft.

Code: [Select]
/*************************************
*         New Sphinx  Cmm            * 
*                                    *
*     sample multithread program     *
*                                    *
*         from msdn microsoft        *         
*                                    *
*************************************/

// https://msdn.microsoft.com/en-us/library/esszf9hw.aspx

//  Bounce - Creates a new thread each time the letter 'a' is typed. 
//  Each thread bounces a happy face of a different color around 
//  the screen. All threads are terminated when the letter 'Q' is 
//  entered. 

#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization

#pragma option dbg
#pragma option lst

#pragma option upx-


#includelib  win32.lib MSVCRT.lib ole32.lib


// $ will replaced with SphinxC-- main path
#includepath "$\winlib" 
 
#include <windows.h> 
#include <MSVCRT.H-->   


#pragma option ia

#define MAX_THREADS  32 
 
// The function getrandom returns a random number between   
// lo and hi (inclusive), which must be in integer range. 
// It is a real function rather than a statement macro: a macro made of
// several statements cannot be used on the right-hand side of an assignment
// ("x = getrandom(a,b)" would assign b to x and throw the random value away),
// and rand() overwrites EAX, so EAX cannot hold the range during the call.
int getrandom( int lo, int hi )
{
    int span, r;

    span = hi - lo;
    span++;                 // number of distinct values in lo..hi
    r = rand() % span;      // 0 .. span-1
    r += lo;
    return r;
}
 
int main( void );                    // Thread 1: main   
void KbdFunc( void  );               // Keyboard input, thread dispatch 
void BounceProc( dword  MyID );      // Threads 2 to n: display   
void ClearScreen( void );            // Screen clear   
void ShutDown( void );               // Program shutdown   
void WriteTitle( int ThreadNum );    // Display title bar information   
 
HANDLE  hConsoleOut;                 // Handle to the console   
HANDLE  hRunMutex;                   // "Keep Running" mutex   
HANDLE  hScreenMutex;                // "Screen update" mutex 
int     ThreadNr;                    // Number of threads started   
CONSOLE_SCREEN_BUFFER_INFO csbiInfo; // Console information 
 
// Thread one: prepares the console and the two mutexes, runs the keyboard
// loop until the user quits, then closes the handles.
int main() // Thread One   
{
    // Get display screen information & clear the screen. 
    hConsoleOut = GetStdHandle( STD_OUTPUT_HANDLE ); 
    GetConsoleScreenBufferInfo( hConsoleOut, #csbiInfo ); 
    ClearScreen(); 
    WriteTitle( 0 ); 
 
    // Create the mutexes and reset thread count. 
    hScreenMutex = CreateMutex( NULL, FALSE, NULL );  // Cleared   
    hRunMutex = CreateMutex( NULL, TRUE, NULL );      // Set   
    ThreadNr = 0; 
 
    // Start waiting for keyboard input to dispatch threads or exit. 
    KbdFunc(); 
 
    // All threads done. Clean up handles. 
    CloseHandle( hScreenMutex ); 
    CloseHandle( hRunMutex ); 
    CloseHandle( hConsoleOut ); 
}
 
// Releases hRunMutex once per started thread, counting ThreadNr down to zero,
// then takes the screen mutex and clears the display.
void ShutDown( void ) // Shut down threads   
{
    while ( ThreadNr > 0 ) 
    { 
        // Tell thread to die and record its death. 
        ReleaseMutex( hRunMutex ); 
        ThreadNr--;     
    } 
 
    // Clean up display when done 
    WaitForSingleObject( hScreenMutex, INFINITE ); 
    ClearScreen(); 
}
 
// Keyboard loop: starts one BounceProc thread each time 'A' is found pressed
// (up to MAX_THREADS) and leaves when 'Q' is pressed, then shuts the threads down.
void KbdFunc( void ) // Dispatch and count threads. 
{
    dword       tID;
    dword       hThread;
    do 
    {   
        if ( GetAsyncKeyState(VK_A)  & 0x8000  ) && ( ThreadNr < MAX_THREADS ) 
        { 
            ThreadNr++; 
            // Pass the thread number BY VALUE: BounceProc uses its parameter
            // directly as the id, and the address of the shared counter would
            // give every thread the same id (hence the same seed and colour).
            hThread = CreateThread(0,0,#BounceProc,ThreadNr,0,#tID);
            if ( hThread != 0 )
                CloseHandle( hThread );   // the handle is not needed; the thread keeps running
            else
                ThreadNr--;               // creation failed: do not count the thread
            WriteTitle( ThreadNr );
            Sleep(100);
        } 
    } while(  !GetAsyncKeyState(VK_Q)  & 0x8000   ); 
 
    ShutDown(); 
}
 
// Thread procedure: moves one "happy face" character around the console,
// reversing direction (with a beep) at the screen edges, until hRunMutex can
// be acquired. pMyID is used as the thread's id: it seeds rand() and selects
// the face character and its colour attribute.
void BounceProc( dword pMyID ) 
{
    char    MyCell, OldCell; 
    WORD    MyAttrib, OldAttrib; 
    char    BlankCell; 
    COORD   Coords, Delta; 
    COORD   Old; 
    DWORD   Dummy; 
    dword   MyID; 
 
    BlankCell = 0x20;
    Old.X = Old.Y = 0; 
    MyID = pMyID;
   
    // Generate update increments and initial   
    // display coordinates. 
    srand( MyID * 3 ); 
 
    Coords.X = getrandom( 0, csbiInfo.dwSize.X - 1 ); 
    Coords.Y = getrandom( 0, csbiInfo.dwSize.Y - 1 ); 
    Delta.X = getrandom( -3, 3 ); 
    Delta.Y = getrandom( -3, 3 ); 
 
    // Set up "happy face" & generate color   
    // attribute from thread number. 
    if( MyID > 16) 
        MyCell = 0x01;          // outline face   
    else 
        MyCell = 0x02;          // solid face
    MyAttrib =  MyID & 0x0F;   // force black background   
 
    do 
    { 
        // Wait for display to be available, then lock it. 
        WaitForSingleObject( hScreenMutex, INFINITE ); 
 
        // If we still occupy the old screen position, blank it out.   
        ReadConsoleOutputCharacter( hConsoleOut, #OldCell, 1, DSDWORD[#Old], #Dummy ); 
        ReadConsoleOutputAttribute( hConsoleOut, #OldAttrib, 1, DSDWORD[#Old], #Dummy ); 
        if (( OldCell == MyCell ) && (OldAttrib == MyAttrib)) 
            WriteConsoleOutputCharacter( hConsoleOut, #BlankCell, 1, DSDWORD[#Old], #Dummy ); 
 
        // Draw new face, then clear screen lock   
        WriteConsoleOutputCharacter( hConsoleOut, #MyCell, 1, DSDWORD[#Coords], #Dummy ); 
        WriteConsoleOutputAttribute( hConsoleOut, #MyAttrib, 1, DSDWORD[#Coords], #Dummy ); 
        ReleaseMutex( hScreenMutex ); 
 
        // Increment the coordinates for next placement of the block.   
        Old.X = Coords.X; 
        Old.Y = Coords.Y; 
        Coords.X += Delta.X; 
        Coords.Y += Delta.Y; 
 
        // If we are about to go off the screen, reverse direction   
        if( Coords.X < 0 ) || ( Coords.X >= csbiInfo.dwSize.X ) 
        { 
            Delta.X = -Delta.X; 
            Beep( 400, 50 ); 
        } 
        if( Coords.Y < 0 ) || ( Coords.Y > csbiInfo.dwSize.Y ) 
        { 
            Delta.Y = -Delta.Y; 
            Beep( 600, 50 ); 
        } 
    } 
    // Repeat while RunMutex is still taken.   
    while ( WaitForSingleObject( hRunMutex, 75L ) == WAIT_TIMEOUT ); 
}
 
// Shows the number of running threads and the key help in the console title bar.
void WriteTitle( int ThreadNum ) 
{
    char  NThreadMsg[80]; 
 
    sprintf( #NThreadMsg, "Threads running: %02d.  Press 'A' " "to start a thread,'Q' to quit.", ThreadNum ); 
    SetConsoleTitle( #NThreadMsg ); 
}
 
// Fills the whole console buffer (dwSize.X * dwSize.Y cells, starting at 0,0)
// with blanks.
void ClearScreen( void ) 
{
    DWORD    dummy; 
    COORD    Home;
    Home.X = Home.Y = 0 ; 
    FillConsoleOutputCharacter( hConsoleOut, ' ', csbiInfo.dwSize.X * csbiInfo.dwSize.Y, DSDWORD[#Home], #dummy ); 
}



Emil_halim

  • Member
  • **
  • Posts: 99
NewSphinxCmm version of Michael Webster's code timing macros
« Reply #6 on: June 26, 2017, 12:15:29 AM »
Hi all

Here is the cmm version of Michael Webster's code timing macros.http://masm32.com/board/index.php?topic=49.0

I have converted them to CMM; there was one small problem when I tried to do that.

The problem is how to put the code under test in between the two CMM macros and have it work just like the masm version. I used a trick: at the end of the first macro I put this code
Code: [Select]
        call label
    @label:                              /* Start test loop                             */   
      ?aligncode  16                     /* Optimal loop alignment for P6               */   
   

It pushes the address of the label onto the stack. At the start of the second macro I put this code
Code: [Select]
         dec  __counter__loop__counter__
        jz    @F
        jmp DSDWORD[ESP];
     
      @@:
        ESP += 4;
so it jumps back to the end of the first macro as long as the counter has not reached zero.

Here is the macros [ cntrcmm.inc ]
============
Code: [Select]
/*************************************
*         New Sphinx  Cmm            * 
*                                    *
*             counter                *
*                                    *
*         from Masm Forum            *         
*                                    *
*************************************/

//Michael Webster's code timing macros
//http://masm32.com/board/index.php?topic=49.0
 
/*   ---------------------------------------------------------------------
  ; These two macros perform the grunt work involved in measuring the
  ; processor clock cycle count for a block of code. These macros must
  ; be used in pairs, and the block of code must be placed in between
  ; the counter_begin and counter_end macro calls. The counter_end macro
  ; returns the clock cycle count for a single pass through the block of
  ; code, corrected for the test loop overhead, in EAX.
  ;
  ; These macros require a .586 or higher processor directive.
  ;
  ;
  ; The loopcount parameter should be set to a relatively high value to
  ; produce repeatable results.
  ;
  ; Note that setting the priority parameter to REALTIME_PRIORITY_CLASS
  ; involves some risk, as it will cause your process to preempt *all*
  ; other processes, including critical Windows processes. Setting the
  ; priority parameter to HIGH_PRIORITY_CLASS instead will significantly
  ; reduce the risk, and in most cases will produce the same cycle count.
  ; --------------------------------------------------------------------- */

#pragma option ia

dword   _loop_count_;
dword   _process_priority_class_;
int     _thread_priority_;

dword __counter__loop__counter__=0;
qword tmp1,tmp2,__counter__qword__count__;

/* counter_begin( loopcount ): first half of the cycle-count pair.
   - stores the loop count, saves the current process/thread priority and
     raises both (REALTIME_PRIORITY_CLASS / THREAD_PRIORITY_TIME_CRITICAL);
   - times an EMPTY loop of loopcount iterations and keeps its cycle count in
     tmp1 (the loop overhead that counter_end later subtracts);
   - reads the start count for the real test into tmp2 and pushes the address
     of the test-loop label with "call label"; counter_end jumps back through
     that stack slot and finally removes it.
   The code to be measured follows this macro and must leave ESP unchanged at
   the point where counter_end runs.
   EAX and EDX are overwritten by rdtsc; cpuid also writes EBX and ECX.          */
#define counter_begin( arg )  EAX = arg; _counter_begin();
inline _counter_begin(  )                                                         
{
     
      _loop_count_ = EAX;                                                           
      _process_priority_class_ = GetPriorityClass(GetCurrentProcess());                   
      _thread_priority_ = GetThreadPriority(GetCurrentThread());                           
      SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);                     
      SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);               
                                                                                           
        xor   eax, eax                   /* Use same CPUID input value for each call*/     
        cpuid                            /* Flush pipe & wait for pending ops to finish*/ 
        rdtsc                            /* Read Time Stamp Counter*/                     
                                                                                           
        tmp1 = EDX:EAX;                  /* start count of the reference loop */
       __counter__loop__counter__ = _loop_count_;                                       
        xor   eax, eax                                                                     
        cpuid                            /* Make sure loop setup instructions finish */   
      ?aligncode  16                     /* Optimal loop alignment for P6  */
        call Lab                         /* pushes the address of Lab: the loop target */
      @Lab:                                /* Start an empty reference loop  */
                   
        dec   __counter__loop__counter__                                               
        jz    @F
        jmp DSDWORD[ESP];                /* loop again through the pushed address */
     
      @@:
        ESP += 4;                        /* drop the pushed loop address */
                                                                                           
        xor   eax, eax                                                                     
        cpuid                            /* Make sure loop instructions finish  */           
        rdtsc                            /* Read end count  */                               
       
        EDX:EAX -= tmp1;
        tmp1 = EDX:EAX;                  /* tmp1 = cycles taken by the empty loop */
                                                                                           
        xor   eax, eax                                                                     
        cpuid                                                                             
        rdtsc                                                                             
        tmp2 = EDX:EAX;                  /* start count of the real test loop */
        __counter__loop__counter__ = _loop_count_;                                           
        xor   eax, eax                                                                     
        cpuid                            /* Make sure loop setup instructions finish    */   
        call label                       /* pushes the address of label for counter_end */
    @label:                              /* Start test loop                             */   
      ?aligncode  16                     /* Optimal loop alignment for P6               */   
}                                                                                       


// counter_end(): second half of the cycle-count pair, placed right after the
// code under test. It loops back through the address that counter_begin left
// on the stack until the loop counter reaches zero, then removes that address,
// reads the end count, subtracts the empty-loop overhead (tmp1), restores the
// saved priorities and returns the cycle count per pass, rounded to an
// integer by the FPU, in EAX.
inline counter_end()
{
        dec  __counter__loop__counter__
        jz    @F
        jmp DSDWORD[ESP];                // another pass through the code under test
     
      @@:
        ESP += 4;                        // drop the loop address pushed by counter_begin
        xor   eax, eax
        cpuid                            // Make sure loop instructions finish
        rdtsc                            // Read end count
        EDX:EAX -= tmp2;                 // cycles taken by the whole test loop

        __counter__qword__count__ = EDX:EAX - tmp1;   // minus the empty-loop overhead
       

   SetPriorityClass(GetCurrentProcess(),_process_priority_class_);   
   SetThreadPriority(GetCurrentThread(),_thread_priority_); 
   
        // total cycles / loop count, computed on the FPU
        finit
        fild  DSQWORD [# __counter__qword__count__ ]
        fild  dword [# _loop_count_ ]
        fdiv
        fistp dword [# __counter__qword__count__ ]

        mov   eax, dword [# __counter__qword__count__ ]   
}

//--------------------------------------------------------------------------------------

 /* ---------------------------------------------------------------------
  ; These two macros perform the grunt work involved in measuring the
  ; execution time in milliseconds for a specified number of loops
  ; through a block of code. These macros must be used in pairs, and
  ; the block of code must be placed in between the timer_begin and
  ; timer_end macro calls. The timer_end macro returns the elapsed
  ; milliseconds for the entire loop in EAX.
  ;
  ; These macros utilize the high-resolution performance counter.
  ; The return value will be zero if the high-resolution performance
  ; counter is not available.
  ;
  ; The loopcount parameter should be set to a relatively high value to
  ; produce repeatable results.
  ;
  ; Note that setting the priority parameter to REALTIME_PRIORITY_CLASS
  ; involves some risk, as it will cause your process to preempt *all*
  ; other processes, including critical Windows processes. Setting the
  ; priority parameter to HIGH_PRIORITY_CLASS instead will significantly
  ; reduce the risk, and in most cases will produce very nearly the same
  ; result.
  ; --------------------------------------------------------------------- */
 
  // Working storage shared by _timer_begin() and timer_end().
  __timer__pc__frequency__  :  dq 0      // filled in by QueryPerformanceFrequency
  __timer__pc__count__      :  dq 0      // filled in by QueryPerformanceCounter; later holds the elapsed count
  __timer__loop__counter__  :  dd 0      // remaining passes of the current loop
  __timer__dw_count__       :  dd 0      // scratch dword for the milliseconds conversion in timer_end
 
  // timer_begin( n ): loads the loop count n into EAX and expands _timer_begin() in place.
  #define timer_begin( arg )  EAX = arg; _timer_begin();
  // Opening half of the timer_begin()/timer_end() pair; EAX holds the loop count on entry.
  // When a performance counter is available it:
  //   1. saves the current process/thread priority and raises both,
  //   2. times an empty reference loop of _loop_count_ passes (the overhead),
  //   3. pushes the overhead count and then the start count of the test loop,
  //   4. pushes the address of the test-loop start via "call label".
  // Stack on exit (top first): loop-start address, start count lo/hi, overhead count lo/hi.
  // NOTE(review): when QueryPerformanceFrequency returns 0 nothing is pushed, yet
  // timer_end still executes its dec / jz / jmp DSDWORD[ESP] sequence - confirm
  // that this path is never taken on the target machines.
  inline _timer_begin()
  {
        _loop_count_ = EAX;
        QueryPerformanceFrequency( # __timer__pc__frequency__ );
        if( EAX != 0 )
         {
             // remember the current priorities so timer_end can restore them
             _process_priority_class_ = GetPriorityClass(GetCurrentProcess());                   
             _thread_priority_ = GetThreadPriority(GetCurrentThread());                           
             SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);                     
             SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);       
             QueryPerformanceCounter( # __timer__pc__count__ );
             push    DSDWORD [#__timer__pc__count__ + 4]   // start count of the reference loop, high dword
             push    DSDWORD [#__timer__pc__count__]       // start count of the reference loop, low dword

             DSDWORD [#__timer__loop__counter__] = _loop_count_;
             call Lab                // pushes the address of the reference-loop start
           @Lab: 
          ?aligncode 8             // Optimal loop alignment for P6
                                   // Start an empty reference loop
             sub   DSDWORD [#__timer__loop__counter__], 1
             jz    @F
             goto  DSDWORD[ESP];     // next pass: jump to the address pushed by "call Lab"
         @@:
             ESP += 4;               // discard the reference-loop address

             QueryPerformanceCounter( # __timer__pc__count__ );

             // 64-bit subtract: end count - start count = overhead of the empty loop
             pop   ecx           // Recover low-order 32 bits of start count
             sub   DSDWORD [# __timer__pc__count__ ], ecx
             pop   ecx           // Recover high-order 32 bits of start count
             sbb   DSDWORD [# __timer__pc__count__ + 4 ], ecx

             push   DSDWORD [# __timer__pc__count__ + 4 ] // Overhead count
             push   DSDWORD [# __timer__pc__count__ ]     // Overhead count

             QueryPerformanceCounter( # __timer__pc__count__ );

             push   DSDWORD [# __timer__pc__count__ + 4 ] // Start count
             push   DSDWORD [# __timer__pc__count__ ]     // Start count

             DSDWORD [#__timer__loop__counter__] = _loop_count_;
             call label              // pushes the address of the test-loop start for timer_end
       @label:                      // Start test loop
         ?aligncode 16              // Optimal loop alignment for P6
         }
   }
   
   // Closing half of the timer_begin()/timer_end() pair.
   // Repeats the code placed between the two calls until the loop counter reaches
   // zero, then subtracts the start count and the overhead count (both pushed by
   // _timer_begin) from the current performance-counter value, restores the saved
   // priorities and returns count / frequency * 1000 (milliseconds) in EAX.
   inline timer_end()
   {
            dec  DSDWORD [#__timer__loop__counter__]
            jz    @F
            jmp DSDWORD[ESP];       // next pass: back to the address pushed by "call label"
     
         @@:
            ESP += 4;               // discard the loop-start address
           
            QueryPerformanceFrequency( # __timer__pc__frequency__ );
          if( EAX != 0)
           {
               QueryPerformanceCounter( # __timer__pc__count__ );
               // elapsed = end - start - overhead, as two 64-bit sub/sbb pairs
               pop   ecx           // Recover low-order 32 bits of start count
               sub   DSDWORD [# __timer__pc__count__ ], ecx
               pop   ecx           // Recover high-order 32 bits of start count
               sbb   DSDWORD [# __timer__pc__count__ + 4 ], ecx
               pop   ecx           // Recover low-order 32 bits of overhead count
               sub   DSDWORD [# __timer__pc__count__ ], ecx
               pop   ecx           // Recover high-order 32 bits of overhead count
               sbb   DSDWORD [# __timer__pc__count__ + 4 ], ecx
            }
   
            SetPriorityClass(GetCurrentProcess(),_process_priority_class_);   
            SetThreadPriority(GetCurrentThread(),_thread_priority_); 
           
            // milliseconds = count / frequency * 1000, computed on the FPU
            finit
            fild DSQWORD[# __timer__pc__count__]
            fild DSQWORD[# __timer__pc__frequency__]
            fdiv
            mov   DSDWORD[#__timer__dw_count__], 1000
            fild  dword [# __timer__dw_count__]
            fmul
            fistp dword [# __timer__dw_count__ ]
            mov   eax, [# __timer__dw_count__]      // result is returned in EAX
   }

Here is a test  program
===============
Code: [Select]
/*************************************
*         New Sphinx  Cmm            * 
*                                    *
*             counter                *
*                                    *
*         from Masm Forum            *         
*                                    *
*************************************/

//Michael Webster's code timing macros
//http://masm32.com/board/index.php?topic=49.0

#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization

#pragma option dbg
#pragma option lst

#pragma option upx-

#includelib  win32.lib MSVCRT.lib ole32.lib

// $ will replaced with SphinxC-- main path
#includepath "$\winlib" 
 
#include <windows.h> 
#include <MSVCRT.H-->

#include "cntrcmm.inc"

//#include <math64.h-->


#define LOOP_COUNT   10000000

// Test driver for the counter_* and timer_* macros: pins the process to the
// first CPU, prints the affinity masks, then measures "mov eax,2 / cpuid"
// once in clock cycles (counter_*) and once in milliseconds (timer_*).
main()
{
      // GetProcessAffinityMask stores DWORD_PTR values, which are 32 bits wide
      // in a Win32 build; 64-bit locals printed with %I64d would show
      // uninitialised upper halves.
      dword pam, sam;
     
      SetProcessAffinityMask(GetCurrentProcess(), 1);
      GetProcessAffinityMask(GetCurrentProcess(), #pam, #sam);
      printf("%u\t%u\n", pam, sam); 


            counter_begin(LOOP_COUNT); 
           
                mov    eax, 2
                cpuid
         
            counter_end();
           
            printf("the process takes: %d cycles\n", EAX ); 
           
            timer_begin(LOOP_COUNT);
           
                mov    eax, 2
                cpuid
               
            timer_end();
           
            // timer_end returns elapsed milliseconds for the whole loop, not cycles
            printf("the process takes: %d ms\n", EAX ); 
           
      system("pause");
}

Emil_halim

  • Member
  • **
  • Posts: 99
MOV vs PUSH ticks comparison
« Reply #7 on: June 27, 2017, 02:11:47 AM »
Hi All

MOV vs PUSH ticks comparison from http://masm32.com/board/index.php?topic=6324.0

So, this converted example shows how easy it is to convert MASM code to NewSphinxCmm and then use CMM's stunning features.

Code: [Select]
/*************************************
*         New Sphinx  Cmm            * 
*                                    *
*             timeit                *
*                                    *
*         from Masm Forum            *         
*                                    *
*************************************/
/*
; MOV vs PUSH ticks comparison
; MOV wins only for 7 strikes. After that,
; PUSH wins.

http://masm32.com/board/index.php?topic=6324.0

*/

#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization

#pragma option dbg
#pragma option lst

#pragma option upx-

#includelib  win32.lib MSVCRT.lib ole32.lib

// $ will replaced with SphinxC-- main path
#includepath "$\winlib" 
 
#include <windows.h> 
#include <MSVCRT.H-->

#pragma option ia

    // number of passes of each timed outer loop
    loops equ 50000000  //50 mil
 
// printf format strings for the MOV result, the PUSH result and the header line
disply1: db "MOV  = %u ticks\n",0ah,0
disply2: db "PUSH = %u ticks\n",0ah,0
disply3: db "%u*2*6\n",0dh,0ah,0

// Times two ways of saving and restoring six registers on the stack and prints
// the elapsed GetTickCount ticks for each:
//   MOV  - "sub esp" plus six mov stores, later six mov loads plus "add esp"
//   PUSH - six push instructions, later six pop instructions
// Each variant runs "loops" outer passes; inside a pass the save block and the
// restore block are each repeated until "dec MainLoop" produces a negative
// value, i.e. MainLoops+1 times.
mytest(dword MainLoops)
{
   dword MainLoop;
   
 //;;===========MOV==============
// header line: printf(disply3, MainLoops)
push MainLoops
push offset disply3
call printf
add esp,8
cpuid                   // EAX is whatever printf returned; EAX/EBX/ECX/EDX are overwritten
call GetTickCount
mov esi,eax             // ESI = start tick
mov ecx,loops           // ECX = outer loop counter
@@:
MainLoop = MainLoops;
// save phase: reserve 24 bytes and store six registers
.repeat
sub esp,4*6
mov [esp],eax
mov [esp+4],ebx
mov [esp+8],ecx
mov [esp+12],edx
mov [esp+16],edi
mov [esp+20],esi
dec MainLoop
.until MINUSFLAG
MainLoop = MainLoops;
// restore phase: reload the six registers and release the 24 bytes
.repeat
mov eax,[esp]
mov ebx,[esp+4]
mov ecx,[esp+8]
mov edx,[esp+12]
mov edi,[esp+16]
mov esi,[esp+20]
add esp,4*6
dec MainLoop
.until MINUSFLAG
sub ecx,1
jnz @B
call GetTickCount
sub eax,esi             // elapsed ticks
push eax
push offset disply1
call printf
add esp,8

//;;============PUSH==============
cpuid
call GetTickCount
mov esi,eax             // ESI = start tick
mov ecx,loops           // ECX = outer loop counter

@@:
MainLoop = MainLoops;
// save phase: six pushes
.repeat
push eax
push ebx
push ecx
push edx
push edi
push esi
dec MainLoop
.until MINUSFLAG
MainLoop = MainLoops;
// restore phase: six pops in reverse order
.repeat
pop esi
pop edi
pop edx
pop ecx
pop ebx
pop eax
dec MainLoop
.until MINUSFLAG

sub ecx,1
jnz @B
call GetTickCount
sub eax,esi             // elapsed ticks
push eax
push offset disply2
call printf
add esp,8

}

// Runs mytest() with several inner repeat counts, then times a seven-register
// variant (EBP included) of the MOV and PUSH save/restore sequences inline and
// prints the elapsed GetTickCount ticks for each.
main()
{   
     
    mytest( 1  );
mytest( 1  );
mytest( 2  );
mytest( 2  );
mytest( 4  );
mytest( 8  );
mytest( 16 );
mytest( 32 );
     
//;;===========MOV==============
cpuid
call GetTickCount
mov esi,eax             // ESI = start tick
mov ecx,loops           // ECX = loop counter

@@:
// save seven registers with mov into a 28-byte stack area
sub esp,4*7
mov [esp],eax
mov [esp+4],ebx
mov [esp+8],ecx
mov [esp+12],edx
mov [esp+16],edi
mov [esp+20],esi
mov [esp+24],ebp

// reload them and release the area
mov eax,[esp]
mov ebx,[esp+4]
mov ecx,[esp+8]
mov edx,[esp+12]
mov edi,[esp+16]
mov esi,[esp+20]
mov ebp,[esp+24]
add esp,4*7

sub ecx,1
jnz @B
call GetTickCount
sub eax,esi             // elapsed ticks
push eax
push offset disply1
call printf
add esp,8

//;;============PUSH==============
cpuid
call GetTickCount
mov esi,eax             // ESI = start tick
mov ecx,loops           // ECX = loop counter

@@:
// save seven registers with push
push eax
push ebx
push ecx
push edx
push edi
push esi
push ebp

// restore them in reverse order
pop ebp
pop esi
pop edi
pop edx
pop ecx
pop ebx
pop eax

sub ecx,1
jnz @B
call GetTickCount
sub eax,esi             // elapsed ticks
push eax
push offset disply2
call printf
add esp,8

     system("pause");
}

Emil_halim

  • Member
  • **
  • Posts: 99
CPU detection
« Reply #8 on: June 28, 2017, 02:47:09 AM »
Hi all,

The code is still close to MASM syntax.

This part of the code was ported to CMM from http://masm32.com/board/index.php?topic=4940.msg53093#msg53093

Code: [Select]
/*************************************
*         New Sphinx  Cmm            * 
*                                    *
*          CPU detection             *
*                                    *
*         from Masm Forum            *         
*                                    *
*************************************/

/*
   this is part of Fast memory allocation
   
   written by nidud , converted by Me.
   
   http://masm32.com/board/index.php?topic=4940.msg53093#msg53093   
*/

#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization

#pragma option dbg
#pragma option lst

#pragma option upx-

#includelib  win32.lib MSVCRT.lib ole32.lib

// $ will replaced with SphinxC-- main path
#includepath "$\winlib" 
 
#include <windows.h> 
#include <MSVCRT.H-->

#pragma option ia

// Bit flags collected in sselevel (binary literals, one bit per feature).
SSE_MMX equ 00000000001B
SSE_SSE equ 00000000010B
SSE_SSE2 equ 00000000100B
SSE_SSE3 equ 00000001000B
SSE_SSSE3 equ 00000010000B
SSE_SSE41 equ 00000100000B
SSE_SSE42 equ 00001000000B
SSE_XGETBV equ 00010000000B
SSE_AVX equ 00100000000B
SSE_AVX2 equ 01000000000B
SSE_AVXOS equ 10000000000B


// feature mask filled in by main(); 0 until detection has run
dword sselevel=0;

main()
{
//-------------------------------------------------------------------------------
//  CPU detection
//
//  Builds a mask of SSE_* flags in sselevel, refuses to continue without SSE2,
//  then prints the CPU brand string followed by the highest detected level.
//
//  Fixes relative to the first port:
//    - the MMX flag is taken from CPUID.1:EDX bit 23 (it tested ECX bit 0,
//      which is the SSE3 flag);
//    - SSE_AVXOS is set only when XCR0 bits 1 and 2 are both set (it accepted
//      either one);
//    - the brand string is printed through "%s" instead of being used as the
//      printf format string.
//-------------------------------------------------------------------------------

    // CPUID is available if EFLAGS bit 21 (200000h) can be toggled.
    pushfd
    pop eax
    mov ecx,200000h
    mov edx,eax             // EDX = original EFLAGS
    xor eax,ecx
    push eax
    popfd
    pushfd
    pop eax
    xor eax,edx
    and eax,ecx             // ZF clear => the bit really changed
    push ebx                // cpuid overwrites EBX; push leaves the flags untouched
    .if ! ZEROFLAG
        xor eax,eax
        cpuid               // leaf 0: EAX = highest supported basic leaf
        .if EAX
            // NOTE(review): the reason for the AH == 5 special case is not
            // visible here; kept as in the original source.
            .if AH == 5
                xor eax,eax
            .else
                mov eax,7
                xor ecx,ecx
                cpuid // check AVX2 support
                xor eax,eax
                bt ebx,5 // AVX2
                rcl eax,1 // into bit 9
                push eax
                mov eax,1
                cpuid
                pop eax
                // each bt/rcl pair shifts one more feature bit into EAX
                bt ecx,28 // AVX support by CPU
                rcl eax,1 // into bit 8
                bt ecx,27 // XGETBV supported
                rcl eax,1 // into bit 7
                bt ecx,20 // SSE4.2
                rcl eax,1 // into bit 6
                bt ecx,19 // SSE4.1
                rcl eax,1 // into bit 5
                bt ecx,9 // SSSE3
                rcl eax,1 // into bit 4
                bt ecx,0 // SSE3
                rcl eax,1 // into bit 3
                bt edx,26 // SSE2
                rcl eax,1 // into bit 2
                bt edx,25 // SSE
                rcl eax,1 // into bit 1
                bt edx,23 // MMX
                rcl eax,1 // into bit 0
                mov sselevel,eax
            .endif
        .endif
    .endif
    .if EAX & SSE_XGETBV
        push eax
        xor ecx,ecx
        db  0x0F,0x01,0xD0   // xgetbv
        and eax,6      // keep XCR0 bits 1 (SSE state) and 2 (AVX state)
        cmp eax,6      // AVX support by OS needs both of them
        pop eax        // pop does not change the flags
        .if ZEROFLAG
            or sselevel,SSE_AVXOS
        .endif
    .endif
    pop ebx
    .if ! EAX = sselevel & SSE_SSE2
        printf( "CPU error: Need SSE2 level\n" );
        system("pause");
        ExitProcess( 0 );
    .endif

    // Read the brand string (leaves 80000002h..80000004h, 16 bytes each)
    // into an 80-byte buffer on the stack.
    sub esp,80
    mov edi,esp
    xor esi,esi
    .repeat
        lea eax,[esi+80000002h]
        cpuid
        mov [edi],eax
        mov [edi+4],ebx
        mov [edi+8],ecx
        mov [edi+12],edx
        add edi,16
        inc esi
    .until ESI == 3
    mov eax,esp
    // skip the leading blanks of the brand string
    .while DSBYTE [EAX] == ' '
        inc eax
    .endw
    printf( "%s", EAX );
    add esp,80

    // report the highest instruction-set level found
    printf( " (" );
    .if EAX = sselevel  & SSE_AVX2
        printf( "AVX2" );
    .elseif EAX  = sselevel  & SSE_AVX
        printf( "AVX" );
    .elseif EAX  = sselevel  & SSE_SSE42
        printf( "SSE4.2" );
    .elseif EAX  = sselevel  & SSE_SSE41
        printf( "SSE4.1" );
    .elseif EAX  = sselevel  & SSE_SSSE3
        printf( "SSSE3" );
    .elseif EAX  = sselevel  & SSE_SSE3
        printf( "SSE3" );
    .else
        printf( "SSE2" );
    .endif
    printf( ")\n----------------------------------------------\n" );

    system("pause");
    ExitProcess( 0 );     
}

Emil_halim

  • Member
  • **
  • Posts: 99
strlen demo
« Reply #9 on: June 28, 2017, 05:27:19 AM »
Hi All,

This demo shows how easy it is to create a function address table, which holds the addresses of functions so that you can call them later in a certain order.

It also shows how to code the same function in many different syntax styles, such as C style, asm style, C-- style, and so on.

Finally, it shows how easy it is to make a benchmark test to compare the speed of different pieces of code.

Code: [Select]
/***************************************
*          New Sphinx Cmm              * 
*                                      *
*     strlen demo  By Emil Halim       *
*                                      *
***************************************/

#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization
 
#includepath "$\winlib" 

#include <windows.h> 
#include <MSVCRT.H-->

#includelib  win32.lib , msvcrt.lib 

#pragma option upx-

#pragma option LST
#pragma option ia

// C-style variant: walks a cursor from the start of the string to its
// terminating zero byte and returns the distance covered.
dword  strlen0( char * pStr )
{
    char* scan;

    scan = pStr;
    while ( *scan != 0 )
     {
        scan++;
     }
    return scan - pStr;
}
 
// C-- variant with a register as counter: EAX counts the bytes while the
// pointer argument itself is advanced. There is no return statement; the
// count is left in EAX.
int strlen1(char* pStr)
{
    EAX=0;
    while(byte *pStr !=0 )
     {
        pStr++;
        EAX++;
     }
}

// Register-only variant: the string pointer arrives in EAX, EBX keeps the
// start address, and the length (end - start) is left in EAX. EBX is overwritten.
int fastcall strlen2(EAX)
{   
    EBX=EAX;
    while(DSBYTE[EAX] !=0 )
     {
        EAX++;
     }
    EAX -= EBX;
}

// Same algorithm as strlen2 written with plain instructions and labels:
// pointer in EAX on entry, length in EAX on exit, EBX overwritten.
int fastcall strlen3(EAX)      // pure ASM code
{
   MOV EBX,EAX              // EBX = start of the string
@lop:
   CMP DSBYTE[EAX],0
   JE  near @fin            // terminator found
   INC EAX
   JMP lop
@fin:   
   SUB EAX,EBX              // length = end - start
}

?aligncode 4
 
// MASM-style Proc variant that examines four bytes per pass.
// The (x - 01010101h) & ~x & 80808080h expression is non-zero when one of the
// four bytes just read is zero; the tail then works out which byte it was.
// The length is left in EAX; ECX and EDX are overwritten, EBX is saved/restored.
// NOTE(review): the loop reads whole dwords starting at the given pointer, so
// it can read up to three bytes beyond the terminator - confirm that callers
// can tolerate that.
strlen4 Proc item:DWORD
    push    ebx
    mov     eax, item               // get pointer to string
    lea     edx, [eax+3]            // pointer+3 used in the end
 @@:
    mov     ebx, [eax]              // read first 4 bytes
    add     eax, 4                  // increment pointer
    lea     ecx, [ebx-01010101h]    // subtract 1 from each byte
    not     ebx                     // invert all bytes
    and     ecx, ebx                // and these two
    and     ecx, 80808080h
    jz      @B                      // no zero bytes, continue loop
    test    ecx, 00008080h          // test first two bytes
    jnz     @F
    shr     ecx, 16                 // not in the first 2 bytes
    add     eax, 2
 @@:
    shl     cl, 1                   // use carry flag to avoid branch
    sbb     eax, edx                // compute length
    pop     ebx
strlen4 Endp

 
// *** SSE2 version  from MASM forum***
? aligncode 16
// SSE2 variant: pointer in EAX on entry, length in EAX on exit.
// The pointer is rounded down to a 16-byte boundary and 16 bytes are compared
// against zero per pass; hits that lie before the real start of the string
// are shifted out of the byte mask. EBX, ECX, EDX and XMM0 are overwritten.
int  fastcall strlen5(EAX) 
{     
    EBX = EAX ;                 // get the string pointer
    LEA ECX, DSDWORD[EAX+16]    // save pointer to string, on par with eax after first loop
    EAX &= 0xFFFFFFF0;          // align for use with SSE2
  @shiftOK:     
    XORPS XMM0, XMM0            // zero xmm0 for finding zero bytes
  @a1: 
    PCMPEQB XMM0, DSQWORD[EAX]  // ---- inner loop -----
    PMOVMSKB EDX, XMM0          // set byte mask in edx
    EAX += 16;                  // len counter (best position here)
    TEST EDX,EDX
    JE a1
    if(ECX<=EAX) goto a2;
    ECX -= EAX;                 // get difference, and cancel "misalign flag"
    SHR EDX, CL                 // shift invalid
    SHL EDX, CL                 // bits out
    JE shiftOK
  @a2:   
    BSF EDX, EDX                // bit scan for the index
    SUB EAX, EBX                // subtract original src pointer
    LEA EAX, DSDWORD[EAX+EDX-16] // add scan index
}
? aligncode 4

// Frameless label-style variant: the argument is read directly from [esp+4],
// the length is left in EAX and "ret 4" removes the 4-byte argument.
.code
strlen6:
    mov eax, [esp+4]
    sub eax, 1              // start one byte early; the loop increments first
  lbl:
    add eax, 1
    cmp BYTE PTR [eax], 0
    jne lbl
    sub eax, [esp+4]        // length = end - start
    ret 4
.data

// Indexed variant: ECX holds the string pointer and EAX is the running index,
// which is also the length once the zero byte is found.
strlen7 Proc  mstr:DWORD
MOV ECX,mstr // Move source pointer to ECX
EAX = -1; // Start of at -1 so we can build a faster loop
  next_char:
EAX++;     // advance the index (first pass: -1 -> 0)
CMP DSBYTE[ECX+EAX],0
JNE next_char // If BYTE is not equal to NULL process next .
RET // Returns string length in EAX
strlen7 Endp
//-----------function address table-----------------
// Zero-terminated table of the addresses of all strlen variants;
// main() walks it until it reaches the 0 entry.
FunTbl : dd  # strlen0,
             # strlen1,
             # strlen2,
             # strlen3,
             # strlen4,
             # strlen5,
             # strlen6,
             # strlen7,
             0
/*-------------------------------------------------------------------------------*/
// string measured by every variant
char* testStr = "NewSphinxCmm is a stunning program language";

// rdtsc readings / elapsed counts for the two benchmark loops in main()
qword  temp_1; 
qword  temp_2; 

// Demo driver:
//   1. prints the length reported by the library strlen and by every variant,
//   2. calls every variant again through the FunTbl address table,
//   3. times one million calls of strlen1 and of strlen5 with rdtsc and
//      prints the average count per call.
// Fix: the braces delimiting the function body were missing.
main() 
{
  int i; 
  double  reslt1, reslt2; 
  int count;   
 
     printf("strlen  = %d\n",strlen ( testStr ));
     printf("strlen0 = %d\n",strlen0( testStr ));   
     printf("strlen1 = %d\n",strlen1( testStr ));   
     printf("strlen2 = %d\n",strlen2( testStr ));   
     printf("strlen3 = %d\n",strlen3( testStr ));   
     printf("strlen4 = %d\n",strlen4( testStr )); 
     printf("strlen5 = %d\n",strlen5( testStr )); 
     printf("strlen6 = %d\n",strlen6( testStr ));
     printf("strlen7 = %d\n",strlen7( testStr ));
     
     // walk the zero-terminated function address table
     i = 0;
     while(DSDWORD[i*4+#FunTbl])
     {
       
        EDX=DSDWORD[i*4+#FunTbl];
        if (i==2) || (i==3) || (i==5) // fast functions they have no stack frame
         {
           EAX=testStr;               // these variants take the pointer in EAX
           EDX();
         }
         else
           EDX( testStr );
        printf("len[%d] = %d\n",i, EAX );
        i++;
     }

  count = 1000000; 

  SetPriorityClass( GetCurrentProcess(), HIGH_PRIORITY_CLASS);           
  // benchmark 1: strlen1
  rdtsc 
  temp_1 = EDX:EAX; 
   
     for(i=0; i < count; i++) 
      { 
          strlen1( testStr ); 
      }       
  rdtsc 
  temp_1 = EDX:EAX - temp_1; 
   
  // benchmark 2: strlen5
  rdtsc 
  temp_2 = EDX:EAX; 
   
     for(i=0; i < count; i++) 
      { 
          strlen5( testStr ); 
      }       
  rdtsc 
  temp_2 = EDX:EAX - temp_2; 
   
  SetPriorityClass(GetCurrentProcess(), NORMAL_PRIORITY_CLASS); 
   
  // average count per call, computed on the FPU
  ST(0) = temp_1 / count;   
  fstp   reslt1 
  ST(0) = temp_2 / count;   
  fstp   reslt2 
  printf("strlen1 is %f\nstrlen5 is %f\n",   reslt1 , reslt2 );
   
  system("pause");   
}

Emil_halim

  • Member
  • **
  • Posts: 99
using Borland ilink32
« Reply #10 on: June 28, 2017, 09:56:45 PM »
Hi ALL,

To use Borland's ilink32, first download a trial version of Borland C++ and install it on your system.

Then try the next example; it needs NewSphinxCmm version 256.

Code: [Select]
/***************************************
*          New Sphinx Cmm              * 
*                                      *
*     using ilink  By Emil Halim       *
*                                      *
***************************************/

#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization

#pragma option obj        //tells NewSphinxCmm to do not link

#OnExit "ilink32 /ap /m -s  -Gt -Gn  $OFILE$.obj c0x32.OBJ , , , import32.lib cw32.lib , , "
 
#includepath "$\winlib" 

#include <windows.h> 

#pragma option LST
#pragma option ia

dword dumy;   // dummy variable; without it ilink reports a missing section of class 2

/*------------------------*/
// The runtime functions are declared under their underscore-prefixed names
// and mapped back to the usual names with #define.
extern cdecl _printf();   
#define printf  _printf

extern cdecl _strlen();   
#define strlen  _strlen

extern cdecl _system();   
#define system  _system

// Program body for the ilink32 build: prints two greeting lines and the
// length of "welcome", then waits for a key press.
// NOTE(review): the extra "main:" label presumably provides a second entry
// symbol for the linker/startup object - confirm.
_main()
{
main:
   printf("hello world ......\n\n");
   printf("using ilink Borland linker ......\n\n");
   
   printf("the length of welcome = %d\n\n" , strlen("welcome"));
   
   system("pause");   
}
/*------------------------*/

 

Emil_halim

  • Member
  • **
  • Posts: 99
BorlandC++ startup code in cmm
« Reply #11 on: June 29, 2017, 09:19:40 PM »
Hi ALL,

In this demo I have changed the __acrtused procedure in c0nt.asm to look like this:
Code: [Select]
;----------------------------------------------------------------------
; Startup code
;
; __acrtused is reduced to a single jump: control goes straight to the
; external routine cmmstratup, which now contains the startup sequence.

EXTRN           cmmstratup:NEAR  ; Added By Emil Halim

_TEXT           SEGMENT  DWORD USE32 PUBLIC 'CODE'

                public __acrtused
__acrtused      PROC NEAR   

                jmp   cmmstratup              ; Added By Emil Halim                 
                               
__acrtused      ENDP



Then the OnExit directive will assemble it with tasm32 (see below).

I also removed the underscore from _main so that the startup code calls the CMM main.

The cmmstratup CMM function holds the startup code, so you can modify it as you wish.

Here is the CMM demo
==============
Code: [Select]
/***************************************
*          New Sphinx Cmm              * 
*                                      *
*     using ilink  By Emil Halim       *
*                                      *
***************************************/

/* borland C++ console starup code */

#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization

#pragma option obj        //tells NewSphinxCmm to do not link

#OnExit "tasm32 /ml c0nt.asm"
#OnExit "ilink32 /ap /m -s  -Gt -Gn -Gl $OFILE$.obj  c0nt.obj, , ,vcl.lib import32.lib cw32.lib , , "
 
#includepath "$\winlib" 

#include <windows.h> 

#pragma option LST
#pragma option ia

/*------------------------*/
// The runtime functions are declared under their underscore-prefixed names
// and mapped back to the usual names with #define.
extern cdecl _printf();   
#define printf  _printf

extern cdecl _strlen();   
#define strlen  _strlen

extern cdecl _system();   
#define system  _system

// copy of the module handle, stored by cmmstratup() and printed by main()
dword hinst;

/*-----------startup code----------------*/
// external data and routines referenced by cmmstratup()
extern
{
  dword ___CPPdebugHook_segment;
  dword __TLS_index;
  dword __TLS_index4;
  dword __hInstance;
  dword ___CPPdebugHook;
  dword module_data;
  ___CRTL_VCL_Init();
  ___CRTL_MEM_UseBorMM();
  ___CRTL_VCLLIB_Linkage();
  __ExceptInit();
  __startup();
}

/* borland C++ console starup code */
// Startup sequence reached from __acrtused in c0nt.asm: stores __TLS_index * 4
// in __TLS_index4, calls the runtime initialisation routines declared in the
// extern block, stores the module handle in __hInstance and hinst, and
// finally jumps to __startup. Statement order matters: values are pushed
// ahead of the calls that consume them.
cmmstratup() 
{   
                jmp     skip_dbg_vector
                db      "fb:C++HOOK"           // special signature
                nop                            // alignment byte
                db      0E9h                   // encode a jmp instruction so that the disassembler in the IDE can see past this address to the skip_dbg_vector
                dd      # ___CPPdebugHook_segment
    skip_dbg_vector:
                __TLS_index4 = __TLS_index << 2;
                push     edx
                push    0                       // NULL returns current module
                edx = GetModuleHandle();        // the 0 pushed above is the argument
                ___CRTL_VCL_Init();             // EDX now has hInstance in it
                pop     edx
                ___CRTL_MEM_UseBorMM();         // Call out to potentially re-vector the memory manager
                ___CRTL_VCLLIB_Linkage();       // Call out to touch a symbol that will be undefined if vcl.lib was used with any of the CW32xx forms of the RTL.
    skip_CRTL_xxxx:
                push    0
                __ExceptInit();
                pop     ecx
    not_process_attach:
                push    # module_data
                push    0                       // NULL returns current module
                __hInstance = GetModuleHandle();
                               
                hinst = EAX;                    // Added By Emil Halim  you cau put your own work
               
                push    0                       // dummy return address
                goto     __startup;                                                           
}
/*-----------end of startup code----------------*/


// Program body for the custom-startup build: prints two greeting lines, the
// length of "welcome" and the value that cmmstratup() stored in hinst, then
// waits for a key press.
main()
{

   printf("hello world ......\n\n");
   printf("using ilink borland linker ......\n\n");
   
   printf("the length of welcome = %d\n\n" , strlen("welcome"));
   
   printf("the hInstance = %d\n\n" , hinst );
   
   system("pause");   
}