NewSphinxCmm Examples

Emil_halim · May 15, 2017, 02:36:38 AM

Hi all,

Here is another example that shows how to mix C-- and MASM code.

NoteWell:
=======
1- NewSphinxC extends the extern keyword so that it accepts a block of external declarations.

Code Select


// One extern block can declare several external functions and data items.
extern {
  cdecl GetStrLen();   // declares an external cdecl function
  cdecl Ten();             // declares an external cdecl function
  byte  buffer;            // declares external byte data
  byte  hw;                // declares external byte data
}

2- Use the EXTERNDEF keyword inside a Masm block to declare data or functions that are defined outside the Masm code block and that you intend to use inside it.

3- The Masm code block starts with a .model directive whose 'c' term selects the calling convention.

4- MasmPlgIn allows some C-- features inside a Masm code block.

Code Select


EAX := -1	// C-- style assignment written inside a masm block
EAX++		// C-- style increment written inside a masm block

here the demo code

Code Select


/************************************* 
*         New Sphinx Cmm             *  
*                                    *
*           masm  test2              *
*                                    *
*         by Emil_halim              *          
*                                    *
*************************************/
#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization 

#pragma option dbg
#pragma option lst

#Entry  main
#includelib  win32.lib MSVCRT.lib 

// $ will replaced with SphinxC-- main path
#includepath "$\winlib"  
 
#include <windows.h>  
#include <MSVCRT.H-->

// tells Cmm about data & code in masm section
//-------------------------------------------
extern {
  cdecl StCpy();       // byte-wise string copy, called as StCpy(dest, src)
  cdecl GetStrLen();   // string length, not counting the terminating zero
  cdecl Ten();         // returns 10 + cmm_val
  byte  buffer;        // destination buffer defined in the masm .data? section
  byte  hw;            // source string defined in the masm .data section
}

// declare some Cmm variables
// cmm_buffer receives a copy of cmm_hw through StCpy() in main(), so it has
// to hold the 20 characters of that string plus the terminating zero.
// The original 12-byte buffer was overrun; 32 bytes leaves headroom.
cmm_buffer: $DB 32 dup 0 
char* cmm_hw = "Hello World from cmm";

// imported by the masm block through EXTERNDEF and added to 10 by Ten()
int cmm_val = 100;

// masm code start here
^^
  .MODEL flat, c   
  .nolist
  
   EXTERNDEF  cmm_val:SDWORD

.data?
; main() copies hw (21 characters + terminating zero = 22 bytes) into this
; buffer with StCpy, so 12 bytes were not enough and the copy overran it.
buffer	db 32 dup(?)	; destination

.data
hw	db "Hello World from masm", 0	; source

.code

// Gets the length of a string(not including the NULL terminator)
// In:  _str = pointer to a zero-terminated string
// Out: EAX  = number of bytes before the terminating zero
// Uses ECX as the base pointer and EAX as the running index.
GetStrLen Proc ,_str:PTR
	MOV ECX,_str				// Move source pointer to ECX
	EAX := -1					// Start at -1 so the loop can increment before testing
  next_char:
	EAX++  					    // advance the index to the next byte
	CMP byte PTR[ECX+EAX],0
	JNE next_char				// byte is not the terminator: keep scanning
	RET							// Returns string length in EAX
GetStrLen EndP

// Ten: returns 10 + cmm_val in EAX.
// cmm_val is the Cmm variable made visible to this block by EXTERNDEF.
Ten:
   mov EAX,10
   add EAX,cmm_val
   ret 
   
// StCpy(dest, src): copies the zero-terminated string at src to dest,
// including the terminating zero. No length check is made on dest.
// Stack after the two pushes: [esp]=edi, [esp+4]=esi, [esp+8]=return
// address, [esp+12]=dest, [esp+16]=src.
// NOTE(review): lodsb/stosb move upward only if the direction flag is
// clear; this routine does not set it - confirm the caller guarantees it.
FASTPROC StCpy
  push esi		        // preserve esi
  push edi		// preserve edi
  mov edi, [esp+8+4]	// dest from stack (4 bytes each)
  mov esi, [esp+8+8]	// src from stack
  .Repeat
	  lodsb		// al = [esi], esi advances
	  stosb		// [edi] = al, edi advances
  .Until al==0		// stop after the terminating zero has been copied
  pop edi
  pop esi
  ret    		
END FASTPROC   

^^
// masm code end here


main()
{
   dword len;
   dword ten;

   // measure a literal with the masm routine, then report it
   len = GetStrLen("CmmPro Is the best");
   printf("string length = %d \n" , len);

   // copy the Cmm string into the Cmm buffer and show it
   StCpy(#cmm_buffer,cmm_hw);
   puts(#cmm_buffer);

   // copy the masm string into the masm buffer and show it
   StCpy(#buffer,#hw);
   puts(#buffer);

   // Ten() adds the Cmm variable cmm_val to 10
   ten = Ten();
   printf("Ten = %d\n",ten);

   system("pause");
}

Emil_halim · May 15, 2017, 02:38:29 AM

Hi All,

Here is another example that mixes SphinxC and HJWasm.

Some rules you have to take care of:

1- A masm block of code must start with the term '^^' and end with the same term.
2- The masm block is extracted from the SphinxC source, assembled separately, and later linked with alink.
3- This process is carried out by MasmPlgIn; you do not have to use the cpu directive.
4- MasmPlgIn specifies the path of the include directory.
5- Take care of the function calling convention; a mismatch may crash your code.
5- take care of function call convention , it may crash your code.

The following masm code was taken from http://www.masmforum.com/board/index.php?topic=14696.105
It identifies the CPU of the computer.

Code Select


/************************************* 
*         New Sphinx Cmm             *  
*                                    *
*           masm test                *
*                                    *
*         by Emil_halim              *          
*                                    *
*************************************/

#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization 

#pragma option dbg
#pragma option lst

#Entry  main
#includelib  win32.lib MSVCRT.lib ole32.lib 

// $ will replaced with SphinxC-- main path
#includepath "$\winlib"  
 
#include <windows.h>  
#include <MSVCRT.H-->

// Tells SphinxC about the masm ShowCpu function defined in the ^^ block:
// stdcall, one dword argument.
extern stdcall ShowCpu(dword v);

^^ //start of Masm code
.nolist
include masm32rt.inc

include m32lib/dwtoa.asm
include m32lib/stdout.asm

.code
;//  masm code taken from the next link 
;//  http://www.masmforum.com/board/index.php?topic=14696.105
;//
;//  ShowCpu(mode): stdcall, one DWORD argument read directly from the stack.
;//    mode = 1    : print the CPU brand string and the SSE level
;//    other value : print nothing
;//  Returns the SSE level counter (0..4) in EAX. All other registers are
;//  restored by popad. The mode argument on the stack is decremented in place.
ShowCpu proc stdcall	; mode:DWORD
COMMENT @ Usage: 
  push 0, call ShowCpu	; simple, no printing, just returns SSE level
  push 1, call ShowCpu	; prints the brand string and returns SSE level @
  
  pushad		; save all 8 general registers (32 bytes)
  sub esp, 80	    ; create a buffer for the brand string
  mov edi, esp		; point edi to it
  xor ebp, ebp		; ebp = leaf index 0..2
  .Repeat
  	lea eax, [ebp+80000002h]
	db 0Fh, 0A2h	; cpuid 80000002h-80000004h
	stosd		; store eax, ebx, ecx, edx: 16 bytes of brand string per leaf
	mov eax, ebx
	stosd
	mov eax, ecx
	stosd
	mov eax, edx
	stosd
	inc ebp
  .Until ebp>=3		; 3 leaves = 48 bytes written into the 80-byte buffer
  push 1
  pop eax
  db 0Fh, 0A2h		; cpuid 1
  xor ebx, ebx		; CpuSSE
  xor esi, esi		; add zero plus the carry flag
  bt edx, 25		; edx bit 25, SSE1
  adc ebx, esi		; each set feature bit adds 1 to the level
  bt edx, 26		; edx bit 26, SSE2
  adc ebx, esi
  bt ecx, esi		; ecx bit 0, SSE3
  adc ebx, esi
  ; NOTE(review): Intel documents CPUID.01H:ECX bit 9 as SSSE3 and bit 19 as
  ; SSE4.1 - confirm which level is meant here.
  bt ecx, 9			; ecx bit 9, SSE4
  adc ebx, esi
  ; offset = 80 (buffer) + 32 (pushad) + 4 (return address) -> the mode argument
  dec dword ptr [esp+4+32+80]	; dec mode in stack
  .if Zero?
	mov edi, esp	; restore pointer to brand string
  	.Repeat				
		.Break .if byte ptr [edi]!=32	; mode was 1, so show a string but skip leading blanks
		inc edi
	.Until 0
	.if byte ptr [edi]<32	; first non-blank byte is a control char or zero: no brand string
		print chr$("pre-P4")
	.else
		print edi	; CpuBrand
	.endif
	.if ebx
		print chr$(32, 40, "SSE")	; info on SSE level, 40=(
		print str$(ebx), 41, 13, 10	; 41=)
	.endif
  .endif
  add esp, 80		; discard brand buffer (after printing!)
  ; pushad stores EAX at [esp+28], so popad will load the SSE level into EAX
  mov [esp+32-4], ebx	; move ebx into eax stack position - returns eax to main for further use
  popad
  ret 4			; stdcall: callee removes the single DWORD argument
ShowCpu endp

^^ //End of Masm code


;//***********************************************************************************//


main()
{
   // mode 1: ShowCpu prints the brand string and the SSE level itself
   ShowCpu(1);

   // keep the console window open until a key is pressed
   system("pause");
}

Emil_halim · May 17, 2017, 04:20:02 AM

Hi All,

Here is another example from the PowerBasic Forum, written by Steve Hutchesson.

I have converted his example to C-- and Masm: the procedures were converted to Masm, and the main code to C--.

Code Select


/************************************* 
*         New Sphinx Cmm             *  
*                                    *
*     memory-copy-benchmarks         *
*                                    *
*      from powerbasic Forum         *          
*                                    *
*************************************/


#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization 

#pragma option dbg
#pragma option lst

//#pragma option upx-

#Entry  main
#includelib  win32.lib MSVCRT.lib ole32.lib 

// $ will replaced with SphinxC-- main path
#includepath "$\winlib"  
 
#include <windows.h>  
#include <MSVCRT.H-->

#pragma option ia

// Tells SphinxC about the masm copy routines SSEcopy and SSEcopy2
// (cdecl; arguments are source address, destination address, byte count).
extern {
   cdecl SSEcopy(dword src,dword dst,dword blen);
   cdecl SSEcopy2(dword src,dword dst,dword blen);
}
//start of Masm code
^^ 
  .MODEL flat, c   
  .nolist
.data

; pflead is only referenced by the commented-out prefetch lines in SSEcopy2
pflead dd 0

.code
;//  masm code taken from the next link and created by Steve Hutchesson. 
;//  https://forum.powerbasic.com/forum/user-to-user-discussions/powerbasic-inline-assembler/43459-memory-copy-benchmarks

; ---------------------------------------------------------------------
; SSEcopy(src, dst, blen) - cdecl
;   Copies blen bytes from src to dst: whole 64-byte blocks through
;   XMM0-XMM3 with non-temporal stores, then any remainder byte by byte.
;   src and dst must both be 16-byte aligned (movdqa / movntdq fault on
;   unaligned addresses).
;   ESI, EDI and EBX are callee-saved and are preserved through USES.
;   EAX, EDX and XMM0-XMM3 are clobbered. Nothing is returned.
; Fixes over the original:
;   - ESI/EDI/EBX are now preserved,
;   - a block count of zero (blen < 64) no longer wraps the loop counter,
;   - the remainder counter is blen - INDEX (it was INDEX - blen, which is
;     negative and never counted down to zero),
;   - sfence orders the non-temporal stores before returning.
; ---------------------------------------------------------------------
SSEcopy proc c uses esi edi ebx, src:DWORD,dst:DWORD,blen:DWORD

     mov esi, src
     mov edi, dst
     mov ebx, blen
     shr ebx, 6                        ; int divide ebx by 64 = whole block count
     xor edx, edx                      ; zero EDX and use as INDEX
     test ebx, ebx
     jz lbltail                        ; fewer than 64 bytes: skip the block loop

  align 4
  lbl0:
     movdqa xmm0, [esi+edx]            ; 16 byte aligned reads
     movdqa xmm1, [esi+edx+16]
     movdqa xmm2, [esi+edx+32]
     movdqa xmm3, [esi+edx+48]

     movntdq [edi+edx], xmm0           ; non temporal writes
     movntdq [edi+edx+16], xmm1
     movntdq [edi+edx+32], xmm2
     movntdq [edi+edx+48], xmm3
     
     add edx, 64                       ; add block copy size to INDEX

     sub ebx, 1                        ; decrement loop counter
     jnz lbl0

     sfence                            ; make the non temporal writes visible

  lbltail:
     mov ebx, blen                     ; test for remainder
     sub ebx, edx                      ; EBX = bytes still to copy (blen - INDEX)
     jz lbl2

  align 4
  lbl1:
     movzx eax, BYTE PTR [esi+edx]     ; copy remainder
     mov [edi+edx], al
     add edx, 1                        ; increment the INDEX
     sub ebx, 1                        ; decrement the loop counter
     jnz lbl1

  lbl2:
    ret 
SSEcopy endp

;// ---------------------------------------------------------------------
;// SSEcopy2(src, dst, blen) - cdecl
;//   Copies blen bytes from src to dst: whole 128-byte blocks through
;//   XMM0-XMM7 with non-temporal stores, then any remainder byte by byte.
;//   src and dst must both be 16-byte aligned (movdqa / movntdq fault on
;//   unaligned addresses).
;//   ESI, EDI and EBX are callee-saved and are preserved through USES.
;//   EAX, EDX and XMM0-XMM7 are clobbered. Nothing is returned.
;// Fixes over the original:
;//   - ESI/EDI/EBX are now preserved,
;//   - a block count of zero (blen < 128) no longer wraps the loop counter,
;//   - the remainder counter is blen - INDEX (it was INDEX - blen, which is
;//     negative and never counted down to zero),
;//   - sfence orders the non-temporal stores before returning.
;// ---------------------------------------------------------------------
SSEcopy2 proc c uses esi edi ebx, src:DWORD,dst:DWORD,blen:DWORD

     mov esi, src
     mov edi, dst
     mov ebx, blen
     shr ebx, 7                        ;// int divide ebx by 128 = whole block count
     xor edx, edx                      ;// zero EDX and use as INDEX
     test ebx, ebx
     jz lbltail                        ;// fewer than 128 bytes: skip the block loop

  align 4
  lbl0:
    ; prefetchnta BYTE PTR [esi+edx+pflead]
    ;//  prefetcht0 BYTE PTR [esi+edx+%pflead]
    ;//  prefetcht1 BYTE PTR [esi+edx+%pflead]
    ;// prefetcht2 BYTE PTR [esi+edx+%pflead]

     movdqa xmm0, [esi+edx]            ;// 16 byte aligned reads
     movdqa xmm1, [esi+edx+16]
     movdqa xmm2, [esi+edx+32]
     movdqa xmm3, [esi+edx+48]

     movdqa xmm4, [esi+edx+64]
     movdqa xmm5, [esi+edx+80]
     movdqa xmm6, [esi+edx+96]
     movdqa xmm7, [esi+edx+112]

     movntdq [edi+edx], xmm0           ;// non temporal writes
     movntdq [edi+edx+16], xmm1
     movntdq [edi+edx+32], xmm2
     movntdq [edi+edx+48], xmm3

     movntdq [edi+edx+64], xmm4
     movntdq [edi+edx+80], xmm5
     movntdq [edi+edx+96], xmm6
     movntdq [edi+edx+112], xmm7

     add edx, 128                      ;// add block copy size to INDEX

     sub ebx, 1                        ;// decrement loop counter
     jnz lbl0

     sfence                            ;// make the non temporal writes visible

  lbltail:
     mov ebx, blen                     ;// test for remainder
     sub ebx, edx                      ;// EBX = bytes still to copy (blen - INDEX)
     jz lbl2

  align 4
  lbl1:
     movzx eax, BYTE PTR [esi+edx]     ;// copy remainder
     mov [edi+edx], al
     add edx, 1                        ;// increment the INDEX
     sub ebx, 1                        ;// decrement the loop counter
     jnz lbl1

  lbl2:
    ret 
SSEcopy2 endp

^^ 
//End of Masm code


;//***********************************************************************************//

#define MEMLEN  1024*1024*129

// Benchmark: copy MEMLEN bytes 100 times with SSEcopy2 and print the
// elapsed GetTickCount() time.
// Fixes over the original:
//  - both buffers are over-allocated by 16 bytes and BOTH pointers are
//    rounded up to a 16-byte boundary (SSEcopy2 uses movdqa on the source
//    and movntdq on the destination; the original aligned only the source
//    and then read up to 15 bytes past the end of its allocation),
//  - the pass counter is a local variable instead of ECX, which the cdecl
//    calling convention allows the callee to overwrite,
//  - allocation failures are reported instead of being copied through.
main()
{
   dword hMem,tMem,aMem,aDst,tc,loops;
   
   hMem = GlobalAlloc(GMEM_FIXED | GMEM_ZEROINIT,MEMLEN+16); // 129 meg + alignment slack
   tMem = GlobalAlloc(GMEM_FIXED,MEMLEN+16);                 // 129 meg + alignment slack
  
   if( hMem == 0 ) || ( tMem == 0 )
   {
      puts("memory allocation failed");
   }
   else
   {
  // ----------------------------------------------
  // align the source to a 16 byte boundary
  // ----------------------------------------------
      mov eax, hMem
      add eax, 15
      and eax, -16
      mov aMem, eax

  // ----------------------------------------------
  // align the destination to a 16 byte boundary
  // ----------------------------------------------
      mov eax, tMem
      add eax, 15
      and eax, -16
      mov aDst, eax
      
      puts("please wait......");
      
      tc = GetTickCount();
      
      loops = 100;
      do
      {
          SSEcopy2(aMem,aDst,MEMLEN);
          loops--;
      } while( loops != 0 );

      tc = GetTickCount() - tc;
      
      printf("XMM copy 12.8 gig memory copy in %d ms\n",tc);
   }
     
   // release with the handles returned by GlobalAlloc, not the aligned pointers
   if( hMem != 0 ) GlobalFree(hMem);
   if( tMem != 0 ) GlobalFree(tMem);  
   system("pause");   
   
}

Emil_halim · June 17, 2017, 01:25:02 AM

Hi All,

This is pure SphinxC-- code.

It is the "shuffle array" example created by hutch and converted to C-- by me.

http://masm32.com/board/index.php?PHPSESSID=a641fd02d84ff11e8eb1ce754b88cbda&topic=5367.0

Code Select


/************************************* 
*         New Sphinx  Cmm            *  
*                                    *
*          shuffle array             *
*                                    *
*         from Masm Forum            *          
*                                    *
*************************************/



#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization 
#jumptomain NEAR

#parsecommandline TRUE 

#pragma option dbg
#pragma option lst

#pragma option upx-

#Entry __startupproc
#includelib  win32.lib MSVCRT.lib ole32.lib 

// $ will replaced with SphinxC-- main path
#includepath "$\winlib"  
 
#include <windows.h>  
#include <MSVCRT.H-->

#pragma option ia


// state of the generator; main() seeds it before the first call
dword nrandom_seed = 0;

// nrandom(rbase): advances nrandom_seed and returns (new seed) mod rbase
// in EAX, i.e. a value in 0 .. rbase-1.
// The step computes 16807*(seed mod 127773) - 2836*(seed / 127773), the
// Schrage form of a multiply by 16807 modulo 2^31-1.
// Clobbers ECX and EDX. rbase must not be zero: "div rbase" would raise a
// divide error.
nrandom Proc c rbase:DWORD
    mov eax, nrandom_seed

  // ****************************************
  // if the top bit of the seed is set, add 7fffffffh before dividing
    test eax, 80000000h
    jz  nxt
    add eax, 7fffffffh
  nxt:   
  // **************************************** 

    xor edx, edx            // EDX:EAX = seed for the unsigned divide
    mov ecx, 127773
    div ecx                 // EAX = seed / 127773, EDX = seed mod 127773
    mov ecx, eax            // ECX = quotient
    mov eax, 16807
    mul edx                 // EAX = 16807 * remainder (low 32 bits)
    mov edx, ecx            // EDX = quotient
    mov ecx, eax            // ECX = 16807 * remainder
    mov eax, 2836
    mul edx                 // EAX = 2836 * quotient (low 32 bits)
    sub ecx, eax            // ECX = new seed
    xor edx, edx
    mov eax, ecx
    mov nrandom_seed, ecx   // remember the new seed for the next call
    div rbase               // EDX = new seed mod rbase
    mov eax, edx            // return the remainder
    ret  
nrandom endp

// shuffle_array(arr, cnt): shuffles in place an array of cnt dwords
// starting at arr. For each index 0 .. cnt-1 it swaps the element with one
// at a random index returned by nrandom(cnt).
// EBX, ESI and EDI are preserved; EAX, ECX and EDX are clobbered.
// A cnt of zero now returns at once: the original passed it to nrandom
// (divide by zero) and let the loop counter wrap around.
shuffle_array proc arr:DWORD,cnt:DWORD

    LOCAL lcnt  :DWORD

    lcnt = cnt;             // copy cnt to lcnt

    cmp lcnt, 0
    je shuffle_done         // nothing to shuffle (taken before any push)

    push ebx,esi,edi

    mov esi, arr
    mov edi, arr
    xor ebx, ebx            // EBX = incremental index

  @@:
    invoke nrandom,cnt      // get the random number within "cnt" range
    mov ecx, [esi+ebx*4]    // get the incremental pointer
    mov edx, [edi+eax*4]    // get the random pointer
    mov [esi+ebx*4], edx    // write random pointer back to incremental location
    mov [edi+eax*4], ecx    // write incremental pointer back to random location
    add ebx, 1              // increment the original pointer
    sub lcnt, 1             // decrement the loop counter
    jnz @B

    pop edi,esi,ebx

  shuffle_done:
    ret

shuffle_array endp


// ltok(src, pArray): tokenises the zero-terminated text at src into lines.
//   src    - address of the text; it is modified in place (the CR or LF
//            that ends each line is overwritten with zero)
//   pArray - address of a dword that receives the address of a newly
//            allocated array of line pointers (GlobalAlloc, zero-filled)
// Returns the number of pointers written (non-blank lines) in EAX.
// Leading characters with codes of 32 or less are skipped, so blank lines
// produce no pointer.
// Clobbers ESI, EDI, ECX and EDX.
// Fix over the original: the "dec eax" before "ret" is removed. It returned
// the count minus one, so the caller lost the last line, and a one-line
// file was reported as zero lines.
ltok Proc c  src : DWORD, pArray : DWORD

    dword pTxt, pmem , bcnt;

  // ---------------------------------------------------------------
  // tokenise lines in a text source writing an array of pointers
  // to the address of "pArray" and returning the line count in EAX.
  //
  // The address written to the variable "pArray" should be released
  // within the same scope as the variable with a call to GlobalFree()
  // when the pointer array is no longer required.
  //
  // EXAMPLE
  // cnt = ltok(ptxt,ByVal VarPtr(harr))    ' tokenise source lines
  // dim tline(cnt) as ASCIIZ PTR at harr   ' treat it as an ASCIIZ PTR array.
  // .....
  // GlobalFree harr                        ' deallocate memory from "ltok"
  // ---------------------------------------------------------------
    pTxt =  src;

     mov edi, 1                      // set counter to 1 in case of no trailing CRLF

     mov esi, pTxt
     sub esi, 1
  // ----------------
  // count line feeds
  // ----------------
  @@:
     add esi, 1
     movzx edx, BYTE PTR [esi]
     test edx, edx                   // test for terminator
     jz @F
     cmp edx, 10                     // test for line feed
     jne @B
     add edi, 1                      // lf count in EDI
     jmp @B
  @@:
  // --------------------
  // multiply result by 4
  // --------------------
     add edi, edi
     add edi, edi
     mov bcnt, edi                   // byte size of the pointer array

     pmem = GlobalAlloc(GMEM_FIXED | GMEM_ZEROINIT,bcnt);

     mov edi, pmem                   // copy allocated memory address into EDI
     mov esi, pTxt
     xor eax, eax                    // zero arg counter
     sub esi, 1
     jmp Ftrim

  // ---------------------------------

  Terminate:
     mov BYTE PTR [esi], 0           // terminate end of current line

  Ftrim:                             // scan to find next acceptable character
     add esi, 1
     movzx edx, BYTE PTR [esi]       // zero extend byte
     test edx, edx                   // test for zero terminator
     jz Lout
     cmp edx, 32
     jbe Ftrim                       // scan again for 32 or less

  // ¤=÷=¤=÷=¤=÷=¤=÷=¤
     mov [edi], esi                  // write current location to pointer
     add edi, 4                      // set next pointer location
     add eax, 1                      // increment arg count return value
  // ¤=÷=¤=÷=¤=÷=¤=÷=¤

  Ttrim:                             // scan to find the next CR or LF
     add esi, 1
     movzx edx, BYTE PTR [esi]       // zero extend byte
     cmp edx, 13
     jg Ttrim                        //short loop on normal case

     je Terminate
     cmp edx, 10                     // extra test for ascii 10
     je Terminate
     test edx, edx
     jnz Ttrim                       // loop back if not zero, IE TAB.

  // ---------------------------------

  Lout:
     mov esi, pArray                 // load passed handle address into ESI
     mov ecx, pmem                   // local memory handle into ECX
     mov [esi], ecx                  // store local array handle at address of passed handle
     ret                             // return the line count
ltok endp



// FASTPROC  
// str_len: returns in EAX the length of the zero-terminated string whose
// address is the single dword argument at [esp+4]. The terminating zero is
// not counted. "ret 4" removes the argument, so the callee cleans the stack.
// Only EAX and the flags are modified.
.code

str_len:

    mov eax, [esp+4]        // EAX = start of string
    sub eax, 1              // step back so the loop can pre-increment
  lbl:
    add eax, 1
    cmp BYTE PTR [eax], 0
    jne lbl

    sub eax, [esp+4]        // EAX = address of terminator - start = length

    ret 4
.data


// Exist: TRUE when GetFileAttributes() accepts szFilePath, FALSE when it
// returns 0xffffffff.
BOOL Exist(char *szFilePath)
{
  dword attrs;

  attrs = GetFileAttributes(szFilePath);
  if (attrs == 0xffffffff) return FALSE;
  return TRUE;
}

// help: prints the program description followed by the command-line syntax.
help()
{
    // what the tool does
    puts( "\nSHFLARR : shuffle text file lines to random order\n" );

    // how to call it
    puts( "Syntax  : shflarr inputfile outputfile" );
}

// load_file(fname): reads the whole file named by fname into a malloc'ed
// buffer and returns its address. Returns NULL if the file cannot be
// opened, is empty, cannot be sized, or cannot be read completely.
// The caller releases the buffer with free().
// Fixes over the original:
//  - the extra byte allocated after the data is now set to zero, so the
//    buffer is a zero-terminated string (ltok scans for that terminator),
//  - the buffer is freed when the read fails (it used to leak),
//  - a failed ftell() or malloc() returns NULL instead of being used.
dword load_file(dword fname)
{
    unsigned long size;
    dword filehandle;
    dword buf;
    
    filehandle=fopen(fname,"rb");
	if(filehandle==0)return NULL;
	// obtain file size:
    fseek (filehandle , 0 , SEEK_END);
    size = ftell (filehandle);
    rewind (filehandle);
	if(size==0){
		fclose(filehandle);
		return NULL;
	}
	if(size==0xffffffff){           // ftell() reported an error (-1)
		fclose(filehandle);
		return NULL;
	}
	buf=malloc(size+1);             // +1 for the terminating zero
	if(buf==NULL){
		fclose(filehandle);
		return NULL;
	}
	
	if(fread(buf,1,size,filehandle)!= size)
	{
	    fclose(filehandle);
	    free(buf);
		return NULL;
	}
	fclose(filehandle);

	// terminate the text
	EAX = buf + size;
	DSBYTE[EAX] = 0;

	return buf;
}

//***********************************************************************************//

// shflarr inputfile outputfile
// Loads the input file, splits it into lines with ltok, shuffles the array
// of line pointers 100 times and writes the lines to the output file, each
// followed by CR LF.
// Fixes over the original:
//  - each line is followed by "\r\n"; the original wrote 2 bytes of the
//    1-character string "\n", i.e. a line feed plus a zero byte,
//  - a failed load_file() or fopen() is reported instead of being used,
//  - the shuffle and write loops are skipped when there are no lines (their
//    count-down counters would wrap around on a count of zero),
//  - lcnt is signed so that a non-positive count is recognised.
main()
{
 
   char *ifile,*ofile;
   dword psrc,parr,pstr,hFil;
   int lcnt;

   ifile = PARAMSTR(1);
   ofile = PARAMSTR(2);
   
   if(!Exist(ifile))
    {
        puts("Cannot find input file");
        help();
        @EXIT(0);
    }
   
   if(ofile==NULL)
    {
       puts("No output file specified");
       help();
       @EXIT(0);
    }
    
   
  // ----------------------------------------------
  // seed the random algo with a near unique number
  // ----------------------------------------------
    nrandom_seed = GetTickCount();
    
    psrc = load_file(ifile);
    if(psrc==NULL)
    {
        puts("Cannot read input file");
        @EXIT(0);
    }
    
  // -------------------------------
  // tokenise file into memory array
  // -------------------------------
    lcnt = ltok(psrc,#parr);  
    
    printf("Name of input file   = %s\n",ifile);
    printf("Text file line count = %d\n", lcnt);
    printf("Array shuffle count  = 100\n");
    printf("Name of output file  = %s\n",ofile);

    hFil = fopen(ofile,"wb");
    if(hFil==0)
    {
        puts("Cannot create output file");
    }
    else
    {
      if(lcnt > 0)
      {
  // -----------------
  // shuffle 100 times
  // -----------------
          mov esi, 100
      @@:
        shuffle_array(parr,lcnt);       // preserves ESI
          sub esi, 1
          jnz @B
  // -----------------

  // --------------------
  // write result to disk
  // --------------------
          mov esi, lcnt                 // ESI = lines left to write
          mov ebx, parr                 // EBX = current slot in the pointer array
      @@:
          mov eax, [ebx]
          mov pstr, eax

        fwrite(pstr,1,str_len(pstr),hFil);    // the line of text
        fwrite("\r\n",1,2,hFil);              // crlf
    
          add ebx, 4
          sub esi, 1
          jnz @B
      }

      fclose(hFil);
    }
  // --------------------

  // ------------------------------------------
  // free the memory allocated in the tokeniser
  // ------------------------------------------
    GlobalFree( parr );
    free(psrc);
}

hutch-- · June 17, 2017, 01:28:58 AM

Emil_halim · June 19, 2017, 02:56:27 AM

Hi All,

Example from msdn microsoft.

Code Select


/************************************* 
*         New Sphinx  Cmm            *  
*                                    *
*     sample multithread program     *
*                                    *
*         from msdn microsoft        *          
*                                    *
*************************************/

// https://msdn.microsoft.com/en-us/library/esszf9hw.aspx

//  Bounce - Creates a new thread each time the letter 'a' is typed.  
//  Each thread bounces a happy face of a different color around  
//  the screen. All threads are terminated when the letter 'Q' is  
//  entered.  

#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization 

#pragma option dbg
#pragma option lst

#pragma option upx-


#includelib  win32.lib MSVCRT.lib ole32.lib 


// $ will replaced with SphinxC-- main path
#includepath "$\winlib"  
 
#include <windows.h>  
#include <MSVCRT.H-->   


#pragma option ia

#define MAX_THREADS  32  
  
// The function getrandom returns a random number between   
// lo and hi (inclusive), which must be in integer range.  
// It replaces the original statement macro
//   EAX=max; EAX++; EAX-=min; AX = rand() % EAX; AX+=min;
// which could not be used as a value: "X = getrandom(a,b)" expanded to
// "X = EAX=b; ..." so X received b and the rand() result was discarded,
// and the range held in EAX was overwritten by the return value of rand().
// If hi < lo the range is empty and lo is returned.
int getrandom( int lo, int hi )
{
    int span;

    span = hi - lo + 1;             // number of values in lo .. hi
    if( span <= 0 ) return lo;      // empty range: avoid dividing by zero
    return rand() % span + lo;
}
  
// ---- forward declarations of the functions defined below ----
int main( void );                    // Thread 1: main   
void KbdFunc( void  );               // Keyboard input, thread dispatch  
void BounceProc( dword  MyID );      // Threads 2 to n: display   
void ClearScreen( void );            // Screen clear   
void ShutDown( void );               // Program shutdown   
void WriteTitle( int ThreadNum );    // Display title bar information   
  
// ---- state shared by all threads ----
HANDLE  hConsoleOut;                 // Handle to the console   
HANDLE  hRunMutex;                   // "Keep Running" mutex   
HANDLE  hScreenMutex;                // "Screen update" mutex  
int     ThreadNr;                    // Number of threads started   
CONSOLE_SCREEN_BUFFER_INFO csbiInfo; // Console information  
  
int main() // Thread One   
{  
    // No bouncing threads exist yet.  
    ThreadNr = 0;  
  
    // Query the console once, then start from a blank screen and title.  
    hConsoleOut = GetStdHandle( STD_OUTPUT_HANDLE );  
    GetConsoleScreenBufferInfo( hConsoleOut, #csbiInfo );  
    ClearScreen();  
    WriteTitle( 0 );  
  
    // Screen mutex starts unowned; run mutex starts owned by this thread.  
    hScreenMutex = CreateMutex( NULL, FALSE, NULL );  
    hRunMutex = CreateMutex( NULL, TRUE, NULL );  
  
    // Dispatch threads from the keyboard until the user quits.  
    KbdFunc();  
  
    // Everything is finished: release the handles.  
    CloseHandle( hScreenMutex );  
    CloseHandle( hRunMutex );  
    CloseHandle( hConsoleOut );  
}  
  
void ShutDown( void ) // Shut down threads   
{  
    // One ReleaseMutex per started thread, counting ThreadNr down to zero.  
    if ( ThreadNr > 0 )  
    {  
        do  
        {  
            ReleaseMutex( hRunMutex );  
            ThreadNr--;  
        } while ( ThreadNr > 0 );  
    }  
  
    // Take the screen for good and leave it blank.  
    WaitForSingleObject( hScreenMutex, INFINITE );  
    ClearScreen();  
}  
  
// KbdFunc: polls the keyboard. While 'A' is down and fewer than MAX_THREADS
// threads exist it starts one more BounceProc thread; it returns through
// ShutDown() once 'Q' is down.
// Fixes over the original:
//  - the thread parameter is the VALUE of ThreadNr. The original passed
//    #ThreadNr, the address of the shared counter, so every thread received
//    the same number as its ID,
//  - the thread handle is closed as soon as the thread is started, because
//    nothing uses it afterwards; a failed CreateThread undoes the count,
//  - the quit test is computed as (key state & 0x8000) == 0 in two steps.
//    The original "!GetAsyncKeyState(VK_Q) & 0x8000" depends on whether '!'
//    is applied before or after the mask.
void KbdFunc( void ) // Dispatch and count threads.  
{  
    dword       tID;
    dword       hThread;
    dword       quitKey;
    do  
    {   
        if ( GetAsyncKeyState(VK_A)  & 0x8000  ) && ( ThreadNr < MAX_THREADS )  
        {  
            ThreadNr++;  
            hThread = CreateThread(0,0,#BounceProc,ThreadNr,0,#tID);
            if ( hThread != 0 )
            {
                CloseHandle( hThread );     // the thread keeps running
            }
            else
            {
                ThreadNr--;                 // no thread was started
            }
            WriteTitle( ThreadNr ); 
            Sleep(100); 
        }  
        quitKey = GetAsyncKeyState(VK_Q) & 0x8000;   // non-zero while 'Q' is down
    } while( quitKey == 0 );  
  
    ShutDown();  
}  
  
// BounceProc: body of each bouncing-face thread.
//   pMyID - the thread parameter given to CreateThread; it is used as the
//           thread number (random seed, face type and colour).
// Picks a start position and a step, then repeatedly erases its previous
// face (if still there), draws the face at the new position and advances,
// reversing direction at the screen edges. The loop ends as soon as
// waiting on hRunMutex returns anything other than WAIT_TIMEOUT.
// Fix over the original: the bottom edge test is ">=" like the right edge
// test. Valid rows are 0 .. dwSize.Y-1, so the original ">" let the face
// be placed one row below the screen buffer.
void BounceProc( dword pMyID )  
{  
    char    MyCell, OldCell;  
    WORD    MyAttrib, OldAttrib;  
    char    BlankCell;  
    COORD   Coords, Delta;  
    COORD   Old;  
    DWORD   Dummy;  
    dword   MyID;  
  
    BlankCell = 0x20; 
    Old.X = Old.Y = 0;  
    MyID = pMyID;
    
    // Generate update increments and initial   
    // display coordinates.  
    srand( MyID * 3 );  
  
    Coords.X = getrandom( 0, csbiInfo.dwSize.X - 1 );  
    Coords.Y = getrandom( 0, csbiInfo.dwSize.Y - 1 );  
    Delta.X = getrandom( -3, 3 );  
    Delta.Y = getrandom( -3, 3 );  
  
    // Set up "happy face" & generate color   
    // attribute from thread number.  
    if( MyID > 16)  
        MyCell = 0x01;          // outline face   
    else  
        MyCell = 0x02;          // solid face   
    MyAttrib =  MyID & 0x0F;   // force black background   
  
    do  
    {  
        // Wait for display to be available, then lock it.  
        WaitForSingleObject( hScreenMutex, INFINITE );  
  
        // If we still occupy the old screen position, blank it out.   
        // (a COORD is passed by value as one dword, hence DSDWORD[#...])  
        ReadConsoleOutputCharacter( hConsoleOut, #OldCell, 1, DSDWORD[#Old], #Dummy );  
        ReadConsoleOutputAttribute( hConsoleOut, #OldAttrib, 1, DSDWORD[#Old], #Dummy );  
        if (( OldCell == MyCell ) && (OldAttrib == MyAttrib))  
            WriteConsoleOutputCharacter( hConsoleOut, #BlankCell, 1, DSDWORD[#Old], #Dummy );  
  
        // Draw new face, then clear screen lock   
        WriteConsoleOutputCharacter( hConsoleOut, #MyCell, 1, DSDWORD[#Coords], #Dummy );  
        WriteConsoleOutputAttribute( hConsoleOut, #MyAttrib, 1, DSDWORD[#Coords], #Dummy );  
        ReleaseMutex( hScreenMutex );  
  
        // Increment the coordinates for next placement of the block.   
        Old.X = Coords.X;  
        Old.Y = Coords.Y;  
        Coords.X += Delta.X;  
        Coords.Y += Delta.Y;  
  
        // If we are about to go off the screen, reverse direction   
        if( Coords.X < 0 ) || ( Coords.X >= csbiInfo.dwSize.X )  
        {  
            Delta.X = -Delta.X;  
            Beep( 400, 50 );  
        }  
        if( Coords.Y < 0 ) || ( Coords.Y >= csbiInfo.dwSize.Y )  
        {  
            Delta.Y = -Delta.Y;  
            Beep( 600, 50 );  
        }  
    }  
    // Repeat while RunMutex is still taken.   
    while ( WaitForSingleObject( hRunMutex, 75L ) == WAIT_TIMEOUT );  
}  
  
// WriteTitle: shows the running-thread count and the key help in the
// console title bar.
void WriteTitle( int ThreadNum )  
{  
    char  titleText[80];            // formatted title line  
  
    sprintf( #titleText, "Threads running: %02d.  Press 'A' " "to start a thread,'Q' to quit.", ThreadNum );  
    SetConsoleTitle( #titleText );  
}  
  
// ClearScreen: fills the whole screen buffer (width * height cells, sizes
// taken from csbiInfo) with blanks, starting at the top-left cell.
void ClearScreen( void )  
{  
    COORD    topLeft;  
    DWORD    written;               // cell count reported back by the API, unused  
  
    topLeft.Y = 0;  
    topLeft.X = 0;  
    FillConsoleOutputCharacter( hConsoleOut, ' ', csbiInfo.dwSize.X * csbiInfo.dwSize.Y, DSDWORD[#topLeft], #written );  
}

Emil_halim · June 26, 2017, 12:15:29 AM

Hi all

Here is the CMM version of Michael Webster's code timing macros: http://masm32.com/board/index.php?topic=49.0

I have converted them to CMM, but there was a small problem when I tried to do that.

The problem is how to place the code under test in between the two CMM macros and have it work just as it does with the masm macros. I used a trick: at the end of the first macro I put this code

Code Select


        call label                       /* pushes the address of @label on the stack   */
    @label:                              /* Start test loop                             */    
      ?aligncode  16                     /* Optimal loop alignment for P6               */

It pushes the address of the label onto the stack. At the start of the second macro I put this code

Code Select


         dec  __counter__loop__counter__   // one pass done
        jz    @F                           // counter reached zero: leave the loop
        jmp DSDWORD[ESP];                  // otherwise jump to the address pushed by "call label"
     
      @@:
        ESP += 4;                          // discard the pushed address

So, as long as the counter has not reached zero, it jumps back to the label at the end of the first macro.

Here is the macros [ cntrcmm.inc ]
============

Code Select


/************************************* 
*         New Sphinx  Cmm            *  
*                                    *
*             counter                *
*                                    *
*         from Masm Forum            *          
*                                    *
*************************************/

//Michael Webster's code timing macros
//http://masm32.com/board/index.php?topic=49.0
 
/*   ---------------------------------------------------------------------
  ; These two macros perform the grunt work involved in measuring the
  ; processor clock cycle count for a block of code. These macros must
  ; be used in pairs, and the block of code must be placed in between
  ; the counter_begin and counter_end macro calls. The counter_end macro
  ; returns the clock cycle count for a single pass through the block of
  ; code, corrected for the test loop overhead, in EAX.
  ;
  ; These macros require a .586 or higher processor directive.
  ;
  ;
  ; The loopcount parameter should be set to a relatively high value to
  ; produce repeatable results.
  ;
  ; Note that setting the priority parameter to REALTIME_PRIORITY_CLASS
  ; involves some risk, as it will cause your process to preempt *all*
  ; other processes, including critical Windows processes. Setting the
  ; priority parameter to HIGH_PRIORITY_CLASS instead will significantly
  ; reduce the risk, and in most cases will produce the same cycle count.
  ; --------------------------------------------------------------------- */

#pragma option ia

// values saved by _counter_begin / _timer_begin and used again by the matching end macro
dword   _loop_count_;                 // requested number of passes
dword   _process_priority_class_;     // priority class to restore
int     _thread_priority_;            // thread priority to restore

dword __counter__loop__counter__=0;   // passes left in the loop currently running
// tmp1: cycles used by the empty reference loop; tmp2: TSC at the start of
// the test loop; __counter__qword__count__: final result
qword tmp1,tmp2,__counter__qword__count__;

// counter_begin(n): passes the loop count in EAX, then expands _counter_begin inline
#define counter_begin( arg )  EAX = arg; _counter_begin(); 
// _counter_begin: first half of the cycle counter pair.
// In: EAX = loop count.
// 1. saves the process priority class and thread priority, then raises both;
// 2. times an empty reference loop of _loop_count_ passes and keeps the
//    elapsed TSC count in tmp1 (the loop overhead);
// 3. stores the starting TSC in tmp2, reloads the pass counter and pushes
//    the address of @label, so that counter_end can jump back to it.
// On exit one extra dword (that address) is on the stack; counter_end pops it.
// NOTE(review): cpuid also writes EBX, ECX and EDX and nothing here saves
// them - confirm the surrounding code does not keep live values there.
inline _counter_begin(  )                                                         
{
      
      _loop_count_ = EAX;                                                           
      _process_priority_class_ = GetPriorityClass(GetCurrentProcess());                    
      _thread_priority_ = GetThreadPriority(GetCurrentThread());                            
      SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);                      
      SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);                
                                                                                           
        xor   eax, eax                   /* Use same CPUID input value for each call*/     
        cpuid                            /* Flush pipe & wait for pending ops to finish*/  
        rdtsc                            /* Read Time Stamp Counter*/                      
                                                                                           
        tmp1 = EDX:EAX;                  /* start count of the reference loop */
       __counter__loop__counter__ = _loop_count_;                                       
        xor   eax, eax                                                                     
        cpuid                            /* Make sure loop setup instructions finish */    
      ?aligncode  16                     /* Optimal loop alignment for P6  */ 
        call Lab                         /* push the address of @Lab */
      @Lab:                                /* Start an empty reference loop  */ 
                   
        dec   __counter__loop__counter__                                                
        jz    @F
        jmp DSDWORD[ESP];                /* loop back to @Lab through the pushed address */
     
      @@:
        ESP += 4;                        /* drop the pushed address */                                                 
                                                                                           
        xor   eax, eax                                                                     
        cpuid                            /* Make sure loop instructions finish  */           
        rdtsc                            /* Read end count  */                               
        
        EDX:EAX -= tmp1;
        tmp1 = EDX:EAX;                  /* tmp1 = cycles used by the empty loop */
                                                                                            
        xor   eax, eax                                                                     
        cpuid                                                                              
        rdtsc                                                                              
        tmp2 = EDX:EAX;                  /* start count of the test loop */
        __counter__loop__counter__ = _loop_count_;                                            
        xor   eax, eax                                                                     
        cpuid                            /* Make sure loop setup instructions finish    */   
        call label                       /* push the address of @label for counter_end  */
    @label:                              /* Start test loop                             */    
      ?aligncode  16                     /* Optimal loop alignment for P6               */   
}                                                                                        


// counter_end: second half of the cycle counter pair; place it directly
// after the code under test.
// Loops back to the address left on the stack by _counter_begin until the
// pass counter reaches zero, then computes
//   (end TSC - tmp2 - tmp1) / _loop_count_
// restores the saved priorities and returns the result in EAX.
// finit resets the FPU before the division.
inline counter_end()
{  
        dec  __counter__loop__counter__
        jz    @F
        jmp DSDWORD[ESP];                // next pass: jump to the pushed label address
     
      @@:
        ESP += 4;                        // drop the pushed label address
        xor   eax, eax
        cpuid                            // Make sure loop instructions finish
        rdtsc                            // Read end count
        EDX:EAX -= tmp2;                 // cycles used by the whole test loop

        __counter__qword__count__ = EDX:EAX - tmp1;   // minus the empty-loop overhead
       

   SetPriorityClass(GetCurrentProcess(),_process_priority_class_);    
   SetThreadPriority(GetCurrentThread(),_thread_priority_);  
   
        finit
        fild  DSQWORD [# __counter__qword__count__ ]   // total corrected cycles
        fild  dword [# _loop_count_ ]                  // number of passes
        fdiv                                           // cycles per pass
        fistp dword [# __counter__qword__count__ ]     // store as integer in the low dword

        mov   eax, dword [# __counter__qword__count__ ]    
}

//--------------------------------------------------------------------------------------

 /* ---------------------------------------------------------------------
  ; These two macros perform the grunt work involved in measuring the
  ; execution time in milliseconds for a specified number of loops
  ; through a block of code. These macros must be used in pairs, and
  ; the block of code must be placed in between the timer_begin and
  ; timer_end macro calls. The timer_end macro returns the elapsed
  ; milliseconds for the entire loop in EAX.
  ;
  ; These macros utilize the high-resolution performance counter.
  ; The return value will be zero if the high-resolution performance
  ; counter is not available.
  ;
  ; The loopcount parameter should be set to a relatively high value to
  ; produce repeatable results.
  ;
  ; Note that setting the priority parameter to REALTIME_PRIORITY_CLASS
  ; involves some risk, as it will cause your process to preempt *all*
  ; other processes, including critical Windows processes. Setting the
  ; priority parameter to HIGH_PRIORITY_CLASS instead will significantly
  ; reduce the risk, and in most cases will produce very nearly the same
  ; result.
  ; --------------------------------------------------------------------- */
  
  // Working storage shared by _timer_begin() and timer_end():
  //   __timer__pc__frequency__  64-bit value filled in by QueryPerformanceFrequency
  //   __timer__pc__count__      64-bit value filled in by QueryPerformanceCounter,
  //                             then reduced in place to the elapsed count
  //   __timer__loop__counter__  remaining iterations of the loop being timed
  //   __timer__dw_count__       32-bit scratch holding the final millisecond result
  __timer__pc__frequency__  :  dq 0
  __timer__pc__count__      :  dq 0
  __timer__loop__counter__  :  dd 0
  __timer__dw_count__       :  dd 0
  
  // timer_begin( arg ): loads the loop count into EAX and expands _timer_begin().
  // Must be paired with timer_end(); the code to be timed goes in between.
  //
  // Stack layout left for timer_end() (top of stack first):
  //   loop-start address, start count low, start count high,
  //   overhead count low, overhead count high.
  //
  // NOTE(review): when QueryPerformanceFrequency returns 0 nothing is pushed
  // and the test loop is not set up, yet timer_end() still executes
  // "jmp DSDWORD[ESP]" / "ESP += 4" - confirm this path cannot occur on the
  // target systems.
  // NOTE(review): the priority is hard-coded to REALTIME_PRIORITY_CLASS here;
  // the header comment above mentions a priority parameter this port does not take.
  #define timer_begin( arg )  EAX = arg; _timer_begin(); 
  inline _timer_begin()
  {
        _loop_count_ = EAX;
        QueryPerformanceFrequency( # __timer__pc__frequency__ );
        if( EAX != 0 )
         {
             // Remember the current priorities so timer_end() can restore them,
             // then raise both for the duration of the measurement.
             _process_priority_class_ = GetPriorityClass(GetCurrentProcess());                    
             _thread_priority_ = GetThreadPriority(GetCurrentThread());                            
             SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);                      
             SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);       
             // Count at the start of the empty reference loop, pushed high dword first.
             QueryPerformanceCounter( # __timer__pc__count__ );
             push    DSDWORD [#__timer__pc__count__ + 4]
             push    DSDWORD [#__timer__pc__count__]

             DSDWORD [#__timer__loop__counter__] = _loop_count_;
             // "call Lab" pushes the address of the loop head so the loop can
             // branch back through [ESP].
             call Lab
           @Lab:  
          ?aligncode 8             // Optimal loop alignment for P6
                                   // Start an empty reference loop
             sub   DSDWORD [#__timer__loop__counter__], 1
             jz    @F
             goto  DSDWORD[ESP];
         @@:
             // Drop the loop-head address pushed by "call Lab".
             ESP += 4;

             QueryPerformanceCounter( # __timer__pc__count__ );

             // 64-bit subtract (sub/sbb): count now = time taken by the empty loop.
             pop   ecx           // Recover low-order 32 bits of start count
             sub   DSDWORD [# __timer__pc__count__ ], ecx
             pop   ecx           // Recover high-order 32 bits of start count
             sbb   DSDWORD [# __timer__pc__count__ + 4 ], ecx

             push   DSDWORD [# __timer__pc__count__ + 4 ] // Overhead count
             push   DSDWORD [# __timer__pc__count__ ]     // Overhead count

             QueryPerformanceCounter( # __timer__pc__count__ );

             push   DSDWORD [# __timer__pc__count__ + 4 ] // Start count
             push   DSDWORD [# __timer__pc__count__ ]     // Start count

             // Reload the counter and push the address of the test-loop head;
             // timer_end() branches back through [ESP].
             DSDWORD [#__timer__loop__counter__] = _loop_count_;
             call label
       @label:                      // Start test loop
         ?aligncode 16              // Optimal loop alignment for P6
         }
   }
   
   // Closes the timed region opened by timer_begin().
   // Repeats the test code until __timer__loop__counter__ reaches zero, then
   // computes (elapsed count - overhead count) / frequency * 1000 and returns
   // the result (milliseconds for the whole loop) in EAX.
   //
   // NOTE(review): if QueryPerformanceFrequency returns 0 here, the four
   // dwords pushed by _timer_begin() are never popped - confirm this path
   // cannot occur on the target systems.
   inline timer_end()
   {
            // One more pass done; while the counter is non-zero, jump back to
            // the test-loop head whose address is on top of the stack.
            dec  DSDWORD [#__timer__loop__counter__]
            jz    @F
            jmp DSDWORD[ESP];
     
         @@:
            // Drop the test-loop head address pushed by "call label".
            ESP += 4;
            
            QueryPerformanceFrequency( # __timer__pc__frequency__ );
          if( EAX != 0)
           {
               // End count minus start count, then minus the empty-loop
               // overhead; both are 64-bit subtractions done with sub/sbb.
               QueryPerformanceCounter( # __timer__pc__count__ );
               pop   ecx           // Recover low-order 32 bits of start count
               sub   DSDWORD [# __timer__pc__count__ ], ecx
               pop   ecx           // Recover high-order 32 bits of start count
               sbb   DSDWORD [# __timer__pc__count__ + 4 ], ecx
               pop   ecx           // Recover low-order 32 bits of overhead count
               sub   DSDWORD [# __timer__pc__count__ ], ecx
               pop   ecx           // Recover high-order 32 bits of overhead count
               sbb   DSDWORD [# __timer__pc__count__ + 4 ], ecx
            }
   
            // Put back the priorities saved by _timer_begin().
            SetPriorityClass(GetCurrentProcess(),_process_priority_class_);    
            SetThreadPriority(GetCurrentThread(),_thread_priority_);  
            
            // count / frequency gives seconds; multiplying by 1000 gives milliseconds.
            finit
            fild DSQWORD[# __timer__pc__count__]
            fild DSQWORD[# __timer__pc__frequency__]
            fdiv
            mov   DSDWORD[#__timer__dw_count__], 1000
            fild  dword [# __timer__dw_count__]
            fmul
            fistp dword [# __timer__dw_count__ ]
            // Return value: elapsed milliseconds, in EAX.
            mov   eax, [# __timer__dw_count__]
   }

Here is a test program
===============

Code Select


/************************************* 
*         New Sphinx  Cmm            *  
*                                    *
*             counter                *
*                                    *
*         from Masm Forum            *          
*                                    *
*************************************/

//Michael Webster's code timing macros
//http://masm32.com/board/index.php?topic=49.0

#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization 

#pragma option dbg
#pragma option lst

#pragma option upx-

#includelib  win32.lib MSVCRT.lib ole32.lib 

// $ will replaced with SphinxC-- main path
#includepath "$\winlib"  
 
#include <windows.h>  
#include <MSVCRT.H-->

#include "cntrcmm.inc"

//#include <math64.h-->


#define LOOP_COUNT   10000000 

// Test driver for the counter_* and timer_* timing macros.
// Pins the process to the first CPU, prints the process/system affinity
// masks, then times the same two-instruction sequence (mov eax,2 / cpuid)
// once in clock cycles per loop and once in milliseconds for the whole loop.
main()
{
      // Affinity masks are DWORD_PTR values, i.e. 32 bits in this Win32
      // build, so dword storage matches what GetProcessAffinityMask writes
      // (64-bit locals would leave their upper halves uninitialised).
      dword pam, sam;
      
      SetProcessAffinityMask(GetCurrentProcess(), 1);
      GetProcessAffinityMask(GetCurrentProcess(), #pam, #sam);
      printf("%u\t%u\n", pam, sam);  


            counter_begin(LOOP_COUNT);  
           
                mov    eax, 2
                cpuid
          
            counter_end();
            
            // counter_end leaves the cycle count per loop in EAX.
            printf("the process takes: %d cycles\n", EAX );  
            
            timer_begin(LOOP_COUNT);
            
                mov    eax, 2
                cpuid
                
            timer_end();
            
            // timer_end leaves the elapsed milliseconds for the whole loop
            // in EAX, so the unit printed here is ms, not cycles.
            printf("the process takes: %d ms\n", EAX );  
            
      system("pause");
}

Emil_halim · June 27, 2017, 02:11:47 AM

Hi All

MOV vs PUSH ticks comparison from http://masm32.com/board/index.php?topic=6324.0

This converted example shows how easy it is to convert MASM code to NewSphinxCmm and then use CMM's stunning features.

Code Select


/************************************* 
*         New Sphinx  Cmm            *  
*                                    *
*             timeit                *
*                                    *
*         from Masm Forum            *          
*                                    *
*************************************/
/*
;	MOV vs PUSH ticks comparison
;	MOV wins only for 7 strikes. After that,
;	PUSH wins.

http://masm32.com/board/index.php?topic=6324.0

*/

#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization 

#pragma option dbg
#pragma option lst

#pragma option upx-

#includelib  win32.lib MSVCRT.lib ole32.lib 

// $ will replaced with SphinxC-- main path
#includepath "$\winlib"  
 
#include <windows.h>  
#include <MSVCRT.H-->

#pragma option ia

    // Number of outer timing iterations used by every benchmark section.
    loops equ 50000000  //50 mil
 
// printf format strings: disply1/disply2 report elapsed GetTickCount ticks for
// the MOV and PUSH variants; disply3 echoes the MainLoops argument of mytest().
disply1: db "MOV  = %u ticks\n",0ah,0
disply2: db "PUSH = %u ticks\n",0ah,0
disply3: db "%u*2*6\n",0dh,0ah,0

// Benchmarks saving/restoring six registers with MOV against PUSH/POP.
//
// MainLoops - controls how many save blocks, followed by as many restore
//             blocks, run inside each of the `loops` outer iterations.
//             Each inner .repeat runs until "dec MainLoop" goes negative,
//             i.e. MainLoops+1 times (assuming MINUSFLAG tests the sign flag).
//
// Prints MainLoops (disply3), then the elapsed GetTickCount ticks of the MOV
// variant (disply1) and of the PUSH variant (disply2).
// ESI holds the start tick and ECX the outer loop counter; both are among
// the registers being saved and restored, so they survive each iteration.
// NOTE(review): ESP moves inside the loops while MainLoop is accessed, so
// the local presumably lives at an EBP-relative address - confirm.
mytest(dword MainLoops)
{
   dword MainLoop;
   
 //;;===========MOV==============
	push	MainLoops
	push	offset disply3
	call 	printf
	add	esp,8
	cpuid
	call	GetTickCount
	mov	esi,eax
	mov	ecx,loops
@@:
	MainLoop = MainLoops;
	// Save phase: reserve 24 bytes and store the six registers with MOV.
	.repeat
		sub	esp,4*6
		mov	[esp],eax
		mov	[esp+4],ebx
		mov	[esp+8],ecx
		mov	[esp+12],edx
		mov	[esp+16],edi
		mov	[esp+20],esi
		dec	MainLoop
	.until MINUSFLAG 
	MainLoop = MainLoops;
	// Restore phase: reload the six registers and release the 24 bytes.
	.repeat
		mov	eax,[esp]
		mov	ebx,[esp+4]
		mov	ecx,[esp+8]
		mov	edx,[esp+12]
		mov	edi,[esp+16]
		mov	esi,[esp+20]
		add	esp,4*6
		dec	MainLoop
	.until MINUSFLAG 
	sub	ecx,1
	jnz	@B
	// Elapsed ticks = current tick count - start tick count (ESI).
	call	GetTickCount
	sub	eax,esi
	push	eax
	push	offset disply1
	call 	printf
	add	esp,8

//;;============PUSH==============
	cpuid
	call	GetTickCount
	mov	esi,eax
	mov	ecx,loops

@@:
	MainLoop = MainLoops;
	// Save phase: same six registers, this time with PUSH.
	.repeat
		push	eax
		push	ebx
		push	ecx
		push	edx
		push	edi
		push	esi
		dec	MainLoop
	.until MINUSFLAG 
	MainLoop = MainLoops;
	// Restore phase: POP in reverse order.
	.repeat
		pop	esi
		pop	edi
		pop	edx
		pop	ecx
		pop	ebx
		pop	eax
		dec	MainLoop
	.until MINUSFLAG 

	sub	ecx,1
	jnz	@B
	call	GetTickCount
	sub	eax,esi
	push	eax
	push	offset disply2
	call 	printf
	add	esp,8

}

// Entry point of the MOV-vs-PUSH benchmark.
// Runs mytest() for several MainLoops values, then times a seven-register
// (EAX, EBX, ECX, EDX, EDI, ESI, EBP) save/restore once with MOV and once
// with PUSH/POP, printing the elapsed GetTickCount ticks of each.
main()
{   
     
    mytest( 1  );
	mytest( 1  );
	mytest( 2  );
	mytest( 2  );
	mytest( 4  );
	mytest( 8  );
	mytest( 16 );
	mytest( 32 );
     
//;;===========MOV==============
	 cpuid
	 call	GetTickCount
	 mov	esi,eax              // ESI = start tick count
	 mov	ecx,loops            // ECX = outer loop counter

@@:
	 // Reserve 28 bytes and store the seven registers with MOV ...
	 sub	esp,4*7
	 mov	[esp],eax
	 mov	[esp+4],ebx
	 mov	[esp+8],ecx
	 mov	[esp+12],edx
	 mov	[esp+16],edi
	 mov	[esp+20],esi
	 mov	[esp+24],ebp

	 // ... then reload them and release the space.
	 mov	eax,[esp]
	 mov	ebx,[esp+4]
	 mov	ecx,[esp+8]
	 mov	edx,[esp+12]
	 mov	edi,[esp+16]
	 mov	esi,[esp+20]
	 mov	ebp,[esp+24]
	 add	esp,4*7

	 sub	ecx,1
	 jnz	@B
	 // Elapsed ticks = current tick count - start tick count (ESI).
	 call	GetTickCount
	 sub	eax,esi
	 push	eax
	 push	offset disply1
	 call	printf
	 add	esp,8

//;;============PUSH==============
	 cpuid
	 call	GetTickCount
	 mov	esi,eax
	 mov	ecx,loops

@@:
	 // Same seven registers saved with PUSH and restored with POP in reverse order.
	 push	eax
	 push	ebx
	 push	ecx
	 push	edx
	 push	edi
	 push	esi
	 push	ebp

	 pop	ebp
	 pop	esi
	 pop	edi
	 pop	edx
	 pop	ecx
	 pop	ebx
	 pop	eax

	 sub	ecx,1
	 jnz	@B
	 call	GetTickCount
	 sub	eax,esi
	 push	eax
	 push	offset disply2
	 call	printf
	 add	esp,8

     system("pause");
}

Emil_halim · June 28, 2017, 02:47:09 AM

Hi all,

The code is still close to MASM syntax.

This part of the code was ported to CMM from http://masm32.com/board/index.php?topic=4940.msg53093#msg53093

Code Select


/************************************* 
*         New Sphinx  Cmm            *  
*                                    *
*          CPU detection             *
*                                    *
*         from Masm Forum            *          
*                                    *
*************************************/

/* 
   this is part of Fast memory allocation
   
   written by nidud , converted by Me.
   
   http://masm32.com/board/index.php?topic=4940.msg53093#msg53093   
*/

#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization 

#pragma option dbg
#pragma option lst

#pragma option upx-

#includelib  win32.lib MSVCRT.lib ole32.lib 

// $ will replaced with SphinxC-- main path
#includepath "$\winlib"  
 
#include <windows.h>  
#include <MSVCRT.H-->

#pragma option ia

// Feature bits stored in sselevel, one bit per instruction-set level.
// The bit positions match the order in which main() shifts the CPUID
// feature bits into EAX (MMX ends up in bit 0, AVX2 in bit 9).
// SSE_AVXOS (bit 10) is set separately after the XGETBV check.
SSE_MMX		equ 00000000001B
SSE_SSE		equ 00000000010B
SSE_SSE2	equ 00000000100B
SSE_SSE3	equ 00000001000B
SSE_SSSE3	equ 00000010000B
SSE_SSE41	equ 00000100000B
SSE_SSE42	equ 00001000000B
SSE_XGETBV	equ 00010000000B
SSE_AVX		equ 00100000000B
SSE_AVX2	equ 01000000000B
SSE_AVXOS	equ 10000000000B


// Bit mask of detected SSE_* features; filled in by main().
dword sselevel=0;

main()
{
//-------------------------------------------------------------------------------
//  CPU detection
//
//  Builds the SSE_* feature mask in sselevel from CPUID, adds SSE_AVXOS when
//  XGETBV reports that the OS saves both SSE (XMM) and AVX (YMM) state,
//  aborts if SSE2 is missing, then prints the processor brand string and the
//  highest instruction-set level found.
//
//  NOTE(review): CPUID leaf 7 and leaves 80000002h..80000004h are queried
//  without first checking that the CPU reports them as available.
//-------------------------------------------------------------------------------

	// CPUID exists only if EFLAGS bit 21 (mask 200000h) can be toggled.
	pushfd
	pop	eax
	mov	ecx,200000h
	mov	edx,eax
	xor	eax,ecx
	push	eax
	popfd
	pushfd
	pop	eax
	xor	eax,edx
	and	eax,ecx			// non-zero (ZF clear) when the bit changed
	push	ebx			// CPUID overwrites EBX; restored after the XGETBV check
	.if ! ZEROFLAG
		xor	eax,eax
		cpuid
		.if EAX
			// NOTE(review): the intent of the AH == 5 test is not evident
			// from this file; kept exactly as ported.
			.if AH == 5
				xor	eax,eax
			.else
				mov	eax,7
				xor	ecx,ecx
				cpuid			// check AVX2 support
				xor	eax,eax
				// Each bt copies one feature bit into CF and rcl shifts it
				// into EAX, so the first bit tested ends up highest.
				bt	ebx,5		// AVX2
				rcl	eax,1		// into bit 9
				push	eax
				mov	eax,1
				cpuid
				pop	eax
				bt	ecx,28		// AVX support by CPU
				rcl	eax,1		// into bit 8
				bt	ecx,27		// XGETBV supported
				rcl	eax,1		// into bit 7
				bt	ecx,20		// SSE4.2
				rcl	eax,1		// into bit 6
				bt	ecx,19		// SSE4.1
				rcl	eax,1		// into bit 5
				bt	ecx,9		// SSSE3
				rcl	eax,1		// into bit 4
				bt	ecx,0		// SSE3
				rcl	eax,1		// into bit 3
				bt	edx,26		// SSE2
				rcl	eax,1		// into bit 2
				bt	edx,25		// SSE
				rcl	eax,1		// into bit 1
				bt	edx,23		// MMX is CPUID.1:EDX bit 23 (ECX bit 0 is SSE3)
				rcl	eax,1		// into bit 0
				mov	sselevel,eax
			.endif
		.endif
	.endif
	.if EAX & SSE_XGETBV 
		push	eax
		xor	ecx,ecx
	    db  0x0F,0x01,0xD0   // xgetbv
		and	eax,6		     // keep XCR0 bits 1 (SSE state) and 2 (AVX state)
		cmp	eax,6		     // AVX support by OS needs BOTH bits set
		pop	eax		     // pop leaves the flags from cmp untouched
		.if ZEROFLAG
			or sselevel,SSE_AVXOS
		.endif
	.endif
	pop	ebx
	.if ! EAX = sselevel & SSE_SSE2
		printf( "CPU error: Need SSE2 level\n" );
		system("pause");
		ExitProcess( 0 );
	.endif
	// Read the brand string: leaves 80000002h..80000004h return 16 bytes each
	// (EAX, EBX, ECX, EDX), stored consecutively in an 80-byte stack buffer.
	sub	esp,80
	mov	edi,esp
	xor	esi,esi
	.repeat
		lea	eax,[esi+80000002h]
		cpuid
		mov	[edi],eax
		mov	[edi+4],ebx
		mov	[edi+8],ecx
		mov	[edi+12],edx
		add	edi,16
		inc	esi
	.until	ESI == 3
	// Skip leading blanks of the brand string.
	mov	eax,esp
	.while DSBYTE [EAX] == ' '
		inc eax
	.endw
	// Print through "%s" so the CPU-supplied text is never used as a format string.
	printf( "%s", EAX );
	add	esp,80

	// Report the highest level present in sselevel.
	printf( " (" );	
	.if EAX = sselevel  & SSE_AVX2
		printf( "AVX2" );
	.elseif EAX  = sselevel  & SSE_AVX
		printf( "AVX" );
	.elseif EAX  = sselevel  & SSE_SSE42
		printf( "SSE4.2" );
	.elseif EAX  = sselevel  & SSE_SSE41
		printf( "SSE4.1" );
	.elseif EAX  = sselevel  & SSE_SSSE3
		printf( "SSSE3" );
	.elseif EAX  = sselevel  & SSE_SSE3
		printf( "SSE3" );
	.else
		printf( "SSE2" );
	.endif
	printf( ")\n----------------------------------------------\n" );
	
	system("pause");
	ExitProcess( 0 );     
}

Emil_halim · June 28, 2017, 05:27:19 AM

Hi All,

This demo shows how easy it is to create a function address table, which holds the addresses of functions so that you can later call them in a certain order.

It also shows how to code the same function in many different syntax styles, such as C style, asm style, C-- style, and so on.

Finally, it shows how easy it is to make a benchmark test to compare the speed of different pieces of code.

Code Select


/***************************************
*          New Sphinx Cmm              *  
*                                      *
*     strlen demo  By Emil Halim       *
*                                      *
***************************************/

#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization
  
#includepath "$\winlib"  

#include <windows.h>  
#include <MSVCRT.H-->

#includelib  win32.lib , msvcrt.lib  

#pragma option upx-

#pragma option LST
#pragma option ia

// C-style strlen: returns the number of bytes before the terminating zero.
dword  strlen0( char * pStr ) 
{ 
    char* tail;

    // Advance a cursor from the start of the string to its terminator.
    tail = pStr;
    while ( *tail != 0 )
     {
        tail++;
     }

    // The distance travelled is the length.
    return tail - pStr;
}
 
// C-- style strlen: counts bytes in EAX while advancing the pStr argument
// itself to the terminating zero. There is no return statement; the count
// is left in EAX for the caller.
int strlen1(char* pStr)
{
    EAX=0;
    while(byte *pStr !=0 )
     {
        pStr++;
        EAX++;
     }
} 

// Register-argument strlen: string pointer arrives in EAX, length is
// returned in EAX. EBX keeps the start address and is overwritten.
int fastcall strlen2(EAX)
{   
    EBX=EAX;
    // Advance EAX to the terminating zero byte.
    while(DSBYTE[EAX] !=0 )
     {
        EAX++;
     }
    // length = end address - start address
    EAX -= EBX; 
} 

// Same algorithm as strlen2 written with plain instructions: pointer in EAX
// on entry, length in EAX on exit, EBX overwritten with the start address.
int fastcall strlen3(EAX)      // pure ASM code 
{ 
   MOV EBX,EAX 
@lop: 
   CMP DSBYTE[EAX],0 
   JE  near @fin              // terminator found
   INC EAX 
   JMP lop 
@fin:    
   SUB EAX,EBX                // length = end address - start address
} 

?aligncode 4
 
// MASM-style strlen that examines four bytes per iteration.
// item - pointer to the zero-terminated string; length is left in EAX.
// Uses ECX and EDX as scratch; EBX is saved and restored.
// A byte is zero exactly when (byte - 1) AND (NOT byte) has its top bit set,
// which is what the lea/not/and/and sequence computes for all four bytes.
// The loop loads whole dwords, so it may read up to three bytes beyond the
// terminator.
// NOTE(review): there is no explicit ret before Endp - presumably the
// Proc/Endp translation supplies the epilogue; confirm.
strlen4 Proc item:DWORD
    push    ebx
    mov     eax, item               // get pointer to string
    lea     edx, [eax+3]            // pointer+3 used in the end
 @@:
    mov     ebx, [eax]              // read first 4 bytes
    add     eax, 4                  // increment pointer
    lea     ecx, [ebx-01010101h]    // subtract 1 from each byte
    not     ebx                     // invert all bytes
    and     ecx, ebx                // and these two
    and     ecx, 80808080h
    jz      @B                      // no zero bytes, continue loop
    test    ecx, 00008080h          // test first two bytes
    jnz     @F
    shr     ecx, 16                 // not in the first 2 bytes
    add     eax, 2
 @@:
    shl     cl, 1                   // use carry flag to avoid branch
    sbb     eax, edx                // compute length
    pop     ebx
strlen4 Endp

 
// *** SSE2 version  from MASM forum*** 
// SSE2 strlen: pointer in EAX on entry, length in EAX on exit.
// Overwrites EBX (original pointer), ECX, EDX and XMM0.
// The pointer is rounded down to a 16-byte boundary and 16 bytes are tested
// per iteration. For a misaligned start, mask bits belonging to bytes before
// the string are shifted out before the first zero byte is located.
? aligncode 16 
int  fastcall strlen5(EAX)  
{      
    EBX = EAX ;                 // get the string pointer 
    LEA ECX, DSDWORD[EAX+16]    // save pointer to string, on par with eax after first loop 
    EAX &= 0xFFFFFFF0;          // align for use with SSE2 
  @shiftOK:      
    XORPS XMM0, XMM0            // zero xmm0 for finding zero bytes 
  @a1:  
    PCMPEQB XMM0, DSQWORD[EAX]  // ---- inner loop ----- 
    PMOVMSKB EDX, XMM0          // set byte mask in edx 
    EAX += 16;                  // len counter (best position here) 
    TEST EDX,EDX 
    JE a1 
    // ECX > EAX only on the first chunk of a misaligned string; otherwise
    // every mask bit is valid and the index can be taken directly.
    if(ECX<=EAX) goto a2; 
    ECX -= EAX;                 // get difference, and cancel "misalign flag" 
    SHR EDX, CL                 // shift invalid 
    SHL EDX, CL                 // bits out 
    JE shiftOK                  // only bytes before the string matched: keep scanning
  @a2:   
    BSF EDX, EDX                // bit scan for the index 
    SUB EAX, EBX                // subtract original src pointer 
    LEA EAX, DSDWORD[EAX+EDX-16] // add scan index 
} 
? aligncode 4 

// Frameless MASM-style strlen: the string pointer is read from [esp+4],
// the length is returned in EAX and "ret 4" removes the argument.
.code
strlen6:
    mov eax, [esp+4]
    sub eax, 1                  // start one byte early; the loop pre-increments
  lbl:
    add eax, 1
    cmp BYTE PTR [eax], 0
    jne lbl
    sub eax, [esp+4]            // length = end address - start address
    ret 4
.data

// Index-based strlen: ECX keeps the string base and EAX is the running
// index, which is also the result.
// mstr - pointer to the zero-terminated string.
strlen7 Proc  mstr:DWORD
	MOV ECX,mstr				// Move source pointer to ECX
	EAX = -1;					// Start of at -1 so we can build a faster loop
  next_char:
	EAX++; 					    // EAX==NULL	
	CMP DSBYTE[ECX+EAX],0
	JNE next_char				// If BYTE is not equal to NULL process next .
	RET							// Returns string length in EAX
strlen7 Endp
//-----------function address table-----------------
// Addresses of strlen0..strlen7 in order, terminated by a zero entry that
// main() uses as the end-of-table marker.
FunTbl : dd  # strlen0,
             # strlen1,
             # strlen2,
             # strlen3,
             # strlen4,
             # strlen5,
             # strlen6,
             # strlen7,
             0 
/*-------------------------------------------------------------------------------*/
// String measured by every strlen variant.
char* testStr = "NewSphinxCmm is a stunning program language";

// 64-bit time-stamp-counter values for the strlen1 and strlen5 benchmark runs.
qword  temp_1;  
qword  temp_2;  

// Demo driver:
//  1. prints the length of testStr as computed by strlen and strlen0..strlen7,
//  2. repeats the calls through the FunTbl address table,
//  3. times `count` calls of strlen1 and of strlen5 with rdtsc and prints
//     the average per call for each.
main()  
{  
  int i;  
  double  reslt1, reslt2;  
  int count;    
  
     printf("strlen  = %d\n",strlen ( testStr ));
     printf("strlen0 = %d\n",strlen0( testStr ));   
     printf("strlen1 = %d\n",strlen1( testStr ));   
     printf("strlen2 = %d\n",strlen2( testStr ));   
     printf("strlen3 = %d\n",strlen3( testStr ));    
     printf("strlen4 = %d\n",strlen4( testStr ));  
     printf("strlen5 = %d\n",strlen5( testStr ));  
     printf("strlen6 = %d\n",strlen6( testStr )); 
     printf("strlen7 = %d\n",strlen7( testStr )); 
     
     // Walk the table until its zero terminator, calling each entry through EDX.
     i = 0;
     while(DSDWORD[i*4+#FunTbl])
     {
        
        EDX=DSDWORD[i*4+#FunTbl];
        // Entries 2, 3 and 5 (strlen2, strlen3, strlen5) are the fastcall
        // variants: they take the pointer in EAX instead of on the stack.
        if (i==2) || (i==3) || (i==5) // fast functions they have no stack frame
         {
           EAX=testStr;
           EDX();
         }
         else
           EDX( testStr );
        printf("len[%d] = %d\n",i, EAX );
        i++;
     }

  count = 1000000;  

  SetPriorityClass( GetCurrentProcess(), HIGH_PRIORITY_CLASS);            
  // temp_1 = time-stamp counter before the strlen1 run ...
  rdtsc  
  temp_1 = EDX:EAX;  
    
     for(i=0; i < count; i++)  
      {  
          strlen1( testStr );  
      }        
  // ... and afterwards the elapsed count for the whole run.
  rdtsc  
  temp_1 = EDX:EAX - temp_1;  
    
  // Same measurement for strlen5, kept in temp_2.
  rdtsc  
  temp_2 = EDX:EAX;  
    
     for(i=0; i < count; i++)  
      {  
          strlen5( testStr );  
      }        
  rdtsc  
  temp_2 = EDX:EAX - temp_2;  
    
  SetPriorityClass(GetCurrentProcess(), NORMAL_PRIORITY_CLASS);  
   
  // Average per call: elapsed count divided by count, computed on the FPU
  // and stored as a double for the %f conversions below.
  ST(0) = temp_1 / count;   
  fstp   reslt1  
  ST(0) = temp_2 / count;   
  fstp   reslt2  
  printf("strlen1 is %f\nstrlen5 is %f\n",   reslt1 , reslt2 );
    
  system("pause");   
}

Emil_halim · June 28, 2017, 09:56:45 PM

Hi ALL,

To use Borland's ilink32, first download a trial version of Borland C++ and install it on your system.

Then try the next example; it needs NewSphinxCmm version 256.

Code Select


/***************************************
*          New Sphinx Cmm              *  
*                                      *
*     using ilink  By Emil Halim       *
*                                      *
***************************************/

#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization

#pragma option obj        //tells NewSphinxCmm to do not link 

#OnExit "ilink32 /ap /m -s  -Gt -Gn  $OFILE$.obj c0x32.OBJ , , , import32.lib cw32.lib , , "
  
#includepath "$\winlib"  

#include <windows.h>  

#pragma option LST
#pragma option ia

dword dumy;   // disable error of missing a section of class 2 by ilink

/*------------------------*/
// The runtime routines are declared under their underscore-prefixed external
// names; each #define lets the source keep using the plain C name.
extern cdecl _printf();    
#define printf  _printf 

extern cdecl _strlen();    
#define strlen  _strlen 

extern cdecl _system();    
#define system  _system

// Program body for the ilink32 demo; named _main (with a "main:" label at
// its start) instead of the usual C-- main. Prints a greeting and the length
// of "welcome", then waits for a key press via system("pause").
_main()
{
main:
   printf("hello world ......\n\n");
   printf("using ilink Borland linker ......\n\n");
   
   printf("the length of welcome = %d\n\n" , strlen("welcome"));
   
   system("pause");   
}
/*------------------------*/

Emil_halim · June 29, 2017, 09:19:40 PM

Hi ALL,

In this demo I have changed the __acrtused procedure in c0nt.asm to look like this:

Code Select


;----------------------------------------------------------------------
; Startup code

; cmmstratup is defined outside this file (in the CMM source) and is
; declared here as an external near symbol.
EXTRN           cmmstratup:NEAR  ; Added By Emil Halim

_TEXT           SEGMENT  DWORD USE32 PUBLIC 'CODE'

; __acrtused: public startup symbol. Its body is reduced to a single jump,
; so the startup work is done by cmmstratup.
                public __acrtused
__acrtused      PROC NEAR    

                jmp   cmmstratup              ; Added By Emil Halim                 
                               
__acrtused      ENDP

The OnExit directive will then assemble it with tasm32; see below.

I also removed the underscore from _main so that it will call the CMM main.

The cmmstratup CMM code holds the startup code, so you can modify it as you wish.

Here is the CMM demo
==============

Code Select


/***************************************
*          New Sphinx Cmm              *  
*                                      *
*     using ilink  By Emil Halim       *
*                                      *
***************************************/

/* borland C++ console starup code */

#pragma option w32c       //create Windows console EXE.
#pragma option OS         //speed optimization

#pragma option obj        //tells NewSphinxCmm to do not link 

#OnExit "tasm32 /ml c0nt.asm"
#OnExit "ilink32 /ap /m -s  -Gt -Gn -Gl $OFILE$.obj  c0nt.obj, , ,vcl.lib import32.lib cw32.lib , , "
  
#includepath "$\winlib"  

#include <windows.h>  

#pragma option LST
#pragma option ia

/*------------------------*/
// The runtime routines are declared under their underscore-prefixed external
// names; each #define lets the source keep using the plain C name.
extern cdecl _printf();    
#define printf  _printf 

extern cdecl _strlen();    
#define strlen  _strlen 

extern cdecl _system();    
#define system  _system

// Module handle captured by cmmstratup() and printed by main().
dword hinst;

/*-----------startup code----------------*/
// External data and routines referenced by cmmstratup() below. They are not
// defined in this file (presumably they come from the Borland runtime
// libraries named on the ilink32 command line - confirm).
extern 
{
  dword ___CPPdebugHook_segment;
  dword __TLS_index;
  dword __TLS_index4;
  dword __hInstance;
  dword ___CPPdebugHook;
  dword module_data;
  ___CRTL_VCL_Init();
  ___CRTL_MEM_UseBorMM();
  ___CRTL_VCLLIB_Linkage();
  __ExceptInit();
  __startup();
}

/* Borland C++ console startup code */
// Reached through the "jmp cmmstratup" in __acrtused (c0nt.asm).
// Performs the runtime initialisation calls, stores the module handle in
// __hInstance and hinst, then transfers control to __startup with
// module_data and a dummy return address on the stack. Control never
// returns here: the function ends with a goto, not a return.
cmmstratup()  
{    
                jmp     skip_dbg_vector
                db      "fb:C++HOOK"           // special signature
                nop                            // alignment byte
                db      0E9h                   // encode a jmp instruction so that the disassembler in the IDE can see past this address to the skip_dbg_vector
                dd      # ___CPPdebugHook_segment
    skip_dbg_vector:
                // __TLS_index4 = __TLS_index * 4
                __TLS_index4 = __TLS_index << 2;
                push     edx
                // The pushed 0 is the argument consumed by GetModuleHandle below.
                push    0                       // NULL returns current module
                edx = GetModuleHandle();
                ___CRTL_VCL_Init();             // EDX now has hInstance in it
                pop     edx
                ___CRTL_MEM_UseBorMM();         // Call out to potentially re-vector the memory manager
                ___CRTL_VCLLIB_Linkage();       // Call out to touch a symbol that will be undefined if vcl.lib was used with any of the CW32xx forms of the RTL.
    skip_CRTL_xxxx:
                // Argument 0 for __ExceptInit, removed again by the pop.
                push    0
                __ExceptInit();
                pop     ecx
    not_process_attach:
                // Left on the stack for __startup.
                push    # module_data
                push    0                       // NULL returns current module
                __hInstance = GetModuleHandle();
                                
                // EAX still holds the handle returned by GetModuleHandle.
                hinst = EAX;                    // Added by Emil Halim: you can put your own work here
                
                push    0                       // dummy return address
                goto     __startup;                                                            
}
/*-----------end of startup code----------------*/


// Program body for the custom-startup demo: prints a greeting, the length of
// "welcome" and the module handle that cmmstratup() stored in hinst, then
// waits for a key press via system("pause").
main()
{

   printf("hello world ......\n\n");
   printf("using ilink borland linker ......\n\n");
   
   printf("the length of welcome = %d\n\n" , strlen("welcome"));
   
   printf("the hInstance = %d\n\n" , hinst );
   
   system("pause");   
}

The MASM Forum

News:

NewSphinxCmm Examples

Emil_halim

Emil_halim

Emil_halim

Emil_halim

hutch--

Emil_halim

Emil_halim

Emil_halim

Emil_halim

Emil_halim

Emil_halim

Emil_halim