Calculate a Pythagorean triple in C++ and assembly

Ben756 · February 22, 2017, 08:07:18 AM

I have a C++ program. I need to convert the calculation part to assembly.


#include <iostream>
#include <iomanip>
#include <math.h>

using namespace std;

// Reads two integers m and n from standard input, re-prompting until both are
// positive, then prints the Pythagorean triple given by Euclid's formula:
//   a = 2*m*n,  b = m*m - n*n,  c = m*m + n*n
// Returns 0 on success, 1 if the input could not be read as two integers.
int main()
{
	int m, n;
	int a, b, c;
	cout << "Pythagorean Triples Calculation" << endl;

	do
	{
		cout << "Enter values for m an n: ";
		// A failed extraction (non-numeric text or end of input) leaves cin in
		// a failed state; every later read would fail as well and this loop
		// would never terminate, so stop here instead.
		if (!(cin >> m >> n))
		{
			cout << "Invalid input - two integers are required" << endl;
			return 1;
		}
		if (m <= 0)
		{
			cout << "The value " << m << " is out of range- only positive integers are allowed" << endl;
		}
		if (n <= 0)
		{
			cout << "The value " << n << " is out of range- only positive integers are allowed" << endl;
		}

	} while (m <= 0 || n <= 0);

	// Square with integer multiplication rather than pow(): pow() works in
	// double and the result would have to be converted back to int.
	const int mSquared = m * m;
	const int nSquared = n * n;

	a = 2 * m * n;
	b = mSquared - nSquared; // negative when m < n
	c = mSquared + nSquared;


	cout << "Side a = " << a << endl;
	cout << "Side b = " << b << endl;
	cout << "Side c = " << c << endl;
	cout << "Calculation ended." << endl;
	return 0;
}

so, I need this part in assembly:

Code Select

	// The three statements to be translated: a is twice the product of m and n,
	// b is the difference of their squares and c is the sum of their squares.
	a = 2 * m*n;
	b = pow(m, 2) - pow(n, 2);
	c = pow(m, 2) + pow(n, 2);

ragdog · February 22, 2017, 09:26:22 AM

Hello

"crt_pow" ?

I think this will help you: https://masm32.com/board/index.php?topic=3614.0

Greets,

jj2007 · February 22, 2017, 10:02:00 AM

; Interactive loop: reads "m, n" from the user, computes a = 2*m*n,
; b = m*m - n*n and c = m*m + n*n on the FPU, and displays the results.
; The loop ends when the user submits an empty string.
include \masm32\MasmBasic\MasmBasic.inc ; download
SetGlobals m, n, intA, intB, intC, pos
Init
Let esi="3, 2" ; start with a default set of values
.While 1
Let esi=Input$("m, n: ", esi) ; ask for input (the string is editable)
.Break .if !Len(Trim$(esi)) ; stop it if the string is empty
mov pos, esi
mov m, Val(pos) ; get m
.if edx>99 ; NOTE(review): presumably Val reports a status/character count in edx and >99 means "not a number" - confirm in the MasmBasic docs
deb 4, "your first value makes no sense"
.else
add pos, edx ; advance pos by edx (presumably past the first number - confirm)
mov n, Val(pos) ; get n
.if edx>99 ; same check as for the first value
deb 4, "your second value makes no sense"
.else
deb 4, "input values", m, n
fild m
fimul n ; m*n
fadd ST, ST ; *2
fistp intA ; a = 2 * m*n
fild n
fmul ST, ST ; n*n = pow(n, 2)
fild m
fmul ST, ST ; m*m = pow(m, 2); stack is now ST = m*m, ST(1) = n*n
fld st ; make a copy of pow(m, 2)
fsub ST, ST(2) ; ST = m*m - n*n
fistp intB ; b = pow(m, 2) - pow(n, 2)
fadd ; add the remaining m*m and n*n, popping one entry
fistp intC ; c = pow(m, 2) + pow(n, 2)
deb 4, "calculated values:", intA, intB, intC
.endif
.endif
.Endw
PrintLine "bye"
Delay 1000
EndOfCode

Testbed attached.

hutch-- · February 22, 2017, 03:02:36 PM

I would be inclined to use the C compiler option to output an ASM file if it is 32 bit code. You will need to be able to isolate the section of calculation code and whether in fact it uses a C runtime library to do the calculation so that you can benchmark the technique against any assembler code that you may write.

Floating point code will give the right answer if it's written correctly, but depending on the precision required it may not be the fastest option, and I gather that you want to write assembler code in this area to make the calculations faster. Sad to say, many have learnt that the code they have written in assembler is not as fast as the runtime library. This is of course why you should benchmark the code you write against the C compiler generated code to see if you get any speed gain.

Ben756 · February 22, 2017, 05:39:10 PM

I have done some parts of this assignment. The only part remaining is overflow and error checking. I am posting the description and my code in case anybody can help with the error-checking part.

Your program should next use in-line assembler (the __asm{ } directive) to perform the calculation. For each step in the calculations, check for overflow or other errors. Should an error arise, abandon all further calculations and issue an error message (message 4 in the list below). (Issue all messages from the C++ part of the program. Do not attempt to use Windows system services to produce output.)

Code Select

#include <iostream>
#include <iomanip>

using namespace std;

// Reads two integers m and n from standard input, re-prompting until both are
// positive, then computes the Pythagorean triple
//   a = 2*m*n,  b = m*m - n*n,  c = m*m + n*n
// with MSVC x86 inline assembler. Every arithmetic step is followed by an
// overflow test; on the first overflow the remaining steps are skipped and an
// error is reported from C++ instead of the results.
// Returns 0 on success, 1 on unreadable input or arithmetic overflow.
int main()
{

	int m, n;
	int a, b, c;
	int overflow = 0; // set to 1 by the assembler block when any step overflows
	cout << "Pythagorean Triples Calculation" << endl;

	do
	{
		cout << "Enter values for m an n: ";
		// A failed extraction (non-numeric text or end of input) leaves cin in
		// a failed state; every later read would fail as well and this loop
		// would never terminate, so stop here instead.
		if (!(cin >> m >> n))
		{
			cout << "Invalid input - two integers are required" << endl;
			return 1;
		}
		if (m <= 0)
		{
			cout << "The value " << m << " is out of range- only positive integers are allowed" << endl;
		}
		if (n <= 0)
		{
			cout << "The value " << n << " is out of range- only positive integers are allowed" << endl;
		}

	} while (m <= 0 || n <= 0);

	__asm {
		// a = 2 * m * n
		mov eax, m
		mov ecx, n
		imul eax, ecx        // signed multiply: OF is set if m*n does not fit in 32 bits
		jo calc_overflow
		add eax, eax         // double the product
		jo calc_overflow
		mov a, eax

		// m*m and n*n are each computed once and shared by b and c
		mov eax, m
		imul eax, eax        // eax = m*m
		jo calc_overflow
		imul ecx, ecx        // ecx still holds n, so ecx = n*n
		jo calc_overflow

		// b = m*m - n*n
		mov edx, eax
		sub edx, ecx
		jo calc_overflow
		mov b, edx

		// c = m*m + n*n
		add eax, ecx
		jo calc_overflow
		mov c, eax
		jmp calc_done

	calc_overflow:
		mov overflow, 1      // abandon all remaining steps
	calc_done:
	}

	if (overflow)
	{
		cout << "Error: arithmetic overflow during the calculation - no results available" << endl;
		return 1;
	}

	cout << "Side a = " << a << endl;
	// b is negative when m < n; print its magnitude without relying on abs(),
	// whose header this file does not include.
	cout << "Side b = " << (b < 0 ? -b : b) << endl;
	cout << "Side c = " << c << endl;
	cout << "Calculation ended." << endl;
	return 0;
}

raymond · February 23, 2017, 04:11:57 AM

If you want to improve speed, you need to remember a few things such as:
i) minimize memory accesses,
ii) avoid repeating operations unnecessarily.

Modifying your example would probably reduce the timing by at least 30% and also reduce code size.

   // Register-based variant of the calculation: m and n are loaded from memory
   // once, m*m is computed once and reused for both b and c.
   // NOTE(review): no overflow checks are performed in this version.
   __asm {
      mov eax, m
      mov ecx, n
      mov ebx,eax ;keep it for later
      mul ecx ;edx:eax = m*n (unsigned multiply, edx is overwritten)
      add eax, eax ;eax = 2*m*n
      mov a, eax

;      mov eax, m
      mov eax, ebx ;retrieve the 'm' value from within the cpu
      imul eax, eax ;eax = m*m
;      mov ecx, n no need to repeat this operation, ecx already contains 'n'
      mov ebx,eax ;keep a copy of the product m*m in ebx to avoid repeating this multiplication
      imul ecx, ecx ;ecx = n*n
      sub eax, ecx ;eax = m*m - n*n
      mov b, eax

      add ebx,ecx ;ebx = m*m + n*n
      mov c,ebx
;those last two instructions avoided two more accesses to memory and another multiplication

;      mov eax, m
;      imul eax, eax
;      mov ecx, n
;      imul ecx, ecx
;      add eax, ecx
;      mov c, eax
   }

All superfluous instructions have been commented out.

Enjoy

jj2007 · February 23, 2017, 07:08:36 AM

Quote from: raymond on February 23, 2017, 04:11:57 AMModifying your example would probably reduce the timing by at least 30% and also reduce code size.

Your code (below: CPU) is indeed pretty fast, it clearly beats the FPU:

Code Select

Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz (SSE4)

758     cycles for 100 * FPU int
467     cycles for 100 * CPU
550     cycles for 100 * FPU Real4

749     cycles for 100 * FPU int
466     cycles for 100 * CPU
550     cycles for 100 * FPU Real4

747     cycles for 100 * FPU int
468     cycles for 100 * CPU
549     cycles for 100 * FPU Real4

752     cycles for 100 * FPU int
468     cycles for 100 * CPU
551     cycles for 100 * FPU Real4

745     cycles for 100 * FPU int
466     cycles for 100 * CPU
551     cycles for 100 * FPU Real4

57      bytes for FPU int
53      bytes for CPU
57      bytes for FPU Real4

raymond · February 23, 2017, 07:59:07 AM

I think that the reason for the FPU code being slower than my integer code is primarily due to the conversions to/from integers and floats.

Those are obviously time consuming operations which should be added to the list of things to avoid (whenever possible) for fastest code. But, there are occasions where such additional operations may be worthwhile such as when other ALU operations could be performed in parallel with the FPU being busy computing!!!

jj2007 · February 23, 2017, 12:00:21 PM

Quote from: raymond on February 23, 2017, 07:59:07 AMI think that the reason for the FPU code being slower than my integer code is primarily due to the conversions to/from integers and floats.

These are indeed important; calc_fpuReal2 / Real4b below is floats only, and is exactly as fast as the integer version:

Code Select

	; Float-only version: inputs mR/nR and outputs aR/bR/cR are stored with
	; fld/fstp, so no integer<->float conversion instructions are used.
	fld mR
	fld nR
	fld st	; n
	fld st(2)	; m
	fmul	; m*n
	fadd ST, ST	; *2
	fstp aR	; a = 2 * m*n (original note: = 12); stack is now ST = n, ST(1) = m
	fmul ST, ST	; n*n = pow(n, 2)
	fxch	; bring m to the top
	fmul ST, ST	; m*m = pow(m, 2)
	fld	st	; make a copy of pow(m, 2)
	fsub ST, ST(2)	; ST = m*m - n*n
	fstp bR	; b = pow(m, 2) - pow(n, 2)
	fadd	; add the remaining m*m and n*n, popping one entry
	fstp cR	; c = pow(m, 2) + pow(n, 2)

Code Select

Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz (SSE4)

731     cycles for 100 * FPU int
471     cycles for 100 * CPU
538     cycles for 100 * FPU Real4a
459     cycles for 100 * FPU Real4b

729     cycles for 100 * FPU int
458     cycles for 100 * CPU
539     cycles for 100 * FPU Real4a
460     cycles for 100 * FPU Real4b

raymond · February 23, 2017, 12:42:44 PM

Many thanks for the confirmation. :t

aw27 · August 17, 2017, 01:32:51 AM

This is a solution without using x87 instructions. Calculations are performed on floats (real4) not doubles (real8). Of course, it could be done on doubles as well.

Code Select


.686
.xmm

.model flat, stdcall
option casemap :none  

includelib \masm32\lib\msvcrt.lib
printf proto C :vararg
includelib \masm32\lib\kernel32.lib
ExitProcess proto :dword 

; Read-only data: printf format strings, the constants used by calcPow, and
; the sample inputs used by main.
.const
format0 db "m=%f, n=%.2f",13,10,0,0
format1 db "a = 2 * m*n : %.2f", 13,10,0,0
format2 db "b = pow(m, 2) - pow(n, 2) : %.2f", 13,10,0,0
; The third result is side c (sum of the squares), so it is labelled "c".
format3 db "c = pow(m, 2) + pow(n, 2) : %.2f", 13,10,0,0

; Constants for calcPow. The ones preceded by "align 16" are used as memory
; operands of packed instructions (andps/orps/psubd/paddd) in calcPow.
_Constant_ps_min_norm_pos_powf dd 800000h
align 16
_Constant_ps_inv_mant_mask_powf dd 807fffffh
align 16
_Constant_ps_0p5_powf dd 0.500000000
align 16
_Constant_pi32_0x7f_powf dd 7fh
align 16
_Constant_one_powf dd 1.0
_Constant__ps_cephes_SQRTHF_powf dd 0.707106769
; Polynomial coefficients for the logarithm step
_Constant_ps_cephes_log_p0_powf dd 0.0703768358
_Constant_ps_cephes_log_p1_powf dd -0.115146101
_Constant_ps_cephes_log_p2_powf dd 0.116769984
_Constant_ps_cephes_log_p3_powf dd -0.124201410
_Constant_ps_cephes_log_p4_powf dd 0.142493233
_Constant_ps_cephes_log_p5_powf dd -0.166680574
_Constant_ps_cephes_log_p6_powf dd 0.200007141
_Constant_ps_cephes_log_p7_powf dd -0.249999940
_Constant_ps_cephes_log_p8_powf dd 0.333333313
_Constant_ps_cephes_log_q1_powf dd -0.000212194442
_Constant_ps_cephes_log_q2_powf dd 0.693359375
; Constants and polynomial coefficients for the exponential step
_Constant_ps_cephes_LOG2EF_powf dd 1.44269502
_Constant_ps_cephes_exp_C1_powf dd 0.693359375
_Constant_ps_cephes_exp_C2_powf dd -0.000212194442
_Constant_ps_cephes_exp_p0_powf dd 0.000198756912
_Constant_ps_cephes_exp_p1_powf dd 0.00139819994
_Constant_ps_cephes_exp_p2_powf dd 0.00833345205
_Constant_ps_cephes_exp_p3_powf dd 0.0416657962
_Constant_ps_cephes_exp_p4_powf dd 0.166666657
_Constant_ps_cephes_exp_p5_powf dd 0.500000000

; Sample inputs: m, n, the exponent passed to calcPow, and the factor 2 of 2*m*n
base1 REAL4 4.5
base2 REAL4 5.5
exp REAL4 2.0
mulInt DWORD 2
	
.code

; calcPow - single-precision power function using SSE instructions.
;   _base : REAL4 base
;   _exp  : REAL4 exponent
; The result is returned on the x87 stack (see the FLD at the end).
; Judging by the constant names (cephes log/exp coefficients), the value is
; computed as exp(_exp * log(_base)) with polynomial approximations.
; Uses xmm0-xmm7.
calcPow proc public _base: real4, _exp : real4
	LOCAL res : REAL4
	
	; ---- logarithm of _base ----
	movss xmm7, _base
	xorps xmm0, xmm0
	movss xmm6, xmm7
	cmpleps xmm6, xmm0 ; xmm6 = all-ones mask where base <= 0, applied to the log result below
	maxss xmm7, real4 ptr _Constant_ps_min_norm_pos_powf
	movss xmm5, xmm7
	psrld xmm5, 17h ; shift right by 23 bits: leaves the exponent field

	andps xmm7, real4 ptr _Constant_ps_inv_mant_mask_powf ; clear the exponent field
	orps xmm7, real4 ptr _Constant_ps_0p5_powf ; OR in the bit pattern of 0.5
	psubd xmm5, real4 ptr _Constant_pi32_0x7f_powf ; subtract 7Fh from the exponent

	cvtdq2ps xmm5, xmm5
	addss xmm5, real4 ptr _Constant_one_powf
	movss xmm4, xmm7
	cmpltss xmm4, real4 ptr _Constant__ps_cephes_SQRTHF_powf ; mask: mantissa < SQRTHF constant
	movss xmm0, xmm7
	andps xmm0, xmm4
	subss xmm7, real4 ptr _Constant_one_powf
	movss xmm1, real4 ptr _Constant_one_powf
	andps xmm1, xmm4
	subss xmm5, xmm1
	addss xmm7, xmm0
	movss xmm2, xmm7
	mulss xmm2, xmm2 ; xmm2 = x*x
	; polynomial in x (xmm7) with coefficients log_p0 .. log_p8
	movss xmm3, xmm7
	mulss xmm3, real4 ptr _Constant_ps_cephes_log_p0_powf
	addss xmm3, real4 ptr _Constant_ps_cephes_log_p1_powf
	mulss xmm3, xmm7
	addss xmm3, real4 ptr _Constant_ps_cephes_log_p2_powf
	mulss xmm3, xmm7
	addss xmm3, real4 ptr _Constant_ps_cephes_log_p3_powf
	mulss xmm3, xmm7
	addss xmm3, real4 ptr _Constant_ps_cephes_log_p4_powf
	mulss xmm3, xmm7
	addss xmm3, real4 ptr _Constant_ps_cephes_log_p5_powf
	mulss xmm3, xmm7
	addss xmm3, real4 ptr _Constant_ps_cephes_log_p6_powf
	mulss xmm3, xmm7
	addss xmm3, real4 ptr _Constant_ps_cephes_log_p7_powf
	mulss xmm3, xmm7
	addss xmm3, real4 ptr _Constant_ps_cephes_log_p8_powf
	mulss xmm3, xmm7
	mulss xmm3, xmm2
	; combine the polynomial with the exponent terms (log_q1, log_q2)
	movss xmm0, xmm5
	mulss xmm0, real4 ptr _Constant_ps_cephes_log_q1_powf
	addss xmm3, xmm0
	movss xmm0, xmm2
	mulss xmm0, real4 ptr _Constant_ps_0p5_powf
	subss xmm3, xmm0
	movss xmm0, xmm5
	mulss xmm0, real4 ptr _Constant_ps_cephes_log_q2_powf
	addss xmm7, xmm3
	addss xmm7, xmm0
	orps xmm7, xmm6 ; apply the base <= 0 mask (all-ones bits where it was set)
	; ---- multiply by the exponent ----
	movss xmm6, _exp
	mulss xmm6, xmm7 ; xmm6 = _exp * (log result)
	; ---- exponential of xmm6 ----
	xorps xmm0, xmm0
	movss xmm7, xmm6
	mulss xmm7, xmm0
	cmpneqps xmm7, xmm7 ; mask where xmm6*0 is NaN, OR-ed into the final result
	movss xmm4, xmm6
	mulss xmm4, real4 ptr _Constant_ps_cephes_LOG2EF_powf
	addss xmm4, real4 ptr _Constant_ps_0p5_powf
	movss xmm3, xmm4
	cvttps2dq  xmm3, xmm3 ; truncate to integer
	cvtdq2ps xmm0, xmm3
	movss xmm2, xmm4
	cmpltps xmm2, xmm0
	andps xmm2, real4 ptr _Constant_one_powf
	movss xmm4, xmm0
	subss xmm4, xmm2 ; subtract 1 where truncation rounded up
	movss xmm0, xmm4
	mulss xmm0, real4 ptr _Constant_ps_cephes_exp_C1_powf
	movss xmm5, xmm4
	mulss xmm5, real4 ptr _Constant_ps_cephes_exp_C2_powf
	subss xmm6, xmm0
	subss xmm6, xmm5 ; xmm6 = reduced argument
	movss xmm5, xmm6
	mulss xmm5, xmm5
	; polynomial in the reduced argument with coefficients exp_p0 .. exp_p5
	movss xmm1, real4 ptr _Constant_ps_cephes_exp_p0_powf
	mulss xmm1, xmm6
	addss xmm1, real4 ptr _Constant_ps_cephes_exp_p1_powf
	mulss xmm1, xmm6
	addss xmm1, real4 ptr _Constant_ps_cephes_exp_p2_powf
	mulss xmm1, xmm6
	addss xmm1, real4 ptr _Constant_ps_cephes_exp_p3_powf
	mulss xmm1, xmm6
	addss xmm1, real4 ptr _Constant_ps_cephes_exp_p4_powf
	mulss xmm1, xmm6
	addss xmm1, real4 ptr _Constant_ps_cephes_exp_p5_powf
	mulss xmm1, xmm5
	addss xmm1, xmm6
	addss xmm1, real4 ptr _Constant_one_powf
	movss xmm3, xmm4 ; NOTE(review): overwritten by the next instruction
	cvttps2dq xmm3, xmm4
	paddd xmm3, real4 ptr _Constant_pi32_0x7f_powf ; add 7Fh ...
	pslld xmm3, 17h ; ... and shift left by 23 bits into the exponent field
	movdqa xmm0, xmm3
	mulss xmm1, xmm0 ; scale the polynomial result
	orps xmm1, xmm7 ; apply the NaN mask computed above
	movd res, xmm1
	
	FLD res ; this is the way floats are returned in Win32
	ret
calcPow endp


; Program entry point: prints the sample inputs base1 (m) and base2 (n),
; squares each with calcPow, then computes and prints
; a = 2*m*n, b = m^2 - n^2 and c = m^2 + n^2 before exiting.
; Values are widened to REAL8 before being passed to printf.
main PROC
	LOCAL firstPow : REAL8
	LOCAL secPow : REAL8
	LOCAl aVal : REAL8
	LOCAL bVal : REAL8
	LOCAL cVal : REAL8
	LOCAL base1Real8 : REAL8
	LOCAL base2Real8 : REAL8

	; widen m and n from REAL4 to REAL8 for printing
	movss xmm0, base1
	cvtps2pd xmm0, xmm0
	movsd REAL8 ptr base1Real8, xmm0
	movss xmm0, base2
	cvtps2pd xmm0, xmm0
	movsd REAL8 ptr base2Real8, xmm0
	
	; print m and n as real8 (double)
	invoke printf, offset format0, base1Real8, base2Real8
	
	; arguments are pushed last-to-first: exp is _exp, base1 is _base
	push exp
	push base1
	call calcPow ; calculate pow(m, 2)
	fstp real8 ptr firstPow ; pop the x87 result into firstPow
	wait
	
	push exp
	push base2
	call calcPow ; pow(n, 2)
	fstp real8 ptr secPow ; pop the x87 result into secPow
	wait
	
	movss xmm0, base1
	mulss xmm0, base2 ; m*n
	cvtsi2ss xmm1, mulInt ; integer 2 converted to float
	mulss xmm0, xmm1 ; 2*m*n
	cvtps2pd xmm1, xmm0
	movsd aVal, xmm1
	
	; print a = 2 * m*n;
	invoke printf, offset format1, aVal
	
	movsd xmm0, firstPow
	movsd xmm1, secPow
	subsd xmm0, xmm1
	movsd bVal, xmm0
	; print b = pow(m, 2) - pow(n, 2)
	invoke printf, offset format2, bVal

	movsd xmm0, firstPow
	movsd xmm1, secPow
	addsd xmm0, xmm1
	movsd cVal, xmm0
	; print c = pow(m, 2) + pow(n, 2)
	invoke printf, offset format3, cVal
	
	invoke ExitProcess, 0
main ENDP
END main

The MASM Forum

News:

Calculate a Pythagorean triple in C++ and assembly

Ben756

ragdog

jj2007

hutch--

Ben756

raymond

jj2007

raymond

jj2007

raymond

aw27