here are three examples which will tell you why
first is 64 beauty:
; xmemcpy (64-bit) - copy count bytes from src to dest one byte at a time.
; The body works on rcx, rdx and r8 directly instead of the parameter names
; dest, src and count. The result returned in rax is the incoming rcx.
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
mov r9,rcx ; remember the incoming rcx; the loop below advances rcx
.if (rcx != rdx) ; skip the copy when both addresses are equal
; no init clause; runs while r8 is non-zero; the whole copy is done in the
; step clause: load a byte from [rdx], store it to [rcx], advance both, r8--
.for (¦r8¦al=[rdx],[rcx]=al,rcx++,rdx++,r8--)
.endfor
.endif
mov rax,r9 ; return the address saved on entry
ret
xmemcpy ENDP
compiled to this:
000000014004DA78 push rbp
000000014004DA79 mov rbp,rsp
000000014004DA7C mov r9,rcx
000000014004DA7F cmp rcx,rdx
000000014004DA82 je xmemcpy+22h (14004DA9Ah)
000000014004DA84 jmp xmemcpy+1Bh (14004DA93h)
000000014004DA86 mov al,byte ptr [rdx]
000000014004DA88 mov byte ptr [rcx],al
000000014004DA8A inc rcx
000000014004DA8D inc rdx
000000014004DA90 dec r8
000000014004DA93 and r8,r8
000000014004DA96 je xmemcpy+22h (14004DA9Ah)
000000014004DA98 jmp xmemcpy+0Eh (14004DA86h)
000000014004DA9A mov rax,r9
000000014004DA9D leave
000000014004DA9E ret
the second is the same routine but 32 bit:
; xmemcpy (32-bit) - copy count bytes from src to dest one byte at a time.
; In:    dest, src, count - three DWORD stack arguments
; Out:   eax = dest
; Uses:  ecx (write pointer), edx (read pointer), ebx (bytes left, saved and
;        restored through USES), al (byte in flight), flags
xmemcpy PROC USES ebx dest:DWORD,src:DWORD,count:DWORD
mov ecx,dest
.if (ecx != src) ; skip the copy when both addresses are equal
; init: edx = src, ebx = count; runs while ebx is non-zero; the copy itself
; is done in the step clause
.for (edx=src,ebx=count¦ebx¦al=[edx],[ecx]=al,ecx++,edx++,ebx--)
.endfor
.endif
mov eax,dest ; reload dest from the argument; ecx was advanced by the loop
ret
xmemcpy ENDP
compiled to this:
00401020 55 push ebp
00401021 8bec mov ebp,esp
00401023 53 push ebx
00401024 8b4d08 mov ecx,dword ptr [ebp+8]
00401027 3b4d0c cmp ecx,dword ptr [ebp+0Ch]
0040102a 7415 je xmemcpy+0x21 (00401041)
0040102c 8b550c mov edx,dword ptr [ebp+0Ch]
0040102f 8b5d10 mov ebx,dword ptr [ebp+10h]
00401032 eb07 jmp xmemcpy+0x1b (0040103b)
00401034 8a02 mov al,byte ptr [edx]
00401036 8801 mov byte ptr [ecx],al
00401038 41 inc ecx
00401039 42 inc edx
0040103a 4b dec ebx
0040103b 23db and ebx,ebx
0040103d 7402 je xmemcpy+0x21 (00401041)
0040103f ebf3 jmp xmemcpy+0x14 (00401034)
00401041 8b4508 mov eax,dword ptr [ebp+8]
00401044 5b pop ebx
00401045 5d pop ebp
00401046 c20c00 ret 0Ch
and here is the C version:
/*
 * Copy count bytes from src to dest one byte at a time and return dest.
 * Nothing is copied when dest and src are the same address or count is 0.
 * NOTE(review): a negative count is not rejected by the `if (count)` test.
 */
void* xmemcpy(void *dest, const void *src, int count)
{
unsigned char *byte_dest=(unsigned char *)dest;
unsigned char *byte_src=(unsigned char *)src; /* the cast drops the const qualifier of src */
if (byte_dest != byte_src)
{
if (count) /* the loop copies before it tests, so a zero count is filtered here */
{
for (;;)
{
*byte_dest=*byte_src;
if (!--count) break; /* stop right after the last requested byte */
++byte_dest;
++byte_src;
}
}
}
return dest;
}
compiled to this:
01271AF0 push ebp
01271AF1 mov ebp,esp
01271AF3 sub esp,0D8h
01271AF9 push ebx
01271AFA push esi
01271AFB push edi
01271AFC lea edi,[ebp-0D8h]
01271B02 mov ecx,36h
01271B07 mov eax,0CCCCCCCCh
01271B0C rep stos dword ptr es:[edi]
01271B0E mov eax,dword ptr [dest]
01271B11 mov dword ptr [byte_dest],eax
01271B14 mov eax,dword ptr [src]
01271B17 mov dword ptr [byte_src],eax
01271B1A mov eax,dword ptr [byte_dest]
01271B1D cmp eax,dword ptr [byte_src]
01271B20 je xmemcpy+63h (01271B53h)
01271B22 cmp dword ptr [count],0
01271B26 je xmemcpy+63h (01271B53h)
01271B28 mov eax,dword ptr [byte_dest]
01271B2B mov ecx,dword ptr [byte_src]
01271B2E mov dl,byte ptr [ecx]
01271B30 mov byte ptr [eax],dl
01271B32 mov eax,dword ptr [count]
01271B35 sub eax,1
01271B38 mov dword ptr [count],eax
01271B3B jne xmemcpy+4Fh (01271B3Fh)
01271B3D jmp xmemcpy+63h (01271B53h)
01271B3F mov eax,dword ptr [byte_dest]
01271B42 add eax,1
01271B45 mov dword ptr [byte_dest],eax
01271B48 mov eax,dword ptr [byte_src]
01271B4B add eax,1
01271B4E mov dword ptr [byte_src],eax
01271B51 jmp xmemcpy+38h (01271B28h)
01271B53 mov eax,dword ptr [dest]
01271B56 pop edi
01271B57 pop esi
01271B58 pop ebx
01271B59 mov esp,ebp
01271B5B pop ebp
01271B5C ret
Do you need more reasons? ;)
Am I missing something here, it looks like the 64-bit code is moving a byte at a time. Should not most of the move be done 64-bits at a time?
Hi Habran,
Did you test the same C code with a 64-bit C compiler?
Timings would be nice :biggrin:
good point wortex :t
here it is the 64 bit in C:
/*
 * Same byte-wise copy as the int version, with the length given as UINT_PTR.
 * Returns dest; copies nothing when dest == src or count == 0.
 */
void* xmemcpy(void *dest, const void *src, UINT_PTR count)
{
unsigned char *byte_dest=(unsigned char *)dest;
unsigned char *byte_src=(unsigned char *)src; /* the cast drops the const qualifier of src */
if (byte_dest != byte_src)
{
if (count) /* guard needed because the loop body runs before the counter is tested */
{
for (;;)
{
*byte_dest=*byte_src;
if (!--count) break; /* counter hit zero: the last byte has been written */
++byte_dest;
++byte_src;
}
}
}
return dest;
}
0000000140063630 mov qword ptr [rsp+18h],r8
0000000140063635 mov qword ptr [rsp+10h],rdx
000000014006363A mov qword ptr [rsp+8],rcx
000000014006363F sub rsp,18h
0000000140063643 mov rax,qword ptr [dest]
0000000140063648 mov qword ptr [byte_dest],rax
000000014006364D mov rax,qword ptr [src]
0000000140063652 mov qword ptr [rsp],rax
0000000140063656 mov rax,qword ptr [rsp]
000000014006365A cmp qword ptr [byte_dest],rax
000000014006365F je xmemcpy+7Bh (1400636ABh)
0000000140063661 cmp qword ptr [count],0
0000000140063667 je xmemcpy+7Bh (1400636ABh)
0000000140063669 mov rax,qword ptr [byte_dest]
000000014006366E mov rcx,qword ptr [rsp]
0000000140063672 movzx ecx,byte ptr [rcx]
0000000140063675 mov byte ptr [rax],cl
0000000140063677 mov rax,qword ptr [count]
000000014006367C sub rax,1
0000000140063680 mov qword ptr [count],rax
0000000140063685 cmp qword ptr [count],0
000000014006368B jne xmemcpy+5Fh (14006368Fh)
000000014006368D jmp xmemcpy+7Bh (1400636ABh)
000000014006368F mov rax,qword ptr [byte_dest]
0000000140063694 add rax,1
0000000140063698 mov qword ptr [byte_dest],rax
000000014006369D mov rax,qword ptr [rsp]
00000001400636A1 add rax,1
00000001400636A5 mov qword ptr [rsp],rax
00000001400636A9 jmp xmemcpy+39h (140063669h)
00000001400636AB mov rax,qword ptr [dest]
00000001400636B0 add rsp,18h
00000001400636B4 ret
MichaelW,
NO :icon_exclaim: :biggrin:
JJ2007,
QuoteTimings would be nice :biggrin:
I agree, but I have to CONFESS that I don't know that part :icon_mrgreen:
Can you please do it for mee :biggrin:
MichaelW,
if data is aligned to 8 , 16 or 32 byte
than it is possible to do that like this example:
align 8
; record with three fields: a signed dword, a pointer-sized integer, a signed dword
AXCHARINDEX struct
nLine SDWORD ?
lpLine INT_PTR ?
nCharInLine SDWORD ?
AXCHARINDEX ends
.code
; copy one AXCHARINDEX from ciPoint1 to ciPoint in 8-byte units
; (ciPoint and ciPoint1 are defined outside this snippet)
lea rdi,ciPoint ;rdi = destination of rep movsq
lea rsi,ciPoint1 ;rsi = source of rep movsq
mov ecx,sizeof(AXCHARINDEX)/8 ;number of whole qwords; a remainder of size mod 8 would not be copied
rep movsq ;copies rcx qwords from [rsi] to [rdi]; assumes the direction flag is clear - TODO confirm
otherwise, if they are chars for example, it is not convenient to do that always
however, we can write more complex EG: xxxmemcpy which would be able to calculate the size of data
and then first transfer all possible QWORDs, then a remaining DWORD if there is one, then a remaining WORD, and finally a remaining BYTE
EG: data size is 256+7 EQU 32 QWORDS, 1 DWORD, 1 WORD, and 1 BYTE
in my case I used simple xmemcpy because of simplicity
have look at these code above
It takes about the same amount of bytes as if you call some function but it is there in the present location
I assure you that it is faster and more appropriate than some sophisticated function, especially if that function is not in the cache at the time
It would be even more appropriate to create a MACRO to do the same job :biggrin:
Hi habran,
there's no doubt that the 64 bit world is the future. But for the next years both - 32 bit and 64 bit - will coexist. I'm inside the 64 bit world since a few years; the first Linux kernel came out 2001; Windows was some years later. Under 32 bit the Application Binary Interface (ABI) are the same. So, one could write code for both platforms. That's over, because the 64 bit ABIs are very different. We can use code for both platforms only in rare cases.
All things considered: there are advantages and disadvantages.
Gunther
hey Gunther,
QuoteBut for the next years both - 32 bit and 64 bit - will coexist.
no doubt thy will, because of accumulated 32 bit apps
however, IMO to continue to write 32 bit programs would be like holding with your nails on the cliff
in my case I would forever like to program in assembly C64, I felt like I had a chocolate in my mouth when I did that
but who needs any more those apps :(
now, I have the same sensation as with C64 when I write 64 bit JWASM :icon_exclaim: :icon_exclaim: :icon_exclaim:
if it was not for excellent JWASM (thanks Japheth :t) I would maybe return to C and C#
it looks like 64 is my favorite number
I am a lazy person by nature and I always think (I am not lazy to think ;))
about a fast and simple way to finish anything (except sex :t)
I don't walk to shop 10 time to bring home the grocery, I use a car for that
you may say that walking is healthy but it is not if you have to carry bloody grocery in your hands :bgrin:
Hi habran,
things are a bit more complicated.
The transition from 32 bit to 64 bit will take more time as you might think. We've seen in the past the transition from 16 bit to 32 bit (by the way, I've never discussed C64 programming in my posts). That process had a time line of approximately 15 years. But: we had a lot of memory trouble under 16 bit, which wasn't easy (XMS, EMS, several DOS extenders etc). The pressure was enorm, because a lot of applications at the begin of the 90s were very memory hungry.
That's not the case by the transition from 32 to 64 bit. There are a few applications which really need more than 4 GB RAM (large data bases for example), but others do not. So, we can calculate for that transition at least 15 years. That's a long time; therefore it makes sense to write for both worlds.
I won't argue that you should write 32 bit code. Write your 64 bit applications and that's fine. But have a look for the difficulties: different ABI for the main platforms, some people in our forum can't run 64 bit operating systems (hardware limitations), other people like 32 bit programming etc. etc. A bit more tolerance for other point of views wouldn't be bad.
Gunther
many of us use MichaelW's code timing macros
http://masm32.com/board/index.php?topic=49.0 (http://masm32.com/board/index.php?topic=49.0)
attached is a 32-bit program for timing code (assemble as a console app)
you may or may not want to adapt the code to 64-bit
thanks dedndave :t
I will look at it tomorrow and see if it pays of to translate to x64 :biggrin:
i don't know if Michael has plans to make a 64-bit version of his macro set
shouldn't be too hard :P
For a x64 adaption of MichaelW's counter macro see my post in this thread: http://masm32.com/board/index.php?topic=49.msg130#msg130.
thanks qWord :biggrin:
habran,
your comparison between C and ASM in your first post is unfair, because the C code is obviously a debug build.
Also, a smarter "algorithm" would probably produce much better results. e.g. something like this (not tested):
/*
 * Copy cb bytes from src to dest and return dest.
 *
 * The bulk is moved in pointer-sized units (8 bytes when sizeof(char*) is 8,
 * otherwise 4). On 8-byte targets a leftover of 4..7 bytes is started with
 * one 4-byte move; whatever remains is copied byte by byte.
 *
 * The wide moves go through char** and int* lvalues, so the buffers are read
 * and written with whatever alignment the caller supplies.
 */
void* xmemcpy(void *dest, void *src, unsigned int cb)
{
    const int wide = (sizeof(char*) == 8);
    unsigned int words = wide ? (cb >> 3) : (cb >> 2);
    unsigned int rest = cb & (unsigned int)(sizeof(char*) - 1);
    char **wdst = (char**)dest;
    char **wsrc = (char**)src;
    char *bdst;
    char *bsrc;

    /* pointer-sized units */
    while (words != 0)
    {
        *wdst = *wsrc;
        ++wdst;
        ++wsrc;
        --words;
    }

    bdst = (char*)wdst;
    bsrc = (char*)wsrc;

    /* optional 4-byte piece; never taken when pointers are 4 bytes wide */
    if (wide && (rest & 4))
    {
        *((int*)bdst) = *((int*)bsrc);
        bdst += 4;
        bsrc += 4;
        rest -= 4;
    }

    /* trailing single bytes */
    while (rest != 0)
    {
        *bdst = *bsrc;
        ++bdst;
        ++bsrc;
        --rest;
    }

    return dest;
}
Quote from: qWord on February 11, 2013, 06:23:36 AM
habran,
your compare between C and ASM in your first post is unfair, because is obviously a debug build.
Also, a smarter "algorithm" would probably produce much better results.
I wonder how efficient this code from the "64 beauty" example is (can't test it, unfortunately):
000000014004DA90 dec r8
000000014004DA93 and r8,r8 <<< no need for that, the flag is already set
000000014004DA96 je xmemcpy+22h (14004DA9Ah) <<< why not jne xmemcpy+0Eh?? static branch prediction rules would suggest that it is even faster...
000000014004DA98 jmp xmemcpy+0Eh (14004DA86h) <<< can be dropped entirely.
000000014004DA9A mov rax,r9 Again, timings would be nice ;-)
As far as I recall, a memcopy done with native 64-bit registers, on 64-bit systems, was the fastest solution found when we tested, a couple of years ago, XMM/SSE2 code for this kind of operation.
The test was done on a 32 MB buffer that was simply blanked; it was not really a memcopy, but it was set up just to measure the performance of REP STOSQ vs MOVNTDQ, and it was measured via rdtsc.
The results were like:
Quote
Clearing done
117,940,861 clocks for a 33,554,432 bytes buffer with using REP STOSQ
Clearing done
1,208,750,068 clocks for a 33,554,432 bytes buffer with using MOVNTDQ
Code from Alex.
I agree with habran, as I said at the time, for many reasons, but I also
understand why years of work are not easily dropped or rewritten. :t
Frank
Frank,
Quote from: frktons on February 11, 2013, 10:35:01 AM
For what I recall, a memcopy done with native 64 bit registers, in 64
bit systems, is the fastest solution found when we tested, a couple
of years ago, XMM/SSE2 code for this kind of operation.
the situation has changed dramatically since the advent of Intel's AVX. We should do the test again.
Gunther
Quote from: Gunther on February 11, 2013, 10:51:18 AM
the situation has changed dramatically since the advent of Intel's AVX. We should do the test again.
Gunther
I think a new test can only confirm that 64 bit mov operations are
faster than 32 bit ones. If anyone has a new processor, say habran, and
the skill to use AVX code, he could do it.
Not that difficult if he really likes to do the test, I can post the 64 bit MASM
code that I used 2 years ago. No AVX because neither Alex's, nor my PC are
AVX able.
I have tested a speed of 64 bit and result is 1:4 against C
for one pass JWASM is 80 or 50h
and C is 207 intersting :P (JJ2007) 0CFh
JJ207 you are correct that more optimization could be done to it
however my intent in this case was not so much focused on that but on beauty and simplicity of 64 bit JWASM
I have used ".for" loop which is portable, readable and easy to use but it can not beat human eyes and brains
thank you Frank for supporting me that's what friends are for :t
qWord,
Quoteyour compare between C and ASM in your first post is unfair, because is obviously a debug build
all of them are debug built because I needed to read a code in memory :icon_eek:
your function looks good and I will test it later
Quote
however, we can write more complex EG: xxxmemcpy which would be able to calculate the size of data
and than first transfer all possible QWORDS and than if left last DWORD and than if left last WORD and than if left last BYTE
EG: data size is 256+7 EQU 32 QWORDS, 1 DWORD, 1 WORD, and 1 BYTE
I think I have seen already on internet written similar function but I can't remember was it in C or assembler
UNFAIR :icon_eek:
what is fair in this world??? life is a bitch!
these days even death is not fair any more, if you are rich you by yourself brand new organs and live as long as you want :P
hi Frank,
I can try to do that dough I did not learn yet AVX
I am ready for another challenge, I am not a chicken :lol:
I have to go now to earn my living, "I'll be back 8)"
Quote from: habran on February 11, 2013, 11:35:59 AMintersting :P (JJ2007) 0CFh
What do you mean with that? ::)
Quote from: habran on February 11, 2013, 11:35:59 AM
I have tested a speed of 64 bit and result is 1:4 against C
I can't confirm that: my own quick test shows that there is nearly no difference between your .for-loop and xmemcpy.
Function: xmemcpy xmemcpy2 xmemcpy_Q xmemcpy_Q2 memcpy @ForLoop
--- buffer size = 13 ---
align +0 29 17 4 2 8 33
align +1 29 17 5 3 10 32
align +2 29 17 7 3 13 32
align +3 29 17 5 2 11 33
align +4 29 17 4 2 11 32
align +5 29 17 5 3 11 32
align +6 29 17 4 3 11 32
align +7 29 17 5 2 11 32
align +8 29 17 4 2 11 32
align +9 29 17 5 3 11 32
align +10 29 18 4 3 11 32
align +11 29 17 5 2 11 32
align +12 29 17 4 2 11 32
align +13 29 17 5 3 11 32
align +14 29 17 4 4 11 32
align +15 29 17 5 3 11 32
--- buffer size = 33 ---
align +0 93 46 10 10 10 77
align +1 73 46 10 7 10 76
align +2 77 46 10 7 10 76
align +3 92 46 10 7 10 77
align +4 73 46 10 7 10 76
align +5 82 46 10 7 10 76
align +6 73 46 10 7 10 77
align +7 91 47 10 7 10 76
align +8 73 47 11 7 13 76
align +9 73 47 10 7 10 91
align +10 73 46 10 7 10 76
align +11 73 47 10 7 11 77
align +12 76 47 10 7 10 76
align +13 86 47 10 7 10 76
align +14 74 47 10 7 10 76
align +15 84 47 18 8 10 76
--- buffer size = 59 ---
align +0 124 97 21 14 17 134
align +1 152 98 22 15 17 135
align +2 129 98 23 15 17 134
align +3 129 102 22 15 17 135
align +4 129 98 22 14 17 137
align +5 129 98 21 16 18 139
align +6 128 98 21 16 17 133
align +7 129 98 21 15 17 135
align +8 129 98 21 14 17 134
align +9 128 98 21 14 17 135
align +10 128 98 21 15 17 135
align +11 129 98 20 15 17 134
align +12 129 98 21 14 17 134
align +13 133 98 21 15 17 135
align +14 135 99 21 15 16 135
align +15 127 98 21 15 17 134
--- buffer size = 590 ---
align +0 920 908 150 123 65 1041
align +1 915 886 149 124 62 1040
align +2 922 906 149 124 82 1048
align +3 925 887 150 124 64 1037
align +4 920 891 150 127 63 1103
align +5 918 892 149 124 64 1042
align +6 974 897 157 128 82 1087
align +7 938 888 149 124 64 1032
align +8 921 889 151 123 65 1070
align +9 941 887 154 124 85 1051
align +10 937 888 150 124 63 1056
align +11 953 887 150 123 64 1013
align +12 920 897 151 124 63 1039
align +13 925 900 150 123 63 1017
align +14 938 892 151 124 63 1090
align +15 927 889 156 125 64 1053
--- Functions ----
xmemcpy : habran , PellesC
xmemcpy2 : habran , VC 2012
xmemcpy_Q : qWord , PellesC
xmemcpy_Q2 : qWord , VC 2012
memcpy : MSVCRT
@ForLoop : habran
only alignment of Src varies, Dest is allocated by HeapAlloc()
Press any key to continue ...
qWord,
I have used counter_begin and counter_end as MACROS like this
local buff[256]:BYTE
counter_begin 1,1
invoke xmemcpy,ADDR buff,CTEXT("habran is very smart cooker"), 27
counter_end
and I've got above mentioned results
do you want to say that I lied :icon_eek:
however, I don't believe in your testing because, looking in a C source everyone can see that there is much more
job for processor and also accessing memory in C than ASM
are you sure that your testing is correct
if so I will go back to C64 :bgrin:
JJ2007,
QuoteWhat do you mean with that?
207 reminded me on 2007 and it is funny because C knows that you don't like it :biggrin:
BTW 2007 reminded me on two James Bonds or double agent 007
what actually you are doing in Italy? 8)
:biggrin:
http://csdb.dk/forums/?roomid=11 (http://csdb.dk/forums/?roomid=11)
thanks dedndave :P :
Bye everyone :biggrin:
Frank,
Quote from: frktons on February 11, 2013, 11:05:20 AM
I think a new test can only confirm that 64 bit mov operations are
faster than 32 bit ones. If anyone has a new processor, say habran, and
the skill to use AVX code, he could do it.
Not that difficult if he really likes to do the test, I can post the 64 bit MASM
code that I used 2 years ago. No AVX because neither Alex's, nor my PC are
AVX able.
I can do that next weekend; please post your code.
Gunther
Quote from: Gunther on February 11, 2013, 06:25:49 PM
Frank,
I can do that next weekend; please post your code.
Gunther
Here you are. The code tests only REP STOSQ vs MOVNTDQ.
You can add the tests for MOVAPS, MOVDQA, etc... if you like.
Frank
Quote from: habran on February 11, 2013, 03:17:47 PMdo you want to say that I lied :icon_eek:
yes, the purpose of my post was to defame you :dazzled:
Quote from: habran on February 11, 2013, 03:17:47 PM
however, I don't believe in your testing because, looking in a C source everyone can see that there is much more
job for processor and also accessing memory in C than ASM
good point :t
BTW, this is what PellesC creates from your C code:
sub_140001000 proc near
mov rax, rcx
mov rcx, rax
cmp rcx, rdx
jz short locret_140001026
test r8d, r8d
jz short locret_140001026
loc_140001010:
mov r9b, [rdx]
mov [rcx], r9b
sub r8d, 1
jz short locret_140001026
add rcx, 1
add rdx, 1
jmp short loc_140001010
locret_140001026:
retn
sub_140001000 endp
qWord,
Quote
BTW, this is what PellesC creates from your C code:
Holly Cow!!! :exclaim: :icon_exclaim: :icon_eek:
are you pulling my leg :shock: actually, are you puling my both legs!???
If that is true, why can I not build 64 bit JWASM with it?
give me a proper explanation or I am gone to that C64 forum
Quoteyes, the purpose of my post was to defame you :dazzled:
I was not aware that I am famous, am I really :greenclp:
if I am really a celebrity, maybe I need a body guard, someone like Frank I meant Farmer not frktons :biggrin:( Kevin Michael Costner) or Arnold Alois Schwarzenegger 8)(Terminator)
how about Bullseye from DareDevil
(http://www.wrak.pl/inne/daredevil_bullseye.jpg)
funniest bad guy ever :lol:
dedndave, you are a genius :t :eusa_clap:
what are you doing in this forum!!!???
you could struck rich somewhere else :bgrin:
You have DEFAMED me
qWord PellesC produced almost perfect code which should look like this:
; Byte-copy loop with the counter update at the bottom, so a single
; conditional jump closes the loop.
sub_140001000 proc near
mov rax, rcx ; rax = incoming rcx, left untouched until the return
cmp rcx, rdx
jz short locret_140001026 ; both addresses equal: nothing to copy
test r8d, r8d
jz short locret_140001026 ; 32-bit counter is zero: nothing to copy
loc_140001010:
mov r9b, [rdx] ; move one byte from [rdx] to [rcx]
mov [rcx], r9b
add rcx, 1
add rdx, 1
sub r8d, 1 ; ZF is set when the counter reaches zero
jnz short loc_140001010 ; bytes left: go round again
locret_140001026:
retn
sub_140001000 endp
Frank,
Quote from: frktons on February 11, 2013, 08:17:09 PM
Here you are. The code tests only REP STOSQ vs MOVNTDQ.
You can add the tests for MOVAPS, MOVDQA, etc... if you like.
Frank
I'll first study your code and see what's to do. Thank you for uploading the source. :t
Gunther
I think a better word might be "celebrated".
Main Entry:
celebrated [sel-uh-brey-tid] Show IPA
Part of Speech: adjective
Definition: distinguished, famous
Synonyms: acclaimed, big*, eminent, famed, glorious, great, high-powered, illustrious, immortal, important, large, laureate, lionized, notable, number one, numero uno, outstanding, popular, preeminent, prominent, renowned, revered, storied, up there, w. k., well-known
de·fame audio (d-fm) KEY
TRANSITIVE VERB:
de·famed, de·fam·ing, de·fames
To damage the reputation, character, or good name of by slander or libel. See Synonyms at malign.
Archaic To disgrace.
thank you Magnum, :t
now I know who I am:
distinguished, high-powered, immortal, numero uno, macho-man 8)
I also want to say(no joking this time):
This forum has gathered the most prominent assembler programmers, and if we decide HERE that:
we should not hold with our teeth
something that is already obsolete
but embrace 64 bit
other assembler programmers
will have this to swallow
and our example follow
i could write some 64-bit code, but i'd have to get you guys to test it for me :(
dedndave, I promise you I will be proud to do that for you :t
qWord this is what I get when compile your function in C with MSVC205:
xmemcpy:
0000000140063090 mov qword ptr [rsp+18h],r8
0000000140063095 mov qword ptr [rsp+10h],rdx
000000014006309A mov qword ptr [rsp+8],rcx
000000014006309F sub rsp,38h
00000001400630A3 mov rax,qword ptr [cb]
00000001400630A8 shr rax,3
00000001400630AC mov dword ptr [cnt1],eax
00000001400630B0 mov rax,qword ptr [cb]
00000001400630B5 and rax,7
00000001400630B9 mov dword ptr [cnt2],eax
00000001400630BD mov rax,qword ptr [dest]
00000001400630C2 mov qword ptr [p1],rax
00000001400630C7 mov rax,qword ptr [src]
00000001400630CC mov qword ptr [p2],rax
00000001400630D1 jmp xmemcpy+5Fh (1400630EFh)
00000001400630D3 mov rax,qword ptr [p1]
00000001400630D8 add rax,8
00000001400630DC mov qword ptr [p1],rax
00000001400630E1 mov rax,qword ptr [p2]
00000001400630E6 add rax,8
00000001400630EA mov qword ptr [p2],rax
00000001400630EF mov eax,dword ptr [cnt1]
00000001400630F3 mov ecx,dword ptr [cnt1]
00000001400630F7 sub ecx,1
00000001400630FA mov dword ptr [cnt1],ecx
00000001400630FE test eax,eax
0000000140063100 je xmemcpy+84h (140063114h)
0000000140063102 mov rax,qword ptr [p1]
0000000140063107 mov rcx,qword ptr [p2]
000000014006310C mov rcx,qword ptr [rcx]
000000014006310F mov qword ptr [rax],rcx
0000000140063112 jmp xmemcpy+43h (1400630D3h)
0000000140063114 mov rax,qword ptr [p1]
0000000140063119 mov qword ptr [p3],rax
000000014006311E mov rax,qword ptr [p2]
0000000140063123 mov qword ptr [rsp],rax
0000000140063127 xor eax,eax
0000000140063129 cmp eax,1
000000014006312C je xmemcpy+0DBh (14006316Bh)
000000014006312E mov eax,dword ptr [cnt2]
0000000140063132 and eax,4
0000000140063135 test eax,eax
0000000140063137 je xmemcpy+0DBh (14006316Bh)
0000000140063139 mov rax,qword ptr [p3]
000000014006313E mov rcx,qword ptr [rsp]
0000000140063142 mov ecx,dword ptr [rcx]
0000000140063144 mov dword ptr [rax],ecx
0000000140063146 mov rax,qword ptr [p3]
000000014006314B add rax,4
000000014006314F mov qword ptr [p3],rax
0000000140063154 mov rax,qword ptr [rsp]
0000000140063158 add rax,4
000000014006315C mov qword ptr [rsp],rax
0000000140063160 mov eax,dword ptr [cnt2]
0000000140063164 sub eax,4
0000000140063167 mov dword ptr [cnt2],eax
000000014006316B jmp xmemcpy+0F7h (140063187h)
000000014006316D mov rax,qword ptr [p3]
0000000140063172 add rax,1
0000000140063176 mov qword ptr [p3],rax
000000014006317B mov rax,qword ptr [rsp]
000000014006317F add rax,1
0000000140063183 mov qword ptr [rsp],rax
0000000140063187 mov eax,dword ptr [cnt2]
000000014006318B mov ecx,dword ptr [cnt2]
000000014006318F sub ecx,1
0000000140063192 mov dword ptr [cnt2],ecx
0000000140063196 test eax,eax
0000000140063198 je xmemcpy+11Ah (1400631AAh)
000000014006319A mov rax,qword ptr [p3]
000000014006319F mov rcx,qword ptr [rsp]
00000001400631A3 movzx ecx,byte ptr [rcx]
00000001400631A6 mov byte ptr [rax],cl
00000001400631A8 jmp xmemcpy+0DDh (14006316Dh)
00000001400631AA mov rax,qword ptr [dest]
00000001400631AF add rsp,38h
00000001400631B3 ret
I can not believe that it takes only 2 ticks
i think you must be measuring that wrong
i think there is something wrong with the 2 cycle measurement :P
maybe the timer code isn't doing what you think it is or something
i have a friend, not too far away...
he has a win 7-64 ultimate new-fangled machine at home, now
he uses it mostly for running his business
http://www.mesabattingcages.com/ (http://www.mesabattingcages.com/)
he will let me test whatever i like, but i would hate to mess up his machine
or even be near it if it messes up :P
Quote from: habran on February 12, 2013, 06:37:57 AMI can not believe that it takes only 2 ticks
you are obviously not able to configure your compiler! Also, looking in the code of my testbench (and yes ... you can't compile it because there are some dependencies I've not include) you will see that I've used a high loop count, which blends out memory access.
; MSVC 2010
sub_140008A60 proc near
mov r10d, r8d
and r8d, 7
mov r9, rcx
shr r10d, 3
test r10d, r10d
jz short loc_140008A94
db 66h, 66h, 66h, 66h
nop word ptr [rax+rax+00000000h]
loc_140008A80:
mov rax, [rdx]
add r9, 8
add rdx, 8
dec r10d
mov [r9-8], rax
jnz short loc_140008A80
loc_140008A94:
test r8b, 4
jz short loc_140008AAC
mov eax, [rdx]
add r9, 4
add rdx, 4
mov [r9-4], eax
add r8d, 0FFFFFFFCh
loc_140008AAC:
test r8d, r8d
jz short loc_140008AD1
sub rdx, r9
db 66h, 66h, 66h, 66h
nop dword ptr [rax+rax+00000000h]
loc_140008AC0:
movzx eax, byte ptr [rdx+r9]
inc r9
dec r8d
mov [r9-1], al
jnz short loc_140008AC0
loc_140008AD1:
mov rax, rcx
retn
sub_140008A60 endp
in the attachment a testbench with loop count = 1
BTW: if you are not interested in a serious discussion, you may simply say that instead of this bullsh** parody.
yes qWord, now you are talking... :t
that looks more real then before and doesn't contradict to what I said before
my xmemcpy is OK for transferring one or two lines of characters, but for greater data transfer your function is absolute
I always admired your laser sharp mind and programmers skills :eusa_clap:
Quote from: qWord on February 12, 2013, 07:17:35 AM
Quote from: habran on February 12, 2013, 06:37:57 AMI can not believe that it takes only 2 ticks
you are obviously not able to configure your compiler! Also, looking in the code of my testbench (and yes ... you can't compile it because there are some dependencies I've not include) you will see that I've used a high loop count, which blends out memory access.
; MSVC 2010
sub_140008A60 proc near
mov r10d, r8d
and r8d, 7
mov r9, rcx
shr r10d, 3
test r10d, r10d
jz short loc_140008A94
db 66h, 66h, 66h, 66h
nop word ptr [rax+rax+00000000h]
loc_140008A80:
mov rax, [rdx]
add r9, 8
add rdx, 8
dec r10d
mov [r9-8], rax
jnz short loc_140008A80
loc_140008A94:
test r8b, 4
jz short loc_140008AAC
mov eax, [rdx]
add r9, 4
add rdx, 4
mov [r9-4], eax
add r8d, 0FFFFFFFCh
loc_140008AAC:
test r8d, r8d
jz short loc_140008AD1
sub rdx, r9
db 66h, 66h, 66h, 66h
nop dword ptr [rax+rax+00000000h]
loc_140008AC0:
movzx eax, byte ptr [rdx+r9]
inc r9
dec r8d
mov [r9-1], al
jnz short loc_140008AC0
loc_140008AD1:
mov rax, rcx
retn
sub_140008A60 endp
in the attachment a testbench with loop count = 1
BTW: if you are not interested in a serious discussion, you may simply say that instead of this bullsh** parody.
The executable is quite big after unzipping = 190K. What's inside?
I can't believe a simple test on memory copy takes all that code.
here is the version what I was talking about IMO fastest ever :t
please prove me wrong :biggrin:
I use xmm4 and ymm4 here because the first four registers (xmm0-xmm3) are used for floating-point arguments, and xmm4 is volatile as well,
so we don't have to preserve it
option win64:0
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
; xmemcpy (AVX) - copy r8 bytes from [rdx] to [rcx]; rax returns the incoming rcx.
; Whole 32-byte blocks go through ymm4; the 0..31 leftover bytes are handled
; by shifting the low five bits of r8 into CF one at a time.
; Changes rcx, rdx, r8, r9, r10, ymm4 and the flags.
; NOTE(review): the last tail step uses legacy-SSE movdqu after VEX-encoded
; vmovdqu ymm4, and no vzeroupper is issued before ret - check for AVX/SSE
; transition penalties.
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
mov rax,rcx ;keep the incoming rcx as the return value
.if (rcx!=rdx) ;equal addresses: skip everything
;r10 = r8 >> 5 = number of whole 32-byte blocks; rcx/rdx advance by 32 per block
.for (r10=r8,r10>>=5¦r10¦rcx+=32,rdx+=32,r10--)
vmovdqu ymm4,[rdx] ;unaligned 32-byte load
vmovdqu [rcx],ymm4 ;unaligned 32-byte store
.endfor
shr r8,1 ;CF = bit 0 of the count
.if (CARRY?) ;one leftover byte
mov r9b,[rdx]
mov [rcx],r9b
inc rcx
inc rdx
.endif
shr r8,1 ;CF = bit 1 of the count
.if (CARRY?) ;two leftover bytes
mov r9w,[rdx]
mov [rcx],r9w
add rcx,2
add rdx,2
.endif
shr r8,1 ;CF = bit 2 of the count
.if (CARRY?) ;four leftover bytes
mov r9d,[rdx]
mov [rcx],r9d
add rcx,4
add rdx,4
.endif
shr r8,1 ;CF = bit 3 of the count
.if (CARRY?) ;eight leftover bytes
mov r9,[rdx]
mov [rcx],r9
add rcx,8
add rdx,8
.endif
shr r8,1 ;CF = bit 4 of the count
.if (CARRY?) ;sixteen leftover bytes; last piece, so the pointers are not advanced
movdqu xmm4,[rdx]
movdqu [rcx],xmm4
.endif
.endif
aexit: ret
xmemcpy ENDP
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
it creates this code:
xmemcpy:
00000001`40034220 488bc1 mov rax,rcx
00000001`40034223 483bca cmp rcx,rdx
00000001`40034226 747a je xmemcpy+0x82 (00000001`400342a2)
00000001`40034228 4d8bd0 mov r10,r8
00000001`4003422b 49c1ea05 shr r10,5
00000001`4003422f 4d23d2 and r10,r10
00000001`40034232 7415 je xmemcpy+0x29 (00000001`40034249)
00000001`40034234 c5fe6f22 vmovdqu ymm4,ymmword ptr [rdx]
00000001`40034238 c5fe7f21 vmovdqu ymmword ptr [rcx],ymm4
00000001`4003423c 4883c120 add rcx,20h
00000001`40034240 4883c220 add rdx,20h
00000001`40034244 49ffca dec r10
00000001`40034247 75eb jne xmemcpy+0x14 (00000001`40034234)
00000001`40034249 49d1e8 shr r8,1
00000001`4003424c 730c jae xmemcpy+0x3a (00000001`4003425a)
00000001`4003424e 448a0a mov r9b,byte ptr [rdx]
00000001`40034251 448809 mov byte ptr [rcx],r9b
00000001`40034254 48ffc1 inc rcx
00000001`40034257 48ffc2 inc rdx
00000001`4003425a 49d1e8 shr r8,1
00000001`4003425d 7310 jae xmemcpy+0x4f (00000001`4003426f)
00000001`4003425f 66448b0a mov r9w,word ptr [rdx]
00000001`40034263 66448909 mov word ptr [rcx],r9w
00000001`40034267 4883c102 add rcx,2
00000001`4003426b 4883c202 add rdx,2
00000001`4003426f 49d1e8 shr r8,1
00000001`40034272 730e jae xmemcpy+0x62 (00000001`40034282)
00000001`40034274 448b0a mov r9d,dword ptr [rdx]
00000001`40034277 448909 mov dword ptr [rcx],r9d
00000001`4003427a 4883c104 add rcx,4
00000001`4003427e 4883c204 add rdx,4
00000001`40034282 49d1e8 shr r8,1
00000001`40034285 730e jae xmemcpy+0x75 (00000001`40034295)
00000001`40034287 4c8b0a mov r9,qword ptr [rdx]
00000001`4003428a 4c8909 mov qword ptr [rcx],r9
00000001`4003428d 4883c108 add rcx,8
00000001`40034291 4883c208 add rdx,8
00000001`40034295 49d1e8 shr r8,1
00000001`40034298 7308 jae xmemcpy+0x82 (00000001`400342a2)
00000001`4003429a f30f6f22 movdqu xmm4,xmmword ptr [rdx]
00000001`4003429e f30f7f21 movdqu xmmword ptr [rcx],xmm4
00000001`400342a2 c3 ret
and here is version for people without AVX
; xmemcpy (SSE2, no AVX) - copy r8 bytes from [rdx] to [rcx].
; Out:   rax = incoming rcx
; Whole 16-byte blocks go through xmm4; the 0..15 leftover bytes are copied
; as 1-, 2-, 4- and 8-byte pieces selected by the low four bits of the count.
; Changes rcx, rdx, r8, r9, r10, xmm4 and the flags.
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
        mov     rax,rcx                 ; return value, taken before rcx moves
        cmp     rcx,rdx
        je      copy_done               ; equal addresses: nothing to copy

        mov     r10,r8
        shr     r10,4                   ; r10 = number of whole 16-byte blocks
        jz      tail_byte               ; fewer than 16 bytes: tail only
block_loop:
        movdqu  xmm4,[rdx]              ; unaligned 16-byte load
        movdqu  [rcx],xmm4              ; unaligned 16-byte store
        add     rcx,16
        add     rdx,16
        dec     r10
        jnz     block_loop

tail_byte:
        shr     r8,1                    ; CF = bit 0 of the count
        jnc     tail_word
        mov     r9b,[rdx]
        mov     [rcx],r9b
        inc     rcx
        inc     rdx
tail_word:
        shr     r8,1                    ; CF = bit 1 of the count
        jnc     tail_dword
        mov     r9w,[rdx]
        mov     [rcx],r9w
        add     rcx,2
        add     rdx,2
tail_dword:
        shr     r8,1                    ; CF = bit 2 of the count
        jnc     tail_qword
        mov     r9d,[rdx]
        mov     [rcx],r9d
        add     rcx,4
        add     rdx,4
tail_qword:
        shr     r8,1                    ; CF = bit 3 of the count
        jnc     copy_done
        mov     r9,[rdx]                ; last piece: pointers are not advanced
        mov     [rcx],r9
copy_done:
        ret
xmemcpy ENDP
I commented this for the visitors only, not for the members of this forum :bgrin:
option win64:0 ;no need for any option
OPTION PROLOGUE:NONE ;just pure code
OPTION EPILOGUE:NONE
; xmemcpy (AVX) - copy count bytes from src to dest.
; In:    rcx = dest, rdx = src, r8 = count (read straight from the registers;
;        the parameter names are not referenced in the body)
; Out:   rax = dest as passed in
; Whole 32-byte blocks go through ymm4; the 0..31 leftover bytes are copied
; as 1-, 2-, 4-, 8- and 16-byte pieces selected by the low five bits of count.
; Changes rcx, rdx, r8, r9, r10, ymm4, the upper halves of all YMM registers
; (vzeroupper) and the flags.
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
mov rax,rcx ;return value: dest, captured before rcx is advanced
.if (rcx!=rdx) ;source and destination are the same address: nothing to copy
;r10 = count/32 = number of whole 32-byte blocks; rcx and rdx advance past each block
.for (r10=r8,r10>>=5¦r10¦rcx+=32,rdx+=32,r10--)
vmovdqu ymm4,[rdx] ;unaligned 32-byte load
vmovdqu [rcx],ymm4 ;unaligned 32-byte store
.endfor
;tail: each shr moves the next low bit of count into CF;
;bit n set means a piece of 2^n bytes is still to be copied
shr r8,1 ;bit 0
.if (CARRY?) ;one byte
mov r9b,[rdx]
mov [rcx],r9b
inc rcx
inc rdx
.endif
shr r8,1 ;bit 1
.if (CARRY?) ;two bytes
mov r9w,[rdx]
mov [rcx],r9w
add rcx,2
add rdx,2
.endif
shr r8,1 ;bit 2
.if (CARRY?) ;four bytes
mov r9d,[rdx]
mov [rcx],r9d
add rcx,4
add rdx,4
.endif
shr r8,1 ;bit 3
.if (CARRY?) ;eight bytes
mov r9,[rdx]
mov [rcx],r9
add rcx,8
add rdx,8
.endif
shr r8,1 ;bit 4
.if (CARRY?) ;sixteen bytes; last piece, so the pointers are not advanced
vmovdqu xmm4,[rdx] ;VEX form: legacy movdqu would execute with dirty upper YMM state
vmovdqu [rcx],xmm4
.endif
vzeroupper ;clear the upper YMM halves so SSE code in the caller pays no AVX/SSE transition cost
.endif
aexit: ret
xmemcpy ENDP
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
I thought maybe those who still have 32 bit old-timers machines can feel the blow of the lightning speed in they hair :biggrin:
but be careful, it can blow away that little bit of hair left on your head ;)
so I wrote a 32 bit for them 8)
; xmemcpy - copy count bytes from src to dest (32-bit, unaligned SSE2 moves).
; In:      dest, src, count (procedure arguments)
; Out:     eax = dest
; Scratch: eax, ecx, edx, xmm4; ebx is saved/restored through USES
; Nothing is copied when dest and src are the same address.
; Fix: the PROC keyword was missing from the procedure header.
xmemcpy PROC uses ebx dest:DWORD,src:DWORD,count:DWORD
    mov ecx,dest
    mov edx,src
    mov ebx,count
    .if (ecx!=edx)
        ; bulk part: count/16 blocks of 16 bytes, eax is the block counter
        .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--)
            movdqu xmm4,[edx]
            movdqu [ecx],xmm4
        .endfor
        ; tail: bits 0..3 of count are shifted into CF one at a time
        shr ebx,1                   ; bit 0 -> 1 byte
        .if (CARRY?)
            mov al,[edx]
            mov [ecx],al
            inc ecx
            inc edx
        .endif
        shr ebx,1                   ; bit 1 -> 2 bytes
        .if (CARRY?)
            mov ax,[edx]
            mov [ecx],ax
            add ecx,2
            add edx,2
        .endif
        shr ebx,1                   ; bit 2 -> 4 bytes
        .if (CARRY?)
            mov eax,[edx]
            mov [ecx],eax
            add ecx,4
            add edx,4
        .endif
        shr ebx,1                   ; bit 3 -> 8 bytes (last piece, pointers not advanced)
        .if (CARRY?)
            movq xmm4,[edx]
            movq [ecx],xmm4
        .endif
    .endif
    mov eax,dest                    ; eax was used as scratch, reload the return value
    ret
xmemcpy ENDP
Doesn't assemble with my version of JWasm. Where is your latest build?
And what does .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--) mean? "Much bigger or equal"??
Hello JJ2007,
my latest build is as usual in the topic ".FOR built in JWasm" http://masm32.com/board/index.php?topic=402.0 (http://masm32.com/board/index.php?topic=402.0) :shock:
QuoteAnd what does .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--) mean? "Much bigger or equal"??
and
>>=4 means shift right 4 time it produces
shr eax,4 :biggrin:
it means "Much much much much less" :lol:
Hey qWord,
Cat got your tongue? :icon_eek:
(https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcT6k8_OVV3GpA_eIvy8fUnCQ-1nGLI_RS1DuNQdCt0G9AsuOCYQ)
you have the same "Qosmio laptop" as me
did you test the speed? :bgrin:
Quote from: habran on February 20, 2013, 09:37:06 AM
I thought maybe those who still have 32 bit old-timers machines can feel the blow of the lightning speed in they hair :biggrin:
but be careful, it can blow away that little bit of hair left on your head ;)
so I wrote a 32 bit for them 8)
; xmemcpy - copy count bytes from src to dest (32-bit, unaligned SSE2 moves).
; In:      dest, src, count (procedure arguments)
; Out:     eax = dest
; Scratch: eax, ecx, edx, xmm4; ebx is saved/restored through USES
; Nothing is copied when dest and src are the same address.
; Fix: the PROC keyword was missing from the procedure header.
xmemcpy PROC uses ebx dest:DWORD,src:DWORD,count:DWORD
    mov ecx,dest
    mov edx,src
    mov ebx,count
    .if (ecx!=edx)
        ; bulk part: count/16 blocks of 16 bytes, eax is the block counter
        .for (eax=ebx,eax>>=4¦eax¦ecx+=16,edx+=16,eax--)
            movdqu xmm4,[edx]
            movdqu [ecx],xmm4
        .endfor
        ; tail: bits 0..3 of count are shifted into CF one at a time
        shr ebx,1                   ; bit 0 -> 1 byte
        .if (CARRY?)
            mov al,[edx]
            mov [ecx],al
            inc ecx
            inc edx
        .endif
        shr ebx,1                   ; bit 1 -> 2 bytes
        .if (CARRY?)
            mov ax,[edx]
            mov [ecx],ax
            add ecx,2
            add edx,2
        .endif
        shr ebx,1                   ; bit 2 -> 4 bytes
        .if (CARRY?)
            mov eax,[edx]
            mov [ecx],eax
            add ecx,4
            add edx,4
        .endif
        shr ebx,1                   ; bit 3 -> 8 bytes (last piece, pointers not advanced)
        .if (CARRY?)
            movq xmm4,[edx]
            movq [ecx],xmm4
        .endif
    .endif
    mov eax,dest                    ; eax was used as scratch, reload the return value
    ret
xmemcpy ENDP
Habran, why do you use MOVDQU and not align the memory
pointers to 16 bytes addresses? MOVAPS/MOVDQA are faster.
Unrolling the MOV can be another good option to test.
And if the area to copy is big, > 4 MB , MOVNTDQ is the best
option. Have a look at the old forum and search for CLEARBUFFER.
REP STOSQ is probably faster than your non AVX solution, give it
a shot on 64 bit version.
A last thing. You should post the results of your tests, if you like
to get the attention of somebody on these routines.
Frank
Hi Frank, :biggrin:
Quotewhy do you use MOVDQU and not align the memory
because this routine is created particularly for unaligned data like text or something
I totally agree with you that MOVDQA is much faster than MOVDQU :t
however, if data is aligned to 32 byte I wouldn't need that routine I would just write in my source:
;Aligned copy, 32 bytes per iteration through ymm4.
;The loop runs count/32 times (r8 is shifted right by 5 in the init list),
;so any remainder of count modulo 32 is not copied here.
;vmovdqa is the aligned form: dest and src must be 32-byte aligned.
;r8 can contain sizeof(buffer)
.for (rcx=dest,rdx=src,r8=count,r8>>=5¦r8¦rcx+=32,rdx+=32,r8--)
vmovdqa ymm4,[rdx]
vmovdqa [rcx],ymm4
.endfor
or, for 16-byte xmm registers:
;Aligned copy, 16 bytes per iteration through xmm4.
;The loop runs count/16 times (r8 is shifted right by 4 in the init list),
;so any remainder of count modulo 16 is not copied here.
;movdqa is the aligned form: dest and src must be 16-byte aligned.
;r8 can contain sizeof(buffer)
.for (rcx=dest,rdx=src,r8=count,r8>>=4¦r8¦rcx+=16,rdx+=16,r8--)
movdqa xmm4,[rdx]
movdqa [rcx],xmm4
.endfor
QuoteA last thing. You should post the results of your tests
I left it to qWord to do that for me because he likes testing and arguing :P
and I like and appreciate him :biggrin:
Quoteif you like to get the attention of somebody on these routines.
I don't give a damn about attention, take it or leave it 8)
Frank,
IMO it is not always advisable to align data to 16 or 32 bytes :(
if you have STRUCT in 32 bit program you align it to 4
in 64 bit logically is to align it to 8
however, when you work with big data transfer than it is logical to align it as big as your machine can afford :biggrin:
this version is even more optimized than the former one and has a more logical order;
it can also be faster when copying less than 32 bytes:
option win64:0
OPTION PROLOGUE:NONE                ; no generated prologue: arguments are used straight from registers
OPTION EPILOGUE:NONE
;-----------------------------------------------------------------------
; xmemcpy - copy count bytes from src to dest with unaligned AVX moves.
;           The odd-sized head (count mod 32) is copied first, then the
;           remaining count/32 blocks of 32 bytes.
; In:      rcx = dest, rdx = src, r8 = count
; Out:     rax = dest
; Scratch: r9, ymm4; rcx, rdx and r8 are modified
; Nothing is copied when dest and src are the same address.
;-----------------------------------------------------------------------
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
    mov rax,rcx                     ; return value = original dest
    .if (rcx!=rdx)
        ; each shr moves the next low bit of count into CF
        shr r8,1                    ; bit 0 -> 1 byte
        .if (CARRY?)
            mov r9b,[rdx]
            mov [rcx],r9b
            inc rcx
            inc rdx
        .endif
        shr r8,1                    ; bit 1 -> 2 bytes
        .if (CARRY?)
            mov r9w,[rdx]
            mov [rcx],r9w
            add rcx,2
            add rdx,2
        .endif
        shr r8,1                    ; bit 2 -> 4 bytes
        .if (CARRY?)
            mov r9d,[rdx]
            mov [rcx],r9d
            add rcx,4
            add rdx,4
        .endif
        shr r8,1                    ; bit 3 -> 8 bytes
        .if (CARRY?)
            mov r9,[rdx]
            mov [rcx],r9
            add rcx,8
            add rdx,8
        .endif
        shr r8,1                    ; bit 4 -> 16 bytes
        .if (CARRY?)
            ; VEX-encoded form: avoids mixing legacy SSE with 256-bit AVX code
            vmovdqu xmm4,[rdx]
            vmovdqu [rcx],xmm4
            add rcx,16
            add rdx,16
        .endif
        ; after five shifts r8 = count/32 = number of 32-byte blocks left
        .for (¦r8¦rcx+=32,rdx+=32,r8--)
            vmovdqu ymm4,[rdx]
            vmovdqu [rcx],ymm4
        .endfor
        vzeroupper                  ; clear upper YMM halves before returning to SSE/C callers
    .endif
    ret
xmemcpy ENDP
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
Quote from: habran on February 21, 2013, 05:48:21 AM
Hello JJ2007,
my latest build is as usual in the topic ".FOR built in JWasm" http://masm32.com/board/index.php?topic=402.0 (http://masm32.com/board/index.php?topic=402.0)
Doesn't work on XP: "Not a valid Win32 app", access denied.
sorry JJ2007, :(
it doesn't work on XP
Quotebinaries need at least Windows version 6
(Japheth)
however, there is a workaround for that
source code is in the folder and you can compile yourself if not to much hustle
just replace these two files in JW209s folder
if you don't have M$VC you can compile it with PelesC
but I don't believe you have enough energy to go through all that trouble :dazzled:
prove me wrong, I dare you :P
The standard JWasm works just fine on XP, I use it every day. And, no, I won't try to compile it myself. It is not a question of energy, though. I am too wise to invest my time in trying to compile a major C app :biggrin:
Wise man JJ2007 :biggrin:
believe it or not that excellent JWasm is written in C
and Japheth had to create binaries from it
how do you think he created it, by laying on it for four weeks or something ::)
NO!!! he compiled it!!!! and it looks that he did not dye of it
C is not a plug it is a programming language for Christ sake
don't be a chicken, roll your sleeves and get dirty
No pain no gain!!! :bgrin:
Quote from: habran on February 21, 2013, 01:14:23 PM
C is not a plug it is a programming language for Christ sake
It's spelled "plague", Habran.
thanks wise man JJ2007 :t
what kind of spelling checker is that when it did not worn me!!! ;)
I will reward you for that and only you can use it, you deserved it! ;)
here is for you changed source:
; xmemcpy - copy count bytes from src to dest (32-bit, unaligned SSE2 moves,
;           plain .while loop instead of .for).
;           The odd-sized head (count mod 16) is copied first, then the
;           remaining count/16 blocks of 16 bytes.
; In:      dest, src, count (procedure arguments)
; Out:     eax = dest
; Scratch: eax, ecx, edx, xmm4; ebx is saved/restored through USES
; Nothing is copied when dest and src are the same address.
; Fix: the PROC keyword was missing from the procedure header.
xmemcpy PROC uses ebx dest:DWORD,src:DWORD,count:DWORD
    mov ecx,dest
    mov edx,src
    mov ebx,count
    .if (ecx!=edx)
        ; each shr moves the next low bit of count into CF
        shr ebx,1                   ; bit 0 -> 1 byte
        .if (CARRY?)
            mov al,[edx]
            mov [ecx],al
            inc ecx
            inc edx
        .endif
        shr ebx,1                   ; bit 1 -> 2 bytes
        .if (CARRY?)
            mov ax,[edx]
            mov [ecx],ax
            add ecx,2
            add edx,2
        .endif
        shr ebx,1                   ; bit 2 -> 4 bytes
        .if (CARRY?)
            mov eax,[edx]
            mov [ecx],eax
            add ecx,4
            add edx,4
        .endif
        shr ebx,1                   ; bit 3 -> 8 bytes
        .if (CARRY?)
            movq xmm4,[edx]
            movq [ecx],xmm4
            add ecx,8
            add edx,8
        .endif
        ; after four shifts ebx = count/16 = number of 16-byte blocks left
        .while (ebx)
            movdqu xmm4,[edx]
            movdqu [ecx],xmm4
            add ecx,16
            add edx,16
            dec ebx
        .endw
    .endif
    mov eax,dest                    ; eax was used as scratch, reload the return value
    ret
xmemcpy ENDP
hey 2007,
are you going to abandon me because of a little spelling mistake :icon_eek:
plug, plague, plug in,plug out, ear plug, plagiarism... who cares :dazzled:
you are just trying to mask the main issue: compiling JWasm :bgrin:
those ENGLEZE have made mess with unnecessary complex spelling just to tease pure strangers :exclaim:
they messed it up so much that even they can not write "for sale" but use "4 sale" :icon_confused:
Hello,
Quote from: habran on February 21, 2013, 12:08:50 PM
if you don't have M$VC you can compile it with PelesC
it's mentioned in jwasm's readme, but since nobody reads readmes, I'll repeat it here: better
do NOT use PellesC to compile JWasm - the jwasm binary created by PellesC is unable to pass the regression tests supplied with the assembler. I haven't analyzed the problem too deeply, but judging from the part that fails I assume that floating-point constants don't have the values as they should.
Good compilers are: Open Watcom, MSVC, GCC (MinGW)
Quote from: habran on February 21, 2013, 01:49:47 PM
thanks wise man JJ2007 :t
...
here is for you changed source:
xmemcpy uses ebx dest:DWORD,src:DWORD,count:DWORD
mov ecx,dest
mov edx,src
...
ret
xmemcpy ENDP
Thanks, it looks competitive :t
AMD Athlon(tm) Dual Core Processor 4450B (SSE3)
loop overhead is approx. 238/100 cycles
9458 cycles for 100 * xmemcpy
8056 cycles for 100 * MbCopy
9292 cycles for 100 * xmemcpy
7893 cycles for 100 * MbCopy
9289 cycles for 100 * xmemcpy
8072 cycles for 100 * MbCopy
Hi Japheth,
Quotedo NOT use PellesC to compile JWasm
sorry for misunderstanding :bgrin:
I've read it but I thought that it applies only to 64 bit
jj2007,
thanks for testing it
this version is created for unaligned data as I mentioned before
can you please try to compare when not aligned at all? :biggrin:
jj2007,
this is what my machine produce from your test:
Intel(R) Core(TM) i7-3610QM CPU @ 2.30GHz (SSE4)
loop overhead is approx. 155/100 cycles
2242 cycles for 100 * xmemcpy
5356 cycles for 100 * MbCopy
2239 cycles for 100 * xmemcpy
5455 cycles for 100 * MbCopy
2243 cycles for 100 * xmemcpy
5166 cycles for 100 * MbCopy
--- ok ---
as double as fast as yours, wouldn't you say so :shock:
QuoteThanks, it looks competitive
I would say
It looks downright stunning!!!! :t
Quote from: habran on February 21, 2013, 10:46:27 PM
I would say It looks downright stunning!!!! :t
I fully agree! However - almost 100% faster than MB - which allegedly is already rocket-science? How is this possible? You must do something wrong...
here is 64 bit without .for:
option win64:0
OPTION PROLOGUE:NONE                ; no generated prologue: arguments are used straight from registers
OPTION EPILOGUE:NONE
;-----------------------------------------------------------------------
; xmemcpy - copy count bytes from src to dest with unaligned AVX moves
;           (plain .while loop, no .for extension required).
;           The odd-sized head (count mod 32) is copied first, then the
;           remaining count/32 blocks of 32 bytes.
; In:      rcx = dest, rdx = src, r8 = count
; Out:     rax = dest
; Scratch: r9, ymm4; rcx, rdx and r8 are modified
; Nothing is copied when dest and src are the same address.
; Fixes:   removed a stray "xmemcpy ENDP" that preceded the PROC;
;          16-byte step now VEX-encoded; vzeroupper added before return.
;-----------------------------------------------------------------------
xmemcpy PROC dest:QWORD,src :QWORD, count:UINT_PTR
    mov rax,rcx                     ; return value = original dest
    .if (rcx!=rdx)
        ; each shr moves the next low bit of count into CF
        shr r8,1                    ; bit 0 -> 1 byte
        .if (CARRY?)
            mov r9b,[rdx]
            mov [rcx],r9b
            inc rcx
            inc rdx
        .endif
        shr r8,1                    ; bit 1 -> 2 bytes
        .if (CARRY?)
            mov r9w,[rdx]
            mov [rcx],r9w
            add rcx,2
            add rdx,2
        .endif
        shr r8,1                    ; bit 2 -> 4 bytes
        .if (CARRY?)
            mov r9d,[rdx]
            mov [rcx],r9d
            add rcx,4
            add rdx,4
        .endif
        shr r8,1                    ; bit 3 -> 8 bytes
        .if (CARRY?)
            mov r9,[rdx]
            mov [rcx],r9
            add rcx,8
            add rdx,8
        .endif
        shr r8,1                    ; bit 4 -> 16 bytes
        .if (CARRY?)
            ; VEX-encoded form: avoids mixing legacy SSE with 256-bit AVX code
            vmovdqu xmm4,[rdx]
            vmovdqu [rcx],xmm4
            add rcx,16
            add rdx,16
        .endif
        ; after five shifts r8 = count/32 = number of 32-byte blocks left
        .while (r8)
            vmovdqu ymm4,[rdx]
            vmovdqu [rcx],ymm4
            add rcx,32
            add rdx,32
            dec r8
        .endw
        vzeroupper                  ; clear upper YMM halves before returning to SSE/C callers
    .endif
    ret
xmemcpy ENDP
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
Japheth,
Quote
I fully agree! However - almost 100% faster than MB - which allegedly is already rocket-science? How is this possible? You must do something wrong...
I did not touch code I just executed JJ's exe on my machine
and I can do it again now, let see:
Intel(R) Core(TM) i7-3610QM CPU @ 2.30GHz (SSE4)
loop overhead is approx. 154/100 cycles
2237 cycles for 100 * xmemcpy
5243 cycles for 100 * MbCopy
2240 cycles for 100 * xmemcpy
5176 cycles for 100 * MbCopy
2233 cycles for 100 * xmemcpy
5163 cycles for 100 * MbCopy
--- ok ---
Japheth, why don't you try it in your machine?
Quote from: habran on February 22, 2013, 12:22:58 AM
Japheth, why don't you try it in your machine?
The fastest machine that I have available is an 5 year old AMD 64 X2 5000+.
Cool down. If it's faster than the MasmBasic algo, it just means it is faster on your CPU. Well optimised for your CPU.
In case you like it less superficially (d7=destination is align 16+7, s3=src is 16+3 etc):
AMD Athlon(tm) Dual Core Processor 4450B (SSE3)
Algo memcpy MemCo1 MemCo2 MemCoC3 MemCoP4 MemCoC2 MemCoL xmemcpy
Description CRT rep movs movdqa lps+hps movdqa movdqa Masm32 Habran's
dest-al psllq CeleronM dest-al src-al library Ferrari
Code size ? 70 291 222 200 269 33 104
------------------------------------------------------------------------------------
2048, d0s0-0 561 549 360 439 424 361 547 541
2048, d1s1-0 720 597 410 473 473 421 1061 798
2048, d7s7-0 721 598 412 474 474 412 1060 798
2048, d7s8-1 809 851 1016 578 566 582 802 558
2048, d7s9-2 809 853 1016 567 566 567 1058 798
2048, d8s7+1 810 851 868 563 564 565 819 607
2048, d8s8-0 738 587 404 465 480 416 547 541
2048, d8s9-1 801 848 994 563 564 567 804 606
2048, d9s7+2 824 864 862 565 564 579 1060 798
2048, d9s8+1 808 853 862 564 567 565 803 543
2048, d9s9-0 721 595 411 472 472 409 1061 798
2048, d15s15 722 591 425 480 486 422 1072 798
Your algo is pretty good, but for the (frequent) aligned case, there are four algos that perform better on my AMD.
prescott w/htt
Intel(R) Pentium(R) 4 CPU 3.00GHz (SSE3)
Algo memcpy MemCo1 MemCo2 MemCoC3 MemCoP4 MemCoC2 MemCoL xmemcpy
Description CRT rep movs movdqa lps+hps movdqa movdqa Masm32 Habran's
dest-al psllq CeleronM dest-al src-al library Ferrari
Code size ? 70 291 222 200 269 33 104
------------------------------------------------------------------------------------
2048, d0s0-0 717 719 608 609 904 610 718 1590
2048, d1s1-0 1100 846 651 651 650 650 4435 3945
2048, d7s7-0 1003 849 656 657 656 655 4437 3952
2048, d7s8-1 1368 1445 1223 868 611 613 4303 3799
2048, d7s9-2 1367 1446 1224 867 611 611 4454 3929
2048, d8s7+1 1338 1446 1188 1342 611 1023 1343 1748
2048, d8s8-0 976 849 656 657 657 656 977 1588
2048, d8s9-1 1332 1470 1212 873 611 612 1333 1733
2048, d9s7+2 1663 1440 1179 1342 611 1023 4150 4085
2048, d9s8+1 1660 1439 1182 1343 610 1023 4026 4014
2048, d9s9-0 1098 850 664 667 664 664 4135 4127
2048, d15s15 770 853 664 665 662 664 4136 4108
Here the test results:
Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz (SSE4)
Algo memcpy MemCo1 MemCo2 MemCoC3 MemCoP4 MemCoC2 MemCoL xmemcpy
Description CRT rep movs movdqa lps+hps movdqa movdqa Masm32 Habran's
dest-al psllq CeleronM dest-al src-al library Ferrari
Code size ? 70 291 222 200 269 33 104
------------------------------------------------------------------------------------
2048, d0s0-0 427 223 251 248 247 250 224 292
2048, d1s1-0 275 251 275 270 277 273 274 303
2048, d7s7-0 275 253 282 273 278 276 274 303
2048, d7s8-1 279 271 617 453 247 269 273 303
2048, d7s9-2 279 272 617 450 254 269 274 303
2048, d8s7+1 275 270 621 483 256 272 274 304
2048, d8s8-0 275 255 295 284 288 291 274 303
2048, d8s9-1 275 271 610 452 254 269 273 294
2048, d9s7+2 283 272 611 486 262 276 276 309
2048, d9s8+1 287 277 612 486 261 276 274 309
2048, d9s9-0 280 260 287 280 281 285 280 309
2048, d15s15 280 260 287 281 282 286 280 309
Gunther
One more - not by accident, #4 was named "CeleronM" ;-)
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)
Algo memcpy MemCo1 MemCo2 MemCoC3 MemCoP4 MemCoC2 MemCoL xmemcpy
Description CRT rep movs movdqa lps+hps movdqa movdqa Masm32 Habran's
dest-al psllq CeleronM dest-al src-al library Ferrari
Code size ? 70 291 222 200 269 33 104
------------------------------------------------------------------------------------
2048, d0s0-0 556 566 363 363 373 363 563 1051
2048, d1s1-0 1047 619 421 423 444 423 1683 1782
2048, d7s7-0 567 619 418 420 446 420 1699 1782
2048, d7s8-1 1677 1714 1090 441 1118 1123 1302 1337
2048, d7s9-2 1677 1713 1090 441 1118 1123 1716 1782
2048, d8s7+1 1655 1502 1090 857 979 975 1647 1245
2048, d8s8-0 556 619 420 422 448 422 563 1051
2048, d8s9-1 1664 1714 1083 441 1118 1123 1661 1241
2048, d9s7+2 1668 1502 1081 857 979 975 1762 1495
2048, d9s8+1 1668 1502 1081 857 979 975 1283 1052
2048, d9s9-0 1047 619 420 422 448 422 1686 1497
2048, d15s15 567 619 422 424 446 424 1678 1497
as I said before this routine is PARTICULARLY made for UNALIGNED data
that is why I use MOVDQU command
there is no reason to create a sophisticated algorithm for aligned data
you can just use fastest command to do that depending on the ability of your machine
;Three aligned copy loops written with the .for extension.
;Each one shifts the byte count right in its init list, so the loop runs
;count/32 (or count/16) times and a remainder is not copied.
;vmovdqa/movdqa are the aligned forms: dest and src must be aligned
;to the size of the register being moved.
;64-bit, 32 bytes per iteration through ymm4:
;r8 can contain sizeof(buffer)
.for (rcx=dest,rdx=src,r8=count,r8>>=5¦r8¦rcx+=32,rdx+=32,r8--)
vmovdqa ymm4,[rdx]
vmovdqa [rcx],ymm4
.endfor
;or, for 16-byte xmm registers:
;r8 can contain sizeof(buffer)
.for (rcx=dest,rdx=src,r8=count,r8>>=4¦r8¦rcx+=16,rdx+=16,r8--)
movdqa xmm4,[rdx]
movdqa [rcx],xmm4
.endfor
;for 32 bit machine
;eax can contain sizeof(buffer)
.for (ecx=dest,edx=src,eax=count,eax>>=4¦eax¦ecx+=16,edx+=16,eax--)
movdqa xmm4,[edx]
movdqa [ecx],xmm4
.endfor
;or for JJ2007
;Same aligned 16-byte copy written with a plain .while loop (no .for needed).
mov ecx,dest
mov edx,src
mov eax,sizeof(buffer)
shr eax,4 ;eax = number of 16-byte blocks; a remainder is not copied
.while (eax)
movdqa xmm4,[edx] ;aligned form: src and dest must be 16-byte aligned
movdqa [ecx],xmm4
add edx,16
add ecx,16
dec eax
.endw
we can also use this:
; Aligned copy that walks the buffer backwards, using eax as a byte offset.
; eax starts at size-16 and steps down by 16; the block at offset 0 must be
; copied too, so the loop continues while eax >= 0 (the original "> 0"
; stopped one block early and never copied the first 16 bytes).
; sizeof(buffer) is expected to be a multiple of 16; movdqa needs
; 16-byte aligned src and dest.
mov ecx,dest
mov edx,src
mov eax,sizeof(buffer)
sub eax,16                      ; offset of the last 16-byte block
.while (SDWORD eax >= 0)
    movdqa xmm4,[edx+eax]
    movdqa [ecx+eax],xmm4
    sub eax,16
.endw
we can use macros rather than subs
like this:
; xmcopy16 dest, src, size
; Inline aligned copy of size bytes from src to dest, 16 bytes per step,
; walking from the end of the buffer down to offset 0.
; size is expected to be a multiple of 16 (otherwise the leading
; size mod 16 bytes are not copied); movdqa needs 16-byte aligned
; src and dest.
; Clobbers: eax, ecx, edx, xmm4, flags.
; Fix: the second parameter was declared as "crc" while the body used
;      "src", so the source argument was never substituted.
xmcopy16 MACRO dest,src,size
    mov ecx,dest
    mov edx,src
    mov eax,size
    sub eax,16                  ; offset of the last 16-byte block
    .while (SDWORD eax >= 0)
        movdqa xmm4,[edx+eax]
        movdqa [ecx+eax],xmm4
        sub eax,16
    .endw
ENDM
; xmcopy32 dest, src, size
; Inline aligned copy of size bytes from src to dest, 32 bytes per step
; through ymm4, walking from the end of the buffer down to offset 0.
; size is expected to be a multiple of 32 (otherwise the leading
; size mod 32 bytes are not copied); vmovdqa needs 32-byte aligned
; src and dest.
; Clobbers: rax, rcx, rdx, ymm4 (upper YMM halves are cleared), flags.
; Fixes: rcx was loaded with size instead of dest; the second parameter
;        was declared as "crc" while the body used "src"; the loop moved
;        only 16 bytes (movdqa xmm4) per 32-byte step.
xmcopy32 MACRO dest,src,size
    mov rcx,dest
    mov rdx,src
    mov rax,size
    sub rax,32                  ; offset of the last 32-byte block
    .while (SQWORD rax >= 0)
        vmovdqa ymm4,[rdx+rax]
        vmovdqa [rcx+rax],ymm4
        sub rax,32
    .endw
    vzeroupper                  ; avoid AVX/SSE transition penalties in following code
ENDM
here is test on my computer for JJ's exe
Intel(R) Core(TM) i7-3610QM CPU @ 2.30GHz (SSE4)
Algo memcpy MemCo1 MemCo2 MemCoC3 MemCoP4 MemCoC2 MemCoL xmemcpy
Description CRT rep movs movdqa lps+hps movdqa movdqa Masm32 Habran's
dest-al psllq CeleronM dest-al src-al library Ferrari
Code size ? 70 291 222 200 269 33 104
------------------------------------------------------------------------------------
2048, d0s0-0 133 184 205 203 202 204 184 238
2048, d1s1-0 225 206 226 223 227 223 225 249
2048, d7s7-0 225 208 229 225 228 216 225 249
2048, d7s8-1 228 223 501 367 209 219 221 246
2048, d7s9-2 225 218 498 365 206 217 221 245
2048, d8s7+1 221 217 502 390 206 219 221 244
2048, d8s8-0 221 205 238 229 232 235 221 244
2048, d8s9-1 222 218 492 365 204 218 222 245
2048, d9s7+2 220 217 488 390 206 219 221 244
2048, d9s8+1 226 218 491 390 206 219 221 245
2048, d9s9-0 221 206 224 222 224 226 221 245
2048, d15s15 221 206 226 222 225 226 221 245
--- ok ---
It is interesting how my code has steady speed in different sizes
and it is interesting how older processors perform in different way than newer
thank you JJ for taking time to write testing programs :t
however, I suspect that you are puling my leg because I don't have time nor desire to learn your BSIC$ ;)
(for the reason I mentioned before)
when I talk about a beauty of the source code I talk about visual effect ,readability and functionality
sometime your programs can be maybe even faster than someone else's but no one will try to read it
because most of your MULTO IMPORTANTE routines are hidden either in $$$$$ macros or %$#% external functions
however, it is a pleasure to exchange opinions and diversity in programming technics :biggrin:
Japheth,
QuoteThe fastest machine that I have available is an 5 year old AMD 64 X2 5000+.
I saw on Google that they are advertising new laptops for $249 dollars (probably with AVX) :biggrin:
actually, Jochen's MasmBasic is a very productive library
you can bet many of the routines are quite fast
and - many of the functions aren't found in the masm32 library
i would use it more often, myself, except for one thing....
i am trying to learn assembler for windows
high-level constructs mask the assembler code i am trying to learn
the same may be said for many of your macros
hi dedndave,
Quoteactually, Jochen's MasmBasic is a very productive library
you can bet many of the routines are quite fast
and - many of the functions aren't found in the masm32 library
there is no doubt about :t
we are talking here about readability of sources :bgrin:
as soon as I look at his source code I feel like piercing my eyes with a cactus thorn
programs that look like this:"LET$!@#$%^&*@#$%^&*!"
who can have now-days enough patience and concentration to follow this code
"Mission Impossible 32" with JJ2007 as main actor (Tom Cruise refused the role because of the age)
and he is hiding his most important sources from public eyes like double agent 007
another drawback is that Jochen's MasmBasic is 32 bit and I am programming only 64 bit
I love assembler that's why I joined to this forum otherwise I would be a member of some BASIC community
please don't tell to JJ about our conversation, I don't want him to feel bad because I like him and appreciate his brains
Macros are helpful to make programs more readable but they should be visible to programmers and named properly :biggrin:
thanks Gunther for your contribution to this topic :t
QuoteIntel(R) Core(TM) i7-3770 CPU @ 3.40GHz (SSE4)
Speedy Gonzales like my "Ferrari Testarossa xmemcpy"
(AVX tires would make it even faster)
(https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcRIvLk3BkI91urzhSOVGY3L48NHdusYFcSO4Q9XukpSpHx9A398uQ)
thanks to our God Father JJ Corleone for naming it so
Japheth,
QuoteI fully agree! However - almost 100% faster than MB - which allegedly is already rocket-science? How is this possible? You must do something wrong...
I found this explanation in "INTEL® 64 AND IA-32 PROCESSOR ARCHITECTURES" manual
Quote
2.3.5.1 Efficient Handling of Alignment Hazards
The cache and memory subsystems handles a significant percentage of instructions
in every workload. Different address alignment scenarios will produce varying performance
impact for memory and cache operations. For example, 1-cycle throughput of
L1 (see Table 2-21) generally applies to naturally-aligned loads from L1 cache. But
using unaligned load instructions (e.g. MOVUPS, MOVUPD, MOVDQU, etc.) to access
data from L1 will experience varying amount of delays depending on specific microarchitectures
and alignment scenarios.
Table 2-21. Performance Impact of Address Alignments of MOVDQU from L1
Throughput (cycle)                | Intel Core i7 | 45 nm Intel Core  | 65 nm Intel Core
                                  | Processor     | Microarchitecture | Microarchitecture
Alignment Scenario                | 06_1AH        | 06_17H            | 06_0FH
----------------------------------+---------------+-------------------+------------------
16B aligned                       | 1             | 2                 | 2
Not-16B aligned, not cache split  | 1             | ~2                | ~2
Split cache line boundary         | ~4.5          | ~20               | ~20
Because my processor is a 2.3 GHz Core i7 with a lot of cache,
it takes only 1 cycle for either MOVDQU or MOVDQA
Intel(R) Core(TM) i7 CPU 860 @ 2.80GHz (SSE4)
Algo memcpy MemCo1 MemCo2 MemCoC3 MemCoP4 MemCoC2 MemCoL xmemcpy
Description CRT rep movs movdqa lps+hps movdqa movdqa Masm32 Habran's
dest-al psllq CeleronM dest-al src-al library Ferrari
Code size ? 70 291 222 200 269 33 104
------------------------------------------------------------------------------------
2048, d0s0-0 196 257 252 252 583 235 600 444
2048, d1s1-0 460 274 687 690 284 277 704 444
2048, d7s7-0 468 277 286 286 289 281 293 444
2048, d7s8-1 302 299 732 521 240 253 705 444
2048, d7s9-2 302 300 867 607 256 253 294 445
2048, d8s7+1 294 726 640 551 239 247 265 444
2048, d8s8-0 471 280 700 288 287 282 293 443
2048, d8s9-1 295 303 637 522 272 253 704 444
2048, d9s7+2 300 301 634 553 289 593 703 444
2048, d9s8+1 301 724 633 552 277 247 292 444
2048, d9s9-0 469 670 694 269 696 282 294 446
2048, d15s15 415 280 289 251 289 284 293 447
--- ok ---on: February 10, 2013, 11:57:14 PM Gunther wrote:
QuoteThere are a few applications which really need more than 4 GB RAM (large data bases for example), but others do not.
The transporters of the future will need to access 7,000,000,000,000,000,000,000,000,000 points of data - that's a 795,807,864,054,000.1 terrabyte address space :icon_eek:
Hi drifter,
Quote from: drifter on February 26, 2013, 05:49:48 PM
The transporters of the future will need to access 7,000,000,000,000,000,000,000,000,000 points of data - that's a 795,807,864,054,000.1 terrabyte address space :icon_eek:
that might be, but that could be reached with a 64 bit architecture. But what's with the hole bunch of other applications? By the way, you'll find a few 64 bit applications in the forum, which I've written.
Gunther
hello drifter,
welcome to the forum :biggrin:
interesting to see the difference in speed with different processors
your is i7 2.8 gig and mine is i7 2.3 but speed is double
I am curies why is that?
Gunter, your is i7 3,4 gig and still slower than qWord's and mine
here are specifications:
Intel® Core™ i7-3610QM Processor Intel® Core™ i7-3770 Processor
(6M Cache, up to 3.30 GHz) (8M Cache, up to 3.90 GHz)
Specifications Specifications
Essentials Essentials
Status Launched Status Launched
Launch Date Q2'12 Launch Date Q2'12
Processor Number i7-3610QM Processor Number i7-3770
# of Cores 4 # of Cores 4
# of Threads 8 # of Threads 8
Clock Speed 2.3 GHz Clock Speed 3.4 GHz
Max Turbo Frequency 3.3 GHz Max Turbo Frequency 3.9 GHz
Intel® Smart Cache 6 MB Intel® Smart Cache 8 MB
Bus/Core Ratio 23 Bus/Core Ratio 34
DMI 5 GT/s DMI 5 GT/s
Instruction Set 64-bit Instruction Set 64-bit
Instruction Set Extensions AVX Instruction Set Extensions SSE4.1/4.2, AVX
Embedded Options Available No Embedded Options Available Yes
Lithography 22 nm Lithography 22 nm
Max TDP 45 W Max TDP 77 W
Recommended Customer Price TRAY: $378.00 Recommended Customer Price TRAY: $294.00
BOX : $305.00
the number of clock cycles it takes for a processor to do something doesn't make a very good benchmark
you are comparing one algo to another
not comparing one cpu to another
Hi habran,
Quote from: dedndave on February 26, 2013, 11:55:54 PM
the number of clock cycles it takes for a processor to do something doesn't make a very good benchmark
that's the answer.
Gunther
on: February 26, 2013, 09:24:23 PM Gunther wrote:
Quotebut that could be reached with a 64 bit architecture
I thought 64 bit architecture could only reach 18,446,744,073,709,551,616 bits?
QuoteBy the way, you'll find a few 64 bit applications in the forum, which I've written.
Thanks! I definately look forward to studying those (once I get up to speed on 32 bits).
on: February 26, 2013, 10:08:02 PM habran wrote:
Quoteyour is i7 2.8 gig and mine is i7 2.3 but speed is double
I am curies why is that?
I probably didn't have my computer set up for an optimum running of the race - it was just my standard operating configuration. I did close down all other applications, including the ones I didn't need in the toolbar - there's probably a lot more I could have shut down in task manager.
Hi drifter,
Quote from: drifter on February 27, 2013, 05:34:08 AM
I thought 64 bit architecture could only reach 18,446,744,073,709,551,616 bits?
please, have a look here: https://en.wikipedia.org/wiki/64-bit_architecture (https://en.wikipedia.org/wiki/64-bit_architecture)
Gunther
Frederick,
these tests are normally written to run with elevated prioriity
you shouldn't have to close down background apps to get decent results
provided the guy followed a few simple guidelines :P
Quote from: Gunther on February 27, 2013, 01:07:02 AM
Hi habran,
Quote from: dedndave on February 26, 2013, 11:55:54 PM
the number of clock cycles it takes for a processor to do something doesn't make a very good benchmark
that's the answer.
Gunther
.if (Gunther && dedndave)
would please explain
to habran
jmp understand
nop
nop
understand:
return knowlage
.endif
:bgrin:
on: Feburary 26, 2013 at 07:02:09 AM dedndave wrote:
Quotethese tests are normally written to run with elevated prioriity
you shouldn't have to close down background apps to get decent results
I put this computer together a couple of years ago with and I've been very happy with it:
Computer:
MSI P55-GD80 Motherboard w/Intel Core i7CPU @ 2.8 Ghz
2 x 27" Samsung LCD monitors
2 x Seagate ST32000641AS - Barracuda XT 2 TB Hard drives
16 GB RAM
Windows 8 64-bit
Desk:
Apogee Rosetta 800 AD/DA 8-channel converter
Lexicon PCM80 Digital Effects Processor
Lexicon PCM90 Digital Reverberator
Focusrite ISA-110 Limited Edition pre-amp/equalizer
Avalon Vt-737sp Vacuum Tube pre-amp/compressor/equalizer
Miscellaneous:
Neumann U87 Ai Condenser microphone
M-Audio Keystation Pro88 keyboard
Software:
Cakewalk Sonar/Dimension Pro/Rapture (64-bit)
Sony Sound Forge/Acid/Vegas/CD Architect /DVD Architect
Spectrasonics Stylus RMX/Trilian/Omnishpere
and of course:
MASM32
IDA Pro 5.3
Visual Studio 2005
hey drifter,
that looks impressive and neat :shock:
how did you pay your wife to arrange all this for you ;)
however, your screens are to high and you get probably quickly tired of looking at it :dazzled:
when we run these tests....
do:
compare algo A to algo B on computer 1
compare algo A to algo B on computer 2
do not:
compare algo A on computer 1 to algo A on computer 2
compare algo B on computer 1 to algo B on computer 2
on: Feburary 26, 2013 at 09:10:54 AM habran wrote:
Quotehow did you pay your wife to arrange all this for you
I have a Russian wife - she still thinks it's her duty...
Quotehowever, your screens are to high and you get probably quickly tired of looking at it :dazzled:
If I have to read something, I take it to kinkos have it printed..
drifter
QuoteI have a Russian wife - she still thinks it's her duty...
I am *much *much younger than you but I have learned in my life that
NOTHING is for free in this world ;)
*much = 1 year :lol:
Quote from: dedndave on February 27, 2013, 09:16:14 AM
when we run these tests....
do:
compare algo A to algo B on computer 1
compare algo A to algo B on computer 2
do not:
compare algo A on computer 1 to algo A on computer 2
compare algo B on computer 1 to algo B on computer 2
is that how you test which processor is faster? :dazzled:
no - we don't want to know which processor is faster :P
we want to know which algorithm is best, let's say, overall
Quotedo:
compare algo A to algo B on computer 1
compare algo A to algo B on computer 2
do not:
compare algo A on computer 1 to algo A on computer 2
compare algo B on computer 1 to algo B on computer 2
the real information is in (algo B)/(algo A) on a given processor
you might compare that ratio with the same ratio on another processor
we aren't here to measure cpu's :t
Quote from: dedndave on February 27, 2013, 10:06:51 AM
we aren't here to measure cpu's :t
so, you think I was rude to be curious :shock:
Quote
interesting to see the difference in speed with different processors
your is i7 2.8 gig and mine is i7 2.3 but speed is double
I am curies why is that?
Gunter, your is i7 3,4 gig and still slower than qWord's and mine
that was not a provocation it was a curios technical question:
to put it that way:
you are a salesman and I ask you as a customer:
"why would I by i7 6.8 gig if i7 2.3 gig runs faster?" :icon_confused:
and you tell me: "i7 6.8 gig sounds better, you moron" :icon_mrgreen:
a clock cycle on one cpu is not the same as a clock cycle on another (moron) :lol:
someone posted the comparison of specs - then removed it
but, i suspect the ratio of internal to external bus clocks might have something to do with it
if you want to know which is fastest, run a real-time benchmark test
set up a test to measure minutes and seconds to accomplish some specific task
then run the same test on both machines
you want it to be fairly long, say a few minutes or so
now you're talking... :lol:
I thought bicycle, motorcycle, reversecycle, evercycle wtfcycle... are the same :dazzled:
thanks dedndave :t
dedndave, how do you explain this: :greenclp:
Quote
Clock Speed
Clock speed is the rate at which a processor can complete a processing cycle. It is typically measured in megahertz or gigahertz. One megahertz is equal to one million cycles per second, while one gigahertz equals one billion cycles per second. This means a 1.8 GHz processor has twice the clock speed of a 900 MHz processor.
However, it is important to note that a 1.8 GHz CPU is not necessarily twice as fast as a 900 MHz CPU. This is because different processors often use different architectures. For example, one processor may require more clock cycles to complete a multiplication instruction than another processor. If the 1.8 GHz CPU can complete a multiplication instruction in 4 cycles, while the 900 MHz CPU takes 7 cycles, the 1.8 GHz processor will be more than twice as fast as the 900 MHz processor. Conversely, if the 1.8 GHz processor takes more cycles to perform the instruction, it will be less than 2x as fast as the 900 MHz processor.
Other factors, such as a computer's bus speed, cache size, speed of the RAM, and hard drive speed also contribute to the overall performance of the machine. Therefore, while the processor's clock speed is a significant indicator of how fast a computer is, it is not the only factor that matters.
you can find it here: http://pc.net/glossary/definition/clockspeed
that's what i've been trying to tell you - lol
are you sure?
Quote
Clock speed is the rate at which a processor can complete a processing cycle. It is typically measured in megahertz or gigahertz.
so, if some computer needs 444 cycles and another computer needs 222 cycles for the same job, does that mean that the one with 444 cycles is faster because it has bigger number? :exclaim:
write a test app to know the answer :biggrin:
dedndave,
your motto is: "Never give up" :biggrin:
my motto is : "Never give in" 8)
you and me together unbreakable :t