include \masm32\MasmBasic\MasmBasic.inc ; download (http://masm32.com/board/index.php?topic=94.0)
.data
q1 QWORD 10000000000000000000000000000000000b ; only bit 34 set
q2 QWORD 10010000000000000000000000000000000b ; bits 34 and 31 set
q3 QWORD 00010000000000000000000000000000000b ; only bit 31 set
bsr64 MACRO argQ
mov eax, dword ptr argQ
mov edx, dword ptr argQ[4]
; deb 4, "bsr", b:edx, b:eax
bsr eax, edx ; non-zero if edx is non-zero
.if Zero?
bsr eax, eax
.else
add eax, 32
.endif
exitm <eax>
ENDM
Init
Print Str$("bsr64 of q1 = %i\n", bsr64(q1))
Print Str$("bsr64 of q2 = %i\n", bsr64(q2))
Inkey Str$("bsr64 of q3 = %i\n", bsr64(q3))
EndOfCodeOutput:
bsr64 of q1 = 34
bsr64 of q2 = 34
bsr64 of q3 = 31I'd like to know whether anybody (e.g. on AMD cpu) sees a different output - exe is attached.
P.S.: See also Bit Manipulation Instruction Sets (https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets) and the September 2013 bsr eax, eax with eax=0 (http://masm32.com/board/index.php?topic=2312.0) thread.
Apparently it plays a role in chess programming, see Bsf/Bsr behavior with zero source (https://chessprogramming.wikispaces.com/Processor%20Instructions%20for%20Bitscans-x86-Bsf/Bsr%20behavior%20with%20zero%20source):
QuoteIntel and AMD specify different behavior. In praxis there seems no difference so far. However, as long as Intel docs explicitly state content undefined, it is recommend to don't rely on a pre-initialized content of that target register, if the source is zero.
Intel : If the content of the source operand is 0, the content of the destination operand is undefined. [35 (http://www.intel.com/Assets/PDF/manual/253666.pdf)]
AMD: If the second operand contains 0, the instruction sets ZF to 1 and does not change the contents of the destination register. [36 (http://support.amd.com/us/Processor_TechDocs/24594_APM_v3.pdf)]
Timings (b is the safe version):
Intel(R) Core(TM) i5-2450M CPU @ 2.50GHz (SSE4)
153 cycles for 100 * bsr64a, bit 31
153 cycles for 100 * bsr64b, bit 31
102 cycles for 100 * bsr64a, bit 34
165 cycles for 100 * bsr64b, bit 34
bsr64 of q1 = 34
bsr64 of q2 = 34
bsr64 of q3 = 31
Yuck.
AMD A10-7850K Radeon R7, 12 Compute Cores 4C+8G (SSE4)
867 cycles for 100 * bsr64a, bit 31
893 cycles for 100 * bsr64b, bit 31
468 cycles for 100 * bsr64a, bit 34
460 cycles for 100 * bsr64b, bit 34
882 cycles for 100 * bsr64a, bit 31
850 cycles for 100 * bsr64b, bit 31
425 cycles for 100 * bsr64a, bit 34
445 cycles for 100 * bsr64b, bit 34
844 cycles for 100 * bsr64a, bit 31
856 cycles for 100 * bsr64b, bit 31
443 cycles for 100 * bsr64a, bit 34
440 cycles for 100 * bsr64b, bit 34
855 cycles for 100 * bsr64a, bit 31
847 cycles for 100 * bsr64b, bit 31
432 cycles for 100 * bsr64a, bit 34
437 cycles for 100 * bsr64b, bit 34
840 cycles for 100 * bsr64a, bit 31
842 cycles for 100 * bsr64b, bit 31
441 cycles for 100 * bsr64a, bit 34
445 cycles for 100 * bsr64b, bit 34
851 cycles for 100 * bsr64a, bit 31
852 cycles for 100 * bsr64b, bit 31
440 cycles for 100 * bsr64a, bit 34
439 cycles for 100 * bsr64b, bit 34
24 bytes for bsr64a, bit 31
26 bytes for bsr64b, bit 31
24 bytes for bsr64a, bit 34
26 bytes for bsr64b, bit 34
31 = eax bsr64a, bit 31
31 = eax bsr64b, bit 31
34 = eax bsr64a, bit 34
34 = eax bsr64b, bit 34
Quote from: sinsi on June 02, 2016, 12:51:00 PM
Yuck.
AMD A10-7850K Radeon R7, 12 Compute Cores 4C+8G (SSE4)
It seems that AMD is not so fast for bsr. Thanks, it seems the additional push+pop doesn't slow the "safe" version down.
Btw there is a hilarious debate at SOF on What is the fastest/most efficient way to find the highest set bit (msb) in an integer in C? (http://stackoverflow.com/questions/671815/what-is-the-fastest-most-efficient-way-to-find-the-highest-set-bit-msb-in-an-i). At about half the page, Josh posts "some (simple) benchmarks". Compiles fine with GCC, using e.g. using c++ -O1 -o Tmp.exe "Tmp.cpp"
Now it turns out that the inline assembly part is
not particularly fast. Given that the other approaches are, ehm, not as simple and straightforward as
bsr eax, eax, I inserted an
int 3, and Olly told me a secret 8)
unsigned OUT;
#define POS_OF_HIGHESTBITasm(a) (({asm("bsrl %1,%0" : "=r"(OUT) : "r"(a));}), OUT)
...
start = clock();
for (ui = 0; ui < LOOPS; ++ui)
asm("int $3");
n = NUM_OF_HIGHESTBITasm(ui);
end = clock();
printf("asm:\t%e\n", (double)(end-start)/CLOCKS_PER_SEC);So far so good. But why is that simple bsr eax, eax as slow as the acrobatic C algos???
004016F4 ³. E8 7F0A0000 call <jmp.&msvcrt.clock> ; Jump to msvcrt.clock
004016F9 ³. 89C3 mov ebx, eax
004016FB ³. BA 00E1F505 mov edx, 5F5E100 ; edx is decimal 100000000
00401700 ³> CC Úint3
00401701 ³. 83EA 01 ³sub edx, 1
00401704 ³. 75 FA Àjnz short 00401700
00401706 ³. B8 00E1F505 mov eax, 5F5E100
0040170B ³. 0FBDC0 bsr eax, eax
0040170E ³. A3 08604000 mov [406008], eax
00401713 ³. E8 600A0000 call <jmp.&msvcrt.clock> ; Jump to msvcrt.clock
Because GCC inserts a little loop with 100000000 iterations there :greensml:
Quote from: jj2007 on June 02, 2016, 01:30:08 PM
for (ui = 0; ui < LOOPS; ++ui)
asm("int $3");
n = NUM_OF_HIGHESTBITasm(ui);
So far so good. But why is that simple bsr eax, eax as slow as the acrobatic C algos???
[...]
Because GCC inserts a little loop with 100000000 iterations there :greensml:
No, because you insert a loop with 100000000 iterations ;)
bsr64 of q1 = 34
bsr64 of q2 = 34
bsr64 of q3 = 31
Intel(R) Core(TM) i3-3110M CPU @ 2.40GHz (SSE4)
171 cycles for 100 * bsr64a, bit 31
176 cycles for 100 * bsr64b, bit 31
113 cycles for 100 * bsr64a, bit 34
181 cycles for 100 * bsr64b, bit 34
172 cycles for 100 * bsr64a, bit 31
173 cycles for 100 * bsr64b, bit 31
113 cycles for 100 * bsr64a, bit 34
180 cycles for 100 * bsr64b, bit 34
171 cycles for 100 * bsr64a, bit 31
174 cycles for 100 * bsr64b, bit 31
114 cycles for 100 * bsr64a, bit 34
181 cycles for 100 * bsr64b, bit 34
196 cycles for 100 * bsr64a, bit 31
201 cycles for 100 * bsr64b, bit 31
115 cycles for 100 * bsr64a, bit 34
182 cycles for 100 * bsr64b, bit 34
173 cycles for 100 * bsr64a, bit 31
173 cycles for 100 * bsr64b, bit 31
113 cycles for 100 * bsr64a, bit 34
182 cycles for 100 * bsr64b, bit 34
171 cycles for 100 * bsr64a, bit 31
172 cycles for 100 * bsr64b, bit 31
113 cycles for 100 * bsr64a, bit 34
182 cycles for 100 * bsr64b, bit 34
24 bytes for bsr64a, bit 31
26 bytes for bsr64b, bit 31
24 bytes for bsr64a, bit 34
26 bytes for bsr64b, bit 34
31 = eax bsr64a, bit 31
31 = eax bsr64b, bit 31
34 = eax bsr64a, bit 34
34 = eax bsr64b, bit 34
--- ok ---
Quote from: qWord on June 02, 2016, 10:50:27 PMNo, because you insert a loop with 100000000 iterations ;)
Oops, you are right - at war again <with> (the) [goddamn] {C brackets} :icon_redface: