For example: 0010 0110 0111 1000 1001 1001 1001 0101 should be reversed to: 1010 1001 1001 1001 0001 1110 0110 0100
This is a challenge, the solution with the minimum number of 32-bit ASM instructions wins. I have a solution with 8 instructions. Anyone can do in less?
I have not got time to do it but what about BSWAP then use a 0 to 255 lookup table ? Should be 5 instructions.
Quote from: hutch-- on November 01, 2019, 02:40:17 AM
I have not got time to do it but what about BSWAP then use a 0 to 255 lookup table ? Should be 5 instructions.
I would rather prefer no lookup tables. However, using BSWAP then reverse the bits in each byte is an idea but should take more than 8 instructions all together.
deleted
Quote from: nidud on November 01, 2019, 03:25:32 AM
mov ecx,32
B: shr edx,1
rcl eax,1
loop B
Very good :thumbsup: , eax don't need to be zeroed but you need 1 more instruction to load edx. This makes 5. Probably it is not possible to do better. Congratulations.
This was mine:
xor eax, eax
mov ecx, 32
revert: ; revert bits
mov ebx, esi
add eax, eax
and ebx, 1
shr esi, 1
or eax, ebx
loop revert
Quote from: AW on November 01, 2019, 01:53:40 AM
For example: 0010 0110 0111 1000 1001 1001 1001 0101 should be reversed to: 1010 1001 1001 1001 0001 1110 0110 0100
This is a challenge, the solution with the minimum number of 32-bit ASM instructions wins. I have a solution with 8 instructions. Anyone can do in less?
maybe reciprocal calculation smallest
Loop code has more than 8 executed instructions.
Why not in C too//#include <intrin.h>
// 0011 1101 0010 0110
// 0110 0100 1011 1100
// 3D26h -> 64BC
// 0010 0110 0111 1000 1001 1001 1001 0101
// 1010 1001 1001 1001 0001 1110 0110 0100
// 26789995h -> A9991E64h
unsigned char _bittest(const long *valueptr, long index);
unsigned char _bittestandset(long *valueptr, long index);
#if defined(__clang__)
unsigned char _interlockedbittestandreset(volatile long *valueptr, long index);
#else
unsigned char _interlockedbittestandreset(long *valueptr, long index);
#endif
int printf(const char * restrict format, ...);
#pragma comment(lib, "msvcrt.lib")
void __cdecl mainCRTStartup(void)
{
//long ul = 0x3D26;
long ul = 0x26789995;
long ul2 = 0;
//int i1 = _bit_scan_forward(ul); // first bit to set
//int i2 = _bit_scan_reverse(ul); // last bit to set
//for (int i=i1,j=31;i<=i2;i++,j--) {
for (int i=0,j=31;i<=31;i++,j--) {
if (_bittest((const long int *)&ul, i)) {
_bittestandset(&ul2, j);
printf("1");
} else {
_interlockedbittestandreset(&ul2, j);
printf("0");
}
}
printf("\n%lXh %lXh\n", ul, ul2);
}
without header intrin.h, C compilers accept local defines for testing.
EDIT:
unsigned char _bittest(const long *valueptr, long index);
unsigned char _bittestandset(long *valueptr, long index);
void __cdecl mainCRTStartup(void)
{
long ul = 0x26789995;
long ul2 = 0;
for (int i=0,j=31; j; i++,j--) {
if (_bittest((const long int *)&ul, i)) _bittestandset(&ul2, j);
}
printf("\n%lXh %lXh\n", ul, ul2);
}
.386
includelib msvcrt.lib
public mainCRTStartup
.data
fmt db "%lXh %lXh",10,0
.code
mainCRTStartup:
push ebp
mov ebp, esp
sub esp, 8h
mov dword ptr [ebp-4h], 26789995h
mov dword ptr [ebp-8h], 0h
xor eax, eax
mov edx, 1Fh
L1:
bt dword ptr [ebp-4h], eax
setb cl
test cl, cl
jz @F
bts dword ptr [ebp-8h], edx
@@:
setb cl
add eax, 1h
sub edx, 1h
jnz L1
push dword ptr [ebp-8h]
push dword ptr [ebp-4h]
push offset fmt
call printf
add esp, 0Ch
mov esp, ebp
pop ebp
ret
end
void print_bits(long val)
{
char *p, s[40];
p = s;
for (int i=31; i>=0; i--) {
if (_bittest((const long int *)&val, i)) *p = '1'; else *p = '0';
//if (val & (1<<i)) *p = '1'; else *p = '0';
p++;
if (!(i%4)) {
*p = ' ';
p++;
}
}
*(--p) = 0;
puts(s);
}
or edx, -1
m2m ecx, 32
@@: dec ecx
jle @F
ror eax, 1
jc @B
btr edx, ecx
jmp @B
@@:
Some AMDs have the XOP instruction set which contains an instruction called VPPERM that allows to produce bit reversal. I presume the XOP is deprecated and recent AMDs don't have it.
:biggrin:
edx = 00100110011110001001100110010101y, challenge [awch1.asm, 77]
eax = 10101001100110010001111001100100y, Nidud [awch1.asm, 85]
eax = 10101001100110010001111001100100y, AW [awch1.asm, 108]
edx = 10101001100110010001111001100101y, JJ [awch1.asm, 124] Oops!
deleted
:thumbsup:
edx = 00100110011110001001100110010101y, challenge [awch1.asm, 45]
eax = 10101001100110010001111001100100y, Nidud (4 active lines, value in edx) [awch1.asm, 52]
eax = 10101001100110010001111001100100y, AW (8 active lines, value in esi) [awch1.asm, 66]
edx = 10101001100110010001111001100100y, JJ-Nidud (11 active lines, value in eax) [awch1.asm, 83]
@HSE: Why 11 "active lines"? It's 8 instructions. But if you insist, it can be done in 8 lines, too:
or edx, -1
m2m ecx, 32
@@: dec ecx
jle $+11
ror eax, 1
jc @B
btr edx, ecx
jmp @B
Cleaned from C code:
.386
includelib msvcrt.lib
public mainCRTStartup
.data
fmt db "%lXh %lXh",10,0
.code
mainCRTStartup:
push ebp
mov ebp, esp
sub esp, 8h
mov dword ptr [ebp-4h], 26789995h
mov dword ptr [ebp-8h], 0h
xor eax, eax
mov edx, 1Fh
L1:
bt dword ptr [ebp-4h], eax
;setb cl ; useless
;test cl, cl ; useless
jnb @F ; jz @F
bts dword ptr [ebp-8h], edx
@@:
;setb cl ; useless
add eax, 1h
sub edx, 1h
jnz L1
push dword ptr [ebp-8h]
push dword ptr [ebp-4h]
push offset fmt
call printf
add esp, 0Ch
mov esp, ebp
pop ebp
ret
end
Quote from: jj2007 on November 01, 2019, 08:04:30 AM
@HSE: Why 11 "active lines"? It's 8 instructions.
Hi JJ!
See last Nidud post, you forget some lines. You can see your results in previous post (http://masm32.com/board/index.php?topic=8138.msg89374#msg89374)
Active lines are lines with instructions, lines with only labels or comments (or nothing) don't count. Some assemblers require only labels lines, others no. (I prefer to use only labes lines always)
Quote from: HSE on November 01, 2019, 09:16:03 AMyou forget some lines
New version. The input in eax remains unchanged, while the result is returned in edx. Actually,
7 lines are enough, and no labels needed - see attachment :tongue:
input before b:eax 11111111111111111111111111111111
reversed once b:edx 11111111111111111111111111111111
reversed twice b:edx 11111111111111111111111111111111
input before b:eax 11110000111100001111000011110000
reversed once b:edx 00001111000011110000111100001111
reversed twice b:edx 11110000111100001111000011110000
input before b:eax 11101110111011101110111011101110
reversed once b:edx 01110111011101110111011101110111
reversed twice b:edx 11101110111011101110111011101110
input before b:eax 11001100110011001100110011001100
reversed once b:edx 00110011001100110011001100110011
reversed twice b:edx 11001100110011001100110011001100
input before b:eax 10101010101010101010101010101010
reversed once b:edx 01010101010101010101010101010101
reversed twice b:edx 10101010101010101010101010101010
input before b:eax 00000000000000000000000000000000
reversed once b:edx 00000000000000000000000000000000
reversed twice b:edx 00000000000000000000000000000000
without m2m MACRO it could be 7.
Quote from: TimoVJL on November 01, 2019, 06:42:31 PM
without m2m MACRO it could be 7.
Right,
mov ecx, 32 is ok, too. If it's not speed-critical, I always use m2m for the range -128...127.
This is the smallest without using the loop instruction :badgrin:
mov eax, 0
mov esi, 00100110011110001001100110010101b
reverse:
lzcnt edx, esi
bts eax, edx
not dl
and dl, 1fh
btc esi, edx
jnz reverse
For people with Haswell because this requires Bit Manipulation Instruction Set 2 (BMI2):
mov eax, 0
mov esi, 00100110011110001001100110010101b
reverse:
lzcnt ecx, esi
bsr edx, esi
bts eax, ecx
bzhi esi, esi, edx ; requires BMI2
jnz reverse
Quote from: AW on November 01, 2019, 09:33:01 PM
This is the smallest without using the loop instruction :badgrin:
Apparently
lzcnt is a lot simpler CISC algorithm than
loop (but this machine only have SS3 :biggrin:) . I think that replacing loop with dec ecx / jnl @B is simpler and universal.
edx = 00100110011110001001100110010101y, challenge [awch1.asm, 49]
eax = 10101001100110010001111001100100y, Nidud (4 active lines, value in edx) [awch1.asm, 56]
eax = 10101001100110010001111001100100y, AW (8 active lines, value in esi) [awch1.asm, 70]
edx = 10101001100110010001111001100101y, JJ Oops (if wrong no matter active lines, value in eax) [awch1.asm, 83]
edx = 10101001100110010001111001100100y, JJ-Nidud (11 active lines, value in eax) [awch1.asm, 101]
edx = 10101001100110010001111001100100y, JJ Another (7 active lines, value in eax) [awch1.asm, 114]
note: asmc /Zne /c /coff awch1.asm
lzcnt is similar to bsr but bsr brought me a headache with the flags.
This was modified from C code mov esi, 26789995h
xor edi, edi
xor eax, eax
mov edx, 1Fh
L1:
bt esi, eax
;setb cl
;test cl, cl
jnb @F ; jz @F
bts edi, edx
@@:
;setb cl
;add eax, 1h
inc eax
;sub edx, 1h
dec edx
jnz L1
why shouldnt a single 1/x with fdiv with double work on reversing bits,as 2 reversed=0.5,4 reversed =0.25,8 reversed=0.125 ...?
Quote from: daydreamer on November 02, 2019, 12:30:37 AM
why shouldnt a single 1/x with fdiv with double work on reversing bits,as 2 reversed=0.5,4 reversed =0.25,8 reversed=0.125 ...?
Why don't you test it and post some code?