This is my first attempt to create a program to check the entropy of a file.
It's not optimized; the intention was for it to be easy to read.
For now only linux x86-64 version.
edited- forgot exit,0 after show usage
.X64
include ../inc/c.inc
include ../inc/glib.inc
count_frequency proto :ptr,:qword,:ptr
frequency2probabilities proto :ptr,:qword
.data?                                 ;uninitialized (BSS) data
input_filename dq ?                    ;argv[1]: name of the file to measure
pinput_string dq ?                     ;buffer returned by g_file_get_contents (file bytes)
input_sz dq ?                          ;file length in bytes, filled by g_file_get_contents
.data
align 16
frequency_table dq 256 dup (0) ;per-byte qword counters; transformed in place to real8 probabilities
.code
main proc uses rbx rbp r12 r13 r14 r15 _argc:dword,_argv:ptr
;-----------------------------------------------------------------------
; entropy <filename>
; Order-0, byte-based entropy of a file:
;   H = -sum_i p(i)*log2 p(i)          (bits per symbol)
; Prints the theoretical order-0 compressed size ceil(-sum * size / 8)
; in bytes and a rough "compression margin" percentage used to classify
; the file as entropic / somewhat entropic / not entropic.
;-----------------------------------------------------------------------
    local argc:guint32
    local argv:gpointer
    local first:qword               ;sum of p*log2(p) over all symbols (<= 0)
    local second:qword              ;first * input size = -(total info bits)
    local entr:qword                ;theoretical compressed size in bytes
    local isentropic:qword          ;100 - entr*100/input_sz (margin %)
    local error:ptr _GError
    mov argc,_argc
    mov argv,_argv
    .if argc != 2                   ;need exactly program name + one filename;
        invoke printf,CStr("Usage: entropy filename",10)
        invoke exit,0               ;also covers argc > 2, which previously fell
    .endif                          ;through with uninitialized buffers
    mov rax,argv                    ;ptr to command line tokens
    add rax,8                       ;skip program name
    mov rax,[rax]                   ;argv[1]
    mov input_filename,rax          ;input filename
    invoke g_get_current_dir        ;rax -> current working directory
    invoke g_build_filename,rax,input_filename,NULL ;cwd + "/" + filename
    mov error,0
    invoke g_file_get_contents,rax,addr pinput_string,addr input_sz,addr error
    .if eax == FALSE                ;load failed: print GError message, quit
        mov rax,error
        assume rax:GError
        mov rsi,[rax].message
        assume rax:nothing
        invoke printf,CStr("%s",10),rsi
        invoke exit,1
    .endif
    .if input_sz == 0               ;empty file: entropy is 0 and the percent
        invoke printf,CStr("input size = 0 bytes",10) ;math would divide by zero
        invoke exit,0
    .endif
    ;count byte symbol frequencies
    invoke count_frequency,pinput_string,input_sz,addr frequency_table
    ;frequencies sum to input_sz, so dividing by it yields probabilities
    invoke frequency2probabilities,addr frequency_table,input_sz
    ;replace each nonzero probability p with p*log2(p) (x87 fyl2x = y*log2 x)
    lea r12,frequency_table         ;probabilities table
    mov r13,0
    finit
    .while r13 != 256
        .if qword ptr [r12] != 0    ;p = 0 contributes 0 to the entropy sum
            fld qword ptr [r12]     ;y = p
            fld qword ptr [r12]     ;x = p
            fyl2x                   ;st0 = p * log2(p)
            fstp qword ptr [r12]    ;store term back in place
        .endif
        add r12,8
        inc r13
    .endw
    ;sum the p*log2(p) terms of every symbol
    lea r12,frequency_table
    mov r13,0
    pxor xmm0,xmm0
    .while r13 != 256
        addsd xmm0,qword ptr [r12]
        add r12,8
        inc r13
    .endw
    movsd first,xmm0                ;term sum, <= 0
    mov rax,input_sz
    cvtsi2sd xmm1,rax               ;input_sz as real8
    mulsd xmm0,xmm1                 ;-(total information content in bits)
    movsd second,xmm0
    invoke printf,CStr("input size = %ld bytes",10),input_sz ;%ld: qword arg
    invoke printf,CStr("In theory compressor order 0 byte based can compress this file to ")
    mov rax,8
    cvtsi2sd xmm1,rax
    movsd xmm0,second
    divsd xmm0,xmm1                 ;bits -> bytes (still negative)
    movsd xmm1,xmm0                 ;negate: x - x - x = -x
    subsd xmm0,xmm1
    subsd xmm0,xmm1
    ;ceil(): truncate toward zero, then add 1 if a fractional part remains
    cvttsd2si rax,xmm0              ;truncating convert (cvtsd2si rounds to
    cvtsi2sd xmm1,rax               ;nearest and over-counted fractions >= .5)
    comisd xmm1,xmm0
    je @F
    inc rax                         ;fraction present -> round up
@@:
    mov entr,rax
    invoke printf,CStr("%ld bytes",10),entr
    ;express entr as a percentage of the input: input = 100%, entr = x%
    mov rax,entr
    mov rcx,100
    mul rcx                         ;rdx:rax = entr*100 (fits 64 bits here)
    xor edx,edx                     ;discard high part before unsigned divide
    mov rcx,input_sz
    div rcx                         ;rax = entr*100/input_sz
    mov rcx,100
    sub rcx,rax                     ;compression margin in percent
    mov isentropic,rcx
    invoke printf,CStr("compression <= %ld%%",10),rcx ;%% prints a literal %
    mov rcx,isentropic
    .if rcx <= 3                    ;margin of 3% or less
        invoke printf,CStr("This is an entropic file",10)
        invoke printf,CStr("Probably its a compressed file, or cripto file, or random file",10)
        invoke printf,CStr("So much information inside this file",10)
    .elseif rcx <= 10               ;margin of 10% or less
        invoke printf,CStr("More or less entropic file",10)
        invoke printf,CStr("Information inside this file can be compressed",10)
    .else
        invoke printf,CStr("Not an entropic file",10)
        invoke printf,CStr("Little information inside this file",10)
        invoke printf,CStr("Compression order 1 or above can probably reduce this file size",10)
    .endif
    invoke exit,0
main endp
align 16
count_frequency proc uses rbx rbp r12 r13 r14 r15 _pointer:ptr,_ssize:guint64,ftable:ptr
;-----------------------------------------------------------------------
; Tally occurrences of every byte value 0..255 in a buffer.
; In:  _pointer = buffer address
;      _ssize   = buffer length in bytes
;      ftable   = 256 qword counters (caller supplies them zeroed)
; Out: each counter incremented once per matching byte in the buffer
;-----------------------------------------------------------------------
    local psrc:ptr
    local nbytes:qword
    local ptable:ptr
    mov psrc,_pointer
    mov nbytes,_ssize
    mov ptable,ftable
    mov r10,psrc                    ;current read position
    mov r11,ptable                  ;counter table base
    mov r8,nbytes                   ;bytes still to scan
    .while r8 != 0
        movzx rax,byte ptr [r10]    ;next input byte -> table index
        inc qword ptr [r11+rax*8]   ;bump that symbol's counter
        inc r10
        dec r8
    .endw
    ret
count_frequency endp
align 16
frequency2probabilities proc uses rbx rbp r12 r13 r14 r15 _pointer:ptr,_ssize:guint64
;-----------------------------------------------------------------------
; Turn the 256 qword frequency counters into real8 probabilities,
; in place: slot_i = count_i / total. Zero counts are left untouched
; (integer 0 has the same bit pattern as real8 0.0).
; In:  _pointer = table of 256 qword counters
;      _ssize   = sum of all counters (the input size)
;-----------------------------------------------------------------------
    local ptable:ptr
    local total:qword
    mov ptable,_pointer
    mov total,_ssize
    mov rax,total
    cvtsi2sd xmm0,rax               ;symbol total as the real8 divisor
    mov r10,ptable                  ;current table slot
    xor r11d,r11d                   ;slots processed so far
    .while r11 != 256
        mov rax,qword ptr [r10]     ;this symbol's frequency
        .if rax != 0
            cvtsi2sd xmm1,rax
            divsd xmm1,xmm0         ;frequency / total = probability
            movsd qword ptr [r10],xmm1 ;overwrite counter with probability
        .endif
        add r10,8
        inc r11
    .endw
    ret
frequency2probabilities endp
end main
Output sounds like this:
Quote
./entropy defrand.pad
input size = 1000000 bytes
In theory compressor order 0 byte based can compress this file to 999980 bytes
compression <= 1%
This is an entropic file
Probably its a compressed file, or cripto file, or random file
So much information inside this file
Quote
./entropy entropy.uasm
input size = 5449 bytes
In theory compressor order 0 byte based can compress this file to 3081 bytes
compression <= 44%
Not an entropic file
Little information inside this file
Compression order 1 or above can probably reduce this file size
I have a file from my random number generator: a file of 13k primes that is worth testing entropy with — many unique numbers, and hard to compress.
Unfortunately I don't have Linux.
hello sir daydreamer;
Sorry for posting only today, busy days here.
This is a windows x86-64 version. I tested it only in wine on linux.
If somebody find something wrong, please, tell me.
so x64 version does that mean it takes advantage of 64bit mode alloc enough to handle 13gb file?
just needed to find those files comparing between .exe that contains prime.inc LUT
and prime.inc source file
PS C:\masm32\1th> .\entropy 1thwithLUT.exe
input size = 16896 bytes
In theory compressor order 0 byte based can compress this file to 15744 bytes
compression <= 7 percent
More or less entropic file
Information inside this file can be compressed
-------------------------------------------------------------------------------------
PS C:\masm32\1th> .\entropy primes.inc
input size = 55282 bytes
In theory compressor order 0 byte based can compress this file to 24347 bytes
compression <= 56 percent
Not an entropic file
Little information inside this file
Compression order 1 or above can probably reduce this file size
PS C:\masm32\1th>
hello sir daydreamer;
Yes, it takes advantage of it as long as a symbol's frequency doesn't overflow the 8-byte (qword) counter and real8 precision isn't lost. BUT I haven't checked it on huge files.
One thing that need be said is about model. Model used in this program checks only a symbol without dependency of other symbols, so, byte order0 based. We can do this with bits and reach other entropy value. Entropy gives correct value but it's model dependent.
My tests was done in enwik8 file. I will present other file that most of us have inside computer, windows.inc file.
./entropy windows.inc
input size = 850039 bytes
...can compress this file to 464451 bytes
compression <= 46%
There is a data compression program called DMC (Dynamic Markov Chain). It is bit based — an arithmetic-encoder compression tool that uses a Markov chain arranged like a braid (a tree whose nodes reach back to roots). It uses a probability trigger to drive the cloning process of leaf nodes. In other words, the Markov chain is used as a way to recreate (predict) source symbols, like talking with a bot you know.
./admc3 c windows.inc 1
done...
bytes in 850039, bytes out 181188, ratio 0.213153
So, windows.inc file was compressed to 181188 bytes. Very far from 464451 bytes suggested by entropy. This is because other model was used in compression. If we insert that model and count frequency based in that model used by dmc we can reach correct entropy value, and also check efficiency of arithmetic encoder, or how much bits was increased (or lost) by using an arithmetic encoder.
Other example comes from Mark Nelson. If I try their compression tool that uses a lot of memory too to compress a file based in order N, I got different results:
Usage: COMP-2 [-o order] [-f text file] [-c compressed file]
./comp-2 -o 1 -f windows.inc
Compressing windows.inc to test.cmp, order 1.
850039/261724, 2.463
./comp-2 -o 2 -f windows.inc
Compressing windows.inc to test.cmp, order 2.
850039/203682, 1.917
./comp-2 -o 3 -f windows.inc
Compressing windows.inc to test.cmp, order 3.
850039/166675, 1.569
./comp-2 -o 4 -f windows.inc
Compressing windows.inc to test.cmp, order 4.
850039/154910, 1.458
./comp-2 -o 5 -f windows.inc
Compressing windows.inc to test.cmp, order 5.
850039/154953, 1.458
So, if we increase order (inter/intra correlation between symbols) we can get more compression. But this is not absolute answer, because order 5 compressed file to 154953 bytes while order 4 compressed file to 154910 bytes.
First came gif compression, 256 color compression, jpg compressed 24bit color with few 0-2 noise on each channel that the eye didn't notice
Png good for both photographs and gif things with transparency
.Ico files 32bit with x bit transparency, 24bit,16bit,8bit palette, 4bit palette,1bit palette,when working with icons I noticed. Zip uncompressed icons worse than compressed. Ico format
My point being,check file format and switch/case check different entropy depending on bit depths? Simplest with. Bmps
Maybe need adjust to different algo with. Jpg lossy compression?
Quote from: daydreamer on October 14, 2021, 01:47:15 AM
My point being,check file format and switch/case check different entropy depending on bit depths? Simplest with. Bmps
Maybe need adjust to different algo with. Jpg lossy compression?
Good point, it's hard answer because it's related to data instead of specific algorithm.
Try a one collor only figure and save as jpg and gif. Gif will compress better.
Depending on data, Huffman tree can do the same as arithmetic encoder; but in other data can fail.
Jpg have 2 versions, a lossy and a lossless. When was released jpeg format the arithmetic encoder was patented hard, so they used Huffman. But now patent expired. I have see some persons exporting compressed jpg image as raw data, then compressing with arithmetic encoder and structured that back to jpeg structure/form. Have some gain by doing this. Jpeg support arithmetic encoding but don't used that because patents.
There are a lot of compression algorithms: many Lempel-Ziv variations (Zip, Rar, ...), and transforms like Burrows-Wheeler (bzip2) that are useful on text files — a move-to-front pass after the BW transform puts the most recently seen symbols at the start of a 'pseudo' alphabet, and the output of this goes to Huffman/AC.
Others start with specific dictionary frequency, after digraphs, trigraphs, ..., elevating orders, trying a dictionary prediction (predict partial match, PPM), and outputing that to some arithmetic encoding.
AC reach very close to entropy, have some bits lost because magnitude problem. Well, the carry. It's not viable to take the whole file in memory, and deal with carry passing by a lot of bytes. Generally we take a register size, like 32 or 64 bits, and if carry goes and overflow register, we deal only with this. We predict that this can happen when left most 2 bits are oposite one from each other; so, take an action.
The tendency is to go toward Artificial Intelligence: teach a brain to learn about the data and its own errors and correct itself, with the output of this brain layer going to some data compression algorithm.
It's hard answer because if you change input source file being tested, one compressor program can compress better than other. And supposition here is that the same data type (text as example) is being source input.
I convert many pics to .ico files when I work on a game,sometimes compress to 8bit format get its smaller than 24bit,sometimes not,maybe some entropy test+taking account the extra 1k palette takes?
I don't play with .ico files; so I'm not the best person to answer.
I was reading .ico file format before post this message and not sounds to much complicate.
https://en.wikipedia.org/wiki/ICO_(file_format)
Entropy rule says something like: If a color inside a pallete color have same probabilities of other colors, so entropy will be high; if a pallete color have colors that is more probable to occur than other colors, so entropy will decrease.
In other words; if next color in an image have same probabilities of all other colors, so we cannot guess whats next color, entropy increase. If a color is blue, and a pattern is supposed to being created, so next times a blue color is seen, we can try to guess or predict next color after blue, this way entropy will decrease if our guess was right.
Entropy can be seen as surprise to us humans. If appear a japanese word in this text will cause so much surprise to us, our mind can't guess whats that, was not able to predict, so entropy increase. The word color in this text was so much present, we can predict the context, entropy decrease.
From now it's only suppositions.
From link above, .ico files starting from windows vista support compression (png), before was just an image raw dump structured in a file format. If your .ico files are bigger than others, this should be a color pallete using a lot of different colors, and these colors having the "same" frequency/probability than other colors.
What is possible to do is dump an .ico image as raw (Paint Shop Pro, ...). Insert that raw .ico image as bunch of db using assembler and create .ico header file in memory by programming. So, if works, next step can be compression. If all images have same size (width and height), better, little surprise here, can compress more with only one information.
Palletes are just like a dictionary (index) used in LempelZiv (zip, rar,...).
Besides Loadimage from file or included in resource script, which you need later version of resource compiler than the one included in masm32. Package for compressed. ico files
Exporting to raw in this runtime creation example would be helpful,easier make or import in paint shop pro and export to. C file, than write lots of data in text editor
https://docs.microsoft.com/en-us/windows/win32/menurc/using-icons
Quote from: daydreamer on October 16, 2021, 05:29:54 AM
Exporting to raw in this runtime creation example would be helpful,easier make or import in paint shop pro and export to. C file, than write lots of data in text editor
Yes, it's an option. We can also import that .raw file direct by assembler:
.data
align 16
myfile label qword
incbin "image.raw"
szmyfile equ $-myfile
.code
lea rax,myfile
mov rcx,szmyfile
Quote from: mineiro link=topic=9563.msg104708#msg104708
Yes, it's an option. We can also import that .raw file direct by assembler:
[code]
.data
align 16
myfile label qword
incbin "image.raw"
szmyfile equ $-myfile
.code
lea rax,myfile
mov rcx,szmyfile
[/code]
Thanks :thumbsup:
Tested export. Raw with ca 20 kb compressed icon, it blew up to big size,zipped it like usually before posting code. And I got 30kb zip
It's easy to blow up. Exe size to 1.6mb including lots of icons and other resources ,so size reduce area before export, with those used only for small small buttons,smallest is 120 bytes,1 pixel,1bit color to give buttons a color
Hello sir daydreamer;
Can you upload one icon where size grows instead of getting down? I like to play with that.
here is Icon with different compression
Thanks, these are my tests.
filesz filename admc3 MarkNelson paq8l
25045 chest3.ico ;compressed
12862 chest64.ico ;not 4402 4493 2179
33158 chest8b.ico ;compressed
74814 chest8.ico ;not 22605 22412 18067
3039 chestc64.ico ;compressed
51262 chestc.ico ;not 12720 12204 5257
This is the link to paq8l from Matt Mahoney windows/linux executable and source code.
http://mattmahoney.net/dc/
Thanks :thumbsup:
asm related "compression ratio" , entropy of 4k,64k demoscene files vs entropy of youtube videos showing few minutes of them? Even if most are pixelshaders latest years