My point being: check the file format and switch/case to a different entropy estimate depending on bit depth? Simplest with BMPs.
Maybe it needs a different algorithm to handle JPG's lossy compression?
Good point; it's hard to answer because it depends on the data rather than on a specific algorithm.
Try a one-color-only figure and save it as JPG and as GIF: the GIF will compress better.
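You can see the same effect with any lossless coder: perfectly uniform data collapses to almost nothing, while noise barely shrinks at all. A minimal sketch using Python's `zlib` as a stand-in lossless compressor (the variable names are just for illustration):

```python
import os
import zlib

flat = bytes(100_000)        # 100 kB of a single byte value (like a one-color image)
noisy = os.urandom(100_000)  # 100 kB of random bytes (no structure to exploit)

# The uniform buffer compresses to a tiny fraction of its size;
# the random buffer stays at (or slightly above) its original size.
print(len(zlib.compress(flat)))
print(len(zlib.compress(noisy)))
```

This is why the file format matters less than the data inside it: the achievable ratio is bounded by the entropy of the source.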
Depending on the data, a Huffman tree can do as well as an arithmetic encoder; on other data it can fall short.
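The gap shows up with skewed distributions: Huffman must spend at least one whole bit per symbol, while the entropy (which arithmetic coding approaches) can be far below one bit. A small sketch, with a toy Huffman construction written for this example:

```python
import heapq
from math import log2

def entropy(probs):
    """Shannon entropy in bits per symbol."""
    return -sum(p * log2(p) for p in probs if p > 0)

def huffman_avg_length(probs):
    """Average code length of a Huffman code for the given distribution."""
    lengths = [0] * len(probs)
    # Heap entries: (probability, unique tiebreak, symbols in this subtree).
    heap = [(p, i, [i]) for i, p in enumerate(probs)]
    heapq.heapify(heap)
    tiebreak = len(probs)
    while len(heap) > 1:
        p1, _, s1 = heapq.heappop(heap)
        p2, _, s2 = heapq.heappop(heap)
        for s in s1 + s2:
            lengths[s] += 1  # every merge adds one bit to all members
        heapq.heappush(heap, (p1 + p2, tiebreak, s1 + s2))
        tiebreak += 1
    return sum(p * lengths[i] for i, p in enumerate(probs))

# A heavily skewed two-symbol source:
skewed = [0.95, 0.05]
print(entropy(skewed))             # ~0.286 bits/symbol
print(huffman_avg_length(skewed))  # 1.0 bit/symbol -- Huffman's floor
```

On a near-uniform distribution the two would be almost identical, which is why the answer depends on the data.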
JPEG has two versions, a lossy and a lossless one. When the JPEG format was released, arithmetic coding was heavily patented, so they used Huffman coding; those patents have since expired. I have seen people export a compressed JPEG image as raw data, recompress it with an arithmetic encoder, and restructure it back into JPEG form, with some gain. JPEG actually supports arithmetic coding, but it went largely unused because of the patents.
There are a lot of compression algorithms: many Lempel-Ziv variations (Zip, Rar, ...), and transforms like Burrows-Wheeler (used by bzip2) that are useful for text files, followed by a move-to-front pass so the most recently seen symbols sit at the start of a 'pseudo' alphabet, with the output of all this going to Huffman or arithmetic coding.
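The BWT + move-to-front pipeline can be sketched in a few lines. This is the naive sorted-rotations construction (real implementations use suffix arrays), with a `'\0'` sentinel assumed to terminate the string:

```python
def bwt(s):
    """Burrows-Wheeler transform via sorted rotations ('\\0' as sentinel)."""
    s = s + "\0"
    rotations = sorted(s[i:] + s[:i] for i in range(len(s)))
    return "".join(r[-1] for r in rotations)

def move_to_front(data, alphabet):
    """Encode each symbol as its index in a table that moves each
    used symbol to the front, so recently seen symbols get small ranks."""
    table = list(alphabet)
    out = []
    for c in data:
        i = table.index(c)
        out.append(i)
        table.insert(0, table.pop(i))
    return out

transformed = bwt("banana")
ranks = move_to_front(transformed, sorted(set(transformed)))
# The BWT groups equal characters together, so the MTF ranks are mostly
# small numbers and zeros -- easy prey for a Huffman or arithmetic coder.
print(transformed.replace("\0", "$"), ranks)
```

The transform itself compresses nothing; it only reorders the data so the entropy coder at the end of the chain sees long runs of low values.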
Others start with a specific dictionary of symbol frequencies, then digraphs, trigraphs, and so on, raising the context order and trying a dictionary prediction (prediction by partial matching, PPM), and feed the result to an arithmetic encoder.
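The core idea of a context model can be shown with the simplest case, an order-1 (digraph) model. This is only the counting step of a PPM-style coder, leaving out the escape mechanism and the arithmetic-coding backend:

```python
from collections import Counter, defaultdict

def order1_model(text):
    """Count digraph frequencies: for each context character, how often
    each following character occurs. A PPM coder would turn these counts
    into probabilities (with an escape symbol for unseen characters) and
    hand them to an arithmetic encoder."""
    contexts = defaultdict(Counter)
    for prev, cur in zip(text, text[1:]):
        contexts[prev][cur] += 1
    return contexts

model = order1_model("the theory of the thing")
# In this sample every 't' is followed by 'h', so after seeing 't' the
# model is nearly certain of the next symbol and the coder can spend
# far less than 8 bits on it.
print(model["t"].most_common())  # [('h', 4)]
```

Raising the order (trigraphs and beyond) sharpens the predictions at the cost of memory and of more frequent escapes to lower orders.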
Arithmetic coding gets very close to the entropy; a few bits are lost because of the magnitude problem, that is, the carry. It's not viable to hold the whole file in memory and propagate a carry across a lot of bytes. Generally we work in a register-sized window, like 32 or 64 bits, and handle only the carries that would overflow that register. We can predict that this may happen when the two leftmost bits of the interval bounds are opposite to each other, and take action at that point.
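That "take an action" step is the classic renormalization loop. A sketch for a 32-bit register, with illustrative names (real codecs differ in the details): when the interval sits entirely in one half, the top bit is decided and shifted out; when `low` and `high` straddle the midpoint with opposite second bits, the bit is undecided because a carry may still flip it, so it is counted as pending:

```python
HALF = 1 << 31     # top bit of the 32-bit register
QUARTER = 1 << 30  # second bit

def renormalize(low, high, out_bits, pending=0):
    """Shift out decided bits while the interval [low, high] is confined
    to one half of the register; count undecided (carry-pending) bits."""
    while True:
        if high < HALF:                              # lower half: bit is 0
            out_bits.append(0)
            out_bits.extend([1] * pending); pending = 0
        elif low >= HALF:                            # upper half: bit is 1
            out_bits.append(1)
            out_bits.extend([0] * pending); pending = 0
            low -= HALF; high -= HALF
        elif low >= QUARTER and high < HALF + QUARTER:
            pending += 1                             # straddles midpoint: defer
            low -= QUARTER; high -= QUARTER
        else:
            return low, high, pending                # interval wide enough again
        low <<= 1
        high = (high << 1) | 1                       # keep high's new low bit at 1

bits = []
print(renormalize(0, HALF - 1, bits), bits)  # one 0 bit shifted out
```

Once the deferred bit is finally decided, the pending bits are emitted as its opposites, which is exactly how the carry is resolved without touching bytes already written.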
The tendency is to move toward artificial intelligence: teach a network to learn about the data and its own prediction errors and correct itself, with the output of that model layer going into some data compression algorithm.
It's hard to answer because if you change the input file being tested, one compressor can beat another. The assumption here is that the same data type (text, for example) is used as the source input.