"""
In information theory, entropy is a measure of the uncertainty or randomness of a
source of data. It quantifies the expected amount of information contained in each
message from the source.

The core formula for Shannon entropy H(X) is:

    H(X) = -Σ P(x) * log2(P(x))

where P(x) is the probability of an event x occurring.

This concept mirrors thermodynamic entropy in physics, representing the level
of disorder in a system. In a digital context, it defines the theoretical limit
for lossless data compression.

Reference: https://en.wikipedia.org/wiki/Entropy_(information_theory)
"""

from __future__ import annotations

import math
from collections import Counter
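
# A quick worked example of the formula above: a fair coin has
# P(heads) = P(tails) = 0.5, so H(X) = -(0.5 * log2(0.5) + 0.5 * log2(0.5)) = 1 bit,
# while a biased coin with P(heads) = 0.9 gives H(X) ≈ 0.469 bits: the more
# predictable the source, the less information each outcome carries on average.
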

def shannon_entropy(probabilities: list[float]) -> float:
    """
    Calculates the Shannon entropy of a given probability distribution.

    Args:
        probabilities: A list of probabilities representing a discrete
            distribution. A distribution that does not sum to 1.0 is
            normalized before the entropy is computed.

    Returns:
        The entropy value in bits.

    Raises:
        ValueError: If any probability is negative.

    Examples:
        >>> shannon_entropy([0.5, 0.5])
        1.0
        >>> shannon_entropy([1.0, 0.0])
        0.0
        >>> shannon_entropy([0.25, 0.25, 0.25, 0.25])
        2.0
    """
    if any(p < 0 for p in probabilities):
        raise ValueError("Probabilities cannot be negative.")

    # Due to floating-point precision we only check for closeness to 1.0, and
    # normalize the distribution if it has mass but does not sum to 1.
    total = sum(probabilities)
    if total > 0 and not math.isclose(total, 1.0, rel_tol=1e-9):
        probabilities = [p / total for p in probabilities]

    entropy = 0.0
    for p in probabilities:
        if p > 0:  # zero-probability events contribute nothing (log2(0) is undefined)
            entropy -= p * math.log2(p)

    return entropy
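
# Usage sketch: a source emitting four symbols with probabilities
# [0.5, 0.25, 0.125, 0.125] has entropy 1.75 bits, which also equals the
# average code length of an optimal (Huffman) prefix code for that source:
#     shannon_entropy([0.5, 0.25, 0.125, 0.125])  # -> 1.75
# Unnormalized counts work as well, since the function normalizes them first:
#     shannon_entropy([2, 2])  # -> 1.0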


def analyze_text_entropy(text: str) -> dict[str, float]:
    """
    Analyzes the entropy of a given text at different n-gram levels (1-gram, 2-gram).

    Args:
        text: The input string to analyze.

    Returns:
        A dictionary with the 1-gram and 2-gram entropies plus an estimate of
        the conditional entropy H(X_n | X_{n-1}).

    Examples:
        >>> result = analyze_text_entropy("aaaaa")
        >>> result['1-gram']
        0.0
        >>> result = analyze_text_entropy("abab")
        >>> round(result['1-gram'], 2)
        1.0
    """
    if not text:
        return {"1-gram": 0.0, "2-gram": 0.0, "conditional_entropy": 0.0}

    # 1-gram analysis (individual characters)
    counts_1gram = Counter(text)
    total_chars = len(text)
    probs_1gram = [count / total_chars for count in counts_1gram.values()]
    entropy_1gram = shannon_entropy(probs_1gram)

    # 2-gram analysis (overlapping pairs of adjacent characters)
    if len(text) < 2:
        entropy_2gram = 0.0
    else:
        pairs = [text[i : i + 2] for i in range(len(text) - 1)]
        counts_2gram = Counter(pairs)
        total_pairs = len(pairs)
        probs_2gram = [count / total_pairs for count in counts_2gram.values()]
        entropy_2gram = shannon_entropy(probs_2gram)

    return {
        "1-gram": entropy_1gram,
        "2-gram": entropy_2gram,
        # H(X_n | X_{n-1}) = H(X_{n-1}, X_n) - H(X_{n-1}); clamp at zero to
        # absorb small negative values from floating-point rounding.
        "conditional_entropy": max(0.0, entropy_2gram - entropy_1gram),
    }
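

# The same idea generalizes to arbitrary n-gram orders. The helper below is a
# minimal illustrative sketch, not part of the original interface; the name
# `_ngram_entropy_sketch` is hypothetical.
def _ngram_entropy_sketch(text: str, n: int) -> float:
    """
    Entropy of the overlapping n-gram distribution of ``text``.

    >>> _ngram_entropy_sketch("abab", 1)
    1.0
    >>> _ngram_entropy_sketch("aaaa", 2)
    0.0
    """
    if len(text) < n:
        return 0.0
    grams = [text[i : i + n] for i in range(len(text) - n + 1)]
    counts = Counter(grams)
    return shannon_entropy([count / len(grams) for count in counts.values()])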


if __name__ == "__main__":
    import doctest

    doctest.testmod()

    # Manual demonstration
    sample_text = "Behind Winston's back the voice from the telescreen was still"
    entropy_stats = analyze_text_entropy(sample_text)
    print(f"Text: '{sample_text[:30]}...'")
    for level, value in entropy_stats.items():
        print(f"{level:>20}: {value:.4f} bits")
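
    # Cross-check with the n-gram sketch defined above (illustrative only)
    print(f"{'3-gram (sketch)':>20}: {_ngram_entropy_sketch(sample_text, 3):.4f} bits")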