Initial work.

2025-08-05 23:54:09 +02:00
parent 646d8d31bb
commit cb15013433
4 changed files with 291 additions and 0 deletions
@@ -205,3 +205,5 @@ cython_debug/
 marimo/_static/
 marimo/_lsp/
 __marimo__/
+
+.idea/
@@ -1,2 +1,5 @@
 # myaw-tg
 Analyse messages from Telegram group messages export file. Entropy, etc.
+
+## Simple usage:
+```python myaw-tg.py -i history.json -o stats.csv```
@@ -0,0 +1,6 @@
+[project]
+name = "myaw-tg"
+version = "0.1.0"
+description = "Telegram chat history analyzer: per-user message statistics and entropy"
+requires-python = ">=3.13"
+dependencies = []
@@ -0,0 +1,280 @@
+#!/usr/bin/python3
+
+# Myaw-TG is a Telegram chat history analyzer
+#     Copyright (C) 2024  Kirill Harmatulla Shakirov
+#
+#     This program is free software: you can redistribute it and/or modify
+#     it under the terms of the GNU General Public License as published by
+#     the Free Software Foundation, either version 3 of the License, or
+#     (at your option) any later version.
+#
+#     This program is distributed in the hope that it will be useful,
+#     but WITHOUT ANY WARRANTY; without even the implied warranty of
+#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#     GNU General Public License for more details.
+#
+#     You should have received a copy of the GNU General Public License
+#     along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import configparser
+import sys
+import os
+import math
+import json
+import argparse
+from datetime import datetime
+import itertools
+
+# ANSI SGR escape sequences used to colorize the log lines written below.
+C_OTHER = '\033[93m'  # bright yellow
+C_INFO = '\033[32m'  # green
+C_WARNING = '\033[34m'  # blue
+C_ERROR = '\033[01m' + '\033[31m'  # bold + red
+C_DEBUG = '\033[36m'  # cyan
+C_RESET = '\033[0m'  # reset all attributes
+
+def log_error(message):
+    """Write *message* to stdout as a timestamped ERROR line in the error color."""
+    stamp = datetime.now().isoformat()
+    sys.stdout.write(C_ERROR + f"{stamp} ERROR: {message}\n" + C_RESET)
+
+def log_warning(message):
+    """Write *message* to stdout as a timestamped WARNING line in the warning color."""
+    stamp = datetime.now().isoformat()
+    sys.stdout.write(C_WARNING + f"{stamp} WARNING: {message}\n" + C_RESET)
+
+def log_info(message):
+    """Write *message* to stdout as a timestamped INFO line in the info color."""
+    stamp = datetime.now().isoformat()
+    sys.stdout.write(C_INFO + f"{stamp} INFO: {message}\n" + C_RESET)
+
+def load_config(config_file_name: str) -> configparser.ConfigParser:
+    """
+    Load an INI-style configuration file.
+
+    :param config_file_name: Path of the configuration file.
+    :type config_file_name: str
+    :return: Parser populated with the contents of the file.
+    :rtype: configparser.ConfigParser
+    :raises FileNotFoundError: If the file does not exist.
+    """
+    if not os.path.exists(config_file_name):
+        raise FileNotFoundError(f"Config file ({config_file_name}) not found!")
+
+    # ConfigParser.read() returns the list of file names it managed to parse,
+    # not the parser, so the parser must be kept in a variable and returned.
+    config = configparser.ConfigParser()
+    config.read(config_file_name)
+    return config
+
+class UserStats:
+    """
+    Accumulate message statistics for one sender.
+
+    Messages are fed in through process_message(); the derived figures
+    (entropies and per-message averages) are filled in by calc_stats().
+    """
+
+    def __init__(self, from_id: str, name: str, chars_chain_len: int = 2, words_chain_len: int = 2):
+        """
+        :param from_id: Raw sender identifier taken from the message
+            (presumably "user<digits>" or "channel<digits>" -- TODO confirm
+            against the export format).
+        :param name: Display name of the sender.
+        :param chars_chain_len: Chain length passed to calc_mark_entropy for the characters chain.
+        :param words_chain_len: Chain length passed to calc_mark_entropy for the words chain.
+        """
+        self.words_chain_len = words_chain_len
+        self.chars_chain_len = chars_chain_len
+        self.messages_count: int = 0
+        self.chars_markov_chain = MarkChain()
+        self.words_markov_chain = MarkChain(start_token="#START#", end_token="#END#")
+        self.chars_entropy: float = 0.0
+        self.words_entropy: float = 0.0
+        self.mean_entropy: float = 0.0
+        self.chars_per_message: float = 0.0
+        self.words_per_message: float = 0.0
+        # Raw totals are kept apart from the per-message averages so that
+        # calc_stats() can be called more than once without dividing twice.
+        self._chars_total: int = 0
+        self._words_total: int = 0
+
+        self.from_id = from_id
+        # Drop the alphabetic prefix in front of the number. A fixed four
+        # character slice only fits "user..." and would mangle a longer
+        # prefix ("channel123" -> "nel123").
+        self.id = from_id.lstrip("abcdefghijklmnopqrstuvwxyz")
+        self.name = name
+
+    @staticmethod
+    def _count_sequence(chain, tokens):
+        """Record the transitions start -> t1 -> ... -> tn -> end of *tokens* in *chain*."""
+        prev_token = chain.start_token
+        for token in tokens:
+            chain.count_link(prev_token, token)
+            prev_token = token
+        chain.count_link(prev_token, chain.end_token)
+
+    def process_message(self, msg: dict):
+        """
+        Count one message and feed its accepted text entities into both chains.
+
+        :param msg: Message dictionary; its "text_entities" list is read when present.
+        """
+        self.messages_count += 1
+
+        # A missing or null "text_entities" is treated as a message without text.
+        for ent in msg.get("text_entities") or ():
+            if not filter_entry(ent):
+                continue
+
+            text = ent["text"]
+
+            # count chars
+            self._chars_total += len(text)
+            self._count_sequence(self.chars_markov_chain, text)
+
+            # count words (split on any whitespace)
+            words = text.split()
+            self._words_total += len(words)
+            self._count_sequence(self.words_markov_chain, words)
+
+    def calc_stats(self):
+        """Compute the entropies and per-message averages from the data collected so far."""
+        self.chars_entropy = calc_mark_entropy(self.chars_markov_chain, self.chars_chain_len)
+        self.words_entropy = calc_mark_entropy(self.words_markov_chain, self.words_chain_len)
+        self.mean_entropy = (self.chars_entropy + self.words_entropy) * 0.5
+        if self.messages_count > 0:
+            self.words_per_message = self._words_total / self.messages_count
+            self.chars_per_message = self._chars_total / self.messages_count
+        else:
+            # No messages processed yet: avoid dividing by zero.
+            self.words_per_message = 0.0
+            self.chars_per_message = 0.0
+
+    def to_string(self) -> str:
+        """
+        Format the statistics as one CSV row (without a trailing newline).
+
+        Column order: id, name, raw id, messages count, mean entropy,
+        chars entropy, words entropy, chars per message, words per message.
+        """
+        def quoted(value) -> str:
+            # CSV quoting: a double quote inside a quoted field is doubled.
+            return '"' + str(value).replace('"', '""') + '"'
+
+        stats_list = [str(self.id), quoted(self.name), quoted(self.from_id), str(self.messages_count),
+                      f"{self.mean_entropy:.5f}", f"{self.chars_entropy:.5f}", f"{self.words_entropy:.5f}",
+                      f"{self.chars_per_message:.2f}", f"{self.words_per_message:.2f}"]
+
+        return ",".join(stats_list)
+
+    def __str__(self):
+        return self.to_string()
+
+class MarkChain:
+    """
+    First-order Markov chain built from counted token-to-token transitions.
+
+    ``chain`` maps a token to a dict of {following token: number of times seen}.
+    Transition probabilities are derived from these counts on demand and cached.
+    """
+
+    def __init__(self, start_token: str = "SS", end_token: str = "EE"):
+        """
+        :param start_token: Token callers use as the source of the first link of a sequence.
+        :param end_token: Token callers use as the target of the last link of a sequence.
+        """
+        self.start_token = start_token
+        self.end_token = end_token
+        self.chain = dict()
+        self._chain_prob = None
+
+    def count_link(self, c1: str, c2: str):
+        """
+        Record one occurrence of the transition c1 -> c2.
+
+        :param c1: Token the transition starts from.
+        :type c1: str
+        :param c2: Token the transition leads to.
+        :type c2: str
+        """
+        followers = self.chain.setdefault(c1, {})
+        followers[c2] = followers.get(c2, 0) + 1
+        # The counts changed, so previously computed probabilities are stale.
+        self._chain_prob = None
+
+    def chain_prob(self) -> dict:
+        """Return the transition probabilities, computing them if no valid cache exists."""
+        if self._chain_prob is None:
+            return self.update_chain_prob()
+        return self._chain_prob
+
+    def update_chain_prob(self) -> dict:
+        """
+        Recompute transition probabilities from the current counts and cache them.
+
+        :return: Dict mapping each source token to {target token: probability}.
+        :rtype: dict
+        """
+        res = dict()
+        for c1, counts in self.chain.items():
+            total = sum(counts.values())
+            res[c1] = {c2: count / total for c2, count in counts.items()}
+        self._chain_prob = res
+        return res
+
+    def state_probability(self, state_seq) -> tuple[float, float, object]:
+        """
+        Calculate probability of state specified by tokens sequence.
+
+        A sequence of a single token has no transitions and yields
+        ``(1.0, 0.0, token)``. As soon as a transition with zero probability
+        is met, ``(0.0, 0.0, target token of that transition)`` is returned.
+
+        :param state_seq: Sequence representing chain state.
+        :type state_seq: Iterable
+        :return: Tuple of two floats and one token.
+        First value is probability of state, second value is probability of the last transition,
+        and third value is last token in the chain.
+        :rtype: tuple[float, float, object]
+        :raises ValueError: If state_seq is empty.
+        """
+        # Go through chain_prob() so the probabilities exist and are current
+        # even when update_chain_prob() was not called explicitly beforehand.
+        probs = self.chain_prob()
+        seq_iter = iter(state_seq)
+        try:
+            cur_token = next(seq_iter)
+        except StopIteration:
+            raise ValueError("state_seq must contain at least one token") from None
+
+        seq_prob: float = 1.0
+        cur_prob: float = 0.0
+        for next_token in seq_iter:
+            # A token without outgoing links has no entry in probs; treat
+            # every transition from it as impossible.
+            cur_prob = probs.get(cur_token, {}).get(next_token, 0.0)
+            if cur_prob == 0.0:
+                return 0.0, 0.0, next_token
+            seq_prob *= cur_prob
+            cur_token = next_token
+
+        return seq_prob, cur_prob, cur_token
+
+
+def filter_entry(ent: dict)-> bool:
+    """Return True when the entity's "type" is one of the accepted types (only "plain")."""
+    accepted_types = ("plain",)
+    return ent["type"] in accepted_types
+
+
+
+def calc_mark_entropy(chain:MarkChain, chain_len: int) -> float:
+    """
+    Calculate the entropy (in bits) of the token that follows a state of the chain.
+
+    States are all sequences that begin with the start token and continue with
+    ``chain_len - 1`` tokens, each of which has outgoing links and is not the
+    start token. Every state is weighted by its probability; the result is
+    ``-sum(P(state) * P(next | last) * log2(P(next | last)))``, where ``next``
+    ranges over the same tokens plus the end token.
+
+    Because a transition probability depends only on the last token of a
+    state, states ending in the same token are merged: the probability mass
+    is propagated step by step instead of enumerating every token tuple.
+
+    :param chain: Chain holding the counted transitions.
+    :param chain_len: Number of tokens in a state including the start token; at least 1.
+    :return: Entropy in bits; 0.0 for an empty chain or one without links from the start token.
+    :raises ValueError: If chain_len is less than 1.
+    """
+    if chain_len < 1:
+        raise ValueError("chain_len must be at least 1")
+    if len(chain.chain) == 0:
+        return 0.0
+
+    probs = chain.update_chain_prob()
+    start_token = chain.start_token
+    if start_token not in probs:
+        # Every state begins with the start token; without links from it no
+        # state has a non-zero probability.
+        return 0.0
+
+    # Tokens allowed inside a state: everything with outgoing links except start.
+    inner_tokens = set(probs)
+    inner_tokens.discard(start_token)
+    # Tokens allowed to follow a state; a set, so the end token counts once.
+    final_tokens = inner_tokens | {chain.end_token}
+
+    # state_mass[t] is the summed probability of all states ending in t.
+    state_mass = {start_token: 1.0}
+    for _ in range(chain_len - 1):
+        next_mass = {}
+        for token, mass in state_mass.items():
+            for next_token, prob in probs[token].items():
+                if next_token in inner_tokens:
+                    next_mass[next_token] = next_mass.get(next_token, 0.0) + mass * prob
+        state_mass = next_mass
+
+    # Subtract term by term so an all-zero sum stays +0.0 rather than becoming
+    # -0.0 (which would be printed as "-0.00000").
+    entropy = 0.0
+    for token, mass in state_mass.items():
+        for next_token, prob in probs[token].items():
+            if next_token in final_tokens and prob > 0.0:
+                entropy -= (mass * prob) * math.log2(prob)
+
+    return entropy
+
+
+def _parse_arguments() -> argparse.Namespace:
+    """Parse the command line and return the resulting namespace."""
+    parser = argparse.ArgumentParser(
+        prog="myaw-tg.py",
+        description="This program analyze TG chat history exported in json format and calculates users messages entropy",
+        epilog="Have a nice day!")
+
+    parser.add_argument("-i", "--input-file",
+                        help="exported history file name, for example results.json",
+                        required=True)
+    parser.add_argument("-o", "--output-file",
+                        help="Statistic output file name, for example stats.csv",
+                        required=True)
+
+    return parser.parse_args()
+
+
+def _load_messages(file_name: str) -> list:
+    """Return the "messages" list of the history file; log the problem and exit with status 1 on failure."""
+    try:
+        # JSON is read as UTF-8 explicitly instead of the platform default encoding.
+        with open(file_name, "rt", encoding="utf-8") as in_file:
+            in_json = json.load(in_file)
+    except FileNotFoundError:
+        log_error(f"Cannot find chat history file: {file_name}.")
+        log_info("Exiting.")
+        sys.exit(1)
+    except (OSError, ValueError) as err:
+        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+        log_error(f"Cannot read chat history file: {file_name} ({err}).")
+        log_info("Exiting.")
+        sys.exit(1)
+
+    messages = in_json.get("messages") if isinstance(in_json, dict) else None
+    if not isinstance(messages, list):
+        log_error(f"No \"messages\" list found in file: {file_name}.")
+        log_info("Exiting.")
+        sys.exit(1)
+    return messages
+
+
+def _collect_users_stats(messages: list) -> dict:
+    """Feed every entry of type "message" into the UserStats of its sender; return them keyed by from_id."""
+    users_stats: dict = {}
+    skipped = 0
+    for msg in messages:
+        if msg.get("type") != "message":
+            continue
+        from_id = msg.get("from_id")
+        if from_id is None:
+            # Without a sender id the message cannot be attributed to a user.
+            skipped += 1
+            continue
+        u_stats = users_stats.get(from_id)
+        if u_stats is None:
+            u_stats = UserStats(from_id, msg.get("from"))
+            users_stats[from_id] = u_stats
+        # process message
+        u_stats.process_message(msg)
+
+    if skipped:
+        log_warning(f"Skipped {skipped} messages without sender id.")
+    return users_stats
+
+
+def _write_stats(file_name: str, stats_list: list):
+    """Write the header row followed by one CSV row per user to *file_name*."""
+    columns_names = ["id", "name", "raw_id", "messages_count",
+                      "mean_entropy", "chars_entropy", "words_entropy",
+                      "chars_per_message", "words_per_message"]
+    columns_names_str = ",".join(f"\"{x}\"" for x in columns_names)
+    # Names may contain any characters, so do not depend on the platform
+    # default encoding when writing.
+    with open(file_name, "wt", encoding="utf-8") as out_file:
+        out_file.write(columns_names_str)
+        out_file.write("\n")
+        for u_st in stats_list:
+            out_file.write(u_st.to_string())
+            out_file.write("\n")
+
+
+def main():
+    """
+    Command line entry point.
+
+    Reads the exported chat history given with -i/--input-file, collects
+    statistics per sender and writes them, sorted by message count in
+    descending order, as CSV to the file given with -o/--output-file.
+    Exits with status 1 when the history file cannot be read.
+    """
+    arguments = _parse_arguments()
+
+    log_info(f"Reading data from file: {arguments.input_file}")
+    messages = _load_messages(arguments.input_file)
+
+    log_info(f"Total {len(messages)} messages.")
+    log_info("Start processing...")
+    users_stats = _collect_users_stats(messages)
+
+    log_info("Calculating users messages entropy...")
+    for stats in users_stats.values():
+        log_info(f"Processing user: {stats.name}...")
+        stats.calc_stats()
+    log_info("Done!")
+
+    log_info("Preparing and writing stats to CSV file...")
+    stats_list = sorted(users_stats.values(), key=lambda x: x.messages_count, reverse=True)
+    _write_stats(arguments.output_file, stats_list)
+
+    log_info("Done!")
+    log_info("Exiting...")
+
+
+# Run the command line tool only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == '__main__':
+    main()
+
+