summary refs log tree commit diff stats
path: root/mailinglist
diff options
context:
space:
mode:
author Christian Krinitsin <mail@krinitsin.com> 2025-05-22 20:04:13 +0200
committer Christian Krinitsin <mail@krinitsin.com> 2025-05-22 20:04:13 +0200
commit 70136a2208308816a06c26b4487d3440dca5c1af (patch)
tree 47028b47b4d380567609b123663e7e123df3697d /mailinglist
parent 749d7114661682ad7a9b8c6b7765b499bee6f9fe (diff)
download qemu-analysis-70136a2208308816a06c26b4487d3440dca5c1af.tar.gz
qemu-analysis-70136a2208308816a06c26b4487d3440dca5c1af.zip
add mailing-thread parser
Diffstat (limited to 'mailinglist')
-rwxr-xr-x mailinglist/thread.py 32
1 files changed, 32 insertions, 0 deletions
diff --git a/mailinglist/thread.py b/mailinglist/thread.py
new file mode 100755
index 000000000..2dc8ce2f7
--- /dev/null
+++ b/mailinglist/thread.py
@@ -0,0 +1,32 @@
+from requests import get
+from os import makedirs
+from re import search, DOTALL, compile
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup
+
+def write_message(html, hash_value):
+    soup = BeautifulSoup(html, 'html.parser')
+    text = soup.get_text(separator='\n', strip=True)
+    with open(f"output_mailinglist/{hash_value}", "a") as file:
+        file.write(f"{text}\n\n")
+
+def process_thread(url, hash_value):
+    request = get(url)
+    text = request.text
+
+    match = search(r'<!--X-Body-of-Message-->(.*?)<!--X-Body-of-Message-End-->', text, DOTALL)
+    if match:
+        write_message(match.group(1).strip(), hash_value)
+
+    pattern = href_pattern = compile(r'\[<a\s+href="([^"]+)">Next in Thread</a>\]')
+    for line in text.splitlines():
+        if "Next in Thread" in line:
+            match = pattern.search(line)
+            if match:
+                href = match.group(1)
+                process_thread(urljoin(url, href), hash_value)
+
+if __name__ == "__main__":
+    makedirs("output_mailinglist", exist_ok=True)
+    process_thread("https://lists.nongnu.org/archive/html/qemu-devel/2025-04/msg05446.html", 861041)