"""Download a mailing-list thread and store its messages as plain text.

Starting from one archived message page, the script follows each page's
"Next in Thread" link and appends the text of every message body to a
single file under ``output_mailinglist/``.

Origin: ``mailinglist/thread.py``, added in commit
70136a2208308816a06c26b4487d3440dca5c1af ("add mailing-thread parser").
"""

import re
from os import makedirs
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from requests import get

# Seconds to wait for the archive server before giving up on a request;
# without a timeout a stalled connection would block the script forever.
REQUEST_TIMEOUT = 30

# NOTE(review): the copy of this file under review had lost the HTML markup
# inside both regex literals (they read r'(.*?)' and r'\[Next in Thread\]',
# the latter without the capture group that group(1) needs).  The markers
# below are reconstructed on the assumption that the archive pages are
# generated by MHonArc, which wraps the message body in these two HTML
# comments -- confirm against a real page before relying on them.
BODY_PATTERN = re.compile(
    r'<!--X-Body-of-Message-->(.*?)<!--X-Body-of-Message-End-->',
    re.DOTALL,
)

# Captures the href of the "[Next in Thread]" navigation link.
NEXT_IN_THREAD_PATTERN = re.compile(
    r'\[<a\s+href="([^"]+)">Next in Thread</a>\]'
)


def write_message(html, hash_value):
    """Append the visible text of an HTML fragment to the thread's file.

    Args:
        html: HTML markup of one message body.
        hash_value: Identifier used as the file name inside
            ``output_mailinglist/``; every message of a thread is appended
            to the same file.
    """
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text(separator='\n', strip=True)
    # Explicit UTF-8 so non-ASCII mail content is written the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(f"output_mailinglist/{hash_value}", "a", encoding="utf-8") as file:
        file.write(f"{text}\n\n")


def process_thread(url, hash_value):
    """Walk a thread from ``url`` and store every message body found.

    Each page is fetched, its message body (if one is found) is handed to
    ``write_message``, and the first "Next in Thread" link is followed until
    a page has no such link.

    The walk is iterative rather than recursive so that long threads cannot
    exhaust the interpreter's recursion limit, and already visited URLs are
    remembered so a cycle of links cannot loop forever.

    Args:
        url: Absolute URL of the first message page of the thread.
        hash_value: Identifier forwarded to ``write_message`` as file name.
    """
    visited = set()
    while url and url not in visited:
        visited.add(url)

        response = get(url, timeout=REQUEST_TIMEOUT)
        page = response.text

        body = BODY_PATTERN.search(page)
        if body:
            write_message(body.group(1).strip(), hash_value)

        # Only the first link is followed; relative hrefs are resolved
        # against the page they were found on.
        link = NEXT_IN_THREAD_PATTERN.search(page)
        url = urljoin(url, link.group(1)) if link else None


if __name__ == "__main__":
    makedirs("output_mailinglist", exist_ok=True)
    process_thread("https://lists.nongnu.org/archive/html/qemu-devel/2025-04/msg05446.html", 861041)