diff options
| author | Christian Krinitsin <mail@krinitsin.com> | 2025-05-22 20:05:27 +0200 |
|---|---|---|
| committer | Christian Krinitsin <mail@krinitsin.com> | 2025-05-22 20:05:27 +0200 |
| commit | 5ff3863b1b734b5a98537f5ad411b9e79035d067 (patch) | |
| tree | 0f11da0ee57a2d8dbf4bf26631f8c2eff79c331e /mailinglist/thread.py | |
| parent | 4b927bc37359dec23f67d3427fc982945f24f404 (diff) | |
| parent | a5740cacb75d587346b07225e0e47dc6a3f12753 (diff) | |
| download | emulator-bug-study-5ff3863b1b734b5a98537f5ad411b9e79035d067.tar.gz emulator-bug-study-5ff3863b1b734b5a98537f5ad411b9e79035d067.zip | |
Merge branch 'mailing-list'
Diffstat (limited to 'mailinglist/thread.py')
| -rwxr-xr-x | mailinglist/thread.py | 32 |
1 file changed, 32 insertions, 0 deletions
"""Download every message of a mailing-list thread into a single text file.

Starting from one archived message page, the script extracts the message
body, appends its plain text to ``output_mailinglist/<hash_value>`` and then
follows the page's "Next in Thread" link until the thread ends.
"""

from os import makedirs
from re import search, DOTALL, compile
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from requests import get

# Directory that receives one file per thread.
OUTPUT_DIR = "output_mailinglist"

# Seconds to wait for the archive server before giving up on a request.
REQUEST_TIMEOUT = 30

# Comment markers that delimit the message body in the archived HTML page.
BODY_REGEX = r'<!--X-Body-of-Message-->(.*?)<!--X-Body-of-Message-End-->'

# Link to the following message of the thread; compiled once for all pages.
NEXT_IN_THREAD = compile(r'\[<a\s+href="([^"]+)">Next in Thread</a>\]')


def write_message(html, hash_value):
    """Append the plain text of one message to the thread's output file.

    Args:
        html: HTML fragment containing the message body.
        hash_value: Identifier used as the output file name inside
            ``output_mailinglist``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text(separator='\n', strip=True)

    # Do not rely on the caller having created the directory beforehand.
    makedirs(OUTPUT_DIR, exist_ok=True)

    # Mail bodies routinely contain non-ASCII text; pin the encoding so the
    # result does not depend on the locale of the machine running the script.
    with open(f"{OUTPUT_DIR}/{hash_value}", "a", encoding="utf-8") as file:
        file.write(f"{text}\n\n")


def process_thread(url, hash_value):
    """Store the message at *url* and every following message of its thread.

    Args:
        url: Address of the first archived message page to download.
        hash_value: Identifier of the output file the messages are appended to.

    Raises:
        requests.RequestException: If a page cannot be fetched (connection
            failure, timeout or HTTP error status).
    """
    # Iterate instead of recursing: one stack frame per message would hit the
    # interpreter's recursion limit on very long threads.  The visited set
    # guards against archive pages whose links form a cycle.
    visited = set()
    while url and url not in visited:
        visited.add(url)

        response = get(url, timeout=REQUEST_TIMEOUT)
        # Fail loudly rather than treating an error page as "end of thread",
        # which would silently truncate the saved conversation.
        response.raise_for_status()
        text = response.text

        body = search(BODY_REGEX, text, DOTALL)
        if body:
            write_message(body.group(1).strip(), hash_value)

        # Follow only the first link: the same link may be rendered more than
        # once on a page, and following each copy would duplicate messages.
        link = NEXT_IN_THREAD.search(text)
        url = urljoin(url, link.group(1)) if link else None


if __name__ == "__main__":
    makedirs(OUTPUT_DIR, exist_ok=True)
    process_thread("https://lists.nongnu.org/archive/html/qemu-devel/2025-04/msg05446.html", 861041)