blob: ae7da680ddabf830e154d1b82e57b50d3fdf8684 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
from requests import get
from os import makedirs
from re import search, DOTALL, compile
from urllib.parse import urljoin
from bs4 import BeautifulSoup
def write_message(html : str, hash_value : str) -> None:
soup = BeautifulSoup(html, 'html.parser')
text = soup.get_text(separator = '\n', strip = True)
with open(f"output_mailinglist/{hash_value}", "a") as file:
file.write(f"{text}\n\n")
def process_thread(url : str, hash_value : str) -> None:
request = get(url)
text = request.text
match = search(r'<!--X-Body-of-Message-->(.*?)<!--X-Body-of-Message-End-->', text, DOTALL)
if match:
write_message(match.group(1).strip(), hash_value)
pattern = href_pattern = compile(r'\[<a\s+href="([^"]+)">Next in Thread</a>\]')
for line in text.splitlines():
if "Next in Thread" in line:
match = pattern.search(line)
if match:
href = match.group(1)
process_thread(urljoin(url, href), hash_value)
|