diff options
| -rw-r--r-- | mailinglist/.gitignore | 2 | ||||
| -rwxr-xr-x | mailinglist/downloader.py | 73 | ||||
| -rwxr-xr-x | mailinglist/launchpad.py | 27 | ||||
| -rwxr-xr-x | mailinglist/thread.py | 32 |
4 files changed, 126 insertions, 8 deletions
diff --git a/mailinglist/.gitignore b/mailinglist/.gitignore new file mode 100644 index 000000000..0d0ac50dd --- /dev/null +++ b/mailinglist/.gitignore @@ -0,0 +1,2 @@ +output_launchpad +output_mailinglist diff --git a/mailinglist/downloader.py b/mailinglist/downloader.py index 214528907..0b8a42779 100755 --- a/mailinglist/downloader.py +++ b/mailinglist/downloader.py @@ -1,21 +1,78 @@ from datetime import datetime, timedelta +from urllib.request import urlopen +from os import makedirs, path, remove +from shutil import rmtree +from re import search, match +from urllib.parse import urljoin -end_date = datetime(2003, 4, 1) -start_date = datetime.today().replace(day=1) +from bs4 import BeautifulSoup + +from launchpad import process_launchpad_bug +from thread import process_thread + +start_date = datetime(2015, 4, 1) +end_date = datetime.today().replace(day=1) def months_iterator(start, end): current = start - while current >= end: + while current <= end: yield current - if current.month == 1: - current = current.replace(year=current.year - 1, month=12) + if current.month == 12: + current = current.replace(year=current.year + 1, month=1) else: - current = current.replace(month=current.month - 1) + current = current.replace(month=current.month + 1) def main(): + if path.exists("output_mailinglist"): + rmtree("output_mailinglist") + + if path.exists("output_launchpad"): + rmtree("output_launchpad") + + makedirs("output_mailinglist", exist_ok=True) for month in months_iterator(start = start_date, end = end_date): - url = f"https://lists.nongnu.org/archive/html/qemu-devel/{month.strftime("%Y-%m")}/threads.html" - print(url) + print(f"{month.strftime('%Y-%m')}") + url = f"https://lists.nongnu.org/archive/html/qemu-devel/{month.strftime('%Y-%m')}/threads.html" + + html = urlopen(url).read() + + soup = BeautifulSoup(html, features='html5lib') + + ul = soup.body.ul + threads = ul.find_all('li', recursive=False) + for li in reversed(threads): + a_tag = li.find('b').find('a') + + if not a_tag: + continue + + text = a_tag.get_text(strip=True) + href = a_tag.get('href') + + re_match = search(r'\[[^\]]*\b(BUG|bug|Bug)\b[^\]]*\]', text) # matches bug enclosed in [] + if not re_match: + continue + + re_match = search(r'\[Bug\s(\d+)\]', text) # matches [Bug <number>] if bug is issued in launchpad + if re_match: + process_launchpad_bug(re_match.group(1).strip()) + continue + + re_match = match(r'(?i)^re:\s*(.*)', text) # matches 'Re:', meaning it's not a new thread + if re_match: + title_hash = hash(re_match.group(1).strip()) % 1000000 + if path.exists(f"output_mailinglist/{title_hash}"): + process_thread(urljoin(url, href), title_hash) + continue + + title_hash = hash(text.strip()) % 1000000 + if path.exists(f"output_mailinglist/{title_hash}"): + print(f"ERROR: {title_hash} should not exist!") + continue + + with open(f"output_mailinglist/{title_hash}", "w") as file: + file.write(f"{text}\n\n") + process_thread(urljoin(url, href), title_hash) if __name__ == "__main__": main() diff --git a/mailinglist/launchpad.py b/mailinglist/launchpad.py new file mode 100755 index 000000000..91d6cd8b3 --- /dev/null +++ b/mailinglist/launchpad.py @@ -0,0 +1,27 @@ +from requests import get +from os import makedirs, path + +def process_launchpad_bug(bug_id): + if path.exists(f"output_launchpad/{bug_id}"): + return + + bug_url = f"https://api.launchpad.net/1.0/bugs/{bug_id}" + + bug_response = get(url = bug_url) + + bug_data = bug_response.json() + + messages_response = get(url = bug_data['messages_collection_link']) + + messages_data = messages_response.json() + + makedirs("output_launchpad", exist_ok=True) + with open(f"output_launchpad/{bug_id}", "w") as file: + file.write(f"{bug_data['title']}\n\n") + + for entry in messages_data['entries']: + file.write(f"{entry['content']}\n\n") + +if __name__ == "__main__": + process_launchpad_bug(1629282) + process_launchpad_bug(1915063) diff --git a/mailinglist/thread.py b/mailinglist/thread.py new file mode 100755 index 000000000..2dc8ce2f7 --- /dev/null +++ b/mailinglist/thread.py @@ -0,0 +1,32 @@ +from requests import get +from os import makedirs +from re import search, DOTALL, compile +from urllib.parse import urljoin + +from bs4 import BeautifulSoup + +def write_message(html, hash_value): + soup = BeautifulSoup(html, 'html.parser') + text = soup.get_text(separator='\n', strip=True) + with open(f"output_mailinglist/{hash_value}", "a") as file: + file.write(f"{text}\n\n") + +def process_thread(url, hash_value): + request = get(url) + text = request.text + + match = search(r'<!--X-Body-of-Message-->(.*?)<!--X-Body-of-Message-End-->', text, DOTALL) + if match: + write_message(match.group(1).strip(), hash_value) + + pattern = href_pattern = compile(r'\[<a\s+href="([^"]+)">Next in Thread</a>\]') + for line in text.splitlines(): + if "Next in Thread" in line: + match = pattern.search(line) + if match: + href = match.group(1) + process_thread(urljoin(url, href), hash_value) + +if __name__ == "__main__": + makedirs("output_mailinglist", exist_ok=True) + process_thread("https://lists.nongnu.org/archive/html/qemu-devel/2025-04/msg05446.html", 861041) |