diff options
| author | Christian Krinitsin <mail@krinitsin.com> | 2025-05-22 20:05:27 +0200 |
|---|---|---|
| committer | Christian Krinitsin <mail@krinitsin.com> | 2025-05-22 20:05:27 +0200 |
| commit | 5ff3863b1b734b5a98537f5ad411b9e79035d067 (patch) | |
| tree | 0f11da0ee57a2d8dbf4bf26631f8c2eff79c331e /mailinglist/downloader.py | |
| parent | 4b927bc37359dec23f67d3427fc982945f24f404 (diff) | |
| parent | a5740cacb75d587346b07225e0e47dc6a3f12753 (diff) | |
| download | qemu-analysis-5ff3863b1b734b5a98537f5ad411b9e79035d067.tar.gz qemu-analysis-5ff3863b1b734b5a98537f5ad411b9e79035d067.zip | |
Merge branch 'mailing-list'
Diffstat (limited to 'mailinglist/downloader.py')
| -rwxr-xr-x | mailinglist/downloader.py | 73 |
1 file changed, 65 insertions, 8 deletions
diff --git a/mailinglist/downloader.py b/mailinglist/downloader.py index 214528907..0b8a42779 100755 --- a/mailinglist/downloader.py +++ b/mailinglist/downloader.py @@ -1,21 +1,78 @@ from datetime import datetime, timedelta +from urllib.request import urlopen +from os import makedirs, path, remove +from shutil import rmtree +from re import search, match +from urllib.parse import urljoin -end_date = datetime(2003, 4, 1) -start_date = datetime.today().replace(day=1) +from bs4 import BeautifulSoup + +from launchpad import process_launchpad_bug +from thread import process_thread + +start_date = datetime(2015, 4, 1) +end_date = datetime.today().replace(day=1) def months_iterator(start, end): current = start - while current >= end: + while current <= end: yield current - if current.month == 1: - current = current.replace(year=current.year - 1, month=12) + if current.month == 12: + current = current.replace(year=current.year + 1, month=1) else: - current = current.replace(month=current.month - 1) + current = current.replace(month=current.month + 1) def main(): + if path.exists("output_mailinglist"): + rmtree("output_mailinglist") + + if path.exists("output_launchpad"): + rmtree("output_launchpad") + + makedirs("output_mailinglist", exist_ok=True) for month in months_iterator(start = start_date, end = end_date): - url = f"https://lists.nongnu.org/archive/html/qemu-devel/{month.strftime("%Y-%m")}/threads.html" - print(url) + print(f"{month.strftime('%Y-%m')}") + url = f"https://lists.nongnu.org/archive/html/qemu-devel/{month.strftime('%Y-%m')}/threads.html" + + html = urlopen(url).read() + + soup = BeautifulSoup(html, features='html5lib') + + ul = soup.body.ul + threads = ul.find_all('li', recursive=False) + for li in reversed(threads): + a_tag = li.find('b').find('a') + + if not a_tag: + continue + + text = a_tag.get_text(strip=True) + href = a_tag.get('href') + + re_match = search(r'\[[^\]]*\b(BUG|bug|Bug)\b[^\]]*\]', text) # matches bug enclosed in [] + if not re_match: 
+ continue + + re_match = search(r'\[Bug\s(\d+)\]', text) # matches [Bug <number>] if bug is issued in launchpad + if re_match: + process_launchpad_bug(re_match.group(1).strip()) + continue + + re_match = match(r'(?i)^re:\s*(.*)', text) # matches 'Re:', meaning it's not a new thread + if re_match: + title_hash = hash(re_match.group(1).strip()) % 1000000 + if path.exists(f"output_mailinglist/{title_hash}"): + process_thread(urljoin(url, href), title_hash) + continue + + title_hash = hash(text.strip()) % 1000000 + if path.exists(f"output_mailinglist/{title_hash}"): + print(f"ERROR: {title_hash} should not exist!") + continue + + with open(f"output_mailinglist/{title_hash}", "w") as file: + file.write(f"{text}\n\n") + process_thread(urljoin(url, href), title_hash) if __name__ == "__main__": main() |