diff options
| author | Christian Krinitsin <mail@krinitsin.com> | 2025-05-22 20:04:54 +0200 |
|---|---|---|
| committer | Christian Krinitsin <mail@krinitsin.com> | 2025-05-22 20:04:54 +0200 |
| commit | a5740cacb75d587346b07225e0e47dc6a3f12753 (patch) | |
| tree | 13d855120d83c275833a0977832b79b91f9baf47 | |
| parent | 70136a2208308816a06c26b4487d3440dca5c1af (diff) | |
| download | emulator-bug-study-a5740cacb75d587346b07225e0e47dc6a3f12753.tar.gz emulator-bug-study-a5740cacb75d587346b07225e0e47dc6a3f12753.zip | |
add main function for mailinglist scraper
| -rwxr-xr-x | mailinglist/downloader.py | 60 |
1 file changed, 39 insertions, 21 deletions
diff --git a/mailinglist/downloader.py b/mailinglist/downloader.py index 08ca583c..0b8a4277 100755 --- a/mailinglist/downloader.py +++ b/mailinglist/downloader.py @@ -1,25 +1,37 @@ from datetime import datetime, timedelta from urllib.request import urlopen +from os import makedirs, path, remove +from shutil import rmtree +from re import search, match +from urllib.parse import urljoin from bs4 import BeautifulSoup -from re import search -end_date = datetime(2015, 4, 1) -# end_date = datetime.today().replace(day=1) ##### -start_date = datetime.today().replace(day=1) +from launchpad import process_launchpad_bug +from thread import process_thread + +start_date = datetime(2015, 4, 1) +end_date = datetime.today().replace(day=1) def months_iterator(start, end): current = start - while current >= end: + while current <= end: yield current - if current.month == 1: - current = current.replace(year=current.year - 1, month=12) + if current.month == 12: + current = current.replace(year=current.year + 1, month=1) else: - current = current.replace(month=current.month - 1) + current = current.replace(month=current.month + 1) def main(): - count = 0 + if path.exists("output_mailinglist"): + rmtree("output_mailinglist") + + if path.exists("output_launchpad"): + rmtree("output_launchpad") + + makedirs("output_mailinglist", exist_ok=True) for month in months_iterator(start = start_date, end = end_date): + print(f"{month.strftime('%Y-%m')}") url = f"https://lists.nongnu.org/archive/html/qemu-devel/{month.strftime('%Y-%m')}/threads.html" html = urlopen(url).read() @@ -28,7 +40,7 @@ def main(): ul = soup.body.ul threads = ul.find_all('li', recursive=False) - for li in threads: + for li in reversed(threads): a_tag = li.find('b').find('a') if not a_tag: @@ -36,25 +48,31 @@ def main(): text = a_tag.get_text(strip=True) href = a_tag.get('href') - match = search(r'\[[^\]]*\b(BUG|bug|Bug)\b[^\]]*\]', text) # matches bug enclosed in [] - if not match: + re_match = 
search(r'\[[^\]]*\b(BUG|bug|Bug)\b[^\]]*\]', text) # matches bug enclosed in [] + if not re_match: continue - match = search(r'(Re\:|RE\:|re\:)', text) # matches bug enclosed in [] - - if match: + re_match = search(r'\[Bug\s(\d+)\]', text) # matches [Bug <number>] if bug is issued in launchpad + if re_match: + process_launchpad_bug(re_match.group(1).strip()) continue - match = search(r'\[Bug\s\d+\]', text) # matches bug enclosed in [] - - if match: + re_match = match(r'(?i)^re:\s*(.*)', text) # matches 'Re:', meaning it's not a new thread + if re_match: + title_hash = hash(re_match.group(1).strip()) % 1000000 + if path.exists(f"output_mailinglist/{title_hash}"): + process_thread(urljoin(url, href), title_hash) continue - print(f"Text: {text}, Href: {href}") - count = count + 1 + title_hash = hash(text.strip()) % 1000000 + if path.exists(f"output_mailinglist/{title_hash}"): + print(f"ERROR: {title_hash} should not exist!") + continue - print(f"{month.strftime('%Y-%m')}, Count: {count}") + with open(f"output_mailinglist/{title_hash}", "w") as file: + file.write(f"{text}\n\n") + process_thread(urljoin(url, href), title_hash) if __name__ == "__main__": main() |