diff options
| -rwxr-xr-x | mailinglist/downloader.py | 49 | ||||
| -rwxr-xr-x | mailinglist/launchpad.py | 33 | ||||
| -rwxr-xr-x | mailinglist/thread.py | 6 |
3 files changed, 49 insertions, 39 deletions
diff --git a/mailinglist/downloader.py b/mailinglist/downloader.py index 38d37305f..e793271e0 100755 --- a/mailinglist/downloader.py +++ b/mailinglist/downloader.py @@ -1,9 +1,9 @@ -from datetime import datetime, timedelta +from datetime import datetime from urllib.request import urlopen -from os import makedirs, path, remove +from urllib.parse import urljoin +from os import makedirs, path from shutil import rmtree from re import search, match -from urllib.parse import urljoin from bs4 import BeautifulSoup @@ -18,55 +18,58 @@ def months_iterator(start, end): while current <= end: yield current if current.month == 12: - current = current.replace(year=current.year + 1, month=1) + current = current.replace(year = current.year + 1, month = 1) else: - current = current.replace(month=current.month + 1) + current = current.replace(month = current.month + 1) -def main(): +def prepare_output() -> None: if path.exists("output_mailinglist"): rmtree("output_mailinglist") - if path.exists("output_launchpad"): rmtree("output_launchpad") + makedirs("output_mailinglist", exist_ok = True) + +def is_bug(text : str) -> bool: + return search(r'\[[^\]]*\b(BUG|bug|Bug)\b[^\]]*\]', text) # matches bug enclosed in [] - makedirs("output_mailinglist", exist_ok=True) - for month in months_iterator(start = start_date, end = end_date): +def main(): + prepare_output() + + for month in months_iterator(start_date, end_date): print(f"{month.strftime('%Y-%m')}") url = f"https://lists.nongnu.org/archive/html/qemu-devel/{month.strftime('%Y-%m')}/threads.html" - html = urlopen(url).read() - - soup = BeautifulSoup(html, features='html5lib') + soup = BeautifulSoup(html, features = 'html5lib') ul = soup.body.ul - threads = ul.find_all('li', recursive=False) + threads = ul.find_all('li', recursive = False) for li in reversed(threads): a_tag = li.find('b').find('a') - if not a_tag: continue - text = a_tag.get_text(strip=True) + text = a_tag.get_text(strip = True) href = a_tag.get('href') - re_match = search(r'\[[^\]]*\b(BUG|bug|Bug)\b[^\]]*\]', text) # matches bug enclosed in [] - if not re_match: + if not is_bug(text): continue - re_match = search(r'\[Bug\s(\d+)\]', text) # matches [Bug <number>] if bug is issued in launchpad + # bug issued in launchpad + re_match = search(r'\[Bug\s(\d+)\]', text) # matches [Bug <number>] if re_match: - if not process_launchpad_bug(re_match.group(1).strip()): - print(f"Could not parse launchpad bug with id: {re_match.group(1).strip()}") + process_launchpad_bug(re_match.group(1).strip()) continue - re_match = match(r'(?i)^re:\s*(.*)', text) # matches 'Re:', meaning it's not a new thread + # existing thread + re_match = match(r'(?i)^re:\s*(.*)', text) # matches 'Re:' if re_match: - title_hash = hash(re_match.group(1).strip()) % 1000000 + title_hash = str(hash(re_match.group(1).strip()))[0:7] if path.exists(f"output_mailinglist/{title_hash}"): process_thread(urljoin(url, href), title_hash) continue - title_hash = hash(text.strip()) % 1000000 + # new thread + title_hash = str(hash(text.strip()))[0:7] if path.exists(f"output_mailinglist/{title_hash}"): print(f"ERROR: {title_hash} should not exist!") continue diff --git a/mailinglist/launchpad.py b/mailinglist/launchpad.py index 6b1cdd1ef..cb37996cc 100755 --- a/mailinglist/launchpad.py +++ b/mailinglist/launchpad.py @@ -1,27 +1,34 @@ -from requests import get +from requests import get, Response from os import makedirs, path -def process_launchpad_bug(bug_id) -> bool: - if path.exists(f"output_launchpad/{bug_id}"): - return False +def launchpad_id_valid(bug_id : str) -> bool: + return len(bug_id) == 7 or len(bug_id) == 6 - bug_url = f"https://api.launchpad.net/1.0/bugs/{bug_id}" +def response_valid(response : Response) -> bool: + return 'application/json' in response.headers.get('Content-Type', '') - bug_response = get(url = bug_url) +def process_launchpad_bug(bug_id : str) -> None: + if not launchpad_id_valid(bug_id): + print(f"{bug_id} is not valid") + return - if not 'application/json' in bug_response.headers.get('Content-Type', ''): - return False + if path.exists(f"output_launchpad/{bug_id}"): + print(f"output_launchpad/{bug_id} exists already") + return - bug_data = bug_response.json() + bug_url = f"https://api.launchpad.net/1.0/bugs/{bug_id}" + bug_response = get(bug_url) - messages_response = get(url = bug_data['messages_collection_link']) + if not response_valid(bug_response): + print(f"Response for {bug_id} is not valid") + return + bug_data = bug_response.json() + messages_response = get(bug_data['messages_collection_link']) messages_data = messages_response.json() - makedirs("output_launchpad", exist_ok=True) + makedirs("output_launchpad", exist_ok = True) with open(f"output_launchpad/{bug_id}", "w") as file: file.write(f"{bug_data['title']}\n\n") - for entry in messages_data['entries']: file.write(f"{entry['content']}\n\n") - return True diff --git a/mailinglist/thread.py b/mailinglist/thread.py index 8efd5a731..ae7da680d 100755 --- a/mailinglist/thread.py +++ b/mailinglist/thread.py @@ -5,13 +5,13 @@ from urllib.parse import urljoin from bs4 import BeautifulSoup -def write_message(html, hash_value): +def write_message(html : str, hash_value : str) -> None: soup = BeautifulSoup(html, 'html.parser') - text = soup.get_text(separator='\n', strip=True) + text = soup.get_text(separator = '\n', strip = True) with open(f"output_mailinglist/{hash_value}", "a") as file: file.write(f"{text}\n\n") -def process_thread(url, hash_value): +def process_thread(url : str, hash_value : str) -> None: request = get(url) text = request.text |