-rwxr-xr-x  mailinglist/downloader.py  49
-rwxr-xr-x  mailinglist/launchpad.py   33
-rwxr-xr-x  mailinglist/thread.py       6
3 files changed, 49 insertions, 39 deletions
diff --git a/mailinglist/downloader.py b/mailinglist/downloader.py
index 38d37305..e793271e 100755
--- a/mailinglist/downloader.py
+++ b/mailinglist/downloader.py
@@ -1,9 +1,9 @@
-from datetime import datetime, timedelta
+from datetime import datetime
 from urllib.request import urlopen
-from os import makedirs, path, remove
+from urllib.parse import urljoin
+from os import makedirs, path
 from shutil import rmtree
 from re import search, match
-from urllib.parse import urljoin
 
 from bs4 import BeautifulSoup
 
@@ -18,55 +18,58 @@ def months_iterator(start, end):
     while current <= end:
         yield current
         if current.month == 12:
-            current = current.replace(year=current.year + 1, month=1)
+            current = current.replace(year = current.year + 1, month = 1)
         else:
-            current = current.replace(month=current.month + 1)
+            current = current.replace(month = current.month + 1)
 
-def main():
+def prepare_output() -> None:
     if path.exists("output_mailinglist"):
         rmtree("output_mailinglist")
-
     if path.exists("output_launchpad"):
         rmtree("output_launchpad")
+    makedirs("output_mailinglist", exist_ok = True)
+
+def is_bug(text : str) -> bool:
+    return search(r'\[[^\]]*\b(BUG|bug|Bug)\b[^\]]*\]', text) # matches bug enclosed in []
 
-    makedirs("output_mailinglist", exist_ok=True)
-    for month in months_iterator(start = start_date, end = end_date):
+def main():
+    prepare_output()
+
+    for month in months_iterator(start_date, end_date):
         print(f"{month.strftime('%Y-%m')}")
         url = f"https://lists.nongnu.org/archive/html/qemu-devel/{month.strftime('%Y-%m')}/threads.html"
-
         html = urlopen(url).read()
-
-        soup = BeautifulSoup(html, features='html5lib')
+        soup = BeautifulSoup(html, features = 'html5lib')
 
         ul = soup.body.ul
-        threads = ul.find_all('li', recursive=False)
+        threads = ul.find_all('li', recursive = False)
         for li in reversed(threads):
             a_tag = li.find('b').find('a')
-
             if not a_tag:
                 continue
 
-            text = a_tag.get_text(strip=True)
+            text = a_tag.get_text(strip = True)
             href = a_tag.get('href')
 
-            re_match = search(r'\[[^\]]*\b(BUG|bug|Bug)\b[^\]]*\]', text) # matches bug enclosed in []
-            if not re_match:
+            if not is_bug(text):
                 continue
 
-            re_match = search(r'\[Bug\s(\d+)\]', text) # matches [Bug <number>] if bug is issued in launchpad
+            # bug issued in launchpad
+            re_match = search(r'\[Bug\s(\d+)\]', text) # matches [Bug <number>]
             if re_match:
-                if not process_launchpad_bug(re_match.group(1).strip()):
-                    print(f"Could not parse launchpad bug with id: {re_match.group(1).strip()}")
+                process_launchpad_bug(re_match.group(1).strip())
                 continue
 
-            re_match = match(r'(?i)^re:\s*(.*)', text) # matches 'Re:', meaning it's not a new thread
+            # existing thread
+            re_match = match(r'(?i)^re:\s*(.*)', text) # matches 'Re:'
             if re_match:
-                title_hash = hash(re_match.group(1).strip()) % 1000000
+                title_hash = str(hash(re_match.group(1).strip()))[0:7]
                 if path.exists(f"output_mailinglist/{title_hash}"):
                     process_thread(urljoin(url, href), title_hash)
                 continue
 
-            title_hash = hash(text.strip()) % 1000000
+            # new thread
+            title_hash = str(hash(text.strip()))[0:7]
             if path.exists(f"output_mailinglist/{title_hash}"):
                 print(f"ERROR: {title_hash} should not exist!")
                 continue
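
The new-thread and existing-thread branches above key output filenames on str(hash(title))[0:7]. Python's built-in hash() for strings is salted per interpreter run unless PYTHONHASHSEED is pinned, so as a minimal sketch (not part of this commit; the helper name stable_title_hash is hypothetical) a content-stable digest would keep those filenames reproducible across runs:

    # Hypothetical helper, not in the commit: derive a 7-character filename
    # key from the thread title with a content-stable digest instead of the
    # run-salted built-in hash().
    from hashlib import md5

    def stable_title_hash(title: str) -> str:
        return md5(title.strip().encode("utf-8")).hexdigest()[:7]
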
diff --git a/mailinglist/launchpad.py b/mailinglist/launchpad.py
index 6b1cdd1e..cb37996c 100755
--- a/mailinglist/launchpad.py
+++ b/mailinglist/launchpad.py
@@ -1,27 +1,34 @@
-from requests import get
+from requests import get, Response
 from os import makedirs, path
 
-def process_launchpad_bug(bug_id) -> bool:
-    if path.exists(f"output_launchpad/{bug_id}"):
-        return False
+def launchpad_id_valid(bug_id : str) -> bool:
+    return len(bug_id) == 7 or len(bug_id) == 6
 
-    bug_url = f"https://api.launchpad.net/1.0/bugs/{bug_id}"
+def response_valid(response : Response) -> bool:
+    return 'application/json' in response.headers.get('Content-Type', '')
 
-    bug_response = get(url = bug_url)
+def process_launchpad_bug(bug_id : str) -> None:
+    if not launchpad_id_valid(bug_id):
+        print(f"{bug_id} is not valid")
+        return
 
-    if not 'application/json' in bug_response.headers.get('Content-Type', ''):
-        return False
+    if path.exists(f"output_launchpad/{bug_id}"):
+        print(f"output_launchpad/{bug_id} exists already")
+        return
 
-    bug_data = bug_response.json()
+    bug_url = f"https://api.launchpad.net/1.0/bugs/{bug_id}"
+    bug_response = get(bug_url)
 
-    messages_response = get(url = bug_data['messages_collection_link'])
+    if not response_valid(bug_response):
+        print(f"Response for {bug_id} is not valid")
+        return
 
+    bug_data = bug_response.json()
+    messages_response = get(bug_data['messages_collection_link'])
     messages_data = messages_response.json()
 
-    makedirs("output_launchpad", exist_ok=True)
+    makedirs("output_launchpad", exist_ok = True)
     with open(f"output_launchpad/{bug_id}", "w") as file:
         file.write(f"{bug_data['title']}\n\n")
-
         for entry in messages_data['entries']:
             file.write(f"{entry['content']}\n\n")
-    return True
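
For reference, the two-request flow that process_launchpad_bug() performs against the Launchpad API can be exercised on its own; a minimal sketch, assuming network access, that the bug exists, and that the API answers with JSON (the bug id below is made up purely for illustration):

    # Standalone sketch of the same call pattern (illustrative bug id):
    from requests import get

    bug_data = get("https://api.launchpad.net/1.0/bugs/1234567").json()
    messages_data = get(bug_data['messages_collection_link']).json()
    print(bug_data['title'], len(messages_data['entries']))
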
diff --git a/mailinglist/thread.py b/mailinglist/thread.py
index 8efd5a73..ae7da680 100755
--- a/mailinglist/thread.py
+++ b/mailinglist/thread.py
@@ -5,13 +5,13 @@ from urllib.parse import urljoin
 
 from bs4 import BeautifulSoup
 
-def write_message(html, hash_value):
+def write_message(html : str, hash_value : str) -> None:
     soup = BeautifulSoup(html, 'html.parser')
-    text = soup.get_text(separator='\n', strip=True)
+    text = soup.get_text(separator = '\n', strip = True)
     with open(f"output_mailinglist/{hash_value}", "a") as file:
         file.write(f"{text}\n\n")
 
-def process_thread(url, hash_value):
+def process_thread(url : str, hash_value : str) -> None:
     request = get(url)
     text = request.text
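
thread.py strips each HTML message body to plain text with BeautifulSoup and appends it to the per-thread output file. A minimal self-contained sketch of that write_message() step, with an illustrative filename standing in for a real title hash:

    # Strip an HTML message to text and append it, as write_message() does.
    from os import makedirs
    from bs4 import BeautifulSoup

    makedirs("output_mailinglist", exist_ok=True)
    html = "<p>Hello,</p><p>this patch fixes the reported crash.</p>"
    text = BeautifulSoup(html, "html.parser").get_text(separator="\n", strip=True)
    with open("output_mailinglist/example", "a") as file:  # "example" stands in for a title hash
        file.write(f"{text}\n\n")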