add main function for mailinglist scraper

author: Christian Krinitsin <mail@krinitsin.com> 2025-05-22 20:04:54 +0200
committer: Christian Krinitsin <mail@krinitsin.com> 2025-05-22 20:04:54 +0200
commit: a5740cacb75d587346b07225e0e47dc6a3f12753 (patch)
tree: 13d855120d83c275833a0977832b79b91f9baf47
parent: 70136a2208308816a06c26b4487d3440dca5c1af (diff)
download: emulator-bug-study-a5740cacb75d587346b07225e0e47dc6a3f12753.tar.gz emulator-bug-study-a5740cacb75d587346b07225e0e47dc6a3f12753.zip
1 file changed, 39 insertions, 21 deletions
diff --git a/mailinglist/downloader.py b/mailinglist/downloader.py
index 08ca583c..0b8a4277 100755
--- a/mailinglist/downloader.py
+++ b/mailinglist/downloader.py
@@ -1,25 +1,37 @@
 from datetime import datetime, timedelta
 from urllib.request import urlopen
+from os import makedirs, path, remove
+from shutil import rmtree
+from re import search, match
+from urllib.parse import urljoin
 
 from bs4 import BeautifulSoup
-from re import search
 
-end_date = datetime(2015, 4, 1)
-# end_date = datetime.today().replace(day=1) #####
-start_date = datetime.today().replace(day=1)
+from launchpad import process_launchpad_bug
+from thread import process_thread
+
+start_date = datetime(2015, 4, 1)
+end_date = datetime.today().replace(day=1)
 
 def months_iterator(start, end):
     current = start
-    while current >= end:
+    while current <= end:
         yield current
-        if current.month == 1:
-            current = current.replace(year=current.year - 1, month=12)
+        if current.month == 12:
+            current = current.replace(year=current.year + 1, month=1)
         else:
-            current = current.replace(month=current.month - 1)
+            current = current.replace(month=current.month + 1)
 
 def main():
-    count = 0
+    if path.exists("output_mailinglist"):
+        rmtree("output_mailinglist")
+
+    if path.exists("output_launchpad"):
+        rmtree("output_launchpad")
+
+    makedirs("output_mailinglist", exist_ok=True)
     for month in months_iterator(start = start_date, end = end_date):
+        print(f"{month.strftime('%Y-%m')}")
         url = f"https://lists.nongnu.org/archive/html/qemu-devel/{month.strftime('%Y-%m')}/threads.html"
 
         html = urlopen(url).read()
@@ -28,7 +40,7 @@ def main():
 
         ul = soup.body.ul
         threads = ul.find_all('li', recursive=False)
-        for li in threads:
+        for li in reversed(threads):
             a_tag = li.find('b').find('a')
 
             if not a_tag:
@@ -36,25 +48,31 @@ def main():
 
             text = a_tag.get_text(strip=True)
             href = a_tag.get('href')
-            match = search(r'\[[^\]]*\b(BUG|bug|Bug)\b[^\]]*\]', text) # matches bug enclosed in []
 
-            if not match:
+            re_match = search(r'\[[^\]]*\b(BUG|bug|Bug)\b[^\]]*\]', text) # matches bug enclosed in []
+            if not re_match:
                 continue
 
-            match = search(r'(Re\:|RE\:|re\:)', text) # matches bug enclosed in []
-
-            if match:
+            re_match = search(r'\[Bug\s(\d+)\]', text) # matches [Bug <number>] if bug is issued in launchpad
+            if re_match:
+                process_launchpad_bug(re_match.group(1).strip())
                 continue
 
-            match = search(r'\[Bug\s\d+\]', text) # matches bug enclosed in []
-
-            if match:
+            re_match = match(r'(?i)^re:\s*(.*)', text) # matches 'Re:', meaning it's not a new thread
+            if re_match:
+                title_hash = hash(re_match.group(1).strip()) % 1000000
+                if path.exists(f"output_mailinglist/{title_hash}"):
+                    process_thread(urljoin(url, href), title_hash)
                 continue
 
-            print(f"Text: {text}, Href: {href}")
-            count = count + 1
+            title_hash = hash(text.strip()) % 1000000
+            if path.exists(f"output_mailinglist/{title_hash}"):
+                print(f"ERROR: {title_hash} should not exist!")
+                continue
 
-        print(f"{month.strftime('%Y-%m')}, Count: {count}")
+            with open(f"output_mailinglist/{title_hash}", "w") as file:
+                file.write(f"{text}\n\n")
+            process_thread(urljoin(url, href), title_hash)
 
 if __name__ == "__main__":
     main()
author	Christian Krinitsin <mail@krinitsin.com>	2025-05-22 20:04:54 +0200
committer	Christian Krinitsin <mail@krinitsin.com>	2025-05-22 20:04:54 +0200
commit	a5740cacb75d587346b07225e0e47dc6a3f12753 (patch)
tree	13d855120d83c275833a0977832b79b91f9baf47
parent	70136a2208308816a06c26b4487d3440dca5c1af (diff)
download	emulator-bug-study-a5740cacb75d587346b07225e0e47dc6a3f12753.tar.gz emulator-bug-study-a5740cacb75d587346b07225e0e47dc6a3f12753.zip