summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Christian Krinitsin <mail@krinitsin.com> 2025-05-21 21:09:05 +0200
committer Christian Krinitsin <mail@krinitsin.com> 2025-05-21 21:09:05 +0200
commit 5d328cc20c0be9d128d3c3107185e43e3f934d53 (patch)
tree de81222c46d9a81a149f1b171f09ed9394e98ad9
parent aaa3306a1b9e440c4adf9e54fcfd204243b54404 (diff)
download emulator-bug-study-5d328cc20c0be9d128d3c3107185e43e3f934d53.tar.gz
emulator-bug-study-5d328cc20c0be9d128d3c3107185e43e3f934d53.zip
searches for specific thread titles and counts them
-rwxr-xr-x mailinglist/downloader.py 45
1 file changed, 42 insertions, 3 deletions
diff --git a/mailinglist/downloader.py b/mailinglist/downloader.py
index 21452890..08ca583c 100755
--- a/mailinglist/downloader.py
+++ b/mailinglist/downloader.py
@@ -1,6 +1,11 @@
 from datetime import datetime, timedelta
+from urllib.request import urlopen
 
-end_date = datetime(2003, 4, 1)
+from bs4 import BeautifulSoup
+from re import search
+
+end_date = datetime(2015, 4, 1)
+# end_date = datetime.today().replace(day=1) #####
 start_date = datetime.today().replace(day=1)
 
 def months_iterator(start, end):
@@ -13,9 +18,43 @@ def months_iterator(start, end):
             current = current.replace(month=current.month - 1)
 
 def main():
+    count = 0
     for month in months_iterator(start = start_date, end = end_date):
-        url = f"https://lists.nongnu.org/archive/html/qemu-devel/{month.strftime("%Y-%m")}/threads.html"
-        print(url)
+        url = f"https://lists.nongnu.org/archive/html/qemu-devel/{month.strftime('%Y-%m')}/threads.html"
+
+        html = urlopen(url).read()
+
+        soup = BeautifulSoup(html, features='html5lib')
+
+        ul = soup.body.ul
+        threads = ul.find_all('li', recursive=False)
+        for li in threads:
+            a_tag = li.find('b').find('a')
+
+            if not a_tag:
+                continue
+
+            text = a_tag.get_text(strip=True)
+            href = a_tag.get('href')
+            match = search(r'\[[^\]]*\b(BUG|bug|Bug)\b[^\]]*\]', text) # matches bug enclosed in []
+
+            if not match:
+                continue
+
+            match = search(r'(Re\:|RE\:|re\:)', text) # matches a reply marker (Re:/RE:/re:) anywhere in the title
+
+            if match:
+                continue
+
+            match = search(r'\[Bug\s\d+\]', text) # matches a numbered tag of the form "[Bug <digits>]"
+
+            if match:
+                continue
+
+            print(f"Text: {text}, Href: {href}")
+            count = count + 1
+
+        print(f"{month.strftime('%Y-%m')}, Count: {count}")
 
 if __name__ == "__main__":
     main()