summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Christian Krinitsin <mail@krinitsin.com> 2025-05-22 20:05:27 +0200
committer Christian Krinitsin <mail@krinitsin.com> 2025-05-22 20:05:27 +0200
commit 5ff3863b1b734b5a98537f5ad411b9e79035d067 (patch)
tree 0f11da0ee57a2d8dbf4bf26631f8c2eff79c331e
parent 4b927bc37359dec23f67d3427fc982945f24f404 (diff)
parent a5740cacb75d587346b07225e0e47dc6a3f12753 (diff)
download emulator-bug-study-5ff3863b1b734b5a98537f5ad411b9e79035d067.tar.gz
download emulator-bug-study-5ff3863b1b734b5a98537f5ad411b9e79035d067.zip
Merge branch 'mailing-list'
-rw-r--r-- mailinglist/.gitignore 2
-rwxr-xr-x mailinglist/downloader.py 73
-rwxr-xr-x mailinglist/launchpad.py 27
-rwxr-xr-x mailinglist/thread.py 32
4 files changed, 126 insertions, 8 deletions
diff --git a/mailinglist/.gitignore b/mailinglist/.gitignore
new file mode 100644
index 00000000..0d0ac50d
--- /dev/null
+++ b/mailinglist/.gitignore
@@ -0,0 +1,2 @@
+output_launchpad
+output_mailinglist
diff --git a/mailinglist/downloader.py b/mailinglist/downloader.py
index 21452890..0b8a4277 100755
--- a/mailinglist/downloader.py
+++ b/mailinglist/downloader.py
@@ -1,21 +1,78 @@
 from datetime import datetime, timedelta
+from urllib.request import urlopen
+from os import makedirs, path, remove
+from shutil import rmtree
+from re import search, match
+from urllib.parse import urljoin
 
-end_date = datetime(2003, 4, 1)
-start_date = datetime.today().replace(day=1)
+from bs4 import BeautifulSoup
+
+from launchpad import process_launchpad_bug
+from thread import process_thread
+
+start_date = datetime(2015, 4, 1)
+end_date = datetime.today().replace(day=1)
 
 def months_iterator(start, end):
     current = start
-    while current >= end:
+    while current <= end:
         yield current
-        if current.month == 1:
-            current = current.replace(year=current.year - 1, month=12)
+        if current.month == 12:
+            current = current.replace(year=current.year + 1, month=1)
         else:
-            current = current.replace(month=current.month - 1)
+            current = current.replace(month=current.month + 1)
 
 def main():
+    if path.exists("output_mailinglist"):
+        rmtree("output_mailinglist")
+
+    if path.exists("output_launchpad"):
+        rmtree("output_launchpad")
+
+    makedirs("output_mailinglist", exist_ok=True)
     for month in months_iterator(start = start_date, end = end_date):
-        url = f"https://lists.nongnu.org/archive/html/qemu-devel/{month.strftime("%Y-%m")}/threads.html"
-        print(url)
+        print(f"{month.strftime('%Y-%m')}")
+        url = f"https://lists.nongnu.org/archive/html/qemu-devel/{month.strftime('%Y-%m')}/threads.html"
+
+        html = urlopen(url).read()
+
+        soup = BeautifulSoup(html, features='html5lib')
+
+        ul = soup.body.ul
+        threads = ul.find_all('li', recursive=False)
+        for li in reversed(threads):
+            a_tag = li.find('b').find('a')
+
+            if not a_tag:
+                continue
+
+            text = a_tag.get_text(strip=True)
+            href = a_tag.get('href')
+
+            re_match = search(r'\[[^\]]*\b(BUG|bug|Bug)\b[^\]]*\]', text) # matches bug enclosed in []
+            if not re_match:
+                continue
+
+            re_match = search(r'\[Bug\s(\d+)\]', text) # matches [Bug <number>] if bug is issued in launchpad
+            if re_match:
+                process_launchpad_bug(re_match.group(1).strip())
+                continue
+
+            re_match = match(r'(?i)^re:\s*(.*)', text) # matches 'Re:', meaning it's not a new thread
+            if re_match:
+                title_hash = hash(re_match.group(1).strip()) % 1000000
+                if path.exists(f"output_mailinglist/{title_hash}"):
+                    process_thread(urljoin(url, href), title_hash)
+                continue
+
+            title_hash = hash(text.strip()) % 1000000
+            if path.exists(f"output_mailinglist/{title_hash}"):
+                print(f"ERROR: {title_hash} should not exist!")
+                continue
+
+            with open(f"output_mailinglist/{title_hash}", "w") as file:
+                file.write(f"{text}\n\n")
+            process_thread(urljoin(url, href), title_hash)
 
 if __name__ == "__main__":
     main()
diff --git a/mailinglist/launchpad.py b/mailinglist/launchpad.py
new file mode 100755
index 00000000..91d6cd8b
--- /dev/null
+++ b/mailinglist/launchpad.py
@@ -0,0 +1,27 @@
+from requests import get
+from os import makedirs, path
+
+def process_launchpad_bug(bug_id):
+    if path.exists(f"output_launchpad/{bug_id}"):
+        return
+
+    bug_url = f"https://api.launchpad.net/1.0/bugs/{bug_id}"
+
+    bug_response = get(url = bug_url)
+
+    bug_data = bug_response.json()
+
+    messages_response = get(url = bug_data['messages_collection_link'])
+
+    messages_data = messages_response.json()
+
+    makedirs("output_launchpad", exist_ok=True)
+    with open(f"output_launchpad/{bug_id}", "w") as file:
+        file.write(f"{bug_data['title']}\n\n")
+
+        for entry in messages_data['entries']:
+            file.write(f"{entry['content']}\n\n")
+
+if __name__ == "__main__":
+    process_launchpad_bug(1629282)
+    process_launchpad_bug(1915063)
diff --git a/mailinglist/thread.py b/mailinglist/thread.py
new file mode 100755
index 00000000..2dc8ce2f
--- /dev/null
+++ b/mailinglist/thread.py
@@ -0,0 +1,32 @@
+from requests import get
+from os import makedirs
+from re import search, DOTALL, compile
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup
+
+def write_message(html, hash_value):
+    soup = BeautifulSoup(html, 'html.parser')
+    text = soup.get_text(separator='\n', strip=True)
+    with open(f"output_mailinglist/{hash_value}", "a") as file:
+        file.write(f"{text}\n\n")
+
+def process_thread(url, hash_value):
+    request = get(url)
+    text = request.text
+
+    match = search(r'<!--X-Body-of-Message-->(.*?)<!--X-Body-of-Message-End-->', text, DOTALL)
+    if match:
+        write_message(match.group(1).strip(), hash_value)
+
+    pattern = href_pattern = compile(r'\[<a\s+href="([^"]+)">Next in Thread</a>\]')
+    for line in text.splitlines():
+        if "Next in Thread" in line:
+            match = pattern.search(line)
+            if match:
+                href = match.group(1)
+                process_thread(urljoin(url, href), hash_value)
+
+if __name__ == "__main__":
+    makedirs("output_mailinglist", exist_ok=True)
+    process_thread("https://lists.nongnu.org/archive/html/qemu-devel/2025-04/msg05446.html", 861041)