# mailinglist/downloader.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78

from datetime import datetime, timedelta
from urllib.request import urlopen
from os import makedirs, path, remove
from shutil import rmtree
from re import search, match
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from launchpad import process_launchpad_bug
from thread import process_thread

# First archive month to scan: April 2015, at midnight on the 1st.
start_date = datetime(year=2015, month=4, day=1)
# Last archive month to scan: the current local date pulled back to the 1st.
# Only the day is replaced, so the current time of day is retained.
end_date = datetime.now().replace(day=1)

def months_iterator(start, end):
    """Yield one datetime per calendar month from *start* up to *end*.

    The first value is *start* itself; each following value is the same
    day-of-month and time of day one month later. Iteration stops as soon
    as the next value would be greater than *end* (so *end* is inclusive).

    If *start* falls on a day that a later month does not have (29th-31st),
    the day is clamped to that month's last day instead of raising
    ``ValueError``; later, longer months use the original day again.

    Args:
        start: First date/datetime to yield.
        end: Inclusive upper bound, comparable with *start*.

    Yields:
        Objects of the same type as *start*, one per month.
    """
    from calendar import monthrange

    year, month = start.year, start.month
    current = start
    while current <= end:
        yield current
        # Advance the (year, month) pair, rolling December over to January.
        if month == 12:
            year, month = year + 1, 1
        else:
            month += 1
        # Derive every step from `start` so the clamped day does not stick
        # (e.g. Jan 31 -> Feb 28/29 -> Mar 31).
        day = min(start.day, monthrange(year, month)[1])
        current = start.replace(year=year, month=month, day=day)

def _title_key(title):
    """Map a thread title to a stable integer key in ``range(1000000)``.

    The built-in ``hash()`` is salted per interpreter process for strings,
    so it would name the same thread's output file differently on every
    run. CRC-32 over the UTF-8 bytes is deterministic across runs.
    """
    from zlib import crc32

    return crc32(title.strip().encode("utf-8", "replace")) % 1000000


def main():
    """Scan the qemu-devel monthly thread indexes and dump bug threads.

    Both output directories are removed first, then ``output_mailinglist``
    is recreated. For every month between ``start_date`` and ``end_date``
    the month's ``threads.html`` index is downloaded and each top-level
    entry whose subject contains a bracketed "bug" tag is handled:

    * ``[Bug <number>]`` subjects are passed to ``process_launchpad_bug``.
    * ``Re: ...`` subjects are passed to ``process_thread`` only when a file
      for the original subject already exists in ``output_mailinglist``.
    * Any other subject starts a new file named after the subject's key,
      seeded with the subject line, and is then passed to
      ``process_thread``.

    Network errors from ``urlopen`` are not caught and abort the run.
    """
    # Start from a clean slate so results of earlier runs are not mixed in.
    for stale_dir in ("output_mailinglist", "output_launchpad"):
        if path.exists(stale_dir):
            rmtree(stale_dir)

    makedirs("output_mailinglist", exist_ok=True)
    for month in months_iterator(start=start_date, end=end_date):
        month_tag = month.strftime('%Y-%m')
        print(f"{month_tag}")
        url = f"https://lists.nongnu.org/archive/html/qemu-devel/{month_tag}/threads.html"

        # Close the HTTP response as soon as the page has been read.
        with urlopen(url) as response:
            html = response.read()

        soup = BeautifulSoup(html, features='html5lib')

        ul = soup.body.ul if soup.body is not None else None
        if ul is None:
            # Page without a thread list: nothing to scan for this month.
            print(f"WARNING: no thread list found for {month_tag}")
            continue

        threads = ul.find_all('li', recursive=False)
        # Walk the index bottom-up; presumably it lists newest threads first,
        # so the original post is seen before its "Re:" follow-ups -- TODO confirm.
        for li in reversed(threads):
            bold = li.find('b')
            a_tag = bold.find('a') if bold is not None else None

            # Entries without a <b><a> subject link cannot be processed.
            if not a_tag:
                continue

            text = a_tag.get_text(strip=True)
            href = a_tag.get('href')

            re_match = search(r'\[[^\]]*\b(BUG|bug|Bug)\b[^\]]*\]', text) # matches bug enclosed in []
            if not re_match:
                continue

            re_match = search(r'\[Bug\s(\d+)\]', text) # matches [Bug <number>] if bug is issued in launchpad
            if re_match:
                process_launchpad_bug(re_match.group(1).strip())
                continue

            re_match = match(r'(?i)^re:\s*(.*)', text) # matches 'Re:', meaning it's not a new thread
            if re_match:
                # Only follow replies whose original thread was recorded.
                title_hash = _title_key(re_match.group(1))
                if path.exists(f"output_mailinglist/{title_hash}"):
                    process_thread(urljoin(url, href), title_hash)
                continue

            title_hash = _title_key(text)
            if path.exists(f"output_mailinglist/{title_hash}"):
                # Same subject seen twice, or two subjects share a key.
                print(f"ERROR: {title_hash} should not exist!")
                continue

            with open(f"output_mailinglist/{title_hash}", "w") as file:
                file.write(f"{text}\n\n")
            process_thread(urljoin(url, href), title_hash)

# Run the download only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()