mailinglist/downloader.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82

from datetime import datetime
from urllib.request import urlopen
from urllib.parse import urljoin
from os import makedirs, path
from shutil import rmtree
from re import search, match

from bs4 import BeautifulSoup

from launchpad import process_launchpad_bug
from thread import process_thread

# First archive month to scan (April 2015).
start_date = datetime(year=2015, month=4, day=1)
# Last month to scan: today's date moved to day 1 of the current month.
# Only the day is replaced, so the time-of-day returned by today() is kept.
end_date = datetime.today().replace(day=1)

def months_iterator(start: datetime, end: datetime):
    """Yield one datetime per calendar month from *start* through *end*.

    The first value is *start* itself; each following value is one month
    later with the same time-of-day. Iteration stops once the next value
    would be greater than *end*. Nothing is yielded when ``start > end``.

    When *start* falls on a day that a later month does not have
    (29th-31st), the day is clamped to that month's last day instead of
    raising ``ValueError``; months that do have the day use it unchanged.
    """
    from calendar import monthrange

    year, month = start.year, start.month
    current = start
    while current <= end:
        yield current
        # Advance the (year, month) pair, rolling December over to January.
        if month == 12:
            year, month = year + 1, 1
        else:
            month += 1
        # Clamp against the *original* day so e.g. Jan 31 -> Feb 29 -> Mar 31.
        day = min(start.day, monthrange(year, month)[1])
        current = start.replace(year=year, month=month, day=day)

def prepare_output() -> None:
    """Reset the output directories in the current working directory.

    Deletes ``output_mailinglist`` and ``output_launchpad`` if they exist,
    then creates an empty ``output_mailinglist``. ``output_launchpad`` is
    not recreated here.
    """
    for stale_dir in ("output_mailinglist", "output_launchpad"):
        if path.exists(stale_dir):
            rmtree(stale_dir)
    makedirs("output_mailinglist", exist_ok=True)

def is_bug(text: str) -> bool:
    """Return True if *text* contains a ``[...]`` tag with the word "bug" in it.

    Only the spellings ``BUG``, ``bug`` and ``Bug`` are recognised, and the
    word must stand alone inside the brackets (``[debug]`` does not match).
    """
    # search() returns a Match object or None; convert it so the function
    # really returns the bool its annotation promises.
    return search(r'\[[^\]]*\b(BUG|bug|Bug)\b[^\]]*\]', text) is not None

def _title_key(title: str) -> str:
    """Return the key used as a thread's file name under output_mailinglist."""
    # NOTE: built-in str hashing is salted per interpreter run unless
    # PYTHONHASHSEED is fixed, so keys are only stable within a single run.
    # The [1:9] slice drops the first character (sign or leading digit).
    return str(hash(title.strip()))[1:9]


def main():
    """Scan the qemu-devel monthly thread indexes and collect bug threads.

    Resets the output directories, then for every month from ``start_date``
    through ``end_date`` downloads that month's ``threads.html`` and walks
    its top-level list entries:

    * titles without a bracketed "bug" tag are skipped;
    * ``[Bug <number>]`` titles are passed to ``process_launchpad_bug``;
    * ``Re:`` titles are passed to ``process_thread`` only when a file for
      the original title already exists;
    * any other title starts a new file in ``output_mailinglist`` and is
      then passed to ``process_thread``.
    """
    prepare_output()

    for month in months_iterator(start_date, end_date):
        month_tag = month.strftime('%Y-%m')
        print(f"{month_tag}")
        url = f"https://lists.nongnu.org/archive/html/qemu-devel/{month_tag}/threads.html"
        # Close the HTTP response as soon as the page has been read.
        with urlopen(url) as response:
            html = response.read()
        soup = BeautifulSoup(html, features='html5lib')

        ul = soup.body.ul
        threads = ul.find_all('li', recursive=False)
        # Walk the index bottom-up — presumably it lists newest threads first,
        # so originals are seen before their replies; verify against the page.
        for li in reversed(threads):
            # An entry without a <b> wrapper has no title link; skip it
            # instead of calling .find() on None.
            b_tag = li.find('b')
            a_tag = b_tag.find('a') if b_tag is not None else None
            if a_tag is None:
                continue

            text = a_tag.get_text(strip=True)
            href = a_tag.get('href')

            if not is_bug(text):
                continue

            # bug issued in launchpad
            re_match = search(r'\[Bug\s(\d+)\]', text)  # matches [Bug <number>]
            if re_match:
                process_launchpad_bug(re_match.group(1).strip())
                continue

            # existing thread
            re_match = match(r'(?i)^re:\s*(.*)', text)  # matches 'Re:'
            if re_match:
                title_hash = _title_key(re_match.group(1))
                # Replies are only followed when the original thread was recorded.
                if path.exists(f"output_mailinglist/{title_hash}"):
                    process_thread(urljoin(url, href), title_hash)
                continue

            # new thread
            title_hash = _title_key(text)
            if path.exists(f"output_mailinglist/{title_hash}"):
                print(f"ERROR: {title_hash} should not exist!")
                continue

            with open(f"output_mailinglist/{title_hash}", "w") as file:
                file.write(f"{text}\n\n")
            process_thread(urljoin(url, href), title_hash)

if __name__ == "__main__":
    main()