diff options
Diffstat (limited to 'mailinglist')
| -rwxr-xr-x | mailinglist/downloader.py | 45 |
1 file changed, 42 insertions, 3 deletions
diff --git a/mailinglist/downloader.py b/mailinglist/downloader.py
index 214528907..08ca583c1 100755
--- a/mailinglist/downloader.py
+++ b/mailinglist/downloader.py
@@ -1,6 +1,11 @@
 from datetime import datetime, timedelta
+from urllib.request import urlopen
 
-end_date = datetime(2003, 4, 1)
+from bs4 import BeautifulSoup
+from re import search
+
+end_date = datetime(2015, 4, 1)
+# end_date = datetime.today().replace(day=1) #####
 start_date = datetime.today().replace(day=1)
 
 def months_iterator(start, end):
@@ -13,9 +18,43 @@ def months_iterator(start, end):
             current = current.replace(month=current.month - 1)
 
 def main():
+    count = 0
     for month in months_iterator(start = start_date, end = end_date):
-        url = f"https://lists.nongnu.org/archive/html/qemu-devel/{month.strftime("%Y-%m")}/threads.html"
-        print(url)
+        url = f"https://lists.nongnu.org/archive/html/qemu-devel/{month.strftime('%Y-%m')}/threads.html"
+
+        html = urlopen(url).read()
+
+        soup = BeautifulSoup(html, features='html5lib')
+
+        ul = soup.body.ul
+        threads = ul.find_all('li', recursive=False)
+        for li in threads:
+            a_tag = li.find('b').find('a')
+
+            if not a_tag:
+                continue
+
+            text = a_tag.get_text(strip=True)
+            href = a_tag.get('href')
+            match = search(r'\[[^\]]*\b(BUG|bug|Bug)\b[^\]]*\]', text) # matches bug enclosed in []
+
+            if not match:
+                continue
+
+            match = search(r'(Re\:|RE\:|re\:)', text) # matches bug enclosed in []
+
+            if match:
+                continue
+
+            match = search(r'\[Bug\s\d+\]', text) # matches bug enclosed in []
+
+            if match:
+                continue
+
+            print(f"Text: {text}, Href: {href}")
+            count = count + 1
+
+        print(f"{month.strftime('%Y-%m')}, Count: {count}")
 
 if __name__ == "__main__":
     main()