diff options
| author | Christian Krinitsin <mail@krinitsin.com> | 2025-05-21 21:09:05 +0200 |
|---|---|---|
| committer | Christian Krinitsin <mail@krinitsin.com> | 2025-05-21 21:09:05 +0200 |
| commit | 5d328cc20c0be9d128d3c3107185e43e3f934d53 (patch) | |
| tree | de81222c46d9a81a149f1b171f09ed9394e98ad9 /mailinglist/downloader.py | |
| parent | aaa3306a1b9e440c4adf9e54fcfd204243b54404 (diff) | |
| download | emulator-bug-study-5d328cc20c0be9d128d3c3107185e43e3f934d53.tar.gz emulator-bug-study-5d328cc20c0be9d128d3c3107185e43e3f934d53.zip | |
searches for specific thread titles and counts them
Diffstat (limited to 'mailinglist/downloader.py')
| -rwxr-xr-x | mailinglist/downloader.py | 45 |
1 files changed, 42 insertions, 3 deletions
```diff
diff --git a/mailinglist/downloader.py b/mailinglist/downloader.py
index 21452890..08ca583c 100755
--- a/mailinglist/downloader.py
+++ b/mailinglist/downloader.py
@@ -1,6 +1,11 @@
 from datetime import datetime, timedelta
+from urllib.request import urlopen
 
-end_date = datetime(2003, 4, 1)
+from bs4 import BeautifulSoup
+from re import search
+
+end_date = datetime(2015, 4, 1)
+# end_date = datetime.today().replace(day=1) #####
 start_date = datetime.today().replace(day=1)
 
 def months_iterator(start, end):
@@ -13,9 +18,43 @@ def months_iterator(start, end):
         current = current.replace(month=current.month - 1)
 
 def main():
+    count = 0
     for month in months_iterator(start = start_date, end = end_date):
-        url = f"https://lists.nongnu.org/archive/html/qemu-devel/{month.strftime("%Y-%m")}/threads.html"
-        print(url)
+        url = f"https://lists.nongnu.org/archive/html/qemu-devel/{month.strftime('%Y-%m')}/threads.html"
+
+        html = urlopen(url).read()
+
+        soup = BeautifulSoup(html, features='html5lib')
+
+        ul = soup.body.ul
+        threads = ul.find_all('li', recursive=False)
+        for li in threads:
+            a_tag = li.find('b').find('a')
+
+            if not a_tag:
+                continue
+
+            text = a_tag.get_text(strip=True)
+            href = a_tag.get('href')
+            match = search(r'\[[^\]]*\b(BUG|bug|Bug)\b[^\]]*\]', text) # matches bug enclosed in []
+
+            if not match:
+                continue
+
+            match = search(r'(Re\:|RE\:|re\:)', text) # matches bug enclosed in []
+
+            if match:
+                continue
+
+            match = search(r'\[Bug\s\d+\]', text) # matches bug enclosed in []
+
+            if match:
+                continue
+
+            print(f"Text: {text}, Href: {href}")
+            count = count + 1
+
+        print(f"{month.strftime('%Y-%m')}, Count: {count}")
 
 if __name__ == "__main__":
     main()
```