# github/downloader.py
# Introduced by commit 2843bb65aeaeb86eb89bf3d9690db61b9dc6306e
# ("add a github scraper", Christian Krinitsin, 2025-07-16). The same commit
# adds github/.gitignore (single entry: "issues") and github/output.py.
"""Download every issue of a GitHub repository and hand each one to output_issue.

Usage: downloader.py -r OWNER/REPO
"""
from requests import get, Response
from output import output_issue
from argparse import ArgumentParser

parser = ArgumentParser(prog='downloader.py')
parser.add_argument('-r', '--repository', required=True, help="Which repository to download the issues from")

# Number of issues requested per API page.
per_page = 100

# Seconds to wait for the server on each request. requests applies no timeout
# by default, so without this a stalled connection would block forever.
REQUEST_TIMEOUT = 30


def pages_iterator(first: Response):
    """Yield ``first`` and every following page reachable through its links.

    Each response is checked with ``raise_for_status()`` before it is
    yielded, so an HTTP error surfaces as an exception instead of being
    passed on to the caller as data.

    Args:
        first: Response of the first page request.

    Yields:
        One successful ``Response`` per page, in order.
    """
    current = first
    while True:
        current.raise_for_status()
        yield current

        # requests exposes the parsed Link header as ``links``; the last page
        # has no 'next' entry.
        next_link = current.links.get('next')
        if not next_link:
            return
        current = get(url=next_link['url'], timeout=REQUEST_TIMEOUT)


def main(argv=None):
    """Download all issues of the requested repository and write them out.

    Args:
        argv: Command-line arguments to parse; ``None`` (the default) makes
            argparse read ``sys.argv``. Parsing happens here rather than at
            import time so that importing this module has no side effects.

    Raises:
        requests.HTTPError: If the repository check or any page request
            returns an error status.
    """
    args = parser.parse_args(argv)
    check_url = f"https://api.github.com/repos/{args.repository}"
    url = f"https://api.github.com/repos/{args.repository}/issues?per_page={per_page}&state=all"

    # Fail early with a clear HTTP error if the repository is not reachable.
    check = get(check_url, timeout=REQUEST_TIMEOUT)
    check.raise_for_status()

    first_page = get(url, timeout=REQUEST_TIMEOUT)
    for page_number, response in enumerate(pages_iterator(first_page), start=1):
        print(f"Current page: {page_number}")

        for entry in response.json():
            # The issues endpoint also lists pull requests; those entries
            # carry a "pull_request" key and are skipped.
            if "pull_request" in entry:
                continue

            output_issue({
                "id": entry['number'],
                "title": entry['title'],
                "labels": [label['name'] for label in entry['labels']],
                "description": entry['body'],
            })


if __name__ == "__main__":
    main()
# github/output.py (new file, mode 100755)
"""Write downloaded issues to plain-text files below ``issues/``."""
import sys
from os import path, makedirs


def write_file(file_path: str, string: str) -> None:
    """Write ``string`` to ``file_path``, creating parent directories as needed.

    Args:
        file_path: Destination path; an existing file is overwritten.
        string: Text to write.
    """
    directory = path.dirname(file_path)
    # dirname is "" for a bare file name, and makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one.
    if directory:
        makedirs(directory, exist_ok=True)
    # Explicit UTF-8 so non-ASCII issue text is written the same way on
    # every platform instead of depending on the locale encoding.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(string)


def output_issue(issue: dict) -> None:
    """Store one issue as ``issues/<id>`` or ``issues/documentation/<id>``.

    The file holds the title, a newline, and the description (empty when the
    description is falsy). Issues labelled ``documentation`` go into the
    ``documentation`` subdirectory.

    Args:
        issue: Mapping with the keys ``id``, ``title``, ``labels`` and
            ``description``.

    Raises:
        SystemExit: With status 1 when a field has an unexpected type
            (``TypeError``), after printing the offending issue id.
    """
    try:
        if 'documentation' in issue['labels']:
            directory = "issues/documentation"
        else:
            directory = "issues"
        content = issue['title'] + '\n' + (issue['description'] or "")
        write_file(f"{directory}/{issue['id']}", content)
    except TypeError:
        print(f"error with bug {issue['id']}")
        # sys.exit instead of the interactive-only exit() helper, and a
        # non-zero status so the failure is visible to the calling shell.
        sys.exit(1)