diff options
| author | Jonas Gerg <joniogerg@gmail.com> | 2025-09-09 20:06:52 +0200 |
|---|---|---|
| committer | Jonas Gerg <joniogerg@gmail.com> | 2025-09-09 20:06:52 +0200 |
| commit | 3e5d3ca82193e8e8561beb9ceac9982f376d84e2 (patch) | |
| tree | 76e4c260123b68b93da2417482024ba11f9838ee /archive/2025/summer/bsc_gerg/tools/scrap_railcargo_glossary.ipynb | |
| parent | a910d0a3e57f4de47cf2387ac239ae8d0eaca507 (diff) | |
| download | research-work-archive-artifacts-3e5d3ca82193e8e8561beb9ceac9982f376d84e2.tar.gz research-work-archive-artifacts-3e5d3ca82193e8e8561beb9ceac9982f376d84e2.zip | |
Add bsc_gerg
Diffstat (limited to 'archive/2025/summer/bsc_gerg/tools/scrap_railcargo_glossary.ipynb')
| -rw-r--r-- | archive/2025/summer/bsc_gerg/tools/scrap_railcargo_glossary.ipynb | 99 |
1 files changed, 99 insertions, 0 deletions
# Scrape the Rail Cargo online glossary in several languages and write one
# CSV file per language containing the (term, definition) pairs.
#
# Notebook: archive/2025/summer/bsc_gerg/tools/scrap_railcargo_glossary.ipynb

# %% Imports (single cell: stdlib, then third-party)
# `defaultdict` comes from `collections`; the previous import of the same name
# from `spacy.matcher.dependencymatcher` pulled in spaCy for no reason.
from collections import defaultdict

import pandas as pd
import requests
from bs4 import BeautifulSoup

# %% Configuration
languages = ["de", "en", "hu", "cs", "it", "sr"]

GLOSSARY_URL = "https://www.railcargo.com/{lang}/glossar"
REQUEST_TIMEOUT_S = 30  # requests has no default timeout and would block forever


# %% Helpers
def fetch_glossary(lang, timeout=REQUEST_TIMEOUT_S):
    """Download and parse the glossary page for one language.

    Args:
        lang: Language code inserted into ``GLOSSARY_URL``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page parsed with BeautifulSoup's ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(GLOSSARY_URL.format(lang=lang), timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty glossary.
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def extract_definitions(glossary_doc):
    """Collect ``(title, definition)`` tuples from a parsed glossary page.

    Args:
        glossary_doc: Parsed glossary page (a BeautifulSoup document).

    Returns:
        A list of ``(title, definition)`` string tuples, whitespace-stripped,
        in document order. Entries for which no heading or description can be
        located are skipped.
    """
    pairs = []
    for entry in glossary_doc.find_all("div", class_="rcg-glossary-term"):
        # NOTE(review): find_next() searches forward in document order and is
        # not limited to `entry`, so an entry without its own <h3>/description
        # would pick up the following entry's. Presumably every entry carries
        # both -- confirm against the live markup before switching to find().
        heading = entry.find_next("h3")
        description = entry.find_next("div", class_="rcg-glossary-term__description")
        if heading is None or description is None:
            # Nothing follows this entry; avoid AttributeError on `.text`.
            continue
        pairs.append((heading.text.strip(), description.text.strip()))
    return pairs


# %% Download one page per language (network access)
glossaries = [fetch_glossary(lang) for lang in languages]

# %% Extract the term/definition pairs per language
definitions = defaultdict(list)
for lang, glossary_doc in zip(languages, glossaries):
    pairs = extract_definitions(glossary_doc)
    if not pairs:
        # No key is created for an empty language, so no CSV is written for it.
        print(f"warning: no glossary entries found for {lang!r}")
        continue
    definitions[lang].extend(pairs)

# %% Export: "<lang>-glossary.csv" in the current working directory
# The frame has no column names, so the CSV header holds pandas' default
# integer labels (0 = title, 1 = definition) next to the row index.
for lang, defs in definitions.items():
    pd.DataFrame(defs).to_csv(f"{lang}-glossary.csv", index=True)