about summary refs log tree commit diff stats
path: root/archive/2025/summer/bsc_gerg/tools/scrap_railcargo_glossary.ipynb
diff options
context:
space:
mode:
Diffstat (limited to 'archive/2025/summer/bsc_gerg/tools/scrap_railcargo_glossary.ipynb')
-rw-r--r--archive/2025/summer/bsc_gerg/tools/scrap_railcargo_glossary.ipynb99
1 files changed, 99 insertions, 0 deletions
diff --git a/archive/2025/summer/bsc_gerg/tools/scrap_railcargo_glossary.ipynb b/archive/2025/summer/bsc_gerg/tools/scrap_railcargo_glossary.ipynb
new file mode 100644
index 000000000..e90d9e939
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/tools/scrap_railcargo_glossary.ipynb
@@ -0,0 +1,99 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true,
+    "ExecuteTime": {
+     "end_time": "2025-05-29T15:55:02.984855Z",
+     "start_time": "2025-05-29T15:55:02.922109Z"
+    }
+   },
+   "source": [
+    "# Standard library\n",
+    "from collections import defaultdict\n",
+    "\n",
+    "# Third-party\n",
+    "import requests as r\n",
+    "from bs4 import BeautifulSoup"
+   ],
+   "outputs": [],
+   "execution_count": 1
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-05-29T16:07:48.398378Z",
+     "start_time": "2025-05-29T16:07:39.187954Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "languages = [\"de\", \"en\", \"hu\", \"cs\", \"it\", \"sr\"]\n",
+    "\n",
+    "\n",
+    "def fetch_glossary(lang, timeout=30):\n",
+    "    \"\"\"Download and parse the Rail Cargo glossary page for one language code.\n",
+    "\n",
+    "    Raises requests.HTTPError on a 4xx/5xx response and requests.Timeout if the\n",
+    "    server does not answer within `timeout` seconds.\n",
+    "    \"\"\"\n",
+    "    # Without a timeout a stalled connection would block the kernel indefinitely.\n",
+    "    response = r.get(f\"https://www.railcargo.com/{lang}/glossar\", timeout=timeout)\n",
+    "    # Fail loudly instead of parsing an error page as if it were a glossary.\n",
+    "    response.raise_for_status()\n",
+    "    return BeautifulSoup(response.text, \"html.parser\")\n",
+    "\n",
+    "\n",
+    "# One parsed document per entry in `languages`, in the same order.\n",
+    "glossaries = [fetch_glossary(lang) for lang in languages]"
+   ],
+   "id": "ebb676fd7a9aecc9",
+   "outputs": [],
+   "execution_count": 11
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-05-29T16:10:08.647656Z",
+     "start_time": "2025-05-29T16:10:08.601711Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "from collections import defaultdict\n",
+    "\n",
+    "# Maps language code -> list of (term, description) tuples in page order.\n",
+    "definitions = defaultdict(list)\n",
+    "for lang, page in zip(languages, glossaries):\n",
+    "    term_blocks = page.find_all(\"div\", class_=\"rcg-glossary-term\")\n",
+    "    for term_block in term_blocks:\n",
+    "        # NOTE(review): find_next() is not limited to this div's descendants;\n",
+    "        # confirm every term block carries its own <h3> and description.\n",
+    "        heading = term_block.find_next(\"h3\")\n",
+    "        description = term_block.find_next(\"div\", class_=\"rcg-glossary-term__description\")\n",
+    "        definitions[lang].append((heading.text.strip(), description.text.strip()))"
+   ],
+   "id": "6e450f214fc0b8b1",
+   "outputs": [],
+   "execution_count": 20
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-05-29T16:11:35.732104Z",
+     "start_time": "2025-05-29T16:11:35.469701Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# Write one CSV per scraped language, using a path relative to the working directory.\n",
+    "for lang in definitions:\n",
+    "    term_rows = pd.DataFrame(definitions[lang])\n",
+    "    term_rows.to_csv(f\"{lang}-glossary.csv\", index=True)"
+   ],
+   "id": "fa0e0d9a45363408",
+   "outputs": [],
+   "execution_count": 23
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}