{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "glossary-intro",
   "metadata": {},
   "source": [
    "# Rail Cargo glossary scraper\n",
    "\n",
    "Downloads the glossary page of `www.railcargo.com` for each configured language code, extracts every term together with its description, and writes one `<lang>-glossary.csv` file into the working directory for each language that yielded entries."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "initial_id",
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "import pandas as pd\n",
    "import requests\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "glossary-config",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Language codes that are substituted into the glossary URL.\n",
    "languages = [\"de\", \"en\", \"hu\", \"cs\", \"it\", \"sr\"]\n",
    "\n",
    "GLOSSARY_URL = \"https://www.railcargo.com/{lang}/glossar\"\n",
    "\n",
    "# Seconds. Without an explicit timeout, requests waits indefinitely for a stalled server.\n",
    "REQUEST_TIMEOUT_S = 30"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "glossary-fetch-header",
   "metadata": {},
   "source": [
    "## Fetch the glossary pages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "glossary-fetch-def",
   "metadata": {},
   "outputs": [],
   "source": [
    "def fetch_glossary(lang: str) -> BeautifulSoup:\n",
    "    \"\"\"Download and parse the glossary page for one language code.\n",
    "\n",
    "    Args:\n",
    "        lang: Language code inserted into ``GLOSSARY_URL``.\n",
    "\n",
    "    Returns:\n",
    "        The page parsed with the ``html.parser`` backend.\n",
    "\n",
    "    Raises:\n",
    "        requests.HTTPError: If the server answers with a 4xx/5xx status.\n",
    "        requests.Timeout: If the server does not respond within ``REQUEST_TIMEOUT_S``.\n",
    "    \"\"\"\n",
    "    response = requests.get(GLOSSARY_URL.format(lang=lang), timeout=REQUEST_TIMEOUT_S)\n",
    "    # Fail loudly instead of parsing an error page into a silently empty glossary.\n",
    "    response.raise_for_status()\n",
    "    return BeautifulSoup(response.text, \"html.parser\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ebb676fd7a9aecc9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One parsed document per entry of `languages`, in the same order.\n",
    "glossaries = [fetch_glossary(lang) for lang in languages]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "glossary-extract-header",
   "metadata": {},
   "source": [
    "## Extract terms and descriptions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e450f214fc0b8b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Maps language code -> list of (title, definition) tuples in page order.\n",
    "definitions = defaultdict(list)\n",
    "for lang, glossary_doc in zip(languages, glossaries):\n",
    "    for entry in glossary_doc.find_all(\"div\", class_=\"rcg-glossary-term\"):\n",
    "        # find_next() searches forward in document order starting at `entry`,\n",
    "        # so it is not restricted to the entry's own descendants.\n",
    "        title = entry.find_next(\"h3\").text.strip()\n",
    "        definition = entry.find_next(\"div\", class_=\"rcg-glossary-term__description\").text.strip()\n",
    "        definitions[lang].append((title, definition))\n",
    "\n",
    "# Number of entries found per language, as a quick sanity check.\n",
    "{lang: len(defs) for lang, defs in definitions.items()}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "glossary-export-header",
   "metadata": {},
   "source": [
    "## Export to CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa0e0d9a45363408",
   "metadata": {},
   "outputs": [],
   "source": [
    "# No column names are passed, so the CSV header carries pandas' default\n",
    "# positional labels (0, 1); index=True also writes the row index as the first column.\n",
    "for lang, defs in definitions.items():\n",
    "    pd.DataFrame(defs).to_csv(f\"{lang}-glossary.csv\", index=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}