1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
|
{
"cells": [
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2025-05-29T15:55:02.984855Z",
"start_time": "2025-05-29T15:55:02.922109Z"
}
},
"source": [
"# Standard library\n",
"from collections import defaultdict\n",
"\n",
"# Third-party: HTTP client (aliased as `r`) and HTML parser\n",
"import requests as r\n",
"from bs4 import BeautifulSoup"
],
"outputs": [],
"execution_count": 1
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-05-29T16:07:48.398378Z",
"start_time": "2025-05-29T16:07:39.187954Z"
}
},
"cell_type": "code",
"source": [
"# Language codes that are substituted into the glossary URL below.\n",
"languages = [\"de\", \"en\", \"hu\", \"cs\", \"it\", \"sr\"]\n",
"\n",
"# Seconds to wait on each request; without a timeout requests can block indefinitely.\n",
"REQUEST_TIMEOUT_S = 30\n",
"\n",
"# Download and parse one glossary page per language; glossaries[i] belongs to languages[i].\n",
"glossaries = []\n",
"for lang in languages:\n",
"    response = r.get(f\"https://www.railcargo.com/{lang}/glossar\", timeout=REQUEST_TIMEOUT_S)\n",
"    # Raise on 4xx/5xx so an error page is never parsed as if it were a glossary.\n",
"    response.raise_for_status()\n",
"    glossaries.append(BeautifulSoup(response.text, \"html.parser\"))"
],
"id": "ebb676fd7a9aecc9",
"outputs": [],
"execution_count": 11
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-05-29T16:10:08.647656Z",
"start_time": "2025-05-29T16:10:08.601711Z"
}
},
"cell_type": "code",
"source": [
"from collections import defaultdict\n",
"\n",
"# Maps language code -> list of (title, definition) tuples in document order.\n",
"# A language only gets a key once at least one entry has been collected for it.\n",
"definitions = defaultdict(list)\n",
"for lang, glossary_doc in zip(languages, glossaries):\n",
"    for entry in glossary_doc.find_all(\"div\", class_=\"rcg-glossary-term\"):\n",
"        # NOTE(review): find_next() searches forward through the rest of the document,\n",
"        # not only inside `entry`. If an entry lacked its own <h3> or description, the\n",
"        # next entry's would be picked up instead. Confirm against the page markup\n",
"        # whether entry.find(...) is the intended lookup.\n",
"        heading = entry.find_next(\"h3\")\n",
"        description = entry.find_next(\"div\", class_=\"rcg-glossary-term__description\")\n",
"        if heading is None or description is None:\n",
"            # Nothing left in the document to pair with this entry; skip it instead of\n",
"            # failing with an AttributeError on None.\n",
"            continue\n",
"        definitions[lang].append((heading.text.strip(), description.text.strip()))"
],
"id": "6e450f214fc0b8b1",
"outputs": [],
"execution_count": 20
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-05-29T16:11:35.732104Z",
"start_time": "2025-05-29T16:11:35.469701Z"
}
},
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"# Write one CSV per collected language into the current working directory.\n",
"# No column names are passed, so pandas applies its default column labels;\n",
"# the row index is written as the first column.\n",
"for lang, term_pairs in definitions.items():\n",
"    glossary_frame = pd.DataFrame(term_pairs)\n",
"    glossary_frame.to_csv(f\"{lang}-glossary.csv\", index=True)"
],
"id": "fa0e0d9a45363408",
"outputs": [],
"execution_count": 23
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|