about summary refs log tree commit diff stats
path: root/archive/2025/summer/bsc_gerg/tools/scrap_railcargo_glossary.ipynb
blob: e90d9e93940ad5b25a916544f92f5728c37fba80 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2025-05-29T15:55:02.984855Z",
     "start_time": "2025-05-29T15:55:02.922109Z"
    }
   },
   "source": [
    "# defaultdict is a standard-library class: import it from collections rather than\n",
    "# from a private spaCy module, so the notebook does not require spaCy to run.\n",
    "from collections import defaultdict\n",
    "\n",
    "import requests as r\n",
    "from bs4 import BeautifulSoup"
   ],
   "outputs": [],
   "execution_count": 1
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-29T16:07:48.398378Z",
     "start_time": "2025-05-29T16:07:39.187954Z"
    }
   },
   "cell_type": "code",
   "source": [
    "languages = [\"de\", \"en\", \"hu\", \"cs\", \"it\", \"sr\"]\n",
    "\n",
    "# Seconds to wait for the server before giving up; requests has no default timeout\n",
    "# and would otherwise block forever on a stalled connection.\n",
    "REQUEST_TIMEOUT_S = 30\n",
    "\n",
    "\n",
    "def fetch_glossary(lang, timeout=REQUEST_TIMEOUT_S):\n",
    "    \"\"\"Download the glossary page for one language code and parse it.\n",
    "\n",
    "    Args:\n",
    "        lang: Language code inserted into the URL path (e.g. \"de\").\n",
    "        timeout: Seconds passed to requests as the request timeout.\n",
    "\n",
    "    Returns:\n",
    "        A BeautifulSoup document built from the response body with \"html.parser\".\n",
    "\n",
    "    Raises:\n",
    "        requests.HTTPError: If the server answers with a 4xx/5xx status.\n",
    "        requests.Timeout: If the server does not answer within `timeout` seconds.\n",
    "    \"\"\"\n",
    "    response = r.get(f\"https://www.railcargo.com/{lang}/glossar\", timeout=timeout)\n",
    "    # Fail loudly instead of parsing an error page into an (empty) glossary.\n",
    "    response.raise_for_status()\n",
    "    return BeautifulSoup(response.text, \"html.parser\")\n",
    "\n",
    "\n",
    "# One parsed document per entry of `languages`, in the same order.\n",
    "glossaries = [fetch_glossary(lang) for lang in languages]"
   ],
   "id": "ebb676fd7a9aecc9",
   "outputs": [],
   "execution_count": 11
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-29T16:10:08.647656Z",
     "start_time": "2025-05-29T16:10:08.601711Z"
    }
   },
   "cell_type": "code",
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "\n",
    "def _term_and_description(term_div):\n",
    "    \"\"\"Return the stripped (heading, description) text pair for one glossary term element.\"\"\"\n",
    "    # find_next searches forward in document order starting after `term_div`.\n",
    "    heading = term_div.find_next(\"h3\")\n",
    "    body = term_div.find_next(\"div\", class_=\"rcg-glossary-term__description\")\n",
    "    return heading.text.strip(), body.text.strip()\n",
    "\n",
    "\n",
    "# Maps language code -> list of (title, definition) tuples in page order.\n",
    "definitions = defaultdict(list)\n",
    "for lang, glossary_doc in zip(languages, glossaries):\n",
    "    term_divs = glossary_doc.find_all(\"div\", class_=\"rcg-glossary-term\")\n",
    "    for term_div in term_divs:\n",
    "        definitions[lang].append(_term_and_description(term_div))"
   ],
   "id": "6e450f214fc0b8b1",
   "outputs": [],
   "execution_count": 20
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-29T16:11:35.732104Z",
     "start_time": "2025-05-29T16:11:35.469701Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Write one CSV per scraped language into the current working directory.\n",
    "for lang in definitions:\n",
    "    csv_path = f\"{lang}-glossary.csv\"\n",
    "    glossary_frame = pd.DataFrame(definitions[lang])\n",
    "    # index=True keeps the row number as the first CSV column.\n",
    "    glossary_frame.to_csv(csv_path, index=True)"
   ],
   "id": "fa0e0d9a45363408",
   "outputs": [],
   "execution_count": 23
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}