author     Martin Fink <martin@finkmartin.com>    2025-09-11 09:19:48 +0200
committer  GitHub <noreply@github.com>            2025-09-11 09:19:48 +0200
commit     17af5f6fc0538f615b8612dcd2cb77c2affad63f (patch)
tree       76e4c260123b68b93da2417482024ba11f9838ee /archive/2025/summer/bsc_gerg/experiments.ipynb
parent     a910d0a3e57f4de47cf2387ac239ae8d0eaca507 (diff)
parent     3e5d3ca82193e8e8561beb9ceac9982f376d84e2 (diff)
download   research-work-archive-artifacts-17af5f6fc0538f615b8612dcd2cb77c2affad63f.tar.gz
           research-work-archive-artifacts-17af5f6fc0538f615b8612dcd2cb77c2affad63f.zip
Merge pull request #10 from walamana/main
Add bsc_gerg
Diffstat (limited to 'archive/2025/summer/bsc_gerg/experiments.ipynb')
-rw-r--r--  archive/2025/summer/bsc_gerg/experiments.ipynb | 233
1 files changed, 233 insertions, 0 deletions
diff --git a/archive/2025/summer/bsc_gerg/experiments.ipynb b/archive/2025/summer/bsc_gerg/experiments.ipynb
new file mode 100644
index 000000000..1bd3669ca
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/experiments.ipynb
@@ -0,0 +1,233 @@
+{
+ "cells": [
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "# Similarity Comparison for Tests",
+   "id": "b6d4f9b05f293ec8"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "## Fuzzy string matching",
+   "id": "87c701ae2678e5db"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": "# TODO",
+   "id": "23c3f92212402a87"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "## Sentence Embeddings",
+   "id": "275e303877d17c08"
+  },
+  {
+   "cell_type": "code",
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true,
+    "ExecuteTime": {
+     "end_time": "2025-06-30T09:41:56.203596Z",
+     "start_time": "2025-06-30T09:41:54.449778Z"
+    }
+   },
+   "source": [
+    "\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "\n",
+    "# The \"all-MiniLM-L6-v2\" model is used for demonstration\n",
+    "model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
+    "\n",
+    "expected = [\"Rangieren\", \"420\", \"Gleis\", \"Abstellgleis\", \"auf Sicht fahren\", \"Signal\"]\n",
+    "actual = [\"Rangieren\", \"420er\", \"Gleis\", \"Abstellgleis\", \"auf Sicht fahren\", \"Signal\"]\n",
+    "\n",
+    "sentences = expected + actual\n",
+    "\n",
+    "embeddings = model.encode(sentences)\n",
+    "\n",
+    "similarities = model.similarity(embeddings, embeddings)\n",
+    "for i in range(len(expected)):\n",
+    "    print(f\"Expected: {expected[i]}, Actual: {actual[i]}, Similarity: {similarities[i][i+len(expected)]}\")"
+   ],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Expected: Rangieren, Actual: Rangieren, Similarity: 1.0\n",
+      "Expected: 420, Actual: 420er, Similarity: 0.8273268938064575\n",
+      "Expected: Gleis, Actual: Gleis, Similarity: 1.0000003576278687\n",
+      "Expected: Abstellgleis, Actual: Abstellgleis, Similarity: 1.000000238418579\n",
+      "Expected: auf Sicht fahren, Actual: auf Sicht fahren, Similarity: 1.0\n",
+      "Expected: Signal, Actual: Signal, Similarity: 1.0\n"
+     ]
+    }
+   ],
+   "execution_count": 3
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "## Assessment using LLMs",
+   "id": "dff194fdcd03c59c"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-06-30T13:07:07.597774Z",
+     "start_time": "2025-06-30T13:07:07.582024Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "import random\n",
+    "import numpy as np\n",
+    "from openai import OpenAI\n",
+    "from dotenv import load_dotenv\n",
+    "load_dotenv()\n",
+    "\n",
+    "client = OpenAI()\n",
+    "input_text = \"Rangiere mir bitte mal den 420er von Gleis 3 auf das Abstellgleis. Passt auf, du musst auf Sicht bis zu den Signalen fahren.\"\n",
+    "expected_terms = [\"Rangieren\", \"420\", \"Abstellgleis\", \"auf Sicht fahren\", \"Signal\"]\n",
+    "actual_terms =   [\"Rangieren\", \"420er\", \"Abstellgleis\", \"auf Sicht fahren\", \"Signal\"]\n",
+    "\n",
+    "def run_test(shuffle=False):\n",
+    "    expected = expected_terms.copy()\n",
+    "    actual = actual_terms.copy()\n",
+    "    if shuffle:\n",
+    "        random.shuffle(expected)\n",
+    "        random.shuffle(actual)\n",
+    "        print(f\"Expected: {expected}, Actual: {actual}\")\n",
+    "    response = client.chat.completions.create(\n",
+    "        model=\"gpt-4o-mini\",\n",
+    "        response_format={\"type\": \"text\"},\n",
+    "        temperature=0,\n",
+    "        top_p=0,\n",
+    "        messages=[\n",
+    "            {\n",
+    "                \"role\": \"user\",\n",
+    "                \"content\": \"Bewerte die Ähnlichkeit der Ergebnisse der Term Extraktion. Gegeben ist ein Ausgangstext, \"\n",
+    "                     \"aus dem Fachbegriffe extrahiert werden mussten. Der Text ist gegeben. Darunter stehen die erwarteten Begriffe, \"\n",
+    "                     \"die extrahiert werden sollten. Zum Schluss stehen die tatsächlich extrahierten Begriffe. \"\n",
+    "                     \"Bewerte die Ähnlichkeit der extrahierten Begriffe. \"\n",
+    "                     \"Ignoriere die Reihenfolge der Begriffe. \"\n",
+    "                     \"Sobald sich ein Begriff grundlegend unterscheidet, beende sofort mit FALSE. \"\n",
+    "                     \"Wenn ein erwarteter Begriff gänzlich fehlt, beende sofort mit FALSE. \"\n",
+    "                     \"Wenn ein Begriff extrahiert wurde, der sicher kein Fachbegriff ist, beende sofort mit FALSE. \"\n",
+    "                     \"Ansonsten ende sofort mit TRUE. \"\n",
+    "                     \"Bewerte die extrahierten Begriffe.\"\n",
+    "            },\n",
+    "            {\n",
+    "                \"role\": \"user\",\n",
+    "                \"content\": f\"\"\"{input_text}\\n\\nErwartete Begriffe: {\", \".join(expected)}\\n\\nTatsächliche Begriffe: {\", \".join(actual)}\"\"\"\n",
+    "            }\n",
+    "        ],\n",
+    "        logprobs=True,\n",
+    "        seed=42,\n",
+    "        store=False,\n",
+    "        top_logprobs=5,\n",
+    "        frequency_penalty=0,\n",
+    "        presence_penalty=0,\n",
+    "    )\n",
+    "    result, logprobs = response.choices[0].message.content, response.choices[0].logprobs\n",
+    "    # Get probabilities for TRUE and FALSE from the first generated token (the reply is expected to be a single TRUE/FALSE token)\n",
+    "    probs = {token.token: float(np.exp(token.logprob)) for token in logprobs.content[0].top_logprobs if token.token == \"TRUE\" or token.token == \"FALSE\"}\n",
+    "    # Normalize the probabilities to only account for TRUE and FALSE\n",
+    "    # This might actually distort the result, as other, maybe more likely tokens are ignored (however, such results can be considered faulty)\n",
+    "    total_end = sum(probs.values())\n",
+    "    normalized_probs = {token: value / total_end for token, value in probs.items()}\n",
+    "    print(f\"Probability for success of test {normalized_probs['TRUE']}\")"
+   ],
+   "id": "6924f5f7741325d0",
+   "outputs": [],
+   "execution_count": 49
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-06-30T13:07:13.049204Z",
+     "start_time": "2025-06-30T13:07:09.535572Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "for i in range(5):\n",
+    "    run_test(shuffle=False)"
+   ],
+   "id": "235b5091f0936d3e",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Probability for success of test 0.2942149597859341\n",
+      "Probability for success of test 0.46879062662624377\n",
+      "Probability for success of test 0.2942149597859341\n",
+      "Probability for success of test 0.26894142136999516\n",
+      "Probability for success of test 0.24508501864634824\n"
+     ]
+    }
+   ],
+   "execution_count": 50
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-06-30T13:07:16.452505Z",
+     "start_time": "2025-06-30T13:07:14.379517Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "for i in range(5):\n",
+    "    run_test(shuffle=True)"
+   ],
+   "id": "6e5159dd554e4469",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Expected: ['420', 'auf Sicht fahren', 'Rangieren', 'Abstellgleis', 'Signal'], Actual: ['Abstellgleis', 'Rangieren', 'Signal', '420er', 'auf Sicht fahren']\n",
+      "Probability for success of test 0.2689414096510109\n",
+      "Expected: ['Rangieren', 'auf Sicht fahren', '420', 'Signal', 'Abstellgleis'], Actual: ['Rangieren', '420er', 'auf Sicht fahren', 'Abstellgleis', 'Signal']\n",
+      "Probability for success of test 0.053403330553099\n",
+      "Expected: ['Rangieren', 'auf Sicht fahren', 'Signal', '420', 'Abstellgleis'], Actual: ['Signal', '420er', 'Abstellgleis', 'Rangieren', 'auf Sicht fahren']\n",
+      "Probability for success of test 0.9820137910906878\n",
+      "Expected: ['Abstellgleis', '420', 'auf Sicht fahren', 'Signal', 'Rangieren'], Actual: ['420er', 'auf Sicht fahren', 'Signal', 'Abstellgleis', 'Rangieren']\n",
+      "Probability for success of test 0.8807970779778824\n",
+      "Expected: ['Rangieren', 'Signal', '420', 'auf Sicht fahren', 'Abstellgleis'], Actual: ['Rangieren', 'auf Sicht fahren', 'Signal', '420er', 'Abstellgleis']\n",
+      "Probability for success of test 0.09534946618445304\n"
+     ]
+    }
+   ],
+   "execution_count": 51
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
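The "Fuzzy string matching" cell above is committed as a bare "# TODO". As a hedged sketch only, not the author's planned implementation, the standard library's difflib.SequenceMatcher already yields a character-level similarity ratio for the same expected/actual term pairs used in the sentence-embedding cell:

# Hypothetical fill-in for the notebook's "# TODO" fuzzy string matching cell.
# Only the standard library is used; the term lists are copied from the
# sentence-embedding cell above.
from difflib import SequenceMatcher

expected = ["Rangieren", "420", "Gleis", "Abstellgleis", "auf Sicht fahren", "Signal"]
actual = ["Rangieren", "420er", "Gleis", "Abstellgleis", "auf Sicht fahren", "Signal"]

def fuzzy_ratio(a: str, b: str) -> float:
    """Character-level similarity in [0, 1]; 1.0 means the strings are identical."""
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

for exp, act in zip(expected, actual):
    print(f"Expected: {exp}, Actual: {act}, Similarity: {fuzzy_ratio(exp, act):.3f}")

A dedicated library such as rapidfuzz would give comparable ratios with more matching options; difflib is used here only because it needs no extra dependency.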
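The normalization step in the LLM-assessment cell (exponentiating the logprobs of the TRUE and FALSE candidates and dividing by their sum) can also be illustrated offline. The sketch below is a hypothetical, API-free rework of that step using invented logprob values rather than a real OpenAI response; the helper name normalized_true_probability is introduced here for illustration and does not appear in the notebook.

import math

def normalized_true_probability(top_logprobs: dict[str, float]) -> float:
    """Convert token logprobs to probabilities and renormalize over {TRUE, FALSE}.

    top_logprobs maps candidate tokens at the first generated position to their
    log probabilities. As the notebook's own comment notes, ignoring every other
    candidate token can distort the result if the model prefers some third token.
    """
    probs = {tok: math.exp(lp) for tok, lp in top_logprobs.items() if tok in ("TRUE", "FALSE")}
    total = sum(probs.values())
    return probs.get("TRUE", 0.0) / total if total > 0 else float("nan")

# Invented example: P(TRUE) = e^-1.25 / (e^-1.25 + e^-0.35) ≈ 0.289
print(normalized_true_probability({"TRUE": -1.25, "FALSE": -0.35, "true": -2.9}))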