{ "cells": [ { "metadata": {}, "cell_type": "markdown", "source": "# Similarity Comparison for Tests", "id": "b6d4f9b05f293ec8" }, { "metadata": {}, "cell_type": "markdown", "source": "## Fuzzy string matching", "id": "87c701ae2678e5db" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "# TODO", "id": "23c3f92212402a87" }, { "metadata": {}, "cell_type": "markdown", "source": "## Sentence Embeddings", "id": "275e303877d17c08" }, { "cell_type": "code", "id": "initial_id", "metadata": { "collapsed": true, "ExecuteTime": { "end_time": "2025-06-30T09:41:56.203596Z", "start_time": "2025-06-30T09:41:54.449778Z" } }, "source": [ "\n", "from sentence_transformers import SentenceTransformer\n", "\n", "# The \"all-MiniLM-L6-v2\" model is used for demonstration\n", "model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n", "\n", "expected = [\"Rangieren\", \"420\", \"Gleis\", \"Abstellgleis\", \"auf Sicht fahren\", \"Signal\"]\n", "actual = [\"Rangieren\", \"420er\", \"Gleis\", \"Abstellgleis\", \"auf Sicht fahren\", \"Signal\"]\n", "\n", "sentences = expected + actual\n", "\n", "embeddings = model.encode(sentences)\n", "\n", "similarities = model.similarity(embeddings, embeddings)\n", "for i in range(len(expected)):\n", " print(f\"Expected: {expected[i]}, Actual: {actual[i]}, Similarity: {similarities[i][i+len(expected)]}\")" ], "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Expected: Rangieren, Actual: Rangieren, Similarity: 1.0\n", "Expected: 420, Actual: 420er, Similarity: 0.8273268938064575\n", "Expected: Gleis, Actual: Gleis, Similarity: 1.0000003576278687\n", "Expected: Abstellgleis, Actual: Abstellgleis, Similarity: 1.000000238418579\n", "Expected: auf Sicht fahren, Actual: auf Sicht fahren, Similarity: 1.0\n", "Expected: Signal, Actual: Signal, Similarity: 1.0\n" ] } ], "execution_count": 3 }, { "metadata": {}, "cell_type": "markdown", "source": "## Assesment using LLMs", "id": "dff194fdcd03c59c" }, { "metadata": { "ExecuteTime": { "end_time": "2025-06-30T13:07:07.597774Z", "start_time": "2025-06-30T13:07:07.582024Z" } }, "cell_type": "code", "source": [ "import random\n", "import numpy as np\n", "from openai import OpenAI\n", "from dotenv import load_dotenv\n", "load_dotenv()\n", "\n", "client = OpenAI()\n", "input_text = \"Rangiere mir bitte mal den 420er von Gleis 3 auf das Abstellgleis. Passt auf, du musst auf Sicht bis zu den Signalen fahren.\"\n", "expected_terms = [\"Rangieren\", \"420\", \"Abstellgleis\", \"auf Sicht fahren\", \"Signal\"]\n", "actual_terms = [\"Rangieren\", \"420er\", \"Abstellgleis\", \"auf Sicht fahren\", \"Signal\"]\n", "\n", "def run_test(shuffle=False):\n", " expected = expected_terms.copy()\n", " actual = actual_terms.copy()\n", " if shuffle:\n", " random.shuffle(expected)\n", " random.shuffle(actual)\n", " print(f\"Expected: {expected}, Actual: {actual}\")\n", " response = client.chat.completions.create(\n", " model=\"gpt-4o-mini\",\n", " response_format={\"type\": \"text\"},\n", " temperature=0,\n", " top_p=0,\n", " messages=[\n", " {\n", " \"role\": \"user\",\n", " \"content\": \"Bewerte die Ähnlichkeit der Ergebnisse der Term Extraktion. Gegeben ist ein Ausgangstext, \"\n", " \"aus dem Fachbegriffe extrahiert werden mussten. Der Text ist gegeben. Darunter stehen die erwarteten Begriffe, \"\n", " \"die extrahiert werden sollten. Zum Schluss stehen die tatsächlich extrahierten Begriffe. \"\n", " \"Bewerte die Ähnlichkeit der extrahierten Begriffe. 
\"\n", " \"Ignoriere die Reihenfolge der Begriffe.\"\n", " \"Sobald sich ein Begriff grundlegend unterscheidet, beende sofort mit FALSE.\"\n", " \"Wenn ein erwarteter Begriff gänzlich fehlt, beende sofort mit FALSE.\"\n", " \"Wenn ein Begriff extrahiert wurde, der sicher kein Fachbegriff ist, beende sofort mit FALSE.\"\n", " \"Ansonsten Ende sofort mit TRUE.\"\n", " \"Bewerte die extrahierten Begriffe.\"\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f\"\"\"{input_text}\\n\\nErwartete Begriffe: {\", \".join(expected)}\\n\\nTatsächliche Begriffe: {\", \".join(actual)}\"\"\"\n", " }\n", " ],\n", " logprobs=True,\n", " seed=42,\n", " store=False,\n", " top_logprobs=5,\n", " frequency_penalty=0,\n", " presence_penalty=0,\n", " )\n", " result, logprobs = response.choices[0].message.content, response.choices[0].logprobs\n", " # Get probabilities for TRUE and FALSE as the last token\n", " probs = {token.token: float(np.exp(token.logprob)) for token in logprobs.content[0].top_logprobs if token.token == \"TRUE\" or token.token == \"FALSE\"}\n", " # Normalize the probabilities to only account for TRUE and FALSE\n", " # This might actually distort the result, as other, maybe more likely tokens are ignored (however, such results can be considered faulty)\n", " total_end = sum(probs.values())\n", " normalized_probs = {token: value / total_end for token, value in probs.items()}\n", " print(f\"Probability for success of test {normalized_probs['TRUE']}\")" ], "id": "6924f5f7741325d0", "outputs": [], "execution_count": 49 }, { "metadata": { "ExecuteTime": { "end_time": "2025-06-30T13:07:13.049204Z", "start_time": "2025-06-30T13:07:09.535572Z" } }, "cell_type": "code", "source": [ "for i in range(5):\n", " run_test(shuffle=False)" ], "id": "235b5091f0936d3e", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Probability for success of test 0.2942149597859341\n", "Probability for success of test 0.46879062662624377\n", "Probability for success of test 0.2942149597859341\n", "Probability for success of test 0.26894142136999516\n", "Probability for success of test 0.24508501864634824\n" ] } ], "execution_count": 50 }, { "metadata": { "ExecuteTime": { "end_time": "2025-06-30T13:07:16.452505Z", "start_time": "2025-06-30T13:07:14.379517Z" } }, "cell_type": "code", "source": [ "for i in range(5):\n", " run_test(shuffle=True)" ], "id": "6e5159dd554e4469", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Expected: ['420', 'auf Sicht fahren', 'Rangieren', 'Abstellgleis', 'Signal'], Actual: ['Abstellgleis', 'Rangieren', 'Signal', '420er', 'auf Sicht fahren']\n", "Probability for success of test 0.2689414096510109\n", "Expected: ['Rangieren', 'auf Sicht fahren', '420', 'Signal', 'Abstellgleis'], Actual: ['Rangieren', '420er', 'auf Sicht fahren', 'Abstellgleis', 'Signal']\n", "Probability for success of test 0.053403330553099\n", "Expected: ['Rangieren', 'auf Sicht fahren', 'Signal', '420', 'Abstellgleis'], Actual: ['Signal', '420er', 'Abstellgleis', 'Rangieren', 'auf Sicht fahren']\n", "Probability for success of test 0.9820137910906878\n", "Expected: ['Abstellgleis', '420', 'auf Sicht fahren', 'Signal', 'Rangieren'], Actual: ['420er', 'auf Sicht fahren', 'Signal', 'Abstellgleis', 'Rangieren']\n", "Probability for success of test 0.8807970779778824\n", "Expected: ['Rangieren', 'Signal', '420', 'auf Sicht fahren', 'Abstellgleis'], Actual: ['Rangieren', 'auf Sicht fahren', 'Signal', '420er', 'Abstellgleis']\n", "Probability for success of test 0.09534946618445304\n" ] 
} ], "execution_count": 51 } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 5 }