# archive/2025/summer/bsc_gerg/tests/test_integration.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148

import random
from unittest import TestCase

import numpy as np
from fastapi.testclient import TestClient

from src.main import app
from tests.util import create_completion_openai_sync

# Prompt switch for testExtractDomainTerminology_LLM: when True, the judge prompt
# additionally contains " sofort" after "beende"/"Ende" and the sentence
# "Antworte sofort." (German for "immediately" / "Answer immediately.").
# NOTE(review): presumably meant to stop the model from writing out reasoning
# before its TRUE/FALSE verdict - confirm with the author.
REDUCE_REASONING = False


class TestIntegrationTerminology(TestCase):
    """Integration tests for the ``/extractTerminology`` endpoint of ``app``.

    Both tests send the same short German dialogue through a ``TestClient``
    and inspect the ``"terms"`` list of the JSON response. The first test
    compares the extracted terms against hand-written expectations; the
    second one lets an LLM judge the similarity and evaluates the probability
    the model assigns to its final ``TRUE`` verdict.
    """

    # Dialogue fixture: the last utterance is posted as "text", all earlier
    # utterances are joined with newlines and posted as "context".
    _CONVERSATION = (
        "Servus Zofia!",
        "Hallo Markus.",
        "Rangiere mir bitte mal den 420er von Gleis 3 auf das Abstellgleis. Passt auf, du musst auf Sicht bis zu den Signalen fahren.",
    )

    # Number of independent LLM judge calls in the *_LLM test.
    _JUDGE_RUNS = 5

    # How many trailing output tokens are searched for the TRUE/FALSE verdict.
    _VERDICT_WINDOW = 5

    def setUp(self):
        """Create a fresh test client for the application before every test."""
        self.client = TestClient(app)

    def _extract_terms(self, conversation):
        """POST *conversation* to ``/extractTerminology`` and return the term strings.

        Args:
            conversation: Sequence of utterances; the last one is sent as
                ``text``, the preceding ones (newline-joined) as ``context``.

        Returns:
            A list with one string per returned term: its ``normalization``
            if that value is truthy, otherwise its ``text``.

        Raises:
            Whatever ``response.raise_for_status()`` raises for an error
            status, or an assertion failure if the response JSON has no
            ``"terms"`` key.
        """
        response = self.client.post("/extractTerminology", json={
            "text": conversation[-1],
            "context": "\n".join(conversation[:-1]),
        })
        response.raise_for_status()

        payload = response.json()
        self.assertIn("terms", payload)
        return [term["normalization"] or term["text"] for term in payload["terms"]]

    @staticmethod
    def _missing_terms(expected_variations, actual_terms):
        """Return the expected entries for which no variation was extracted.

        Args:
            expected_variations: List of lists; each inner list holds the
                accepted spellings of one expected term.
            actual_terms: The extracted term strings.

        Returns:
            The inner lists of which not a single variation occurs
            (case-insensitively) in *actual_terms*.
        """
        actual_lower = {term.lower() for term in actual_terms}
        return [
            variations
            for variations in expected_variations
            if not any(variation.lower() in actual_lower for variation in variations)
        ]

    @staticmethod
    def _verdict_probabilities(token_logprobs, window):
        """Find the final TRUE/FALSE verdict token and normalize its probabilities.

        The last *window* tokens are scanned from the end. A token counts as
        the verdict when its (stripped) text contains ``TRUE`` or ``FALSE``
        and its ``top_logprobs`` list contains both the exact tokens
        ``"TRUE"`` and ``"FALSE"``.

        Args:
            token_logprobs: Sequence of per-token entries exposing ``token``
                and ``top_logprobs`` (each alternative exposing ``token`` and
                ``logprob``).
            window: Number of trailing tokens to inspect.

        Returns:
            A dict mapping ``"TRUE"`` and ``"FALSE"`` to probabilities that
            sum to 1, or ``None`` when no suitable token was found.
        """
        for entry in reversed(token_logprobs[-window:]):
            emitted = entry.token.strip()
            if "TRUE" not in emitted and "FALSE" not in emitted:
                continue

            alternatives = {alt.token: float(np.exp(alt.logprob)) for alt in entry.top_logprobs}
            if "TRUE" not in alternatives or "FALSE" not in alternatives:
                # Without both alternatives the two-way normalization is
                # impossible; keep looking at earlier tokens.
                continue

            verdict = {
                token: prob
                for token, prob in alternatives.items()
                if token in ("TRUE", "FALSE")
            }
            total = sum(verdict.values())
            return {token: prob / total for token, prob in verdict.items()}
        return None

    def testExtractDomainTerminology(self):
        """Check the extracted terms against expectations that allow manually added variations."""
        expected_terms = [
            ["Rangieren"],
            ["420", "420er"],   # The variation "420er" was added after evaluation of the test results, as it is also a valid term
            ["Abstellgleis"],
            ["auf Sicht fahren"],
            ["Signal"],
        ]

        terms = self._extract_terms(self._CONVERSATION)

        missing_terms = self._missing_terms(expected_terms, terms)
        if missing_terms:
            self.fail(f"Missing the following terms in the response: {missing_terms}")

    def testExtractDomainTerminology_LLM(self):
        """Let an LLM judge the extracted terms and assert on its P(TRUE) statistics.

        The judge is queried ``_JUDGE_RUNS`` times with freshly shuffled term
        lists. For every run that ends in a usable TRUE/FALSE verdict, the
        normalized probability of ``TRUE`` is collected; the test then asserts
        on the variance, mean and minimum of those probabilities.
        """
        expected_terms = [
            "Rangieren",
            "420",
            "Abstellgleis",
            "Fahrt auf Sicht",
            "Signal",
        ]

        terms = self._extract_terms(self._CONVERSATION)
        utterance = self._CONVERSATION[-1]

        no_reasoning = " sofort" if REDUCE_REASONING else ""

        # The instruction prompt does not depend on the run, so build it once.
        # NOTE(review): several sentences are concatenated without a separating
        # space. The text is kept byte-identical on purpose so that the judge's
        # behaviour does not shift.
        instructions = (
            "Bewerte die Ähnlichkeit der Ergebnisse der Term Extraktion. Gegeben ist ein Ausgangstext, "
            "aus dem Fachbegriffe extrahiert werden mussten. Der Text ist gegeben. Darunter stehen die erwarteten Begriffe, "
            "die extrahiert werden sollten. Zum Schluss stehen die tatsächlich extrahierten Begriffe. "
            "Bewerte die Ähnlichkeit der extrahierten Begriffe."
            "Nur sprachliche Variationen für einen erwarteten Begriff sind erlaubt."
            f"Gibt es für einen erwarteten Begriff keinen ähnlichen extrahierten Begriff, beende{no_reasoning} mit FALSE."
            f"Wenn ein erwarteter Begriff gänzlich fehlt, beende{no_reasoning} mit FALSE."
            f"Wenn ein Begriff extrahiert wurde, der sicher kein Fachbegriff ist, beende{no_reasoning} mit FALSE."
            "Wenn ein Begriff extrahiert wurde, der nicht erwartet wurde, ignoriere diesen. Dies gilt nicht als Unterschied."
            f"{'Antworte sofort.' if REDUCE_REASONING else ''}"
            f"Ansonsten Ende{no_reasoning} mit TRUE."
            "Bewerte die extrahierten Begriffe."
        )

        probs_all = []

        for i in range(self._JUDGE_RUNS):
            print(f"##### TEST {i} #####")
            # Note: shuffling the results changed the outcome significantly
            random.shuffle(expected_terms)
            random.shuffle(terms)

            expected_listing = ", ".join(expected_terms)
            actual_listing = ", ".join(terms)
            response, logprobs = create_completion_openai_sync(
                messages=[
                    ("user", instructions),
                    (
                        "user",
                        f"{utterance}\n\nErwartete Begriffe: {expected_listing}"
                        f"\n\nTatsächliche Begriffe: {actual_listing}",
                    ),
                ],
                logprobs=True
            )

            print(f"{response}")

            normalized_probs = self._verdict_probabilities(logprobs.content, self._VERDICT_WINDOW)
            if normalized_probs is None:
                # As before, such a run does not contribute to the statistics,
                # but it is now reported instead of vanishing silently.
                print(f"No usable TRUE/FALSE verdict among the last {self._VERDICT_WINDOW} tokens of run {i}; run skipped")
            else:
                print(normalized_probs)
                probs_all.append(normalized_probs["TRUE"])
            print("")

        if not probs_all:
            # Without this guard min()/mean below would raise on the empty
            # list and the test would error out without a diagnosis.
            self.fail(
                "None of the judge runs ended in a TRUE/FALSE verdict whose top "
                "logprobs contain both 'TRUE' and 'FALSE'; cannot compute statistics."
            )

        min_prob = min(probs_all)
        max_prob = max(probs_all)
        avg = sum(probs_all) / len(probs_all)
        var = np.var(probs_all)

        print(f"min: {min_prob}, max: {max_prob}, avg: {avg}, var: {var}")

        self.assertLess(var, 0.05)
        self.assertGreater(avg, 0.8)
        self.assertGreater(min_prob, 0.75)