Diffstat (limited to 'archive/2025/summer/bsc_gerg/src/knowledge/openai')
 archive/2025/summer/bsc_gerg/src/knowledge/openai/__init__.py                     |   0
 archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/__init__.py          |   0
 archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/combiner.py          |  75 +
 archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/generator.py         |  63 +
 archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/unified_generator.py | 129 +
 archive/2025/summer/bsc_gerg/src/knowledge/openai/extract.py                      |  15 +
 archive/2025/summer/bsc_gerg/src/knowledge/openai/lemmatize.py                    |  17 +
 7 files changed, 299 insertions(+), 0 deletions(-)
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/openai/__init__.py b/archive/2025/summer/bsc_gerg/src/knowledge/openai/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/openai/__init__.py
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/__init__.py b/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/__init__.py
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/combiner.py b/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/combiner.py
new file mode 100644
index 000000000..265602801
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/combiner.py
@@ -0,0 +1,75 @@
+import asyncio
+from typing import Annotated
+from uuid import UUID
+
+from pydantic import Field
+
+from src.knowledge.llm.definition.combiner import (
+    LLMDefinitionCombiner,
+    RELEVANCE_USER_PROMPT,
+    COMBINE_SYSTEM_PROMPT,
+    COMBINE_USER_PROMPT,
+)
+from src.llm import create_completion_openai
+from src.terminology.event import PartialDefinitionGenerated
+from src.terminology.models import Definition, Term
+from src.terminology.terminology import Blackboard
+
+
+class OpenAIDefinitionCombiner(LLMDefinitionCombiner):
+
+    # Combine only once at least this many partial definitions exist
+    # (threshold presumably consumed by the LLMDefinitionCombiner base class).
+    MIN_PARTIAL_DEFINITIONS: int = 3
+
+    # Per-term locks so concurrent combination runs for the same term are serialized.
+    locks: Annotated[dict[UUID, asyncio.Lock], Field(default_factory=dict)]
+    lock: asyncio.Lock = asyncio.Lock()
+
+    async def get_llm_response_relevance(self, term: str, definition: str) -> str:
+        return await create_completion_openai(
+            messages=[
+                ("user", RELEVANCE_USER_PROMPT.replace("%term%", term).replace("%definition%", definition)),
+            ],
+        )
+
+    async def get_llm_response_combine(self, term: str, definitions: str) -> str:
+        return await create_completion_openai(
+            messages=[
+                ("system", COMBINE_SYSTEM_PROMPT),
+                ("user", COMBINE_USER_PROMPT.replace("%term%", term).replace("%definitions%", definitions)),
+            ],
+        )
+
+
+if __name__ == '__main__':
+    blackboard = Blackboard()
+    combiner = OpenAIDefinitionCombiner(blackboard=blackboard)
+
+    term = Term(
+        text="Sperrfahrt",
+        normalization="Sperrfahrt",
+        occurrences=[],
+        definitions=[
+            Definition(
+                text="Eine Sperrfahrt ist ein Zug, der anstelle des Begriffs 'Zug' bezeichnet wird, wenn es sich um eine spezielle Fahrt handelt, die nicht regulär verkehrt.",
+                partial=True,
+                verified=False,
+                source=None
+            ),
+            Definition(
+                text="Eine Sperrfahrt ist ein Zug, der in Aufträgen oder Meldungen anstelle des Begriffs 'Zug' verwendet wird, wenn es sich um eine spezielle Fahrt handelt, die nicht regulär im Fahrplan enthalten ist.",
+                partial=True,
+                verified=False,
+            ),
+            Definition(
+                text="Eine Sperrfahrt ist eine spezielle Art von Zugfahrt, die anstelle des Begriffs \"Zug\" verwendet wird, wenn es sich um eine Sperrfahrt handelt.",
+                partial=True,
+                verified=False,
+            )
+        ]
+    )
+
+    async def test():
+        async for event in combiner.activate(PartialDefinitionGenerated(term=term, definition=term.definitions[2])):
+            print(f"Event: {event}")
+            print(f"Definition: {event.combined_definition}")
+
+    asyncio.run(test())
+
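A minimal sketch of what src.llm.create_completion_openai might look like, inferred only from its call sites in this diff: tuple-style (role, content) messages, an optional model override, and a (text, logprobs) pair returned when logprobs=True. The real implementation is not part of this change, and the default model name here is an assumption.

    from openai import AsyncOpenAI

    _client = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment

    async def create_completion_openai(messages, model="gpt-4o-mini", logprobs=False):
        # Translate the (role, content) tuples used throughout this diff into
        # the dict form the OpenAI SDK expects.
        kwargs = {"logprobs": True, "top_logprobs": 5} if logprobs else {}
        response = await _client.chat.completions.create(
            model=model,
            messages=[{"role": role, "content": content} for role, content in messages],
            **kwargs,
        )
        choice = response.choices[0]
        if logprobs:
            return choice.message.content, choice.logprobs
        return choice.message.content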
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/generator.py b/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/generator.py
new file mode 100644
index 000000000..381546f8d
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/generator.py
@@ -0,0 +1,63 @@
+import asyncio
+
+import numpy as np
+
+from src.knowledge.llm.definition.generator import DEVELOPER_PROMPT, LLMDefinitionGenerator
+from src.llm import create_completion_openai
+from src.terminology.terminology import OccurrenceResolved, Blackboard
+
+
+class OpenAIDefinitionGenerator(LLMDefinitionGenerator):
+
+    async def generate_definition_from_source(self, term: str, context: str) -> str | None:
+        response, log_probs = await create_completion_openai(
+            messages=[
+                ("developer", f"{DEVELOPER_PROMPT}"),
+                ("user", f"{context}"),
+                ("user", f"Definiere den Begriff \"{term}\"."),
+            ],
+            logprobs=True
+        )
+
+        # Reject the generation when the model puts high probability on the
+        # "ERROR" token at the first position, even if it did not sample it.
+        for token in log_probs.content[0].top_logprobs:
+            prob = np.exp(token.logprob)
+            if token.token == "ERROR" and prob > self.CERTAINTY_THRESHOLD:
+                # logger.debug(f"Generation uncertain. Probability of 'ERROR' token {prob}>{self.CERTAINTY_THRESHOLD}!")
+                return None
+
+        if response == "ERROR":
+            return None
+        return response
+
+
+if __name__ == "__main__":
+    blackboard = Blackboard()
+    generator = OpenAIDefinitionGenerator(
+        blackboard=blackboard
+    )
+
+    context = """
+
+Abstellen
+
+Züge und Triebfahrzeuge sind abgestellt, wenn sie nicht mit einem Triebfahrzeugführer besetzt sind oder nicht gesteuert werden. Wagen sind abgestellt, sofern sie nicht in Züge eingestellt sind oder nicht rangiert werden.
+
+Abstoßen
+
+Abstoßen ist das Bewegen geschobener, nicht mit einem arbeitenden Triebfahrzeug gekuppelter Fahrzeuge durch Beschleunigen, so dass die Fahrzeuge allein weiterfahren, nachdem das Triebfahrzeug angehalten hat.
+
+""".strip()
+
+    context = "Dies gilt auch für das Abstoßen, sofern in örtlichen Zusätzen nicht Ausnahmen zugelassen sind."
+
+    term = blackboard.add_term("Abstellen")
+    source = blackboard.add_text_source(text=context)
+
+    async def test():
+        async for event in generator.activate(OccurrenceResolved(term=term, source=source)):
+            print(f"Event {event}")
+
+    asyncio.run(test())
\ No newline at end of file
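The ERROR gate in generate_definition_from_source checks the top logprobs of the first generated token rather than only the sampled text, so a near-miss rejection still aborts the definition. A small illustration of the exp conversion; the token values and the 0.5 threshold are invented:

    import numpy as np

    # Invented (token, logprob) alternatives for the first output position.
    top_logprobs = [("ERROR", -0.11), ("Eine", -2.30)]
    CERTAINTY_THRESHOLD = 0.5  # assumed value of self.CERTAINTY_THRESHOLD

    for token, logprob in top_logprobs:
        prob = np.exp(logprob)  # exp(-0.11) ≈ 0.90
        if token == "ERROR" and prob > CERTAINTY_THRESHOLD:
            print(f"rejected: P('ERROR') = {prob:.2f}")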
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/unified_generator.py b/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/unified_generator.py
new file mode 100644
index 000000000..4f8c8dd3c
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/unified_generator.py
@@ -0,0 +1,129 @@
+import asyncio
+import re
+from typing import Annotated, AsyncIterable
+from uuid import UUID
+
+from pydantic import Field
+
+from src.llm import create_completion_openai
+from src.logger import simple_custom_logger
+from src.terminology.event import OccurrenceResolved, Event
+from src.terminology.terminology import DefinitionGenerator, Blackboard
+
+logger = simple_custom_logger("UNIGEN")
+
+prompt_introduction = """
+Erstelle eine Definition für den Begriff "%term%" anhand von gegebenen Textausschnitten.
+Bleibe präzise und kurz. Nutze nur die Informationen aus dem gegebenen Kontext.
+Wenn nicht genug Information vorhanden oder diese zu generell, vage oder nicht fachspezifisch ist, gib "ERROR" aus.
+Füge in die Definition die jeweiligen Referenzen hinzu, indem du die Nummer des Abschnitts im Format [<nummer>] verwendest.
+""".strip()
+
+class OpenAIUnifiedDefinitionGenerator(DefinitionGenerator):
+
+    MIN_OCCURRENCES: int = 3
+
+    WINDOW_START: int = 100
+    WINDOW_END: int = 200
+
+    # Per-term locks so concurrent occurrences of the same term are processed
+    # one at a time.
+    locks: Annotated[dict[UUID, asyncio.Lock], Field(default_factory=dict)]
+
+    # TODO: See Prompt Engineering for LLMs -> Elastic Snippets
+    async def activate(self, event: OccurrenceResolved) -> AsyncIterable[Event]:
+        # No await between check and assignment, so this is race-free under asyncio.
+        if event.term.id not in self.locks:
+            self.locks[event.term.id] = asyncio.Lock()
+
+        async with self.locks[event.term.id]:
+            logger.info(f"Locking {event.term.normalized_or_text()} for unified definition generator")
+
+            term = event.term.normalized_or_text()
+            # Escape the term so regex metacharacters in it are matched literally.
+            pattern = re.escape(term)
+            context = []
+            for source_id in event.term.occurrences:
+                source = self.blackboard.get_text_source(id=source_id)
+                # FIXME: Create elastic snippet? -> dynamic window length? -> differences in quality?
+
+                # Find all occurrences of the term
+                snippets = []
+
+                current_start = 0
+                current_end = 0
+                snippet = ""
+
+                matches = list(re.finditer(pattern, source.text, re.IGNORECASE))
+
+                for match in matches:
+                    start = max(0, match.start() - self.WINDOW_START)
+                    end = match.end() + self.WINDOW_END
+                    if start < current_end:
+                        # snippet overlaps with current
+                        current_end = end
+                        pass
+                    else:
+                        # snippet is further away -> new one
+                        if snippet != "":
+                            snippets.append(snippet)
+                        current_start = start
+                        current_end = end
+                    snippet = source.text[current_start:current_end]
+                if snippet != "":
+                    snippets.append(snippet)
+
+                context += snippets
+
+            # for snippet in context:
+            #     logger.debug(f"Snippet: {snippet}")
+
+            logger.debug(f"Found {len(context)} snippets for {term}")
+
+            # TODO: think about: how can cost be reduced? How can I decide if a text is relevant to a term?
+
+            messages = [
+                ("system", f"{prompt_introduction.replace('%term%', term)}"),
+                ("user", f"Hier sind einige Textausschnitte, die du verwenden kannst. Beziehe dich bei der Generation nur auf Wissen aus den Textstellen!"),
+                ("user", "\n\n".join([f"[{index}] {context}" for index, context in enumerate(context)])),
+                ("user", f"Definiere den Begriff \"{term}\". Beziehe dich nur auf die Textabschnitte!"),
+            ]
+
+            # logger.debug("\n----\n".join([text for _, text in messages]))
+
+            result = await create_completion_openai(
+                messages=messages,
+                model="o4-mini"
+            )
+
+            # Prototype: the result is only printed for now; no event carrying
+            # the generated definition is emitted yet.
+            print(result)
+
+        # Bare yield keeps activate() an async generator despite not emitting
+        # any events yet.
+        yield
+
+
+if __name__ == "__main__":
+    blackboard = Blackboard(
+        terms=[],
+        sources=[]
+    )
+
+    term = blackboard.add_term("Fahrdienstleiter")
+
+    with open("./../../../data/Handbuch-40820-data_11-15-1.txt", "r", encoding="utf-8") as f:
+        source1 = blackboard.add_text_source(text=f.read())
+
+    with open("./../../../data/Handbuch-40820-data_11-15-5.txt", "r", encoding="utf-8") as f:
+        source2 = blackboard.add_text_source(text=f.read())
+
+    with open("./../../../data/Handbuch-40820-data.txt", "r", encoding="utf-8") as f:
+        source3 = blackboard.add_text_source(text=f.read())
+
+    term.occurrences = [source.id for source in [source1, source2, source3]]
+
+    generator = OpenAIUnifiedDefinitionGenerator(blackboard=blackboard)
+
+    async def test():
+        async for event in generator.activate(OccurrenceResolved(term=term, source=source3)):
+            pass
+
+    asyncio.run(test())
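The windowing loop in activate() merges overlapping context windows around each occurrence into snippets. Extracted into a standalone, behavior-equivalent function (window sizes mirror WINDOW_START/WINDOW_END) it is easier to test in isolation:

    import re

    def extract_snippets(text: str, term: str, window_start: int = 100, window_end: int = 200) -> list[str]:
        """Merge overlapping context windows around each match into snippets."""
        snippets: list[str] = []
        current_start = current_end = 0
        snippet = ""
        for match in re.finditer(re.escape(term), text, re.IGNORECASE):
            start = max(0, match.start() - window_start)
            end = match.end() + window_end
            if start < current_end:
                current_end = end  # overlapping window: extend the current snippet
            else:
                if snippet:
                    snippets.append(snippet)  # flush the finished snippet
                current_start, current_end = start, end
            snippet = text[current_start:current_end]
        if snippet:
            snippets.append(snippet)
        return snippets

    # Two mentions close together collapse into a single snippet:
    # extract_snippets("Zug faehrt ab. Der Zug haelt.", "Zug", 10, 10) returns one snippet.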
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/openai/extract.py b/archive/2025/summer/bsc_gerg/src/knowledge/openai/extract.py
new file mode 100644
index 000000000..236476acf
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/openai/extract.py
@@ -0,0 +1,15 @@
+from src.knowledge.llm.extract import LLMTermExtractor, DEVELOPER_PROMPT, EXAMPLE_USER, OUTPUT_ASSISTANT
+from src.llm import create_completion_openai
+
+
+class OpenAIExtractor(LLMTermExtractor):
+
+    async def get_llm_response(self, text: str) -> str:
+        return await create_completion_openai(
+            messages=[
+                ("developer", f"{DEVELOPER_PROMPT}"),
+                ("user", EXAMPLE_USER),
+                ("assistant", OUTPUT_ASSISTANT),
+                ("user", "Input: \n" + text)
+            ]
+        )
\ No newline at end of file
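OpenAIExtractor uses a one-shot prompt: developer instructions, a single worked user/assistant example, then the real input. The actual EXAMPLE_USER and OUTPUT_ASSISTANT constants live in src.knowledge.llm.extract and are not part of this diff; a hypothetical shape, only to show the contract:

    # Hypothetical stand-ins for the constants defined in src.knowledge.llm.extract.
    EXAMPLE_USER = "Input: \nDer Fahrdienstleiter stimmt die Sperrfahrt mit dem Zugmelder ab."
    OUTPUT_ASSISTANT = "Fahrdienstleiter\nSperrfahrt\nZugmelder"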
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/openai/lemmatize.py b/archive/2025/summer/bsc_gerg/src/knowledge/openai/lemmatize.py
new file mode 100644
index 000000000..98e27e397
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/openai/lemmatize.py
@@ -0,0 +1,17 @@
+from src.knowledge.llm.lemmatize import LLMTermLemmatizer, DEVELOPER_PROMPT_SHORT, EXAMPLES
+from src.llm import create_completion_openai
+
+
+class OpenAILemmatizer(LLMTermLemmatizer):
+
+    async def get_llm_response(self, term: str) -> str:
+        messages = [
+            ("system", DEVELOPER_PROMPT_SHORT),
+            # Few-shot (role, content) pairs shared with the base module.
+            *EXAMPLES,
+            ("user", term),
+        ]
+        return await create_completion_openai(
+            messages=messages,
+        )
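Because EXAMPLES is unpacked directly into the message list, it must be a flat sequence of (role, content) tuples alternating user and assistant turns. A hypothetical shape; the real constant is defined in src.knowledge.llm.lemmatize:

    # Hypothetical few-shot pairs: inflected German form in, lemma out.
    EXAMPLES = [
        ("user", "Sperrfahrten"),
        ("assistant", "Sperrfahrt"),
        ("user", "des Fahrdienstleiters"),
        ("assistant", "Fahrdienstleiter"),
    ]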