Diffstat (limited to 'archive/2025/summer/bsc_gerg/src')
-rw-r--r--  archive/2025/summer/bsc_gerg/src/__init__.py  0
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/__init__.py  0
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/document.py  67
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/extract.py  76
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/lemmatize.py  1
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/llm/__init__.py  0
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/llm/definition/__init__.py  0
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/llm/definition/combiner.py  79
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/llm/definition/generator.py  97
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/llm/definition/unified_generator.py  129
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/llm/extract.py  71
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/llm/lemmatize.py  46
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/occurrence.py  24
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/openai/__init__.py  0
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/__init__.py  0
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/combiner.py  75
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/generator.py  63
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/unified_generator.py  129
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/openai/extract.py  15
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/openai/lemmatize.py  17
-rw-r--r--  archive/2025/summer/bsc_gerg/src/knowledge/resolver.py  38
-rw-r--r--  archive/2025/summer/bsc_gerg/src/llm.py  52
-rw-r--r--  archive/2025/summer/bsc_gerg/src/logger.py  28
-rw-r--r--  archive/2025/summer/bsc_gerg/src/main.py  67
-rw-r--r--  archive/2025/summer/bsc_gerg/src/prompts/__init__.py  0
-rw-r--r--  archive/2025/summer/bsc_gerg/src/prompts/extract.py  0
-rw-r--r--  archive/2025/summer/bsc_gerg/src/prompts/lemmatize.py  0
-rw-r--r--  archive/2025/summer/bsc_gerg/src/terminology/__init__.py  0
-rw-r--r--  archive/2025/summer/bsc_gerg/src/terminology/event.py  103
-rw-r--r--  archive/2025/summer/bsc_gerg/src/terminology/models.py  34
-rw-r--r--  archive/2025/summer/bsc_gerg/src/terminology/session.py  108
-rw-r--r--  archive/2025/summer/bsc_gerg/src/terminology/terminology.py  123
-rw-r--r--  archive/2025/summer/bsc_gerg/src/utils.py  9
33 files changed, 1451 insertions, 0 deletions
diff --git a/archive/2025/summer/bsc_gerg/src/__init__.py b/archive/2025/summer/bsc_gerg/src/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/__init__.py
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/__init__.py b/archive/2025/summer/bsc_gerg/src/knowledge/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/__init__.py
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/document.py b/archive/2025/summer/bsc_gerg/src/knowledge/document.py
new file mode 100644
index 000000000..3bc2ada51
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/document.py
@@ -0,0 +1,67 @@
+import asyncio
+import os
+import tempfile
+from pathlib import Path
+from typing import AsyncIterable
+
+from pypdf import PdfReader, PdfWriter
+
+from src.logger import logger
+from src.terminology.event import Event
+from src.terminology.terminology import DocumentAdded, TextExtracted, TextExtractor, Blackboard
+from src.utils import lazy_module
+
+
+def get_document_converter():
+    module = lazy_module("docling.document_converter")
+    from tqdm import tqdm
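+    # A disabled dummy tqdm instance, presumably to silence docling's progress bars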
+    tqdm(disable=True, total=0)
+    return module.DocumentConverter
+
+
+class Pdf2Text(TextExtractor):
+
+    def extract_text(self, path: str):
+        converter = get_document_converter()()
+        doc = converter.convert(Path(path)).document
+        return path, doc.export_to_markdown()
+
+    def split_into_pages(self, path: str, tmp_path: str):
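+        # Write each page to its own single-page PDF so pages can be converted independently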
+        reader = PdfReader(path)  # pypdf accepts a path directly; avoids leaking an open file handle
+        paths = []
+        for i in range(len(reader.pages)):
+            output = PdfWriter()
+            output.add_page(reader.pages[i])
+            out_path = f"{tmp_path}/{i}.pdf"
+            paths.append(out_path)
+            with open(out_path, "wb") as file:
+                output.write(file)
+        return paths
+
+    async def activate(self, event: DocumentAdded) -> AsyncIterable[Event]:
+
+        paths = self.split_into_pages(event.path, tempfile.gettempdir())
+
+        logger.info(f"Found {len(paths)} pages in {event.path}")
+
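+        # docling's convert() blocks, so each page is converted in its own worker thread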
+        tasks = [asyncio.to_thread(self.extract_text, path) for path in paths]
+
+        for task in asyncio.as_completed(tasks):
+            path, text = await task
+            os.unlink(path)
+            yield TextExtracted(text=text)
+
+
+if __name__ == "__main__":
+    blackboard = Blackboard()
+    pdf2text = Pdf2Text(blackboard=blackboard)
+
+
+    async def test():
+        counter = 0
+        async for event in pdf2text.activate(DocumentAdded(path="./../../data/Handbuch-40820-data_43.pdf")):
+            counter += 1
+            with open(f"./../../data/Handbuch-40820-data_43-{counter}.txt", "w") as f:
+                f.write(event.text)
+
+    asyncio.run(test())
\ No newline at end of file
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/extract.py b/archive/2025/summer/bsc_gerg/src/knowledge/extract.py
new file mode 100644
index 000000000..3d5d3ae60
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/extract.py
@@ -0,0 +1,76 @@
+import asyncio
+from typing import AsyncIterable, Any
+
+import spacy
+from spacy import Language
+
+from src.logger import simple_custom_logger
+from src.terminology.event import Event, TextExtracted, TermExtracted
+from src.terminology.terminology import TermExtractor, OccurrenceResolved, Blackboard
+from src.utils import lazy_module
+
+logger = simple_custom_logger("TERMEXTRACTOR")
+
+class CValue(TermExtractor):
+
+    nlp: Language = None
+
+    def model_post_init(self, __context: Any) -> None:
+        self.nlp = spacy.load("de_core_news_md")
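+        # pyate reuses the German spaCy pipeline configured here; MAX_WORD_LENGTH caps candidate terms at three words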
+        lazy_module("pyate").TermExtraction.configure({
+            "language": "de",
+            "model_name": "de_core_news_md",
+            "MAX_WORD_LENGTH": 3
+        })
+
+    async def activate(self, event: TextExtracted) -> AsyncIterable[Event]:
+        result = lazy_module("pyate").cvalues(event.text, have_single_word=True)
+        candidates = result.to_dict().keys()
+        source = self.blackboard.add_text_source(event.text)
+        for term in candidates:
+            t = self.blackboard.add_term(term)
+            yield OccurrenceResolved(term=t, source=source)
+            yield TermExtracted(term=t)
+
+
+class ComboBasicTermExtractor(TermExtractor):
+    nlp: Language = None
+
+    def model_post_init(self, __context: Any) -> None:
+        self.nlp = spacy.load("de_core_news_md")
+        lazy_module("pyate").TermExtraction.configure({
+            "language": "de",
+            "model_name": "de_core_news_md",
+            "MAX_WORD_LENGTH": 3
+        })
+
+    async def activate(self, event: TextExtracted) -> AsyncIterable[Event]:
+        result = lazy_module("pyate").combo_basic(event.text, have_single_word=True)
+        logger.debug(result.sort_values(ascending=False))
+        candidates = result.to_dict().keys()
+        source = self.blackboard.add_text_source(event.text)
+        for term in candidates:
+            t = self.blackboard.add_term(term)
+            yield OccurrenceResolved(term=t, source=source)
+            yield TermExtracted(term=t)
+
+
+if __name__ == "__main__":
+    blackboard = Blackboard()
+    extractor_cvalue = CValue(blackboard=blackboard)
+    extractor_combo_basic = ComboBasicTermExtractor(blackboard=blackboard)
+
+    text = "Wenn im Zug außergewöhnliche Sendungen oder außergewöhnliche Fahrzeuge eingestellt sind, müssen sich deren Beförderungsanordnungen beim Zug befinden und die Nummern der Beförderungsanordnungen dem Fahrdienstleiter mitgeteilt worden sein."
+
+    async def run():
+        # print("C-Value")
+        # async for event in extractor_cvalue.activate(TextExtracted(text=text)):
+        #     if isinstance(event, TermExtracted):
+        #         print(event.term.normalized_or_text())
+        print("\nCombo Basic")
+        async for event in extractor_combo_basic.activate(TextExtracted(text=text)):
+            if isinstance(event, TermExtracted):
+                print(event.term.normalized_or_text())
+
+
+    asyncio.run(run())
\ No newline at end of file
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/lemmatize.py b/archive/2025/summer/bsc_gerg/src/knowledge/lemmatize.py
new file mode 100644
index 000000000..bff243961
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/lemmatize.py
@@ -0,0 +1 @@
+# TODO: simple lemmatizer
\ No newline at end of file
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/llm/__init__.py b/archive/2025/summer/bsc_gerg/src/knowledge/llm/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/llm/__init__.py
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/llm/definition/__init__.py b/archive/2025/summer/bsc_gerg/src/knowledge/llm/definition/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/llm/definition/__init__.py
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/llm/definition/combiner.py b/archive/2025/summer/bsc_gerg/src/knowledge/llm/definition/combiner.py
new file mode 100644
index 000000000..efe81b5be
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/llm/definition/combiner.py
@@ -0,0 +1,79 @@
+import asyncio
+from typing import Annotated, AsyncIterable
+from uuid import UUID
+
+from pydantic import Field
+
+from src.logger import simple_custom_logger
+from src.terminology.event import Event, CombinedDefinitionGenerated, PartialDefinitionGenerated
+from src.terminology.models import Definition
+from src.terminology.terminology import DefinitionCombiner
+
+logger = simple_custom_logger("COMBINER")
+
+RELEVANCE_USER_PROMPT = """Ist der folgende Text eine Definition für den Begriff \"%term%\"? Wenn die Definition spezifisch genug ist, beende deine Folgerung mit TRUE, ansonsten mit FALSE.
+
+%definition%"""
+
+COMBINE_SYSTEM_PROMPT = """Nutze nur das gegebene Wissen aus den Anfragen."""
+COMBINE_USER_PROMPT="""Erstelle eine kombinierte Definition für \"%term%\" anhand der folgenden Definitionen. Starte mit allgemeinen Informationen und werde dann spezifischer. Verwende nur die Informationen aus den unten stehenden Texten.
+
+%definitions%"""
+
+class LLMDefinitionCombiner(DefinitionCombiner):
+
+    MIN_PARTIAL_DEFINITIONS: int = 3
+
+    locks: Annotated[dict[UUID, asyncio.Lock], Field(default_factory=lambda: {})]
+    lock: asyncio.Lock = asyncio.Lock()
+
+    async def get_llm_response_relevance(self, term: str, definition: str) -> str:
+        pass
+
+    async def get_llm_response_combine(self, term: str, definitions: str) -> str:
+        pass
+
+    async def activate(self, event: PartialDefinitionGenerated) -> AsyncIterable[Event]:
+        # Since definitions for a term can be generated concurrently, definitions might get combined multiple times
+        # For now, only one definition can be combined at once.
+        # FIXME: Improve locking (lock per term?)
+        if event.term.id not in self.locks:
+            self.locks[event.term.id] = asyncio.Lock()
+        async with self.locks[event.term.id]:
+            logger.info(f"Locking {event.term.normalized_or_text()} definition combiner {id(event)}")
+            has_verified_definition = next((definition for definition in event.term.definitions if definition.verified), None) is not None
+
+            partial_definitions = [definition for definition in event.term.definitions if definition.is_partial()]
+
+            if not has_verified_definition and len(partial_definitions) >= self.MIN_PARTIAL_DEFINITIONS:
+
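+                # Any previously combined definition is discarded and rebuilt from the current partials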
+                event.term.definitions = [definition for definition in event.term.definitions if not definition.is_combined()]
+
+                async with asyncio.TaskGroup() as tg:
+                    tasks = []
+                    for definition in event.term.definitions:
+                        task = tg.create_task(self.get_llm_response_relevance(event.term.normalized_or_text(), definition.text))
+                        tasks.append((definition, task))
+
+                relevant_definitions = [definition for definition, task in tasks if task.result().endswith("TRUE")]
+                logger.debug(f"Relevant definitions: {relevant_definitions}")
+                relevant_definitions_text = "\n\n".join([definition.text for definition in relevant_definitions])
+                response = await self.get_llm_response_combine(event.term.normalized_or_text(), relevant_definitions_text)
+
+                combined_definition = Definition(
+                    text=response,
+                    verified=False,
+                    partial=False,
+                    source=relevant_definitions
+                )
+
+                event.term.definitions.append(combined_definition)
+
+                yield CombinedDefinitionGenerated(
+                    term=event.term,
+                    combined_definition=combined_definition,
+                    relevant_definitions=relevant_definitions
+                )
+            logger.info(f"Lock released for {event.term.normalized_or_text()}")
+
+
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/llm/definition/generator.py b/archive/2025/summer/bsc_gerg/src/knowledge/llm/definition/generator.py
new file mode 100644
index 000000000..1927a3bef
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/llm/definition/generator.py
@@ -0,0 +1,97 @@
+import asyncio
+import re
+from typing import AsyncIterable, Annotated
+
+from pydantic import Field
+
+from src.logger import simple_custom_logger
+from src.terminology.event import Event
+from src.terminology.models import Term
+from src.terminology.terminology import Definition, DefinitionGenerator, PartialDefinitionGenerated, OccurrenceResolved
+
+DEVELOPER_PROMPT = """
+Erstelle eine Definition für einen Begriff anhand von gegebenen Textausschnitten.
+Bleibe präzise und kurz. Nutze nur die Informationen aus dem gegebenen Text. Nutze kein gelerntes Wissen aus deinen Trainingsdaten!
+Wenn nicht genug Information vorhanden ist oder die Definition zu generell, vage oder nicht fachspezifisch ist, gib "ERROR" aus.
+"""
+
+logger = simple_custom_logger("DEFGEN")
+
+class LLMDefinitionGenerator(DefinitionGenerator):
+
+    WINDOW_START: int = 200
+    WINDOW_END: int = 300
+    MIN_OVERLAP: int = 100
+    MAX_LENGTH: int = 1000
+
+    CERTAINTY_THRESHOLD: float = 0.05
+
+    known_sources: Annotated[dict[str, list[Term]], Field(default_factory=dict[str, list[Term]])]
+
+    async def generate_definition_from_source(self, term: str, context: str) -> str | None:
+        pass
+
+    def get_matches(self, term: str, text: str):
+        pattern = rf"{term}"
+        matches = list(re.finditer(pattern, text, re.IGNORECASE))
+        if len(matches) == 0:
+            return [text]
+
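+        # Merge context windows around nearby matches into single excerpts, bounded by MAX_LENGTH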
+        excerpts = []
+        last_start = 0
+        last_end = 0
+        for match in matches:
+            start = max(0, match.start() - self.WINDOW_START)
+            end = min(len(text), match.end() + self.WINDOW_END)
+
+            overlap = last_end - start
+            length = end - last_start
+            if overlap > self.MIN_OVERLAP and length <= self.MAX_LENGTH:
+                if len(excerpts) == 0:
+                    excerpts.append(text[start:end])
+                else:
+                    excerpts[-1] = text[last_start:end]
+                last_end = end
+            else:
+                last_start = start
+                last_end = end
+                excerpts.append(text[start:end])
+        return excerpts
+
+
+
+    async def activate(self, event: OccurrenceResolved) -> AsyncIterable[Event]:
+        if str(event.source.id) not in self.known_sources:
+            self.known_sources[str(event.source.id)] = list()
+
+        if event.term in self.known_sources[str(event.source.id)]:
+            return
+
+        self.known_sources[str(event.source.id)].append(event.term)
+
+        tasks = []
+        async with asyncio.TaskGroup() as tg:
+            matches = self.get_matches(term=event.term.normalized_or_text(), text=event.source.text)
+            # get_matches falls back to the full source text when the term is absent, so matches is never empty
+            for match in matches:
+                task = tg.create_task(self.generate_definition_from_source(event.term.normalized_or_text(), match))
+                tasks.append(task)
+
+        for task in asyncio.as_completed(tasks):
+            result = await task
+            # print(f"Resolved for {event.term.normalized_or_text()}: {result}")
+            if result is not None:
+                definition = Definition(
+                    text=result,
+                    verified=False,
+                    partial=True,
+                    source=event.source
+                )
+                event.term.definitions.append(definition)
+                yield PartialDefinitionGenerated(
+                    term=event.term,
+                    definition=definition
+                )
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/llm/definition/unified_generator.py b/archive/2025/summer/bsc_gerg/src/knowledge/llm/definition/unified_generator.py
new file mode 100644
index 000000000..4f8c8dd3c
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/llm/definition/unified_generator.py
@@ -0,0 +1,129 @@
+import asyncio
+import re
+from typing import Annotated, AsyncIterable
+from uuid import UUID
+
+from pydantic import Field
+
+from src.llm import create_completion_openai
+from src.logger import simple_custom_logger
+from src.terminology.event import OccurrenceResolved, Event
+from src.terminology.terminology import DefinitionGenerator, Blackboard
+
+logger = simple_custom_logger("UNIGEN")
+
+prompt_introduction = """
+Erstelle eine Definition für den Begriff "%term%" anhand von gegebenen Textausschnitten.
+Bleibe präzise und kurz. Nutze nur die Informationen aus dem gegebenen Kontext. 
+Wenn nicht genug Information vorhanden ist oder die Definition zu generell, vage oder nicht fachspezifisch ist, gib "ERROR" aus.
+Füge in die Definition die jeweiligen Referenzen hinzu, indem du die Nummer des Abschnitts verwendest im Format [<nummer>].
+""".strip()
+
+class OpenAIUnifiedDefinitionGenerator(DefinitionGenerator):
+
+    MIN_OCCURRENCES: int = 3
+
+    WINDOW_START: int = 100
+    WINDOW_END: int = 200
+
+    locks: Annotated[dict[UUID, asyncio.Lock], Field(default_factory=lambda: {})]
+
+    # TODO: See Prompt Engineering for LLMs -> Elastic Snippets
+    async def activate(self, event: OccurrenceResolved) -> AsyncIterable[Event]:
+        if event.term.id not in self.locks:
+            self.locks[event.term.id] = asyncio.Lock()
+
+        async with self.locks[event.term.id]:
+            logger.info(f"Locking {event.term.normalized_or_text()} for unified definition generator")
+
+            term = event.term.normalized_or_text()
+            pattern = rf"{term}"
+            context = []
+            for source_id in event.term.occurrences:
+                source = self.blackboard.get_text_source(id=source_id)
+                # FIXME: Create elastic snippet? -> dynamic window length? -> differences in quality?
+
+                # Find all occurrences of the term
+                snippets = []
+
+                current_start = 0
+                current_end = 0
+                snippet = ""
+
+                matches = list(re.finditer(pattern, source.text, re.IGNORECASE))
+
+                for match in matches:
+                    start = max(0, match.start() - self.WINDOW_START)
+                    end = match.end() + self.WINDOW_END
+                    if start < current_end:
+                        # snippet overlaps with current
+                        current_end = end
+                    else:
+                        # snippet is further away -> new one
+                        if snippet != "":
+                            snippets.append(snippet)
+                        current_start = start
+                        current_end = end
+                    snippet = source.text[current_start:current_end]
+                if snippet != "":
+                    snippets.append(snippet)
+
+                context += snippets
+
+            # for snippet in context:
+            #     logger.debug(f"Snippet: {snippet}")
+
+            logger.debug(f"Found {len(snippets)} snippets for {term}")
+
+            # TODO: think about: how can cost be reduced? How can I decide if a text is relevant to a term?
+
+            messages = [
+                ("system", f"{prompt_introduction.replace('%term%', term)}"),
+                ("user", f"Hier sind einige Textausschnitte, die du verwenden kannst. Beziehe dich bei der Generation nur auf Wissen aus den Textstellen!"),
+                ("user", "\n\n".join([f"[{index}] {context}" for index, context in enumerate(context)])),
+                ("user", f"Definiere den Begriff \"{term}\". Beziehe dich nur auf die Textabschnitte!"),
+            ]
+
+            # logger.debug("\n----\n".join([text for _, text in messages]))
+
+            result = await create_completion_openai(
+                messages=messages,
+                model="o4-mini"
+            )
+
+            print(result)
+
+        yield
+
+
+if __name__ == "__main__":
+    blackboard = Blackboard(
+        terms=[],
+        sources=[]
+    )
+
+    term = blackboard.add_term("Fahrdienstleiter")
+
+    with open("./../../../data/Handbuch-40820-data_11-15-1.txt", "r") as f:
+        source1 = blackboard.add_text_source(text=f.read())
+
+    with open("./../../../data/Handbuch-40820-data_11-15-5.txt", "r") as f:
+        source2 = blackboard.add_text_source(text=f.read())
+
+    with open("./../../../data/Handbuch-40820-data.txt", "r") as f:
+        source3 = blackboard.add_text_source(text=f.read())
+
+    term.occurrences = [source.id for source in [source1, source2, source3]]
+
+    generator = OpenAIUnifiedDefinitionGenerator(blackboard=blackboard)
+
+    async def test():
+        async for event in generator.activate(OccurrenceResolved(term=term, source=source3)):
+            pass
+
+    asyncio.run(test())
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/llm/extract.py b/archive/2025/summer/bsc_gerg/src/knowledge/llm/extract.py
new file mode 100644
index 000000000..682b100df
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/llm/extract.py
@@ -0,0 +1,71 @@
+import re
+from typing import AsyncIterable
+
+from src.terminology.event import TextExtracted, Event, TermExtracted, OccurrenceResolved
+from src.terminology.terminology import TermExtractor
+
+DEVELOPER_PROMPT: str = """
+Du bist Experte für Terminologie und Fachbegriffe. 
+Deine Aufgabe besteht darin, aus einem Text Begriffe, Abkürzungen und Phrasen zu extrahieren. 
+Du extrahierst nur Terminologie, die wahrscheinlich in der Eisenbahn verwendet wird.
+Du erkennst Abkürzungen und behältst sie unverändert bei. Nur wenn die vollständige Form vorhanden ist, fügst du sie in Klammern am Ende des Begriffs an.
+Du extrahierst Phrasen und Wörter sowie verschachtelte Begriffe und deren Einzelteile.
+Achte bei längeren Phrasen darauf, ob aus dem Text klar wird, dass es sich um einen besonderen Begriff handelt, der wahrscheinlich verwendet wird.
+Beginne mit den Begriffen, die am wahrscheinlichsten relevant sind.
+Gib nur eine Liste von Begriffen zurück. Extrahiere nur Begriffe, die besonders für den Kontext "Eisenbahn" sind!
+"""
+
+EXAMPLE_USER: str = """
+Input:
+Du musst das Hauptsignal auf Fahrt stellen.
+"""
+
+OUTPUT_ASSISTANT: str = """
+Output:
+- Hauptsignal auf Fahrt stellen
+- Hauptsignal
+- auf Fahrt stellen
+- Fahrtstellung eines Hauptsignals
+"""
+
+class LLMTermExtractor(TermExtractor):
+
+
+    async def get_llm_response(self, text: str) -> str:
+        pass
+
+    async def activate(self, event: TextExtracted) -> AsyncIterable[Event]:
+        source = self.blackboard.add_text_source(event.text)
+        response = await self.get_llm_response(event.text)
+        response = response.split("\n")
+        terms = [candidate[2:] for candidate in response if candidate.startswith("-") or candidate.startswith("*")]
+
+        for term in terms:
+
+            variation_match = re.search(r"\(.+\)$", term)
+            abbreviation = None
+
+            if variation_match:
+                variation = (variation_match.group(0)
+                                .replace("(", "")
+                                .replace(")", "")
+                                .strip())
+                term = term.replace(variation_match.group(0), "").strip()
+                if len(variation) > len(term):
+                    abbreviation = term
+                    term = variation
+                else:
+                    abbreviation = variation
+
+            t = self.blackboard.find_term(term_str=term)
+
+            if t is None:
+                t = self.blackboard.add_term(term=term)
+
+            if abbreviation and abbreviation not in t.variations:
+                t.variations.append(abbreviation)
+
+            t.occurrences.append(source.id)
+            yield TermExtracted(term=t)
+            yield OccurrenceResolved(term=t, source=source)
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/llm/lemmatize.py b/archive/2025/summer/bsc_gerg/src/knowledge/llm/lemmatize.py
new file mode 100644
index 000000000..0a7bfb7b9
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/llm/lemmatize.py
@@ -0,0 +1,46 @@
+from typing import AsyncIterable
+
+from src.terminology.event import TermExtracted, Event, TermNormalized
+from src.terminology.terminology import TermNormalizer
+
+DEVELOPER_PROMPT = """
+You are an expert in linguistics and languages.
+Your job is to transform words and phrases into a normalized and generalized form.
+You transform words and phrases into singular form.
+You do not replace words with other similar words.
+"""
+
+DEVELOPER_PROMPT_SHORT: str = """
+Bringe den folgenden Begriff in eine Basisform. Behalte die Wortart.
+"""
+
+EXAMPLE_USER: list[str] = [
+    "örtlicher Zusatz",
+    "örtliche Zusätze",
+    "Betra",
+    "Aufgabe der Triebfahrzeugführerin",
+    "Triebfahrzeugführerin",
+    "Rangierbegleitender",
+]
+
+OUTPUT_ASSISTANT = [
+    "örtlicher Zusatz",
+    "örtlicher Zusatz",
+    "Betra",
+    "Aufgabe der Triebfahrzeugführer",
+    "Triebfahrzeugführer",
+    "Rangierbegleiter",
+]
+
+EXAMPLES = [message for input_term, output_term in zip(EXAMPLE_USER, OUTPUT_ASSISTANT) for message in
+                [("user", input_term), ("assistant", output_term)]]
+
+class LLMTermLemmatizer(TermNormalizer):
+
+    async def get_llm_response(self, term: str) -> str:
+        pass
+
+    async def activate(self, event: TermExtracted) -> AsyncIterable[Event]:
+        response = await self.get_llm_response(event.term.text)
+        event.term.normalization = response
+        yield TermNormalized(term=event.term)
\ No newline at end of file
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/occurrence.py b/archive/2025/summer/bsc_gerg/src/knowledge/occurrence.py
new file mode 100644
index 000000000..0aa1ce209
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/occurrence.py
@@ -0,0 +1,24 @@
+from typing import Annotated, List
+
+from pydantic import Field
+
+from src.terminology.terminology import TermExtracted, OccurrenceResolver, OccurrenceResolved
+
+
+
+class MockOccurrenceResolver(OccurrenceResolver):
+
+    texts: Annotated[List[str], Field(default_factory=lambda:[
+        "Das ist ein Text über den Schrankenwärter",
+        "Das Gleis ist noch nicht sichern",
+        "Wir fahren hier eine Sperrfahrt."
+        "Die Strecke muss man sichern."
+    ])]
+
+    async def activate(self, event: TermExtracted):
+        term = event.term
+        for text in self.texts:
+            if term.text in text:
+                source = self.blackboard.add_text_source(text)
+                term.occurrences.append(source.id)
+                yield OccurrenceResolved(term=term, source=source)
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/openai/__init__.py b/archive/2025/summer/bsc_gerg/src/knowledge/openai/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/openai/__init__.py
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/__init__.py b/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/__init__.py
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/combiner.py b/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/combiner.py
new file mode 100644
index 000000000..265602801
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/combiner.py
@@ -0,0 +1,75 @@
+import asyncio
+from typing import Annotated
+from uuid import UUID
+
+from pydantic import Field
+
+from src.knowledge.llm.definition.combiner import LLMDefinitionCombiner, RELEVANCE_USER_PROMPT, COMBINE_SYSTEM_PROMPT, \
+    COMBINE_USER_PROMPT
+from src.llm import create_completion_openai
+from src.terminology.event import PartialDefinitionGenerated
+from src.terminology.models import Definition, Term
+from src.terminology.terminology import Blackboard
+
+
+class OpenAIDefinitionCombiner(LLMDefinitionCombiner):
+
+    MIN_PARTIAL_DEFINITIONS: int = 3
+
+    locks: Annotated[dict[UUID, asyncio.Lock], Field(default_factory=lambda: {})]
+    lock: asyncio.Lock = asyncio.Lock()
+
+    async def get_llm_response_relevance(self, term: str, definition: str) -> str:
+        return await create_completion_openai(
+            messages=[
+                ("user", RELEVANCE_USER_PROMPT.replace("%term%", term).replace("%definition%", definition)),
+            ],
+        )
+
+    async def get_llm_response_combine(self, term: str, definitions: str) -> str:
+        return await create_completion_openai(
+            messages=[
+                ("system", COMBINE_SYSTEM_PROMPT),
+                ("user", COMBINE_USER_PROMPT.replace("%term%", term).replace("%definitions%", definitions)),
+            ],
+        )
+
+
+if __name__ == '__main__':
+    blackboard = Blackboard()
+    combiner = OpenAIDefinitionCombiner(blackboard=blackboard)
+
+    term = Term(
+        text="Sperrfahrt",
+        normalization="Sperrfahrt",
+        occurrences=[],
+        definitions=[
+            Definition(
+                text="Eine Sperrfahrt ist ein Zug, der anstelle des Begriffs 'Zug' bezeichnet wird, wenn es sich um eine spezielle Fahrt handelt, die nicht regulär verkehrt.",
+                partial=True,
+                verified=False,
+                source=None
+            ),
+            Definition(
+                text="Eine Sperrfahrt ist ein Zug, der in Aufträgen oder Meldungen anstelle des Begriffs 'Zug' verwendet wird, wenn es sich um eine spezielle Fahrt handelt, die nicht regulär im Fahrplan enthalten ist.",
+                partial=True,
+                verified=False,
+            ),
+            Definition(
+                text="Eine Sperrfahrt ist eine spezielle Art von Zugfahrt, die anstelle des Begriffs \"Zug\" verwendet wird, wenn es sich um eine Sperrfahrt handelt.",
+                partial=True,
+                verified=False,
+            )
+        ]
+    )
+
+    async def test():
+        async for event in combiner.activate(PartialDefinitionGenerated(term=term, definition=term.definitions[2])):
+            print(f"Event: {event}")
+            print(f"Definition: {event.combined_definition}")
+
+    asyncio.run(test())
+
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/generator.py b/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/generator.py
new file mode 100644
index 000000000..381546f8d
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/generator.py
@@ -0,0 +1,63 @@
+import asyncio
+
+import numpy as np
+
+from src.knowledge.llm.definition.generator import DEVELOPER_PROMPT, LLMDefinitionGenerator
+from src.llm import create_completion_openai
+from src.terminology.terminology import OccurrenceResolved, \
+    Blackboard
+
+
+class OpenAIDefinitionGenerator(LLMDefinitionGenerator):
+
+    async def generate_definition_from_source(self, term: str, context: str) -> str | None:
+        response, log_probs = await create_completion_openai(
+            messages=[
+                ("developer", f"{DEVELOPER_PROMPT}"),
+                ("user", f"{context}"),
+                ("user", f"Definiere den Begriff \"{term}\"."),
+            ],
+            logprobs=True
+        )
+
+        for token in log_probs.content[0].top_logprobs:
+            prob = np.exp(token.logprob)
+            if token.token == "ERROR" and prob > self.CERTAINTY_THRESHOLD:
+                # logger.debug(f"Generation uncertain. Probability of 'ERROR' token {prob}>{self.CERTAINTY_THRESHOLD}!")
+                return None
+
+        if response == "ERROR":
+            return None
+        return response
+
+
+if __name__ == "__main__":
+    blackboard = Blackboard()
+    generator = OpenAIDefinitionGenerator(
+        blackboard=blackboard
+    )
+
+    context = """
+
+Abstellen
+
+Züge und Triebfahrzeuge sind abgestellt, wenn sie nicht mit einem Triebfahrzeugführer besetzt sind oder nicht gesteuert werden. Wagen sind abgestellt, sofern sie nicht in Züge eingestellt sind oder nicht rangiert werden.
+
+Abstoßen
+
+Abstoßen ist das Bewegen geschobener, nicht mit einem arbeitenden Triebfahrzeug gekuppelter Fahrzeuge durch Beschleunigen, so dass die Fahrzeuge allein weiterfahren, nachdem das Triebfahrzeug angehalten hat.
+
+""".strip()
+
+    context = "Dies gilt auch für das Abstoßen, sofern in örtlichen Zusätzen nicht Ausnahmen zugelassen sind."
+
+    term = blackboard.add_term("Abstellen")
+    source = blackboard.add_text_source(text=context)
+
+    async def test():
+        async for event in generator.activate(OccurrenceResolved(term=term, source=source)):
+            print(f"Event {event}")
+
+    asyncio.run(test())
\ No newline at end of file
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/unified_generator.py b/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/unified_generator.py
new file mode 100644
index 000000000..4f8c8dd3c
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/openai/definition/unified_generator.py
@@ -0,0 +1,129 @@
+import asyncio
+import re
+from typing import Annotated, AsyncIterable
+from uuid import UUID
+
+from pydantic import Field
+
+from src.llm import create_completion_openai
+from src.logger import simple_custom_logger
+from src.terminology.event import OccurrenceResolved, Event
+from src.terminology.terminology import DefinitionGenerator, Blackboard
+
+logger = simple_custom_logger("UNIGEN")
+
+prompt_introduction = """
+Erstelle eine Definition für den Begriff "%term%" anhand von gegebenen Textausschnitten.
+Bleibe präzise und kurz. Nutze nur die Informationen aus dem gegebenen Kontext. 
+Wenn nicht genug Information vorhanden ist oder die Definition zu generell, vage oder nicht fachspezifisch ist, gib "ERROR" aus.
+Füge in die Definition die jeweiligen Referenzen hinzu, indem du die Nummer des Abschnitts verwendest im Format [<nummer>].
+""".strip()
+
+class OpenAIUnifiedDefinitionGenerator(DefinitionGenerator):
+
+    MIN_OCCURRENCES: int = 3
+
+    WINDOW_START: int = 100
+    WINDOW_END: int = 200
+
+    locks: Annotated[dict[UUID, asyncio.Lock], Field(default_factory=lambda: {})]
+
+    # TODO: See Prompt Engineering for LLMs -> Elastic Snippets
+    async def activate(self, event: OccurrenceResolved) -> AsyncIterable[Event]:
+        if event.term.id not in self.locks:
+            self.locks[event.term.id] = asyncio.Lock()
+
+        async with self.locks[event.term.id]:
+            logger.info(f"Locking {event.term.normalized_or_text()} for unified definition generator")
+
+            term = event.term.normalized_or_text()
+            pattern = rf"{term}"
+            context = []
+            for source_id in event.term.occurrences:
+                source = self.blackboard.get_text_source(id=source_id)
+                # FIXME: Create elastic snippet? -> dynamic window length? -> differences in quality?
+
+                # Find all occurrences of the term
+                snippets = []
+
+                current_start = 0
+                current_end = 0
+                snippet = ""
+
+                matches = list(re.finditer(pattern, source.text, re.IGNORECASE))
+
+                for match in matches:
+                    start = max(0, match.start() - self.WINDOW_START)
+                    end = match.end() + self.WINDOW_END
+                    if start < current_end:
+                        # snippet overlaps with current
+                        current_end = end
+                    else:
+                        # snippet is further away -> new one
+                        if snippet != "":
+                            snippets.append(snippet)
+                        current_start = start
+                        current_end = end
+                    snippet = source.text[current_start:current_end]
+                if snippet != "":
+                    snippets.append(snippet)
+
+                context += snippets
+
+            # for snippet in context:
+            #     logger.debug(f"Snippet: {snippet}")
+
+            logger.debug(f"Found {len(snippets)} snippets for {term}")
+
+            # TODO: think about: how can cost be reduced? How can I decide if a text is relevant to a term?
+
+            messages = [
+                ("system", f"{prompt_introduction.replace('%term%', term)}"),
+                ("user", f"Hier sind einige Textausschnitte, die du verwenden kannst. Beziehe dich bei der Generation nur auf Wissen aus den Textstellen!"),
+                ("user", "\n\n".join([f"[{index}] {context}" for index, context in enumerate(context)])),
+                ("user", f"Definiere den Begriff \"{term}\". Beziehe dich nur auf die Textabschnitte!"),
+            ]
+
+            # logger.debug("\n----\n".join([text for _, text in messages]))
+
+            result = await create_completion_openai(
+                messages=messages,
+                model="o4-mini"
+            )
+
+            print(result)
+
+        yield
+
+
+if __name__ == "__main__":
+    blackboard = Blackboard(
+        terms=[],
+        sources=[]
+    )
+
+    term = blackboard.add_term("Fahrdienstleiter")
+
+    with open("./../../../data/Handbuch-40820-data_11-15-1.txt", "r") as f:
+        source1 = blackboard.add_text_source(text=f.read())
+
+    with open("./../../../data/Handbuch-40820-data_11-15-5.txt", "r") as f:
+        source2 = blackboard.add_text_source(text=f.read())
+
+    with open("./../../../data/Handbuch-40820-data.txt", "r") as f:
+        source3 = blackboard.add_text_source(text=f.read())
+
+    term.occurrences = [source.id for source in [source1, source2, source3]]
+
+    generator = OpenAIUnifiedDefinitionGenerator(blackboard=blackboard)
+
+    async def test():
+        async for event in generator.activate(OccurrenceResolved(term=term, source=source3)):
+            pass
+
+    asyncio.run(test())
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/openai/extract.py b/archive/2025/summer/bsc_gerg/src/knowledge/openai/extract.py
new file mode 100644
index 000000000..236476acf
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/openai/extract.py
@@ -0,0 +1,15 @@
+from src.knowledge.llm.extract import LLMTermExtractor, DEVELOPER_PROMPT, EXAMPLE_USER, OUTPUT_ASSISTANT
+from src.llm import create_completion_openai
+
+
+class OpenAIExtractor(LLMTermExtractor):
+
+    async def get_llm_response(self, text: str) -> str:
+        return await create_completion_openai(
+            messages=[
+                ("developer", f"{DEVELOPER_PROMPT}"),
+                ("user", EXAMPLE_USER),
+                ("assistant", OUTPUT_ASSISTANT),
+                ("user", "Input: \n" + text)
+            ]
+        )
\ No newline at end of file
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/openai/lemmatize.py b/archive/2025/summer/bsc_gerg/src/knowledge/openai/lemmatize.py
new file mode 100644
index 000000000..98e27e397
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/openai/lemmatize.py
@@ -0,0 +1,17 @@
+from src.knowledge.llm.lemmatize import LLMTermLemmatizer, DEVELOPER_PROMPT_SHORT, EXAMPLES
+from src.llm import create_completion_openai
+
+
+class OpenAILemmatizer(LLMTermLemmatizer):
+
+    async def get_llm_response(self, term: str) -> str:
+        messages = [
+            ("system", f"{DEVELOPER_PROMPT_SHORT}"),
+            *EXAMPLES,
+            # ("user", example_user),
+            # ("assistant", output_assistant),
+            ("user", term)
+        ]
+        return await create_completion_openai(
+            messages=messages,
+        )
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/resolver.py b/archive/2025/summer/bsc_gerg/src/knowledge/resolver.py
new file mode 100644
index 000000000..1e5d7a8b8
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/resolver.py
@@ -0,0 +1,38 @@
+from typing import Annotated, AsyncIterable, Optional, Any
+
+from pydantic import Field
+
+from src.terminology.event import Event, VerifiedDefinitionResolved
+from src.terminology.models import TextSource, Definition
+from src.terminology.terminology import DefinitionResolver
+
+
+class CSVDefinitionResolver(DefinitionResolver):
+    definitions: Annotated[dict[str, Definition], Field(default_factory=dict)]
+    source: Optional[TextSource] = None
+
+
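+    # Loads data/<lang>-glossary.csv (one tab-separated "term<TAB>definition" row per line)
+    # and registers each entry as a verified definition backed by the DICTIONARY source.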
+    def model_post_init(self, __context: Any) -> None:
+        langs = ["de"]
+        self.source = self.blackboard.add_text_source("DICTIONARY")
+        for lang in langs:
+            with open(f"data/{lang}-glossary.csv", "r") as f:
+                data = f.read().split("\n")
+                for row in data:
+                    if not row.strip():
+                        continue  # skip empty rows, e.g. from a trailing newline
+                    key, value = row.split("\t")
+                    self.definitions[key] = Definition(
+                        text=value,
+                        verified=True,
+                        partial=False,
+                        source=self.source,
+                    )
+
+    async def activate(self, event: Event) -> AsyncIterable[Event]:
+        term = event.term
+        term_str = term.normalized_or_text()
+        if term_str in self.definitions:
+            definition = self.definitions[term_str]
+            if definition in term.definitions:
+                return
+            term.definitions.append(definition)
+            yield VerifiedDefinitionResolved(term=term, definition=definition)
diff --git a/archive/2025/summer/bsc_gerg/src/llm.py b/archive/2025/summer/bsc_gerg/src/llm.py
new file mode 100644
index 000000000..1e11df26f
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/llm.py
@@ -0,0 +1,52 @@
+from typing import Tuple
+
+import backoff
+import dotenv
+from openai import AsyncOpenAI, RateLimitError
+
+client: AsyncOpenAI | None = None
+def get_openai_client() -> AsyncOpenAI:
+    global client
+    if client is None:
+        dotenv.load_dotenv()
+        client = AsyncOpenAI()
+    return client
+
+seed = 42
+
+@backoff.on_exception(backoff.expo, RateLimitError)
+async def create_completion_openai(
+    messages: list[Tuple[str, str]],
+    model: str = "gpt-4o-mini",
+    temperature=0,
+    max_completion_tokens=2048,
+    top_p=0,
+    frequency_penalty=0,
+    presence_penalty=0,
+    store=False,
+    logprobs=False,
+):
+    response = await get_openai_client().chat.completions.create(
+        model=model,
+        messages=[
+            {
+                "role": role,
+                "content": prompt
+            } for role, prompt in messages
+        ],
+        response_format={"type": "text"},
+        temperature=temperature,
+        max_completion_tokens=max_completion_tokens,
+        top_p=top_p,
+        frequency_penalty=frequency_penalty,
+        presence_penalty=presence_penalty,
+        store=store,
+        logprobs=logprobs,
+        top_logprobs=5 if logprobs else None,
+        seed=seed,
+    )
+
+    if logprobs:
+        return response.choices[0].message.content, response.choices[0].logprobs
+    else:
+        return response.choices[0].message.content
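+
+# Usage (a minimal sketch; the prompt content and the asyncio driver are illustrative):
+#
+#   import asyncio
+#
+#   async def main():
+#       answer = await create_completion_openai(
+#           messages=[("system", "Antworte knapp."), ("user", "Was ist ein Hauptsignal?")],
+#       )
+#       print(answer)
+#
+#   asyncio.run(main())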
diff --git a/archive/2025/summer/bsc_gerg/src/logger.py b/archive/2025/summer/bsc_gerg/src/logger.py
new file mode 100644
index 000000000..6ba0b84ed
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/logger.py
@@ -0,0 +1,28 @@
+import logging
+import os
+import sys
+
+from dotenv import load_dotenv
+
+load_dotenv()
+debug = os.getenv("DEBUG") == "true"
+
+def simple_custom_logger(name):
+    custom_logger = logging.getLogger(name)
+    if debug:
+        custom_logger.setLevel(logging.DEBUG)
+
+    console_handler = logging.StreamHandler(sys.stdout)
+
+    # Set the logging level for the console handler
+    console_handler.setLevel(logging.DEBUG)
+
+    # Create a formatter and set it for the console handler
+    formatter = logging.Formatter('%(asctime)s [%(name)s] %(levelname)s: %(message)s')
+    console_handler.setFormatter(formatter)
+
+    # Add the console handler to the logger
+    custom_logger.addHandler(console_handler)
+
+    return custom_logger
+
+logger = simple_custom_logger("TAS")
\ No newline at end of file
diff --git a/archive/2025/summer/bsc_gerg/src/main.py b/archive/2025/summer/bsc_gerg/src/main.py
new file mode 100644
index 000000000..eec0b2cf1
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/main.py
@@ -0,0 +1,67 @@
+"""
+
+This acts as the TerminologyResource
+
+
+"""
+
+import os
+import shutil
+import tempfile
+from typing import Optional
+
+from fastapi import FastAPI, UploadFile
+from pydantic import BaseModel
+from starlette.staticfiles import StaticFiles
+
+from src.terminology.session import SessionManager, KnowledgeSourcePolicy
+from src.terminology.terminology import Blackboard
+
+app = FastAPI()
+
+app.mount("/demo", StaticFiles(directory="html", html=True), name="demo")
+
+class TextRequestBody(BaseModel):
+    text: str
+    context: Optional[str] = None
+
+
+@app.post("/extractTerminology")
+async def extract_terminology(request: TextRequestBody) -> Blackboard:
+    session = SessionManager.create_session(KnowledgeSourcePolicy(use_llm=True))
+    blackboard = await session.extract_terminology(request.text, context=request.context)
+    SessionManager.remove_session(session_id=session.id)
+    return blackboard
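+
+# Example request (a sketch; assumes uvicorn serving src.main:app on localhost:8000
+# and the requests package, neither of which this module pins down):
+#
+#   import requests
+#   r = requests.post(
+#       "http://localhost:8000/extractTerminology",
+#       json={"text": "Der Schrankenwärter muss das Gleis sichern."},
+#   )
+#   print(r.json())  # the serialized Blackboard with terms and sources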
+
+@app.post("/processText")
+async def process_text(request: TextRequestBody) -> Blackboard:
+    session = SessionManager.create_session(KnowledgeSourcePolicy(use_llm=True))
+    blackboard = await session.retrieve_term_definition(request.text, context=request.context)
+    SessionManager.remove_session(session_id=session.id)
+    return blackboard
+
+@app.post("/processFile")
+async def process_file(file: UploadFile) -> Blackboard:
+    temp_dir = tempfile.gettempdir()
+    file_path = os.path.join(temp_dir, file.filename)
+
+    with open(file_path, "wb") as buffer:
+        shutil.copyfileobj(file.file, buffer)
+
+    session = SessionManager.create_session(KnowledgeSourcePolicy(use_llm=True))
+
+    blackboard = await session.process_document(file_path)
+
+    SessionManager.remove_session(session_id=session.id)
+
+    os.unlink(file_path)
+
+    return blackboard
+
+
+@app.get("/simple")
+async def process_simple(text: str) -> Blackboard:
+    session = SessionManager.create_session(KnowledgeSourcePolicy(use_llm=True))
+    blackboard = await session.retrieve_term_definition(text)
+    SessionManager.remove_session(session_id=session.id)
+    return blackboard
\ No newline at end of file
diff --git a/archive/2025/summer/bsc_gerg/src/prompts/__init__.py b/archive/2025/summer/bsc_gerg/src/prompts/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/prompts/__init__.py
diff --git a/archive/2025/summer/bsc_gerg/src/prompts/extract.py b/archive/2025/summer/bsc_gerg/src/prompts/extract.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/prompts/extract.py
diff --git a/archive/2025/summer/bsc_gerg/src/prompts/lemmatize.py b/archive/2025/summer/bsc_gerg/src/prompts/lemmatize.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/prompts/lemmatize.py
diff --git a/archive/2025/summer/bsc_gerg/src/terminology/__init__.py b/archive/2025/summer/bsc_gerg/src/terminology/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/terminology/__init__.py
diff --git a/archive/2025/summer/bsc_gerg/src/terminology/event.py b/archive/2025/summer/bsc_gerg/src/terminology/event.py
new file mode 100644
index 000000000..a16a5e54c
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/terminology/event.py
@@ -0,0 +1,103 @@
+import asyncio
+import os
+from asyncio import TaskGroup
+from collections.abc import AsyncIterable
+from typing import Dict, Type, List
+
+from dotenv import load_dotenv
+from pydantic import BaseModel
+
+from src.logger import logger
+from src.terminology.models import Term, TextSource, Definition
+
+load_dotenv()
+debug = os.getenv("DEBUG") == "true"
+
+
+
+class Event(BaseModel):
+    pass
+
+class Handler(BaseModel):
+    handles: List[Type[Event]]
+
+    async def activate(self, event: Event) -> AsyncIterable[Event]:
+        yield None
+
+
+class DocumentAdded(Event):
+    path: str
+
+
+class TextExtracted(Event):
+    text: str
+
+
+class TermExtracted(Event):
+    term: Term
+
+
+class TermsExtracted(Event):
+    terms: List[TermExtracted]
+
+
+class TermNormalized(Event):
+    term: Term
+
+
+class OccurrenceResolved(Event):
+    term: Term
+    source: TextSource
+
+class VerifiedDefinitionResolved(Event):
+    term: Term
+    definition: Definition
+
+class PartialDefinitionGenerated(Event):
+    term: Term
+    definition: Definition
+
+
+class CombinedDefinitionGenerated(Event):
+    term: Term
+    combined_definition: Definition
+    relevant_definitions: List[Definition]
+
+class EventDispatcher:
+    # Plain class, not a pydantic model: state is initialized in __init__
+    handler: Dict[Type[Event], List[Handler]]
+    task_group: TaskGroup
+    done_event: asyncio.Event
+    active_handlers: int
+
+    def __init__(self):
+        self.handler = {}
+        self.task_group = TaskGroup()
+        self.done_event = asyncio.Event()
+        self.active_handlers = 0
+
+    def register_handler(self, handler: Handler) -> None:
+        for event in handler.handles:
+            if event not in self.handler:
+                self.handler[event] = []
+            self.handler[event].append(handler)
+
+
+    def emit(self, event: Event):
+        if type(event) not in self.handler:
+            logger.debug(f"No handler found for {event.__class__.__name__}")
+            return
+
+        for handler in self.handler[type(event)]:
+            async def handle_event(h: Handler):
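+                # Track in-flight handlers so done_event only fires once the whole event cascade has drained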
+                self.done_event.clear()
+                self.active_handlers += 1
+                logger.debug(f"Event {event.__class__.__name__} calls handler {h.__class__.__name__}")
+                async for x in h.activate(event):
+                    if x is not None:
+                        self.emit(x)
+                self.active_handlers -= 1
+
+                if self.active_handlers == 0:
+                    self.done_event.set()
+
+            self.task_group.create_task(handle_event(handler))
diff --git a/archive/2025/summer/bsc_gerg/src/terminology/models.py b/archive/2025/summer/bsc_gerg/src/terminology/models.py
new file mode 100644
index 000000000..270751e6c
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/terminology/models.py
@@ -0,0 +1,34 @@
+import uuid
+from typing import Any, Optional, Annotated, List
+from uuid import UUID
+
+from pydantic import BaseModel, Field
+
+
+class TextSource(BaseModel):
+    id: Annotated[UUID, Field(default_factory=uuid.uuid4)]
+    text: str
+
+
+class Definition(BaseModel):
+    text: str
+    verified: bool
+    partial: bool
+    source: Optional[TextSource | Any] = None
+
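+    # A combined definition is synthesized from partials: neither verified nor itself partial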
+    def is_combined(self) -> bool:
+        return not self.verified and not self.partial
+
+    def is_partial(self) -> bool:
+        return not self.verified and self.partial
+
+class Term(BaseModel):
+    id: Annotated[UUID, Field(default_factory=uuid.uuid4)]
+    text: str
+    normalization: Optional[str] = None
+    variations: Annotated[List[str], Field(default_factory=list)]
+    occurrences: Annotated[List[UUID], Field(default_factory=list)]
+    definitions: Annotated[List[Definition], Field(default_factory=list)]
+
+    def normalized_or_text(self):
+        return self.normalization if self.normalization is not None else self.text
diff --git a/archive/2025/summer/bsc_gerg/src/terminology/session.py b/archive/2025/summer/bsc_gerg/src/terminology/session.py
new file mode 100644
index 000000000..534c69d6e
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/terminology/session.py
@@ -0,0 +1,108 @@
+import uuid
+from typing import Annotated, Optional
+from uuid import UUID
+
+from pydantic import BaseModel, Field
+
+from src.knowledge.document import Pdf2Text
+from src.knowledge.extract import CValue
+from src.knowledge.openai.definition.combiner import OpenAIDefinitionCombiner
+from src.knowledge.openai.definition.generator import OpenAIDefinitionGenerator
+from src.knowledge.openai.extract import OpenAIExtractor
+from src.knowledge.openai.lemmatize import OpenAILemmatizer
+from src.knowledge.resolver import CSVDefinitionResolver
+from src.terminology.event import DocumentAdded, TextExtracted
+from src.terminology.terminology import Controller, Blackboard
+
+
+class KnowledgeSourcePolicy(BaseModel):
+    use_llm: bool = False
+
+class Session(BaseModel):
+    id: Annotated[UUID, Field(default_factory=uuid.uuid4)]
+    policy: KnowledgeSourcePolicy
+
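+    # Each setup_* method wires one pipeline stage (a set of knowledge sources) into the controller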
+    def setup_controller_document_processing(self, controller: Controller) -> Controller:
+        controller.register_knowledge_source(Pdf2Text)
+        return controller
+
+    def setup_controller_term_extraction(self, controller: Controller) -> Controller:
+        if self.policy.use_llm:
+            controller.register_knowledge_source(OpenAIExtractor)
+            controller.register_knowledge_source(OpenAILemmatizer)
+        else:
+            controller.register_knowledge_source(CValue)
+        return controller
+
+    def setup_controller_definition_generation(self, controller: Controller) -> Controller:
+        controller.register_knowledge_source(CSVDefinitionResolver)
+        if self.policy.use_llm:
+            controller.register_knowledge_source(OpenAIDefinitionGenerator)
+            controller.register_knowledge_source(OpenAIDefinitionCombiner)
+        return controller
+
+
+    async def process_document(self, file_path: str) -> Blackboard:
+        controller = Controller()
+        self.setup_controller_document_processing(controller)
+        self.setup_controller_term_extraction(controller)
+        self.setup_controller_definition_generation(controller)
+
+        await controller.emit(DocumentAdded(path=file_path))
+
+        return controller.blackboard
+
+
+    async def retrieve_term_definition(self, text: str, context: Optional[str] = None) -> Blackboard:
+        controller = Controller()
+        self.setup_controller_term_extraction(controller)
+        self.setup_controller_definition_generation(controller)
+
+        # TODO: Make proper use of context!!!
+        if context is not None:
+            controller.blackboard.add_text_source(context)
+
+        await controller.emit(TextExtracted(text=text))
+
+        return controller.blackboard
+
+    async def extract_terminology(self, text: str, context: Optional[str] = None) -> Blackboard:
+        controller = Controller()
+        self.setup_controller_term_extraction(controller)
+
+        # TODO: Make proper use of context!!!
+        if context is not None:
+            controller.blackboard.add_text_source(context)
+
+        await controller.emit(TextExtracted(text=text))
+
+        return controller.blackboard
+
+    model_config = {
+        "arbitrary_types_allowed": True,
+    }
+
+
+class SessionManager:
+
+    sessions = {}
+
+    @staticmethod
+    def setup_controller_llm(controller: Controller):
+        controller.register_knowledge_source(OpenAIExtractor)
+        # controller.register_knowledge_source(CValue)
+        controller.register_knowledge_source(OpenAILemmatizer)
+        # TODO: Occurrence Resolver
+        # controller.register_knowledge_source(OpenAIDefinitionGenerator)
+        # controller.register_knowledge_source(OpenAIDefinitionCombiner)
+
+    @classmethod
+    def create_session(cls, policy: KnowledgeSourcePolicy) -> Session:
+        session = Session(policy=policy)
+        cls.sessions[session.id] = session
+        return session
+
+    @classmethod
+    def remove_session(cls, session_id: UUID):
+        cls.sessions.pop(session_id)
diff --git a/archive/2025/summer/bsc_gerg/src/terminology/terminology.py b/archive/2025/summer/bsc_gerg/src/terminology/terminology.py
new file mode 100644
index 000000000..30353ed12
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/terminology/terminology.py
@@ -0,0 +1,123 @@
+import asyncio
+import uuid
+from typing import Optional, Annotated, List, AsyncIterable, Type, Any
+from uuid import UUID
+
+from pydantic import BaseModel, Field
+
+from src.terminology.event import Handler, Event, EventDispatcher, DocumentAdded, TextExtracted, TermExtracted, \
+    OccurrenceResolved, PartialDefinitionGenerated, TermNormalized
+from src.terminology.models import Term, TextSource, Definition
+
+
+class Blackboard(BaseModel):
+    terms: Annotated[List[Term], Field(default_factory=list)]
+    sources: Annotated[List[TextSource], Field(default_factory=list)]
+
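+    # Note: add_term always creates a new Term; callers needing deduplication should try find_term first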
+    def add_term(self, term: str) -> Term:
+        new_term = Term(text=term)
+        self.terms.append(new_term)
+        return new_term
+
+    def find_term(self, term_str: str) -> Optional[Term]:
+        for term in self.terms:
+            if term.text == term_str:
+                return term
+
+    def add_text_source(self, text: str):
+        source = TextSource(id=uuid.uuid4(), text=text)
+        self.sources.append(source)
+        return source
+
+    def get_text_source(self, id: UUID) -> Optional[TextSource]:
+        for source in self.sources:
+            if source.id == id:
+                return source
+        return None
+
+
+class KnowledgeSource(Handler):
+    blackboard: Blackboard
+
+    class Config:
+        arbitrary_types_allowed = True
+
+
+
+class TextExtractor(KnowledgeSource):
+    handles: Annotated[List[Type[Event]], Field(default_factory=lambda: [DocumentAdded])]
+
+    async def activate(self, event: DocumentAdded) -> AsyncIterable[Event]:
+        yield
+
+
+class TermExtractor(KnowledgeSource):
+    handles: Annotated[List[Type[Event]], Field(default_factory=lambda: [TextExtracted])]
+
+    async def activate(self, event: TextExtracted) -> AsyncIterable[Event]:
+        yield
+
+
+class TermNormalizer(KnowledgeSource):
+    handles: Annotated[List[Type[Event]], Field(default_factory=lambda: [TermExtracted])]
+
+    async def activate(self, event: TermExtracted) -> AsyncIterable[Event]:
+        yield
+
+
+class OccurrenceResolver(KnowledgeSource):
+    handles: Annotated[List[Type[Event]], Field(default_factory=lambda: [TermExtracted, TermNormalized])]
+
+    async def activate(self, event: TermExtracted | TermNormalized) -> AsyncIterable[Event]:
+        yield
+
+
+class DefinitionResolver(KnowledgeSource):
+    handles: Annotated[List[Type[Event]], Field(default_factory=lambda: [TermExtracted, TermNormalized])]
+
+    async def activate(self, event: Event) -> AsyncIterable[Event]:
+        yield
+
+
+class DefinitionGenerator(KnowledgeSource):
+    handles: Annotated[List[Type[Event]], Field(default_factory=lambda: [OccurrenceResolved])]
+
+    async def activate(self, event: OccurrenceResolved) -> AsyncIterable[Event]:
+        yield
+
+
+class DefinitionCombiner(KnowledgeSource):
+    handles: Annotated[List[Type[Event]], Field(default_factory=lambda: [PartialDefinitionGenerated])]
+
+    async def activate(self, event: PartialDefinitionGenerated) -> AsyncIterable[Event]:
+        yield
+
+
+class Controller:
+
+    def __init__(self):
+        self.blackboard = Blackboard()
+        self.knowledge_sources = []
+        self.broker = EventDispatcher()
+
+    def register_knowledge_source(self, knowledge_source: Type[KnowledgeSource]):
+        # The blackboard is passed per instance; assigning it to the class would leak shared state
+        instance = knowledge_source(blackboard=self.blackboard)
+        self.knowledge_sources.append(instance)
+        self.broker.register_handler(instance)
+
+    async def emit(self, event: Event):
+        async with self.broker.task_group:
+            self.broker.emit(event)
+
+    async def analyse_document(self, path: str):
+        async with self.broker.task_group:
+            self.broker.emit(
+                DocumentAdded(path=path)
+            )
+
+    async def start(self):
+        async with self.broker.task_group:
+            self.broker.emit(
+                TextExtracted(text="Der Schrankenwärter muss das Gleis sichern.")
+            )
diff --git a/archive/2025/summer/bsc_gerg/src/utils.py b/archive/2025/summer/bsc_gerg/src/utils.py
new file mode 100644
index 000000000..479760acc
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/utils.py
@@ -0,0 +1,9 @@
+from importlib import import_module
+
+__lazy_modules: dict = {}
+
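+# Import a module on first use and cache it, deferring heavy imports (e.g. docling, pyate)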
+def lazy_module(module: str):
+    if module not in __lazy_modules:
+        __lazy_modules[module] = import_module(module)
+    return __lazy_modules[module]
\ No newline at end of file