about summary refs log tree commit diff stats
path: root/archive/2025/summer/bsc_gerg/src/knowledge/extract.py
diff options
context:
space:
mode:
Diffstat (limited to 'archive/2025/summer/bsc_gerg/src/knowledge/extract.py')
-rw-r--r-- archive/2025/summer/bsc_gerg/src/knowledge/extract.py 76
1 files changed, 76 insertions, 0 deletions
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/extract.py b/archive/2025/summer/bsc_gerg/src/knowledge/extract.py
new file mode 100644
index 000000000..3d5d3ae60
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/extract.py
@@ -0,0 +1,76 @@
+import asyncio
+from typing import AsyncIterable, Any
+
+import spacy
+from spacy import Language
+
+from src.logger import simple_custom_logger
+from src.terminology.event import Event, TextExtracted, TermExtracted
+from src.terminology.terminology import TermExtractor, OccurrenceResolved, Blackboard
+from src.utils import lazy_module
+
+logger = simple_custom_logger("TERMEXTRACTOR")
+
+class CValue(TermExtractor):
+    """Term extractor whose candidates come from pyate's ``cvalues`` on German text."""
+    nlp: Language = None
+
+    def model_post_init(self, __context: Any) -> None:
+        """Load the German spaCy model and configure pyate to use the same model."""
+        self.nlp = spacy.load("de_core_news_md")
+        pyate_settings = {"language": "de", "model_name": "de_core_news_md", "MAX_WORD_LENGTH": 3}
+        lazy_module("pyate").TermExtraction.configure(pyate_settings)
+
+    async def activate(self, event: TextExtracted) -> AsyncIterable[Event]:
+        """Yield OccurrenceResolved, then TermExtracted, for each candidate in ``event.text``."""
+        pyate = lazy_module("pyate")
+        scored = pyate.cvalues(event.text, have_single_word=True).to_dict()
+        # The text is registered as a source once, before any term is added.
+        text_source = self.blackboard.add_text_source(event.text)
+        for candidate in scored:
+            entry = self.blackboard.add_term(candidate)
+            yield OccurrenceResolved(term=entry, source=text_source)
+            yield TermExtracted(term=entry)
+
+
+class ComboBasicTermExtractor(TermExtractor):
+    """Term extractor whose candidates come from pyate's ``combo_basic`` on German text."""
+    nlp: Language = None
+
+    def model_post_init(self, __context: Any) -> None:
+        """Load the German spaCy model and configure pyate to use the same model."""
+        self.nlp = spacy.load("de_core_news_md")
+        pyate_settings = {"language": "de", "model_name": "de_core_news_md", "MAX_WORD_LENGTH": 3}
+        lazy_module("pyate").TermExtraction.configure(pyate_settings)
+
+    async def activate(self, event: TextExtracted) -> AsyncIterable[Event]:
+        """Yield OccurrenceResolved, then TermExtracted, per candidate; scores go to the debug log."""
+        result = lazy_module("pyate").combo_basic(event.text, have_single_word=True)
+        logger.debug("combo_basic scores:\n%s", result.sort_values(ascending=False))
+        candidates = result.to_dict()
+        source = self.blackboard.add_text_source(event.text)
+        for term in candidates:
+            t = self.blackboard.add_term(term)
+            yield OccurrenceResolved(term=t, source=source)
+            yield TermExtracted(term=t)
+
+
+if __name__ == "__main__":
+    blackboard = Blackboard()
+    extractor_cvalue = CValue(blackboard=blackboard)
+    extractor_combo_basic = ComboBasicTermExtractor(blackboard=blackboard)
+    text = "Wenn im Zug außergewöhnliche Sendungen oder außergewöhnliche Fahrzeuge eingestellt sind, müssen sich deren Beförderungsanordnungen beim Zug befinden und die Nummern der Beförderungsanordnungen dem Fahrdienstleiter mitgeteilt worden sein."
+
+    async def run():
+        """Print every term the combo-basic extractor reports for the sample sentence."""
+        # The C-Value pass below is switched off; re-enable it to compare both extractors.
+        # print("C-Value")
+        # async for event in extractor_cvalue.activate(TextExtracted(text=text)):
+        #     if isinstance(event, TermExtracted):
+        #         print(event.term.normalized_or_text())
+        print("\nCombo Basic")
+        async for emitted in extractor_combo_basic.activate(TextExtracted(text=text)):
+            if isinstance(emitted, TermExtracted):
+                print(emitted.term.normalized_or_text())
+
+    asyncio.run(run())
\ No newline at end of file