about summary refs log tree commit diff stats
path: root/archive/2025/summer/bsc_gerg/src/knowledge/extract.py
blob: 3d5d3ae60fa0add54eb1e9db3687c17173dc7be8 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import asyncio
from typing import AsyncIterable, Any

import spacy
from spacy import Language

from src.logger import simple_custom_logger
from src.terminology.event import Event, TextExtracted, TermExtracted
from src.terminology.terminology import TermExtractor, OccurrenceResolved, Blackboard
from src.utils import lazy_module

# Module-level logger for the term extractors in this file, created with the
# label "TERMEXTRACTOR" via the project's simple_custom_logger helper.
logger = simple_custom_logger("TERMEXTRACTOR")

class CValue(TermExtractor):
    """Term extractor that takes its candidates from pyate's ``cvalues``.

    Every candidate found in an incoming text is registered on the
    blackboard and announced through a pair of events.
    """

    # spaCy pipeline; assigned in model_post_init.
    nlp: Language = None

    def model_post_init(self, __context: Any) -> None:
        """Load the German spaCy model and apply the pyate configuration.

        Args:
            __context: Unused by this method.
        """
        self.nlp = spacy.load("de_core_news_md")

        # NOTE(review): MAX_WORD_LENGTH presumably caps the number of words
        # in a candidate term -- confirm against the pyate documentation.
        pyate_settings = {
            "language": "de",
            "model_name": "de_core_news_md",
            "MAX_WORD_LENGTH": 3,
        }
        lazy_module("pyate").TermExtraction.configure(pyate_settings)

    async def activate(self, event: TextExtracted) -> AsyncIterable[Event]:
        """Extract candidate terms from ``event.text`` and emit events for them.

        Args:
            event: The text-extracted event whose ``text`` is analysed.

        Yields:
            For each candidate, an ``OccurrenceResolved`` tying the term to
            the text source, followed by a ``TermExtracted`` for that term.
        """
        # The keys of the result are the candidate terms; the values
        # (whatever pyate attaches to them) are not used here.
        scored_candidates = (
            lazy_module("pyate")
            .cvalues(event.text, have_single_word=True)
            .to_dict()
        )

        # The text is registered only after pyate has run.
        text_source = self.blackboard.add_text_source(event.text)

        for candidate in scored_candidates:
            registered = self.blackboard.add_term(candidate)
            yield OccurrenceResolved(term=registered, source=text_source)
            yield TermExtracted(term=registered)


class ComboBasicTermExtractor(TermExtractor):
    """Term extractor that takes its candidates from pyate's ``combo_basic``.

    Every candidate found in an incoming text is registered on the
    blackboard and announced through a pair of events.
    """

    # spaCy pipeline; assigned in model_post_init.
    nlp: Language = None

    def model_post_init(self, __context: Any) -> None:
        """Load the German spaCy model and apply the pyate configuration.

        Args:
            __context: Unused by this method.
        """
        self.nlp = spacy.load("de_core_news_md")
        # NOTE(review): MAX_WORD_LENGTH presumably caps the number of words
        # in a candidate term -- confirm against the pyate documentation.
        lazy_module("pyate").TermExtraction.configure({
            "language": "de",
            "model_name": "de_core_news_md",
            "MAX_WORD_LENGTH": 3,
        })

    async def activate(self, event: TextExtracted) -> AsyncIterable[Event]:
        """Extract candidate terms from ``event.text`` and emit events for them.

        Args:
            event: The text-extracted event whose ``text`` is analysed.

        Yields:
            For each candidate, an ``OccurrenceResolved`` tying the term to
            the text source, followed by a ``TermExtracted`` for that term.
        """
        result = lazy_module("pyate").combo_basic(event.text, have_single_word=True)

        # Diagnostic output goes through the module logger instead of an
        # unconditional print to stdout, so library callers can control it.
        logger.debug("Combo Basic scores:\n%s", result.sort_values(ascending=False))

        # The keys of the result are the candidate terms.
        candidates = result.to_dict()
        source = self.blackboard.add_text_source(event.text)
        for term in candidates:
            t = self.blackboard.add_term(term)
            yield OccurrenceResolved(term=t, source=source)
            yield TermExtracted(term=t)


if __name__ == "__main__":
    # Manual demo: run the Combo Basic extractor on one German sample
    # sentence and print every extracted term.
    shared_blackboard = Blackboard()
    # The C-Value extractor is still constructed even though its demo loop
    # below is disabled.
    cvalue_extractor = CValue(blackboard=shared_blackboard)
    combo_basic_extractor = ComboBasicTermExtractor(blackboard=shared_blackboard)

    sample_text = "Wenn im Zug außergewöhnliche Sendungen oder außergewöhnliche Fahrzeuge eingestellt sind, müssen sich deren Beförderungsanordnungen beim Zug befinden und die Nummern der Beförderungsanordnungen dem Fahrdienstleiter mitgeteilt worden sein."

    async def _print_extracted_terms():
        """Print each term the Combo Basic extractor reports for the sample."""
        # Disabled C-Value counterpart of the loop below:
        # print("C-Value")
        # async for event in cvalue_extractor.activate(TextExtracted(text=sample_text)):
        #     if isinstance(event, TermExtracted):
        #         print(event.term.normalized_or_text())
        print("\nCombo Basic")
        event_stream = combo_basic_extractor.activate(TextExtracted(text=sample_text))
        async for emitted in event_stream:
            # OccurrenceResolved events are skipped; only terms are printed.
            if not isinstance(emitted, TermExtracted):
                continue
            print(emitted.term.normalized_or_text())

    asyncio.run(_print_extracted_terms())