about summary refs log tree commit diff stats
path: root/archive/2025/summer/bsc_gerg/src/knowledge/document.py
diff options
context:
space:
mode:
author: Jonas Gerg <joniogerg@gmail.com>, 2025-09-09 20:06:52 +0200
committer: Jonas Gerg <joniogerg@gmail.com>, 2025-09-09 20:06:52 +0200
commit: 3e5d3ca82193e8e8561beb9ceac9982f376d84e2 (patch)
tree: 76e4c260123b68b93da2417482024ba11f9838ee /archive/2025/summer/bsc_gerg/src/knowledge/document.py
parent: a910d0a3e57f4de47cf2387ac239ae8d0eaca507 (diff)
download: research-work-archive-artifacts-3e5d3ca82193e8e8561beb9ceac9982f376d84e2.tar.gz
research-work-archive-artifacts-3e5d3ca82193e8e8561beb9ceac9982f376d84e2.zip
Add bsc_gerg
Diffstat (limited to 'archive/2025/summer/bsc_gerg/src/knowledge/document.py')
-rw-r--r-- archive/2025/summer/bsc_gerg/src/knowledge/document.py 67
1 files changed, 67 insertions, 0 deletions
diff --git a/archive/2025/summer/bsc_gerg/src/knowledge/document.py b/archive/2025/summer/bsc_gerg/src/knowledge/document.py
new file mode 100644
index 000000000..3bc2ada51
--- /dev/null
+++ b/archive/2025/summer/bsc_gerg/src/knowledge/document.py
@@ -0,0 +1,67 @@
+import asyncio
+import os
+import tempfile
+from pathlib import Path
+from typing import AsyncIterable
+
+from pypdf import PdfReader, PdfWriter
+
+from src.logger import logger
+from src.terminology.event import Event
+from src.terminology.terminology import DocumentAdded, TextExtracted, TextExtractor, Blackboard
+from src.utils import lazy_module
+
+
+def get_document_converter():
+    """Return docling's ``DocumentConverter`` class, obtained via ``lazy_module``.
+
+    The class itself is returned, not an instance; callers instantiate it.
+    """
+    docling_converter = lazy_module("docling.document_converter")
+    # A disabled, zero-length tqdm bar is created and discarded before the
+    # converter class is looked up on the lazily loaded module.
+    # NOTE(review): presumably this initialises tqdm ahead of docling's own
+    # progress bars -- confirm the intent before changing the order.
+    import tqdm
+    tqdm.tqdm(disable=True, total=0)
+    return docling_converter.DocumentConverter
+
+
+class Pdf2Text(TextExtractor):
+    """Text extractor that converts a PDF to Markdown one page at a time.
+
+    The document is split into single-page PDFs, each page is converted in a
+    worker thread, and one ``TextExtracted`` event is emitted per page as soon
+    as its conversion finishes.
+    """
+
+    def extract_text(self, path: str) -> tuple[str, str]:
+        """Convert the document at ``path`` to Markdown.
+
+        Returns:
+            ``(path, markdown)``. The input path is echoed back so a caller
+            running several conversions concurrently can tell which file a
+            result belongs to.
+        """
+        converter = get_document_converter()()
+        doc = converter.convert(Path(path)).document
+        return path, doc.export_to_markdown()
+
+    def split_into_pages(self, path: str, tmp_path: str) -> list[str]:
+        """Write every page of the PDF at ``path`` as its own file in ``tmp_path``.
+
+        Output files are named ``<index>.pdf`` with a zero-based page index, so
+        ``tmp_path`` should be a directory that no other split is using.
+
+        Returns:
+            The paths of the written single-page PDFs, in page order.
+        """
+        paths = []
+        # Keep the source open only for the duration of the split; the reader
+        # may still need the stream while pages are copied, so all writing
+        # happens inside this block and the handle is closed afterwards.
+        with open(path, "rb") as source:
+            reader = PdfReader(source)
+            for i, page in enumerate(reader.pages):
+                output = PdfWriter()
+                output.add_page(page)
+                out_path = f"{tmp_path}/{i}.pdf"
+                with open(out_path, "wb") as file:
+                    output.write(file)
+                paths.append(out_path)
+        return paths
+
+    async def activate(self, event: DocumentAdded) -> AsyncIterable[Event]:
+        """Extract the text of the document referenced by ``event``.
+
+        Yields:
+            One ``TextExtracted`` event per page. Events arrive in the order
+            the page conversions complete, which need not be page order.
+        """
+        # Page files are named only by index, so each activation gets a
+        # private directory; in a shared temp dir concurrent activations would
+        # overwrite and delete each other's pages.
+        tmp_dir = tempfile.mkdtemp(prefix="pdf2text-")
+        try:
+            paths = self.split_into_pages(event.path, tmp_dir)
+
+            logger.info(f"Found {len(paths)} pages in {event.path}")
+
+            tasks = [asyncio.to_thread(self.extract_text, path) for path in paths]
+
+            for task in asyncio.as_completed(tasks):
+                path, text = await task
+                # The page file is no longer needed once its text is extracted.
+                os.unlink(path)
+                yield TextExtracted(text=text)
+        finally:
+            # Best-effort cleanup of pages that were never consumed (a failed
+            # split or conversion, or a consumer that stopped iterating early).
+            try:
+                for name in os.listdir(tmp_dir):
+                    os.unlink(os.path.join(tmp_dir, name))
+                os.rmdir(tmp_dir)
+            except OSError as error:
+                logger.info(f"Could not remove temporary directory {tmp_dir}: {error}")
+
+
+if __name__ == "__main__":
+    # Manual smoke test: run the extractor on a sample PDF and write the text
+    # of every emitted event to its own numbered file next to the input.
+    blackboard = Blackboard()
+    pdf2text = Pdf2Text(blackboard=blackboard)
+
+
+    async def test():
+        """Consume all events for the sample document and dump their text."""
+        # Files are numbered in the order events arrive, starting at 1.
+        counter = 0
+        async for event in pdf2text.activate(DocumentAdded(path="./../../data/Handbuch-40820-data_43.pdf")):
+            counter += 1
+            # Explicit UTF-8 so that writing the extracted text does not depend
+            # on the platform's default encoding.
+            with open(f"./../../data/Handbuch-40820-data_43-{counter}.txt", "w", encoding="utf-8") as f:
+                f.write(event.text)
+
+    asyncio.run(test())
\ No newline at end of file