feat: integrate with docling (#471) bump:patch
* feat: add docling reader implementation

* feat: expose docling to UI

* fix: improve docling output parsing

* docs: update README

---------

Co-authored-by: Tadashi <[email protected]>
cin-albert and taprosoft authored Nov 16, 2024
1 parent 5b828c2 commit 56c40f1
Showing 7 changed files with 271 additions and 13 deletions.
11 changes: 11 additions & 0 deletions README.md
@@ -216,6 +216,17 @@ documents and developers who want to build their own RAG pipeline.

See [Local model setup](docs/local_model.md).

### Setup multimodal document parsing (OCR, table parsing, figure extraction)

These options are available:

- [Azure Document Intelligence (API)](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence)
- [Adobe PDF Extract (API)](https://developer.adobe.com/document-services/docs/overview/pdf-extract-api/)
- [Docling (local, open-source)](https://github.com/DS4SD/docling)
- To use Docling, first install the required dependencies: `pip install docling`

Select the corresponding loader in `Settings -> Retrieval Settings -> File loader`.

### Customize your application

- By default, all application data is stored in the `./ktem_app_data` folder. You can back up or copy this folder to transfer your installation to a new machine.
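To exercise the new loader outside the UI, a minimal sketch (the PDF path and printed fields are illustrative; the import matches the export added in this commit):

```python
from kotaemon.loaders import DoclingReader

reader = DoclingReader()
# Returns plain-text, table, and figure Documents extracted by Docling
docs = reader.load_data("sample.pdf")  # "sample.pdf" is a placeholder path
for doc in docs:
    print(doc.metadata.get("type", "text"), doc.metadata.get("page_label"))
```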
8 changes: 5 additions & 3 deletions libs/kotaemon/kotaemon/indices/ingests/files.py
@@ -13,6 +13,7 @@
AdobeReader,
AzureAIDocumentIntelligenceLoader,
DirectoryReader,
DoclingReader,
HtmlReader,
MathpixPDFReader,
MhtmlReader,
@@ -32,9 +33,10 @@
credential=str(config("AZURE_DI_CREDENTIAL", default="")),
cache_dir=getattr(flowsettings, "KH_MARKDOWN_OUTPUT_DIR", None),
)
adobe_reader.vlm_endpoint = azure_reader.vlm_endpoint = getattr(
flowsettings, "KH_VLM_ENDPOINT", ""
)
docling_reader = DoclingReader()
adobe_reader.vlm_endpoint = (
azure_reader.vlm_endpoint
) = docling_reader.vlm_endpoint = getattr(flowsettings, "KH_VLM_ENDPOINT", "")


KH_DEFAULT_FILE_EXTRACTORS: dict[str, BaseReader] = {
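All three readers now read their captioning endpoint from `flowsettings`; a hedged sketch of enabling it (the URL below is a placeholder, not something this commit ships):

```python
# flowsettings.py (illustrative; any endpoint your generate_gpt4v helper accepts)
# DoclingReader only attempts figure extraction/captioning when this is non-empty.
KH_VLM_ENDPOINT = "http://localhost:8000/v1/chat/completions"
```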
2 changes: 2 additions & 0 deletions libs/kotaemon/kotaemon/loaders/__init__.py
@@ -2,6 +2,7 @@
from .azureai_document_intelligence_loader import AzureAIDocumentIntelligenceLoader
from .base import AutoReader, BaseReader
from .composite_loader import DirectoryReader
from .docling_loader import DoclingReader
from .docx_loader import DocxReader
from .excel_loader import ExcelReader, PandasExcelReader
from .html_loader import HtmlReader, MhtmlReader
@@ -30,4 +31,5 @@
"TxtReader",
"PDFThumbnailReader",
"WebReader",
"DoclingReader",
]
3 changes: 3 additions & 0 deletions libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
@@ -25,6 +25,9 @@ def crop_image(file_path: Path, bbox: list[float], page_number: int = 0) -> Imag
"""
left, upper, right, lower = bbox

left, right = min(left, right), max(left, right)
upper, lower = min(upper, lower), max(upper, lower)

img: Image.Image
suffix = file_path.suffix.lower()
if suffix == ".pdf":
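The two added lines guard against an inverted box, which can appear after converting from a bottom-left coordinate origin as the new Docling loader below does; a tiny illustration with made-up numbers:

```python
left, upper, right, lower = 30.0, 700.0, 480.0, 650.0  # vertical pair arrives inverted
left, right = min(left, right), max(left, right)
upper, lower = min(upper, lower), max(upper, lower)
print(left, upper, right, lower)  # 30.0 650.0 480.0 700.0
```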
232 changes: 232 additions & 0 deletions libs/kotaemon/kotaemon/loaders/docling_loader.py
@@ -0,0 +1,232 @@
import base64
from collections import defaultdict
from io import BytesIO
from pathlib import Path
from typing import List, Optional

from kotaemon.base import Document, Param

from .azureai_document_intelligence_loader import crop_image
from .base import BaseReader
from .utils.adobe import generate_single_figure_caption, make_markdown_table


class DoclingReader(BaseReader):
"""Using Docling to extract document structure and content"""

_dependencies = ["docling"]

vlm_endpoint: str = Param(
help=(
"Default VLM endpoint for figure captioning. "
"If not provided, will not caption the figures"
)
)

max_figure_to_caption: int = Param(
100,
help=(
"The maximum number of figures to caption. "
"The rest will be indexed without captions."
),
)

figure_friendly_filetypes: list[str] = Param(
[".pdf", ".jpeg", ".jpg", ".png", ".bmp", ".tiff", ".heif", ".tif"],
help=(
"File types that we can reliably open and extract figures. "
"For files like .docx or .html, the visual layout may be different "
"when viewed from different tools, hence we cannot use Azure DI location "
"to extract figures."
),
)

@Param.auto(cache=True)
def converter_(self):
try:
from docling.document_converter import DocumentConverter
except ImportError:
raise ImportError("Please install docling: 'pip install docling'")

return DocumentConverter()

def run(
self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
) -> List[Document]:
return self.load_data(file_path, extra_info, **kwargs)

def load_data(
self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
) -> List[Document]:
"""Extract the input file, allowing multi-modal extraction"""

metadata = extra_info or {}

result = self.converter_.convert(file_path)
result_dict = result.document.export_to_dict()

file_path = Path(file_path)
file_name = file_path.name

# extract the figures
figures = []
gen_caption_count = 0
for figure_obj in result_dict.get("pictures", []):
if not self.vlm_endpoint:
continue
if file_path.suffix.lower() not in self.figure_friendly_filetypes:
continue

# retrieve extractive captions provided by docling
caption_refs = [caption["$ref"] for caption in figure_obj["captions"]]
extractive_captions = []
for caption_ref in caption_refs:
text_id = caption_ref.split("/")[-1]
try:
caption_text = result_dict["texts"][int(text_id)]["text"]
extractive_captions.append(caption_text)
except (ValueError, TypeError, IndexError) as e:
print(e)
continue

# read & crop image
page_number = figure_obj["prov"][0]["page_no"]

try:
page_number_text = str(page_number)
page_width = result_dict["pages"][page_number_text]["size"]["width"]
page_height = result_dict["pages"][page_number_text]["size"]["height"]

bbox_obj = figure_obj["prov"][0]["bbox"]
bbox: list[float] = [
bbox_obj["l"],
bbox_obj["t"],
bbox_obj["r"],
bbox_obj["b"],
]
if bbox_obj["coord_origin"] == "BOTTOMLEFT":
bbox = self._convert_bbox_bl_tl(bbox, page_width, page_height)

img = crop_image(file_path, bbox, page_number - 1)
except KeyError as e:
print(e, list(result_dict["pages"].keys()))
continue

# convert img to base64
img_bytes = BytesIO()
img.save(img_bytes, format="PNG")
img_base64 = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
img_base64 = f"data:image/png;base64,{img_base64}"

# generate the generative caption
if gen_caption_count >= self.max_figure_to_caption:
gen_caption = ""
else:
gen_caption_count += 1
gen_caption = generate_single_figure_caption(
img_base64, self.vlm_endpoint
)

# join the extractive and generative captions
caption = "\n".join(extractive_captions + [gen_caption])

# store the image into document
figure_metadata = {
"image_origin": img_base64,
"type": "image",
"page_label": page_number,
"file_name": file_name,
"file_path": file_path,
}
figure_metadata.update(metadata)

figures.append(
Document(
text=caption,
metadata=figure_metadata,
)
)

# extract the tables
tables = []
for table_obj in result_dict.get("tables", []):
# convert the tables into markdown format
markdown_table = self._parse_table(table_obj)
caption_refs = [caption["$ref"] for caption in table_obj["captions"]]

extractive_captions = []
for caption_ref in caption_refs:
text_id = caption_ref.split("/")[-1]
try:
caption_text = result_dict["texts"][int(text_id)]["text"]
extractive_captions.append(caption_text)
except (ValueError, TypeError, IndexError) as e:
print(e)
continue
# join the extractive and generative captions
caption = "\n".join(extractive_captions)
markdown_table = f"{caption}\n{markdown_table}"

page_number = table_obj["prov"][0].get("page_no", 1)

table_metadata = {
"type": "table",
"page_label": page_number,
"table_origin": markdown_table,
"file_name": file_name,
"file_path": file_path,
}
table_metadata.update(metadata)

tables.append(
Document(
text=markdown_table,
metadata=table_metadata,
)
)

# join plain text elements
texts = []
page_number_to_text = defaultdict(list)

for text_obj in result_dict["texts"]:
page_number = text_obj["prov"][0].get("page_no", 1)
page_number_to_text[page_number].append(text_obj["text"])

for page_number, txts in page_number_to_text.items():
texts.append(
Document(
text="\n".join(txts),
metadata={
"page_label": page_number,
"file_name": file_name,
"file_path": file_path,
**metadata,
},
)
)

return texts + tables + figures

def _convert_bbox_bl_tl(
self, bbox: list[float], page_width: int, page_height: int
) -> list[float]:
"""Convert bbox from bottom-left to top-left"""
x0, y0, x1, y1 = bbox
return [
x0 / page_width,
(page_height - y1) / page_height,
x1 / page_width,
(page_height - y0) / page_height,
]

def _parse_table(self, table_obj: dict) -> str:
"""Convert docling table object to markdown table"""
table_as_list: List[List[str]] = []
grid = table_obj["data"]["grid"]
for row in grid:
table_as_list.append([])
for cell in row:
table_as_list[-1].append(cell["text"])

return make_markdown_table(table_as_list)
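As a worked example of `_convert_bbox_bl_tl` (page size and coordinates invented): a bottom-left-origin box is flipped vertically and normalized, and the vertical pair can come out reversed, which is exactly what the `crop_image` min/max guard above absorbs:

```python
page_width, page_height = 600, 800
l, t, r, b = 60.0, 760.0, 300.0, 560.0  # bottom-left origin: t and b measured up from the page bottom

x0, y0, x1, y1 = [l, t, r, b]
converted = [
    x0 / page_width,                   # 0.10  left
    (page_height - y1) / page_height,  # 0.30  flipped bottom edge
    x1 / page_width,                   # 0.50  right
    (page_height - y0) / page_height,  # 0.05  flipped top edge
]
# The second value (0.30) exceeds the fourth (0.05), so crop_image's
# min/max normalization reorders them before cropping.
```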
24 changes: 14 additions & 10 deletions libs/kotaemon/kotaemon/loaders/utils/adobe.py
@@ -110,7 +110,7 @@ def request_adobe_service(file_path: str, output_path: str = "") -> str:
return output_path


def make_markdown_table(table_as_list: List[str]) -> str:
def make_markdown_table(table_as_list: List[List[str]]) -> str:
"""
Convert table from python list representation to markdown format.
The input list consists of rows of tables, the first row is the header.
@@ -203,17 +203,21 @@ def parse_figure_paths(file_paths: List[Path]) -> Union[bytes, str]:


def generate_single_figure_caption(vlm_endpoint: str, figure: str) -> str:
output = ""

"""Summarize a single figure using GPT-4V"""
if figure:
output = generate_gpt4v(
endpoint=vlm_endpoint,
prompt="Provide a short 2 sentence summary of this image?",
images=figure,
)
if "sorry" in output.lower():
output = ""
else:
output = ""
try:
output = generate_gpt4v(
endpoint=vlm_endpoint,
prompt="Provide a short 2 sentence summary of this image?",
images=figure,
)
if "sorry" in output.lower():
output = ""
except Exception as e:
print(f"Error generating caption: {e}")

return output


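For orientation, this is the shape `_parse_table` (in the new loader above) hands to `make_markdown_table`, matching the corrected `List[List[str]]` hint; the grid cells and rendered output are invented, and exact spacing depends on the helper:

```python
# A tiny Docling-style grid: rows of cells, each cell carrying a "text" field
table_obj = {"data": {"grid": [
    [{"text": "Metric"}, {"text": "Value"}],
    [{"text": "Pages"}, {"text": "12"}],
]}}
rows = [[cell["text"] for cell in row] for row in table_obj["data"]["grid"]]
# rows == [["Metric", "Value"], ["Pages", "12"]]
# make_markdown_table treats the first row as the header, yielding roughly:
# | Metric | Value |
# | --- | --- |
# | Pages | 12 |
```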
4 changes: 4 additions & 0 deletions libs/ktem/ktem/index/file/pipelines.py
@@ -39,6 +39,7 @@
KH_DEFAULT_FILE_EXTRACTORS,
adobe_reader,
azure_reader,
docling_reader,
unstructured,
web_reader,
)
@@ -673,6 +674,8 @@ def readers(self):
readers[".pdf"] = adobe_reader
elif self.reader_mode == "azure-di":
readers[".pdf"] = azure_reader
elif self.reader_mode == "docling":
readers[".pdf"] = docling_reader

dev_readers, _, _ = dev_settings()
readers.update(dev_readers)
@@ -692,6 +695,7 @@ def get_user_settings(cls):
"Azure AI Document Intelligence (figure+table extraction)",
"azure-di",
),
("Docling", "docling"),
],
"component": "dropdown",
},
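The new dropdown value only swaps which reader handles `.pdf`; condensed from the pipeline code above (a sketch, not the full method):

```python
from kotaemon.indices.ingests.files import (
    KH_DEFAULT_FILE_EXTRACTORS,
    adobe_reader,
    azure_reader,
    docling_reader,
)

reader_mode = "docling"  # value saved from Settings -> Retrieval Settings -> File loader
readers = dict(KH_DEFAULT_FILE_EXTRACTORS)
if reader_mode == "adobe":
    readers[".pdf"] = adobe_reader
elif reader_mode == "azure-di":
    readers[".pdf"] = azure_reader
elif reader_mode == "docling":
    readers[".pdf"] = docling_reader
```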

1 comment on commit 56c40f1

@eliasjudin
Contributor

@cin-albert @taprosoft Since docling provides a "unified expressive representation format", it could be worth looking into adding a docling formatting option that formats the markdown obtained by any of the readers using docling, i.e. an additional step in the file loader pipeline.
