feat: integrate with docling (#471) bump:patch
* feat: add docling reader implementation

* feat: expose docling to UI

* fix: improve docling output parsing

* docs: update README

---------

Co-authored-by: Tadashi <[email protected]>
cin-albert and taprosoft authored Nov 16, 2024
1 parent 5b828c2 commit 56c40f1
Showing 7 changed files with 271 additions and 13 deletions.
11 changes: 11 additions & 0 deletions README.md
@@ -216,6 +216,17 @@ documents and developers who want to build their own RAG pipeline.

See [Local model setup](docs/local_model.md).

### Setup multimodal document parsing (OCR, table parsing, figure extraction)

These options are available:

- [Azure Document Intelligence (API)](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence)
- [Adobe PDF Extract (API)](https://developer.adobe.com/document-services/docs/overview/pdf-extract-api/)
- [Docling (local, open-source)](https://github.com/DS4SD/docling)
- To use Docling, first install the required dependencies: `pip install docling`

Select the corresponding loader in `Settings -> Retrieval Settings -> File loader`.

### Customize your application

- By default, all application data is stored in the `./ktem_app_data` folder. You can back up or copy this folder to transfer your installation to a new machine.
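To exercise the new loader outside the UI, a minimal sketch (the PDF path and printed fields are illustrative; the import matches the export added in this commit):

```python
from kotaemon.loaders import DoclingReader

reader = DoclingReader()
# Returns plain-text, table, and figure Documents extracted by Docling
docs = reader.load_data("sample.pdf")  # "sample.pdf" is a placeholder path
for doc in docs:
    print(doc.metadata.get("type", "text"), doc.metadata.get("page_label"))
```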
8 changes: 5 additions & 3 deletions libs/kotaemon/kotaemon/indices/ingests/files.py
@@ -13,6 +13,7 @@
AdobeReader,
AzureAIDocumentIntelligenceLoader,
DirectoryReader,
DoclingReader,
HtmlReader,
MathpixPDFReader,
MhtmlReader,
@@ -32,9 +33,10 @@
credential=str(config("AZURE_DI_CREDENTIAL", default="")),
cache_dir=getattr(flowsettings, "KH_MARKDOWN_OUTPUT_DIR", None),
)
adobe_reader.vlm_endpoint = azure_reader.vlm_endpoint = getattr(
flowsettings, "KH_VLM_ENDPOINT", ""
)
docling_reader = DoclingReader()
adobe_reader.vlm_endpoint = (
azure_reader.vlm_endpoint
) = docling_reader.vlm_endpoint = getattr(flowsettings, "KH_VLM_ENDPOINT", "")


KH_DEFAULT_FILE_EXTRACTORS: dict[str, BaseReader] = {
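All three readers now read their captioning endpoint from `flowsettings`; a hedged sketch of enabling it (the URL below is a placeholder, not something this commit ships):

```python
# flowsettings.py (illustrative; any endpoint your generate_gpt4v helper accepts)
# DoclingReader only attempts figure extraction/captioning when this is non-empty.
KH_VLM_ENDPOINT = "http://localhost:8000/v1/chat/completions"
```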
2 changes: 2 additions & 0 deletions libs/kotaemon/kotaemon/loaders/__init__.py
@@ -2,6 +2,7 @@
from .azureai_document_intelligence_loader import AzureAIDocumentIntelligenceLoader
from .base import AutoReader, BaseReader
from .composite_loader import DirectoryReader
from .docling_loader import DoclingReader
from .docx_loader import DocxReader
from .excel_loader import ExcelReader, PandasExcelReader
from .html_loader import HtmlReader, MhtmlReader
@@ -30,4 +31,5 @@
"TxtReader",
"PDFThumbnailReader",
"WebReader",
"DoclingReader",
]
3 changes: 3 additions & 0 deletions libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
@@ -25,6 +25,9 @@ def crop_image(file_path: Path, bbox: list[float], page_number: int = 0) -> Imag
"""
left, upper, right, lower = bbox

left, right = min(left, right), max(left, right)
upper, lower = min(upper, lower), max(upper, lower)

img: Image.Image
suffix = file_path.suffix.lower()
if suffix == ".pdf":
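The two added lines guard against an inverted box, which can appear after converting from a bottom-left coordinate origin as the new Docling loader below does; a tiny illustration with made-up numbers:

```python
left, upper, right, lower = 30.0, 700.0, 480.0, 650.0  # vertical pair arrives inverted
left, right = min(left, right), max(left, right)
upper, lower = min(upper, lower), max(upper, lower)
print(left, upper, right, lower)  # 30.0 650.0 480.0 700.0
```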
232 changes: 232 additions & 0 deletions libs/kotaemon/kotaemon/loaders/docling_loader.py
@@ -0,0 +1,232 @@
import base64
from collections import defaultdict
from io import BytesIO
from pathlib import Path
from typing import List, Optional

from kotaemon.base import Document, Param

from .azureai_document_intelligence_loader import crop_image
from .base import BaseReader
from .utils.adobe import generate_single_figure_caption, make_markdown_table


class DoclingReader(BaseReader):
"""Using Docling to extract document structure and content"""

_dependencies = ["docling"]

vlm_endpoint: str = Param(
help=(
"Default VLM endpoint for figure captioning. "
"If not provided, will not caption the figures"
)
)

max_figure_to_caption: int = Param(
100,
help=(
"The maximum number of figures to caption. "
"The rest will be indexed without captions."
),
)

figure_friendly_filetypes: list[str] = Param(
[".pdf", ".jpeg", ".jpg", ".png", ".bmp", ".tiff", ".heif", ".tif"],
help=(
"File types that we can reliably open and extract figures. "
"For files like .docx or .html, the visual layout may be different "
"when viewed from different tools, hence we cannot use Azure DI location "
"to extract figures."
),
)

@Param.auto(cache=True)
def converter_(self):
try:
from docling.document_converter import DocumentConverter
except ImportError:
raise ImportError("Please install docling: 'pip install docling'")

return DocumentConverter()

def run(
self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
) -> List[Document]:
return self.load_data(file_path, extra_info, **kwargs)

def load_data(
self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
) -> List[Document]:
"""Extract the input file, allowing multi-modal extraction"""

metadata = extra_info or {}

result = self.converter_.convert(file_path)
result_dict = result.document.export_to_dict()

file_path = Path(file_path)
file_name = file_path.name

# extract the figures
figures = []
gen_caption_count = 0
for figure_obj in result_dict.get("pictures", []):
if not self.vlm_endpoint:
continue
if file_path.suffix.lower() not in self.figure_friendly_filetypes:
continue

# retrieve extractive captions provided by docling
caption_refs = [caption["$ref"] for caption in figure_obj["captions"]]
extractive_captions = []
for caption_ref in caption_refs:
text_id = caption_ref.split("/")[-1]
try:
caption_text = result_dict["texts"][int(text_id)]["text"]
extractive_captions.append(caption_text)
except (ValueError, TypeError, IndexError) as e:
print(e)
continue

# read & crop image
page_number = figure_obj["prov"][0]["page_no"]

try:
page_number_text = str(page_number)
page_width = result_dict["pages"][page_number_text]["size"]["width"]
page_height = result_dict["pages"][page_number_text]["size"]["height"]

bbox_obj = figure_obj["prov"][0]["bbox"]
bbox: list[float] = [
bbox_obj["l"],
bbox_obj["t"],
bbox_obj["r"],
bbox_obj["b"],
]
if bbox_obj["coord_origin"] == "BOTTOMLEFT":
bbox = self._convert_bbox_bl_tl(bbox, page_width, page_height)

img = crop_image(file_path, bbox, page_number - 1)
except KeyError as e:
print(e, list(result_dict["pages"].keys()))
continue

# convert img to base64
img_bytes = BytesIO()
img.save(img_bytes, format="PNG")
img_base64 = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
img_base64 = f"data:image/png;base64,{img_base64}"

# generate the generative caption
if gen_caption_count >= self.max_figure_to_caption:
gen_caption = ""
else:
gen_caption_count += 1
gen_caption = generate_single_figure_caption(
img_base64, self.vlm_endpoint
)

# join the extractive and generative captions
caption = "\n".join(extractive_captions + [gen_caption])

# store the image into document
figure_metadata = {
"image_origin": img_base64,
"type": "image",
"page_label": page_number,
"file_name": file_name,
"file_path": file_path,
}
figure_metadata.update(metadata)

figures.append(
Document(
text=caption,
metadata=figure_metadata,
)
)

# extract the tables
tables = []
for table_obj in result_dict.get("tables", []):
# convert the tables into markdown format
markdown_table = self._parse_table(table_obj)
caption_refs = [caption["$ref"] for caption in table_obj["captions"]]

extractive_captions = []
for caption_ref in caption_refs:
text_id = caption_ref.split("/")[-1]
try:
caption_text = result_dict["texts"][int(text_id)]["text"]
extractive_captions.append(caption_text)
except (ValueError, TypeError, IndexError) as e:
print(e)
continue
# join the extractive and generative captions
caption = "\n".join(extractive_captions)
markdown_table = f"{caption}\n{markdown_table}"

page_number = table_obj["prov"][0].get("page_no", 1)

table_metadata = {
"type": "table",
"page_label": page_number,
"table_origin": markdown_table,
"file_name": file_name,
"file_path": file_path,
}
table_metadata.update(metadata)

tables.append(
Document(
text=markdown_table,
metadata=table_metadata,
)
)

# join plain text elements
texts = []
page_number_to_text = defaultdict(list)

for text_obj in result_dict["texts"]:
page_number = text_obj["prov"][0].get("page_no", 1)
page_number_to_text[page_number].append(text_obj["text"])

for page_number, txts in page_number_to_text.items():
texts.append(
Document(
text="\n".join(txts),
metadata={
"page_label": page_number,
"file_name": file_name,
"file_path": file_path,
**metadata,
},
)
)

return texts + tables + figures

def _convert_bbox_bl_tl(
self, bbox: list[float], page_width: int, page_height: int
) -> list[float]:
"""Convert bbox from bottom-left to top-left"""
x0, y0, x1, y1 = bbox
return [
x0 / page_width,
(page_height - y1) / page_height,
x1 / page_width,
(page_height - y0) / page_height,
]

def _parse_table(self, table_obj: dict) -> str:
"""Convert docling table object to markdown table"""
table_as_list: List[List[str]] = []
grid = table_obj["data"]["grid"]
for row in grid:
table_as_list.append([])
for cell in row:
table_as_list[-1].append(cell["text"])

return make_markdown_table(table_as_list)
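As a worked example of `_convert_bbox_bl_tl` (page size and coordinates invented): a bottom-left-origin box is flipped vertically and normalized, and the vertical pair can come out reversed, which is exactly what the `crop_image` min/max guard above absorbs:

```python
page_width, page_height = 600, 800
l, t, r, b = 60.0, 760.0, 300.0, 560.0  # bottom-left origin: t and b measured up from the page bottom

x0, y0, x1, y1 = [l, t, r, b]
converted = [
    x0 / page_width,                   # 0.10  left
    (page_height - y1) / page_height,  # 0.30  flipped bottom edge
    x1 / page_width,                   # 0.50  right
    (page_height - y0) / page_height,  # 0.05  flipped top edge
]
# The second value (0.30) exceeds the fourth (0.05), so crop_image's
# min/max normalization reorders them before cropping.
```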
24 changes: 14 additions & 10 deletions libs/kotaemon/kotaemon/loaders/utils/adobe.py
@@ -110,7 +110,7 @@ def request_adobe_service(file_path: str, output_path: str = "") -> str:
return output_path


def make_markdown_table(table_as_list: List[str]) -> str:
def make_markdown_table(table_as_list: List[List[str]]) -> str:
"""
Convert table from python list representation to markdown format.
The input list consists of rows of tables, the first row is the header.
@@ -203,17 +203,21 @@ def parse_figure_paths(file_paths: List[Path]) -> Union[bytes, str]:


def generate_single_figure_caption(vlm_endpoint: str, figure: str) -> str:
output = ""

"""Summarize a single figure using GPT-4V"""
if figure:
output = generate_gpt4v(
endpoint=vlm_endpoint,
prompt="Provide a short 2 sentence summary of this image?",
images=figure,
)
if "sorry" in output.lower():
output = ""
else:
output = ""
try:
output = generate_gpt4v(
endpoint=vlm_endpoint,
prompt="Provide a short 2 sentence summary of this image?",
images=figure,
)
if "sorry" in output.lower():
output = ""
except Exception as e:
print(f"Error generating caption: {e}")

return output


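For orientation, this is the shape `_parse_table` (in the new loader above) hands to `make_markdown_table`, matching the corrected `List[List[str]]` hint; the grid cells and rendered output are invented, and exact spacing depends on the helper:

```python
# A tiny Docling-style grid: rows of cells, each cell carrying a "text" field
table_obj = {"data": {"grid": [
    [{"text": "Metric"}, {"text": "Value"}],
    [{"text": "Pages"}, {"text": "12"}],
]}}
rows = [[cell["text"] for cell in row] for row in table_obj["data"]["grid"]]
# rows == [["Metric", "Value"], ["Pages", "12"]]
# make_markdown_table treats the first row as the header, yielding roughly:
# | Metric | Value |
# | --- | --- |
# | Pages | 12 |
```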
4 changes: 4 additions & 0 deletions libs/ktem/ktem/index/file/pipelines.py
@@ -39,6 +39,7 @@
KH_DEFAULT_FILE_EXTRACTORS,
adobe_reader,
azure_reader,
docling_reader,
unstructured,
web_reader,
)
@@ -673,6 +674,8 @@ def readers(self):
readers[".pdf"] = adobe_reader
elif self.reader_mode == "azure-di":
readers[".pdf"] = azure_reader
elif self.reader_mode == "docling":
readers[".pdf"] = docling_reader

dev_readers, _, _ = dev_settings()
readers.update(dev_readers)
@@ -692,6 +695,7 @@ def get_user_settings(cls):
"Azure AI Document Intelligence (figure+table extraction)",
"azure-di",
),
("Docling", "docling"),
],
"component": "dropdown",
},
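The new dropdown value only swaps which reader handles `.pdf`; condensed from the pipeline code above (a sketch, not the full method):

```python
from kotaemon.indices.ingests.files import (
    KH_DEFAULT_FILE_EXTRACTORS,
    adobe_reader,
    azure_reader,
    docling_reader,
)

reader_mode = "docling"  # value saved from Settings -> Retrieval Settings -> File loader
readers = dict(KH_DEFAULT_FILE_EXTRACTORS)
if reader_mode == "adobe":
    readers[".pdf"] = adobe_reader
elif reader_mode == "azure-di":
    readers[".pdf"] = azure_reader
elif reader_mode == "docling":
    readers[".pdf"] = docling_reader
```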

1 comment on commit 56c40f1

@eliasjudin
Contributor

@cin-albert @taprosoft Since docling provides a "unified expressive representation format", it could be worth looking into adding a docling formatting option that formats the markdown obtained by any of the readers using docling, i.e. an additional step in the file loader pipeline.
