Skip to content

Commit

Permalink
Merge branch 'main' into feat-support-google-ocr
Browse files Browse the repository at this point in the history
  • Loading branch information
BushrHaddad committed Jan 8, 2025
2 parents aae5167 + ead396a commit e571e31
Show file tree
Hide file tree
Showing 126 changed files with 88,667 additions and 2,245 deletions.
19 changes: 19 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,22 @@
## [v2.14.0](https://github.com/DS4SD/docling/releases/tag/v2.14.0) - 2024-12-18

### Feature

* Create a backend to transform PubMed XML files to DoclingDocument ([#557](https://github.com/DS4SD/docling/issues/557)) ([`fd03480`](https://github.com/DS4SD/docling/commit/fd034802b65a0e567531b8ecc9a283aaf030e050))

## [v2.13.0](https://github.com/DS4SD/docling/releases/tag/v2.13.0) - 2024-12-17

### Feature

* Updated Layout processing with forms and key-value areas ([#530](https://github.com/DS4SD/docling/issues/530)) ([`60dc852`](https://github.com/DS4SD/docling/commit/60dc852f16dc1adbb5e9284c81a146043a301ec1))
* Create a backend to parse USPTO patents into DoclingDocument ([#606](https://github.com/DS4SD/docling/issues/606)) ([`4e08750`](https://github.com/DS4SD/docling/commit/4e087504cc4b04210574e69f616badcddfa1f8e5))
* Add EasyOCR parameter `recog_network` ([#613](https://github.com/DS4SD/docling/issues/613)) ([`3b53bd3`](https://github.com/DS4SD/docling/commit/3b53bd38c8efcc5ba54421fbfa90d047f1a61f82))

### Documentation

* Add Haystack RAG example ([#615](https://github.com/DS4SD/docling/issues/615)) ([`3e599c7`](https://github.com/DS4SD/docling/commit/3e599c7bbeef211dc346e9bc1d3a249113fcc4e4))
* Fix the path to the run_with_accelerator.py example ([#608](https://github.com/DS4SD/docling/issues/608)) ([`3bb3bf5`](https://github.com/DS4SD/docling/commit/3bb3bf57150c9705a055982e6fb0cc8d1408f161))

## [v2.12.0](https://github.com/DS4SD/docling/releases/tag/v2.12.0) - 2024-12-13

### Feature
Expand Down
4 changes: 2 additions & 2 deletions docling/backend/html_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]

  try:
  if isinstance(self.path_or_stream, BytesIO):
- text_stream = self.path_or_stream.getvalue().decode("utf-8")
+ text_stream = self.path_or_stream.getvalue()
  self.soup = BeautifulSoup(text_stream, "html.parser")
  if isinstance(self.path_or_stream, Path):
- with open(self.path_or_stream, "r", encoding="utf-8") as f:
+ with open(self.path_or_stream, "rb") as f:
  html_content = f.read()
  self.soup = BeautifulSoup(html_content, "html.parser")
  except Exception as e:
Expand Down
26 changes: 15 additions & 11 deletions docling/backend/mspowerpoint_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
TableCell,
TableData,
)
- from PIL import Image
+ from PIL import Image, UnidentifiedImageError
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER

Expand Down Expand Up @@ -120,6 +120,7 @@ def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
  bullet_type = "None"
  list_text = ""
  list_label = GroupLabel.LIST
+ doc_label = DocItemLabel.LIST_ITEM
  prov = self.generate_prov(shape, slide_ind, shape.text.strip())

# Identify if shape contains lists
Expand Down Expand Up @@ -276,16 +277,19 @@ def handle_pictures(self, shape, parent_slide, slide_ind, doc):
im_dpi, _ = image.dpi

# Open it with PIL
- pil_image = Image.open(BytesIO(image_bytes))
-
- # shape has picture
- prov = self.generate_prov(shape, slide_ind, "")
- doc.add_picture(
- parent=parent_slide,
- image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
- caption=None,
- prov=prov,
- )
+ try:
+ pil_image = Image.open(BytesIO(image_bytes))
+
+ # shape has picture
+ prov = self.generate_prov(shape, slide_ind, "")
+ doc.add_picture(
+ parent=parent_slide,
+ image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
+ caption=None,
+ prov=prov,
+ )
+ except (UnidentifiedImageError, OSError) as e:
+ _log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
return

def handle_tables(self, shape, parent_slide, slide_ind, doc):
Expand Down
Empty file added docling/backend/xml/__init__.py
Empty file.
Loading

0 comments on commit e571e31

Please sign in to comment.