Skip to content

Commit

Permalink
Color and font information for chars, words and boxes (#39)
Browse files Browse the repository at this point in the history
The information originates in chars and is propagated to words, lines and boxes only when all contained elements share the same value.
  • Loading branch information
kreuzberger authored Mar 15, 2024
1 parent 518ead3 commit 62086d0
Show file tree
Hide file tree
Showing 6 changed files with 200 additions and 1 deletion.
45 changes: 45 additions & 0 deletions libpdf/models/horizontal_box.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ class Char: # pylint: disable=too-few-public-methods # simplicity is good.
:ivar y1: distance from the bottom of the page to the upper edge of the character
(greater than y0)
:vartype y1: float
:ivar ncolor: non-stroking-color as rgb value
:vartype ncolor: Tuple[float, float, float]
"""

def __init__(
Expand All @@ -28,13 +30,17 @@ def __init__(
y0: float | None = None,
x1: float | None = None,
y1: float | None = None,
ncolor: tuple | None = None,
fontname: str | None = None,
):
"""Init with plain char of a character and its rectangular coordinates."""
self.x0 = x0
self.y0 = y0
self.x1 = x1
self.y1 = y1
self.text = text
self.ncolor = ncolor
self.fontname = fontname

def __repr__(self) -> str:
"""Make the text part of the repr for better debugging."""
Expand Down Expand Up @@ -65,13 +71,24 @@ def __init__(
self.x1 = x1
self.y1 = y1
self.chars = chars
self.ncolor = None
self.fontname = None

if self.chars:
# Obtain the rectangle coordinates from a list of libpdf text objects
self.x0 = min(text_obj.x0 for text_obj in self.chars)
self.y0 = min(text_obj.y0 for text_obj in self.chars)
self.x1 = max(text_obj.x1 for text_obj in self.chars)
self.y1 = max(text_obj.y1 for text_obj in self.chars)

for n in ["ncolor", "fontname"]:
if all(
getattr(x, n) == getattr(self.chars[0], n)
and getattr(x, n) is not None
for x in self.chars
):
setattr(self, n, getattr(self.chars[0], n))

@property
def text(self) -> str:
"""Return plain text."""
Expand Down Expand Up @@ -106,13 +123,24 @@ def __init__(
self.x1 = x1
self.y1 = y1
self.words = words
self.ncolor = None
self.fontname = None

if self.words:
# Obtain the rectangle coordinates from a list of libpdf text objects
self.x0 = min(text_obj.x0 for text_obj in self.words)
self.y0 = min(text_obj.y0 for text_obj in self.words)
self.x1 = max(text_obj.x1 for text_obj in self.words)
self.y1 = max(text_obj.y1 for text_obj in self.words)

for n in ["ncolor", "fontname"]:
if all(
getattr(x, n) == getattr(self.words[0], n)
and getattr(x, n) is not None
for x in self.words
):
setattr(self, n, getattr(self.words[0], n))

@property
def text(self) -> str:
"""Return plain text."""
Expand Down Expand Up @@ -147,18 +175,35 @@ def __init__(
self.x1 = x1
self.y1 = y1
self.lines = lines
self.ncolor = None
self.fontname = None

if self.lines:
# Obtain the rectangle coordinates from a list of libpdf text objects.
self.x0 = min(text_obj.x0 for text_obj in self.lines)
self.y0 = min(text_obj.y0 for text_obj in self.lines)
self.x1 = max(text_obj.x1 for text_obj in self.lines)
self.y1 = max(text_obj.y1 for text_obj in self.lines)

_words = [word for line in self.lines for word in line.words]

for n in ["ncolor", "fontname"]:
if all(
getattr(x, n) == getattr(_words[0], n) and getattr(x, n) is not None
for x in _words
):
setattr(self, n, getattr(_words[0], n))

@property
def text(self) -> str:
    """Return the text of all lines, separated by newline characters."""
    line_texts = (line.text for line in self.lines)
    return "\n".join(line_texts)

@property
def words(self) -> list[str]:
    """Return the words of every line as one flat list, in line order.

    NOTE(review): the elements are taken unchanged from ``line.words``; if
    those are word objects rather than strings, the ``list[str]`` annotation
    is too narrow -- confirm and adjust.
    """
    flattened = []
    for line in self.lines:
        flattened.extend(line.words)
    return flattened

def __repr__(self) -> str | None:
"""Make the text part of the repr for better debugging."""
if self.lines:
Expand Down
10 changes: 9 additions & 1 deletion libpdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,15 @@ def assemble_to_textlines(
for lt_obj in flatten_lt_objs:
if lt_obj.get_text() != " " and lt_obj.get_text() != "\n":
# instantiate Char
char = Char(lt_obj.get_text(), lt_obj.x0, lt_obj.y0, lt_obj.x1, lt_obj.y1)
char = Char(
lt_obj.get_text(),
lt_obj.x0,
lt_obj.y0,
lt_obj.x1,
lt_obj.y1,
lt_obj.graphicstate.ncolor if hasattr(lt_obj, "graphicstate") else None,
lt_obj.fontname,
)
chars.append(char)

if lt_obj is flatten_lt_objs[-1]:
Expand Down
3 changes: 3 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@
# test PDF for rect extraction generated by sphinx-simplepdf
PDF_RECTS_EXTRACTION = Path(__file__).parent / "pdf" / "test_rects_extraction.pdf"

# test PDF for color style info
PDF_COLOR_STYLE = Path(__file__).parent / "pdf" / "test_words_color_style.pdf"


@pytest.fixture(scope="session")
def load_full_features_pdf(
Expand Down
Binary file added tests/pdf/test_words_color_style.odt
Binary file not shown.
Binary file added tests/pdf/test_words_color_style.pdf
Binary file not shown.
143 changes: 143 additions & 0 deletions tests/test_word_colors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
"""Test color and font information of words, lines and boxes."""

import libpdf
from tests.conftest import PDF_COLOR_STYLE


def test_colors_0() -> None:
    """Check that the heading "Color in Text and Heading" is extracted as red.

    NOTE(review): the colour assertion only runs for chapters with exactly
    that title; if none matches, the test passes without checking a colour.
    """
    document = libpdf.load(PDF_COLOR_STYLE)
    assert document is not None
    assert document.flattened.chapters

    for chapter in document.flattened.chapters:
        if chapter.title != "Color in Text and Heading":
            continue
        # the heading's own textbox carries the aggregated colour
        assert chapter.textbox.ncolor == (1, 0, 0)


def test_colors_1() -> None:
    """Check paragraph colours inside the "HorizontalLine" chapter.

    The paragraph containing "Paragraph text is blue" must be blue and the
    one containing "This chapter is for" must be black.

    NOTE(review): nothing is asserted when no chapter or paragraph matches.
    """
    document = libpdf.load(PDF_COLOR_STYLE)
    assert document is not None
    assert document.flattened.chapters

    for chapter in document.flattened.chapters:
        if chapter.title != "HorizontalLine":
            continue
        for element in chapter.content:
            # only paragraphs are of interest here
            if element.type != "paragraph":
                continue
            if "Paragraph text is blue" in element.textbox.text:
                assert element.textbox.ncolor == (0, 0, 1)
            if "This chapter is for" in element.textbox.text:
                assert element.textbox.ncolor == (0, 0, 0)


def test_colors_2() -> None:
    """Check box-level colour aggregation for uniformly and mixed coloured boxes.

    Paragraphs in chapter "HorizontalBox" must be green as a whole.
    Paragraphs in chapter "UncoloredHorizontalbox" must have no box colour,
    while each of their lines still has one.

    NOTE(review): nothing is asserted when no chapter title matches.
    """
    document = libpdf.load(PDF_COLOR_STYLE)
    assert document is not None
    assert document.flattened.chapters

    for chapter in document.flattened.chapters:
        title = chapter.title
        if title == "HorizontalBox":
            uniformly_colored = True
        elif title == "UncoloredHorizontalbox":
            uniformly_colored = False
        else:
            continue

        for element in chapter.content:
            if element.type != "paragraph":
                continue
            if uniformly_colored:
                assert element.textbox.ncolor == (0, 1, 0)
            else:
                # lines differ in colour, so the box has none of its own
                assert element.textbox.ncolor is None
                for line in element.textbox.lines:
                    assert line.ncolor is not None


def test_colors_3() -> None:
    """Check per-word colours in the paragraph "This line has no color ...".

    The paragraph's box must have no colour, while selected words carry
    their individual colours.

    NOTE(review): nothing is asserted when no chapter or paragraph matches.
    """
    # word text -> colours accepted for that word
    accepted_colors = {
        "has": ((0, 0, 1),),
        "color": ((0, 1, 0), (0, 0, 0)),
        "changes": ((1, 0, 0),),
        "words": ((0, 0, 1),),
    }

    document = libpdf.load(PDF_COLOR_STYLE)
    assert document is not None
    assert document.flattened.chapters

    for chapter in document.flattened.chapters:
        if "Words" not in chapter.title:
            continue
        for element in chapter.content:
            if element.type != "paragraph":
                continue
            if "This line has no color" not in element.textbox.text:
                continue

            assert element.textbox.ncolor is None

            for word in element.textbox.words:
                allowed = accepted_colors.get(word.text)
                if allowed is not None:
                    assert word.ncolor in allowed


def test_colors_4() -> None:
    """Check the paragraph "This words have no color ...".

    The box must have no colour, and every word must be colourless or black.

    NOTE(review): nothing is asserted when no chapter or content matches.
    """
    document = libpdf.load(PDF_COLOR_STYLE)
    assert document is not None
    assert document.flattened.chapters

    for chapter in document.flattened.chapters:
        if "Words" not in chapter.title:
            continue
        for element in chapter.content:
            if "This words have no color" not in element.textbox.text:
                continue

            assert element.textbox.ncolor is None

            for word in element.textbox.words:
                assert word.ncolor in (None, (0, 0, 0))


def test_colors_5() -> None:
    """Check per-word colours in the paragraph "These words are printed ...".

    The box must have no colour, while selected words carry their
    individual colours.

    NOTE(review): nothing is asserted when no chapter or content matches.
    """
    # word text -> expected colour of that word
    expected_color = {
        "words": (0, 1, 0),
        "but": (0, 1, 0),
        "printed": (0, 0, 1),
        "background": (1, 0, 0),
    }

    document = libpdf.load(PDF_COLOR_STYLE)
    assert document is not None
    assert document.flattened.chapters

    for chapter in document.flattened.chapters:
        if "Words" not in chapter.title:
            continue
        for element in chapter.content:
            if "These words are printed" not in element.textbox.text:
                continue

            assert element.textbox.ncolor is None

            for word in element.textbox.words:
                if word.text in expected_color:
                    assert word.ncolor == expected_color[word.text]


def test_colors_6() -> None:
    """Check that bold and italic words are recognisable by their font name.

    In chapters whose title contains "Styled Text":

    * in the paragraph containing "bold text format", only the word "bold"
      may have "Bold" in its font name;
    * in the paragraph containing "italic text format", only the word
      "italic" may have "Italic" in its font name.

    NOTE(review): nothing is asserted when no chapter or paragraph matches.
    """
    objects = libpdf.load(PDF_COLOR_STYLE)
    assert objects is not None
    assert objects.flattened.chapters

    for chapter in objects.flattened.chapters:
        if "Styled Text" in chapter.title:
            for content in chapter.content:
                if "bold text format" in content.textbox.text:
                    for word in content.textbox.words:
                        if word.text == "bold":
                            assert "Bold" in word.fontname
                        else:
                            assert "Bold" not in word.fontname
                elif "italic text format" in content.textbox.text:
                    # Iterate the words of THIS paragraph; without the loop,
                    # `word` would be a leftover from the bold branch (or
                    # undefined if that branch never ran).
                    for word in content.textbox.words:
                        if word.text == "italic":
                            assert "Italic" in word.fontname
                        else:
                            assert "Italic" not in word.fontname
                elif "underline text format" in content.textbox.text:
                    # this seems to be extracted as rect
                    pass

0 comments on commit 62086d0

Please sign in to comment.