Skip to content

Commit

Permalink
Format
Browse files Browse the repository at this point in the history
  • Loading branch information
oliverkinch committed Feb 13, 2024
1 parent 2c88617 commit 5a6e8ee
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 13 deletions.
24 changes: 13 additions & 11 deletions src/doms_databasen/text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,9 @@ def _get_images(self, pdf_path: Path | str) -> List[np.ndarray]:
images = list(map(np.array, convert_from_path(pdf_path=pdf_path, dpi=DPI)))

# Grayscale
images = list(map(lambda image: cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), images))
images = list(
map(lambda image: cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), images)
)
return images

def _find_tables(self, image: np.ndarray, read_tables: bool = False) -> List[dict]:
Expand Down Expand Up @@ -1834,7 +1836,7 @@ def _process_crop_before_read(
scale = self._get_scale(box_length=box_length)
crop_scaled = self._scale_image(image=crop_refined, scale=scale)

# Ensure that highest pixel value is 255, else
# Ensure that highest pixel value is 255, else
# sharpening might not work as expected.
crop_scaled = np.array(crop_scaled / crop_scaled.max() * 255, dtype=np.uint8)

Expand Down Expand Up @@ -2051,7 +2053,7 @@ def _remove_black_border(self, blob_image: np.ndarray) -> np.ndarray:
return blob_image

def _split_blob_to_multiple_boxes(self, blob: RegionProperties) -> List[dict]:
"""This function is called if a blob is not split
"""This function is called if a blob is not split
correctly with initial methods.
Args:
Expand Down Expand Up @@ -2454,9 +2456,9 @@ def _remove_boundary_noise(
) -> np.ndarray:
"""Removes noise on the boundary of an anonymized box.
All white pixels in a perfect bounding box
All white pixels in a perfect bounding box
should be a pixel of a relevant character.
Some images have white pixel defects at the
Some images have white pixel defects at the
boundary of the bounding box, and
this function removes those white pixels.
Expand Down Expand Up @@ -2511,7 +2513,7 @@ def _too_few_pixels(self, blob: RegionProperties, touches_boundary: bool) -> boo
Returns:
bool:
True if blob has too few pixels to
True if blob has too few pixels to
be a relevant character. False otherwise.
"""
coords = blob.coords
Expand All @@ -2535,7 +2537,7 @@ def _low_longest_distance_from_boundary(
Returns:
bool:
True if blob has a low longest distance from the
True if blob has a low longest distance from the
boundary of the image. False otherwise.
"""
n = min(crop.shape)
Expand All @@ -2546,11 +2548,11 @@ def _maximum_distance_from_boundary(
) -> float:
"""Get maximum distance from blob to boundary of image.
E.g. if the minimum distance from the blob to
E.g. if the minimum distance from the blob to
the top boundary of the image is 5,
and the minimum distance from the blob to
and the minimum distance from the blob to
the bottom boundary of the image is 10,
to the left boundary is 3, and to the right
to the left boundary is 3, and to the right
boundary is 7, then the maximum distance
from the blob to the boundary of the image is 10.
Expand Down Expand Up @@ -2837,7 +2839,7 @@ def _read_text_with_tika(pdf_path: str) -> str:
except:
pass
return text.strip()

@staticmethod
def _get_text_from_pages(pages: dict) -> str:
"""Get text from pages.
Expand Down
2 changes: 1 addition & 1 deletion src/scripts/finalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def main(config: DictConfig) -> None:

def _get_text(processed_data: dict, config: DictConfig) -> Tuple[str, str]:
"""Get `text` and `text_anon` from processed data.
Args:
processed_data (dict):
Processed data for a case.
Expand Down
4 changes: 3 additions & 1 deletion src/scripts/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ def main(config: DictConfig) -> None:
elif config.process.case_id:
processor.process(config.process.case_id)
else:
logger.info("Please specify either a 'case_id' or use 'all' to process all cases.")
logger.info(
"Please specify either a 'case_id' or use 'all' to process all cases."
)

logger.info("Processing done!")

Expand Down

0 comments on commit 5a6e8ee

Please sign in to comment.