Skip to content

Commit

Permalink
Merge pull request #42 from wrznr/issue_0041
Browse files Browse the repository at this point in the history
Allow local (file) paths as source for linked full texts
  • Loading branch information
wrznr authored May 26, 2020
2 parents 0590a85 + c646567 commit 21e8bd0
Show file tree
Hide file tree
Showing 8 changed files with 102 additions and 40 deletions.
15 changes: 13 additions & 2 deletions Changelog
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,28 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
## [0.1.1] - 2020-05-11
### Added
- Treat nested AMD-type (non-logical) divs in logical struct map (i.e.
newspaper case)
- Make full text file group selectable by user
- Allow for file entries (in addition to URLs) in METS
- Add special treatment for URNs and VD IDs
- Add poor man's namespace versioning handling

### Changed
- Make extraction of subtitles conditional on their presence
- Use "licence" for all types of licences (even unknown ones)

### Fixed
- https://github.com/slub/mets-mods2tei/issues/28
- https://github.com/slub/mets-mods2tei/issues/37
- https://github.com/slub/mets-mods2tei/issues/39
- https://github.com/slub/mets-mods2tei/issues/41

## [0.1.0] - 2019-12-04
### Added
- Correctly Place structures which are not on top of a page
- Correctly place structures which are not on top of a page
- Set `corresp` and `facs` attributes of `pb` elements
- Store links to `DEFAULT` images in METS
- Tests for new functionality
Expand Down
19 changes: 8 additions & 11 deletions mets_mods2tei/api/alto.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,18 @@

import os
import logging
import re
import Levenshtein

ns = {
'xlink' : "http://www.w3.org/1999/xlink",
'alto': "http://www.loc.gov/standards/alto/ns-v2#",
'alto': "http://www.loc.gov/standards/alto/ns-v4#",
}
XLINK = "{%s}" % ns['xlink']
ALTO = "{%s}" % ns['alto']

norm_alto_ns_re = re.compile(rb'alto/ns-v.#')

class Alto:

def __init__(self):
Expand Down Expand Up @@ -47,7 +50,8 @@ def read(cls, source):
if hasattr(source, 'read'):
return cls.fromfile(source)
if os.path.exists(source):
return cls.fromfile(source)
with open(source, 'rb') as f:
return cls.fromfile(f)

@classmethod
def fromfile(cls, path):
Expand All @@ -65,7 +69,7 @@ def _fromfile(self, path):
:param str path: Path to an ALTO document.
"""
parser = etree.XMLParser(remove_blank_text=True)
self.tree = etree.parse(path, parser)
self.tree = etree.XML(norm_alto_ns_re.sub(b"alto/ns-v4#", path.read()), parser)
self.path = path

def get_text_blocks(self):
Expand All @@ -88,14 +92,7 @@ def get_text_in_line(self, line):
Returns the ALTO-encoded text.
:param Element line: The line to extract the text from.
"""
line_text = ""
for element in line.xpath("./alto:String|./alto:SP", namespaces=ns):
if element.tag == "%sString" % ALTO:
line_text += element.get("CONTENT")
elif element.tag == "%sSP" % ALTO:
line_text += " "
#line_text += "\n"
return line_text
return " ".join(element.get("CONTENT") for element in line.xpath("./alto:String", namespaces=ns))

def __compute_fuzzy_distance(self, text1, text2):
"""
Expand Down
34 changes: 23 additions & 11 deletions mets_mods2tei/api/mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,10 @@ def __init__(self):
The constructor.
"""
self.map = {}
filep = open(os.path.realpath(resource_filename(Requirement.parse("mets_mods2tei"), 'mets_mods2tei/data/iso15924-utf8-20180827.txt')))
reader = csv.DictReader(filter(lambda row: row[0]!='#', filep), delimiter=';', quoting=csv.QUOTE_NONE, fieldnames=['code','index','name_eng', 'name_fr', 'alias', 'Age', 'Date'])
for row in reader:
self.map[row['code']] = row['name_eng']
filep.close()
with open(os.path.realpath(resource_filename(Requirement.parse("mets_mods2tei"), 'mets_mods2tei/data/iso15924-utf8-20180827.txt'))) as filep:
reader = csv.DictReader(filter(lambda row: row[0]!='#', filep), delimiter=';', quoting=csv.QUOTE_NONE, fieldnames=['code','index','name_eng', 'name_fr', 'alias', 'Age', 'Date'])
for row in reader:
self.map[row['code']] = row['name_eng']

def get(self, code):
"""
Expand All @@ -55,6 +54,7 @@ def __init__(self):
self.img_map = {}
self.alto_map = {}
self.struct_links = {}
self.fulltext_group_name = 'FULLTEXT'

self.title = None
self.sub_titles = None
Expand Down Expand Up @@ -87,21 +87,21 @@ def read(cls, source):
:param source: METS (file) source.
"""
if hasattr(source, 'read'):
return cls.fromfile(source)
return cls.from_file(source)
if os.path.exists(source):
return cls.fromfile(source)
return cls.from_file(source)

@classmethod
def fromfile(cls, path):
def from_file(cls, path):
"""
Reads in METS from a given file source.
:param str path: Path to a METS document.
"""
i = cls()
i.__fromfile(path)
i.fromfile(path)
return i

def __fromfile(self, path):
def fromfile(self, path):
"""
Reads in METS from a given file source.
:param str path: Path to a METS document.
Expand Down Expand Up @@ -271,7 +271,7 @@ def __spur(self):

# fulltext
fulltext_map = {}
fulltext_group = self.tree.xpath("//mets:fileGrp[@USE='FULLTEXT']", namespaces=ns)
fulltext_group = self.tree.xpath("//mets:fileGrp[@USE='%s']" % self.fulltext_group_name, namespaces=ns)
if fulltext_group:
fulltext_map = {}
for entry in fulltext_group[0].xpath("./mets:file", namespaces=ns):
Expand Down Expand Up @@ -300,6 +300,18 @@ def __spur(self):
self.struct_links[sm_link.get("%sfrom" % XLINK)] = []
self.struct_links[sm_link.get("%sfrom" % XLINK)].append(sm_link.get("%sto" % XLINK))

@property
def fulltext_group_name(self):
"""
Return the currently configured full-text-related
file group use attribute.
"""
return self.__fulltext_group_name

@fulltext_group_name.setter
def fulltext_group_name(self, fulltext_use):
self.__fulltext_group_name = fulltext_use

def get_main_title(self):
"""
Return the main title of the work.
Expand Down
20 changes: 18 additions & 2 deletions mets_mods2tei/api/tei.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
import logging
import copy

from contextlib import closing
from urllib.request import urlopen
from urllib.parse import urlparse
from pkg_resources import resource_filename, Requirement

from .alto import Alto
Expand Down Expand Up @@ -602,8 +604,22 @@ def __add_ocr_to_node(self, node, mets):
alto_link = mets.get_alto(struct_link)
# only collect ocr from a file once!
if not alto_link in self.alto_map:
f = urlopen(alto_link)
alto = Alto.read(f)
try:
sections = urlparse(alto_link)
except:
continue

# use urlopen for both paths and URLs
if not sections.scheme:
mod_link = 'file:' + alto_link
else:
mod_link = alto_link
self.logger.debug(mod_link)

with closing(urlopen(mod_link)) as f:
alto = Alto.read(f)

# save original link!
self.alto_map[alto_link] = alto

pb = etree.SubElement(node, "%spb" % TEI)
Expand Down
7 changes: 5 additions & 2 deletions mets_mods2tei/scripts/mets_mods2tei.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
@click.command()
@click.argument('mets', required=True)
@click.option('-o', '--ocr', is_flag=True, default=False, help="Serialize OCR into resulting TEI")
@click.option('-T', '--text-group', default="FULLTEXT", help="File group which contains the full text")
@click.option('-l', '--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARN', 'ERROR', 'OFF']), default='WARN')
def cli(mets, ocr, log_level):
def cli(mets, ocr, text_group, log_level):
""" METS: File containing or URL pointing to the METS/MODS XML to be converted """

#
Expand All @@ -29,7 +30,9 @@ def cli(mets, ocr, log_level):

#
# read in METS
mets = Mets.read(f)
mets = Mets()
mets.fulltext_group_name = text_group
mets.fromfile(f)

#
# create TEI (from skeleton)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

setup(
name='mets-mods2tei',
version='0.1.0',
version='0.1.1',
description='Convert digital documents in METS/MODS format to TEI',
long_description=open('README.md').read(),
long_description_content_type="text/markdown",
Expand Down
20 changes: 10 additions & 10 deletions tests/test_alto.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,41 +38,41 @@ def test_reading_local_file(datadir):
'''
Test reading a local alto file
'''
f = open(datadir.join('test_alto.xml'))
alto = Alto.read(f)
with open(datadir.join('test_alto.xml'), 'rb') as f:
alto = Alto.read(f)
assert(alto.tree is not None)

def test_loading_local_file(datadir):
'''
Test loading a local alto file
'''
f = open(datadir.join('test_alto.xml'))
alto = Alto.fromfile(f)
with open(datadir.join('test_alto.xml'), 'rb') as f:
alto = Alto.read(f)
assert(alto.tree is not None)

def test_text_block_extraction(datadir):
'''
Test the extraction of text blocks
'''
f = open(datadir.join('test_alto.xml'))
alto = Alto.fromfile(f)
with open(datadir.join('test_alto.xml'), 'rb') as f:
alto = Alto.read(f)
assert(len(list(alto.get_text_blocks())) == 1)

def test_text_line_extraction(datadir):
'''
Test the extraction of text lines
'''
f = open(datadir.join('test_alto.xml'))
alto = Alto.fromfile(f)
with open(datadir.join('test_alto.xml'), 'rb') as f:
alto = Alto.read(f)
text_block = list(alto.get_text_blocks())[0]
assert(len(list(alto.get_lines_in_text_block(text_block))) == 26)

def test_text_line_text_extraction(datadir):
'''
Test the extraction of text from text lines
'''
f = open(datadir.join('test_alto.xml'))
alto = Alto.fromfile(f)
with open(datadir.join('test_alto.xml'), 'rb') as f:
alto = Alto.read(f)
text_block = list(alto.get_text_blocks())[0]
text_line = list(alto.get_lines_in_text_block(text_block))[0]
assert(alto.get_text_in_line(text_line) == "Vorbericht.")
25 changes: 24 additions & 1 deletion tests/test_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,32 @@ def test_loading_local_file(datadir):
Test loading a local mets file
'''
f = open(datadir.join('test_mets.xml'))
mets = Mets.fromfile(f)
mets = Mets.from_file(f)
assert(mets.mets is not None)

def test_intermediate_file_loading(datadir):
'''
Test loading a local mets file into an already constructed Mets instance
'''
f = open(datadir.join('test_mets.xml'))
mets = Mets()
mets.fromfile(f)
assert(mets.mets is not None)

def test_fulltext_group_name(subtests, datadir):
'''
Test getting and setting the full text group name
'''
f = open(datadir.join('test_mets.xml'))
mets = Mets.read(f)

with subtests.test("Check getter"):
assert(mets.fulltext_group_name == "FULLTEXT")

with subtests.test("Check setter"):
mets.fulltext_group_name = "TEXT"
assert(mets.fulltext_group_name == "TEXT")

def test_mappings(subtests, datadir):
'''
Test the correct interpretation of the structural linking
Expand Down

0 comments on commit 21e8bd0

Please sign in to comment.