Princeton-CDH/ppa-django

View on GitHub
ppa/archive/management/commands/check_hathi_excerpts.py

Summary

Maintainability
A
2 hrs
Test Coverage
import csv

from django.core.management.base import BaseCommand
from pairtree import storage_exceptions

from intspan import intspan

from ppa.archive.models import DigitizedWork
from ppa.archive.templatetags.ppa_tags import hathi_page_url


class Command(BaseCommand):
    """Check page alignment for excerpted HathiTrust digitized items."""

    help = __doc__

    #: normal verbosity level
    v_normal = 1
    #: verbosity level for the current run; defaults to 1 / normal
    verbosity = v_normal

    def handle(self, *args, **kwargs):
        """Generate a CSV report comparing each excerpt's configured digital
        page range against the page labels in its HathiTrust METS structmap,
        proposing a corrected range where an offset can be computed.

        Writes ``ppa-excerpt-pagecheck.csv`` in the current directory; one row
        per excerpted, public HathiTrust work.
        """
        self.verbosity = kwargs.get("verbosity", self.verbosity)

        # find all excerpted, non-suppressed hathi volumes
        hathi_vols = DigitizedWork.objects.filter(
            source=DigitizedWork.HATHI,
            status=DigitizedWork.PUBLIC,
        ).exclude(item_type=DigitizedWork.FULL)

        output_fields = [
            "source_id",
            "unique_id",
            "pages_orig",
            "pages_digital",
            "orig_label_match",
            "pages_digital_corrected",
            "old_hathi_start",
            "new_hathi_start",
            "notes",
        ]

        report_filename = "ppa-excerpt-pagecheck.csv"
        # newline="" is required by the csv module to avoid extra blank
        # rows on platforms with \r\n line endings
        with open(report_filename, "w", newline="") as csvfile:
            csvwriter = csv.DictWriter(csvfile, fieldnames=output_fields)
            csvwriter.writeheader()

            for digwork in hathi_vols:
                info = {
                    "source_id": digwork.source_id,
                    # source id + first page (currently digital, will be switching to original)
                    "unique_id": digwork.index_id(),
                    "pages_orig": digwork.pages_orig,
                    "pages_digital": digwork.pages_digital,
                    "old_hathi_start": hathi_page_url(
                        digwork.source_id, digwork.first_page_digital()
                    ),
                }
                # NOTE: mets loading copied from hathi_page_index_data method
                # worth moving to a method on the hathi object?
                try:
                    mmets = digwork.hathi.mets_xml()
                except storage_exceptions.ObjectNotFoundException:
                    # document the error in the output csv, stop processing
                    info["notes"] = "pairtree data not found"
                    csvwriter.writerow(info)
                    continue
                except storage_exceptions.PartNotFoundException:
                    info["notes"] = "error loading mets file (part not found)"
                    csvwriter.writerow(info)
                    continue

                # make a list of page labels and order from mets structmap
                page_info = [
                    {"order": page.order, "label": page.orderlabel}
                    # also have access to label (@LABEL vs @ORDERLABEL)
                    for page in mmets.structmap_pages
                ]
                if not page_info:
                    # no pages in the structmap; nothing to compare against
                    info["notes"] = "no pages in METS structmap"
                    csvwriter.writerow(info)
                    continue

                # use digital page range to get the first page in the mets
                # that would be included with current digital range (1-based index)
                try:
                    excerpt_first_page = page_info[digwork.first_page_digital() - 1]
                except IndexError:
                    # requested start is past the end of the structmap;
                    # fall back to the last page so we can still report something
                    excerpt_first_page = page_info[-1]
                    info["notes"] = "digital page out of range; trying last page"

                # some mets records don't have labels
                # or, label attribute may be present but empty
                # do we need to check if all pages are missing labels?
                if (
                    excerpt_first_page["label"] is None
                    or excerpt_first_page["label"].strip() == ""
                ):
                    # add a note that mets doesn't have labels, stop processing
                    info["notes"] = "no page label in METS structmap"
                    csvwriter.writerow(info)
                    continue

                # check if METS page label for the first page in range
                # matches the desired first original page
                if excerpt_first_page["label"] != str(digwork.first_page_original()):
                    info["orig_label_match"] = "N"
                    # if they don't match, can we calculate the offset?
                    # (only works for numeric page labels)
                    try:
                        diff = int(digwork.first_page_original()) - int(
                            excerpt_first_page["label"]
                        )
                        # calculate the expected new digital page range
                        # - apply the difference to each number in range,
                        #   since we do have some discontinuous ranges
                        # - convert back to intspan so we can output in
                        #   page range format (1-3 or 1-3,5)
                        new_range = [n + diff for n in digwork.page_span]
                        info["pages_digital_corrected"] = intspan(new_range)
                        info["new_hathi_start"] = hathi_page_url(
                            digwork.source_id, new_range[0]
                        )
                    except ValueError as err:
                        # non-numeric page label(s); offset cannot be computed
                        info["notes"] = "could not calculate page offset (%s)" % err

                else:
                    info["orig_label_match"] = "Y"
                    info["notes"] = "page labels match"

                # either way, write out the info
                csvwriter.writerow(info)

        self.stdout.write(f"Excerpt page check report available in {report_filename}")