hamlet/citations/extract_refs.py
import multiprocessing as mp
import os
import pickle
import string

from refextract import extract_references_from_string

from django.conf import settings

localpath = os.path.join(settings.PROJECT_DIR, 'neural', 'files', 'main')


# Categorizing references as good or bad ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _enough_punctuation(ref):
    # Determine whether a string has a high enough percentage of punctuation
    # to be a plausible citation.
    #
    # Citations have a much higher percentage of punctuation than normal
    # free text, as citation formats dictate that they are mostly short strings
    # (like years and personal names) set off by commas, parentheses, etc.
    # This test has a moderate false positive rate but an extremely low false
    # negative rate.
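    #
    # As a rough illustration, an entry like
    # "Smith, J. (2001). On citations. J. Docs, 5(2), 12-34." is roughly a
    # quarter punctuation by character count, while ordinary running prose
    # usually falls well below the 10% threshold applied below.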
    punct = len([c for c in ref if c in string.punctuation])
    total = len(ref)
    if not total:
        # Guard against empty strings so the ratio below can't divide by zero.
        return False
    return round(100 * punct / total) >= 10


def _classify_ref(handle, refdict, goodrefs, badrefs):
    # Determine whether a given candidate citation should be categorized as
    # good or bad.
    #
    # :param handle: The filename of the current thesis.
    # :type handle: str
    # :param refdict: A dictionary representing a single candidate citation.
    # :type refdict: dict
    # :param goodrefs: The current believed-good citations.
    # :type goodrefs: dict. Its keys are handles; its values are lists of
    #                 dicts.
    # :param badrefs: The current believed-bad citations.
    # :type badrefs: dict. Per goodrefs.
    # :rtype: tuple of (goodrefs, badrefs), each a dict as described above.
    try:
        ref = refdict['raw_ref'][0]
    except KeyError:
        return goodrefs, badrefs

    if all([len(ref) <= 500,
            len(ref) >= 30,
            not ref.lower().startswith('table'),
            not ref.lower().startswith('figure'),
            _enough_punctuation(ref),
            ref.upper() != ref]):
        goodrefs.setdefault(handle, []).append(refdict)
    else:
        badrefs.setdefault(handle, []).append(refdict)

    return goodrefs, badrefs


def _verify_reftuple_format(reftuples):
    # Ensure that reftuples has the data structure we expect.
    # This is a smoke test - it's just checking the first member, not all of
    # them.
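    #
    # The expected shape, with placeholder values:
    #   [('some_handle.txt', [{'raw_ref': ['Smith, J. ...'], ...}, ...]), ...]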
    assert isinstance(reftuples, list)
    first_tuple = reftuples[0]
    assert isinstance(first_tuple, tuple)
    assert isinstance(first_tuple[0], str)
    assert isinstance(first_tuple[1], list)
    assert isinstance(first_tuple[1][0], dict)
    assert 'raw_ref' in first_tuple[1][0].keys()


def _find_candidate_refs(reftuples):
    # Given citation data associated with various filenames, classify the
    # citations as good or bad.
    #
    # :param reftuples: list of tuples of (filename, list of dicts of citation
    # data).
    # :rtype: tuple of two dicts. Each dict has filenames as keys; its values
    # are lists of dicts of citation data.
    if not reftuples:
        return {}, {}

    goodrefs = {}
    badrefs = {}

    for reftuple in reftuples:
        try:
            handle = reftuple[0]
            reflist = reftuple[1]
            for refdict in reflist:
                goodrefs, badrefs = \
                    _classify_ref(handle, refdict, goodrefs, badrefs)
        except TypeError:
            pass

    return goodrefs, badrefs


def _reprocess_bad(good, bad):
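    # Try to recover good citations that were wrongly classified as bad
    # because they were split across lines during extraction. Adjacent bad
    # fragments are joined and re-run through extract_references_from_string;
    # anything the re-extraction classifies as good is merged into the good
    # set.
    #
    # :param good: believed-good citations. dict; keys are handles, values
    #              are lists of dicts.
    # :param bad: believed-bad citations. Per good.
    # :rtype: tuple of (good, bad), both updated.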
    for handle in bad.keys():
        filteredbad = [refdict for refdict in bad[handle]
                       if refdict and len(refdict['raw_ref'][0]) < 200]

        for x in range(0, len(filteredbad) - 1):
            testref = '{} {}'.format(filteredbad[x]['raw_ref'][0],
                                     filteredbad[x + 1]['raw_ref'][0])
            if len(testref) < 200:
                ref = extract_references_from_string(testref)

                local_good, local_bad = _find_candidate_refs([(handle, ref)])
                # Merge the re-extracted refs back in as flat lists of dicts,
                # matching the structure used everywhere else.
                good.setdefault(handle, []).extend(local_good.get(handle, []))
                bad.setdefault(handle, []).extend(local_bad.get(handle, []))

    return good, bad


def _find_good_refs(reftuples):
    # Given a list of tuples of (filename, list of citation data dicts),
    # find the probably good citation data.
    #
    # :param reftuples: A list of 2-tuples. Each tuple is (filename, list of
    # candidate references). Each candidate reference is a dict.
    # :rtype: dict. The keys of the dict are filenames; the values are lists of
    # dicts of believed-good citation data.
    _verify_reftuple_format(reftuples)

    good_candidates, bad_candidates = _find_candidate_refs(reftuples)

    # At this point the goodrefs are very good. The badrefs include some
    # goodrefs - particularly in cases where refs have been split over multiple
    # lines. Let's try to extract those.
    new_good, new_bad = _reprocess_bad(good_candidates, bad_candidates)

    print("Theses with good candidates: %d" % len(new_good))
    print("Theses with bad candidates: %d" % len(new_bad))

    print("Total good candidates: %d" %
        sum([len(x) for x in new_good.values()]))
    print("Total bad candidates: %d" %
        sum([len(x) for x in new_bad.values()]))

    return new_good


# Extracting references from files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _extract_refs(handle):
    # Extract potential references from ends of files. Return a tuple of
    # the file handle and the candidate references (a list of dicts).
    #
    # We don't want to parse the beginning, because if the extractor sees
    # "References" or "Bibliography" in the table of contents, it may conclude
    # it has found a reference section and parse the entire file for
    # references, which is unacceptably time-consuming.
    #
    # :param handle: name of a file containing thesis text (not the full path,
    # just the name).
    # :type handle: str.
    # :rtype: tuple or None
    filepath = os.path.join(localpath, handle)
    if os.path.isfile(filepath):
        try:
            popenstring = 'tail -n 1000 {}'.format(filepath)
            end_of_file = os.popen(popenstring).read()
            # The 'is_only_references' flag for this function, when False, is
            # supposed to increase accuracy for text that may contain things
            # other than the reference section (as ours does). It doesn't
            # seem to work, however.
            refs = extract_references_from_string(end_of_file)
            if refs:
                return (handle, refs)
        except Exception:
            # Extraction occasionally fails on malformed text; skip the file
            # rather than aborting the whole run.
            pass


# The main function ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def extract_good_refs(maxfiles):
    # Extracts believed-good citations from files and returns them.
    # Along the way, it pickles the extracted but unprocessed refs to simplify
    # debugging and future processing (the extraction is the time-consuming
    # step, so we persist its results).
    #
    # :param maxfiles: The maximum number of files to extract data from.
    # (Extracting citations from all 43K+ theses takes days, even parallelized,
    # so a shorter option is provided for testing purposes.)
    # :type maxfiles: int
    # :rtype: dict. The keys of the dict are filenames; the values are lists of
    # dicts of believed-good citation data.
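    #
    # Illustrative call and return shape (handle names are placeholders):
    #   good = extract_good_refs(maxfiles=100)
    #   # -> {'some_handle.txt': [{'raw_ref': ['Smith, J. ...'], ...}, ...]}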

    # Extract refs from files ----------------
    pool = mp.Pool(mp.cpu_count())
    if not maxfiles:
        file_list = os.listdir(localpath)
    else:
        file_list = os.listdir(localpath)[:maxfiles]

    results = pool.map(_extract_refs, file_list)

    # Even at 8 cores this data takes days to extract, so persist it now -
    # that way if there are any problems with the subsequent steps, you can
    # recover.
    with open("refs.p", "wb") as f:
        pickle.dump(results, f)

    pool.close()
    pool.join()

    # Find good refs -------------------------
    good = _find_good_refs(results)

    return good
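

# A minimal sketch (not part of the pipeline above) of how the pickled
# intermediate results written by extract_good_refs() could be reloaded to
# re-run only the classification step without repeating the slow extraction.
# The 'refs.p' filename matches the dump above; filtering out None entries is
# an assumption, since _extract_refs returns None for files it cannot parse.
#
#   with open('refs.p', 'rb') as f:
#       results = pickle.load(f)
#   good = _find_good_refs([r for r in results if r])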