# thatandromeda/hamlet
# hamlet/neural/train_neural_net.py
from glob import glob
import json
import logging
import os
import random
import re
import shutil
from string import punctuation
import time
import xml.etree.ElementTree as ET

from gensim.models.doc2vec import LabeledSentence, Doc2Vec
from nltk import sent_tokenize, WordPunctTokenizer
import requests
from tika import parser as tikaparser

from django.db.models import Count
from django.db.utils import DataError

from hamlet.theses.models import Thesis, Contribution, Person

# See https://medium.com/@klintcho/doc2vec-tutorial-using-gensim-ab3ac03d3a1

logger = logging.getLogger(__name__)

CUR_DIR = os.path.dirname(os.path.realpath(__file__))
with open(os.path.join(CUR_DIR, 'thesis_set_list.json'), 'r') as f:
    THESIS_SET_LIST = json.load(f)

DSPACE_OAI_IDENTIFIER = os.environ.get('DSPACE_OAI_IDENTIFIER')
DSPACE_OAI_URI = os.environ.get('DSPACE_OAI_URI')

# Where to put the files we train on.
FILES_DIR = 'files'

METS_NAMESPACE = {'mets': 'http://www.loc.gov/METS/',
                  'mods': 'http://www.loc.gov/mods/v3',
                  'oai': 'http://www.openarchives.org/OAI/2.0/',
                  'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
                  'dc': 'http://purl.org/dc/elements/1.1/',
                  'xsi': 'http://www.w3.org/2001/XMLSchema-instance'}
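
# Example (sketch): with the prefixes above registered in METS_NAMESPACE,
# namespaced elements can be queried directly with ElementTree, e.g.
#   dc = ET.fromstring(record_xml)   # record_xml: a raw OAI response string
#   creators = dc.findall('.//dc:creator', METS_NAMESPACE)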


class LabeledLineSentence(object):
    def __init__(self, subdir):
        doc_list = glob(os.path.join(CUR_DIR, FILES_DIR, subdir, '*'))

        self.doc_list = [doc for doc in doc_list if doc.endswith('.txt')]

    def __iter__(self):
        for filename in self.doc_list:
            with open(filename, 'r') as f:
                doc = f.read()
            yield LabeledSentence(words=self._tokenize(doc),
                                  tags=[os.path.basename(filename)])

    def _tokenize(self, doc):
        all_tokens = []
        sentences = sent_tokenize(doc)

        tokenizer = WordPunctTokenizer()
        for sentence in sentences:
            words = tokenizer.tokenize(sentence.lower())
            words = [word for word in words if word not in punctuation]
            all_tokens.extend(words)
        return all_tokens
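
    # Example usage (sketch): stream tagged documents from the 'training'
    # subdirectory for gensim to consume, e.g.
    #   sentences = LabeledLineSentence('training')
    #   for labeled in sentences:
    #       print(labeled.tags, labeled.words[:10])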


class MetadataWriter(object):
    DEGREE_OPTIONS = ["Bachelor's degree",
                      "Engineer's degree",
                      "Master's degree",
                      "Ph.D. / Sc.D."]

    def extract_contributors(self, metadata):
        # Includes advisor and department.
        contributors = metadata.findall('.//dc:contributor', METS_NAMESPACE)
        advisors = []
        departments = []

        for contributor in contributors:
            text = contributor.text
            if not text:
                continue
            if any(['Massachusetts Institute' in text,
                    'Dept' in text,
                    'Department' in text]):
                departments.append(text)
            else:
                advisors.append(text)
        return advisors, departments

    def extract_date(self, metadata):
        # There will be several (representing copyright, accessioning, etc.)
        # The copyright date will be a four-digit year. Find the earliest
        # year (there may be a substantial difference between copyright year
        # and archival processing years).
        date_format = r'^[0-9]{4}$'
        dates = metadata.findall('.//dc:date', METS_NAMESPACE)
        earliest = 20000  # sentinel: larger than any four-digit year
        for date in dates:
            if re.match(date_format, date.text):
                year = int(date.text)
                if year < earliest:
                    earliest = year
        return earliest

    def extract_degree_from_sets(self, item_sets):
        """Get degree from thesis set name if possible.

        This is easier than parsing the metadata but doesn't always work, so we
        try it first and fall back to metadata parsing if needed."""
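        # For example (hypothetical entry), THESIS_SET_LIST might map a set
        # spec to a name like "Electrical Engineering - Master's degree", in
        # which case split('-')[1].strip() yields "Master's degree".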
        for item_set in item_sets:
            try:
                candidate = THESIS_SET_LIST[item_set].split('-')[1].strip()
                if candidate in self.DEGREE_OPTIONS:
                    return candidate
            except (IndexError, KeyError):
                # If the set text isn't of the form "Department - Degree",
                # taking an element from the split() will fail. That's ok.
                continue

        return None

    def extract_degree(self, metadata, item_sets):
        if item_sets:
            degree = self.extract_degree_from_sets(item_sets)
            if degree:
                return degree

        try:
            result = [e.text for e in
                      metadata.findall('.//mods:note', METS_NAMESPACE) if
                      e.text is not None and
                      (e.text.startswith('Thesis') or
                       e.text.startswith('Massachusetts Institute of '
                                         'Technology'))]
        except AttributeError:
            result = None
        deg_text = result[0] if result else None

        return Thesis.extract_degree(deg_text) if deg_text else None

    def extract_identifier(self, metadata):
        identifiers = metadata.findall('.//dc:identifier', METS_NAMESPACE)
        id_str = None
        matcher = r'https?://hdl\.handle\.net/1721\.1/([0-9]*)'

        # There may be multiple identifiers; find the first that looks like a
        # handle.
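        # e.g. (illustrative): 'https://hdl.handle.net/1721.1/103715' would
        # yield the integer 103715.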
        for identifier in identifiers:
            try:
                id_str = re.match(matcher, identifier.text).groups()[0]
                return int(id_str)
            except AttributeError:
                continue

    def extract_metadata(self, metadata_dc, metadata_mets, item_sets):
        try:
            dc = ET.fromstring(metadata_dc)
            mets = ET.fromstring(metadata_mets)
        except ET.ParseError:
            return None

        authors = dc.findall('.//dc:creator', METS_NAMESPACE)
        advisors, departments = self.extract_contributors(dc)
        date = self.extract_date(dc)
        degree = self.extract_degree(mets, item_sets)
        id = self.extract_identifier(dc)
        title = self.extract_title(mets)
        url = self.extract_url(mets)

        return {'authors': [author.text for author in authors],
                'advisors': advisors,
                'date': date,
                'degree': degree,
                'departments': departments,
                'id': id,
                'title': title,
                'url': url}

    def extract_title(self, mets):
        title = mets.find('.//mods:title', METS_NAMESPACE)
        try:
            return title.text
        except AttributeError:
            return ''

    def extract_url(self, mets):
        record = mets.find('.//mets:file[@MIMETYPE="application/pdf"]/',
                           METS_NAMESPACE)

        # Do this instead of "if not record: return", because an Element with
        # no children is falsy *even when it exists*.
        try:
            url = record.get('{http://www.w3.org/1999/xlink}href')
        except AttributeError:
            return None

        if url:
            url = url.replace('http://', 'https://')

        return url

    def write(self, metadata_dc, metadata_mets, item_sets):
        datadict = self.extract_metadata(metadata_dc, metadata_mets, item_sets)
        if not datadict:
            return False
        # Catch non-thesis objects.
        required = ['title', 'url', 'date', 'id', 'degree', 'authors',
                    'advisors', 'departments']
        if not all([datadict[key] for key in required]):
            return False

        try:
            Thesis.objects.get(identifier=datadict['id'])
        except Thesis.DoesNotExist:
            try:
                thesis = Thesis.objects.create(
                    title=datadict['title'],
                    url=datadict['url'],
                    year=datadict['date'],
                    identifier=datadict['id'],
                    degree=datadict['degree']
                )
                print('Created {}'.format(thesis.id))
                thesis.add_people(datadict['authors'])
                thesis.add_people(datadict['advisors'], author=False)
                thesis.add_departments(datadict['departments'])
            except DataError as e:
                print('~~~~~~Failed; identifier was {}'.format(datadict['id']))
                print(e)

        return True
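
    # Example (sketch): given raw OAI-PMH responses for a single item,
    #   MetadataWriter().write(rdf_xml, mets_xml, ['hdl_1721.1_7582'])
    # (with a hypothetical set spec) returns True and creates a Thesis row
    # when the record looks like a thesis with complete metadata.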


class DocFetcher(object):
    DOCS_CACHE = {}
    WRITER = MetadataWriter()

    def get_network_files(self, args, start_date=None, end_date=None):
        print('Fetching network files')
        items = self.get_record_list(DSPACE_OAI_URI, start_date, end_date)
        parsed_items = self.parse_record_list(items)
        total_items_processed = 0

        for item in parsed_items:
            if item['handle'] not in self.DOCS_CACHE:
                self.DOCS_CACHE[item['handle']] = {}

            if not self.is_thesis(item):
                continue
            print('Processing {}'.format(item['handle']))

            total_items_processed += 1

            if 'textfile' not in self.DOCS_CACHE[item['handle']]:
                self.get_single_network_file(item, args)

    def get_record(self, dspace_oai_uri, dspace_oai_identifier, identifier,
                   metadata_format):
        '''Gets metadata record for a single item in OAI-PMH repository in
        specified metadata format.
        '''
        params = {'verb': 'GetRecord',
                  'identifier': dspace_oai_identifier + identifier,
                  'metadataPrefix': metadata_format}
        r = requests.get(dspace_oai_uri, params=params)
        return r.text

    def get_record_list(self, dspace_oai_uri, start_date=None,
                        end_date=None):
        '''Returns a list of record headers for items in OAI-PMH repository,
        harvested in the mets metadata format. Can optionally pass bounding
        dates to limit the harvest.
        '''
        params = {'verb': 'ListIdentifiers', 'metadataPrefix': 'mets'}

        if start_date:
            params['from'] = start_date
        if end_date:
            params['until'] = end_date

        r = requests.get(dspace_oai_uri, params=params)
        return r.text

    def get_single_network_file(self, item, args):
        if Thesis.objects.filter(identifier=item['identifier']).exists():
            return

        metadata_mets = self.get_record(DSPACE_OAI_URI, DSPACE_OAI_IDENTIFIER,
                                        item['identifier'], 'mets')

        # The available formats are oai_dc, qdc, rdf, ore, and mets. None of
        # them match the Dublin Core displayed at dspace.mit.edu, but rdf seems
        # to have the content we want. To get a list of all available formats,
        # issue a GET request to the OAI endpoint with
        # params={'verb': 'ListMetadataFormats'}.
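        # For example (sketch):
        #   requests.get(DSPACE_OAI_URI,
        #                params={'verb': 'ListMetadataFormats'}).text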
        metadata_dc = self.get_record(DSPACE_OAI_URI, DSPACE_OAI_IDENTIFIER,
                                      item['identifier'], 'rdf')

        if args['write_metadata']:
            outcome = self.write_metadata(metadata_dc,
                                          metadata_mets,
                                          item['sets'])
            if not outcome:
                return

    def is_thesis(self, item):
        '''Returns True if any set spec in the item's sets appears in
        THESIS_SET_LIST, otherwise returns False.
        '''
        # There are some things that are in the Thesis set but aren't really
        # theses (e.g. they are technical reports). These things will fail when
        # we try to extract their departments or degrees; we should filter them
        # out at that point.
        try:
            return self.DOCS_CACHE[item['handle']]['is_thesis']
        except KeyError:
            ans = any(s in THESIS_SET_LIST for s in item['sets'])
            self.DOCS_CACHE[item['handle']]['is_thesis'] = ans
            return ans

    def parse_record_list(self, record_xml):
        xml = ET.fromstring(record_xml)
        records = xml.findall('.//oai:header', METS_NAMESPACE)
        for record in records:
            handle = record.find('oai:identifier', METS_NAMESPACE).text\
                .replace('oai:dspace.mit.edu:', '').replace('/', '-')
            identifier = handle.replace('1721.1-', '')
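            # e.g. (illustrative values): 'oai:dspace.mit.edu:1721.1/103715'
            # becomes handle '1721.1-103715' and identifier '103715'.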
            setSpecs = record.findall('oai:setSpec', METS_NAMESPACE)
            sets = [s.text for s in setSpecs]
            yield {'handle': handle, 'identifier': identifier, 'sets': sets}

    def write_metadata(self, metadata_dc, metadata_mets, item_sets):
        return self.WRITER.write(metadata_dc, metadata_mets, item_sets)


class ModelTrainer(object):
    """Fetches thesis text and trains doc2vec models.

    Workflow:
    * Accept a queryset of Theses as an argument.
    * Fetch network data: if a thesis's text is already in the main
      directory, skip it; otherwise fetch the PDF, extract its text, and
      write it to the main directory.
    * Split test and training sets by copying files from the main directory
      into the test or training directory.
    * Train a model on each subdirectory.
    """
    MAIN_FILES_DIR = os.path.join(CUR_DIR, FILES_DIR, 'main')
    STARTING_FILES = os.listdir(MAIN_FILES_DIR)

    def __init__(self, files_subdirs=None):
        # If a list of subdirectory names is passed in, ModelTrainer will
        # split files into those directories and train a model on each.
        # If not, it will train on the entire contents of the main file
        # directory.
        if not files_subdirs:
            files_subdirs = ['main']

        self.FILES_SUBDIRS = files_subdirs

    def reset_directories(self):
        print('Resetting directories')
        training_dir = os.path.join(CUR_DIR, FILES_DIR, 'training')
        test_dir = os.path.join(CUR_DIR, FILES_DIR, 'test')
        shutil.rmtree(training_dir)
        shutil.rmtree(test_dir)
        os.mkdir(training_dir)
        os.mkdir(test_dir)

    def get_filename(self, thesis):
        return '1721.1-{}.txt'.format(thesis.identifier)

    def fetch_and_write_file(self, thesis, pdf_filepath):
        print('Fetching network file for thesis {}'.format(thesis.identifier))
        with open(pdf_filepath, 'wb') as f:
            r = requests.get(thesis.url, stream=True)
            r.raise_for_status()
            for chunk in r.iter_content(1024):
                f.write(chunk)
            f.flush()

    def extract_text(self, thesis):
        # If we've already extracted this file, let's not do it again.
        filename = self.get_filename(thesis)
        if filename in self.STARTING_FILES:
            print('File already gotten; continuing')
            return

        pdf_filepath = os.path.join(self.MAIN_FILES_DIR, 'temp.pdf')
        filepath = os.path.join(self.MAIN_FILES_DIR, filename)
        try:
            self.fetch_and_write_file(thesis, pdf_filepath)
        except (requests.RequestException, OSError):
            print('~~~~~~WARNING: Download failed for {}'.format(
                thesis.identifier))
            return

        print('Extracting pdf text from {}'.format(thesis.identifier))
        try:
            parsed = tikaparser.from_file(pdf_filepath)
        except Exception:
            thesis.unextractable = True
            thesis.save()
            return

        content = parsed.get('content')
        if not content:
            thesis.unextractable = True
            thesis.save()
            return

        with open(filepath, 'w') as f:
            f.write(content)

        os.remove(pdf_filepath)

    def split_data(self, queryset):
        """
        Randomly assigns to training or test set, by copying the file to the
        test or training directory. Weighted, such that 80% of objects end up
        in the training set.
        """
        # Clear test/training directories.
        self.reset_directories()

        print('Splitting test and training sets')
        # Sort theses into test/training directories. Throw out most of them
        # to save on memory usage during training and file size afterward -
        # we'll use the test set metric to gauge whether the trained model
        # generalizes. (The weights below assume FILES_SUBDIRS has exactly two
        # entries, e.g. ['training', 'test'].)
        for thesis in queryset:
            set_dir = random.choices(self.FILES_SUBDIRS + [None],
                                     weights=[3, 1, 6])[0]
            if not set_dir:
                continue
            filename = self.get_filename(thesis)
            filepath = os.path.join(self.MAIN_FILES_DIR, filename)
            destination = os.path.join(CUR_DIR, FILES_DIR, set_dir)
            shutil.copy2(filepath, destination)

    def get_iterator(self, subdir):
        return LabeledLineSentence(subdir)

    def inner_train_model(self, window, size, iterator, filename,
                          max_vocab_size=None):
        model = Doc2Vec(alpha=0.025,
                        # Alpha starts at `alpha` and decreases to
                        # `min_alpha`
                        min_alpha=0.025,
                        # Size of DBOW window (default=5).
                        window=window,
                        # Feature vector dimensionality (default=100).
                        size=size,
                        # Min word frequency for inclusion (default=5).
                        min_count=10,
                        max_vocab_size=max_vocab_size)

        print("Building vocab for %s..." % filename)
        model.build_vocab(iterator)

        print("Training %s..." % filename)
        model.train(iterator,
                    total_examples=model.corpus_count,
                    epochs=model.iter)

        fn = '{}_w{}_s{}.model'.format(filename, window, size)
        filepath = os.path.join(CUR_DIR, 'nets', fn)
        model.save(filepath)
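
    # Example (sketch): train a single model on the training subdirectory
    # with a 5-word window and 104-dimensional vectors, e.g.
    #   trainer = ModelTrainer(['training', 'test'])
    #   trainer.inner_train_model(5, 104, trainer.get_iterator('training'),
    #                             'demo_train')   # hypothetical filename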

    def train_model(self, filename, queryset=None):
        if queryset is None:
            queryset = Thesis.objects.all()
        # Don't bother with theses when we know we can't get text from them.
        queryset = queryset.filter(unextractable=False)
        print('About to extract all text')
        for thesis in queryset:
            self.extract_text(thesis)

        print('All text extracted; time to get our ML on')

        iterators = []
        if self.FILES_SUBDIRS:
            self.split_data(queryset)
            for subdir in self.FILES_SUBDIRS:
                iterators.append(self.get_iterator(subdir))
        else:
            iterators.append(self.get_iterator(None))

        for window in range(3, 10):
            for step in range(1, 5):
                size = step * 52  # Multiples of 4 have better performance.
                start_time = time.time()

                print('Training with parameters window={}, '
                      'size={}'.format(window, size))
                for iterator in iterators:
                    self.inner_train_model(window, size,
                                           iterator,
                                           '{}_train'.format(filename))

                print('Finished training, took {}'.format(
                    time.time() - start_time))


class Evaluator(object):
    """
    The Evaluator assumes that, if we have two theses A and B which we expect
    to be similar, and another C which we expect to be unlike A and B, a
    good model will place A and B much closer together than A and C or B and C.

    It:
    * takes a list of models and a queryset they were trained on
    * selects tuples (A, B, C) of suitable theses within the queryset
    * sums sim(A, B) - sim(A, C) and sim(A, B) - sim(B, C) for all tuples for
      the given model
    * uses that sum as a score
    * ranks the models according to that score (highest is best)

    To choose tuples:
    * A, B are chosen such that they share an advisor, and thus we assume they
      are on similar topics
    * C is randomly chosen from among theses with a different advisor, and thus
      on average we expect C to be more different from A and B than they are
      from one another.

    Why the math works:
    There are 4 basic scenarios.

    sim(A, B) high ; sim(A, C) low
        This is good! This means similar things are identified as such and
        there is a large separation between similar and dissimilar objects.
        The score metric comes out positive.

    sim(A, B) high ; sim(A, C) high
        This is bad! It means everything is similar to everything, which means
        the model is meaningless (at least for this example). The score metric
        comes out near zero.

    sim(A, B) low ; sim(A, C) low
        Also meaningless; also near zero.

    sim(A, B) low ; sim(A, C) high
        This is very bad! It means the model thinks similar theses are
        dissimilar, and vice versa. This is actively misleading. The score
        metric comes out negative.

    Note that for any individual tuple, there may be low-to-negative results
    even if the overall model is good. Because we haven't hand-inspected the
    theses, we don't know for sure that A/B *should* be similar, or that A/C
    *should* be dissimilar. That's why we use a bunch of tuples - we expect
    this to work on average.

    Note also that the model was NOT trained on who the advisors are (we could
    have used this as a label, but did not). This is important because we don't
    want the metric to be simply measuring how well the model learned known
    inputs. If we were to tell it about advisors, it would successfully
    separate theses with different advisors without necessarily learning
    anything about the semantics of thesis text, and the metric would be
    uninformative.
    """
    def __init__(self, model_list):
        self.model_list = model_list
        self.queryset = self.get_queryset()
        self.tuples = self.choose_tuples()
        self.scores = []
        self.tokenizer = LabeledLineSentence('fake')  # used only for tokenizer

    def get_queryset(self):
        """
        The queryset for the evaluator is the set of files that the neural net
        was *trained* on.

        It's important to restrict ourselves to these, and not to the entire
        universe of theses, because the test model only gives us meaningful
        information about whether the parameters have generalized well if we
        look at its performance on training-set data.
        """
        matcher = r'1721\.1-(\d+)\.txt'
        training_dir = os.path.join(CUR_DIR, FILES_DIR, 'training')
        training_files = os.listdir(training_dir)
        training_ids = [re.match(matcher, filename).groups()[0]
                        for filename in training_files]
        return Thesis.objects.filter(identifier__in=training_ids)

    def choose_tuples(self):
        """Choose up to 50 tuples we can use to evaluate our net."""
        print('Choosing tuples')
        tuples = []

        targets = Contribution.objects.filter(role=Contribution.ADVISOR,
                                              thesis__in=self.queryset)
        advisors = Person.objects.annotate(
            num_theses=Count('contribution__thesis')
        ).filter(
            contribution__in=targets, num_theses__gte=2
        ).distinct()

        count = 0

        for advisor in advisors:
            try:
                a, b = advisor.thesis_set.intersection(self.queryset)[0:2]
            except ValueError:
                # It's possible that, even if an advisor has advised 2 or more
                # theses, that the theses won't all be in our queryset. (For
                # example, some advisors supervise theses in multiple
                # departments.) If we encounter these, just move on to the next
                # option.
                continue

            theses = self.queryset.exclude(contribution__person=advisor)
            c = self.get_random_outsider_thesis(theses)
            tuples.append((a, b, c))
            count += 1

            if count == 50:
                break

        return tuples

    def get_random_outsider_thesis(self, theses):
        num = theses.count()
        while True:
            idx = random.randrange(0, num)
            c = theses[idx]
            filename = os.path.join(CUR_DIR, FILES_DIR, 'main',
                                    '1721.1-{}.txt'.format(c.identifier))
            if os.path.isfile(filename):
                return c

    def get_filename_from_identifier(self, identifier):
        return

    def get_tokens(self, doctag):
        filename = os.path.join(CUR_DIR, FILES_DIR, 'main', doctag)
        with open(filename, 'r') as f:
            doc = f.read()

        return self.tokenizer._tokenize(doc)

    def trained_similarities(self, model, a_label, b_label, c_label):
        a_to_b = model.docvecs.similarity(a_label, b_label)
        a_to_c = model.docvecs.similarity(a_label, c_label)
        b_to_c = model.docvecs.similarity(b_label, c_label)

        return a_to_b, a_to_c, b_to_c

    def untrained_similarities(self, model, a_tokens, b_tokens, c_tokens):
        a_to_b = model.docvecs.similarity_unseen_docs(
            model, a_tokens, b_tokens)
        a_to_c = model.docvecs.similarity_unseen_docs(
            model, a_tokens, c_tokens)
        b_to_c = model.docvecs.similarity_unseen_docs(
            model, b_tokens, c_tokens)

        return a_to_b, a_to_c, b_to_c

    def calculate_score(self, model, training):
        print('Scoring model')
        score = 0

        for a, b, c in self.tuples:
            a_label = '1721.1-{}.txt'.format(a.identifier)
            b_label = '1721.1-{}.txt'.format(b.identifier)
            c_label = '1721.1-{}.txt'.format(c.identifier)

            # If the documents are in the trained set, use the doc2vec
            # similarity() function. We don't want to use
            # similarity_unseen_docs for both test and training, because it
            # needs to use infer_vector, and the results of that are somewhat
            # unpredictable. In particular, the test data may score *better*
            # than the training data. The vector calculated during training is
            # more accurate and we should use it where available.
            if training:
                a_to_b, a_to_c, b_to_c = self.trained_similarities(
                    model, a_label, b_label, c_label)
            else:
                a_tokens = self.get_tokens(a_label)
                b_tokens = self.get_tokens(b_label)
                c_tokens = self.get_tokens(c_label)
                a_to_b, a_to_c, b_to_c = self.untrained_similarities(
                    model, a_tokens, b_tokens, c_tokens)

            subscore = 2 * a_to_b - a_to_c - b_to_c

            score += subscore

        return score

    def score_models(self):
        for filename in self.model_list:
            fullpath = os.path.join(CUR_DIR, 'nets', filename)
            model = Doc2Vec.load(fullpath)
            training = 'training' in fullpath
            score = self.calculate_score(model, training)

            self.scores.append((filename, score))

        self.scores.sort(key=lambda x: x[1], reverse=True)

    def pretty_print(self):
        print('       Model  |   Score')
        print('------------------------------')
        for scoretuple in self.scores:
            print('{} |   {}'.format(scoretuple[0], scoretuple[1]))
        print('------------------------------')
        print('🌈 🎉 🦄')

    def evaluate(self):
        print('About to score all models')
        self.score_models()
        self.pretty_print()


def write_metadata(args):
    fetcher = DocFetcher()
    fetcher.get_network_files(args)


def train_model(args):
    qs = args['queryset']
    fn = args['filename']
    subdirs = args['subdir_list']

    mt = ModelTrainer(subdirs)
    mt.train_model(fn, qs)

    model_list = os.listdir(os.path.join(CUR_DIR, 'nets'))
    model_list = [x for x in model_list
                  if x.startswith(fn) and x.endswith('.model')]
    evie = Evaluator(model_list)
    evie.evaluate()
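
# Example invocation (sketch; these argument values are hypothetical):
#   train_model({'queryset': Thesis.objects.all(),
#                'filename': 'hamlet',
#                'subdir_list': ['training', 'test']})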