# hamlet/theses/models.py
from functools import reduce
import re
from django.conf import settings
from django.urls import reverse
from django.db import models
from django.utils.functional import cached_property
class Person(models.Model):
    """A named individual attached to theses (e.g. as author or advisor).

    NOTE: distinct people with the same name may be stored as the same Person
    instance, because we have no way to disambiguate.
    """

    name = models.CharField(max_length=75)

    class Meta:
        ordering = ['name']

    def __str__(self):
        return self.name

    @staticmethod
    def handle_special_cases(namestring):
        """Rewrite known-bad author strings that the generic rules miss.

        Each (raw, corrected) pair below is applied in order as a plain
        substring replacement; strings matching none of them come back
        unchanged.
        """
        special_cases = (
            ('Ren, Xiaoyuan, S.M. (Xiaoyuan Charlene) Massachusetts Institute of Technology',  # noqa
             'Ren, Xiaoyuan (Xiaoyuan Charlene)'),
            ('Stanford, Joseph, S.M. (Joseph Marsh) Massachusetts Institute of Technology',  # noqa
             'Stanford, Joseph (Joseph Marsh)'),
            ('Wang, Zhiyong, S.M. Massachusetts Institute of Technology. Engineering Systems Division',  # noqa
             'Wang, Zhiyong'),
            ('Williams, Christina M., M.B.A. (Christina Marie). Massachusetts Institute of Technology',  # noqa
             'Williams, Christina M. (Christina Marie)'),
            ('Lu, Xin. Ph. D. Massachusetts Institute of Technology. Department of Materials Science and Engineering',  # noqa
             'Lu, Xin'),
            ('Rodriguez, Miguel A. (Miguel Angel), M.C.P. Massachusetts Institute of Technology',  # noqa
             'Rodriguez, Miguel A. (Miguel Angel)'),
        )
        for raw, corrected in special_cases:
            namestring = namestring.replace(raw, corrected)
        return namestring

    @staticmethod
    def clean_metadata(namestring):
        """Turn one raw metadata string into a list of personal names.

        Authors/advisors are split into separate Person instances rather
        than kept in an ArrayField: ArrayField only supports searching for
        entire tokens, whereas a CharField lets us use icontains to search
        for substrings.
        """
        # Degree/institution suffixes to drop, tried in this order for
        # every name.
        degree_suffixes = (
            ', S.M. Massachusetts Institute of Technology',
            ', M. Eng. Massachusetts Institute of Technology',
            ', Ph. D. Massachusetts Institute of Technology',
            ', Nav.E. Massachusetts Institute of Technology',
            ', Nav. E. Massachusetts Institute of Technology',
            ', M.B.A. Massachusetts Institute of Technology',
            ', Massachusetts Institute of Technology',
        )

        corrected = Person.handle_special_cases(namestring)

        cleaned = []
        # A single string sometimes lists several people joined by " and ".
        for name in corrected.split(' and '):
            for suffix in degree_suffixes:
                name = name.replace(suffix, '')
            # Drop surrounding whitespace, then any trailing periods.
            cleaned.append(name.strip().rstrip('.'))
        return cleaned
class Department(models.Model):
    """A department (or program) that a thesis can be associated with."""

    # Anticipates things like "Department of Mathematics", not "Course 18".
    name = models.CharField(max_length=255)
    # See http://catalog.mit.edu/subjects/#bycoursenumbertext . Add these by
    # hand after initializing the database. Some historical departments may not
    # have course numbers. Some programs do not have a name of the form
    # "Course X" but instead have acronyms, so the course field has to allow
    # for "Course" to be written in, not assume it can be prefixed.
    course = models.CharField(max_length=10, blank=True)

    class Meta:
        ordering = ['name']

    def __str__(self):
        return self.name

    @staticmethod
    def clean_metadata(deptstring):
        """Normalize a raw department string from thesis metadata.

        Removes the institute prefix, expands "Dept. of" to "Department of",
        then trims surrounding whitespace and trailing periods.
        """
        without_institute = deptstring.replace(
            'Massachusetts Institute of Technology.', '')
        expanded = without_institute.replace('Dept. of', 'Department of')
        return expanded.strip().rstrip('.')

    @staticmethod
    def get_or_create_from_metadata(metadata):
        """Return the Department named by the cleaned metadata string,
        creating it if it does not exist yet."""
        dept_name = Department.clean_metadata(metadata)
        return Department.objects.get_or_create(name=dept_name)[0]
class Thesis(models.Model):
    """A thesis record: descriptive metadata plus helpers for metadata
    ingest and for querying the doc2vec model in settings.NEURAL_NET."""

    # (found, canonical) substitutions applied, in order, to degree
    # abbreviations by extract_degree before they are looked up in DEGREES.
    REPLS = (('E.E', 'Elec.E'), ('Elect.E', 'Elec.E'), ('OceanE', 'Ocean.E'),
             ('M.ArchAS', 'M.Arch.A.S'), ('PhD', 'Ph.D'), ('ScD', 'Sc.D'))
    # Canonical degree abbreviations that extract_degree may return.
    DEGREES = ['B.Arch.', 'B.C.P.', 'B.S.', 'C.P.H.', 'Chem.E.', 'Civ.E.',
               'E.A.A.', 'Elec.E.', 'Env.E.', 'M.Arch.', 'M.Arch.A.S.',
               'M.B.A.', 'M.C.P.', 'M.Eng.', 'M.Fin.', 'M.S.', 'M.S.V.S.',
               'Mat.Eng.', 'Nav.Arch.', 'Mech.E.', 'Nav.E.', 'Nucl.E.',
               'Ocean.E.', 'Ph.D.', 'S.B.', 'S.M.', 'S.M.M.O.T.', 'Sc.D.']
    # Candidate degree abbreviations inside a free-text degree statement.
    # Compiled once here rather than on every extract_degree call.
    _DEGREE_PATTERN = re.compile(
        r'[A-Z][a-z]{,4}\.? ?[A-Z][a-z]{,3}\.?[A-Z]?\.?'
        r'[A-Z]?\.?[A-Z]?\.?')

    # Max length observed by sampling in the wild is 183; 255 is max length
    # guaranteed to be supported by CharField.
    # It would be great to use CICharField here, but we need superuser
    # privileges to set up the database for it, so deploying it to Heroku is a
    # no-go.
    # NOTE(review): the field below is a TextField, so the CharField length
    # discussion above may be stale -- confirm.
    title = models.TextField()
    contributor = models.ManyToManyField(Person, through='Contribution')
    department = models.ManyToManyField(Department)
    degree = models.CharField(max_length=20)  # SB, M. Eng., etc.
    url = models.URLField()
    # Not DateField, because we only have year, not month or day. Storing as an
    # integer rather than a string should allow for comparisons to happen in
    # the expected way. This is the copyright date, NOT the accessioning or
    # availability dates, which may be quite different.
    year = models.IntegerField()
    identifier = models.IntegerField(unique=True, db_index=True,
        help_text='The part after the final slash in things '
                  'like http://hdl.handle.net/1721.1/39504')
    unextractable = models.BooleanField(default=False,
        help_text='Will be set to True if attempts to extract text from '
                  'the pdf failed; such theses are not part of the neural net, '
                  'and cannot be used in data visualization.')

    class Meta:
        verbose_name_plural = 'theses'

    def __str__(self):
        return self.title

    @cached_property
    def label(self):
        """The document label used for this thesis in the neural net."""
        return '1721.1-{}.txt'.format(self.identifier)

    def _people_with_role(self, role):
        """Return a queryset of Persons contributing to this thesis as role."""
        contribs = Contribution.objects.filter(thesis=self, role=role)
        return Person.objects.filter(contribution__in=contribs)

    # NOTE: authors/advisors are cached per instance, so people added through
    # add_people() after the first access may not show up on that instance.
    @cached_property
    def authors(self):
        """Persons linked to this thesis with the author role."""
        return self._people_with_role(Contribution.AUTHOR)

    @cached_property
    def advisors(self):
        """Persons linked to this thesis with the advisor role."""
        return self._people_with_role(Contribution.ADVISOR)

    @cached_property
    def dspace_url(self):
        """The DSpace handle URL built from this thesis's identifier."""
        return 'https://dspace.mit.edu/handle/1721.1/{id}'.format(
            id=self.identifier)

    def get_absolute_url(self):
        return reverse('theses:similar_to',
                       kwargs={'identifier': self.identifier})

    # ~~~~~~~~~~~~~~~~~~~~~ Functions for metadata ingest ~~~~~~~~~~~~~~~~~~~~~

    def add_people(self, people, author=True):
        """Given a list of person name strings, add Person relations.

        Each string is run through Person.clean_metadata, which may yield
        several names. A Contribution is created for every name that is not
        already linked to this thesis in the same role.

        people: iterable of raw name strings.
        author: if True the role is author, otherwise advisor.
        """
        role = Contribution.AUTHOR if author else Contribution.ADVISOR

        for raw_names in people:
            for name in Person.clean_metadata(raw_names):
                # exists() asks the database a yes/no question instead of
                # fetching every matching row just to test truthiness.
                already_linked = self.contribution_set.filter(
                    person__name=name, role=role).exists()
                if already_linked:
                    continue

                contributor, _ = Person.objects.get_or_create(name=name)
                Contribution.objects.create(
                    person=contributor,
                    role=role,
                    thesis=self
                )

    def add_departments(self, departments):
        """Given a list of raw department strings, link this thesis to the
        corresponding Department objects (creating them as needed)."""
        for deptstring in departments:
            dept = Department.get_or_create_from_metadata(deptstring)
            self.department.add(dept)

    @classmethod
    def extract_degree(cls, degree_statement):
        """Takes METS format metadata and finds degrees.

        Returns a list of the canonical abbreviations (members of DEGREES)
        found in degree_statement, in order of appearance, or None when
        nothing is recognized or degree_statement is not something the
        regex can search (e.g. None).
        """
        try:
            candidates = cls._DEGREE_PATTERN.findall(degree_statement)
        except TypeError:
            # Only the regex search can raise here; the normalization below
            # works on plain strings.
            return None

        result = []
        for candidate in candidates:
            # Normalize: no internal spaces, no trailing periods, canonical
            # spelling, then exactly one trailing period.
            abbrev = candidate.replace(' ', '').rstrip('.')
            abbrev = reduce(lambda a, kv: a.replace(*kv), cls.REPLS, abbrev)
            if not abbrev.endswith('.'):
                abbrev += '.'
            if abbrev in cls.DEGREES:
                result.append(abbrev)

        return result or None

    # ~~~~~~~~~~~~~~~~~ Functions for neural net interactions ~~~~~~~~~~~~~~~~~
    # See https://radimrehurek.com/gensim/models/doc2vec.html for affordances
    # offered by doc2vec.

    def get_most_similar(self, threshold=0.75, topn=50):
        """Find theses above a given similarity threshold. If there are more
        than topn, only the topn most similar will be returned (to a maximum
        of 50).

        Threshold defaults to 0.75, because in practice that seems to usually
        result in theses that humans find similar, but also a manageable number
        of results.

        The returned queryset has no explicit ordering, so it does not
        preserve the similarity ranking."""
        topn = min(topn, 50)
        friends = settings.NEURAL_NET.docvecs.most_similar(
            [self.label], topn=topn)
        friend_labels = [x[0] for x in friends if x[1] > threshold]
        # Labels look like '1721.1-<identifier>.txt' (see the label property).
        friend_ids = [x.split('-')[1].split('.')[0] for x in friend_labels]
        return Thesis.objects.filter(identifier__in=friend_ids)

    def get_similarity(self, thesis):
        """Get the similarity between this and another thesis."""
        return settings.NEURAL_NET.docvecs.similarity(
            self.label, thesis.label)
class Contribution(models.Model):
    """Links a Person to a Thesis in a given role (author or advisor).

    Used as the ``through`` model of Thesis.contributor.
    """

    # Role values as stored in the database; each doubles as its own label.
    AUTHOR = 'author'
    ADVISOR = 'advisor'
    ROLE_CHOICES = ((AUTHOR, AUTHOR), (ADVISOR, ADVISOR))

    # Deleting either side of the link deletes the contribution as well.
    thesis = models.ForeignKey(Thesis, on_delete=models.CASCADE)
    person = models.ForeignKey(Person, on_delete=models.CASCADE)
    role = models.CharField(max_length=7, choices=ROLE_CHOICES)