sosia-dev/sosia

View on GitHub
sosia/classes/scientist.py

Summary

Maintainability
D
1 day
Test Coverage
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author:   Michael E. Rose <michael.ernst.rose@gmail.com>
"""Super class to represent a scientist."""

from warnings import warn

from pybliometrics.scopus import AbstractRetrieval, AffiliationRetrieval
from pybliometrics.scopus.exception import Scopus404Error

from sosia.establishing import connect_database, DEFAULT_DATABASE
from sosia.processing import add_source_names, base_query, count_citations,\
    extract_authors, find_main_affiliation, get_authors, get_main_field,\
    maybe_add_source_names, read_fields_sources_list
from sosia.utils import accepts


class Scientist:
    """Represent a scientist through her Scopus publication record.

    All characteristics are computed once in `__init__()` from the
    publications dated in or before the given year, and are exposed
    through the properties defined below.
    """

    @property
    def active_year(self):
        """The scientist's most recent year with publication(s) before
        provided year (which may be the same).
        """
        return self._active_year

    @active_year.setter
    @accepts(int)
    def active_year(self, val):
        self._active_year = val

    @property
    def affiliation_country(self):
        """The current country of the affiliation defined in affiliation_id."""
        return self._affiliation_country

    @affiliation_country.setter
    @accepts(str)
    def affiliation_country(self, val):
        self._affiliation_country = val

    @property
    def affiliation_id(self):
        """The affiliation ID (as string) of the scientist's most frequent
        affiliation in or before the active year.
        """
        return self._affiliation_id

    @affiliation_id.setter
    @accepts(str)
    def affiliation_id(self, val):
        self._affiliation_id = val

    @property
    def affiliation_name(self):
        """The current name of the affiliation defined in affiliation_id."""
        return self._affiliation_name

    @affiliation_name.setter
    @accepts(str)
    def affiliation_name(self, val):
        self._affiliation_name = val

    @property
    def affiliation_type(self):
        """The current type of the affiliation defined in affiliation_id."""
        return self._affiliation_type

    @affiliation_type.setter
    @accepts(str)
    def affiliation_type(self, val):
        # Write to the private attribute; assigning to the property itself
        # would call this setter again and recurse without end.
        self._affiliation_type = val

    @property
    def citations(self):
        """The citations of the scientist until the provided year."""
        return self._citations

    @citations.setter
    @accepts(int)
    def citations(self, val):
        self._citations = val

    @property
    def citations_period(self):
        """The citations of the scientist during the given period."""
        return self._citations_period

    @citations_period.setter
    @accepts(int)
    def citations_period(self, val):
        self._citations_period = val

    @property
    def coauthors(self):
        """Set of coauthors of the scientist on all publications until the
        provided year.
        """
        return self._coauthors

    @coauthors.setter
    @accepts((set, list, tuple))
    def coauthors(self, val):
        self._coauthors = val

    @property
    def coauthors_period(self):
        """Set of coauthors of the scientist on all publications during the
        given period.
        """
        return self._coauthors_period

    @coauthors_period.setter
    @accepts((set, list, tuple))
    def coauthors_period(self, val):
        self._coauthors_period = val

    @property
    def fields(self):
        """The fields of the scientist until the provided year, estimated from
        the sources (journals, books, etc.) she published in.
        """
        return self._fields

    @fields.setter
    @accepts((set, list, tuple))
    def fields(self, val):
        self._fields = val

    @property
    def first_year(self):
        """The scientist's year of first publication."""
        return self._first_year

    @first_year.setter
    @accepts(int)
    def first_year(self, val):
        self._first_year = val

    @property
    def first_name(self):
        """The scientist's first name."""
        return self._first_name

    @first_name.setter
    @accepts(str)
    def first_name(self, val):
        # Must update the attribute the getter reads, not the full name.
        self._first_name = val

    @property
    def main_field(self):
        """The scientist's main field of research, as tuple in
        the form (ASJC code, general category).

        The main field is the field with the most publications, provided it
        is not Multidisciplinary (ASJC code 1000).  In case of an equal number
        of publications, preference is given to non-general fields (those
        whose ASJC ends on a digit other than 0).
        """
        return self._main_field

    @main_field.setter
    def main_field(self, val):
        if not isinstance(val, tuple) or len(val) != 2:
            raise Exception("Value must be a two-element tuple.")
        self._main_field = val

    @property
    def name(self):
        """The scientist's complete name."""
        return self._name

    @name.setter
    @accepts(str)
    def name(self, val):
        self._name = val

    @property
    def language(self):
        """The language(s) the scientist published in.

        None until `get_publication_languages()` has been called.
        """
        return self._language

    @language.setter
    @accepts(str)
    def language(self, val):
        self._language = val

    @property
    def publications(self):
        """List of the scientist's publications."""
        return self._publications

    @publications.setter
    @accepts((set, list, tuple))
    def publications(self, val):
        self._publications = val

    @property
    def publications_period(self):
        """The publications of the scientist published during
        the given period.
        """
        return self._publications_period

    @publications_period.setter
    @accepts((set, list, tuple))
    def publications_period(self, val):
        self._publications_period = val

    @property
    def sources(self):
        """The Scopus IDs of sources (journals, books) in which the
        scientist published (source names are added from the source list).
        """
        return self._sources

    @sources.setter
    @accepts((list, tuple))
    def sources(self, val):
        self._sources = maybe_add_source_names(val, self.source_names)

    @property
    def surname(self):
        """The scientist's surname."""
        return self._surname

    @surname.setter
    @accepts(str)
    def surname(self, val):
        # Must update the attribute the getter reads, not the full name.
        self._surname = val

    @property
    def subjects(self):
        """The subject areas of the scientist's publications."""
        return self._subjects

    @subjects.setter
    @accepts((set, list, tuple))
    def subjects(self, val):
        self._subjects = val

    def __init__(self, identifier, year, refresh=False, period=None, eids=None,
                 sql_fname=None):
        """Class to represent a scientist.

        Parameters
        ----------
        identifier : list of int
            List of Scopus Author IDs of the scientist.

        year : str or numeric
            Year for which characteristics should be defined for.  The
            value is converted with `int()`.

        refresh : boolean or int (optional, default=False)
            Whether to refresh cached results (if they exist) or not. If int
            is passed, results will be refreshed if they are older than
            that value in number of days.

        period : int (optional, default=None)
            Length in years of a period ending with `year`, for which
            publications, citations and coauthors are computed in addition
            to the totals.  If None, or if the period would start before
            the year of first publication, the period characteristics are
            set to None.

        eids : list (optional, default=None)
            A list of scopus EIDs of the publications of the scientist.  If
            it is provided, the scientist's properties are set based on these
            publications, instead of the list of publications obtained from
            the Scopus Author ID(s).

        sql_fname : str or pathlib.Path() (optional, default=None)
            The path of the SQLite database to connect to.  If None will
            default to `~/.cache/sosia/main.sqlite`.

        Raises
        ------
        Exception
            When there are no publications for the author until the
            provided year, or none within the requested period.
        """
        self.identifier = identifier
        # Normalise once; every comparison below relies on the int value
        self.year = int(year)
        if not sql_fname:
            sql_fname = DEFAULT_DATABASE
        self.sql_conn = connect_database(sql_fname)

        # Read mapping of fields to sources
        fields, info = read_fields_sources_list()
        self.field_source = fields
        self.source_info = info
        source_names = self.source_info.set_index("source_id")["title"].to_dict()
        self.source_names = source_names

        # Load list of publications
        if eids:
            q = f"EID({' OR '.join(eids)})"
        else:
            q = f"AU-ID({') OR AU-ID('.join([str(i) for i in identifier])})"
        integrity_fields = ["eid", "author_ids", "coverDate", "source_id"]
        res = base_query("docs", q, refresh, fields=integrity_fields)
        self._publications = [p for p in res
                              if int(p.coverDate[:4]) <= self.year]
        if not self._publications:
            text = "No publications found for author "\
                   f"{'-'.join([str(i) for i in identifier])} until "\
                   f"{self.year}"
            raise Exception(text)
        self._eids = eids or [p.eid for p in self._publications]

        # First year of publication
        pub_years = [p.coverDate[:4] for p in self._publications]
        self._first_year = int(min(pub_years))
        # Without a period the expression evaluates to 0, which disables
        # the period-specific characteristics below
        self._period_year = self.year - (period or (self.year+1)) + 1
        if self._period_year < self._first_year:
            self._period_year = 0

        # Count of citations
        search_ids = eids or identifier
        self._citations = count_citations(search_ids, self.year+1, identifier)

        # Coauthors
        self._coauthors = set(extract_authors(self._publications)) - set(identifier)

        # Period characteristics are None when no period is given or when
        # the period would start before the first publication year
        if self._period_year:
            pubs = [p for p in self._publications if
                    self._period_year <= int(p.coverDate[:4]) <= self.year]
            self._publications_period = pubs
            if not self._publications_period:
                text = "No publications found for author "\
                       f"{'-'.join([str(i) for i in identifier])} until "\
                       f"{self.year} in a {period}-years period"
                raise Exception(text)
            eids_period = [p.eid for p in self._publications_period]
            n_cits = count_citations(eids_period, self.year+1, identifier)
            self._citations_period = n_cits
            self._coauthors_period = set(extract_authors(self._publications_period))
            self._coauthors_period -= set(identifier)
        else:
            self._coauthors_period = None
            self._publications_period = None
            self._citations_period = None

        # Author search information
        source_ids = {int(p.source_id) for p in self._publications
                      if p.source_id}
        self._sources = add_source_names(source_ids, self.source_names)
        self._active_year = int(max(pub_years))
        mask = fields["source_id"].isin(source_ids)
        self._fields = fields[mask]["asjc"].astype(int).tolist()
        self._main_field = get_main_field(self._fields)
        if not self._main_field[0]:
            text = "Not possible to determine research field(s) of "\
                   "researcher.  Functionality is reduced."
            warn(text, UserWarning)

        # Most recent geolocation
        afid = find_main_affiliation(identifier, self._publications, self.year)
        self._affiliation_id = afid
        try:
            aff = AffiliationRetrieval(afid, refresh=refresh)
            self._affiliation_country = aff.country
            self._affiliation_name = aff.affiliation_name
            self._affiliation_type = aff.org_type
        except (Scopus404Error, ValueError):
            # Affiliation cannot be retrieved: leave its details undefined
            self._affiliation_country = None
            self._affiliation_name = None
            self._affiliation_type = None
        # Languages are only determined on request, see
        # get_publication_languages()
        self._language = None

        # Author name from profile with most documents
        df = get_authors(self.identifier, self.sql_conn,
                         refresh=refresh, verbose=False)
        au = df.sort_values("documents", ascending=False).iloc[0]
        self._subjects = [a.split(" ")[0] for a in au.areas.split("; ")]
        self._surname = au.surname or None
        self._first_name = au.givenname or None
        name = ", ".join([self._surname or "", au.givenname or ""])
        if name == ", ":
            name = None
        self._name = name

    def get_publication_languages(self, refresh=False):
        """Parse languages of published documents.

        Retrieves the abstract of each of the scientist's publications and
        stores the distinct languages, sorted and joined by "; ", in the
        `language` property.  Publications for which Scopus returns a
        404 error are skipped.

        Parameters
        ----------
        refresh : boolean or int (optional, default=False)
            Passed on to AbstractRetrieval as its `refresh` argument.

        Returns
        -------
        self : Scientist
            The object itself, to allow chaining.
        """
        langs = set()
        for eid in self._eids:
            try:
                ab = AbstractRetrieval(eid, view="FULL", refresh=refresh)
            except Scopus404Error:
                continue
            langs.add(ab.language)
        # Drop missing languages before sorting to avoid comparing None
        self._language = "; ".join(sorted(filter(None, langs)))
        return self