sosia/processing/finding.py from sosia-dev/sosia

sosia/processing/finding.py
Summary

Maintainability

5 hrs
Test Coverage

Issues
from itertools import product
from string import Template

import pandas as pd
from tqdm import tqdm

from sosia.processing.caching import insert_data, retrieve_author_info
from sosia.processing.extracting import extract_authors
from sosia.processing.filtering import filter_pub_counts, same_affiliation
from sosia.processing.getting import get_authors_from_sourceyear, get_authors
from sosia.processing.querying import base_query, count_citations, stacked_query
from sosia.processing.utils import build_dict, flat_set_from_df, margin_range
from sosia.utils import custom_print


def find_matches(original, stacked, verbose, refresh):
    """Find matches within the search group.

    The search group is narrowed down in successive rounds: main field and
    minimum publication count, publication counts per period, citation
    counts, coauthor counts and year of first publication, and (optionally)
    period-specific counts and affiliations.

    Parameters
    ----------
    original : sosia.Original()
        The object containing information for the original scientist to
        search for.  Attribute search_group needs to exist.

    stacked : bool
        Whether to combine searches in few queries or not.  Cached
        files will most likely not be reusable.  Set to True if you
        query in distinct fields or you want to minimize API key usage.

    verbose : bool
        Whether to report on the progress of the process.

    refresh : bool or int
        Whether to refresh cached results (if they exist) or not. If int
        is passed and stacked=False, results will be refreshed if they are
        older than that value in number of days.

    Returns
    -------
    matches : list
        Sorted list of author IDs from the search group that passed all
        filtering rounds.
    """
    # Variables
    _years = range(original.first_year-original.first_year_margin,
                   original.first_year+original.first_year_margin+1)
    _npapers = margin_range(len(original.publications), original.pub_margin)
    _max_papers = max(_npapers)
    _ncits = margin_range(original.citations, original.cits_margin)
    _max_cits = max(_ncits)
    _ncoauth = margin_range(len(original.coauthors), original.coauth_margin)
    _max_coauth = max(_ncoauth)
    # With a period, the ranges (and hence the minima) refer to the period,
    # while the maxima computed above still refer to the full career
    if original.period:
        _npapers = margin_range(len(original.publications_period), original.pub_margin)
        _ncits = margin_range(original.citations_period, original.cits_margin)
        _ncoauth = margin_range(len(original.coauthors_period), original.coauth_margin)
    text = "Searching through characteristics of "\
           f"{len(original.search_group):,} authors..."
    custom_print(text, verbose)
    conn = original.sql_conn

    # First round of filtering: minimum publications and main field
    # create df of authors
    authors = get_authors(original.search_group, conn, verbose=verbose)
    same_field = authors['areas'].str.startswith(original.main_field[1])
    enough_pubs = authors['documents'].astype(int) >= int(min(_npapers))
    group = sorted(authors[same_field & enough_pubs]["auth_id"].tolist())
    text = f"Left with {len(group):,} authors with sufficient "\
           "number of publications and same main field"
    custom_print(text, verbose)

    # Second round of filtering:
    # Check having no publications before minimum year, and if 0, the
    # number of publications in the relevant period.
    params = {"group": group, "ybefore": min(_years)-1,
              "yupto": original.year, "npapers": _npapers,
              "yfrom": original._period_year, "verbose": verbose, "conn": conn}
    group, _, _ = filter_pub_counts(**params)
    # Screen out profiles with too many publications over the full period
    if original.period:
        params.update({"npapers": [1, _max_papers], "yfrom": None,
                       "group": group})
        group, _, _ = filter_pub_counts(**params)
    text = f"Left with {len(group):,} researchers"
    custom_print(text, verbose)

    # Third round of filtering: citations (in the FULL period)
    authors = pd.DataFrame({"auth_id": group, "year": original.year})
    auth_cits, missing = retrieve_author_info(authors, conn, "author_ncits")
    if not missing.empty:
        total = missing.shape[0]
        text = f"Counting citations of {total:,} authors..."
        custom_print(text, verbose)
        missing['n_cits'] = 0
        start = 0  # position of the first row not yet written to the cache
        # `not verbose` rather than `~verbose`: bitwise inversion of a bool
        # is always truthy and would permanently disable the progress bar
        progress = tqdm(missing.iterrows(), disable=not verbose, total=total)
        # Track the row position separately from the index label, because
        # .at works on labels whereas .iloc works on positions
        for pos, (idx, au) in enumerate(progress):
            n_cits = count_citations([str(au['auth_id'])], original.year+1)
            missing.at[idx, 'n_cits'] = n_cits
            # Cache intermediate results every 100 rows and at the very end
            if pos % 100 == 0 or pos == total - 1:
                insert_data(missing.iloc[start:pos+1], conn, table="author_ncits")
                start = pos + 1
    auth_cits = pd.concat([auth_cits, missing])
    auth_cits['auth_id'] = auth_cits['auth_id'].astype("uint64")
    # Keep if citations are in range
    custom_print("Filtering based on count of citations...", verbose)
    mask = auth_cits["n_cits"].between(min(_ncits), _max_cits)
    group = auth_cits[mask]['auth_id'].tolist()

    # Fourth round of filtering: Download publications, verify coauthors
    # (in the FULL period) and first year
    text = f"Left with {len(group):,} authors\nFiltering based on "\
           "coauthor count..."
    custom_print(text, verbose)
    authors = pd.DataFrame({"auth_id": group, "year": original.year},
                           dtype="uint64")
    _, author_year_search = retrieve_author_info(authors, conn, "author_year")

    if not author_year_search.empty:
        q = Template(f"AU-ID($fill) AND PUBYEAR BEF {original.year + 1}")
        auth_year_group = author_year_search["auth_id"].tolist()
        params = {"group": auth_year_group, "template": q, "refresh": refresh,
                  "joiner": ") OR AU-ID(", "q_type": "docs",
                  "verbose": verbose, "stacked": stacked}
        res = stacked_query(**params)
        res = build_dict(res, auth_year_group)
        if res:
            # res can become empty after build_dict if a au_id is old
            res = pd.DataFrame.from_dict(res, orient="index")
            res["year"] = original.year
            res = res[["year", "first_year", "n_pubs", "n_coauth"]]
            res.index.name = "auth_id"
            res = res.reset_index()
            insert_data(res, conn, table="author_year")
    # Read again so that freshly inserted rows are included
    authors_year, _ = retrieve_author_info(authors, conn, "author_year")
    # Check for number of coauthors within margin
    mask = authors_year["n_coauth"].between(min(_ncoauth), _max_coauth)
    # Check for year of first publication within range
    if not original.first_year_name_search:
        same_start = authors_year["first_year"].between(min(_years), max(_years))
        mask = mask & same_start
    # Filter
    matches = sorted(authors_year[mask]["auth_id"].tolist())

    text = f"Left with {len(matches)} authors"
    custom_print(text, verbose)
    if original.period:
        text =  "Filtering based on citations and coauthor count during period..."
        custom_print(text, verbose)
        # Further screen matches based on period cits and coauths
        min_coauth, max_coauth = min(_ncoauth), max(_ncoauth)
        min_cits, max_cits = min(_ncits), max(_ncits)
        kept = []
        for m in matches:
            res = base_query("docs", f"AU-ID({m})", refresh=refresh,
                             fields=["eid", "author_ids", "coverDate"])
            pubs = [p for p in res if
                    original._period_year <= int(p.coverDate[:4]) <= original.year]
            coauths = set(extract_authors(pubs)) - {str(m)}
            if not min_coauth <= len(coauths) <= max_coauth:
                continue
            # Citations are only counted for candidates passing the
            # coauthor check
            eids_period = [p.eid for p in pubs]
            n_cits = count_citations(eids_period, original.year+1, [str(m)])
            if min_cits <= n_cits <= max_cits:
                kept.append(m)
        matches = kept

    # Eventually filter on affiliations
    if original.search_affiliations:
        text = "Filtering based on affiliations..."
        custom_print(text, verbose)
        matches[:] = [m for m in matches if same_affiliation(original, m, refresh)]
    return matches


def search_group_from_sources(original, stacked=False, verbose=False,
                              refresh=False):
    """Compile the group of candidate authors from a set of sources.

    Parameters
    ----------
    original : sosia.Original
        The Scientist object for which to compile the search group.  Its
        attributes search_sources, active_year, first_year,
        first_year_margin, first_year_name_search, search_affiliations
        and sql_conn are used.

    stacked : bool (optional, default=False)
        Whether to use fewer queries that are not reusable, or to use modular
        queries of the form "SOURCE-ID(<SID>) AND PUBYEAR IS <YYYY>".

    verbose : bool (optional, default=False)
        Whether to report on the progress of the process.

    refresh : bool (optional, default=False)
        Whether to refresh cached search files.

    Returns
    -------
    group : set
        Author IDs (as int) of authors publishing in the sources in the
        active year, but not in the year right before the first-year
        window.  Unless original.first_year_name_search is set, authors
        must additionally have published within the first-year window.
    """
    # Only the first element of each search source entry is needed
    source_ids, _ = zip(*original.search_sources)
    custom_print(
        f"Defining 'search_group' using up to {len(source_ids):,} sources...",
        verbose)
    conn = original.sql_conn
    query_opts = {"refresh": refresh, "stacked": stacked, "verbose": verbose}
    columns = ["source_id", "year"]

    # Authors publishing in the sources in the active year
    sources_today = pd.DataFrame(product(source_ids, [original.active_year]),
                                 columns=columns)
    auth_today = get_authors_from_sourceyear(sources_today, conn, **query_opts)
    if original.search_affiliations:
        affiliation_mask = auth_today["afid"].isin(original.search_affiliations)
    else:
        affiliation_mask = None
    today = flat_set_from_df(auth_today, "auids", condition=affiliation_mask)

    # Years to look up: always the year before the first-year window, and
    # the window itself unless first_year_name_search is set
    min_year = original.first_year - original.first_year_margin
    max_year = original.first_year + original.first_year_margin
    if original.first_year_name_search:
        then_years = [min_year - 1]
    else:
        then_years = [min_year - 1, *range(min_year, max_year + 1)]
    sources_then = pd.DataFrame(product(source_ids, then_years),
                                columns=columns)
    auth_then = get_authors_from_sourceyear(sources_then, conn, **query_opts)

    # Split the authors into those active within and before the window
    in_window = auth_then["year"].between(min_year, max_year)
    then = flat_set_from_df(auth_then, "auids", condition=in_window)
    too_early = auth_then["year"] < min_year
    before = flat_set_from_df(auth_then, "auids", condition=too_early)

    # Drop everybody who was already active before the window
    candidates = today - before
    if original.first_year_name_search:
        group = candidates
    else:
        group = candidates.intersection(then)
    return set(map(int, group))