sosia-dev/sosia

View on GitHub
sosia/processing/getting.py

Summary

Maintainability
A
1 hr
Test Coverage
import pandas as pd

from sosia.processing.caching import insert_data, retrieve_authors,\
    retrieve_authors_from_sourceyear
from sosia.processing.querying import query_pubs_by_sourceyear, stacked_query


def get_authors(authors, conn, refresh=False, verbose=False):
    """Wrapper function to search author data for a list of authors, searching
    first in the SQL database and then via stacked search.

    Parameters
    ----------
    authors : list
       List of Scopus Author IDs to search.

    conn : sqlite3 connection
        Standing connection to a SQLite3 database.

    refresh : bool (optional, default=False)
        Whether to refresh scopus cached files if they exist, or not.

    verbose : bool (optional, default=False)
        Whether to print information on the search progress.

    Returns
    -------
    data : DataFrame
        Data on the provided authors.
    """
    from string import Template

    # Retrieve existing data from SQL cache
    authors = pd.DataFrame(authors, columns=["auth_id"], dtype="uint64")
    data, missing = retrieve_authors(authors, conn)
    # Query missing records and insert at the same time
    if missing:
        params = {"group": missing, "refresh": refresh, "joiner": ") OR AU-ID(",
                  "q_type": "author", "template": Template("AU-ID($fill)"),
                  "stacked": True, "verbose": verbose}
        if verbose:
            print("Pre-filtering...")
        res = stacked_query(**params)
        res = pd.DataFrame(res)
        insert_data(res, conn, table="authors")
        data, _ = retrieve_authors(authors, conn)
    return data


def get_authors_from_sourceyear(df, conn, refresh=False, stacked=False,
                                verbose=False):
    """Get authors publishing in specified sourced in specified years.

    Handles retrieving data, and in case of missing data querying for it
    and inserting it into the SQL database.

    Parameters
    ----------
    df : DataFrame
        DataFrame of source-year-combinations to be searched for.

    conn : sqlite3 connection
        Standing connection to an SQLite3 database.

    refresh : bool (optional, default=False)
        Whether to refresh cached search files.

    stacked : bool (optional, default=False)
        Whether to use fewer queries that are not reusable, or to use modular
        queries of the form "SOURCE-ID(<SID>) AND PUBYEAR IS <YYYY>".

    verbose : bool (optional, default=False)
        Whether to print information on the search progress.

    Returns
    -------
    data : DataFrame
        DataFrame in format ("source_id", "year", "auids", "afid"), where
        entries correspond to an individual paper.
    """
    # Retrieve information in cache
    data, missing = retrieve_authors_from_sourceyear(df, conn, refresh=refresh)

    # Download and add missing data
    to_add = pd.DataFrame()
    empty = []
    for year in missing["year"].unique():
        subset = missing[missing["year"] == year]
        sources = subset["source_id"].unique()
        new = query_pubs_by_sourceyear(sources, year, refresh=refresh,
                                       stacked=stacked, verbose=verbose)
        no_info = set(sources) - set(new["source_id"].unique())
        empty.extend([(s, year) for s in no_info])
        to_add = pd.concat([to_add, new])

    # Format useful information
    if data.empty:
        data = to_add
    else:
        data = pd.concat([data, to_add])
    data = data[data["auids"] != ""]
    data["auids"] = data["auids"].str.replace(";", ",").str.split(",")

    # Insert new information and information on missing data
    if empty:
        sources, years = list(zip(*empty))
        d = {"source_id": sources, "year": years, "auids": [""]*len(sources),
             "afid": [""]*len(sources)}
        to_add = pd.concat([to_add, pd.DataFrame(d)])
    if not to_add.empty:
        to_add["auids"] = to_add["auids"].str.replace(";", ",").str.split(",")
        insert_data(to_add, conn, table="sources_afids")
    return data