NatLibFi/Annif

View on GitHub
annif/registry.py

Summary

Maintainability
B
4 hrs
Test Coverage
"""Registry that keeps track of Annif projects"""

from __future__ import annotations

import os
import re

from flask import Flask, current_app

import annif
from annif.config import parse_config
from annif.exception import ConfigurationException
from annif.project import Access, AnnifProject
from annif.util import parse_args
from annif.vocab import AnnifVocabulary

# module-level alias for the logger object exposed by the annif package
logger = annif.logger


class AnnifRegistry:
    """Registry of the Annif projects and vocabularies that belong to one
    projects configuration file and data directory."""

    # Note: the project and vocabulary objects are not stored on the
    # instance itself. They live in the class-level dicts below, keyed by
    # the "registry ID" (self._rid) of the owning instance. This keeps
    # AnnifRegistry instances cheap to serialize, because the potentially
    # huge objects (backends with large models, vocabularies with lots of
    # concepts etc.) are not part of the instance state. Serialized
    # instances can then be passed between processes when using the
    # multiprocessing module.
    _projects = {}
    _vocabs = {}

    def __init__(
        self, projects_config_path: str, datadir: str, init_projects: bool
    ) -> None:
        """Set up the registry and create its projects from the
        configuration found at projects_config_path.

        If init_projects is true, initialize() is called on every project
        whose project_id matches (re.search) the regular expression given
        in the ANNIF_PROJECTS_INIT environment variable. The default
        pattern ".*" matches every project."""

        self._rid = id(self)
        self._projects_config_path = projects_config_path
        self._datadir = datadir
        self._init_vars()

        if not init_projects:
            return

        init_pattern = os.getenv("ANNIF_PROJECTS_INIT", ".*")
        for candidate in self._projects[self._rid].values():
            if re.search(init_pattern, candidate.project_id):
                candidate.initialize()

    def _init_vars(self) -> None:
        """Make sure the shared class-level dicts contain entries for this
        registry ID, creating the projects if they are missing."""

        if self._rid in self._projects:
            return  # already set up for this registry ID

        self._projects[self._rid] = self._create_projects()
        self._vocabs[self._rid] = {}

    def _create_projects(self) -> dict:
        """Parse the projects configuration and return a dict of
        project_id -> AnnifProject. The dict is empty when parse_config
        returns None."""

        config = parse_config(self._projects_config_path)

        # parse_config gives None when there is no configuration to read
        if config is None:
            return {}

        # one AnnifProject per project id listed in the configuration
        return {
            pid: AnnifProject(pid, config[pid], self._datadir, self)
            for pid in config.project_ids
        }

    def get_projects(
        self, min_access: Access = Access.private
    ) -> dict[str, AnnifProject]:
        """Return the projects of this registry as a dict of project_id ->
        AnnifProject. Only projects whose access level is at least
        min_access are included."""

        self._init_vars()
        accessible = {}
        for pid, project in self._projects[self._rid].items():
            if project.access >= min_access:
                accessible[pid] = project
        return accessible

    def get_project(
        self, project_id: str, min_access: Access = Access.private
    ) -> AnnifProject:
        """Return the single project identified by project_id, considering
        only projects with at least the min_access access level. Raises
        ValueError if no such project is available."""

        candidates = self.get_projects(min_access)
        try:
            project = candidates[project_id]
        except KeyError:
            raise ValueError("No such project {}".format(project_id))
        else:
            return project

    def get_vocab(
        self, vocab_spec: str, default_language: str | None
    ) -> tuple[AnnifVocabulary, None] | tuple[AnnifVocabulary, str]:
        """Return an (AnnifVocabulary, language) pair for the given
        vocab_spec.

        The specification is a vocabulary id (word characters and hyphens),
        optionally followed by arguments in parentheses. The first
        positional argument, if any, is used as the language; otherwise
        default_language is returned. Raises ValueError if vocab_spec does
        not have this form."""

        parsed = re.match(r"([\w-]+)(\((.*)\))?$", vocab_spec)
        if parsed is None:
            raise ValueError(f"Invalid vocabulary specification: {vocab_spec}")

        # groups: vocabulary id, the whole "(...)" part, text inside the parens
        vocab_id, _, arg_string = parsed.groups()
        positional, _keywords = parse_args(arg_string)
        if positional:
            language = positional[0]
        else:
            language = default_language

        self._init_vars()
        known_vocabs = self._vocabs[self._rid]
        # create the vocabulary object on first request and reuse it afterwards
        if vocab_id not in known_vocabs:
            known_vocabs[vocab_id] = AnnifVocabulary(vocab_id, self._datadir)
        return known_vocabs[vocab_id], language


def initialize_projects(app: Flask) -> None:
    """Create an AnnifRegistry from the application configuration and store
    it on the application object as app.annif_registry.

    The configuration keys PROJECTS_CONFIG_PATH, DATADIR and
    INITIALIZE_PROJECTS are read from app.config."""

    settings = app.config
    app.annif_registry = AnnifRegistry(
        settings["PROJECTS_CONFIG_PATH"],
        settings["DATADIR"],
        settings["INITIALIZE_PROJECTS"],
    )


def get_projects(min_access: Access = Access.private) -> dict[str, AnnifProject]:
    """Return the projects of the current application's registry as a dict
    of project_id -> AnnifProject. Only projects with at least the
    min_access access level are returned. The registry is created first if
    the current application does not have one yet."""

    app = current_app
    if not hasattr(app, "annif_registry"):
        # no registry attached to the application yet; create it now
        initialize_projects(app)

    registry = app.annif_registry
    return registry.get_projects(min_access)


def get_project(project_id: str, min_access: Access = Access.private) -> AnnifProject:
    """Return a single project by project_id.

    Only projects returned by get_projects() for the given min_access level
    are considered. Raises ValueError if no such project is available."""

    projects = get_projects(min_access)
    try:
        return projects[project_id]
    except KeyError:
        # The KeyError is only an artifact of the dict lookup; suppress it so
        # that tracebacks show just the ValueError callers are expected to handle.
        raise ValueError(f"No such project '{project_id}'") from None


def get_vocabs(min_access: Access = Access.private) -> dict[str, AnnifVocabulary]:
    """Return the vocabularies used by the available projects as a dict of
    vocab_id -> AnnifVocabulary. Only projects with at least the min_access
    access level are considered; projects whose vocabulary lookup raises
    ConfigurationException are left out."""

    collected = {}
    for project in get_projects(min_access).values():
        try:
            vocabulary = project.vocab
            vocab_id = vocabulary.vocab_id
        except ConfigurationException:
            # this project cannot provide a vocabulary; skip it and carry on
            continue
        collected[vocab_id] = vocabulary

    return collected


def get_vocab(vocab_id: str, min_access: Access = Access.private) -> AnnifVocabulary:
    """Return a single AnnifVocabulary by vocabulary id.

    Only vocabularies returned by get_vocabs() for the given min_access
    level are considered. Raises ValueError if no such vocabulary is
    available."""

    vocabs = get_vocabs(min_access)
    try:
        return vocabs[vocab_id]
    except KeyError:
        # The KeyError is only an artifact of the dict lookup; suppress it so
        # that tracebacks show just the ValueError callers are expected to handle.
        raise ValueError(f"No such vocabulary '{vocab_id}'") from None