NatLibFi/Annif

View on GitHub
annif/analyzer/spacy.py

Summary

Maintainability
A
0 mins
Test Coverage
"""spaCy analyzer for Annif which uses spaCy for lemmatization"""

from __future__ import annotations

import importlib

import annif.util
from annif.exception import OperationFailedException

from . import analyzer

_KEY_LOWERCASE = "lowercase"


class SpacyAnalyzer(analyzer.Analyzer):
    name = "spacy"

    @staticmethod
    def is_available() -> bool:
        # return True iff spaCy is installed
        return importlib.util.find_spec("spacy") is not None

    def __init__(self, param: str, **kwargs) -> None:
        import spacy

        self.param = param
        try:
            self.nlp = spacy.load(param, exclude=["ner", "parser"])
        except IOError as err:
            raise OperationFailedException(
                f"Loading spaCy model '{param}' failed - "
                + f"please download the model.\n{err}"
            )
        if _KEY_LOWERCASE in kwargs:
            self.lowercase = annif.util.boolean(kwargs[_KEY_LOWERCASE])
        else:
            self.lowercase = False
        super().__init__(**kwargs)

    def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
        lemmas = [
            lemma
            for lemma in (token.lemma_ for token in self.nlp(text.strip()))
            if (not filter or self.is_valid_token(lemma))
        ]
        if self.lowercase:
            return [lemma.lower() for lemma in lemmas]
        else:
            return lemmas