fedspendingtransparency/usaspending-api

View on GitHub
usaspending_api/awards/v2/filters/search.py

Summary

Maintainability
F
3 days
Test Coverage
F
39%
import itertools
import logging

from django.db.models import Q

from usaspending_api.accounts.views.federal_accounts_v2 import filter_on
from usaspending_api.awards.models import FinancialAccountsByAwards
from usaspending_api.search.filters.postgres.psc import PSCCodes
from usaspending_api.awards.v2.filters.filter_helpers import combine_date_range_queryset, total_obligation_queryset
from usaspending_api.awards.v2.filters.location_filter_geocode import geocode_filter_locations
from usaspending_api.common.exceptions import InvalidParameterException, NotImplementedException
from usaspending_api.recipient.models import RecipientProfile
from usaspending_api.references.models import PSC
from usaspending_api.search.helpers.matview_filter_helpers import build_award_ids_filter
from usaspending_api.search.filters.postgres.tas import TasCodes, TreasuryAccounts
from usaspending_api.search.models import TransactionSearch
from usaspending_api.search.v2 import elasticsearch_helper
from usaspending_api.settings import API_MAX_DATE, API_MIN_DATE, API_SEARCH_MIN_DATE


logger = logging.getLogger(__name__)


def transaction_search_filter(filters):
    return matview_search_filter(filters, TransactionSearch, for_downloads=True)


def matview_search_filter(filters, model, for_downloads=False):
    queryset = model.objects.all()

    recipient_scope_q = Q(recipient_location_country_code="USA") | Q(recipient_location_country_name="UNITED STATES")
    pop_scope_q = Q(pop_country_code="USA") | Q(pop_country_name="UNITED STATES")

    faba_flag = False
    faba_queryset = FinancialAccountsByAwards.objects.filter(award__isnull=False)

    for key, value in filters.items():
        if value is None:
            raise InvalidParameterException("Invalid filter: " + key + " has null as its value.")

        key_list = [
            "keywords",
            "elasticsearch_keyword",
            "time_period",
            "award_type_codes",
            "prime_and_sub_award_types",
            "agencies",
            "legal_entities",
            "recipient_id",
            "recipient_search_text",
            "recipient_scope",
            "recipient_locations",
            "recipient_type_names",
            "place_of_performance_scope",
            "place_of_performance_locations",
            "award_amounts",
            "award_ids",
            "program_numbers",
            "naics_codes",
            PSCCodes.underscore_name,
            "contract_pricing_type_codes",
            "set_aside_type_codes",
            "extent_competed_type_codes",
            TasCodes.underscore_name,
            TreasuryAccounts.underscore_name,
            # next 3 keys used by federal account page
            "federal_account_ids",
            "object_class",
            "program_activity",
        ]

        if key not in key_list:
            raise InvalidParameterException("Invalid filter: " + key + " does not exist.")

        if key == "keywords":

            def keyword_parse(keyword):
                # keyword_ts_vector & award_ts_vector are Postgres TS_vectors.
                # keyword_ts_vector = recipient_name + naics_code + naics_description
                #     + psc_description + awards_description
                # award_ts_vector = piid + fain + uri
                filter_obj = Q(keyword_ts_vector=keyword) | Q(award_ts_vector=keyword)
                if keyword.isnumeric():
                    filter_obj |= Q(naics_code__contains=keyword)
                if len(keyword) == 4 and PSC.objects.all().filter(code__iexact=keyword).exists():
                    filter_obj |= Q(product_or_service_code__iexact=keyword)

                return filter_obj

            filter_obj = Q()
            for keyword in value:
                filter_obj |= keyword_parse(keyword)
            potential_duns = list(filter((lambda x: len(x) > 7 and len(x) < 10), value))
            if len(potential_duns) > 0:
                filter_obj |= Q(recipient_unique_id__in=potential_duns) | Q(
                    parent_recipient_unique_id__in=potential_duns
                )

            queryset = queryset.filter(filter_obj)

        elif key == "elasticsearch_keyword":
            keyword = " ".join(value) if isinstance(value, list) else value
            transaction_ids = elasticsearch_helper.get_download_ids(keyword=keyword, field="transaction_id")
            # flatten IDs
            transaction_ids = list(itertools.chain.from_iterable(transaction_ids))
            logger.info("Found {} transactions based on keyword: {}".format(len(transaction_ids), keyword))
            transaction_ids = [str(transaction_id) for transaction_id in transaction_ids]
            queryset = queryset.extra(
                where=[
                    '"transaction_search"."transaction_id" = ANY(\'{{{}}}\'::int[])'.format(
                        "," "".join(transaction_ids)
                    )
                ]
            )

        elif key == "time_period":
            min_date = API_SEARCH_MIN_DATE
            if for_downloads:
                min_date = API_MIN_DATE
            queryset &= combine_date_range_queryset(value, model, min_date, API_MAX_DATE)

        elif key == "award_type_codes":
            queryset = queryset.filter(type__in=value)

        elif key == "prime_and_sub_award_types":
            award_types = value.get("prime_awards")
            if award_types:
                queryset = queryset.filter(type__in=award_types)

        elif key == "agencies":
            # TODO: Make function to match agencies in award filter throwing dupe error
            funding_toptier = Q()
            funding_subtier = Q()
            awarding_toptier = Q()
            awarding_subtier = Q()
            for v in value:
                type = v["type"]
                tier = v["tier"]
                name = v["name"]
                if type == "funding":
                    if tier == "toptier":
                        funding_toptier |= Q(funding_toptier_agency_name=name)
                    elif tier == "subtier":
                        if "toptier_name" in v:
                            funding_subtier |= Q(funding_subtier_agency_name=name) & Q(
                                funding_toptier_agency_name=v["toptier_name"]
                            )
                        else:
                            funding_subtier |= Q(funding_subtier_agency_name=name)

                elif type == "awarding":
                    if tier == "toptier":
                        awarding_toptier |= Q(awarding_toptier_agency_name=name)
                    elif tier == "subtier":
                        if "toptier_name" in v:
                            awarding_subtier |= Q(awarding_subtier_agency_name=name) & Q(
                                awarding_toptier_agency_name=v["toptier_name"]
                            )
                        else:
                            awarding_subtier |= Q(awarding_subtier_agency_name=name)

            awarding_queryfilter = Q()
            funding_queryfilter = Q()

            # Since these are Q filters, no DB hits for boolean checks
            if funding_toptier:
                funding_queryfilter |= funding_toptier
            if funding_subtier:
                funding_queryfilter |= funding_subtier
            if awarding_toptier:
                awarding_queryfilter |= awarding_toptier
            if awarding_subtier:
                awarding_queryfilter |= awarding_subtier

            queryset = queryset.filter(funding_queryfilter & awarding_queryfilter)

        elif key == "legal_entities":
            # This filter key has effectively become obsolete by recipient_search_text
            msg = 'API request included "{}" key. No filtering will occur with provided value "{}"'
            logger.info(msg.format(key, value))
            # in_query = [v for v in value]
            # if len(in_query) != 0:
            #     queryset &= model.objects.filter(recipient_id__in=in_query)

        elif key == "recipient_search_text":
            all_filters_obj = Q()
            for recip in value:
                upper_recipient_string = str(recip).upper()
                # recipient_name_ts_vector is a postgres TS_Vector
                filter_obj = Q(recipient_name_ts_vector=upper_recipient_string)
                if len(upper_recipient_string) == 9 and upper_recipient_string[:5].isnumeric():
                    filter_obj |= Q(recipient_unique_id=upper_recipient_string)
                all_filters_obj |= filter_obj
            queryset = queryset.filter(all_filters_obj)

        elif key == "recipient_id":
            filter_obj = Q()
            recipient_hash = value[:-2]

            if value.endswith("P"):  # For parent types, gather all of the children's transactions
                parent_duns_rows = RecipientProfile.objects.filter(
                    recipient_hash=recipient_hash, recipient_level="P"
                ).values("recipient_unique_id")
                if len(parent_duns_rows) == 1:
                    parent_duns = parent_duns_rows[0]["recipient_unique_id"]
                    filter_obj = Q(parent_recipient_unique_id=parent_duns)
                elif len(parent_duns_rows) > 2:
                    # shouldn't occur
                    raise InvalidParameterException("Non-unique parent record found in RecipientProfile")
            elif value.endswith("C"):
                filter_obj = Q(recipient_hash=recipient_hash, parent_recipient_unique_id__isnull=False)
            else:
                # "R" recipient level
                filter_obj = Q(recipient_hash=recipient_hash, parent_recipient_unique_id__isnull=True)
            queryset = queryset.filter(filter_obj)

        elif key == "recipient_scope":
            if value == "domestic":
                queryset = queryset.filter(recipient_scope_q)
            elif value == "foreign":
                queryset = queryset.exclude(recipient_scope_q)
            else:
                raise InvalidParameterException("Invalid filter: recipient_scope type is invalid.")

        elif key == "recipient_locations":
            queryset = queryset.filter(geocode_filter_locations("recipient_location", value))

        elif key == "recipient_type_names":
            if len(value) != 0:
                queryset = queryset.filter(business_categories__overlap=value)

        elif key == "place_of_performance_scope":
            if value == "domestic":
                queryset = queryset.filter(pop_scope_q)
            elif value == "foreign":
                queryset = queryset.exclude(pop_scope_q)
            else:
                raise InvalidParameterException("Invalid filter: place_of_performance_scope is invalid.")

        elif key == "place_of_performance_locations":
            queryset = queryset.filter(geocode_filter_locations("pop", value))

        elif key == "award_amounts":
            queryset &= total_obligation_queryset(value, model, filters)

        elif key == "award_ids":
            queryset = build_award_ids_filter(queryset, value, ("piid", "fain", "uri"))

        elif key == "program_numbers":
            in_query = [v for v in value]
            if len(in_query) != 0:
                queryset = queryset.filter(cfda_number__in=in_query)

        elif key == "naics_codes":
            if isinstance(value, list):
                require = value
            elif isinstance(value, dict):
                require = value.get("require") or []
                if value.get("exclude"):
                    raise NotImplementedException(
                        "NOT IMPLEMENTED: postgres endpoint does not currently support excluded naics!"
                    )

            else:
                raise InvalidParameterException("naics_codes must be an array or object")

            if [value for value in require if len(str(value)) not in [2, 4, 6]]:
                raise InvalidParameterException(
                    "naics code filtering only supported for codes with lengths of 2, 4, and 6"
                )

            regex = f"^({'|'.join([str(elem) for elem in require])}).*"
            queryset = queryset.filter(naics_code__regex=regex)

        elif key == PSCCodes.underscore_name:
            q = PSCCodes.build_tas_codes_filter(value)
            queryset = queryset.filter(q) if q else queryset

        elif key == "contract_pricing_type_codes":
            in_query = [v for v in value]
            if len(in_query) != 0:
                queryset = queryset.filter(type_of_contract_pricing__in=in_query)

        elif key == "set_aside_type_codes":
            or_queryset = Q()
            for v in value:
                or_queryset |= Q(type_set_aside__exact=v)
            queryset = queryset.filter(or_queryset)

        elif key == "extent_competed_type_codes":
            or_queryset = Q()
            for v in value:
                or_queryset |= Q(extent_competed__exact=v)
            queryset = queryset.filter(or_queryset)

        # Because these two filters OR with each other, we need to know about the presence of both filters to know what to do
        # This filter was picked arbitrarily to be the one that checks for the other
        elif key == TasCodes.underscore_name:
            q = TasCodes.build_tas_codes_filter(queryset, value)
            if TreasuryAccounts.underscore_name in filters.keys():
                q |= TreasuryAccounts.build_tas_codes_filter(queryset, filters[TreasuryAccounts.underscore_name])
            queryset = queryset.filter(q)

        elif key == TreasuryAccounts.underscore_name and TasCodes.underscore_name not in filters.keys():
            queryset = queryset.filter(TreasuryAccounts.build_tas_codes_filter(queryset, value))

        # Federal Account Filter
        elif key == "federal_account_ids":
            faba_flag = True
            or_queryset = Q()
            for v in value:
                or_queryset |= Q(treasury_account__federal_account_id=v)
            faba_queryset = faba_queryset.filter(or_queryset)

        # Federal Account Filter
        elif key == "object_class":
            result = Q()
            for oc in value:
                subresult = Q()
                subresult &= filter_on("award__awardsearch__financial_set__object_class", "object_class", oc)
                result |= subresult
            queryset = queryset.filter(result)

        # Federal Account Filter
        elif key == "program_activity":
            or_queryset = Q()
            for v in value:
                or_queryset |= Q(award__awardsearch__financial_set__program_activity__id=v)
            queryset = queryset.filter(or_queryset)

    if faba_flag:
        award_ids = faba_queryset.values("award_id")
        queryset = queryset.filter(award_id__in=award_ids)

    return queryset