api/views.py from 18F/calc

api/views.py
Summary

Maintainability

1 day
Test Coverage

51%
Issues
import bleach
import csv
from decimal import Decimal
from textwrap import dedent

from django.http import HttpResponse
from django.db.models import Avg, Max, Min, Count, StdDev
from django.utils.safestring import SafeString

from markdown import markdown
from rest_framework import serializers
from rest_framework.response import Response
from rest_framework.views import APIView
from rest_framework.schemas import AutoSchema
from rest_framework.compat import coreapi, coreschema
from django.utils.html import strip_tags
from rest_framework import generics

from api.pagination import ContractPagination
from api.serializers import ContractSerializer, ScheduleMetadataSerializer
from api.utils import get_histogram
from contracts.models import Contract, EDUCATION_CHOICES, ScheduleMetadata
from calc.utils import humanlist, backtickify


DOCS_DESCRIPTION = dedent("""
CALC's back-end exposes a public API for its labor rates data.
This API is used by CALC's front-end Data Explorer application,
and can also be accessed by any third-party application over
the public internet.

For more developer documentation on CALC, please visit
[/docs/](/docs/).
""")

SIMPLE_QUERYARG_TYPE_MAP = {
    int: coreschema.Integer,
    str: coreschema.String,
}

ALL_CONTRACT_FIELDS = ContractSerializer.Meta.fields

SORTABLE_CONTRACT_FIELDS = list(set(
    ContractSerializer.Meta.fields
).intersection(set([f.name for f in Contract._meta.fields if f.db_index])))


def queryarg(name, _type, description):
    '''
    A simple helper method to generate a coreapi field out of a
    less verbose syntax.
    '''

    html_desc = SafeString(markdown(dedent(description)))

    return coreapi.Field(
        name,
        location="query",
        schema=SIMPLE_QUERYARG_TYPE_MAP[_type](
            description=html_desc,
        )
    )


Q_QUERYARG = queryarg("q", str, "Keywords to search by.")

GET_CONTRACTS_QUERYARGS = [
    Q_QUERYARG,
    queryarg(
        "experience_range",
        str,
        """
        Filter by a range of years of experience, e.g. `5-10`.
        This is a convenient alternative to separately
        specifying `min_experience` and `max_experience`.
        """
    ),
    queryarg(
        "min_experience",
        int,
        "Filter by minimum years of experience."
    ),
    queryarg(
        "max_experience",
        int,
        "Filter by maximum years of experience."
    ),
    queryarg(
        "min_education",
        str,
        "Filter by a minimum level of education:\n\n" + '\n'.join([
            f"* `{code}` = {desc}"
            for code, desc in EDUCATION_CHOICES
        ])
    ),
    queryarg(
        "schedule",
        str,
        """
        Filter by GSA schedule. See [/api/schedules/](/api/schedules/)
        for a list of valid values.
        """,
    ),
    queryarg(
        "sin",
        str,
        """
        Filter by SIN number. See [/api/schedules/](/api/schedules/)
        for a list of example values.

        Note that due to the current state of data, not all
        results may be returned.  For more details, see
        [#1033](https://github.com/18F/calc/issues/1033).
        """,
    ),
    queryarg(
        "site",
        str,
        """
        Filter by worksite. Can be `customer`, `contractor`,
        or `both`.
        """
    ),
    queryarg(
        "business_size",
        str,
        """
        Filter by business size: `s` for small business, or
        `o` for other than small business.
        """
    ),
    queryarg(
        "price",
        int,
        "Filter by exact price."
    ),
    queryarg(
        "price__gte",
        int,
        "Price must be greater than or equal to this integer."
    ),
    queryarg(
        "price__lte",
        int,
        "Price must be less than or equal to this integer."
    ),
    queryarg(
        "sort",
        str,
        f"""
        Comma-separated list of columns to sort on.
        Defaults to the field used for pricing.

        Sortable columns include
        {humanlist(backtickify(SORTABLE_CONTRACT_FIELDS))}.
        """
    ),
    queryarg(
        "query_type",
        str,
        """
        Defines how the user's keyword search should work.
        Can be `match_all` (default), `match_phrase`, or `match_exact`.
        """
    ),
    queryarg(
        "exclude",
        str,
        "Comma-separated list of ids to exclude from the search results."
    )
]


def get_contracts_queryset(request_params, wage_field):
    """
    Filters and returns contracts based on query params

    Args:
        request_params (dict): the request query parameters, corresponding
            to GET_CONTRACTS_QUERYARGS above.
        wage_field (str): the name of the field currently being used for
            wage calculations and sorting

    Returns:
        QuerySet: a filtered and sorted QuerySet to retrieve Contract objects
    """

    query = request_params.get('q', None)
    # Ideally we'd go ahead and return a plain queryset here if there is
    # no query to avoid doing extra work, but before we can do that
    # we'll have to ensure filtering fields can't do anything in the absence
    # of a query or else Very Strange Things happen.

    # Since our query can be multi-phrase, leave the original queryset alone.
    # Instead, start with an empty queryset, then find matching subsets
    # in the original and chain them together.
    if query:
        query_type = request_params.get('query_type', 'match_all')
        query_by = request_params.get('query_by', None)
        contracts = Contract.objects.all().multi_phrase_search(
            query, query_by, query_type)
    else:  # no query, so start with full query set
        contracts = Contract.objects.all()

    # Exclude records w/o rates for the selected contract period.
    # Additional price filtering is already in the CurrentContractManager
    contracts = contracts.exclude(**{wage_field + '__isnull': True})

    exclude = request_params.getlist('exclude')
    if exclude:
        # getlist only works for key=val&key=val2, not for key=val1,val2
        # this is safe because any non-integer value in the request params
        # wouldn't match id__in anyway.

        # TO-DO: take a list of phrases, pass them through
        # `clean_search` and then exclude those phrases
        exclude = exclude[0].split(',')
        contracts = contracts.exclude(id__in=exclude)

    # *** EXPERIENCE ***
    min_experience = request_params.get('min_experience', None)
    max_experience = request_params.get('max_experience', None)
    experience_range = request_params.get('experience_range', None)
    if experience_range:
        years = experience_range.split(',')
        min_experience = years[0]
        if len(years) > 1:
            max_experience = years[1]
    # Ensure the input matches expected numeric format to avoid injections
    if min_experience and min_experience.isdigit():
        contracts = contracts.filter(min_years_experience__gte=min_experience)

    if max_experience and max_experience.isdigit():
        contracts = contracts.filter(min_years_experience__lte=max_experience)

    # *** EDUCATION ***
    ed_levels = [x[0] for x in EDUCATION_CHOICES]
    min_education = request_params.get('min_education', None)
    if min_education:
        min_level = ed_levels.index(min_education)
        if min_level:  # The submitted value matched a choice and wasn't weird.
            contracts = contracts.filter(education_level__in=ed_levels[min_level:])

    education = request_params.get('education', None)
    if education:
        # Find submitted levels that are within our group of accepted values
        degrees = [value for value in education.split(',') if value in ed_levels]
        if degrees:
            contracts = contracts.filter(education_level__in=degrees)

    schedule = request_params.get('schedule', None)
    if schedule:
        schedule = bleach.clean(schedule)
        contracts = contracts.filter(schedule__iexact=schedule)

    site = request_params.get('site', None)
    if site:
        site = bleach.clean(site)
        contracts = contracts.filter(contractor_site__icontains=site)

    business_size = request_params.get('business_size', None)
    if business_size and business_size in ('s', 'o'):
        if business_size == 's':
            contracts = contracts.filter(business_size__istartswith='s')
        else:
            contracts = contracts.filter(business_size__istartswith='o')

    # WE NEED TO DOUBLE CHECK SIN AND PRICE.
    # THEY DO NOT APPEAR TO BE ON THE SEARCH PAGE.
    sin = request_params.get('sin', None)
    if sin:
        contracts = contracts.filter(sin__icontains=sin)

    price = request_params.get('price', None)
    price__gte = request_params.get('price__gte')
    price__lte = request_params.get('price__lte')
    if price:
        contracts = contracts.filter(**{wage_field + '__exact': price})
    else:
        if price__gte:
            contracts = contracts.filter(**{wage_field + '__gte': price__gte})
        if price__lte:
            contracts = contracts.filter(**{wage_field + '__lte': price__lte})

    # get any sorting params and sort by them.
    sort = request_params.get('sort', wage_field).split(',')
    for field in sort:
        if field.startswith('-'):
            field = field[1:]
        if field not in ALL_CONTRACT_FIELDS:
            stripped = strip_tags(field)
            clean_field = bleach.clean(stripped, tags=[], strip=True)
            raise serializers.ValidationError(f'"{clean_field}" is not a valid field to sort on')
        if field not in SORTABLE_CONTRACT_FIELDS:
            stripped = strip_tags(field)
            clean_field = bleach.clean(stripped, tags=[], strip=True)
            raise serializers.ValidationError(f'Unable to sort on the field "{clean_field}"')

    return contracts.order_by(*sort)


def quantize(num, precision=2):
    if num is None:
        return None
    return Decimal(num).quantize(Decimal(10) ** -precision)


class GetRates(APIView):
    """
    Get detailed information about all labor rates that match a search query.

    The JSON response contains the following keys:

    * `next` is a URL that points to the next page of results, or
    `null` if no additional pages are available.
    * `previous` is a URL that points to the previous page of
    results, or `null` if no previous pages are available.
    * `results` is an array containing the results for the
    current page. Each item in the array contains the following keys:
        * `id` is the internal ID of the rate in the CALC database.
            This can be passed to the `exclude` query parameter to
            exclude individual rates from search results.
        * `idv_piid` is the contract number of the contract that
            contains the labor rate.
        * `vendor_name` is the name of the vendor that the
            labor rate's contract is under.
        * `education_level` is the minimum level of education
            for the labor rate.
        * `min_years_experience` is the minimum years of experience
            for the labor rate.
        * `hourly_rate_year1`, `current_price`, `next_year_price`,
            and `second_year_price` contain pricing information for
            the labor rate.
        * `schedule` is the schedule the labor rate is under. See
            [/api/schedules/](/api/schedules/) for a list of valid values.
        * `sin` describes the special item numbers (SINs) the labor
            rate is under. See
            [#1033](https://github.com/18F/calc/issues/1033) for
            details on some limitations.
        * `contractor_site` is the worksite of the labor rate.
        * `business_size` is the business size of the vendor
            offering the labor rate.

    Additionally, the response contains aggregate details about
    the distribution of the search results, across all pages:

    * `count` is the total number rates.
    * `average` is the average price of the rates.
    * `minimum` is the minimum price of the rates.
    * `maximum` is the maximum price of the rates.
    * `first_standard_deviation` is the first standard deviation
      of the rates.
    * `wage_histogram` is an array that contains an object for
      each bin in the histogram (the number of bins can be
      specified via the `histogram` query parameter):
        * `min` is the minimum price of the bin.
        * `max` is the maximum price of the bin.
        * `count` is the number of prices in the bin.
    """

    # The AutoSchema will introspect this to ultimately generate
    # documentation about our pagination query args.
    pagination_class = ContractPagination

    schema = AutoSchema(
        manual_fields=[
            queryarg(
                "contract-year",
                int,
                """
                Return price for the given contract year (1 or 2).
                Defaults to the current year pricing.
                """
            ),
            queryarg(
                "histogram",
                int,
                """
                Number of bins to divide a wage histogram into.
                If not provided, no histogram data will be returned.
                """
            ),
        ] + GET_CONTRACTS_QUERYARGS
    )

    def get(self, request):
        bins = request.query_params.get('histogram', None)

        """
        wage_field determines prices for a given year:
        This year, next year, or the year after.
        Relies on indexing a list, which may be less reliable than a tuple.

        This is used both here in get() and downstream in get_query_set(),
        so we have to pass it through.
        """
        possible_wage_fields = ['current_price', 'next_year_price', 'second_year_price']
        year = request.query_params.get('contract-year', 0)
        wage_field = possible_wage_fields[int(year)]
        contracts_all = self.get_queryset(request.query_params, wage_field)

        stats = contracts_all.aggregate(
            Min(wage_field), Max(wage_field),
            Avg(wage_field), StdDev(wage_field))

        page_stats = {
            'minimum': stats[wage_field + '__min'],
            'maximum': stats[wage_field + '__max'],
            'average': quantize(stats[wage_field + '__avg']),
            'first_standard_deviation': quantize(
                stats[wage_field + '__stddev']
            )
        }

        if bins and bins.isnumeric():
            values = contracts_all.values_list(wage_field, flat=True)
            page_stats['wage_histogram'] = get_histogram(values, int(bins))

        pagination = self.pagination_class(page_stats)
        results = pagination.paginate_queryset(contracts_all, request)
        serializer = ContractSerializer(results, many=True)
        return pagination.get_paginated_response(serializer.data)

    def get_queryset(self, request, wage_field):
        return get_contracts_queryset(request, wage_field)


class ScheduleMetadataList(generics.ListAPIView):
    """
    Returns an array of objects representing metadata about
    Schedules offered by CALC. Each object contains the following keys:

    * `schedule` is the identifier for the schedule as it appears in
      other CALC API endpoints, such as [/api/rates/](/api/rates/).
    * `full_name` is the full name of the schedule as it should appear
      to end users.
    * `sin` is the SIN number of the schedule, if one exists.
    """

    queryset = ScheduleMetadata.objects.all()
    serializer_class = ScheduleMetadataSerializer


class GetRatesCSV(APIView):
    """
    Returns a CSV of matched records and selected search and filter options.

    Note that the first two rows actually contain metadata about the requested
    search and filter options. The subsequent rows contain the
    matched records.
    """

    schema = AutoSchema(
        manual_fields=GET_CONTRACTS_QUERYARGS
    )

    def get(self, request, format=None):
        wage_field = 'current_price'
        contracts_all = get_contracts_queryset(request.GET, wage_field)

        q = request.query_params.get('q', 'None')

        # If the query starts with special chars that could be interpreted
        # as parts of a formula by Excel, then prefix the query with
        # an apostrophe so that Excel instead treats it as plain text.
        # See https://issues.apache.org/jira/browse/CSV-199
        # for more information.
        if q.startswith(('@', '-', '+', '=', '|', '%')):
            q = "'" + q

        min_education = request.query_params.get(
            'min_education', 'None Specified')
        min_experience = request.query_params.get(
            'min_experience', 'None Specified')
        site = request.query_params.get('site', 'None Specified')
        business_size = request.query_params.get(
            'business_size', 'None Specified')
        business_size_map = {
            'o': 'other than small',
            's': 'small business'
        }
        business_size_set = business_size_map.get(business_size)
        if business_size_set:
            business_size = business_size_set

        response = HttpResponse(content_type="text/csv")
        response['Content-Disposition'] = ('attachment; '
                                           'filename="pricing_results.csv"')
        writer = csv.writer(response)
        writer.writerow(("Search Query", "Minimum Education Level",
                         "Minimum Years Experience", "Worksite",
                         "Business Size", "", "", "", "", "", "", "", "", ""))
        writer.writerow((q, min_education, min_experience, site,
                         business_size, "", "", "", "", "", "", "", "", ""))
        writer.writerow(("Contract #", "Business Size", "Schedule", "Site",
                         "Begin Date", "End Date", "SIN", "Vendor Name",
                         "Labor Category", "education Level",
                         "Minimum Years Experience",
                         "Current Year Labor Price", "Next Year Labor Price",
                         "Second Year Labor Price"))

        for c in contracts_all:
            writer.writerow((c.idv_piid, c.get_readable_business_size(),
                             c.schedule, c.contractor_site, c.contract_start,
                             c.contract_end, c.sin, c.vendor_name,
                             c.labor_category, c.get_education_level_display(),
                             c.min_years_experience, c.current_price,
                             c.next_year_price, c.second_year_price))

        return response


class GetAutocomplete(APIView):
    """
    Return autocomplete suggestions for a given query.

    The JSON response is an array containing objects
    with the following keys:

    * `labor_category` is the name of a labor category that
      matches your query.

    * `count` is the number of records with the labor category.
    """

    schema = AutoSchema(
        manual_fields=[
            Q_QUERYARG,
            queryarg(
                "query_type",
                str,
                """
                Defines how the search query should work.
                Can be `match_all` (default) or `match_phrase`.
                """
            )
        ]
    )

    MAX_RESULTS = 20

    def get(self, request, format=None):
        q = request.query_params.get('q', False)
        query_type = request.query_params.get('query_type', 'match_all')
        query_by = request.query_params.get('query_by', None)

        if q:
            data = Contract.objects.all().multi_phrase_search(
                q, query_by, query_type)

            data = data.values('_normalized_labor_category').annotate(
                count=Count('_normalized_labor_category')).order_by('-count')

            # limit data to MAX_RESULTS
            data = data[:self.MAX_RESULTS]

            data = [
                {'labor_category': d['_normalized_labor_category'],
                 'count': d['count']}
                for d in data
            ]
            return Response(data)
        else:
            return Response([])