18F/analytics-reporter-api

View on GitHub
src/db.js

Summary

Maintainability
A
0 mins
Test Coverage
const knex = require("knex");

const config = require("./config");

const db = knex({ client: "pg", connection: config.postgres });

const parseLimitParam = (limitParam) => {
  const limit = parseInt(limitParam, 10);

  if (limit > 10000 || limit <= 0) {
    return 10000;
  }
  return limit;
};

const parsePageParam = (pageParam) => {
  const page = parseInt(pageParam, 10);
  return Math.max(1, page);
};

/**
 * @param {string} before the maximum date for the query in ISO format
 * @param {string} after the minimum date for the query in ISO format
 * @returns {string} the constructed where clause for the query
 */
const buildTimeQuery = (before, after) => {
  if (before && after) {
    return ['"date" <= ?::date AND "date" >= ?::date', [before, after]];
  }
  if (before) {
    return ['"date" <= ?::date', [before]];
  }
  if (after) {
    return ['"date" >= ?::date', [after]];
  }
  return [true];
};

/**
 * @param {string} domain the domain to be queried
 * @param {string} reportName the report to be queried
 * @param {number} limitParam the maximum number of results to return
 * @param {number} pageParam the page of data to return based on the limit
 * @param {string} before the maximum date for the query in ISO format
 * @param {string} after the minimum date for the query in ISO format
 * @param {string} dbTable the table name to query
 * @returns {Promise<string[]>} resolves with the result of the database query
 */
const queryDomain = (
  domain,
  reportName,
  limitParam,
  pageParam,
  before,
  after,
  dbTable,
) => {
  const timeQuery = buildTimeQuery(before, after);

  const mainQuery = db(dbTable).where({ report_name: reportName });

  if (reportName == "download") {
    mainQuery.whereRaw("data->> 'page' like ?", [`%${domain}%`]);
  } else {
    mainQuery.whereRaw(
      `data @> '${JSON.stringify({ domain }).replaceAll("'", "''").replaceAll("?", "\\?")}'::jsonb`,
    );
  }

  return (
    mainQuery
      .whereRaw(...timeQuery)
      // Using `orderByRaw` in order to specifcy NULLS LAST, see:
      // https://github.com/knex/knex/issues/282
      .orderByRaw("date desc NULLS LAST")
      // Previously, this was ordered by data-->total_events and data-->visits. Those queries
      // were very slow, and from what I can tell, it's not possible to add the proper multi-field
      // index on (date, data-->total_events, data-->visits) to speed up the queries, because `data`
      // is a JSON field. See this (rather wordy, sorry) thread for more details:
      // https://github.com/18F/analytics-reporter-api/issues/161#issuecomment-874860764
      //
      // Ordering by `id` here does _not_ guarantee ordering based on total_events or visits. However,
      // the order in which data is inserted into the table (by code in the analytics-reporter repo, which
      // pulls from Google Analytics) happens to be in order by visits or total_events, so ordering by
      // IDs may in practice keep the same ordering as before - but it would be best not to rely on this.
      // A longer term fix would be to move the total_events and visits fields to their own columns.
      .orderBy("id", "asc")
      .limit(limitParam)
      .offset((pageParam - 1) * limitParam)
  );
};

/**
 * @param {object} queryConfig the config values for the database query
 * @param {string} queryConfig.reportName the report to be queried
 * @param {string} queryConfig.reportAgency the agency to be queried
 * @param {number} queryConfig.limit the max results to return
 * @param {number} queryConfig.page the page of data to return based on the limit
 * @param {string} queryConfig.domain the domain to be queried
 * @param {string} queryConfig.after the minimum date for the query in ISO format
 * @param {string} queryConfig.before the maximum date for the query in ISO format
 * @param {string} queryConfig.version the API version
 * @returns {Promise<string[]>} resolves with the result of the database query
 */
const query = ({
  reportName,
  reportAgency = null,
  limit = 1000,
  page = 1,
  domain = null,
  after = null,
  before = null,
  version,
}) => {
  // we have different tables for new ga4
  // TODO: once UA has sunset we can remove this
  const dbTable = version === "1.1" ? "analytics_data" : "analytics_data_ga4";
  const limitParam = parseLimitParam(limit);
  const pageParam = parsePageParam(page);
  if (domain) {
    return queryDomain(
      domain,
      reportName,
      limitParam,
      pageParam,
      before,
      after,
      dbTable,
    );
  }
  const recordQuery = Object.assign({
    report_name: reportName,
    report_agency: reportAgency,
  });
  const timeQuery = buildTimeQuery(before, after);

  return (
    db(dbTable)
      .where(recordQuery)
      .whereRaw(...timeQuery)
      // Using `orderByRaw` in order to specifcy NULLS LAST, see:
      // https://github.com/knex/knex/issues/282
      .orderByRaw("date desc NULLS LAST")
      // Previously, this was ordered by data-->total_events and data-->visits. Those queries
      // were very slow, and from what I can tell, it's not possible to add the proper multi-field
      // index on (date, data-->total_events, data-->visits) to speed up the queries, because `data`
      // is a JSON field. See this (rather wordy, sorry) thread for more details:
      // https://github.com/18F/analytics-reporter-api/issues/161#issuecomment-874860764
      //
      // Ordering by `id` here does _not_ guarantee ordering based on total_events or visits. However,
      // the order in which data is inserted into the table (by code in the analytics-reporter repo, which
      // pulls from Google Analytics) happens to be in order by visits or total_events, so ordering by
      // IDs may in practice keep the same ordering as before - but it would be best not to rely on this.
      // A longer term fix would be to move the total_events and visits fields to their own columns.
      .orderBy("id", "asc")
      .limit(limitParam)
      .offset((pageParam - 1) * limitParam)
  );
};

module.exports = { query, queryDomain, buildTimeQuery, dbClient: db };