# See usages of this compose file and its commands in the README.md file
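# A couple of illustrative invocations (sketches only; the README.md is the authoritative reference):
#   docker-compose --profile usaspending up                      # bring up the API, DB, Elasticsearch, and Kibana stack
#   docker-compose --profile manage run --rm usaspending-manage python3 -u manage.py help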
version: '3.7'

volumes:
  local_pg_data:
    driver: local
  local_es_data:
    driver: local

services:

  usaspending-db:
    command: postgres -c 'max_connections=500'
    profiles:
      - usaspending  # must pass --profile usaspending to docker-compose for this to come up, or run a service with one of these other profiles
      - manage
      - test
      - ci
    image: postgres:13.8-alpine
    container_name: usaspending-db
    volumes:
      - type: volume
        source: local_pg_data
        target: /var/lib/postgresql/data
    ports:
      - ${USASPENDING_DB_PORT:-5432}:5432
    environment:
      POSTGRES_USER: ${USASPENDING_DB_USER:-usaspending}
      POSTGRES_PASSWORD: ${USASPENDING_DB_PASSWORD:-usaspender}
      POSTGRES_DB: ${USASPENDING_DB_NAME:-data_store_api}
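    # The host port and credentials above can be overridden via a .env file or the shell environment,
    # e.g. (sketch): USASPENDING_DB_PORT=5433 docker-compose --profile usaspending up usaspending-db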

  usaspending-manage:
    profiles:
      - manage  # must pass --profile manage to docker-compose for this to come up, or use docker-compose run
    image: usaspending-backend  # when an image by this name is not found in the local repo, and it is forced to build, it will use this as the tag
    build: .
    container_name: usaspending-manage
    volumes:
      - .:/dockermount
    depends_on:
      - usaspending-db
    # For an interactive shell, override this command with: docker-compose run --rm usaspending-manage python3 -u manage.py shell
    command: python3 -u manage.py help
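    # Other one-off management commands can be run the same way, e.g. (sketch, using a standard Django
    # command): docker-compose run --rm usaspending-manage python3 -u manage.py migrate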
    environment:
      DJANGO_DEBUG: ${DJANGO_DEBUG}
      DATABASE_URL: postgres://${USASPENDING_DB_USER}:${USASPENDING_DB_PASSWORD}@${USASPENDING_DB_HOST}:${USASPENDING_DB_PORT}/data_store_api
      ES_HOSTNAME: ${ES_HOSTNAME}
      DATA_BROKER_DATABASE_URL: postgresql://${BROKER_DB_USER}:${BROKER_DB_PASSWORD}@${BROKER_DB_HOST}:${BROKER_DB_PORT}/data_broker

  usaspending-test:
    profiles:
      - test  # must pass --profile test to docker-compose for this to come up, or use docker-compose run
    image: usaspending-testing  # when an image by this name is not found in the local repo, and it is forced to build, it will use this as the tag
    build:
      context: .
      dockerfile: Dockerfile.testing
    container_name: usaspending-test
    volumes:
      - .:/dockermount
      # Required to interact with host's docker daemon from within this running container,
      # to spin up the data-act-broker-init-test-db container used for broker integration tests (see: conftest.broker_db_setup)
      - /var/run/docker.sock:/var/run/docker.sock
    depends_on:
      - usaspending-db
      - usaspending-es
    command: make tests
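    # To run a narrower selection than "make tests", override the command, e.g. (sketch; the test path
    # is hypothetical): docker-compose run --rm usaspending-test pytest usaspending_api/awards/tests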
    environment:
      DATABASE_URL: postgres://${USASPENDING_DB_USER}:${USASPENDING_DB_PASSWORD}@${USASPENDING_DB_HOST}:${USASPENDING_DB_PORT}/data_store_api
      ES_HOST: ${ES_HOST}
      ES_HOSTNAME: ${ES_HOSTNAME}
      DATA_BROKER_DATABASE_URL: postgresql://${BROKER_DB_USER}:${BROKER_DB_PASSWORD}@${BROKER_DB_HOST}:${BROKER_DB_PORT}/data_broker
      # Location on the host machine where the broker source code root can be found
      DATA_BROKER_SRC_PATH: "$PWD/../data-act-broker-backend"
      MINIO_HOST: ${MINIO_HOST}
      DOWNLOAD_DATABASE_URL: postgres://${USASPENDING_DB_USER}:${USASPENDING_DB_PASSWORD}@${USASPENDING_DB_HOST}:${USASPENDING_DB_PORT}/data_store_api

  usaspending-ci:
    profiles:
      - ci  # must pass --profile ci to docker-compose for this to come up, or use docker-compose run
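    # e.g. to run the full CI check suite locally (sketch): docker-compose run --rm usaspending-ci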
    image: usaspending-backend  # when an image by this name is not found in the local repo, and it is forced to build, it will use this as the tag
    build: .
    container_name: usaspending-ci
    volumes:
      - .:/dockermount
      # Required to interact with host's docker daemon from within this running container,
      # to spin up the data-act-broker-init-test-db container used for broker integration tests (see: conftest.broker_db_setup)
      - /var/run/docker.sock:/var/run/docker.sock
    depends_on:
      - usaspending-db
      - usaspending-es
    command:
      - sh
      - -c
      - |
        printf "==============\nChecking code format:\n"
        black --check --diff .
        printf -- "-------\nChecking code syntax:\n"
        flake8 && echo "Successfully passed"
        printf -- "-------\nChecking API documentation files:\n"
        python3 manage.py check_for_endpoint_documentation
        printf -- "-------\nRunning unit tests:\n"
        pytest --durations 50 --ignore-glob='**/tests/integration/*' --cov=usaspending_api --cov-report= --reuse-db -rsx
        printf -- "-------\nRunning integration tests:\n"
        pytest --durations 50 --override-ini=python_files='**/tests/integration/*' --cov=usaspending_api --cov-append --cov-report term --cov-report xml:coverage.xml --reuse-db -rsx
    environment:
      DATABASE_URL: postgres://${USASPENDING_DB_USER}:${USASPENDING_DB_PASSWORD}@${USASPENDING_DB_HOST}:${USASPENDING_DB_PORT}/data_store_api
      ES_HOSTNAME: ${ES_HOSTNAME}
      DATA_BROKER_DATABASE_URL: postgresql://${BROKER_DB_USER}:${BROKER_DB_PASSWORD}@${BROKER_DB_HOST}:${BROKER_DB_PORT}/data_broker
      # Location on the host machine where the broker source code root can be found
      DATA_BROKER_SRC_PATH: "$PWD/../data-act-broker-backend"

  usaspending-api:
    profiles:
      - usaspending  # must pass --profile usaspending to docker-compose for this to come up
    image: usaspending-backend  # when an image by this name is not found in the local repo, and it is forced to build, it will use this as the tag
    build: .
    container_name: usaspending-api
    volumes:
      - .:/dockermount
    ports:
      - 8000:8000
    depends_on:
      - usaspending-db
      - usaspending-es
    restart: on-failure:3  # 3 attempts max, then it stops restarting
    # Must wait on postgres db to be up (~9s)
    command: /bin/sh -c "sleep 9s; python3 -u manage.py runserver --verbosity 2 0.0.0.0:8000"
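    # A sturdier wait than the fixed sleep is possible; a minimal sketch (assumes pg_isready is available
    # in the image, which is NOT guaranteed):
    #   /bin/sh -c "until pg_isready -h usaspending-db -p 5432; do sleep 1; done; python3 -u manage.py runserver --verbosity 2 0.0.0.0:8000"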
    environment:
      DJANGO_DEBUG: ${DJANGO_DEBUG}
      RUN_LOCAL_DOWNLOAD_IN_PROCESS: "False"
      DB_SOURCE: postgres://${USASPENDING_DB_USER}:${USASPENDING_DB_PASSWORD}@${USASPENDING_DB_HOST}:${USASPENDING_DB_PORT}/data_store_api
      DB_R1: postgres://${USASPENDING_DB_USER}:${USASPENDING_DB_PASSWORD}@${USASPENDING_DB_HOST}:${USASPENDING_DB_PORT}/data_store_api
      DOWNLOAD_DATABASE_URL: postgres://${USASPENDING_DB_USER}:${USASPENDING_DB_PASSWORD}@${USASPENDING_DB_HOST}:${USASPENDING_DB_PORT}/data_store_api
      ES_HOSTNAME: ${ES_HOSTNAME}

  usaspending-bulk-download:
    profiles:
      - usaspending  # must pass --profile usaspending to docker-compose for this to come up
    image: usaspending-backend  # when an image by this name is not found in the local repo, and it is forced to build, it will use this as the tag
    build: .
    container_name: usaspending-bulk-download
    restart: on-failure:5  # 5 attempts max, then it stops restarting. NOTE: each bulk download error causes one failure+restart iteration
    volumes:
      - .:/dockermount
    command: python3 manage.py download_sqs_worker
    environment:
      DJANGO_DEBUG: ${DJANGO_DEBUG}
      DATABASE_URL: postgres://${USASPENDING_DB_USER}:${USASPENDING_DB_PASSWORD}@${USASPENDING_DB_HOST}:${USASPENDING_DB_PORT}/data_store_api
      DOWNLOAD_DATABASE_URL: postgres://${USASPENDING_DB_USER}:${USASPENDING_DB_PASSWORD}@${USASPENDING_DB_HOST}:${USASPENDING_DB_PORT}/data_store_api

  usaspending-es:
    profiles:
      - usaspending  # must pass --profile usaspending to docker-compose for this to come up
      - test
      - ci
    image: docker.elastic.co/elasticsearch/elasticsearch:7.1.1
    container_name: usaspending-es
    environment:
      - node.name=usaspending-es
      - discovery.seed_hosts=usaspending-es
      - cluster.initial_master_nodes=usaspending-es
      - cluster.name=usaspending
      - network.host=0.0.0.0
      - bootstrap.memory_lock=true
      - "ES_JAVA_OPTS=-Xms1536m -Xmx1536m"  # Ensure Docker is allocated plenty of memory, otherwise this will fail
    # Inject the plugin install, then resume with the original entrypoint command
    command: >
      /bin/sh -c "
        if [ ! -d /usr/share/elasticsearch/plugins/mapper-murmur3 ]; then
          # Certificate problem workaround when on VPN - wget without checking cert, then install from local filesystem
          wget --no-check-certificate https://artifacts.elastic.co/downloads/elasticsearch-plugins/mapper-murmur3/mapper-murmur3-7.1.1.zip
          ./bin/elasticsearch-plugin install file:///usr/share/elasticsearch/mapper-murmur3-7.1.1.zip
        fi
        /usr/local/bin/docker-entrypoint.sh"
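    # Once the node is up, the plugin install can be verified from the host, e.g.:
    #   curl http://localhost:9200/_cat/plugins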
    ulimits:
      memlock:
        soft: -1
        hard: -1
    volumes:
      - type: volume
        source: local_es_data
        target: /usr/share/elasticsearch/data
    ports:
      - 9200:9200

  usaspending-kibana-es:
    profiles:
      - usaspending  # must pass --profile usaspending to docker-compose for this to come up
    image: docker.elastic.co/kibana/kibana-oss:7.1.1
    container_name: usaspending-kibana-es
    # ELASTICSEARCH_HOSTS should match the port for "usaspending-es"; value will need to be updated if using Windows
    environment:
      - ELASTICSEARCH_HOSTS="http://docker.for.mac.localhost:9200"
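      # On Linux, or if the hostname above does not resolve, pointing at the compose service name may
      # work instead (assumption): ELASTICSEARCH_HOSTS=http://usaspending-es:9200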
    ports:
      - 5601:5601

  minio:
    profiles:  # must pass one of these with --profile to docker-compose
      - s3
      - spark
      - test
    image: minio/minio:RELEASE.2022-04-12T06-55-35Z
    container_name: minio
    volumes:
      - .:/dockermount
      - type: bind
        source: ${MINIO_DATA_DIR:-../data/s3}
        target: /data
    ports:
      - ${MINIO_PORT:-10001}:10001
      - ${MINIO_CONSOLE_PORT:-10002}:10002
    environment:
      MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY:-usaspending}
      MINIO_SECRET_KEY: ${MINIO_SECRET_KEY:-usaspender}
    entrypoint: >
      /bin/sh -c "
        # Create the bucket within MinIO and the file path for the data dictionary
        mkdir -p data/dti-da-public-files-nonprod/user_reference_docs
        cp dockermount/usaspending_api/data/Data_Dictionary_Crosswalk.xlsx data/dti-da-public-files-nonprod/user_reference_docs/Data_Dictionary_Crosswalk.xlsx
        minio server --address ":10001" --console-address ":10002" /data
      "
    healthcheck:
      test: ["CMD", "curl", "-f", "http://${MINIO_HOST:-localhost}:${MINIO_PORT:-10001}/minio/health/live"]
      interval: 30s
      timeout: 20s
      retries: 3

  spark-master:
    profiles:
      - spark  # must pass --profile spark to docker-compose for this to come up
      - test
    image: spark-base  # when an image by this name is not found in the local repo, and it is forced to build, it will use this as the tag
    # build context path needs to be relative to project root, from where docker-compose will be run
    build:
      context: .
      dockerfile: Dockerfile.spark
      args:
        PROJECT_LOG_DIR: ${PROJECT_LOG_DIR}
    container_name: spark-master
    environment:
      SPARK_MASTER_HOST: ${SPARK_MASTER_HOST:-spark-master}
      SPARK_MASTER_PORT: ${SPARK_MASTER_PORT:-7077}
      SPARK_MASTER_WEBUI_PORT: ${SPARK_MASTER_WEBUI_PORT:-4040}
    command: >
      /bin/sh -c "
        $${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.master.Master \
        --port $${SPARK_MASTER_PORT} \
        --webui-port $${SPARK_MASTER_WEBUI_PORT}"
    ports:
      - ${SPARK_MASTER_PORT:-7077}:7077
      - ${SPARK_MASTER_WEBUI_PORT:-4040}:4040
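    # With the defaults above, the master web UI is reachable at http://localhost:4040 (override with
    # SPARK_MASTER_WEBUI_PORT); workers and spark-submit connect via spark://spark-master:7077.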
    volumes:
      - type: bind
        source: .
        target: /project
        read_only: false

  spark-worker:
    profiles:
      - spark  # must pass --profile spark to docker-compose for this to come up
      - test
    image: spark-base  # when an image by this name is not found in the local repo, and it is forced to build, it will use this as the tag
    # build context path needs to be relative to project root, from where docker-compose will be run
    build:
      context: .
      dockerfile: Dockerfile.spark
      args:
        PROJECT_LOG_DIR: ${PROJECT_LOG_DIR}
    container_name: spark-worker
    depends_on:
      - spark-master
    environment:
      SPARK_MASTER_HOST: ${SPARK_MASTER_HOST:-spark-master}
      SPARK_MASTER_PORT: ${SPARK_MASTER_PORT:-7077}
      SPARK_WORKER_WEBUI_PORT: ${SPARK_WORKER_WEBUI_PORT:-4041}
    command: /bin/sh -c "$${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.worker.Worker --webui-port $${SPARK_WORKER_WEBUI_PORT} spark://$${SPARK_MASTER_HOST}:$${SPARK_MASTER_PORT}"
    ports:
      - ${SPARK_WORKER_WEBUI_PORT:-4041}:4041
    volumes:
      - type: bind
        source: .
        target: /project
        read_only: false

  spark-history-server:
    profiles:
      - spark  # must pass --profile spark to docker-compose for this to come up
      - test
    image: spark-base  # when an image by this name is not found in the local repo, and it is forced to build, it will use this as the tag
    # build context path needs to be relative to project root, from where docker-compose will be run
    build:
      context: .
      dockerfile: Dockerfile.spark
      args:
        PROJECT_LOG_DIR: ${PROJECT_LOG_DIR}
    container_name: spark-history-server
    environment:
      SPARK_HISTORY_SERVER_PORT: ${SPARK_HISTORY_SERVER_PORT:-18080}
    command: /bin/sh -c "$${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.history.HistoryServer"
    ports:
      - ${SPARK_HISTORY_SERVER_PORT:-18080}:18080
    volumes:
      - type: bind
        source: .
        target: /project
        read_only: false

  # Example of running spark-submit container:
  #  NOTE: double check package dependency versions here with those used in unit tests (conftest_spark.py), as these docs could have gotten stale
  #  (1) Review config values in usaspending_api/config/envs/local.py and override any as needed in a .env file or -e environment variable
  #  (2) Deploy minio in docker container (see README.md)
  #  (3) Deploy the postgres DB docker container, if your script connects to a DB.
  #      - If so, also export a JDBC_URL environment variable to pass in to the spark-submit container so it can find its connection
  #  (4) If reading or writing to S3, make sure the bucket given by the value of config setting CONFIG.AWS_S3_BUCKET exists
  #      - e.g. create via UI at http://localhost:10001
  #      - or use MinIO client CLI: mc mb local/data
  #  (5) Run the spark-submit container, citing the dependent packages:
  #    (NOTEs:
  #      - postgresql is needed as a JDBC driver, if connecting to a Postgres DB
  #      - delta-core is needed to read/write in Delta Lake format
  #      - hadoop-aws is needed for the S3AFileSystem.java, used to write data to S3,
  #        - and should match the hadoop version in your local setup
  #        - NOTE that specifying hadoop-aws should pull in the required version of the aws-java-sdk on its own
  #      - spark-hive is needed to use a hive metastore_db of schemas and tables
  #        - the Docker image at this time only installs spark and hadoop standalone, which does not seem to include all the needed Hive jars)
  #
  #    make docker-compose-run profiles="--profile spark" args="--rm -e MINIO_HOST=minio -e JDBC_URL -e COMPONENT_NAME='My Spark Prototype Script' spark-submit \
  #      --packages org.postgresql:postgresql:42.2.23,io.delta:delta-core_2.12:1.2.1,org.apache.hadoop:hadoop-aws:3.3.1,org.apache.spark:spark-hive_2.12:3.2.1 \
  #      /project/usaspending_api/etl/tests/path_to_your_spark_prototype_script.py"
  spark-submit:
    profiles:
      - spark  # must pass --profile spark to docker-compose for this to come up
      - test
    image: spark-base  # when an image by this name is not found in the local repo, and it is forced to build, it will use this as the tag
    # build context path needs to be relative to project root, from where docker-compose will be run
    build:
      context: .
      dockerfile: Dockerfile.spark
      args:
        PROJECT_LOG_DIR: ${PROJECT_LOG_DIR}
    container_name: spark-submit
    depends_on:
      - spark-master
      - spark-worker
      - spark-history-server
      - minio
    environment:
      SPARK_MASTER_HOST: ${SPARK_MASTER_HOST:-spark-master}
      SPARK_MASTER_PORT: ${SPARK_MASTER_PORT:-7077}
      # i.e. the target path where the host's warehouse dir is bound in the volume config below.
      #   This env var needs to be picked up as the config for the spark.sql.warehouse.dir spark conf setting when SparkSessions are created inside of a spark-submitted job
      SPARK_SQL_WAREHOUSE_DIR: /spark-warehouse
      # i.e. a metastore_db subdirectory of the target path where the host's warehouse dir is bound in the volume config below.
      #   This env var needs to be picked up as the path part of the config for the spark.hadoop.javax.jdo.option.ConnectionURL spark conf setting when SparkSessions are created inside of a spark-submitted job
      HIVE_METASTORE_DERBY_DB_DIR: /spark-warehouse/metastore_db
      PYTHONPATH: "/project"
    # NOTE: the entrypoint cannot interpolate env vars at container run time; they are passed through literally.
    # By using a single $ rather than $$, each var is evaluated from the current shell environment when
    # docker-compose is run, i.e. interpolated before it is used as the entrypoint.
    # While this service also sets values for these vars in its environment: element, those values are not used
    # here; they are merely passed into the container. KEEP the two references to these vars and their defaults consistent!
    # To see the resolved value, run docker-compose config (i.e. make docker-compose-config in this project's Makefile)
    entrypoint: ./bin/spark-submit --master spark://${SPARK_MASTER_HOST:-spark-master}:${SPARK_MASTER_PORT:-7077}
    command: --help
    volumes:
      - type: bind
        source: .
        target: /project
        read_only: false
      # NOTE: The hive metastore_db Derby database folder is expected to be configured to show up as a subfolder of the spark-warehouse dir
      - type: bind
        source: ${SPARK_SQL_WAREHOUSE_DIR:-./spark-warehouse}
        target: /spark-warehouse
      # Mount the JAR dependencies local repo on host into container to take advantage of caching/reuse
      # i.e., to download the dependencies only once and reuse on subsequent docker-compose run calls
      - type: bind
        source: ${HOME}/.ivy2
        target: /root/.ivy2
        read_only: false