Makefile
#### DEFAULTS ##########################################################################################################
#### Boilerplate Makefile setup
MAKEFLAGS += --warn-undefined-variables
SHELL := bash
.SHELLFLAGS := -ec -o pipefail
.DEFAULT_GOAL := help
.DELETE_ON_ERROR:
.SUFFIXES:
#### INCLUDES ##########################################################################################################
#### Includes of other Makefiles, or files that declare environment variables (e.g. .env)
# include .env _should_ allow vars to be defined here
# However, be careful of vars referencing other vars, or values containing $ (e.g. passwords), which would need escaping (see the illustrative example below the include)
env_file_exists := $(wildcard .env)
ifneq ($(strip ${env_file_exists}),)
include .env
endif
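# Example of what a .env consumed by the include above might look like (illustrative only; values are
# placeholders, and DB_PASSWORD is a hypothetical variable -- ENV_CODE and MINIO_PORT are referenced later in this Makefile):
#   ENV_CODE=lcl
#   MINIO_PORT=9000
#   DB_PASSWORD=pa$$word   # a literal $ must be written as $$ or make will try to expand it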
#### VARS ##############################################################################################################
#### Variables used in this Makefile.
#### Uppercased are environment vars, or make-specific vars. All others should be lower-snake-case
# default ENV_CODE to lcl if not set
ENV_CODE ?= lcl
# default version if not set in .env or an env var
PYTHON_VERSION ?= 3.8.16
venv_name := usaspending-api
docker_compose_file := docker-compose.yml
dockerfile_for_spark := Dockerfile.spark
# Root directories under which python (namespace) packages start, for all python code in this project
src_root_paths = "."
#### RULES #############################################################################################################
#### Rules defining file targets that need to be made, or PHONY targets, which don't actually produce a file
#### Reminder: The name of non-PHONY targets needs to be the name of a file on disk, or it will always be invoked
#### NOTE: Most rules here deal with project-setup
#### Rules orchestrating project workloads are in the included Makefile
.PHONY: help
help: ## print this help
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' Makefile | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
.PHONY: printvars
printvars: ## Print the Environment variables present in calls to make, plus variables defined in the executed Makefile
@$(info ==== Makefile Variables ====)
@$(info )
@$(foreach V,$(sort $(.VARIABLES)), \
$(if $(filter-out environment% default automatic, \
$(origin $V)),$(info $V=$($V) ($(value $V)))))
@printf "\n==== Environment Variables ====\n\n"
@printenv
.python-version: ## Attempt to setup python using pyenv
@if ! command -v pyenv &> /dev/null; then \
echo "WARNING: pyenv could not be found. Install pyenv to get a virtual env running with the compatible python version: ${PYTHON_VERSION}. Will fallback to using system python3."; \
else \
set -x; \
echo "pyenv setting python version to ${PYTHON_VERSION}"; \
pyenv install -s ${PYTHON_VERSION}; \
pyenv local ${PYTHON_VERSION}; \
python3 -V; \
if [ "$$(python3 -V)" != "Python ${PYTHON_VERSION}" ]; then \
echo "ERROR: pyenv was not able to set local python version to ${PYTHON_VERSION}"; \
exit 1; \
fi; \
fi;
.venv: ## Ensure a virtual environment is established at .venv
@( \
set -x; \
test -d .venv || python3 -m venv .venv/${venv_name}; \
)
.PHONY: requirements-dev
upgrade := #unset it
requirements-dev: .venv ## Install pip packages in dev virtual env. Add upgrade=true to upgrade required packages to newest version (can be lengthy)
# Because this depends on .venv, the virtual env should exist to activate
# This will "activate" the virtual env only for the duration of the scripts in the parens-scope
# Then when this make rule recipe is complete, the virtual env will be dropped
# But it IS there and populated
# Must _manually_ reactivate the virtual env to interact with it on the command line
@( \
source .venv/${venv_name}/bin/activate; \
echo "virtual env at .venv/${venv_name} activated (temporarily)"; \
pip install $$(cat requirements/requirements-dev.txt | grep 'pip=='); \
src_roots=(${src_root_paths}); \
for src_root in "$${src_roots[@]}"; do \
pip install ${if ${upgrade},--upgrade,} --editable "$${src_root}[dev]"; \
done; \
)
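# The virtual env is populated but deactivated once the recipe above finishes; to keep using it from your
# shell, reactivate it manually, e.g. (the path follows the venv_name variable defined above):
#   source .venv/usaspending-api/bin/activate
# or, equivalently, use the activate rule below: source $(make activate)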
.ivy2: ## Ensure user has a ~/.ivy2 dir, which will be bind-mounted into docker containers as a volume to save on dependency downloads
@mkdir -p ~/.ivy2
.PHONY: activate
activate: ## Spit out the command to run to activate the virtual env, since you can't do it within a make shell process. Use this like: source $(make activate)
@echo ".venv/${venv_name}/bin/activate"
.PHONY: local-dev-setup
local-dev-setup: .python-version requirements-dev check-dependencies .ivy2 ## Setup python, virtual environment, and pip dependencies, then check version info
.PHONY: check-dependencies
check-dependencies: ## Prints out the versions of dependencies in use
@printf "\n==== [PYTHON VERSIONS] ====\n\n"
@echo "python -> $$(python -V) ... python3 -> $$(python3 -V)"
@printf "\n==== [PIP PACKAGE VERSIONS] ====\n\n"
@source .venv/${venv_name}/bin/activate && pip list
@printf "\n==== [SPARK VERSION] ====\n\n"
@source .venv/${venv_name}/bin/activate && pyspark --version
@printf "\n==== [HADOOP VERSION] ====\n\n"
@source .venv/${venv_name}/bin/activate && python3 -c "from pyspark.sql import SparkSession; \
spark = SparkSession.builder.getOrCreate(); \
print('Hadoop ' + spark.sparkContext._gateway.jvm.org.apache.hadoop.util.VersionInfo.getVersion());"
.PHONY: env-code
env-code: ## Print the value of ENV_CODE environment variable
@echo ${ENV_CODE}
.PHONY: test-dbs
createdb := #unset it
test-dbs: ## Trigger the setup of multiple test DBs that can be reused with pytest --numprocesses. Add createdb=true to force (re-)creation of Test DBs rather than reuse.
pytest ${if ${createdb},--create-db,} --reuse-db --numprocesses=auto --no-cov --disable-warnings -rP -vvv --capture=no --log-cli-level=WARNING --show-capture=log 2> /dev/null 'usaspending_api/tests/integration/test_setup_of_test_dbs.py::test_trigger_test_db_setup'
.PHONY: test-spark-deps
test-spark-deps: ## Trigger a singular test in one pytest session that does nothing but cause Maven dependencies to be downloaded and cached through Ivy; reduces contention when parallel spark builds need the dependencies
pytest --no-cov --disable-warnings -r=fEs --verbosity=3 'usaspending_api/tests/integration/test_setup_of_spark_dependencies.py::test_preload_spark_jars'
.PHONY: tests
tests: local-dev-setup test-dbs test-spark-deps ## Run automated unit/integration tests. Configured for useful logging. Add args="..." to append additional pytest args
pytest --failed-first --reuse-db --numprocesses=auto --dist=worksteal -rP -vv --capture=no --show-capture=log 2> /dev/null ${args}
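# Illustrative usage of the args pass-through (-x and -k are standard pytest options; the keyword expression is a placeholder):
#   make tests args="-x -k 'awards'"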
.PHONY: tests-failed
tests-failed: local-dev-setup test-dbs test-spark-deps ## Re-run only automated unit/integration tests that failed on the previous run. Configured for verbose logging to get more detail on failures. Add args="..." to append additional pytest args
pytest --last-failed --reuse-db --numprocesses=auto --dist=worksteal -rP -vvv ${args}
.PHONY: confirm-clean-all
no-prompt := 'false'
dry-run := 'false'
confirm-clean-all: ## Guard to prompt for confirmation before aggressive clean
ifeq ($(strip ${no-prompt}),'false')
ifeq ($(strip ${dry-run}),'false')
@echo -n "This will remove any untracked/uncommitted source files or files in the working directory. Consider backing up any files in your custom setup. To see what files would be removed, re-run with dry-run=true. Continue? [y/N] " && read ans && [ $${ans:-N} = y ]
endif
endif
.PHONY: clean-all
dry-run := 'false'
clean-all: confirm-clean-all ## Remove all tmp artifacts and artifacts created as part of local dev env setup. To avoid prompt (e.g. in script) call like: make clean-all no-prompt=true. To only see what WOULD be deleted, include dry-run=true
ifeq ($(strip ${dry-run}),'false')
rm -f .python-version
rm -rf .venv
@git clean -xfd --exclude='\.env' --exclude='\.envrc' --exclude='\.idea/' --exclude='spark-warehouse/'
deactivate || true
#if command -v deactivate &> /dev/null; then deactivate; fi;
else # this is a dry-run, spit out what would be removed
@printf "Would remove .python-version\nWould remove .venv\n"
@git clean --dry-run -xfd --exclude='\.env' --exclude='\.envrc' --exclude='\.idea/' --exclude='spark-warehouse/'
endif
.PHONY: docker-compose
docker-compose: ## Run an arbitrary docker-compose command by passing in the Docker Compose profiles in the "profiles" variable, and args in the "args" variable
# NOTE: The .env file is used to provide environment variable values that replace variables in the compose file
# Because the .env file does not live in the same place as the compose file, we have to tell compose explicitly
# where it is with "--project-directory". Since this is called from the root Makefile, using ./ points to the dir
# of that Makefile
docker-compose ${profiles} --project-directory . --file ${docker_compose_file} ${args}
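# Illustrative usage of the profiles and args pass-throughs (the spark profile is used by other rules below;
# any service name you pass must exist in docker-compose.yml):
#   make docker-compose profiles="--profile spark" args="ps"
#   make docker-compose args="logs --tail=50 some-service"   # "some-service" is a placeholder service name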
.PHONY: docker-compose-config
docker-compose-config: ## Show the docker-compose config with vars expanded, as it will be used by docker-compose
# NOTE: The .env file is used to provide environment variable values that replace variables in the compose file
# Because the .env file does not live in the same place as the compose file, we have to tell compose explicitly
# where it is with "--project-directory". Since this is called from the root Makefile, using ./ points to the dir
# of that Makefile
docker-compose --project-directory . --file ${docker_compose_file} config ${args}
.PHONY: docker-compose-up-usaspending
docker-compose-up-usaspending: ## Deploy containerized version of this app on the local machine using docker-compose
# To 'up' a single docker-compose service, pass it in the args var, e.g.: make docker-compose-up-usaspending args=my-service
# NOTE: [See NOTE in docker-compose rule about .env file]
docker-compose --profile usaspending --project-directory . --file ${docker_compose_file} up ${args}
.PHONY: docker-compose-up-s3
docker-compose-up-s3: ## Deploy minio container on the local machine using docker-compose, which acts as a look-alike AWS S3 service
# NOTE: [See NOTE in docker-compose rule about .env file]
echo "docker-compose --profile s3 --project-directory . --file ${docker_compose_file} up ${args}"
docker-compose --profile s3 --project-directory . --file ${docker_compose_file} up ${args}
.PHONY: docker-compose-up-spark
docker-compose-up-spark: ## Deploy containerized version of spark cluster infrastructure on the local machine using docker-compose
# NOTE: [See NOTE in docker-compose rule about .env file]
docker-compose --profile spark --project-directory . --file ${docker_compose_file} up ${args}
.PHONY: docker-compose-run
docker-compose-run: ## Use docker-compose run <args> to run one or more Docker Compose services with options
# NOTE: [See NOTE in docker-compose rule about .env file]
docker-compose ${profiles} --project-directory . --file ${docker_compose_file} run ${args}
.PHONY: docker-compose-down
docker-compose-down: ## Run docker-compose down to bring down services listed in the compose file
# NOTE: [See NOTE in docker-compose rule about .env file]
docker-compose --project-directory . --file ${docker_compose_file} down ${args}
.PHONY: docker-build-spark
docker-build-spark: ## Run docker build to build a base container image with spark, hadoop, and python installed
# NOTE: [See NOTE in above docker-compose rule about .env file]
echo "docker build --tag spark-base --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args} --file ${dockerfile_for_spark} $$(dirname ${dockerfile_for_spark})"
docker build --tag spark-base --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args} --file ${dockerfile_for_spark} $$(dirname ${dockerfile_for_spark})
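# Illustrative: extra docker build flags can be passed through args, e.g. to force a clean rebuild:
#   make docker-build-spark args="--no-cache"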
.PHONY: docker-compose-build
docker-compose-build: ## Ensure ALL services in the docker-compose.yml file have an image built for them according to their build: key
# NOTE: This *may* create a compose-specific image name IF an image: YAML key does not specify the image name to be used as
# a tag when compose has to build the image.
# If no image: key is specified, then be aware that:
# While the spark-base image can be built and tagged separately, docker-compose will _NOT USE_ that image at runtime,
# but will look for an image with its own custom tag. It may use cached layers of the spark-base image when doing its build,
# but it will create a _differently named_ image: the image name is always going to be <project>_<service>,
# where project defaults to the name of the directory you're in. Therefore you MUST always run this command (or the manual version of it)
# anytime you want services run with Docker Compose to pick up recent changes to the image (e.g. python package dependency changes)
# NOTE: [See NOTE in above docker-compose rule about .env file]
echo "docker-compose --profile usaspending --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}"
docker-compose --profile usaspending --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}
.PHONY: docker-compose-build-spark
docker-compose-build-spark: ## See: docker-compose-build rule. This builds just the subset of spark services.
# NOTE: [See NOTE in above docker-compose rule about .env file]
echo "docker-compose --profile spark --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}"
docker-compose --profile spark --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}
.PHONY: docker-compose-spark-submit
docker-compose-spark-submit: ## Run spark-submit from within local docker containerized infrastructure (which must be running first). Set params with django_command="..."
docker-compose --profile=spark --project-directory . --file ${docker_compose_file} run \
-e MINIO_HOST=minio \
-e COMPONENT_NAME='${django_command}${python_script}' \
-e DATABASE_URL=${DATABASE_URL} \
spark-submit \
--driver-memory "2g" \
--packages org.postgresql:postgresql:42.2.23,io.delta:delta-core_2.12:1.2.1,org.apache.hadoop:hadoop-aws:3.3.1,org.apache.spark:spark-hive_2.12:3.2.1 \
${if ${python_script}, \
${python_script}, \
/project/manage.py ${django_command} \
}
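# Illustrative invocations (the Django command and script path below are placeholders):
#   make docker-compose-spark-submit django_command="help"
#   make docker-compose-spark-submit python_script="/project/my_job.py"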
.PHONY: localhost-spark-submit
localhost-spark-submit: ## Run spark-submit with localhost as the driver and worker (single node). Set params with django_command="..."
SPARK_LOCAL_IP=127.0.0.1 \
spark-submit \
--driver-memory "2g" \
--packages org.postgresql:postgresql:42.2.23,io.delta:delta-core_2.12:1.2.1,org.apache.hadoop:hadoop-aws:3.3.1,org.apache.spark:spark-hive_2.12:3.2.1 \
${if ${python_script}, \
${python_script}, \
manage.py ${django_command} \
}
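# Illustrative (same pass-through pattern as docker-compose-spark-submit, but run directly on the host):
#   make localhost-spark-submit django_command="help"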
.PHONY: pyspark-shell
pyspark-shell: ## Launch a local pyspark REPL shell with all of the packages and spark config pre-set
SPARK_LOCAL_IP=127.0.0.1 pyspark \
--packages org.postgresql:postgresql:42.2.23,io.delta:delta-core_2.12:1.2.1,org.apache.hadoop:hadoop-aws:3.3.1 \
--conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension \
--conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog \
--conf spark.hadoop.fs.s3a.endpoint=localhost:${MINIO_PORT} \
--conf spark.hadoop.fs.s3a.access.key=usaspending \
--conf spark.hadoop.fs.s3a.secret.key=usaspender \
--conf spark.hadoop.fs.s3a.connection.ssl.enabled=false \
--conf spark.hadoop.fs.s3a.path.style.access=true \
--conf spark.sql.catalogImplementation=hive \
--conf spark.sql.warehouse.dir='$(PWD)/spark-warehouse' \
--conf spark.hadoop.javax.jdo.option.ConnectionURL='jdbc:derby:;databaseName=$(PWD)/spark-warehouse/metastore_db;create=true'
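# Once the shell is up, a quick (illustrative) smoke test of the Hive/Delta config set above:
#   >>> spark.sql("SHOW DATABASES").show()
#   >>> spark.range(5).write.format("delta").mode("overwrite").save("/tmp/delta_smoke_test")   # throwaway path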