src/tests/architect_tests/test_feature_generators.py

import copy
from datetime import date
from unittest.mock import patch

import pandas as pd
import pytest
import sqlalchemy
from sqlalchemy import text as t

from triage.component.architect.feature_generators import FeatureGenerator
from triage.component.collate import Aggregate, Categorical, SpacetimeAggregation


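# Shared fixture data: a small "data" events table and a "states" cohort
# table (entity/date pairs) that the tests below build features against.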
INPUT_DATA = [
    # entity_id, knowledge_date, zip_code, cat_one, quantity_one
    (1, date(2014, 1, 1), "60120", "good", 10000),
    (1, date(2014, 10, 11), "60120", "good", None),
    (3, date(2012, 6, 8), "60653", "bad", 342),
    (3, date(2014, 12, 21), "60653", "inbetween", 600),
    (4, date(2014, 4, 4), "60653", "bad", 1236),
]

INPUT_STATES = [
    # entity_id, as_of_date
    (1, date(2013, 9, 30)),
    (1, date(2014, 9, 30)),
    (1, date(2015, 1, 1)),
    (3, date(2013, 9, 30)),
    (3, date(2014, 9, 30)),
    (3, date(2015, 1, 1)),
    (4, date(2014, 9, 30)),
    (4, date(2015, 1, 1)),
]


@pytest.fixture(name='test_engine', scope='function')
def fixture_test_engine(db_engine):
    """Local extension to the shared db_engine fixture to set up test
    database tables.

    """
    db_engine.execute(
        """\
        create table data (
            entity_id int,
            knowledge_date date,
            zip_code text,
            cat_one varchar,
            quantity_one float
        )
        """
    )
    for row in INPUT_DATA:
        db_engine.execute("insert into data values (%s, %s, %s, %s, %s)", row)

    db_engine.execute(
        """\
        create table states (
            entity_id int,
            as_of_date date
        )
        """
    )
    for row in INPUT_STATES:
        db_engine.execute("insert into states values (%s, %s)", row)

    return db_engine


def test_feature_generation(test_engine):
    aggregate_config = [
        {
            "prefix": "aprefix",
            "aggregates": [
                {
                    "quantity": "quantity_one",
                    "metrics": ["sum", "count"],
                    "imputation": {
                        "sum": {"type": "constant", "value": 137},
                        "count": {"type": "zero"},
                    },
                }
            ],
            "categoricals_imputation": {"all": {"type": "null_category"}},
            "categoricals": [
                {"column": "cat_one", "choices": ["good", "bad"], "metrics": ["sum"]}
            ],
            "intervals": ["all"],
            "knowledge_date_column": "knowledge_date",
            "from_obj": "data",
        }
    ]

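    # Output columns follow the collate naming convention
    # {prefix}_{group}_{interval}_{column}_{metric}; the extra "_imp" column
    # flags rows whose aggregate value had to be imputed.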
    expected_output = {
        "aprefix_aggregation_imputed": [
            {
                "entity_id": 1,
                "as_of_date": date(2013, 9, 30),
                "aprefix_entity_id_all_quantity_one_sum": 137,
                "aprefix_entity_id_all_quantity_one_count": 0,
                "aprefix_entity_id_all_cat_one_good_sum": 0,
                "aprefix_entity_id_all_cat_one_bad_sum": 0,
                "aprefix_entity_id_all_cat_one__NULL_sum": 1,
                "aprefix_entity_id_all_quantity_one_imp": 1,
            },
            {
                "entity_id": 1,
                "as_of_date": date(2014, 9, 30),
                "aprefix_entity_id_all_quantity_one_sum": 10000,
                "aprefix_entity_id_all_quantity_one_count": 1,
                "aprefix_entity_id_all_cat_one_good_sum": 1,
                "aprefix_entity_id_all_cat_one_bad_sum": 0,
                "aprefix_entity_id_all_cat_one__NULL_sum": 0,
                "aprefix_entity_id_all_quantity_one_imp": 0,
            },
            {
                "entity_id": 3,
                "as_of_date": date(2013, 9, 30),
                "aprefix_entity_id_all_quantity_one_sum": 342,
                "aprefix_entity_id_all_quantity_one_count": 1,
                "aprefix_entity_id_all_cat_one_good_sum": 0,
                "aprefix_entity_id_all_cat_one_bad_sum": 1,
                "aprefix_entity_id_all_cat_one__NULL_sum": 0,
                "aprefix_entity_id_all_quantity_one_imp": 0,
            },
            {
                "entity_id": 3,
                "as_of_date": date(2014, 9, 30),
                "aprefix_entity_id_all_quantity_one_sum": 342,
                "aprefix_entity_id_all_quantity_one_count": 1,
                "aprefix_entity_id_all_cat_one_good_sum": 0,
                "aprefix_entity_id_all_cat_one_bad_sum": 1,
                "aprefix_entity_id_all_cat_one__NULL_sum": 0,
                "aprefix_entity_id_all_quantity_one_imp": 0,
            },
            {
                "entity_id": 4,
                "as_of_date": date(2014, 9, 30),
                "aprefix_entity_id_all_quantity_one_sum": 1236,
                "aprefix_entity_id_all_quantity_one_count": 1,
                "aprefix_entity_id_all_cat_one_good_sum": 0,
                "aprefix_entity_id_all_cat_one_bad_sum": 1,
                "aprefix_entity_id_all_cat_one__NULL_sum": 0,
                "aprefix_entity_id_all_quantity_one_imp": 0,
            },
        ]
    }

    features_schema_name = "features"

    output_tables = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    ).create_all_tables(
        feature_dates=["2013-09-30", "2014-09-30"],
        feature_aggregation_config=aggregate_config,
        state_table="states",
    )

    for output_table in output_tables:
        records = pd.read_sql(
            "select * from {}.{} order by entity_id, as_of_date".format(
                features_schema_name,
                output_table,
            ),
            test_engine,
        ).to_dict("records")

        # zip() would silently truncate on a row-count mismatch, so compare
        # the full record lists directly, as the other tests here do.
        assert records == expected_output[output_table]


def test_index_column_lookup(test_engine):
    aggregations = [
        SpacetimeAggregation(
            prefix="prefix1",
            aggregates=[
                Categorical(
                    col="cat_one",
                    function="sum",
                    choices=["good", "bad", "inbetween"],
                    impute_rules={"coltype": "categorical", "all": {"type": "zero"}},
                )
            ],
            groups=["entity_id"],
            intervals=["all"],
            date_column="knowledge_date",
            output_date_column="as_of_date",
            dates=["2013-09-30", "2014-09-30"],
            state_table="states",
            state_group="entity_id",
            schema="features",
            from_obj="data",
        ),
        SpacetimeAggregation(
            prefix="prefix2",
            aggregates=[
                Aggregate(
                    quantity="quantity_one",
                    function="count",
                    impute_rules={"coltype": "aggregate", "all": {"type": "zero"}},
                )
            ],
            groups=["entity_id"],
            intervals=["all"],
            date_column="knowledge_date",
            output_date_column="as_of_date",
            dates=["2013-09-30", "2014-09-30"],
            state_table="states",
            state_group="entity_id",
            schema="features",
            from_obj="data",
        ),
    ]

    features_schema_name = "features"
    feature_generator = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    )
    lookup = feature_generator.index_column_lookup(aggregations)
    assert lookup == {
        "prefix1_aggregation_imputed": ["as_of_date", "entity_id"],
        "prefix2_aggregation_imputed": ["as_of_date", "entity_id"],
    }


def test_feature_generation_feature_start_time(test_engine):
    aggregate_config = [
        {
            "prefix": "aprefix",
            "aggregates_imputation": {"all": {"type": "constant", "value": 7}},
            "aggregates": [{"quantity": "quantity_one", "metrics": ["sum"]}],
            "intervals": ["all"],
            "knowledge_date_column": "knowledge_date",
            "from_obj": "data",
        }
    ]

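    # With feature_start_time="2013-01-01" (set below), entity 3's 2012-06-08
    # event (quantity 342) falls outside the feature window, so only its
    # 2014-12-21 value (600) contributes to the "all" interval sum.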
    expected_output = {
        "aprefix_aggregation_imputed": [
            {
                "entity_id": 1,
                "as_of_date": date(2015, 1, 1),
                "aprefix_entity_id_all_quantity_one_sum": 10000,
            },
            {
                "entity_id": 3,
                "as_of_date": date(2015, 1, 1),
                "aprefix_entity_id_all_quantity_one_sum": 600,
            },
            {
                "entity_id": 4,
                "as_of_date": date(2015, 1, 1),
                "aprefix_entity_id_all_quantity_one_sum": 1236,
            },
        ]
    }

    features_schema_name = "features"
    output_tables = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
        feature_start_time="2013-01-01",
    ).create_all_tables(
        feature_dates=["2015-01-01"],
        feature_aggregation_config=aggregate_config,
        state_table="states",
    )

    for output_table in output_tables:
        records = pd.read_sql(
            "select * from {}.{} order by as_of_date, entity_id".format(
                features_schema_name,
                output_table,
            ),
            test_engine,
        ).to_dict("records")

        assert records == expected_output[output_table]


def test_dynamic_categoricals(test_engine):
    aggregate_config = [
        {
            "prefix": "aprefix",
            "categoricals": [
                {
                    "column": "cat_one",
                    "choice_query": "select distinct(cat_one) from data",
                    "metrics": ["sum"],
                    "imputation": {"all": {"type": "null_category"}},
                }
            ],
            "intervals": ["all"],
            "knowledge_date_column": "knowledge_date",
            "from_obj": "data",
        }
    ]
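    # choice_query derives the category choices at build time, so the
    # "inbetween" value present in the data gets its own column even though
    # no explicit choices list was configured.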
    expected_output = {
        "aprefix_aggregation_imputed": [
            {
                "entity_id": 1,
                "as_of_date": date(2013, 9, 30),
                "aprefix_entity_id_all_cat_one_good_sum": 0,
                "aprefix_entity_id_all_cat_one_inbetween_sum": 0,
                "aprefix_entity_id_all_cat_one_bad_sum": 0,
                "aprefix_entity_id_all_cat_one__NULL_sum": 1,
            },
            {
                "entity_id": 3,
                "as_of_date": date(2013, 9, 30),
                "aprefix_entity_id_all_cat_one_good_sum": 0,
                "aprefix_entity_id_all_cat_one_inbetween_sum": 0,
                "aprefix_entity_id_all_cat_one_bad_sum": 1,
                "aprefix_entity_id_all_cat_one__NULL_sum": 0,
            },
            {
                "entity_id": 1,
                "as_of_date": date(2014, 9, 30),
                "aprefix_entity_id_all_cat_one_good_sum": 1,
                "aprefix_entity_id_all_cat_one_inbetween_sum": 0,
                "aprefix_entity_id_all_cat_one_bad_sum": 0,
                "aprefix_entity_id_all_cat_one__NULL_sum": 0,
            },
            {
                "entity_id": 3,
                "as_of_date": date(2014, 9, 30),
                "aprefix_entity_id_all_cat_one_good_sum": 0,
                "aprefix_entity_id_all_cat_one_inbetween_sum": 0,
                "aprefix_entity_id_all_cat_one_bad_sum": 1,
                "aprefix_entity_id_all_cat_one__NULL_sum": 0,
            },
            {
                "entity_id": 4,
                "as_of_date": date(2014, 9, 30),
                "aprefix_entity_id_all_cat_one_good_sum": 0,
                "aprefix_entity_id_all_cat_one_inbetween_sum": 0,
                "aprefix_entity_id_all_cat_one_bad_sum": 1,
                "aprefix_entity_id_all_cat_one__NULL_sum": 0,
            },
        ]
    }

    features_schema_name = "features"

    output_tables = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    ).create_all_tables(
        feature_dates=["2013-09-30", "2014-09-30"],
        feature_aggregation_config=aggregate_config,
        state_table="states",
    )

    for output_table in output_tables:
        records = pd.read_sql(
            "select * from {}.{} order by as_of_date, entity_id".format(
                features_schema_name, output_table
            ),
            test_engine,
        ).to_dict("records")

        assert records == expected_output[output_table]


def test_array_categoricals(db_engine):
    aggregate_config = [
        {
            "prefix": "aprefix",
            "array_categoricals": [
                {
                    "column": "cat_one",
                    "choices": ["good", "bad", "inbetween"],
                    "metrics": ["sum"],
                    "imputation": {"all": {"type": "null_category"}},
                }
            ],
            "intervals": ["all"],
            "knowledge_date_column": "knowledge_date",
            "from_obj": "data",
        }
    ]
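    # Expectations match the scalar categorical test: per the values below,
    # each row counts at most once per choice, so entity 1's ["good", "good"]
    # array still yields a good_sum of 1.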
    expected_output = {
        "aprefix_aggregation_imputed": [
            {
                "entity_id": 1,
                "as_of_date": date(2013, 9, 30),
                "aprefix_entity_id_all_cat_one_good_sum": 0,
                "aprefix_entity_id_all_cat_one_inbetween_sum": 0,
                "aprefix_entity_id_all_cat_one_bad_sum": 0,
                "aprefix_entity_id_all_cat_one__NULL_sum": 1,
            },
            {
                "entity_id": 3,
                "as_of_date": date(2013, 9, 30),
                "aprefix_entity_id_all_cat_one_good_sum": 0,
                "aprefix_entity_id_all_cat_one_inbetween_sum": 0,
                "aprefix_entity_id_all_cat_one_bad_sum": 1,
                "aprefix_entity_id_all_cat_one__NULL_sum": 0,
            },
            {
                "entity_id": 1,
                "as_of_date": date(2014, 9, 30),
                "aprefix_entity_id_all_cat_one_good_sum": 1,
                "aprefix_entity_id_all_cat_one_inbetween_sum": 0,
                "aprefix_entity_id_all_cat_one_bad_sum": 0,
                "aprefix_entity_id_all_cat_one__NULL_sum": 0,
            },
            {
                "entity_id": 3,
                "as_of_date": date(2014, 9, 30),
                "aprefix_entity_id_all_cat_one_good_sum": 0,
                "aprefix_entity_id_all_cat_one_inbetween_sum": 0,
                "aprefix_entity_id_all_cat_one_bad_sum": 1,
                "aprefix_entity_id_all_cat_one__NULL_sum": 0,
            },
            {
                "entity_id": 4,
                "as_of_date": date(2014, 9, 30),
                "aprefix_entity_id_all_cat_one_good_sum": 0,
                "aprefix_entity_id_all_cat_one_inbetween_sum": 0,
                "aprefix_entity_id_all_cat_one_bad_sum": 1,
                "aprefix_entity_id_all_cat_one__NULL_sum": 0,
            },
        ]
    }

    input_data = [
        # entity_id, knowledge_date, cat_one, quantity_one
        (1, date(2014, 1, 1), ["good", "good"], 10000),
        (1, date(2014, 10, 11), ["good"], None),
        (3, date(2012, 6, 8), ["bad"], 342),
        (3, date(2014, 12, 21), ["inbetween"], 600),
        (4, date(2014, 4, 4), ["bad"], 1236),
    ]

    db_engine.execute(
        """\
        create table data (
            entity_id int,
            knowledge_date date,
            cat_one varchar[],
            quantity_one float
        )
        """
    )
    for row in input_data:
        db_engine.execute("insert into data values (%s, %s, %s, %s)", row)

    db_engine.execute(
        """\
        create table states (
            entity_id int,
            as_of_date date
        )
        """
    )
    for row in INPUT_STATES:
        db_engine.execute("insert into states values (%s, %s)", row)

    features_schema_name = "features"

    output_tables = FeatureGenerator(
        db_engine=db_engine,
        features_schema_name=features_schema_name,
    ).create_all_tables(
        feature_dates=["2013-09-30", "2014-09-30"],
        feature_aggregation_config=aggregate_config,
        state_table="states",
    )

    for output_table in output_tables:
        records = pd.read_sql(
            "select * from {}.{} order by as_of_date, entity_id".format(
                features_schema_name, output_table
            ),
            db_engine,
        ).to_dict("records")

        assert records == expected_output[output_table]


def test_generate_table_tasks(test_engine):
    test_engine.execute('create schema features')
    aggregations = [
        SpacetimeAggregation(
            prefix="prefix1",
            aggregates=[
                Categorical(
                    col="cat_one",
                    function="sum",
                    choices=["good", "bad", "inbetween"],
                    impute_rules={"coltype": "categorical", "all": {"type": "zero"}},
                )
            ],
            groups=["entity_id"],
            intervals=["all"],
            date_column="knowledge_date",
            output_date_column="as_of_date",
            dates=["2013-09-30", "2014-09-30"],
            state_table="states",
            state_group="entity_id",
            schema="features",
            from_obj="data",
        ),
        SpacetimeAggregation(
            prefix="prefix2",
            aggregates=[
                Aggregate(
                    quantity="quantity_one",
                    function="count",
                    impute_rules={"coltype": "aggregate", "all": {"type": "zero"}},
                )
            ],
            groups=["entity_id"],
            intervals=["all"],
            date_column="knowledge_date",
            output_date_column="as_of_date",
            dates=["2013-09-30", "2014-09-30"],
            state_table="states",
            state_group="entity_id",
            schema="features",
            from_obj="data",
        ),
    ]
    features_schema_name = "features"

    table_tasks = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    ).generate_all_table_tasks(aggregations, task_type="aggregation")
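    # Each task is a dict of three phases: "prepare" (drop and create the
    # table), "inserts" (the insert queries), and "finalize" (index creation).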
    for table_name, task in table_tasks.items():
        assert "DROP TABLE" in task["prepare"][0]
        assert "CREATE TABLE" in str(task["prepare"][1])
        assert "CREATE INDEX" in task["finalize"][0]
        assert isinstance(task["inserts"], list)

    # build the aggregation tables to check the imputation tasks
    FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    ).process_table_tasks(table_tasks)

    table_tasks = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    ).generate_all_table_tasks(aggregations, task_type="imputation")

    for table_name, task in table_tasks.items():
        assert "DROP TABLE" in task["prepare"][0]
        assert "CREATE TABLE" in str(task["prepare"][1])
        assert "CREATE INDEX" in task["finalize"][0]
        assert isinstance(task["inserts"], list)


def test_aggregations(test_engine):
    aggregate_config = [
        {
            "prefix": "prefix1",
            "categoricals": [
                {
                    "column": "cat_one",
                    "choice_query": "select distinct(cat_one) from data",
                    "metrics": ["sum"],
                    "imputation": {"all": {"type": "null_category"}},
                }
            ],
            "intervals": ["all"],
            "knowledge_date_column": "knowledge_date",
            "from_obj": "data",
        },
        {
            "prefix": "prefix2",
            "aggregates_imputation": {"all": {"type": "mean"}},
            "aggregates": [{"quantity": "quantity_one", "metrics": ["count"]}],
            "intervals": ["all"],
            "knowledge_date_column": "knowledge_date",
            "from_obj": "data",
        },
    ]
    features_schema_name = "features"

    aggregations = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    ).aggregations(
        feature_dates=["2013-09-30", "2014-09-30"],
        feature_aggregation_config=aggregate_config,
        state_table="states",
    )
    for aggregation in aggregations:
        assert isinstance(aggregation, SpacetimeAggregation)


def test_replace(test_engine):
    # test the replace=False functionality: if the cohort is already fully
    # represented in the imputed table, the existing features are reused
    aggregate_config = [
        {
            "prefix": "aprefix",
            "aggregates_imputation": {"all": {"type": "mean"}},
            "aggregates": [{"quantity": "quantity_one", "metrics": ["sum", "count"]}],
            "categoricals": [
                {
                    "column": "cat_one",
                    "choices": ["good", "bad"],
                    "metrics": ["sum"],
                    "imputation": {"all": {"type": "null_category"}},
                }
            ],
            "intervals": ["all"],
            "knowledge_date_column": "knowledge_date",
            "from_obj": "data",
        }
    ]

    features_schema_name = "features"
    feature_tables = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
        replace=False,
    ).create_all_tables(
        feature_dates=["2013-09-30", "2014-09-30", "2015-01-01"],
        feature_aggregation_config=aggregate_config,
        state_table="states",
    )

    assert len(feature_tables) == 1
    assert list(feature_tables)[0] == "aprefix_aggregation_imputed"

    # now try to run feature generation with replace=False. We should
    # be able to see that the entire cohort is there and reuse the features
    feature_generator = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
        replace=False,
    )
    aggregations = feature_generator.aggregations(
        feature_dates=["2013-09-30", "2014-09-30", "2015-01-01"],
        feature_aggregation_config=aggregate_config,
        state_table="states",
    )
    table_tasks = feature_generator.generate_all_table_tasks(
        aggregations,
        task_type="aggregation",
    )

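    # Empty task dicts mean the existing tables can be reused as-is.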
    assert len(table_tasks["aprefix_entity_id"]) == 0
    assert len(table_tasks["aprefix_aggregation"]) == 0

    imp_tasks = feature_generator.generate_all_table_tasks(
        aggregations,
        task_type="imputation",
    )

    assert len(imp_tasks["aprefix_aggregation_imputed"]) == 0

    # add a new member of the cohort. now we should need to rebuild everything
    test_engine.execute("insert into states values (%s, %s)", 999, "2015-01-01")
    table_tasks = feature_generator.generate_all_table_tasks(
        aggregations,
        task_type="aggregation",
    )
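    # A non-empty task dict carries all three phases (prepare, inserts,
    # finalize), hence length 3.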
    assert len(table_tasks["aprefix_entity_id"]) == 3
    assert len(table_tasks["aprefix_aggregation"]) == 3
    feature_generator.process_table_tasks(table_tasks)
    imp_tasks = feature_generator.generate_all_table_tasks(
        aggregations,
        task_type="imputation",
    )

    assert len(imp_tasks["aprefix_aggregation_imputed"]) == 3


def test_aggregations_materialize_off(test_engine):
    aggregate_config = {
        "prefix": "aprefix",
        "categoricals": [
            {
                "column": "cat_one",
                "choices": ["good", "bad"],
                "metrics": ["sum"],
                "imputation": {"all": {"type": "null_category"}},
            }
        ],
        "intervals": ["all"],
        "knowledge_date_column": "knowledge_date",
        "from_obj": "data",
    }

    feature_generator = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name="features",
        materialize_subquery_fromobjs=False
    )

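    # With materialization disabled, the generator should never construct a
    # FromObj, the helper that materializes subquery from_objs (per the
    # materialize_subquery_fromobjs flag above).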
    with patch("triage.component.architect.feature_generators.FromObj") as fromobj_mock:
        feature_generator.aggregations([aggregate_config], "2016-01-01", "states")
        assert not fromobj_mock.called


def test_aggregations_materialize_on(test_engine):
    aggregate_config = {
        "prefix": "aprefix",
        "categoricals": [
            {
                "column": "cat_one",
                "choices": ["good", "bad"],
                "metrics": ["sum"],
                "imputation": {"all": {"type": "null_category"}},
            }
        ],
        "intervals": ["all"],
        "knowledge_date_column": "knowledge_date",
        "from_obj": "data",
    }

    feature_generator = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name="features",
    )

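    # Materialization defaults to on, so even a plain table from_obj is
    # wrapped in a FromObj named after the features schema and prefix.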
    with patch("triage.component.architect.feature_generators.FromObj") as fromobj_mock:
        feature_generator.aggregations([aggregate_config], "2016-01-01", "states")
        fromobj_mock.assert_called_once_with(
            from_obj="data",
            knowledge_date_column="knowledge_date",
            name="features.aprefix"
        )


def test_transaction_error(test_engine):
    """Database connections are cleaned up regardless of in-transaction
    query errors.

    """
    aggregate_config = [
        {
            "prefix": "aprefix",
            "aggregates": [
                {
                    "quantity": "quantity_one",
                    "metrics": ["sum"],
                    "imputation": {
                        "sum": {"type": "constant", "value": 137},
                        "count": {"type": "zero"},
                    },
                }
            ],
            "intervals": ["all"],
            "knowledge_date_column": "knowledge_date",
            "from_obj": "data",
        }
    ]

    feature_generator = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name="features",
    )

    with pytest.raises(sqlalchemy.exc.ProgrammingError):
        feature_generator.create_all_tables(
            feature_dates=["2013-09-30", "2014-09-30"],
            feature_aggregation_config=aggregate_config,
            state_table="statez",  # WRONG!
        )

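    # Count backend activity against the test database (excluding this
    # monitoring query itself) to confirm the failed run released its
    # connection.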
    ((query_count,),) = test_engine.execute(
        t("""\
            select count(1) from pg_stat_activity
            where datname = :datname and
                  query not ilike '%%pg_stat_activity%%'
        """),
        datname=test_engine.url.database,
    )

    assert query_count == 0


class TestValidations:
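    """FeatureGenerator.validate should raise ValueError for malformed
    aggregation configs.

    """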

    @pytest.fixture
    def base_config(self):
        return {
            "prefix": "aprefix",
            "categoricals": [
                {
                    "column": "cat_one",
                    "choices": ["good", "bad"],
                    "metrics": ["sum"],
                    "imputation": {"all": {"type": "null_category"}},
                }
            ],
            "intervals": ["all"],
            "knowledge_date_column": "knowledge_date",
            "from_obj": "data",
        }

    @pytest.fixture
    def feature_generator(self, test_engine):
        return FeatureGenerator(test_engine, "features")

    def test_correct_keys(self, base_config, feature_generator):
        feature_generator.validate([base_config])

        with pytest.raises(ValueError):
            has_groups = copy.deepcopy(base_config)
            has_groups["groups"] = ["entity_id", "zip_code"]
            feature_generator.validate([has_groups])

        with pytest.raises(ValueError):
            no_intervals = copy.deepcopy(base_config)
            del no_intervals["intervals"]
            feature_generator.validate([no_intervals])

        with pytest.raises(ValueError):
            no_kdate = copy.deepcopy(base_config)
            del no_kdate["knowledge_date_column"]
            feature_generator.validate([no_kdate])

        with pytest.raises(ValueError):
            no_from_obj = copy.deepcopy(base_config)
            del no_from_obj["from_obj"]
            feature_generator.validate([no_from_obj])

        with pytest.raises(ValueError):
            no_aggs = copy.deepcopy(base_config)
            del no_aggs["categoricals"]
            feature_generator.validate([no_aggs])

        with pytest.raises(ValueError):
            no_imps = copy.deepcopy(base_config)
            del no_imps["categoricals"][0]["imputation"]
            feature_generator.validate([no_imps])

    def test_bad_from_obj(self, base_config, feature_generator):
        bad_from_obj = copy.deepcopy(base_config)
        bad_from_obj["from_obj"] = "where thing is other_thing"
        with pytest.raises(ValueError):
            feature_generator.validate([bad_from_obj])

    def test_bad_interval(self, base_config, feature_generator):
        base_config["intervals"] = ["1y", "1fortnight"]
        with pytest.raises(ValueError):
            feature_generator.validate([base_config])

    def test_bad_choice_query(self, base_config, feature_generator):
        del base_config["categoricals"][0]["choices"]
        base_config["categoricals"][0][
            "choice_query"
        ] = "select distinct cat_two from data"
        with pytest.raises(ValueError):
            feature_generator.validate([base_config])

    def test_wrong_imp_fcn(self, base_config, feature_generator):
        del base_config["categoricals"][0]["imputation"]["all"]
        base_config["categoricals"][0]["imputation"]["max"] = {
            "type": "null_category"
        }
        with pytest.raises(ValueError):
            feature_generator.validate([base_config])

    def test_bad_imp_rule(self, base_config, feature_generator):
        base_config["categoricals"][0]["imputation"]["all"] = {
            "type": "bad_rule_doesnt_exist"
        }
        with pytest.raises(ValueError):
            feature_generator.validate([base_config])

    def test_no_imp_rule_type(self, base_config, feature_generator):
        base_config["categoricals"][0]["imputation"]["all"] = {"value": "good"}
        with pytest.raises(ValueError):
            feature_generator.validate([base_config])

    def test_missing_imp_arg(self, base_config, feature_generator):
        # constant value imputation requires a 'value' parameter
        base_config["categoricals"][0]["imputation"]["all"] = {"type": "constant"}
        with pytest.raises(ValueError):
            feature_generator.validate([base_config])