daemonslayer/tests-airflow

src/etl/examples/hive-example/dags/process_hive_dwh.py

# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from datetime import timedelta

import airflow
from airflow.models import Variable
from airflow.operators.hive_operator import HiveOperator
from airflow.operators.latest_only_operator import LatestOnlyOperator
from airflow.utils.dates import days_ago


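# Defaults applied to every task in this DAG: tasks are owned by 'airflow',
# the schedule starts seven days back, and no task waits on its own previous
# run (depends_on_past=False).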
args = {
    'owner': 'airflow',
    'start_date': days_ago(7),
    'depends_on_past': False
}

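# Directory holding the HQL templates, stored as an Airflow Variable so the
# path can differ per environment; it is passed to the DAG as
# template_searchpath below.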
tmpl_search_path = Variable.get("hive_sql_path")

# This DAG needs to run in sequence to generate correct results. It depends
# on the partitions in staging and then updates the dimensions in
# chronological order. The HQL file for each step describes what that step
# does. Hive is effectively "append-only", so the main strategy is to build
# a completely new dimension that replaces the current one.

# The steps are coded explicitly here, but they could be built dynamically
# with a simple for loop, because only the step id and the dimension name
# change (a commented sketch follows the DAG definition below).

# Note that customer and product are processed independently and in
# parallel within a single DAG.

dag = airflow.DAG(
    'process_hive_dwh',
    schedule_interval="@daily",
    dagrun_timeout=timedelta(minutes=60),
    template_searchpath=tmpl_search_path,
    default_args=args,
    max_active_runs=10)
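
# A minimal sketch of the loop mentioned above, commented out so it does not
# duplicate the explicit tasks below ('dims' is a hypothetical name used
# only for illustration):
#
# dims = {}
# for dim in ('customer', 'product'):
#     prev = None
#     for step in (1, 2):
#         task = HiveOperator(
#             hql='{}/step_{}.hql'.format(dim, step),
#             hive_cli_conn_id='hive_staging',
#             schema='default',
#             hiveconf_jinja_translate=True,
#             task_id='{}_step_{}'.format(dim, step),
#             dag=dag)
#         if prev is not None:
#             prev >> task
#         prev = task
#     dims[dim] = task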

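# Each step runs one HQL file, resolved against template_searchpath.
# hiveconf_jinja_translate=True translates ${hiveconf:var} references in the
# HQL into Jinja {{ var }} templates before rendering.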
customer_step_1 = HiveOperator(
    hql='customer/step_1.hql',
    hive_cli_conn_id='hive_staging',
    schema='default',
    hiveconf_jinja_translate=True,
    task_id='customer_step_1',
    dag=dag)

customer_step_2 = HiveOperator(
    hql='customer/step_2.hql',
    hive_cli_conn_id='hive_staging',
    schema='default',
    hiveconf_jinja_translate=True,
    task_id='customer_step_2',
    dag=dag)

product_step_1 = HiveOperator(
    hql='product/step_1.hql',
    hive_cli_conn_id='hive_staging',
    schema='default',
    hiveconf_jinja_translate=True,
    task_id='product_step_1',
    dag=dag)

product_step_2 = HiveOperator(
    hql='product/step_2.hql',
    hive_cli_conn_id='hive_staging',
    schema='default',
    hiveconf_jinja_translate=True,
    task_id='product_step_2',
    dag=dag)

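# LatestOnlyOperator skips all downstream tasks unless this run is the most
# recent scheduled one, so a backfill does not rebuild the dimensions for
# every historical execution date.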
latest_only = LatestOnlyOperator(
    task_id='latest_only',
    dag=dag)

fact_order_step_1 = HiveOperator(
    hql='order/step_1.hql',
    hive_cli_conn_id='hive_staging',
    schema='default',
    hiveconf_jinja_translate=True,
    task_id='fact_order_step_1',
    dag=dag)

fact_order_step_2 = HiveOperator(
    hql='order/step_2.hql',
    hive_cli_conn_id='hive_staging',
    schema='default',
    hiveconf_jinja_translate=True,
    task_id='fact_order_step_2',
    dag=dag)

latest_only >> customer_step_1
latest_only >> product_step_1

customer_step_1 >> customer_step_2
product_step_1 >> product_step_2

customer_step_2 >> fact_order_step_1
product_step_2 >> fact_order_step_1

fact_order_step_1 >> fact_order_step_2
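
# Equivalently, the fan-out/fan-in above can be expressed with lists, which
# the >> operator supports in newer Airflow versions (a commented sketch):
#
# latest_only >> [customer_step_1, product_step_1]
# customer_step_1 >> customer_step_2
# product_step_1 >> product_step_2
# [customer_step_2, product_step_2] >> fact_order_step_1 >> fact_order_step_2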


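# Running this file directly drops into an Airflow CLI scoped to this DAG,
# useful for ad-hoc testing and backfills from the shell.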
if __name__ == "__main__":
    dag.cli()