from airflow import DAG
from datetime import datetime, timedelta
import os

from plugins import redshift_summary
from plugins import slack

DAG_ID = "Build_Summary_v2"
dag = DAG(
    DAG_ID,
    schedule_interval="25 13 * * *",
    max_active_runs=1,
    concurrency=1,
    catchup=False,
    start_date=datetime(2021, 9, 17),
    default_args={
        'on_failure_callback': slack.on_failure_callback,
        'retries': 1,
        'retry_delay': timedelta(minutes=1),
    }
)
# These should be listed in dependency order (all in the analytics schema)
tables_load = [
    # 'nps_summary',
    'mau_summary',
    'channel_summary'
]
dag_root_path = os.path.dirname(os.path.abspath(__file__))
redshift_summary.build_summary_table(dag_root_path, dag, tables_load, "redshift_dev_db")
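The redshift_summary plugin itself isn't reproduced in these notes. Below is a minimal sketch of what build_summary_table presumably does with the per-table config dicts shown next: build each table in list order via CTAS into a temp table, validate it with output_check, then swap it in. The config file location, task naming, and swap logic here are assumptions, not the actual plugin code.

from airflow.operators.python import PythonOperator
from airflow.providers.postgres.hooks.postgres import PostgresHook


def _run_checks(cur, checks, schema, table):
    # Each check compares a COUNT(1)-style result against an expected value
    for check in checks:
        sql = check['sql'].format(schema=schema, table=table)
        cur.execute(sql)
        if cur.fetchone()[0] != check['count']:
            raise ValueError(f"Check failed: {sql}")


def _build_one(conn_id, cfg):
    schema, table = cfg['schema'], cfg['table']
    conn = PostgresHook(postgres_conn_id=conn_id).get_conn()
    conn.autocommit = True
    cur = conn.cursor()
    _run_checks(cur, cfg.get('input_check', []), schema, table)
    # Build the new version in a temp table first (CTAS)
    cur.execute(f"DROP TABLE IF EXISTS {schema}.temp_{table};")
    cur.execute(f"CREATE TABLE {schema}.temp_{table} AS {cfg['main_sql']}")
    _run_checks(cur, cfg.get('output_check', []), schema, table)
    # Only replace the live table once the checks pass
    cur.execute("BEGIN;")
    cur.execute(f"DROP TABLE IF EXISTS {schema}.{table};")
    cur.execute(f"ALTER TABLE {schema}.temp_{table} RENAME TO {table};")
    cur.execute("COMMIT;")


def build_summary_table(dag_root_path, dag, tables_load, redshift_conn_id):
    prev_task = None
    for table in tables_load:
        # Assumption: each config dict lives in <dag_root_path>/config/<table>.py
        cfg = eval(open(f"{dag_root_path}/config/{table}.py").read())
        task = PythonOperator(
            task_id=f"build_{table}",
            python_callable=_build_one,
            op_args=[redshift_conn_id, cfg],
            dag=dag,
        )
        # Chain tasks in list order, which is why tables_load must be in dependency order
        if prev_task:
            prev_task >> task
        prev_task = task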
channel_summary.py
{
    'table': 'channel_summary',
    'schema': 'hajuny129',
    'main_sql': """
        SELECT DISTINCT
            A.userid,
            FIRST_VALUE(A.channel) OVER (
                PARTITION BY A.userid ORDER BY B.ts
                ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
            ) AS First_Channel,
            LAST_VALUE(A.channel) OVER (
                PARTITION BY A.userid ORDER BY B.ts
                ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
            ) AS Last_Channel
        FROM raw_data.user_session_channel A
        LEFT JOIN raw_data.session_timestamp B ON A.sessionid = B.sessionid;
    """,
    'input_check': [
        # {
        #     'sql': 'SELECT COUNT(1) FROM hajuny129.channel_summary',
        #     'count': 949
        # },
    ],
    'output_check': [
        {
            'sql': 'SELECT COUNT(1) FROM {schema}.temp_{table}',
            'count': 949
        }
    ],
}
mau_summary.py
{
    'table': 'mau_summary',
    'schema': 'hajuny129',
    'main_sql': """
        SELECT
            TO_CHAR(A.ts, 'YYYY-MM') AS month,
            COUNT(DISTINCT B.userid) AS mau
        FROM raw_data.session_timestamp A
        JOIN raw_data.user_session_channel B ON A.sessionid = B.sessionid
        GROUP BY 1;
    """,
    'input_check': [
        # {
        #     'sql': 'SELECT COUNT(1) FROM hajuny129.mau_summary',
        #     'count': 7
        # },
    ],
    'output_check': [
        {
            'sql': 'SELECT COUNT(1) FROM {schema}.temp_{table}',
            'count': 7
        }
    ],
}
airflow dags test Build_Summary_v2 2023-05-14
After issuing the Slack webhook, test it and register it as an Airflow Variable.
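The slack plugin referenced by on_failure_callback isn't included in the notes either. Below is a minimal sketch of what such a callback might look like, assuming the full webhook URL is stored in an Airflow Variable; the Variable name slack_url and the message format are assumptions.

# plugins/slack.py -- a minimal sketch
# Register the webhook first, e.g.:
#   airflow variables set slack_url https://hooks.slack.com/services/...
import requests
from airflow.models import Variable


def on_failure_callback(context):
    # Airflow passes the task context; summarize which task failed and why
    ti = context.get("task_instance")
    text = f"Task failed: {ti.dag_id}.{ti.task_id} -- {context.get('exception')}"
    # Assumption: the full webhook URL is stored in a Variable named "slack_url"
    requests.post(Variable.get("slack_url"), json={"text": text})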
name_gender_v3
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.models import Variable
from airflow.providers.postgres.hooks.postgres import PostgresHook
from datetime import datetime
from datetime import timedelta

import requests
import logging
import psycopg2

from plugins import slack


def get_Redshift_connection(autocommit=True):
    hook = PostgresHook(postgres_conn_id='redshift_dev_db')
    conn = hook.get_conn()
    conn.autocommit = autocommit
    return conn.cursor()
def extract(**context):
    link = context["params"]["url"]
    task_instance = context['task_instance']
    execution_date = context['execution_date']
    logging.info(execution_date)
    f = requests.get(link)
    return f.text


def transform(**context):
    logging.info("Transform started")
    text = context["task_instance"].xcom_pull(key="return_value", task_ids="extract")
    lines = text.strip().split("\n")[1:]  # skip the header line
    records = []
    for l in lines:
        (name, gender) = l.split(",")  # l = "Keeyong,M" -> ['Keeyong', 'M']
        records.append([name, gender])
    logging.info("Transform ended")
    return records
def load(**context):
    logging.info("load started")
    schema = context["params"]["schema"]
    table = context["params"]["table"]
    records = context["task_instance"].xcom_pull(key="return_value", task_ids="transform")
    """
    records = [
        [ "Keeyong", "M" ],
        [ "Claire", "F" ],
        ...
    ]
    """
    # Wrap the SQL in BEGIN/END so the whole load runs as one transaction
    cur = get_Redshift_connection()
    try:
        cur.execute("BEGIN;")
        # DELETE FROM first, then re-insert everything -> a full-refresh pattern
        cur.execute(f"DELETE FROM {schema}.{table};")
        for r in records:
            name = r[0]
            gender = r[1]
            print(name, "-", gender)
            sql = f"INSERT INTO {schema}.{table} VALUES ('{name}', '{gender}')"
            cur.execute(sql)
        cur.execute("COMMIT;")  # cur.execute("END;")
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        cur.execute("ROLLBACK;")
        raise
    logging.info("load done")
dag = DAG(
    dag_id='name_gender_v3',
    start_date=datetime(2023, 4, 6),  # the DAG won't run if this date is in the future
    schedule='0 2 * * *',  # adjust as needed
    catchup=False,
    max_active_runs=1,
    default_args={
        'retries': 1,
        'retry_delay': timedelta(minutes=3),
        'on_failure_callback': slack.on_failure_callback,
    }
)
extract = PythonOperator(
    task_id='extract',
    python_callable=extract,
    params={
        'url': Variable.get("csv_url")
    },
    dag=dag)

transform = PythonOperator(
    task_id='transform',
    python_callable=transform,
    params={},
    dag=dag)

load = PythonOperator(
    task_id='load',
    python_callable=load,
    params={
        'schema': 'keeyong',
        'table': 'name_gender'
    },
    dag=dag)

extract >> transform >> load
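As with Build_Summary_v2 above, this DAG can be exercised end to end from the CLI; the date below is just a sample execution date.

airflow dags test name_gender_v3 2023-05-14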
An error occurred, but the failure-related Slack hook wasn't triggered!
Question 1
Question 2