Skip to content

Commit d7a52c1

Browse files
authored
Add sample dags (#1)
Adds few sample DAGs showcasing various Airflow features roll_d20:: * generates random number. Can read ENV var to override * DAG structure ![image](https://github.com/user-attachments/assets/8d43cde6-4c89-453e-a626-b24798f2fb5f) sample_ch_ddl:: * checks if table exists, if not — creates * DAG structure ![image](https://github.com/user-attachments/assets/ba5e8251-8e87-4d36-afcc-16825e324856) sample_ch_insert:: * has DAG params for number of randoms to insert * updates dataset * DAG structure ![image](https://github.com/user-attachments/assets/71818527-988f-44d5-9319-1c6d06e49af5) sample_ch_stats:: * triggered by dataset update from insert dag * DAG structure ![image](https://github.com/user-attachments/assets/fa0e9496-fa86-4dfe-95e8-325eb7bd3479) * display stats task ![image](https://github.com/user-attachments/assets/fc38b389-87d1-4875-bde8-d9f8152cbca1)
1 parent 30d6524 commit d7a52c1

11 files changed

+264
-1
lines changed

README.md

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,35 @@
11
# sample-airflow-dags
2-
Example dags for DC Managed Airflow, showcasing interaction with other DC services
2+
Example dags for DoubleCloud Managed Airflow, showcasing interaction with other DC services
3+
4+
Refer to documentation for Getting Started guide: https://double.cloud/docs/en/managed-airflow/get-started
5+
6+
## ClickHouse connection
7+
8+
You would need a ClickHouse connection named `ch_default` to make DAGs with tag `clickhouse` connect to your ClickHouse instance.
9+
If using DoubleCloud ClickHouse, create a generic connection with extra setting `{"secure": true}`.
10+
11+
## DAGs
12+
13+
[roll_d20](./dags/roll_d20.py) is not connected to anything and can be used to check new setup. Once enabled, it runs on a cron schedule every 5 minutes. You can set env variable `RND_SEED_OVERRIDE` in DC Airflow cluster settings to specify custom random seed for reproducibility.
14+
15+
![roll_d20_graph](./img/roll_d20_graph.png)
16+
17+
---
18+
19+
[sample_ch_ddl](./dags/sample_ch_ddl.py) checks if `sample_table` exists in the ClickHouse connection with connection_id `ch_default`, and creates it if missing.
20+
21+
![sample_ch_ddl_graph](./img/sample_ch_ddl_graph.png)
22+
23+
---
24+
25+
[sample_ch_insert](./dags/sample_ch_insert.py) inserts specified number of rows in `sample_table` and updates dataset `clickhouse://sample_table`.
26+
27+
![sample_ch_insert_graph](./img/sample_ch_insert_graph.png)
28+
29+
---
30+
31+
[sample_ch_stats](./dags/sample_ch_stats.py) computes stats on `sample_table` and outputs them in task logs. Triggered on updates in dataset `clickhouse://sample_table`.
32+
33+
![sample_ch_stats_graph](./img/sample_ch_stats_graph.png)
34+
35+
![sample_ch_stats_log](./img/sample_ch_stats_log.png)

dags/roll_d20.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import os
import random
import pathlib
from datetime import datetime as dt

from airflow.decorators import dag, task


@dag(
    dag_id=pathlib.Path(__file__).stem,
    description="Example DAG with two chained tasks that outputs a random number between 1 and 20",
    schedule='*/5 * * * *',
    start_date=dt(2024, 9, 1, 0, 0, 0),
    catchup=False,
    tags=["sample", "random"],
)
def roll_d20():
    """Two-task sample DAG: roll a d20, then print the result."""

    @task
    def dice_roll():
        # Optional reproducibility: seed the RNG from an env var when set.
        # When unset (or empty), seed stays None and random.seed(None)
        # falls back to a system-provided seed.
        seed_override = os.environ.get('RND_SEED_OVERRIDE')
        if seed_override:
            seed_override = int(seed_override)
            print("Random seed override:", seed_override)
        random.seed(seed_override)

        return random.randint(1, 20)

    @task
    def roll_result(roll_value):
        print("Hello from DoubleCloud")
        print("You rolled", roll_value)

    roll_result(roll_value=dice_roll())


my_dag = roll_d20()


if __name__ == '__main__':
    my_dag.test()

dags/sample_ch_ddl.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import datetime
import pathlib

from airflow.utils.trigger_rule import TriggerRule
from airflow.decorators import dag

from airflow_clickhouse_plugin.operators.clickhouse import ClickHouseOperator
from airflow.operators.empty import EmptyOperator
from airflow.providers.common.sql.operators import sql
from airflow_clickhouse_plugin.operators.clickhouse_dbapi import (
    ClickHouseBaseDbApiOperator,
)


class ClickHouseBranchSQLOperator(
    sql.BranchSQLOperator,
    ClickHouseBaseDbApiOperator,
):
    """
    BranchSQLOperator wired to the ClickHouse DB-API hook.

    Temporary workaround for Airflow < 2.9.4,
    see https://github.com/bryzgaloff/airflow-clickhouse-plugin/issues/87
    """

    pass


@dag(
    dag_id=pathlib.Path(__file__).stem,
    schedule=None,
    start_date=datetime.datetime(2024, 9, 1, 0, 0, 0),
    catchup=False,
    dag_display_name="Create sample_table",
    tags=["sample", "clickhouse", "ddl"],
    max_active_runs=1,
)
def sample_ch_ddl():
    """Create `sample_table` in ClickHouse unless it already exists.

    Branches on `EXISTS sample_table`: follows `do_nothing` when the table
    is present, otherwise runs the CREATE TABLE DDL.
    """
    # EXISTS returns 1/0, which BranchSQLOperator interprets as true/false.
    check_tbl_exists = ClickHouseBranchSQLOperator(
        task_id='check_if_sample_table_exists',
        sql='EXISTS sample_table',
        conn_id='ch_default',
        follow_task_ids_if_true='do_nothing',
        follow_task_ids_if_false='create_sample_table',
    )

    do_nothing = EmptyOperator(task_id="do_nothing")
    create_tbl = ClickHouseOperator(
        task_id='create_sample_table',
        sql="""
            CREATE TABLE IF NOT EXISTS sample_table
            (
                id UInt32,
                value Float64,
                category Enum8('A' = 1, 'B' = 2, 'C' = 3)
            ) ENGINE = MergeTree() ORDER BY id;
        """,
        clickhouse_conn_id='ch_default',
    )

    check_tbl_exists >> [create_tbl, do_nothing]


my_dag = sample_ch_ddl()


if __name__ == '__main__':
    my_dag.test()

dags/sample_ch_insert.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import datetime
import pathlib

from airflow.datasets import Dataset
from airflow.decorators import dag, task
from airflow.operators.empty import EmptyOperator
from airflow.utils.trigger_rule import TriggerRule

from airflow_clickhouse_plugin.operators.clickhouse import ClickHouseOperator


# Directory holding the Jinja-templated SQL files used by this DAG.
sql_dir = pathlib.Path(__file__).absolute().parent / "sql"
# Updating this dataset triggers downstream dataset-scheduled DAGs
# (e.g. sample_ch_stats).
sample_table_dataset = Dataset('clickhouse://sample_table')


@dag(
    dag_id=pathlib.Path(__file__).stem,
    schedule=None,
    start_date=datetime.datetime(2024, 9, 1, 0, 0, 0),
    catchup=False,
    # template_searchpath expects strings, not pathlib.Path objects.
    template_searchpath=[str(sql_dir)],
    dag_display_name="Insert data to sample_table",
    tags=["sample", "clickhouse", "random"],
    max_active_runs=1,
    params={
        'num_rows': 1_000_000,
    },
)
def sample_ch_insert():
    """Insert `num_rows` random rows into `sample_table`, then publish the dataset.

    The row count is a DAG-level param (overridable per run); the SQL template
    references it directly as `{{ params.num_rows }}`.
    """
    start = EmptyOperator(task_id="start")
    insert_data = ClickHouseOperator(
        task_id='insert_into_sample_table',
        sql='insert_into_sample_table.sql',
        clickhouse_conn_id='ch_default',
        # NOTE: do NOT pass operator-level params={"num_rows": "{{ params.num_rows }}"}
        # here: param values are not themselves templated, so that would shadow the
        # DAG-level integer with a literal, unrendered Jinja string. The DAG-level
        # params are already available in the template context.
        outlets=[sample_table_dataset],
    )

    start >> insert_data


my_dag = sample_ch_insert()


if __name__ == '__main__':
    my_dag.test()

dags/sample_ch_stats.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
import datetime
import pathlib

from airflow.datasets import Dataset
from airflow.decorators import dag, task
from airflow.operators.empty import EmptyOperator
from airflow.utils.trigger_rule import TriggerRule

from airflow_clickhouse_plugin.operators.clickhouse import ClickHouseOperator


# Directory holding the Jinja-templated SQL files used by this DAG.
sql_dir = pathlib.Path(__file__).absolute().parent / "sql"
# Dataset published by sample_ch_insert; this DAG is scheduled on its updates.
sample_table_dataset = Dataset('clickhouse://sample_table')


@dag(
    dag_id=pathlib.Path(__file__).stem,
    schedule=sample_table_dataset,
    start_date=datetime.datetime(2024, 9, 1, 0, 0, 0),
    catchup=False,
    # template_searchpath expects strings, not pathlib.Path objects.
    template_searchpath=[str(sql_dir)],
    dag_display_name="Compute stats on sample_table",
    tags=["sample", "clickhouse", "stats"],
    max_active_runs=1,
)
def sample_ch_stats():
    """Compute per-category stats on `sample_table` and print them to task logs."""
    start = EmptyOperator(task_id="start")
    # with_column_types=True makes the operator push (rows, column_types) to XCom.
    compute_stats = ClickHouseOperator(
        task_id='compute_stats_sample_table',
        sql="""
            SELECT
                category,
                count() AS total_count,
                avg(value) AS mean_value,
                median(value) AS median_value,
                stddevPop(value) AS std_dev,
                min(value) AS min_value,
                max(value) AS max_value
            FROM sample_table
            GROUP BY category
            ORDER BY category;
        """,
        with_column_types=True,
        clickhouse_conn_id='ch_default',
    )

    @task
    def display_stats(upstream_xcom):
        # upstream_xcom is (rows, [(column_name, type_info), ...]).
        stats, names = upstream_xcom
        print("Stats on sample_table:")
        print(",".join(t for t, _ in names))
        for row in stats:
            print(row)

    end = EmptyOperator(task_id="end")

    start >> compute_stats
    display_stats(compute_stats.output) >> end


my_dag = sample_ch_stats()


if __name__ == '__main__':
    my_dag.test()
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
INSERT INTO sample_table (id, value, category)
SELECT
    number AS id,
    randNormal(50, 10) AS value,
    -- Pick the category from a SINGLE rand() draw so A/B/C are uniform.
    -- The previous multiIf called rand() twice, drawing two independent
    -- values and skewing the split to 1/3, 2/9, 4/9.
    ['A', 'B', 'C'][(rand() % 3) + 1] AS category
FROM numbers({{ params.num_rows }})
SETTINGS max_insert_block_size = 100000;

img/roll_d20_graph.png

6.89 KB
Loading

img/sample_ch_ddl_graph.png

22.4 KB
Loading

img/sample_ch_insert_graph.png

16.8 KB
Loading

img/sample_ch_stats_graph.png

23.2 KB
Loading

0 commit comments

Comments
 (0)