1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
|
-- Reddit Ads Row Count Validation Test
-- Compares distinct counts between RAW layer and WORKSPACE_MARKETING layer
-- Accounts for deduplication logic in source models
WITH raw_counts AS (
-- Single primary key tables
SELECT 'ad' AS table_name, COUNT(DISTINCT id) AS raw_count
FROM RAW.REDDIT_ADS.AD
UNION ALL
SELECT 'business_account' AS table_name, COUNT(DISTINCT id) AS raw_count
FROM RAW.REDDIT_ADS.BUSINESS_ACCOUNT
UNION ALL
SELECT 'campaign' AS table_name, COUNT(DISTINCT id) AS raw_count
FROM RAW.REDDIT_ADS.CAMPAIGN
UNION ALL
SELECT 'custom_audience_history' AS table_name, COUNT(DISTINCT id) AS raw_count
FROM RAW.REDDIT_ADS.CUSTOM_AUDIENCE_HISTORY
UNION ALL
SELECT 'geolocation' AS table_name, COUNT(DISTINCT id) AS raw_count
FROM RAW.REDDIT_ADS.GEOLOCATION
UNION ALL
SELECT 'interest' AS table_name, COUNT(DISTINCT id) AS raw_count
FROM RAW.REDDIT_ADS.INTEREST
UNION ALL
SELECT 'time_zone' AS table_name, COUNT(DISTINCT id) AS raw_count
FROM RAW.REDDIT_ADS.TIME_ZONE
-- Composite primary key tables
UNION ALL
SELECT 'ad_group' AS table_name, COUNT(DISTINCT CONCAT(account_id, '|', id)) AS raw_count
FROM RAW.REDDIT_ADS.AD_GROUP
UNION ALL
SELECT 'targeting_community' AS table_name, COUNT(DISTINCT CONCAT(ad_group_id, '|', community_id)) AS raw_count
FROM RAW.REDDIT_ADS.TARGETING_COMMUNITY
UNION ALL
SELECT 'targeting_custom_audience' AS table_name, COUNT(DISTINCT CONCAT(ad_group_id, '|', custom_audience_id)) AS raw_count
FROM RAW.REDDIT_ADS.TARGETING_CUSTOM_AUDIENCE
UNION ALL
SELECT 'targeting_device' AS table_name, COUNT(DISTINCT _fivetran_id) AS raw_count
FROM RAW.REDDIT_ADS.TARGETING_DEVICE
UNION ALL
SELECT 'targeting_geolocation' AS table_name, COUNT(DISTINCT CONCAT(ad_group_id, '|', geolocation_id)) AS raw_count
FROM RAW.REDDIT_ADS.TARGETING_GEOLOCATION
UNION ALL
SELECT 'targeting_interest' AS table_name, COUNT(DISTINCT CONCAT(ad_group_id, '|', interest_id)) AS raw_count
FROM RAW.REDDIT_ADS.TARGETING_INTEREST
),
workspace_counts AS (
-- Workspace layer counts using primary keys from schema.yml
SELECT 'ad' AS table_name, COUNT(DISTINCT ad_id) AS workspace_count
FROM REDDIT_DBT_MODEL_GENERATION_PROD.WORKSPACE_MARKETING.WK_REDDIT_ADS_AD
UNION ALL
SELECT 'business_account' AS table_name, COUNT(DISTINCT business_account_id) AS workspace_count
FROM REDDIT_DBT_MODEL_GENERATION_PROD.WORKSPACE_MARKETING.WK_REDDIT_ADS_BUSINESS_ACCOUNT
UNION ALL
SELECT 'campaign' AS table_name, COUNT(DISTINCT campaign_id) AS workspace_count
FROM REDDIT_DBT_MODEL_GENERATION_PROD.WORKSPACE_MARKETING.WK_REDDIT_ADS_CAMPAIGN
UNION ALL
SELECT 'custom_audience_history' AS table_name, COUNT(DISTINCT custom_audience_id) AS workspace_count
FROM REDDIT_DBT_MODEL_GENERATION_PROD.WORKSPACE_MARKETING.WK_REDDIT_ADS_CUSTOM_AUDIENCE_HISTORY
UNION ALL
SELECT 'geolocation' AS table_name, COUNT(DISTINCT geolocation_id) AS workspace_count
FROM REDDIT_DBT_MODEL_GENERATION_PROD.WORKSPACE_MARKETING.WK_REDDIT_ADS_GEOLOCATION
UNION ALL
SELECT 'interest' AS table_name, COUNT(DISTINCT interest_id) AS workspace_count
FROM REDDIT_DBT_MODEL_GENERATION_PROD.WORKSPACE_MARKETING.WK_REDDIT_ADS_INTEREST
UNION ALL
SELECT 'time_zone' AS table_name, COUNT(DISTINCT time_zone_id) AS workspace_count
FROM REDDIT_DBT_MODEL_GENERATION_PROD.WORKSPACE_MARKETING.WK_REDDIT_ADS_TIME_ZONE
-- Composite primary key tables
UNION ALL
SELECT 'ad_group' AS table_name, COUNT(DISTINCT CONCAT(ad_group_account_id, '|', ad_group_id)) AS workspace_count
FROM REDDIT_DBT_MODEL_GENERATION_PROD.WORKSPACE_MARKETING.WK_REDDIT_ADS_AD_GROUP
UNION ALL
SELECT 'targeting_community' AS table_name, COUNT(DISTINCT CONCAT(targeting_community_ad_group_id, '|', targeting_community_id)) AS workspace_count
FROM REDDIT_DBT_MODEL_GENERATION_PROD.WORKSPACE_MARKETING.WK_REDDIT_ADS_TARGETING_COMMUNITY
UNION ALL
SELECT 'targeting_custom_audience' AS table_name, COUNT(DISTINCT CONCAT(targeting_custom_audience_ad_group_id, '|', targeting_custom_audience_id)) AS workspace_count
FROM REDDIT_DBT_MODEL_GENERATION_PROD.WORKSPACE_MARKETING.WK_REDDIT_ADS_TARGETING_CUSTOM_AUDIENCE
UNION ALL
SELECT 'targeting_device' AS table_name, COUNT(DISTINCT targeting_device_fivetran_id) AS workspace_count
FROM REDDIT_DBT_MODEL_GENERATION_PROD.WORKSPACE_MARKETING.WK_REDDIT_ADS_TARGETING_DEVICE
UNION ALL
SELECT 'targeting_geolocation' AS table_name, COUNT(DISTINCT CONCAT(targeting_geolocation_ad_group_id, '|', targeting_geolocation_id)) AS workspace_count
FROM REDDIT_DBT_MODEL_GENERATION_PROD.WORKSPACE_MARKETING.WK_REDDIT_ADS_TARGETING_GEOLOCATION
UNION ALL
SELECT 'targeting_interest' AS table_name, COUNT(DISTINCT CONCAT(targeting_interest_ad_group_id, '|', targeting_interest_id)) AS workspace_count
FROM REDDIT_DBT_MODEL_GENERATION_PROD.WORKSPACE_MARKETING.WK_REDDIT_ADS_TARGETING_INTEREST
)
-- Final comparison with validation results
SELECT
r.table_name,
r.raw_count,
w.workspace_count,
r.raw_count - w.workspace_count AS count_difference,
CASE
WHEN r.raw_count = w.workspace_count THEN '✅ PASS'
WHEN r.raw_count > w.workspace_count THEN '⚠️ RAW > WORKSPACE (Expected due to deduplication)'
ELSE '❌ FAIL - WORKSPACE > RAW (Unexpected)'
END AS validation_status,
ROUND((w.workspace_count::FLOAT / r.raw_count::FLOAT) * 100, 2) AS data_retention_percentage
FROM raw_counts r
JOIN workspace_counts w ON r.table_name = w.table_name
ORDER BY r.table_name;
|