(ADV403) Dynamic Ad Performance Reporting with Redshift: Data Science, Queries at Scale | AWS re:Invent 2014
November 13, 2014 | Las Vegas, NV 
Timon Karnezos, Director Infrastructure, Neustar 
Vidhya Srinivasan, Sr. Manager Software Development, Amazon Redshift
Petabyte scale
Massively parallel
Relational data warehouse
Fully managed; zero admin
[Architecture diagram: 10 GigE (HPC) interconnect; ingestion, backup, and restore; JDBC/ODBC access]
Ad Tech Use Cases
692.8s vs. 34.9s, < 0.76%
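If these figures compare an exact distinct count against Redshift's HyperLogLog-based approximation (an assumption; the slide image behind them is not preserved here), the trade-off looks roughly like this:

-- Exact distinct count: scans and de-duplicates every user_id.
SELECT COUNT(DISTINCT user_id) FROM impressions;

-- Approximate distinct count (HyperLogLog): much faster, with a small,
-- bounded relative error.
SELECT APPROXIMATE COUNT(DISTINCT user_id) FROM impressions;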
[Figure: space-filling curve for two dimensions, over 2-bit coordinates 00-11 on each axis, with a point P on the curve]
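As a hypothetical illustration (not from the deck), a Z-order style curve interleaves the bits of the two coordinates, so points that are close in (x, y) tend to be close in the one-dimensional order:

-- Hypothetical sketch: interleave the bits of two 2-bit coordinates
-- (x = x1x0, y = y1y0) into the Z-order value x1 y1 x0 y0.
SELECT x, y,
       8 * (x / 2) + 4 * (y / 2) + 2 * (x % 2) + (y % 2) AS z_order
FROM (SELECT 2 AS x, 1 AS y) AS p;   -- x = 10, y = 01  ->  z_order = 1001 = 9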
Frequency
Attribution
Overlap
Ad-hoc
0.7B per day · 2B per week · 8B per month · 21B per quarter
-- Number of ads seen per user
WITH frequency_intermediate AS (
    SELECT user_id,
           SUM(1)       AS impression_count,
           SUM(cost)    AS cost,
           SUM(revenue) AS revenue
    FROM impressions
    WHERE record_date BETWEEN <...>
    GROUP BY 1
)
-- Number of people who saw N ads
SELECT impression_count, SUM(1), SUM(cost), SUM(revenue)
FROM frequency_intermediate
GROUP BY 1;
CREATE TABLE frequency_intermediate (
    record_date      date   ENCODE <...> NOT NULL,
    campaign_id      bigint ENCODE <...> NOT NULL,
    site_id          bigint ENCODE <...> NOT NULL,
    user_id          bigint ENCODE <...> NOT NULL DISTKEY,
    impression_count int    ENCODE <...> NOT NULL,
    cost             bigint ENCODE <...> NOT NULL,
    revenue          bigint ENCODE <...> NOT NULL
) SORTKEY(<...>);
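A daily load into this rollup is not shown in the deck; a minimal sketch, assuming the intermediate is rebuilt from the raw impressions table, could be:

-- Hypothetical population step: roll raw impressions up to one row per
-- (day, campaign, site, user) so downstream frequency queries never
-- have to touch the raw event table.
INSERT INTO frequency_intermediate
SELECT record_date,
       campaign_id,
       site_id,
       user_id,
       SUM(1),
       SUM(cost),
       SUM(revenue)
FROM impressions
WHERE record_date = <...>   -- the day being loaded
GROUP BY 1, 2, 3, 4;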
WITH user_frequency AS (
    SELECT user_id, campaign_id, site_id,
           SUM(impression_count) AS frequency,
           SUM(cost)             AS cost,
           SUM(revenue)          AS revenue
    FROM frequency_intermediate
    WHERE record_date BETWEEN <...>
    GROUP BY 1, 2, 3
)
SELECT campaign_id, site_id, frequency,
       SUM(1), SUM(cost), SUM(revenue)
FROM user_frequency
GROUP BY 1, 2, 3;
-- Basic sessionization query: assemble user activity
-- that ended in a conversion into a timeline.
SELECT <...>
FROM impressions i
JOIN conversions c ON
    i.user_id = c.user_id AND
    i.record_date < c.record_date
ORDER BY i.record_date;
[Figures: a conversion timeline with impressions at positions 1-3, and the same timeline annotated with hour offsets of 3, 12, and 16]
-- Sessionize user activity per conversion, partition by campaign (45-day lookback window)
SELECT c.record_date AS conversion_date,
       c.event_id    AS conversion_id,
       i.campaign_id AS campaign_id,
       i.site_id     AS site_id,
       i.user_id     AS user_id,
       c.revenue     AS conversion_revenue,
       DATEDIFF('hour', i.record_date, c.record_date) AS hour_offset,
       SUM(1) OVER (PARTITION BY i.user_id, i.campaign_id, c.event_id
                    ORDER BY i.record_date DESC ROWS UNBOUNDED PRECEDING) AS position
FROM impressions i
JOIN conversions c ON
    i.user_id = c.user_id AND
    i.campaign_id = c.campaign_id AND
    i.record_date < c.record_date AND
    i.record_date > (c.record_date - interval '45 days') AND
    c.record_date BETWEEN <...>;
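The statistics query below reads from a sessions relation that the deck never defines; a minimal bridge, assuming the sessionized rows are simply materialized, could be:

-- Hypothetical: persist the sessionized rows once so several attribution
-- reports can reuse them without repeating the impressions-to-conversions join.
-- DISTKEY/SORTKEY choices here are assumptions, not from the deck.
CREATE TEMP TABLE sessions
DISTKEY(user_id)
SORTKEY(campaign_id, conversion_date)
AS
SELECT <...>;   -- the sessionization query above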
-- Compute statistics on sessions (funnel placement, last-touch, site count, etc.)
SELECT campaign_id,
       site_id,
       conversion_date,
       AVG(position) AS average_position,
       SUM(conversion_revenue * (position = 1)::int) AS lta_attributed,
       AVG(COUNT(DISTINCT site_id)
           OVER (PARTITION BY user_id, campaign_id, conversion_id
                 ORDER BY hour_offset DESC
                 ROWS UNBOUNDED PRECEDING)) AS average_unique_preceding_site_count
FROM sessions
GROUP BY 1, 2, 3;
Site overlap matrix with CPM per site:

         Site A   Site B   Site C
Site A      -       20%      60%
Site B      -        -       90%
Site C      -        -        -
CPM       $0.06    $1.05    $9.50
CREATE TABLE overlap_intermediate (
    user_id bigint ENCODE <...> NOT NULL DISTKEY,
    site_id bigint ENCODE <...> NOT NULL
) SORTKEY(<...>);
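As with the frequency rollup, the deck does not show how this table is populated; one plausible (assumed) load is a de-duplicated projection of impressions:

-- Hypothetical population step: one row per (user, site) pair seen in the
-- raw impressions, so the self-join below counts each pair of sites once per user.
INSERT INTO overlap_intermediate
SELECT user_id, site_id
FROM impressions
GROUP BY 1, 2;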
WITH co_occurences AS (
    SELECT oi.site_id  AS site1,
           oi2.site_id AS site2
    FROM overlap_intermediate oi
    JOIN overlap_intermediate oi2 ON
        oi.site_id > oi2.site_id AND
        oi.user_id = oi2.user_id
)
SELECT site1, site2, SUM(1)
FROM co_occurences
GROUP BY 1, 2;
CREATE TABLE overlap_intermediate (
    record_date date   ENCODE <...> NOT NULL,
    campaign_id bigint ENCODE <...> NOT NULL,
    site_id     bigint ENCODE <...> NOT NULL,
    user_id     bigint ENCODE <...> NOT NULL DISTKEY
) SORTKEY(<...>);
WITH
site_overlap_intermediate AS (
    SELECT user_id, site_id, campaign_id
    FROM overlap_intermediate
    WHERE record_date BETWEEN <...>
    GROUP BY 1, 2, 3
),
site_co_occurences AS (
    SELECT oi.campaign_id AS c_id, oi.site_id AS site1, oi2.site_id AS site2
    FROM site_overlap_intermediate oi
    JOIN site_overlap_intermediate oi2 ON
        oi.site_id > oi2.site_id AND
        oi.user_id = oi2.user_id AND
        oi.campaign_id = oi2.campaign_id
)
SELECT c_id, site1, site2, SUM(1)
FROM site_co_occurences
GROUP BY 1, 2, 3;
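To turn raw co-occurrence counts into a percentage matrix like the one shown earlier, one option (not shown in the deck; a hedged sketch that assumes one row per user/site pair in overlap_intermediate) is to divide each pair count by the unique audience of one of its sites:

-- Hypothetical: overlap % = users who saw both sites / users who saw site1.
WITH site_uniques AS (
    SELECT site_id, COUNT(DISTINCT user_id) AS uniques
    FROM overlap_intermediate
    GROUP BY 1
),
pair_counts AS (
    SELECT oi.site_id AS site1, oi2.site_id AS site2, COUNT(*) AS both_sites
    FROM overlap_intermediate oi
    JOIN overlap_intermediate oi2 ON
        oi.site_id > oi2.site_id AND
        oi.user_id = oi2.user_id
    GROUP BY 1, 2
)
SELECT p.site1, p.site2,
       100.0 * p.both_sites / u.uniques AS overlap_pct
FROM pair_counts p
JOIN site_uniques u ON u.site_id = p.site1;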
8 fact tables · 26 dimension tables · 7 mapping tables
42 views · 121 joins · 1,100 SLOC
$ pg_dump -Fc -f some_file --table=foo --table=bar
$ pg_restore --schema-only --clean -Fc some_file > schema.sql
$ pg_restore --data-only --table=foo -Fc some_file > foo.tsv
$ aws s3 cp schema.sql s3://metadata-bucket/YYYYMMDD/schema.sql
$ aws s3 cp foo.tsv s3://metadata-bucket/YYYYMMDD/foo.tsv
> \i schema.sql
> COPY foo FROM 's3://metadata-bucket/YYYYMMDD/foo.tsv' <...>
# or combine 'COPY <..> FROM <...> SSH' and pg_restore/psql
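The 'COPY ... FROM ... SSH' route mentioned on the last line pulls rows straight from the source host via a manifest in S3; a minimal sketch (host name, username, and file paths are placeholders, not from the deck):

-- Hypothetical SSH manifest uploaded to S3 as JSON, e.g.:
-- {
--   "entries": [{
--     "endpoint":  "metadata-host.example.com",
--     "command":   "cat /data/foo.tsv",
--     "mandatory": true,
--     "username":  "etl"
--   }]
-- }
COPY foo
FROM 's3://metadata-bucket/ssh_manifest.json'
WITH CREDENTIALS 'aws_access_key_id=<...>;aws_secret_access_key=<...>'
DELIMITER '\t'
SSH;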
UNLOAD
('
  SELECT i.*
  FROM impressions i
  JOIN client_to_campaign_mapping m ON
      m.campaign_id = i.campaign_id
  WHERE i.record_date >= '{{yyyy}}-{{mm}}-{{dd}}' - interval '1 day' AND
        i.record_date <  '{{yyyy}}-{{mm}}-{{dd}}' AND
        m.client_id = <...>
')
TO 's3://{{bucket}}/us_eastern/{{yyyy}}/{{mm}}/{{dd}}/dsdk_events/{{vers}}/impressions/'
WITH CREDENTIALS 'aws_access_key_id={{key}};aws_secret_access_key={{secret}}'
DELIMITER ',' NULL 'N' ADDQUOTES ESCAPE GZIP MANIFEST;
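On the receiving cluster, the matching load is not shown; a sketch whose options simply mirror the UNLOAD above:

-- Hypothetical load on the per-client cluster, reading the manifest that
-- UNLOAD ... MANIFEST wrote under the same S3 prefix.
COPY impressions
FROM 's3://{{bucket}}/us_eastern/{{yyyy}}/{{mm}}/{{dd}}/dsdk_events/{{vers}}/impressions/manifest'
WITH CREDENTIALS 'aws_access_key_id={{key}};aws_secret_access_key={{secret}}'
DELIMITER ',' NULL 'N' REMOVEQUOTES ESCAPE GZIP MANIFEST;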
Workload                                     Node count  Node type    Restore  Maint.  Exec.
Frequency & Attribution & Overlap & Ad-hoc       16      dw2.8xlarge    2h      1h      6h

= $691.20
Workload     Node count  Node type    Restore  Maint.  Exec.
Frequency         8      dw2.8xlarge   1.5h    0.5h    2.5h
Attribution       8      dw2.8xlarge   1.5h    0.5h    2h
Overlap           8      dw2.8xlarge   1h      0.5h    2.5h
Ad-hoc            8      dw2.8xlarge   0h      0.5h    1.5h

= $556.80 (-19%)
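Assuming the then-current on-demand rate of about $4.80 per dw2.8xlarge node-hour (an assumption; the deck does not state the rate), the totals check out: the shared cluster uses 16 × (2 + 1 + 6) = 144 node-hours, or $691.20, while the four split clusters use 8 × (4.5 + 4.0 + 4.0 + 2.0) = 116 node-hours, or $556.80, roughly 19% less.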
http://bit.ly/awsevals
