(ADV403) Dynamic Ad Perf. Reporting w/ Redshift: Data Science, Queries at Scale | AWS re:Invent 2014
- 1. November 13, 2014 | Las Vegas, NV
Timon Karnezos, Director Infrastructure, Neustar
Vidhya Srinivasan, Sr. Manager Software Development, Amazon Redshift
- 9. 00
01
10
11
00
01
10
11
P
Space filling Curve for Two Dimensions
- 26. -- Number of ads seen per user.
-- The CTE yields one row per user_id for the selected date range, carrying
-- that user's impression count and summed cost and revenue.
WITH frequency_intermediate AS (
    SELECT
        user_id,
        SUM(1)       AS impression_count,  -- one per impression row
        SUM(cost)    AS cost,
        SUM(revenue) AS revenue
    FROM impressions
    WHERE record_date BETWEEN <...>
    GROUP BY 1
)
-- Number of people who saw N ads.
-- Each CTE row is one user, so SUM(1) here counts the users that share the
-- same impression_count.
SELECT impression_count, SUM(1), SUM(cost), SUM(revenue)
FROM frequency_intermediate
GROUP BY 1;
- 29. CREATE TABLE <...> (
    -- NOTE(review): the table name, each column's ENCODE value and the four
    -- SORTKEY columns are missing from this transcript and are shown as <...>.
    -- The column list matches what the slide-30 query reads from
    -- frequency_intermediate, so that is presumably this table -- confirm
    -- against the original deck.
    record_date      date   ENCODE <...> NOT NULL,
    campaign_id      bigint ENCODE <...> NOT NULL,
    site_id          bigint ENCODE <...> NOT NULL,
    user_id          bigint ENCODE <...> NOT NULL DISTKEY,  -- rows are distributed by user
    impression_count int    ENCODE <...> NOT NULL,
    cost             bigint ENCODE <...> NOT NULL,
    revenue          bigint ENCODE <...> NOT NULL
)
SORTKEY (<...>, <...>, <...>, <...>);
- 30. WITH user_frequency AS (
    -- Per-user totals for each campaign/site pair within the date range;
    -- "frequency" is the user's summed impression_count.
    SELECT
        user_id,
        campaign_id,
        site_id,
        SUM(impression_count) AS frequency,
        SUM(cost)             AS cost,
        SUM(revenue)          AS revenue
    FROM frequency_intermediate
    WHERE record_date BETWEEN <...>
    GROUP BY 1, 2, 3
)
-- For every campaign/site/frequency combination: the number of users
-- (SUM(1) over one-row-per-user input) and their summed cost and revenue.
SELECT
    campaign_id, site_id, frequency,
    SUM(1), SUM(cost), SUM(revenue)
FROM user_frequency
GROUP BY 1, 2, 3;
- 36. -- Basic sessionization query: assemble user activity
-- that ended in a conversion into a timeline.
SELECT <...>
FROM impressions i
JOIN conversions c ON
    i.user_id = c.user_id AND
    -- keep only impressions that happened before the conversion
    i.record_date < c.record_date
ORDER BY i.record_date;
- 38. Hour offset: 3
Position: 1
Position: 2
Hour offset: 12
Hour offset: 16
Position: 3
- 39. -- Sessionize user activity per conversion, partition by campaign (45-day lookback window)
-- Emits one row per (conversion, preceding impression) pair that shares the
-- same user and campaign.
SELECT
    c.record_date AS conversion_date,
    c.event_id    AS conversion_id,
    i.campaign_id AS campaign_id,
    i.site_id     AS site_id,
    i.user_id     AS user_id,
    c.revenue     AS conversion_revenue,
    -- hours between the impression and the conversion it is joined to
    DATEDIFF('hour', i.record_date, c.record_date) AS hour_offset,
    -- running count ordered newest-first within each user/campaign/conversion,
    -- so position 1 is the impression closest to the conversion
    SUM(1) OVER (
        PARTITION BY i.user_id, i.campaign_id, c.event_id
        ORDER BY i.record_date DESC
        ROWS UNBOUNDED PRECEDING
    ) AS position
FROM impressions i
JOIN conversions c ON
    i.user_id = c.user_id AND
    i.campaign_id = c.campaign_id AND
    -- impressions strictly before the conversion and inside the 45-day lookback
    i.record_date < c.record_date AND
    i.record_date > (c.record_date - interval '45 days') AND
    c.record_date BETWEEN <...>;
- 40. -- Compute statistics on sessions (funnel placement, last-touch, site-count, etc...)
-- NOTE(review): the last expression refers to aliases i and c, but this query
-- only reads FROM sessions, and it nests a window function inside AVG().
-- It is kept as shown on the slide (spacing repaired only); it presumably
-- needs the window computed in an inner query first -- confirm before running.
SELECT
    campaign_id,
    site_id,
    conversion_date,
    AVG(position) AS average_position,
    -- revenue is credited only to rows where position = 1 (boolean cast to 0/1)
    SUM(conversion_revenue * (position = 1)::int) AS lta_attributed,
    AVG(COUNT(DISTINCT site_id)
        OVER (PARTITION BY i.user_id, i.campaign_id, c.event_id
              ORDER BY i.record_date ASC
              ROWS UNBOUNDED PRECEDING)) AS average_unique_preceding_site_count
FROM sessions
GROUP BY 1, 2, 3;
- 44. Site A
Site B
Site C
Site A
20%
60%
Site B
90%
Site C
CPM
$0.06
$1.05
$9.50
- 45. Site A
Site B
Site C
Site A
20%
60%
Site B
90%
Site C
CPM
$0.06
$1.05
$9.50
- 46. Site A
Site B
Site C
Site A
20%
60%
Site B
90%
Site C
CPM
$0.06
$1.05
$9.50
- 47. CREATE TABLE <...> (
    -- NOTE(review): the table name, the ENCODE values and the SORTKEY column
    -- list are missing from this transcript and are shown as <...>.
    -- Presumably this is overlap_intermediate, the table the slide-48 query
    -- reads -- confirm against the original deck.
    user_id bigint ENCODE <...> NOT NULL DISTKEY,  -- rows are distributed by user
    site_id bigint ENCODE <...> NOT NULL
)
SORTKEY (<...>);
- 48. WITH co_occurences AS (
    -- Self-join that pairs up sites seen by the same user.
    -- The strict ">" on site_id keeps each unordered site pair once and drops
    -- a site paired with itself.
    -- NOTE(review): the join key is ak_user_id here, while the other slides
    -- name the column user_id -- confirm which name the table really uses.
    SELECT
        oi.site_id  AS site1,
        oi2.site_id AS site2
    FROM overlap_intermediate oi
    JOIN overlap_intermediate oi2 ON
        oi.site_id > oi2.site_id AND
        oi.ak_user_id = oi2.ak_user_id
)
-- Number of joined rows (user matches) for each site pair.
SELECT site1, site2, SUM(1)
FROM co_occurences
GROUP BY 1, 2;
- 49. CREATE TABLE <...> (
    -- NOTE(review): the table name, the ENCODE values and the two SORTKEY
    -- columns are missing from this transcript and are shown as <...>.
    -- Presumably this is overlap_intermediate as read by the slide-50 query
    -- (same columns) -- confirm against the original deck.
    record_date date   ENCODE <...> NOT NULL,
    campaign_id bigint ENCODE <...> NOT NULL,
    site_id     bigint ENCODE <...> NOT NULL,
    user_id     bigint ENCODE <...> NOT NULL DISTKEY  -- rows are distributed by user
)
SORTKEY (<...>, <...>);
- 50. WITH
site_overlap_intermediate AS (
    -- Distinct (user, site, campaign) combinations inside the date range;
    -- the GROUP BY is used purely to de-duplicate.
    SELECT user_id, site_id, campaign_id
    FROM overlap_intermediate
    WHERE record_date BETWEEN <...>
    GROUP BY 1, 2, 3
),
site_co_occurences AS (
    -- Pairs of sites seen by the same user within the same campaign.
    -- The strict ">" on site_id keeps each unordered pair once.
    SELECT
        oi.campaign_id AS c_id,
        oi.site_id     AS site1,
        oi2.site_id    AS site2
    FROM site_overlap_intermediate oi
    JOIN site_overlap_intermediate oi2 ON
        oi.site_id > oi2.site_id AND
        -- joins on user_id: that is the column site_overlap_intermediate
        -- projects (the slide's ak_user_id is not available from the CTE)
        oi.user_id = oi2.user_id AND
        oi.campaign_id = oi2.campaign_id
)
-- Number of shared users for each campaign / site pair.
SELECT c_id, site1, site2, SUM(1)
FROM site_co_occurences
GROUP BY 1, 2, 3;
- 58. $ pg_dump -Fc some_file --table=foo --table=bar
# Split the dump into a schema script and a per-table data file.
$ pg_restore --schema-only --clean -Fc some_file > schema.sql
$ pg_restore --data-only --table=foo -Fc some_file > foo.tsv
# Stage both files in S3 under a dated prefix.
$ aws s3 cp schema.sql s3://metadata-bucket/YYYYMMDD/schema.sql
$ aws s3 cp foo.tsv s3://metadata-bucket/YYYYMMDD/foo.tsv
# Lines starting with ">" are typed at the SQL prompt (presumably psql): run the
# schema script, then load the table data from S3.
> \i schema.sql
> COPY foo FROM 's3://metadata-bucket/YYYYMMDD/foo.tsv' <...>
# or combine 'COPY <...> FROM <...> SSH' and pg_restore/psql
- 59. UNLOAD
-- Exports one day of impressions for a single client to S3.
-- {{...}} tokens are template parameters substituted before the statement runs.
-- Single quotes inside the quoted query are doubled ('') so they do not
-- terminate the UNLOAD query string early.
-- NOTE(review): NULL 'N' is kept as shown; the slide may have read '\N' with
-- the backslash lost in extraction -- confirm against the original deck.
('
    SELECT i.*
    FROM impressions i
    JOIN client_to_campaign_mapping m ON
        m.campaign_id = i.campaign_id
    WHERE i.record_date >= ''{{yyyy}}-{{mm}}-{{dd}}'' - interval ''1 day'' AND
          i.record_date <  ''{{yyyy}}-{{mm}}-{{dd}}'' AND
          m.client_id = <...>
')
TO 's3://{{bucket}}/us_eastern/{{yyyy}}/{{mm}}/{{dd}}/dsdk_events/{{vers}}/impressions/'
WITH CREDENTIALS 'aws_access_key_id={{key}};aws_secret_access_key={{secret}}'
DELIMITER ',' NULL 'N' ADDQUOTES ESCAPE GZIP MANIFEST;
- 64. Workload
Node Count
Node Type
Restore
Maint.
Exec.
Frequency
& Attribution
& Overlap
& Ad Hoc
16
dw2.8xlarge
2h
1h
6h
= $691.20
- 65. Workload
Node Count
Node Type
Restore
Maint.
Exec.
Frequency
8
dw2.8xlarge
1.5h
0.5h
2.5h
Attribution
8
dw2.8xlarge
1.5h
0.5h
2h
Overlap
8
dw2.8xlarge
1h
0.5h
2.5h
Ad-hoc
8
dw2.8xlarge
0h
0.5h
1.5h
= $556.80
(-19%)