-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01_merge_join.sql
122 lines (113 loc) · 3.56 KB
/
01_merge_join.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
--EXPLAIN EXPLAIN EXPLAIN!!!
--learning to read the redshift explain plans is extremely important!
--you want to make sure that your predicate filters are applied to XN Seq Scan operators
--and bad network operations are eliminated as much as possible.
--DS_DIST_NONE and DS_DIST_ALL_NONE are good... most of the others are bad
--Nested Loop is evil!
--be wary of intermediate result sets requiring bad network operations
explain select
beer
,favorite_instrument
,count(distinct d.user_id)
from
beer.events e
inner join
beer.drinkers d
on
e.user_id=d.user_id
and
d.company='venus.com'
group by
beer
,favorite_instrument;
--create tables to implement compression
--sort and distkey optimization is targeted toward merge join
--encodings will be explained in future labs
drop table if exists beer.drinkers2 cascade;
create table if not exists beer.drinkers2
(
id integer encode delta32k
,user_id char(32) encode lzo
,first_name varchar(32) encode bytedict
,last_name varchar(32) encode bytedict
,company varchar(64) encode bytedict
,school varchar(128) encode bytedict
,superhero_power varchar(64) encode bytedict
,good_tipper boolean encode runlength
,favorite_instrument varchar(64) encode lzo
,zip_code char(5) encode lzo
)
distkey(user_id)
sortkey(user_id);
drop table if exists beer.events2 cascade;
create table if not exists beer.events2
(
beer_event_id char(32) encode lzo
,drinking_session_id char(32) encode lzo
,user_id char(32) encode lzo
,beer varchar(64) encode bytedict
,schmooziest_buzzword varchar(32) encode bytedict
,best_thing_said varchar(128) encode lzo
,worst_thing_said varchar(128) encode lzo
,drunken_babble varchar(1024) encode lzo
,likes smallint encode mostly8
,beer_opened_time timestamp encode lzo
)
distkey(user_id)
sortkey(user_id);
--copy data from source tables to target tables
insert into beer.drinkers2 select * from beer.drinkers;
insert into beer.events2 select * from beer.events;
--analyze all the tables to make explain and execution plans accurate
analyze beer.drinkers2;
analyze beer.events2;
--EXECUTE ALL STATEMENTS UP TO THIS POINT
-- http://docs.aws.amazon.com/redshift/latest/dg/c_data_redistribution.html
--without optimization
--Hash operator materializes a hash table for drinkers
--DS_BCAST_INNER on hash join indicates an expensive network operation because
--the intermediate results of the hash table need to be broadcasted between all of the nodes
--in order to insure that the join keys are present everywhere they are needed.
--this is a N*(N-1) operation... where N is the number of slices
explain select
beer
,favorite_instrument
,count(distinct d.user_id)
from
beer.events e
inner join
beer.drinkers d
on
e.user_id=d.user_id
and
d.company='venus.com'
group by
beer
,favorite_instrument;
--with optimization
--merge join is possible because both tables are sorted on join key
--and distributed on the same key. all data is
--colocated and sorted, so it joins like a zipper!
--this query runs an average of twice as fast as the original tables
--note that you need to keep your tables vacuumed in order to keep
--the performance benefit of a merge join
explain select
beer
,favorite_instrument
,count(distinct d.user_id)
from
beer.events2 e
inner join
beer.drinkers2 d
on
e.user_id=d.user_id
and
d.company='venus.com'
group by
beer
,favorite_instrument;
--BONUS GOTCHA!!!
--when distributing multiple tables on the same key...
---the column data types must match exactly... not just the values.
--the value 'hello' will be distributed differently
--for varchar(32) than it will for varchar(33)