--03_optimizing_for_predicate_pruning.sql
--OPTIMIZE FOR PREDICATES
--we are going to optimize the tables for use in where-clause
--filters based on company, user_id, and beer_opened_time.
--in this table, note that we set the top sort key column to raw encoding.
--this stores the company data uncompressed.
--light compression would also be acceptable here.
drop table if exists beer.drinkers3 cascade;
create table if not exists beer.drinkers3
(
id integer
,user_id char(32) encode lzo
,first_name varchar(32) encode bytedict
,last_name varchar(32) encode bytedict
,company varchar(64) encode raw --we put raw encoding on the top sort order field
,school varchar(128) encode bytedict
,superhero_power varchar(64) encode bytedict
,good_tipper boolean encode runlength
,favorite_instrument varchar(64) encode lzo
,zip_code char(5) encode lzo
)
distkey(user_id)
sortkey(company,user_id);
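--optional check (not part of the original script): verify the encodings,
--distribution key, and sort key we just declared by querying pg_table_def.
--pg_table_def only reports tables whose schema is on the search_path,
--so add the beer schema first.
set search_path to '$user', public, beer;
select "column", type, encoding, distkey, sortkey
from pg_table_def
where tablename = 'drinkers3';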
--this table is sorted by the timestamp, allowing for
--range-restricted scans.
--this also makes it cheaper to roll off old data in the future
--(a minimal roll-off sketch follows the loads below).
drop table if exists beer.events3 cascade;
create table if not exists beer.events3
(
beer_event_id char(32) encode lzo
,drinking_session_id char(32) encode lzo
,user_id char(32) encode lzo
,beer varchar(64) encode bytedict
,schmooziest_buzzword varchar(32) encode bytedict
,best_thing_said varchar(128) encode lzo
,worst_thing_said varchar(128) encode lzo
,drunken_babble varchar(1024) encode lzo
,likes smallint encode mostly8
,beer_opened_time timestamp encode lzo
)
distkey(user_id)
sortkey(beer_opened_time);
insert into beer.drinkers3 select * from beer.drinkers2;
insert into beer.events3 select * from beer.events2;
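--optional check on the hand-picked encodings above: analyze compression asks
--redshift to suggest encodings from a sample of the loaded data. note that it
--does not take the sort key into account, so it may suggest compressing
--company even though we deliberately left it raw.
analyze compression beer.drinkers3;
analyze compression beer.events3;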
analyze beer.events3;
analyze beer.drinkers3;
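--a minimal sketch of the roll-off mentioned above: because events3 is sorted
--on beer_opened_time, a delete on an old date range is a range-restricted scan
--and the follow-up vacuum has little to reorder. the cutoff date below is
--purely illustrative; skip this step if you want to keep every row for the
--comparisons that follow.
delete from beer.events3 where beer_opened_time < '2016-08-01';
vacuum delete only beer.events3;
analyze beer.events3;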
--the explain plans for the following queries contain similar seq scan operators.
--note that the second query shows a lower cost for each seq scan.
--the benefit of the sorting and compression isn't always well quantified
--by the explain plan cost... but the timings make it clear.
explain select
beer
,count(distinct user_id)
from
beer.events
where
beer_opened_time between '2016-11-01' and '2016-11-04'
group by
beer;
--this set of tables delivers the results with subsecond timing
--after the first run; that is because the execution plan has
--already been compiled after the first run.
explain select
beer
,count(distinct user_id)
from
beer.events3
where
beer_opened_time between '2016-11-01' and '2016-11-04'
group by
beer;
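--the two explains above only show estimated costs. to see the actual timings
--referenced in these comments, run both selects without the explain keyword
--and then compare their elapsed times in the query log. the lookup below is
--an optional check (not part of the original walkthrough); the like filter
--is a loose match on the query text and may need adjusting.
select query,
       trim(querytxt) as query_text,
       datediff(ms, starttime, endtime) as elapsed_ms
from stl_query
where querytxt like '%beer_opened_time between%'
  and querytxt not like 'explain%'
order by starttime desc
limit 10;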
--this query accesses approximately the same number of records (~3.5M)
--but it takes much longer.
--that is because redshift cannot identify which blocks it needs
--to pull from disk, since the filter column is not part of the sort key.
--the table ends up being fully scanned in this case.
explain select
beer
,count(distinct user_id)
from
beer.events3
where
schmooziest_buzzword in
('3rd generation',
'4th generation',
'5th generation',
'6th generation',
'Adaptive',
'Advanced',
'Ameliorated',
'Assimilated',
'Automated',
'Balanced')
group by
beer;
select
beer
,count(distinct user_id)
from
beer.events
where
schmooziest_buzzword in
('3rd generation',
'4th generation',
'5th generation',
'6th generation',
'Adaptive',
'Advanced',
'Ameliorated',
'Assimilated',
'Automated',
'Balanced')
group by
beer;
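--optional check (not part of the original walkthrough): after running the
--query above in the same session, stl_scan shows how many rows each scan step
--had to read from disk. rows_pre_filter close to the full table size suggests
--that no block pruning happened for the buzzword filter, whereas the
--timestamp-filtered query only reads the blocks covering the requested dates.
select perm_table_name,
       is_rrscan,
       sum(rows_pre_filter) as rows_scanned,
       sum(rows) as rows_returned
from stl_scan
where query = pg_last_query_id()
group by perm_table_name, is_rrscan;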