-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchap13.qmd
220 lines (192 loc) · 5.99 KB
/
chap13.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
---
title: "必知必会第13课:创建高级联结"
author: "黄天元"
format: html
editor: visual
---
## 数据库的连接
```{r}
library(pacman)
p_load(tidyverse,DBI,RSQLite)
mydb = dbConnect(RSQLite::SQLite(), "data/TYSQL.sqlite")
```
## 基于SQL和tidyverse的数据操作
```{r}
# 13.1
dbGetQuery(mydb,"SELECT RTRIM(vend_name) || ' (' || RTRIM(vend_country) || ')'
AS vend_title
FROM Vendors
ORDER BY vend_name;")
tbl(mydb,"Vendors") %>%
arrange(vend_name) %>%
transmute(vend_title = str_c(
str_trim(vend_name,"right")," (",
str_trim(vend_country),")"
))
dbGetQuery(mydb,"SELECT cust_name, cust_contact
FROM Customers AS C, Orders AS O, OrderItems AS OI
WHERE C.cust_id = O.cust_id
AND OI.order_num = O.order_num
AND prod_id = 'RGAN01';")
tbl(mydb,"Customers") -> C
tbl(mydb,"Orders") -> O
tbl(mydb,"OrderItems") -> OI
C %>% inner_join(O,by = "cust_id") %>%
inner_join(OI,by = "order_num") %>%
filter(prod_id == 'RGAN01') %>%
select(cust_name,cust_contact)
# 13.2
dbGetQuery(mydb,"SELECT cust_id, cust_name, cust_contact
FROM Customers
WHERE cust_name = (SELECT cust_name
FROM Customers
WHERE cust_contact = 'Jim Jones');")
tbl(mydb,"Customers") %>%
filter(cust_contact == 'Jim Jones') %>%
pull(cust_name) -> query1
tbl(mydb,"Customers") %>%
select(cust_id, cust_name, cust_contact) %>%
filter(cust_name == query1)
dbGetQuery(mydb,"SELECT c1.cust_id, c1.cust_name, c1.cust_contact
FROM Customers AS c1, Customers AS c2
WHERE c1.cust_name = c2.cust_name
AND c2.cust_contact = 'Jim Jones';")
tbl(mydb,"Customers") %>%
select(cust_id,cust_name,cust_contact) -> c1
tbl(mydb,"Customers") %>%
filter(cust_contact == 'Jim Jones') %>%
select(cust_name) -> c2
c1 %>% inner_join(c2)
# 13.2.2
dbGetQuery(mydb,"SELECT C.*, O.order_num, O.order_date,
OI.prod_id, OI.quantity, OI.item_price
FROM Customers AS C, Orders AS O,
OrderItems AS OI
WHERE C.cust_id = O.cust_id
AND OI.order_num = O.order_num
AND prod_id = 'RGAN01';")
tbl(mydb,"Customers") -> C
tbl(mydb,"Orders") %>%
select(order_num,order_date,cust_id)-> O
tbl(mydb,"OrderItems") %>%
filter(prod_id == 'RGAN01') %>%
select(prod_id,quantity,item_price,order_num)-> OI
C %>% inner_join(O) %>%
inner_join(OI) %>%
collect() %>%
print(width = Inf)
# 13.2.3
dbGetQuery(mydb,"SELECT Customers.cust_id, Orders.order_num
FROM Customers
INNER JOIN Orders ON Customers.cust_id = Orders.cust_id;")
tbl(mydb,"Customers") %>%
inner_join(
tbl(mydb,"Orders"),by = "cust_id"
) %>%
select(cust_id,order_num)
dbGetQuery(mydb,"SELECT Customers.cust_id, Orders.order_num
FROM Customers
LEFT OUTER JOIN Orders ON Customers.cust_id = Orders.cust_id;")
tbl(mydb,"Customers") %>%
left_join(
tbl(mydb,"Orders"),by = "cust_id"
) %>%
select(cust_id,order_num)
dbGetQuery(mydb,"SELECT Customers.cust_id, Orders.order_num
FROM Customers
RIGHT OUTER JOIN Orders ON Customers.cust_id = Orders.cust_id;")
tbl(mydb,"Customers") %>%
right_join(
tbl(mydb,"Orders"),by = "cust_id"
) %>%
select(cust_id,order_num)
dbGetQuery(mydb,"SELECT Customers.cust_id, Orders.order_num
FROM Customers
FULL OUTER JOIN Orders ON Customers.cust_id = Orders.cust_id;")
tbl(mydb,"Customers") %>%
full_join(
tbl(mydb,"Orders"),by = "cust_id"
) %>%
select(cust_id,order_num)
# 13.3
dbGetQuery(mydb,"SELECT Customers.cust_id,
COUNT(Orders.order_num) AS num_ord
FROM Customers
INNER JOIN Orders ON Customers.cust_id = Orders.cust_id
GROUP BY Customers.cust_id;")
tbl(mydb,"Customers") %>%
select(cust_id) %>%
inner_join(tbl(mydb,"Orders"),by = "cust_id") %>%
count(cust_id,name = "num_ord")
dbGetQuery(mydb,"SELECT Customers.cust_id,
COUNT(Orders.order_num) AS num_ord
FROM Customers
LEFT OUTER JOIN Orders ON Customers.cust_id = Orders.cust_id
GROUP BY Customers.cust_id;")
tbl(mydb,"Customers") %>%
select(cust_id) %>%
left_join(tbl(mydb,"Orders"),by = "cust_id") %>%
summarise(num_ord = sum(!is.na(order_num)),.by = cust_id)
```
## 注意
SQLite在3.39.0版本之前不支持右连接(RIGHT JOIN)和全外连接(FULL OUTER JOIN),但从3.39.0版本开始支持这两种连接类型。
## 练习
- 请使用SQL和R两种方式,解决课后的挑战题。
```{r}
#| echo: false
#| eval: false
## SQL
# 参考:https://forta.com/books/0135182794/challenges/
# 1
dbGetQuery(mydb,"SELECT cust_name, order_num
FROM Customers
JOIN Orders ON Customers.cust_id = Orders.cust_id
ORDER BY cust_name;")
tbl(mydb,"Customers") %>%
inner_join(
tbl(mydb,"Orders")
) %>%
select(cust_name, order_num) %>%
arrange(cust_name)
# 2
dbGetQuery(mydb,"SELECT cust_name, order_num
FROM Customers
LEFT OUTER JOIN Orders ON Customers.cust_id = Orders.cust_id
ORDER BY cust_name;")
tbl(mydb,"Customers") %>%
left_join(
tbl(mydb,"Orders")
) %>%
select(cust_name, order_num) %>%
arrange(cust_name)
# 3
dbGetQuery(mydb,"SELECT prod_name, order_num
FROM Products LEFT OUTER JOIN OrderItems
ON Products.prod_id = OrderItems.prod_id
ORDER BY prod_name;")
tbl(mydb,"Products") %>%
left_join(tbl(mydb,"OrderItems")) %>%
select(prod_name,order_num) %>%
arrange(prod_name) %>%
collect()
# 4
dbGetQuery(mydb,"SELECT prod_name, COUNT(order_num) AS orders
FROM Products LEFT OUTER JOIN OrderItems
ON Products.prod_id = OrderItems.prod_id
GROUP BY prod_name
ORDER BY prod_name;")
tbl(mydb,"Products") %>%
left_join(tbl(mydb,"OrderItems") %>%
count(prod_id,order_num,name = "orders")) %>%
replace_na(list(orders = 0)) %>%
summarise(orders = sum(orders),.by = prod_name) %>%
arrange(prod_name)
# 5
dbGetQuery(mydb,"SELECT Vendors.vend_id, COUNT(prod_id) AS prod_num
FROM Vendors
LEFT OUTER JOIN Products ON Vendors.vend_id = Products.vend_id
GROUP BY Vendors.vend_id;")
tbl(mydb,"Vendors") %>%
left_join(tbl(mydb,"Products"),by = "vend_id") %>%
summarise(prod_num = sum(!is.na(prod_id)),.by = vend_id)
```