-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path02_get_embassylocations.R
82 lines (50 loc) · 1.91 KB
/
02_get_embassylocations.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#### The purpose of this script is to use screen scraping to build a dataframe with info about U.S. embassy locations.
# Inputs: webpage url listing countries with embassies and consulates
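# Outputs: embassies.Rda (country names with links to their embassy pages) and
#          embassy_addresses.Rda (cleaned embassy/consulate address strings)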
# Prepare working environment ----
library(tidyverse)
library(rvest)
# Download the landing page ----
embassy_page <- read_html("https://www.usembassy.gov/")

# Extract country names and country-page URLs from the landing page ----
# Every country entry is an <a> element with class "pcs-post-title"; select
# those anchors once and read both attributes from the same node set.
country_anchors <- html_nodes(embassy_page, "a.pcs-post-title")

# the anchor's title attribute (used downstream as the country name)
names <- html_attr(country_anchors, "title")

# the anchor's href: URL of the page listing that country's embassies and
# consulates
links <- html_attr(country_anchors, "href")
# Build the country lookup table ----
# `names` and `links` are read from the same anchors in the same order, so
# they pair up position by position. Build the two-column frame directly:
# the previous approach added an `id` column to two separate frames and
# column-bound them, which produced two columns both named `id` that then had
# to be dropped again.
embassy_locations <- data.frame(names = names, links = links)

# save intermediate df (columns: names, links)
save(embassy_locations, file = "embassies.Rda")
# Get names and addresses for individual embassy locations from html code on each country page ----
# For every country URL, download the page and collect the text of each
# "div.cityname1" element. The per-page character vectors are gathered with
# lapply() and flattened once at the end, instead of growing `results` with
# c() on every iteration.
results <- unlist(
  lapply(links, function(url) {
    page <- tryCatch(
      read_html(url),
      error = function(e) {
        # Still abort the run (as before), but say which page failed.
        stop("Failed to read ", url, ": ", conditionMessage(e), call. = FALSE)
      }
    )
    html_text(html_nodes(page, "div.cityname1"))
  }),
  use.names = FALSE
)
# Clean up address strings ----
# Flatten each entry to a single line: every newline becomes ", ".
test <- gsub("\n", ", ", results)

# Drop everything from the first contact label to the end of the string.
# A pattern such as "Phone[^Phone]+$" does not mean "Phone followed by
# anything": `[^Phone]` is a character class (any character except P, h, o,
# n, e), so the tail was only removed when it happened to contain none of
# those letters. ".*$" removes the tail whatever it contains.
#   - labels are the same, case-sensitive set as before, matched as whole
#     words
#   - the lookahead skips a label followed by whitespace and a letter, so a
#     place name such as "Tel Aviv" is not treated as a telephone label
#   - (?s) lets "." match any character, like the old character classes did
contact_tail <- paste0(
  "(?s)\\b(?:Telephone|Phone|Tel|tel|Fax)\\b",
  "(?![[:space:]]+[[:alpha:]])",
  ".*$"
)
test <- sub(contact_tail, "", test, perl = TRUE)

# Replace every "Address: " label with a single space.
test <- gsub("Address: ", " ", test)

# Remove the last comma together with the text after it (at least one
# character must follow the comma for this to match).
# NOTE(review): presumably meant to drop the dangling ", " left behind by the
# truncation above; when nothing was truncated it removes the final
# comma-separated segment instead -- confirm that is intended.
test <- sub(",[^,]+$", "", test)
# Store the cleaned address strings ----
# One row per scraped entry, in a single column called `address`.
embassy_addresses <- data.frame(address = test)

# write the address table to disk
save(embassy_addresses, file = "embassy_addresses.Rda")