-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget CA dept ed data.R
110 lines (91 loc) · 3.52 KB
/
get CA dept ed data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
## taken from https://github.com/daranzolin/rCAEDDATA/blob/master/data-raw/setup.R
library(readxl)
library(rvest)
library(stringr)
library(tidyverse)
# Landing page that is scraped for links to the individual data-set pages.
url <- "http://www.cde.ca.gov/ds/sd/sd/index.asp"
# href of every <a> inside a <ul> that immediately follows an <h3>.
files <- html_attr(html_nodes(read_html(url), "h3+ ul a"), "href")
# Each href is appended to the directory of the landing page.
file_paths <- paste0("http://www.cde.ca.gov/ds/sd/sd/", files)
# Return the href of every link found inside a <td> that is the second
# child of its parent element.
#
# Args:
#   page: anything read_html() accepts; the callers here pass a page URL.
# Returns: a character vector of href values, one per matching link.
get_text_urls <- function(page) {
  link_nodes <- html_nodes(read_html(page), "td:nth-child(2) a")
  html_attr(link_nodes, "href")
}
# Visit every data-set page and collect the file links it lists, pausing
# two seconds before each request.
all_files <- map(file_paths, function(x) {
  message("Sleeping...")
  Sys.sleep(2)
  get_text_urls(x)
})
file_names <- flatten_chr(all_files)
# Drop .exe and .dbf downloads. The dots are escaped so that only a literal
# ".exe" / ".dbf" excludes a link; unescaped, "." matches any character and
# would also discard URLs that merely contain "exe" or "dbf". Links without
# an href come back as NA and are dropped too, instead of being passed on
# to the download step as NA.
usable_file_names <- file_names[
  !is.na(file_names) & !str_detect(file_names, "\\.exe|\\.dbf")
]
# Download one file into the "CA_eddata" directory.
#
# The local file name is derived from the URL: known URL prefixes are
# removed, every "=" becomes "_", and ".txt" is appended unless the name
# already ends in ".txt" or ".xls".
#
# Args:
#   url: URL of the file to download.
# Returns (invisibly): the list(result, error) produced by purrr::safely();
#   `error` is NULL on success. A failed download raises a warning instead
#   of stopping the caller.
download_ca_file <- function(url) {
  # Literal pieces to strip, in the original order. They are matched with
  # fixed() because "." and "?" are regex metacharacters. The first entry
  # deliberately stops before the "?" so that the last entry can still
  # remove "?cLevel=School&" as a unit, which keeps the generated file
  # names unchanged.
  url_pieces <- c(
    "http://dq.cde.ca.gov/dataquest/dlfile/dlfile.aspx",
    "ftp://ftp.cde.ca.gov/demo/",
    "http://www.cde.ca.gov/ds/sd/sd/documents/",
    "http://dq.cde.ca.gov/dataquest/",
    "?cLevel=School&"
  )
  destfile_name <- url
  for (piece in url_pieces) {
    destfile_name <- str_replace(destfile_name, fixed(piece), "")
  }
  destfile_name <- str_replace_all(destfile_name, fixed("="), "_")

  # Escaped dot: only a real ".txt" / ".xls" extension counts.
  needs_ext <- !grepl("\\.(txt|xls)$", destfile_name)
  destfile_name[needs_ext] <- paste0(destfile_name[needs_ext], ".txt")

  # download.file() does not create the target directory.
  dir.create("CA_eddata", showWarnings = FALSE)

  # safely() must wrap the function itself. Wrapping the call, as before,
  # ran the download first, so an error was never caught. mode = "wb"
  # keeps .xls files intact on Windows and is harmless for text files.
  outcome <- safely(download.file)(
    url,
    destfile = file.path("CA_eddata", destfile_name),
    mode = "wb"
  )
  if (!is.null(outcome$error)) {
    warning(
      "Failed to download ", url, ": ", conditionMessage(outcome$error),
      call. = FALSE
    )
  }
  invisible(outcome)
}
# Fetch each remaining file in turn, waiting two seconds after every
# download.
walk(usable_file_names, function(file_url) {
  download_ca_file(file_url)
  Sys.sleep(2)
})
# Ordered school-year codes "9091" ... "1718": the last two digits of the
# starting year followed by the last two digits of the next year, for
# starting years 1990 through 2017.
year_levels <- sprintf(
  "%02d%02d",
  1990:2017 %% 100L,
  (1990:2017 + 1L) %% 100L
)
# Downloadable data files for graduates and graduates meeting
# University of California (UC)/California State University (CSU)
# entrance requirements by race/ethnic designation and gender by school.
#
# download_ca_file() saves into "CA_eddata/", so the files are looked up
# there when that directory exists; otherwise the working directory is
# scanned, as this step did originally. The file-name pattern is matched
# literally so its dots are not treated as regex wildcards.
graduates <- dir(
  if (dir.exists("CA_eddata")) "CA_eddata" else ".",
  full.names = TRUE
) %>%
  keep(str_detect(., fixed("filesgrads.txt"))) %>%
  map(read_tsv) %>%
  # YEAR becomes a factor in each file before the rows are combined.
  map_df(mutate, YEAR = factor(YEAR, year_levels))
saveRDS(graduates, "graduates.rds")
# Downloadable data files pertaining to students eligible for
# Free or Reduced Price Meals (FRPM).
#
# download_ca_file() saves into "CA_eddata/", so the files are looked up
# there when that directory exists; otherwise the working directory is
# scanned, as this step did originally. The pattern requires ".xls" after
# "frpm" so that a re-run does not try to read this step's own output,
# "frpm.rds", as a spreadsheet.
frpm <- dir(
  if (dir.exists("CA_eddata")) "CA_eddata" else ".",
  full.names = TRUE
) %>%
  keep(str_detect(., "frpm.*\\.xls")) %>%
  # The second sheet is read without column names ...
  map(read_xls, sheet = 2, col_names = FALSE) %>%
  map_df(function(x) {
    # ... its first row supplies the column names, and the first two rows
    # are then dropped from the data.
    header <- unlist(x[1, ])
    d <- x[-c(1:2), ]
    names(d) <- header
    d
  })
saveRDS(frpm, "frpm.rds")
# California Longitudinal Pupil Achievement Data System (CALPADS) cohort
# outcome data reported by race/ethnicity, program participation, and gender.
#
# download_ca_file() saves into "CA_eddata/", so the files are looked up
# there when that directory exists; otherwise the working directory is
# scanned, as this step did originally. The file-name pattern is matched
# literally so its dots are not treated as regex wildcards.
cohorts <- dir(
  if (dir.exists("CA_eddata")) "CA_eddata" else ".",
  full.names = TRUE
) %>%
  keep(str_detect(., fixed("filescohort.txt"))) %>%
  map(read_tsv) %>%
  # Year becomes a factor in each file before the rows are combined.
  map_df(mutate, Year = factor(Year, year_levels))
saveRDS(cohorts, "cohorts.rds")
# Downloadable data files pertaining to the California Longitudinal Pupil
# Achievement Data System (CALPADS) UPC Source File for grades K-12.
#
# download_ca_file() saves into "CA_eddata/", so the files are looked up
# there when that directory exists; otherwise the working directory is
# scanned, as this step did originally. The dot before "xls" is escaped so
# it only matches a literal ".".
#
# NOTE(review): unlike the other data sets in this section, the result
# stays a list of per-file data frames and is not written to disk here.
calpads <- dir(
  if (dir.exists("CA_eddata")) "CA_eddata" else ".",
  full.names = TRUE
) %>%
  keep(str_detect(., "cupc[0-9]{4}\\.xls")) %>%
  map(read_xls)
# Downloadable data files for school-level enrollment by
# racial/ethnic designation, gender, and grade.
# Read one tab-separated file and add a YEAR column taken from its name.
#
# Args:
#   file: path to a tab-separated file whose name contains either a
#     "NNNN-NN" span or a run of four digits.
# Returns: the parsed data with YEAR set to the first such match in the
#   file name (NA when the name contains none).
read_and_mutate_with_year <- function(file) {
  # Match against the file name only, so digits in a directory component
  # of the path are never mistaken for the year.
  file_year <- str_extract(basename(file), "[0-9]{4}-[0-9]{2}|[0-9]{4}")
  read_tsv(file) %>%
    mutate(YEAR = file_year)
}
# Combine the enrollment files, tagging each with the year that
# read_and_mutate_with_year() extracts from its file name.
#
# download_ca_file() saves into "CA_eddata/", so the files are looked up
# there when that directory exists; otherwise the working directory is
# scanned, as this step did originally. The file-name pattern is matched
# literally so its dots are not treated as regex wildcards.
#
# NOTE(review): any YEAR value missing from the levels below becomes NA.
# The list stops at "2016-17" - confirm that no later file is expected.
enrollments <- dir(
  if (dir.exists("CA_eddata")) "CA_eddata" else ".",
  full.names = TRUE
) %>%
  keep(str_detect(., fixed("filesenr.asp.txt"))) %>%
  map(read_and_mutate_with_year) %>%
  # CDS_CODE is converted to character in every file before row-binding.
  map_df(mutate, CDS_CODE = as.character(CDS_CODE)) %>%
  mutate(YEAR = factor(YEAR, levels = c(
    "9495", "9596", "9697", "9798", "9899", "9900",
    "0001", "0102", "0203", "0304", "0405", "0506", "0607",
    "2007-08", "2008-09", "2009-10", "2010-11", "2011-12",
    "2012-13", "2013-14", "2014-15", "2015-16", "2016-17"
  )))
saveRDS(enrollments, "enrollments.rds")