# noaa_scraper.py
# link to the libraries needed by this notebook
import os
import numpy as np
from html.parser import HTMLParser
from urllib import parse
from urllib.request import urlopen
from urllib.request import urlretrieve
# the following are needed for the progress bar
from ipywidgets import FloatProgress
from IPython.display import display
# use glob to list the downloaded filenames in a directory
from glob import glob
# We are going to create a class called LinkParser that inherits some
# methods from HTMLParser, which is why it is passed into the definition.
class LinkParser(HTMLParser):
    # This is a function that HTMLParser normally has,
    # but we are adding some functionality to it.
    def handle_starttag(self, tag, attrs):
        # We are looking for the beginning of a link. Links normally look
        # like <a href="www.someurl.com"></a>
        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    # We are grabbing the new URL. We are also adding the
                    # base URL to it. For example:
                    # www.netinstructions.com is the base and
                    # somepage.html is the new URL (a relative URL).
                    #
                    # We combine a relative URL with the base URL to create
                    # an absolute URL like:
                    # www.netinstructions.com/somepage.html
                    newUrl = parse.urljoin(self.baseUrl, value)
                    # And add it to our collection of links:
                    self.links = self.links + [newUrl]
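    # For example (illustrative values, not from the original notebook):
    #   parse.urljoin('https://www.example.gov/pub/data/', 'station_01.txt')
    #   returns 'https://www.example.gov/pub/data/station_01.txt'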
    # This is a new function that we are creating to get links;
    # our spider() function will call it.
    def getLinks(self, url):
        self.links = []
        # Remember the base URL, which will be important when creating
        # absolute URLs.
        self.baseUrl = url
        # Use the urlopen function from the standard Python 3 library.
        response = urlopen(url)
        # Make sure that we are looking at HTML and not other things that
        # are floating around on the internet (such as
        # JavaScript files, CSS, or PDFs, for example).
        contentType = response.getheader('Content-Type') or ''
        if 'text/html' in contentType:
            htmlBytes = response.read()
            # Note that feed() handles strings well, but not bytes
            # (a change from Python 2.x to Python 3.x).
            htmlString = htmlBytes.decode("utf-8")
            self.feed(htmlString)
            return '', self.links  # htmlString, self.links
        if 'text/plain' in contentType:
            return url, []
        else:
            return "", []
# And finally here is our spider. It takes in a URL, a word to find,
# and the number of pages to search through before giving up.
def noaa_spider(url, word, maxPages):
    if not os.path.isdir('data'):
        os.mkdir('data')
    pagesToVisit = [url]
    textfiles = []
    numberVisited = 0
    foundWord = False
    urlsVisited = set()
    foundFiles = set()
    progressBar = FloatProgress(min=0, max=maxPages)
    display(progressBar)
    progressBar.value = 0
    # The main loop. Create a LinkParser and get all the links on the page.
    # Also search the page for the word or string.
    # Our getLinks function returns the web page
    # (this is useful for searching for the word)
    # and a set of links from that web page
    # (this is useful for deciding where to go next).
    while numberVisited < maxPages and pagesToVisit != [] and not foundWord:
        # Start from the beginning of our collection of pages to visit:
        url = pagesToVisit[0]
        pagesToVisit = pagesToVisit[1:]
        # print(numberVisited, "Visiting:", url)
        parser = LinkParser()
        if url not in urlsVisited:
            urlsVisited.add(url)
            if '.txt' in url:
                # A text file: keep it if its URL contains the search word,
                # and download it into ./data if we do not already have it.
                if word in url:
                    textfiles = textfiles + [url]
                    foundFiles.add(url)
                    print("FOUND ", url)
                    name = './data/' + url.split('/')[-1]
                    if not os.path.isfile(name):
                        print('downloading...', name)
                        urlretrieve(url, name)
                    else:
                        print('file exists...', name)
            else:
                # An HTML page: parse it and queue its links for later visits.
                numberVisited = numberVisited + 1
                progressBar.value = numberVisited
                data, links = parser.getLinks(url)
                # Add the pages that we found to the end of our collection
                # of pages to visit:
                pagesToVisit = pagesToVisit + links
    return foundFiles
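# Example usage of noaa_spider (a minimal sketch; the URL and search word
# are placeholders, not values from the original notebook):
#
#   found = noaa_spider('https://www.example.gov/pub/data/some_station_dir/',
#                       'Boulder', 50)
#   # 'found' is the set of .txt URLs that matched the word and were
#   # downloaded into ./data/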
def read_data_column(filename, col=8):
    # Read one whitespace-delimited column (column 8 by default) from a
    # downloaded text file and convert the values from Celsius to Fahrenheit.
    f = open(filename, 'r')
    air_temperature = []
    for row in f:
        data = row.split()
        temp = float(data[col])
        if temp < -9000:  # Check for valid data; missing values are flagged with large negative numbers
            if air_temperature == []:  # First point in the series
                temp = 0
            else:
                temp = air_temperature[-1]  # Repeat the previous data point
        else:
            temp = temp * 9.0 / 5.0 + 32
        air_temperature.append(temp)
    f.close()
    return air_temperature
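# Example usage of read_data_column (illustrative only; the real NOAA column
# layout may differ, and col=8 is assumed to hold air temperature in degrees C):
#
#   temps = read_data_column('./data/some_station_file.txt', col=8)
#   # a reading of 10.0 (degrees C) in column 8 is stored as 10.0 * 9/5 + 32 = 50.0 (degrees F)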
def get_airtemperature_from_files():
    # Read all of the downloaded .txt files in the ./data directory.
    files = glob('./data/*.txt')
    files.sort()
    progressBar = FloatProgress(min=0, max=len(files))
    display(progressBar)
    progressBar.value = 0
    air_temperature = []
    for filename in files:
        progressBar.value = progressBar.value + 1
        print('reading...', filename)
        air_temperature = air_temperature + read_data_column(filename)
    return air_temperature
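# Example usage (a sketch that assumes the files were already downloaded by
# noaa_spider and that matplotlib is available; it is not imported above):
#
#   temps = get_airtemperature_from_files()
#   import matplotlib.pyplot as plt
#   plt.plot(temps)
#   plt.ylabel('Air temperature (degrees F)')
#   plt.show()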
def get_noaa_temperatures(url, name, maxdepth=100):
    # Call the main noaa_spider function to download files whose URLs
    # contain `name`, then read the air temperatures from those files.
    files = noaa_spider(url, name, maxdepth)
    return get_airtemperature_from_files()
print(f"running as {__name__}")