-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhtml_to_json.py
115 lines (101 loc) · 3.93 KB
/
html_to_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import sys
import json
from bs4 import BeautifulSoup
# Note: the email line for faculty cards is slightly different, so may not work for faculty cards
# Example input (html):
# !! Note: ASSUMES that each card is separated by an empty line AND file ends with 2 empty lines !!
# <div class="ui card">
# <div class="image">
# <img src="assets/headshots/ohjun.jpg" alt="Oh Jun Kweon head shot" />
# </div>
# <div class="content">
# <div class="header">
# Oh Jun Kweon
# </div>
# <div class="meta">
# he/him
# </div>
# <div class="description">
# <a class="email" href="mailto:[email protected]" target="_blank" data-tooltip="[email protected]"><i class="icon envelope"></i></a>
# <span data-tooltip="macOS"><i class="icon apple"></i></span>
# <span data-tooltip="VS Code"><i class="icon-vscode"></i></span>
# <span data-tooltip="Hometown: Dubai, UAE"><i class="icon map pin"></i></span>
# </div>
# </div>
# <div class="emoji content">
# <span data-tooltip="My blood is 83% coffee"><em data-emoji="coffee"></em></span>
# <span data-tooltip="Greatest invention by humans"><em data-emoji="sushi"></em></span>
# <span data-tooltip="Worst invention by humans"><em data-emoji="alarm_clock"></em></span>
# <span data-tooltip="Always looking for book recs"><em data-emoji="book"></em></span>
# </div>
# </div>
# Example output (json):
# {
# "ohjun": {
# "uniqname": "ohjun",
# "name": "Oh Jun Kweon",
# "pronoun": "he/him",
# "oss": ["macos"],
# "ides": ["vscode"],
# "hometown": "Dubai, UAE",
# "emojis": [
# ["coffee", "My blood is 83% coffee"],
# ["sushi", "Greatest invention by humans"],
# ["alarm_clock", "Worst invention by humans"],
# ["book", "Always looking for book recs"]
# ]
# }
# }
# Note: this function is based on code generated by ChatGPT
def extract_data(html_content):
    """Parse the HTML of a single card into a JSON-serializable dict.

    Args:
        html_content: HTML source of one ``<div class="ui card">`` element,
            laid out like the example input in the header of this file.

    Returns:
        A dict with the keys ``uniqname``, ``name``, ``pronoun``, ``ides``,
        ``oss``, ``hometown`` and ``emojis``.  ``oss`` and ``ides`` hold the
        lower-cased tooltips that match a known OS / IDE, ``hometown`` is the
        text after the ``Hometown:`` prefix ('' when absent), and ``emojis``
        is a list of ``(emoji_name, tooltip)`` pairs in document order.

    Raises:
        AttributeError: if the header, meta, description or emoji ``<div>``
            is missing from the card.
        TypeError: if the card has no ``<a class="email">`` link.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    name = soup.find('div', class_='header').text.strip()
    pronoun = soup.find('div', class_='meta').text.strip()

    # The href has the form "mailto:<uniqname>@<domain>": drop the
    # 7-character "mailto:" prefix and keep the part before the '@'.
    uniqname = soup.find('a', class_='email')['href'][7:].split('@')[0]

    os_options = ('macos', 'windows', 'linux')
    ide_options = ('vs code', 'emacs', 'vim', 'xcode', 'visual studio')
    # NOTE(review): the example output in the file header shows "vscode",
    # but the value stored here is the lower-cased tooltip, i.e. "vs code".
    # Confirm which spelling the consumer of the JSON expects.

    oss = []
    ides = []
    hometown = ''
    description = soup.find('div', class_='description')
    # Only spans that actually carry a tooltip are relevant; filtering on the
    # attribute avoids a KeyError on purely decorative spans.
    for span in description.find_all('span', attrs={'data-tooltip': True}):
        tooltip = span['data-tooltip']
        if tooltip.startswith('Hometown:'):
            hometown = tooltip[len('Hometown:'):].strip()
            continue
        key = tooltip.strip().lower()
        if key in os_options:
            oss.append(key)
        elif key in ide_options:
            ides.append(key)

    # class_ with a space matches the exact attribute string "emoji content".
    emoji_block = soup.find('div', class_='emoji content')
    # Read each emoji from inside the span that holds its tooltip, so a span
    # without an <em> (or a stray <em>) cannot shift the remaining pairs.
    emojis = []
    for span in emoji_block.find_all('span', attrs={'data-tooltip': True}):
        em = span.find('em', attrs={'data-emoji': True})
        if em is not None:
            emojis.append((em['data-emoji'], span['data-tooltip']))

    return {
        "uniqname": uniqname,
        "name": name,
        "pronoun": pronoun,
        "ides": ides,
        "oss": oss,
        "hometown": hometown,
        "emojis": emojis,
    }
def split_cards(lines):
    """Group text lines into per-card HTML chunks.

    Cards are separated by one or more empty lines.  Runs of consecutive
    empty lines never produce an empty chunk, and a final card is returned
    even when it is not followed by an empty line.

    Args:
        lines: iterable of text lines (for example an open text file), each
            normally ending in a newline.

    Returns:
        A list of strings, one per card, each the concatenation of that
        card's lines in their original order.
    """
    cards = []
    current = []
    for line in lines:
        # Only a truly empty line is a separator; whitespace-only lines are
        # kept as part of the current card.
        if line.rstrip('\r\n'):
            current.append(line)
        elif current:
            cards.append(''.join(current))
            current = []
    if current:
        # Input ended without a trailing empty line: keep the last card.
        cards.append(''.join(current))
    return cards


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print('Usage: python html_to_json.py [INPUT_FILENAME] [OUTPUT_FILENAME]')
        # Non-zero status so shells and scripts can detect the usage error;
        # sys.exit is used because the bare exit() builtin is provided by the
        # site module and is not guaranteed to exist.
        sys.exit(1)
    INPUT_FILENAME = sys.argv[1]
    OUTPUT_FILENAME = sys.argv[2]

    # Read the input and split it into one HTML chunk per card.
    with open(INPUT_FILENAME, 'r', encoding='utf-8') as in_file:
        card_chunks = split_cards(in_file)

    # Build the uniqname -> person-data mapping.
    people = {}
    for card_html in card_chunks:
        data = extract_data(card_html)
        people[data['uniqname']] = data

    # Write the mapping as indented JSON.
    with open(OUTPUT_FILENAME, 'w', encoding='utf-8') as out_file:
        json.dump(people, out_file, indent=4)