-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhtml_parser.py
40 lines (29 loc) · 957 Bytes
/
html_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from bs4 import BeautifulSoup
import json
# Convert Zotero HTML bibliography exports into JSON arrays of entry markup.
#
# For each selected section, reads zotero_export/LinkedMusic_<section>.html,
# collects every <div class="csl-entry"> element as an HTML string, sorts the
# strings, and writes them to <section>/content.json.

# Ask which export(s) to convert; the answer is lower-cased, so 'R' == 'r'.
print('References (r, R), or all (a,A)?\n')
choice = input().lower()

# input_list[i] is the shortcut key for full_list[i]. The trailing 'a' has no
# counterpart in full_list because it selects every section at once.
input_list = ['r', 'a']
full_list = ['references']

if choice not in input_list:
    print('\nTry again, the input was not valid.\n\n')
    # Raise SystemExit directly rather than calling exit(): exit() is a
    # convenience injected by the site module for interactive use and is not
    # guaranteed to exist in every run mode. The exit status is unchanged.
    raise SystemExit

if choice == 'a':
    parse_list = full_list
else:
    parse_list = [full_list[input_list.index(choice)]]

export_folder = 'zotero_export/'

for section in parse_list:
    html_file_name = f'LinkedMusic_{section}.html'
    path = f'{section}/content.json'

    # Decode explicitly instead of relying on the platform's locale encoding,
    # so non-ASCII characters in the entries are read the same everywhere.
    # NOTE(review): assumes the Zotero export is UTF-8 -- confirm.
    with open(export_folder + html_file_name, encoding='utf-8') as f:
        html_soup = BeautifulSoup(f, 'html.parser')

    # Keep each bibliography entry as its full HTML markup (tag included).
    content_array = [
        str(html_tag)
        for html_tag in html_soup.find_all('div', {'class': 'csl-entry'})
    ]

    # Echo the entries in document order, before sorting.
    print(content_array)

    # Plain string sort of the serialized entries. Presumably this yields
    # author order because each entry's text starts with the author -- this
    # depends on the citation style used for the export.
    content_array.sort()

    # NOTE: the '<section>/' output directory must already exist.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(content_array, f, indent=4)