collect.py
import internetarchive as ia
import json
import pathlib as pl
import requests
from bs4 import BeautifulSoup
import nltk
import os
# Set up NLTK
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')
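# NOTE: these models are not used in this script itself; they appear to be
# downloaded up front for a later NER step (cf. the commented-out ner_dir below).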
# Create directories and set path variables
pl.Path('material/hocr').mkdir(parents=True, exist_ok=True)
cleaned_dir = 'material/cleaned'
# ner_dir = 'material/ner'
# Get list of available volumes for the publisher (updated query)
query = 'creator:"Chicago, Photoplay Magazine Publishing Company"'
volumes_list = []
for item in ia.search_items(query):
    volumes_list.append(item['identifier'])
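# search_items yields one result dict per item; by default only the
# identifier field is populated, which is all that is needed here.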
# TESTING: small sample, delete old files
# volumes_list = ['pho1314chic', 'photo42chic', 'photoplay51chic', 'photoplayvolume222chic']
# Retrieve metadata for volumes
metadata_list = []
print('Retrieving metadata...')
for vol in volumes_list:
    item = ia.get_item(vol)
    metadata = {
        'identifier': item.metadata.get('identifier', 'N/A'),
        'title': item.metadata.get('title', 'N/A'),
        'journal-title': item.metadata.get('journal-title', 'N/A'),
        'volume': item.metadata.get('volume', 'N/A'),
        'year': item.metadata.get('year', 'N/A'),
        'date-string': item.metadata.get('date-string', 'N/A'),
        'publisher': item.metadata.get('creator', 'N/A')  # the publisher is stored in IA's creator field (cf. the search query)
    }
    metadata_list.append(metadata)
# Save metadata to a JSON file
with open('material/metadata.json', 'w') as json_file:
    json.dump(metadata_list, json_file, indent=4)
print('Metadata saved to metadata.json.')
# Download HOCR files
for vol in volumes_list:
    hocr_file_path = f'material/hocr/{vol}_hocr.html'
    if pl.Path(hocr_file_path).is_file():
        print(f'HOCR file for {vol} already exists. Skipping download.')
    else:
        print(f'Downloading HOCR file for {vol}...')
        url = f'https://archive.org/download/{vol}/{vol}_hocr.html'
        response = requests.get(url)
        if response.status_code == 200:
            with open(hocr_file_path, 'w', encoding='utf-8') as file:
                file.write(response.text)
            print(f'Downloaded HOCR file for {vol}.')
        else:
            print(f'Failed to download HOCR file for {vol} (status {response.status_code}).')
print('HOCR download completed.')
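# Alternative sketch, not used here: the internetarchive client can fetch the
# file directly instead of building the URL by hand. This assumes the
# {vol}_hocr.html derivative is listed among the item's files:
#
#   ia.download(vol, files=[f'{vol}_hocr.html'],
#               destdir='material/hocr', no_directory=True)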
# TODO: use txt instead of hocr
# Extract text from HOCR file
def extract_text_from_hocr(hocr_file_path):
    """Return the list of OCR word tokens (.ocrx_word spans) from an hOCR file."""
    with open(hocr_file_path, 'r', encoding='utf-8') as hocr_file:
        soup = BeautifulSoup(hocr_file, 'html.parser')
    text = [word.get_text().strip() for word in soup.select('.ocrx_word')]
    return text
# Clean text
def clean_text(text):
    """Remove hyphenation artifacts and common OCR noise characters."""
    dehyphenated = text.replace('- ', '')  # rejoin words split by line-break hyphenation
    cleaned = dehyphenated.replace("\u25a0", " ")  # replace "\u25a0" (black square, OCR noise) with a space
    cleaned = cleaned.replace("\u2014", " ")  # replace "\u2014" (em dash) with a space
    return cleaned
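# For example: clean_text('photo- play') -> 'photoplay'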
# Combine metadata with the cleaned text (NER results are added in a later step)
combined_data = []
for metadata in metadata_list:
    identifier = metadata['identifier']
    hocr_file_path = f'material/hocr/{identifier}_hocr.html'
    # Check if the HOCR file exists for the current identifier
    if not pl.Path(hocr_file_path).is_file():
        print(f'HOCR file for {identifier} does not exist. Skipping extraction.')
        continue
    print(f'Cleaning {identifier}...')
    # Extract text from HOCR file
    hocr_text = extract_text_from_hocr(hocr_file_path)
    # Clean text
    cleaned_text = clean_text(' '.join(hocr_text))
    metadata['text'] = cleaned_text
    combined_data.append(metadata)
# Save combined data to collected_data.json
with open('material/collected_data.json', 'w') as json_file:
    json.dump(combined_data, json_file, indent=4)
# Delete the intermediate metadata.json
if os.path.exists('material/metadata.json'):
    os.remove('material/metadata.json')
    print('metadata.json deleted.')
print('Cleaning completed. Combined data saved to collected_data.json.')
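# A downstream step can load the combined corpus with, for example:
#
#   with open('material/collected_data.json', 'r', encoding='utf-8') as f:
#       volumes = json.load(f)  # 'volumes' is an illustrative name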