-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsplitter.py
105 lines (87 loc) · 3.87 KB
/
splitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import unicodedata
import shutil
import sys
import os
import re
# print term width horizontal line
def hz_line(character='-'):
terminal_width = shutil.get_terminal_size().columns
line = character * terminal_width
print(line)
""" def sanitize_content(text):
# Normalize unicode characters to their closest ASCII representation
text = unicodedata.normalize('NFKC', text)
# Remove hyphenation at the end of lines and join words
text = re.sub(r'-\s*\n', '', text)
# Condense all whitespace except for paragraph breaks into a single space
text = re.sub(r'[^\S\n]+', ' ', text)
# Replace multiple newlines with a double newline (to preserve paragraph breaks)
text = re.sub(r'\n\s*\n', '\n\n', text)
return text """
""" def sanitize_content(text):
# Remove hyphenation at the end of lines and join words
text = re.sub(r'-\s*\n\s*', '', text)
# Replace multiple consecutive spaces with a single space
text = re.sub(r' {2,}', ' ', text)
# Ensure that paragraph breaks are maintained
text = re.sub(r'\n{2,}', '\n\n', text)
return text """
""" def sanitize_content(text):
# Normalize unicode characters to their closest ASCII representation
text = unicodedata.normalize('NFKC', text)
# Remove hyphenation at the end of lines and join words
text = re.sub(r'-\s*\n', '', text)
# Replace all whitespace characters (including non-breaking spaces) with a single space
text = re.sub(r'\s+', ' ', text)
return text """
# text preprocessing & sanitization
def sanitize_content(text):
# Remove hyphenation at the end of lines and join words
text = re.sub(r'-\s*\n', '', text)
# Replace multiple spaces (but not newlines) with a single space
# text = re.sub(r'[^\S\n]+', ' ', text)
# text = re.sub(r' {2,}', ' ', text)
# Replace multiple whitespace characters (including non-breaking spaces) with a single space
text = re.sub(r'[^\S\r\n]+', ' ', text)
# Merge lines into paragraphs
text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
text = re.sub(r'[^\S\r\n]+', ' ', text)
return text
def split_text_for_translation(file_path, char_limit):
hz_line()
print(f"Current character split limit (from next empty line): {char_limit} characters.")
hz_line()
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
# Sanitize the text for hyphenation
content = sanitize_content(content)
sections = []
current_section = ""
for line in content.split('\n'):
if len(current_section) + len(line) < char_limit or not line.strip():
current_section += line + '\n'
else:
sections.append(current_section)
current_section = line + '\n'
if current_section:
sections.append(current_section)
return sections
def main(file_path, char_limit):
base_name = os.path.splitext(os.path.basename(file_path))[0]
output_dir = f"{base_name}-splits"
os.makedirs(output_dir, exist_ok=True)
sections = split_text_for_translation(file_path, char_limit)
num_digits = len(str(len(sections))) # Calculate the number of digits for formatting
for i, section in enumerate(sections, start=1):
formatted_index = str(i).zfill(num_digits) # Pad the index with leading zeros
output_filename = os.path.join(output_dir, f"{base_name}_split_{formatted_index}.txt")
with open(output_filename, 'w', encoding='utf-8') as output_file:
output_file.write(section)
print(f"Section {formatted_index} written to {output_filename}")
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python scriptname.py <inputfile>")
sys.exit(1)
inputfile = sys.argv[1]
char_limit = 5000 # Set your character limit here
main(inputfile, char_limit)