#!/usr/bin/env python
# test_dedupe_chars.py -- forked from jsvine/pdfplumber.
import logging
import os
import unittest
import pdfplumber
# Disable every logging call of severity ERROR and below, process-wide, from
# the moment this module is imported (presumably to keep parser noise out of
# the test output -- confirm against the project's other test modules).
logging.disable(logging.ERROR)
# Absolute path of the directory containing this file; the PDF fixture paths
# used by the tests below are joined onto it.
HERE = os.path.abspath(os.path.dirname(__file__))
class Test(unittest.TestCase):
    """Tests for ``Page.dedupe_chars()`` against PDFs containing doubled chars.

    Each test compares what is extracted from a page as-is with what is
    extracted after calling ``dedupe_chars()`` on it.

    NOTE(review): ``setup_class`` / ``teardown_class`` are the pytest
    xunit-style hook names; the plain ``unittest`` runner looks for
    ``setUpClass`` / ``tearDownClass`` instead, so this class relies on
    being run under pytest.
    """

    @classmethod
    def setup_class(cls):
        """Open the issue-71 fixture once and share it via ``cls.pdf``."""
        path = os.path.join(HERE, "pdfs/issue-71-duplicate-chars.pdf")
        cls.pdf = pdfplumber.open(path)

    @classmethod
    def teardown_class(cls):
        """Close the PDF opened in ``setup_class``."""
        cls.pdf.close()

    def test_extract_table(self):
        """Last line of table cell [1][1] is doubled before dedupe, clean after."""
        page = self.pdf.pages[0]
        table_without_drop_duplicates = page.extract_table()
        table_with_drop_duplicates = page.dedupe_chars().extract_table()
        last_line_without_drop = table_without_drop_duplicates[1][1].split("\n")[-1]
        last_line_with_drop = table_with_drop_duplicates[1][1].split("\n")[-1]

        assert (
            last_line_without_drop
            == "微微软软 培培训训课课程程:: 名名模模意意义义一一些些有有意意义义一一些些"
        )
        assert last_line_with_drop == "微软 培训课程: 名模意义一些有意义一些"

    def test_extract_words(self):
        """Check the last extracted word's box and text with and without dedupe."""
        page = self.pdf.pages[0]
        x0 = 440.143
        # Only the right edge is expected to differ between the two variants.
        x1_without_drop = 534.992
        x1_with_drop = 534.719
        # Two "top" values are accepted; the variable names attribute them
        # to Windows and Linux respectively.
        top_windows = 791.849
        top_linux = 794.357
        bottom = 802.961

        last_words_without_drop = page.extract_words()[-1]
        last_words_with_drop = page.dedupe_chars().extract_words()[-1]

        assert round(last_words_without_drop["x0"], 3) == x0
        assert round(last_words_without_drop["x1"], 3) == x1_without_drop
        assert round(last_words_without_drop["top"], 3) in (top_windows, top_linux)
        assert round(last_words_without_drop["bottom"], 3) == bottom
        assert last_words_without_drop["upright"] == 1
        assert (
            last_words_without_drop["text"]
            == "名名模模意意义义一一些些有有意意义义一一些些"
        )

        assert round(last_words_with_drop["x0"], 3) == x0
        assert round(last_words_with_drop["x1"], 3) == x1_with_drop
        assert round(last_words_with_drop["top"], 3) in (top_windows, top_linux)
        assert round(last_words_with_drop["bottom"], 3) == bottom
        assert last_words_with_drop["upright"] == 1
        assert last_words_with_drop["text"] == "名模意义一些有意义一些"

    def test_extract_text(self):
        """Last line of page text is doubled before dedupe, clean after."""
        page = self.pdf.pages[0]
        last_line_without_drop = page.extract_text().split("\n")[-1]
        last_line_with_drop = page.dedupe_chars().extract_text().split("\n")[-1]

        assert (
            last_line_without_drop
            == "微微软软 培培训训课课程程:: 名名模模意意义义一一些些有有意意义义一一些些"
        )
        assert last_line_with_drop == "微软 培训课程: 名模意义一些有意义一些"

    def test_extract_text2(self):
        """Fifth line of the second issue-71 fixture is clean after dedupe."""
        path = os.path.join(HERE, "pdfs/issue-71-duplicate-chars-2.pdf")
        # Context manager so the file handle is released even if the
        # assertion fails.
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            assert (
                page.dedupe_chars().extract_text(y_tolerance=6).splitlines()[4]
                == "UE 8. Circulation - Métabolismes"
            )

    def test_extra_attrs(self):
        """Check ``dedupe_chars(extra_attrs=...)`` line by line on issue-1114.

        The page is extracted once untouched and once per ``extra_attrs``
        combination; every extracted line is compared with the matching
        ground-truth row.
        """
        path = os.path.join(HERE, "pdfs/issue-1114-dedupe-chars.pdf")

        def dup_chars(s: str) -> str:
            """Return *s* with every character except spaces doubled."""
            return "".join((char if char == " " else char + char) for char in s)

        # Rows are (text, should_dedup, dup_text):
        #   should_dedup  False -> the line is always expected as dup_text
        #                 True  -> the line is expected as text once deduped
        #                 str   -> attribute name: expected as dup_text when
        #                          that attribute is in extra_attrs, else text
        #   dup_text      True  -> dup_chars(text)
        #                 False -> text itself
        #                 str   -> used literally
        ground_truth = (
            ("Simple", False, False),
            ("Duplicated", True, True),
            ("Font", "fontname", True),
            ("Size", "size", True),
            ("Italic", "fontname", True),
            ("Weight", "fontname", True),
            ("Horizontal shift", False, "HHoorrizizoonntatal ls shhifitft"),
            ("Vertical shift", False, True),
        )
        # Resolve the boolean dup_text shorthands into concrete strings.
        gt = []
        for text, should_dedup, dup_text in ground_truth:
            if isinstance(dup_text, bool):
                dup_text = dup_chars(text) if dup_text else text
            gt.append((text, should_dedup, dup_text))

        # "no_dedupe" is a sentinel meaning "use the page as-is"; the tuples
        # are passed straight through as extra_attrs.
        keys_list = ["no_dedupe", (), ("size",), ("fontname",), ("size", "fontname")]

        # Context manager so the file handle is released even if an
        # assertion fails part-way through.
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            for keys in keys_list:
                if keys != "no_dedupe":
                    filtered_page = page.dedupe_chars(tolerance=2, extra_attrs=keys)
                else:
                    filtered_page = page

                for i, line in enumerate(
                    filtered_page.extract_text(y_tolerance=5).splitlines()
                ):
                    text, should_dedup, dup_text = gt[i]
                    if keys == "no_dedupe":
                        # Without dedupe every line is expected as dup_text.
                        should_dedup = False
                    if isinstance(should_dedup, str):
                        if should_dedup in keys:
                            fail_msg = (
                                f"{should_dedup} is required to match, "
                                "so the line should stay duplicated"
                            )
                            assert line == dup_text, fail_msg
                        else:
                            fail_msg = (
                                "Should be deduplicated "
                                f"when {should_dedup} is not required to match"
                            )
                            assert line == text, fail_msg
                    elif should_dedup:
                        assert line == text
                    else:
                        assert line == dup_text