Skip to content

Commit

Permalink
3.4
Browse files Browse the repository at this point in the history
  • Loading branch information
hyunwoongko committed Feb 15, 2022
1 parent 8badf46 commit bad3adb
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 26 deletions.
16 changes: 1 addition & 15 deletions kss/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,21 +144,7 @@ def exceptions():
year_s.append(f"{i}{j}s")
year_s.append(f"{i}{j}S")

time = []
for i in Const.numbers:
for j in Const.numbers:
for k in Const.single_quotes:
time.append(f"{i}{k}{j}")
time.append(f"{i}{k}{j}")
time.append(f"{i}{k}{j}")

inch = []
for i in Const.numbers + ["."]:
for j in Const.numbers:
for k in Const.double_quotes:
inch.append(f"{i}{j}{k}")

return faces + apostrophe + year_s + time + inch
return faces + apostrophe + year_s

@staticmethod
def ec_cases():
Expand Down
47 changes: 37 additions & 10 deletions kss/kss.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,9 @@ def split_sentences(

for _input_for_pp in mp_temp:
out = "".join(_input_for_pp).replace(" ", "")
for special in Const.quotes_or_brackets:
out = out.replace(special, "")
if use_quotes_brackets_processing:
for special in Const.quotes_or_brackets:
out = out.replace(special, "")

mp_postprocessing.append(out)

Expand Down Expand Up @@ -169,8 +170,10 @@ def split_sentences(
for result in _results:
mp_temp += result
out = "".join(mp_temp).replace(" ", "")
for special in Const.quotes_or_brackets:
out = out.replace(special, "")
if use_quotes_brackets_processing:
for special in Const.quotes_or_brackets:
out = out.replace(special, "")
mp_postprocessing = [m.replace("\u200b", "") for m in mp_postprocessing]

if out in mp_postprocessing:
mp_output_final.append(mp_temp)
Expand Down Expand Up @@ -243,8 +246,20 @@ def _split_sentences(
backend: str,
recover_step: int = 0,
):
if text in _cache.dic:
return _cache.get(text)
if use_quotes_brackets_processing:
text = text.replace("\u200b", "")

cache_key = (
text,
use_heuristic,
use_quotes_brackets_processing,
max_recover_step,
max_recover_length,
backend,
)

if cache_key in _cache.dic:
return _cache.get(cache_key)
else:
original_text = deepcopy(text)

Expand All @@ -259,8 +274,9 @@ def _split_sentences(
text = prep.add_emojis_to_dict(text)
text = prep.backup(text)

for s in Const.quotes_or_brackets:
text = text.replace(s, f"\u200b{s}\u200b")
if use_quotes_brackets_processing:
for s in Const.quotes_or_brackets:
text = text.replace(s, f"\u200b{s}\u200b")

if use_morpheme:
eojeols = _morph.pos(text=text, backend=backend)
Expand Down Expand Up @@ -583,10 +599,21 @@ def _split_sentences(
outputs = []
for s in results:
s = prep.restore(s)
s = s.replace("\u200b", "")
if use_quotes_brackets_processing:
s = s.replace("\u200b", "")
outputs.append(s)

_cache.put(original_text, outputs)
_cache.put(
(
original_text,
use_heuristic,
use_quotes_brackets_processing,
max_recover_step,
max_recover_length,
backend,
),
outputs,
)

return outputs

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def read_file(filename, cb):

setup(
name="kss",
version="3.3.1.1",
version="3.4",
author="Hyunwoong Ko",
author_email="[email protected]",
url="https://github.com/hyunwoongko/kss",
Expand Down
23 changes: 23 additions & 0 deletions tests/test_kss.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,26 @@ def test_chunks(self):
"전하 아니되옵니다. 부디 용서하옵소서. 소인이 큰 죄를 저질렀사옵니다.\n", max_length=6, overlap=True
)
print(out)

def test_yjy2026(self):
    """Smoke-test ``split_sentences`` on digits mixed with quote-like marks.

    The input is a short string where digits are separated by an
    apostrophe and followed by a double-prime-style mark. The result is
    only printed; nothing is asserted, so this test can fail only if the
    call raises.
    """
    print(kss.split_sentences("1'1″"))

def test_hannabros(self):
    """Smoke-test ``split_sentences`` on a short Korean sentence.

    NOTE(review): the literal below appears to contain invisible
    zero-width characters between the words -- keep the string exactly
    as-is when editing, and confirm with a hex dump.

    The result is only printed; nothing is asserted.
    """
    sentence = '분리 할 수 ​​있다.'
    print(kss.split_sentences(sentence))

def test_lifelongeek(self):
    """Split a long sentence before and after many unrelated calls.

    The long text is split once, then ``split_sentences`` is called 500
    times on a short unrelated input, then what appears to be the same
    long text is split again. Both results are printed side by side for
    manual comparison; nothing is asserted.

    NOTE(review): presumably the intervening calls are meant to churn the
    internal result cache -- confirm intent, and consider asserting that
    the two outputs are equal.
    """
    text_before = "우리가 타이라는 단어는 원래 동사로 묶다 엮다라는 표현이고요 그다음에 명사로 하게 되면 묶음이라는 표현이죠 그런데"
    result_before = kss.split_sentences(text_before)

    # Only the calls themselves matter here; their return values are unused.
    for _ in range(500):
        kss.split_sentences("안녕")

    text_after = "우리가 타이라는 단어는 원래 동사로 묶다 엮다라는 표현이고요 그다음에 명사로 하게 되면 묶음이라는 표현이죠 그런데"
    result_after = kss.split_sentences(text_after)

    print(result_before, result_after)
    # Original author's note: "Hmm.. what could it be? It works fine."

def test_newdboy(self):
    """Smoke-test ``split_sentences`` with the mecab backend and heuristics.

    The sample is a sentence ending in a period, followed by a clause
    that begins with the Latin-script acronym "EBS". The result is only
    printed; nothing is asserted.

    NOTE(review): presumably requires the mecab backend to be installed
    in the test environment -- confirm.
    """
    sample = '그것이 잘 적혀 있는지 확인해야 한다고 했기 때문이다. EBS 미래교육연구소 최홍규 박사도 그렇게 말했다'
    print(
        kss.split_sentences(
            text=sample,
            backend='mecab',
            use_heuristic=True,
        )
    )

0 comments on commit bad3adb

Please sign in to comment.