Skip to content

Commit

Permalink
Merge pull request #181 from bab2min/dev/191
Browse files Browse the repository at this point in the history
Prepare 0.19.1
  • Loading branch information
bab2min authored Oct 18, 2024
2 parents eeb814f + 0456c9b commit 68a1d1b
Show file tree
Hide file tree
Showing 7 changed files with 104 additions and 16 deletions.
3 changes: 2 additions & 1 deletion document/document_header.html
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@
<a class="homelink" rel="home" title="kiwipiepy Home" href="/kiwipiepy" style="display:block; font-size:2em; font-weight:bold; color:#555; padding-bottom:.5em; border-bottom:1px solid silver;"> <img src="/kiwipiepy/logo.png" alt="" style="height:1.5em;"> kiwipiepy </a>
<!--a id='lang-en' href="../en/index.html">English</a--> <a id='lang-kr' href="../kr/index.html">한국어</a>
<div id="version-link">
<span>v0.19.0</span>
<span>v0.19.1</span>
<ul>
<li><a href='/kiwipiepy/v0.19.1/kr'>v0.19.1</a></li>
<li><a href='/kiwipiepy/v0.19.0/kr'>v0.19.0</a></li>
<li><a href='/kiwipiepy/v0.18.1/kr'>v0.18.1</a></li>
<li><a href='/kiwipiepy/v0.18.0/kr'>v0.18.0</a></li>
Expand Down
2 changes: 1 addition & 1 deletion kiwipiepy/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.19.0'
__version__ = '0.19.1'
21 changes: 20 additions & 1 deletion kiwipiepy/_wrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -1950,15 +1950,34 @@ def make_hsdataset(
self,
inputs:List[str],
batch_size:int = 128,
causal_context_size:int = 0,
window_size:int = 8,
num_workers:int = 1,
dropout:float = 0,
dropout_on_history:float = 0,
token_filter:Callable[[str, str], bool] = None,
window_filter:Callable[[str, str], bool] = None,
split_ratio:float = 0,
separate_default_morpheme:bool = False,
morpheme_def_path:str = None,
morpheme_def_min_cnt:int = 0,
seed:int = 0,
):
return super().make_hsdataset(inputs, batch_size, window_size, num_workers, dropout, token_filter, split_ratio, separate_default_morpheme, seed)
return super().make_hsdataset(
inputs,
batch_size,
causal_context_size,
window_size,
num_workers,
dropout,
dropout_on_history,
token_filter,
window_filter,
split_ratio,
separate_default_morpheme,
morpheme_def_path,
morpheme_def_min_cnt,
seed)


def extract_substrings(
Expand Down
7 changes: 7 additions & 0 deletions kiwipiepy/documentation.md
Original file line number Diff line number Diff line change
Expand Up @@ -624,6 +624,13 @@ Python 모듈 관련 오류는 https://github.com/bab2min/kiwipiepy/issues, 형

역사
----
* 0.19.1 (2024-10-19)
* Kiwi 0.19.1의 기능들(https://github.com/bab2min/Kiwi/releases/tag/v0.19.1 )이 반영되었습니다.
* 앞쪽의 특수 문자가 형태소와 잘못 결합하여 분석되는 버그 수정
* 오타 교정을 켠 상태에서 PretokenizedSpan 혹은 blocklist를 사용할 때 오류가 발생하는 버그 수정
* XSM 태그가 종종 접두사 위치에 등장하는 버그 수정
* `서툰`, `내민` 등의 형태가 명사로 잘못 분석되는 버그 수정

* 0.19.0 (2024-10-03)
* Kiwi 0.19.0의 기능들(https://github.com/bab2min/Kiwi/releases/tag/v0.19.0 )이 반영되었습니다.
* 장음화 오타 정정 기능 추가(ex: 지인짜 -> 진짜). Kiwi 초기화 시 typos='lengthening' 옵션으로 사용 가능합니다.
Expand Down
75 changes: 68 additions & 7 deletions src/KiwiPy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,11 @@ struct HSDatasetObject : py::CObject<HSDatasetObject>
return hsd.getWindowSize();
}

// Read-only accessor: forwards to the wrapped `hsd` dataset and hands back
// its window-token validness vector as a const reference.
const kiwi::Vector<uint8_t>& getWindowTokenValidness() const
{
	const auto& validness = hsd.getWindowTokenValidness();
	return validness;
}

size_t numSents() const
{
return hsd.numSents();
Expand Down Expand Up @@ -287,6 +292,7 @@ py::TypeWrapper<HSDatasetObject> _HSDatasetSetter{ gModule, [](PyTypeObject& obj
{ (char*)"batch_size", PY_GETTER(&HSDatasetObject::getBatchSize), nullptr, "", nullptr },
{ (char*)"window_size", PY_GETTER(&HSDatasetObject::getWindowSize), nullptr, "", nullptr },
{ (char*)"num_sents", PY_GETTER(&HSDatasetObject::numSents), nullptr, "", nullptr },
{ (char*)"window_token_validness", PY_GETTER(&HSDatasetObject::getWindowTokenValidness), nullptr, "", nullptr },
{ nullptr },
};
static PySequenceMethods seq = {
Expand Down Expand Up @@ -328,8 +334,9 @@ struct HSDatasetIterObject : py::CObject<HSDatasetIterObject>
py::UniqueObj iternext()
{
const size_t batchSize = obj->hsd.getBatchSize();
const size_t causalContextSize = obj->hsd.getCausalContextSize();
const size_t windowSize = obj->hsd.getWindowSize();
npy_intp sizes[2] = { (npy_intp)batchSize * 4, (npy_intp)windowSize };
npy_intp sizes[2] = { (npy_intp)batchSize * 4, (npy_intp)(causalContextSize + windowSize) };
py::UniqueObj inData{ PyArray_EMPTY(2, sizes, NPY_INT64, 0) };
py::UniqueObj outData{ PyArray_EMPTY(1, sizes, NPY_INT64, 0) };
py::UniqueObj lmLProbsData{ PyArray_EMPTY(1, sizes, NPY_FLOAT32, 0) };
Expand Down Expand Up @@ -910,8 +917,22 @@ struct KiwiObject : py::CObject<KiwiObject>
size_t loadUserDictionary(const char* path);
py::UniqueObj getMorpheme(size_t id);
py::UniqueObj join(PyObject* morphs, bool lmSearch = true, bool returnPositions = false);
py::UniqueObj makeHSDataset(PyObject* inputPathes, size_t batchSize, size_t windowSize, size_t numWorkers,
float dropout = 0, PyObject* tokenFilter = nullptr, float splitRatio = 0, bool separateDefaultMorpheme = false, size_t seed = 42) const;


py::UniqueObj makeHSDataset(PyObject* inputPathes,
size_t batchSize,
size_t causalContextSize,
size_t windowSize,
size_t numWorkers,
float dropout = 0,
PyObject* tokenFilter = nullptr,
PyObject* windowFilter = nullptr,
float splitRatio = 0,
bool separateDefaultMorpheme = false,
PyObject* morphemeDefPath = nullptr,
size_t morphemeDefMinCnt = 0,
size_t seed = 42) const;

py::UniqueObj listAllScripts() const;

float getCutOffThreshold() const
Expand Down Expand Up @@ -2511,10 +2532,21 @@ py::UniqueObj KiwiObject::join(PyObject* morphs, bool lmSearch, bool returnPosit
}
}

py::UniqueObj KiwiObject::makeHSDataset(PyObject* inputPathes, size_t batchSize, size_t windowSize, size_t numWorkers,
float dropout, PyObject* tokenFilter, float splitRatio, bool separateDefaultMorpheme, size_t seed) const
py::UniqueObj KiwiObject::makeHSDataset(PyObject* inputPathes,
size_t batchSize,
size_t causalContextSize,
size_t windowSize,
size_t numWorkers,
float dropout,
PyObject* tokenFilter,
PyObject* windowFilter,
float splitRatio,
bool separateDefaultMorpheme,
PyObject* morphemeDefPath,
size_t morphemeDefMinCnt,
size_t seed) const
{
KiwiBuilder::TokenFilter tf;
KiwiBuilder::TokenFilter tf, wf;
if (tokenFilter && tokenFilter != Py_None)
{
tf = [&](const u16string& form, POSTag tag)
Expand All @@ -2526,9 +2558,38 @@ py::UniqueObj KiwiObject::makeHSDataset(PyObject* inputPathes, size_t batchSize,
return !!truth;
};
}
if (windowFilter && windowFilter != Py_None)
{
wf = [&](const u16string& form, POSTag tag)
{
py::UniqueObj ret{ PyObject_CallObject(windowFilter, py::buildPyTuple(form, tagToString(tag)).get()) };
if (!ret) throw py::ExcPropagation{};
auto truth = PyObject_IsTrue(ret.get());
if (truth < 0) throw py::ExcPropagation{};
return !!truth;
};
}

string morphemeDefPathStr;
if (morphemeDefPath && morphemeDefPath != Py_None)
{
morphemeDefPathStr = py::toCpp<string>(morphemeDefPath);
}

HSDataset anotherDataset;
auto dataset = builder.makeHSDataset(py::toCpp<vector<string>>(inputPathes), batchSize, windowSize, numWorkers, dropout, tf, splitRatio, separateDefaultMorpheme, &anotherDataset);
auto dataset = builder.makeHSDataset(py::toCpp<vector<string>>(inputPathes),
batchSize,
causalContextSize,
windowSize,
numWorkers,
dropout,
tf,
wf,
splitRatio,
separateDefaultMorpheme,
morphemeDefPathStr,
morphemeDefMinCnt,
&anotherDataset);
dataset.seed(seed);
if (splitRatio == 0)
{
Expand Down
10 changes: 5 additions & 5 deletions src/PyUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -1157,19 +1157,19 @@ namespace py
static constexpr force_list_t force_list{};

#ifdef USE_NUMPY
template<typename _Ty>
struct ValueBuilder<std::vector<_Ty>,
template<typename _Ty, typename _Alloc>
struct ValueBuilder<std::vector<_Ty, _Alloc>,
typename std::enable_if<numpy_able<_Ty>::value>::type>
{
UniqueObj operator()(const std::vector<_Ty>& v)
UniqueObj operator()(const std::vector<_Ty, _Alloc>& v)
{
npy_intp size = v.size();
UniqueObj obj{ PyArray_EMPTY(1, &size, detail::NpyType<_Ty>::type, 0) };
std::memcpy(PyArray_DATA((PyArrayObject*)obj.get()), v.data(), sizeof(_Ty) * size);
return obj;
}

bool _toCpp(PyObject* obj, std::vector<_Ty>& out)
bool _toCpp(PyObject* obj, std::vector<_Ty, _Alloc>& out)
{
if (detail::NpyType<_Ty>::npy_type >= 0 && PyArray_Check(obj) && PyArray_TYPE((PyArrayObject*)obj) == detail::NpyType<_Ty>::npy_type)
{
Expand All @@ -1181,7 +1181,7 @@ namespace py
{
UniqueObj iter{ PyObject_GetIter(obj) }, item;
if (!iter) return false;
std::vector<_Ty> v;
std::vector<_Ty, _Alloc> v;
while ((item = UniqueObj{ PyIter_Next(iter.get()) }))
{
_Ty i;
Expand Down

0 comments on commit 68a1d1b

Please sign in to comment.