test_sentence_tokenizer.py
from __future__ import print_function
import test_helper  # imported only for its side effects (test path/environment setup)
import json
from deepmoji.sentence_tokenizer import SentenceTokenizer
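
# Module-level fixtures shared by the tests below: ten one-character
# "sentences", a matching list of label dicts, and fixed index lists used
# by the explicit-split test. The vocabulary is loaded from the pretrained
# model directory, relative to the test's working directory.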
sentences = [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J']

dicts = [
    {'label': 0},
    {'label': 1},
    {'label': 2},
    {'label': 3},
    {'label': 4},
    {'label': 5},
    {'label': 6},
    {'label': 7},
    {'label': 8},
    {'label': 9},
]

train_ind = [0, 5, 3, 6, 8]
val_ind = [9, 2, 1]
test_ind = [4, 7]

with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)


def test_dataset_split_parameter():
    """ Dataset is split in the desired ratios
    """
    split_parameter = [0.7, 0.1, 0.2]
    st = SentenceTokenizer(vocab, 30)

    result, result_dicts, _ = st.split_train_val_test(sentences, dicts,
                                                      split_parameter, extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]
    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    assert len(train) == len(sentences) * split_parameter[0]
    assert len(val) == len(sentences) * split_parameter[1]
    assert len(test) == len(sentences) * split_parameter[2]

    assert len(train_dicts) == len(dicts) * split_parameter[0]
    assert len(val_dicts) == len(dicts) * split_parameter[1]
    assert len(test_dicts) == len(dicts) * split_parameter[2]


def test_dataset_split_explicit():
    """ Dataset is split according to given indices
    """
    split_parameter = [train_ind, val_ind, test_ind]
    st = SentenceTokenizer(vocab, 30)
    tokenized, _, _ = st.tokenize_sentences(sentences)

    result, result_dicts, added = st.split_train_val_test(sentences, dicts,
                                                          split_parameter, extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]
    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    for i, sentence in enumerate(sentences):
        if i in train_ind:
            assert tokenized[i] in train
            assert dicts[i] in train_dicts
        elif i in val_ind:
            assert tokenized[i] in val
            assert dicts[i] in val_dicts
        elif i in test_ind:
            assert tokenized[i] in test
            assert dicts[i] in test_dicts

    assert len(train) == len(train_ind)
    assert len(val) == len(val_ind)
    assert len(test) == len(test_ind)
    assert len(train_dicts) == len(train_ind)
    assert len(val_dicts) == len(val_ind)
    assert len(test_dicts) == len(test_ind)


def test_id_to_sentence():
    """Tokenizing and converting back preserves the input.
    """
    vb = {'CUSTOM_MASK': 0,
          'aasdf': 1000,
          'basdf': 2000}

    sentence = u'aasdf basdf basdf basdf'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == sentence


def test_id_to_sentence_with_unknown():
    """Tokenizing and converting back preserves the input, except for unknowns.
    """
    vb = {'CUSTOM_MASK': 0,
          'CUSTOM_UNKNOWN': 1,
          'aasdf': 1000,
          'basdf': 2000}

    sentence = u'aasdf basdf ccc'
    expected = u'aasdf basdf CUSTOM_UNKNOWN'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == expected
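

# --- Added usage sketch (not part of the original file) ---
# A minimal way to run these tests directly, assuming pytest is installed;
# the original DeepMoji suite may use a different test runner, so treat
# this as illustrative only.
if __name__ == '__main__':
    import pytest
    pytest.main([__file__, '-v'])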