Skip to content

Commit

Permalink
update doc & fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
tiankongdeguiji committed Jan 23, 2025
1 parent b5fbedc commit 2c031d8
Show file tree
Hide file tree
Showing 7 changed files with 46 additions and 31 deletions.
10 changes: 5 additions & 5 deletions docs/source/feature/feature.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ feature_configs: {
```

- **separator**: FG多值分隔符,默认为`\x1d`
- **value_dim**: 指定Embedding特征的输入维度
- **value_dim**: 默认值为1, 指定Embedding特征的输入维度

## ComboFeature: 组合特征

Expand Down Expand Up @@ -245,7 +245,7 @@ feature_configs: {
- **vocab_list**: 指定词表,适合取值比较少可以枚举的特征。
- **vocab_dict**: 指定字典形式词表,适合多个词需要编码到同一个编号情况,**编号需要从2开始**,编码0预留给默认值,编码1预留给超出词表的词
- **zch**: 零冲突hash,可设置Id的准入和驱逐策略,详见[文档](../zch.md)
- **value_dim**: 默认值是0,可以设置1,value_dim=0时支持多值ID输出
- **value_dim**: 默认值是1,可以设置0,value_dim=0时支持多值ID输出

如果Map的值为连续值,可设置:

Expand Down Expand Up @@ -288,7 +288,7 @@ feature_configs: {
- **vocab_list**: 指定词表,适合取值比较少可以枚举的特征。
- **vocab_dict**: 指定字典形式词表,适合多个词需要编码到同一个编号情况,**编号需要从2开始**,编码0预留给默认值,编码1预留给超出词表的词
- **zch**: 零冲突hash,可设置Id的准入和驱逐策略,详见[文档](../zch.md)
- **value_dim**: 默认值是0,可以设置1,value_dim=0时支持多值ID输出
- **value_dim**: 默认值是1,可以设置0,value_dim=0时支持多值ID输出

如果Map的值为连续值,可设置:

Expand Down Expand Up @@ -478,7 +478,7 @@ feature_configs: {
- **sequence_length**: 序列特征最大长度
- **sequence_delim**: 序列特征分隔符
- **expression**: 特征FG所依赖的字段来源,由两部分组成`input_side`:`input_name`
- **value_dim**: 目前只支持value_dim=1,不支持多值ID序列
- **value_dim**: 默认值是1,可以设置0,value_dim=0时支持多值ID输出
- 其余配置同IdFeature

## SequenceRawFeature:数值型序列特征
Expand Down Expand Up @@ -550,4 +550,4 @@ feature_configs: {
- **feature_name**: 子特征特征名,完整的子特征名应拼接上`${sequence_name}__`前缀,以上述配置中`item_id`子特征为例,子特征名列名应为`click_seq__item_id`
- **expression**: 特征FG所依赖子特征字段来源名,由两部分组成`input_side`:`input_name`。在输入样本数据中列名应拼接上`${sequence_name}__`前缀,以上述配置中`item_id`子特征为例,`expression`为`item:iid`,输入样本数据中列名应为`click_seq__iid`。在线上模型服务中,如果子特征的`input_side`为`item`,子序列无需从请求中传递;如果子特征的`input_side`为`user`,子序列需要从请求中传递。
- 其中当类型为IdFeature时
- **value_dim**: 目前只支持value_dim=1,不支持多值ID序列
- **value_dim**: 默认值是1,可以设置0,value_dim=0时支持多值ID输出
8 changes: 4 additions & 4 deletions tzrec/datasets/data_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -692,19 +692,19 @@ def _to_sparse_features_user1_itemb(
keys_user.append(key)
if key in dg_sequence_mulval_sparse_keys:
mulval_keys_user.append(key)
# pyre-ignore [16]
# pyre-ignore [61]
mulval_seq_lengths_user.append(seq_length)
# pyre-ignore [16]
# pyre-ignore [61]
mulval_key_lengths_user.append(key_length)
else:
values_item.append(value)
lengths_item.append(length)
keys_item.append(key)
if key in dg_sequence_mulval_sparse_keys:
mulval_keys_item.append(key)
# pyre-ignore [16]
# pyre-ignore [61]
mulval_seq_lengths_item.append(seq_length)
# pyre-ignore [16]
# pyre-ignore [61]
mulval_key_lengths_item.append(key_length)

if len(dg_has_weight_keys) > 0:
Expand Down
7 changes: 3 additions & 4 deletions tzrec/features/lookup_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,6 @@ def value_dim(self) -> int:
"""Fg value dimension of the feature."""
if self.config.HasField("value_dim"):
return self.config.value_dim
elif self._is_sparse:
return 0
else:
return 1

Expand All @@ -82,7 +80,8 @@ def is_sparse(self) -> bool:
"""Feature is sparse or dense."""
if self._is_sparse is None:
self._is_sparse = (
self.config.HasField("hash_bucket_size")
self.config.HasField("zch")
or self.config.HasField("hash_bucket_size")
or self.config.HasField("num_buckets")
or len(self.vocab_list) > 0
or len(self.vocab_dict) > 0
Expand Down Expand Up @@ -242,7 +241,7 @@ def fg_json(self) -> List[Dict[str, Any]]:
if fg_cfg["needDiscrete"]:
fg_cfg["combiner"] = ""
if fg_cfg["combiner"] == "":
fg_cfg["value_dim"] = self.config.value_dim
fg_cfg["value_dim"] = self.value_dim

fg_cfgs = [fg_cfg]
if raw_fg_cfg is not None:
Expand Down
7 changes: 3 additions & 4 deletions tzrec/features/match_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,6 @@ def value_dim(self) -> int:
"""Fg value dimension of the feature."""
if self.config.HasField("value_dim"):
return self.config.value_dim
elif self._is_sparse:
return 0
else:
return 1

Expand All @@ -84,7 +82,8 @@ def is_sparse(self) -> bool:
"""Feature is sparse or dense."""
if self._is_sparse is None:
self._is_sparse = (
self.config.HasField("hash_bucket_size")
self.config.HasField("zch")
or self.config.HasField("hash_bucket_size")
or self.config.HasField("num_buckets")
or len(self.config.vocab_list) > 0
or len(self.config.vocab_dict) > 0
Expand Down Expand Up @@ -213,6 +212,6 @@ def fg_json(self) -> List[Dict[str, Any]]:
fg_cfg["boundaries"] = list(self.config.boundaries)

if fg_cfg["needDiscrete"]:
fg_cfg["value_dim"] = self.config.value_dim
fg_cfg["value_dim"] = self.value_dim
# del fg_cfg["combiner"]
return [fg_cfg]
12 changes: 6 additions & 6 deletions tzrec/features/sequence_feature_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def test_fg_encoded_sequence_id_feature(
parsed_feat = seq_feat.parse(input_data)
self.assertEqual(parsed_feat.name, "click_50_seq__id_feat")
np.testing.assert_allclose(parsed_feat.values, np.array(expected_values))
np.testing.assert_allclose(parsed_feat.lengths, np.array(expected_lengths))
np.testing.assert_allclose(parsed_feat.key_lengths, np.array(expected_lengths))
np.testing.assert_allclose(
parsed_feat.seq_lengths, np.array(expected_seq_lengths)
)
Expand All @@ -124,7 +124,7 @@ def test_fg_encoded_simple_sequence_id_feature(self):
parsed_feat = seq_feat.parse(input_data)
self.assertEqual(parsed_feat.name, "click_50_seq_id")
np.testing.assert_allclose(parsed_feat.values, np.array([1, 2, 3, 4, 5, 6]))
np.testing.assert_allclose(parsed_feat.lengths, np.array([1, 1, 1, 2, 1]))
np.testing.assert_allclose(parsed_feat.key_lengths, np.array([1, 1, 1, 2, 1]))
np.testing.assert_allclose(parsed_feat.seq_lengths, np.array([2, 0, 1, 2]))

@parameterized.expand(
Expand Down Expand Up @@ -168,7 +168,7 @@ def test_sequence_id_feature_with_hash_bucket_size(
parsed_feat = seq_feat.parse(input_data)
self.assertEqual(parsed_feat.name, "click_50_seq__id_feat")
np.testing.assert_allclose(parsed_feat.values, np.array(expected_values))
np.testing.assert_allclose(parsed_feat.lengths, np.array(expected_lengths))
np.testing.assert_allclose(parsed_feat.key_lengths, np.array(expected_lengths))
self.assertTrue(
np.allclose(parsed_feat.seq_lengths, np.array(expected_seq_lengths))
)
Expand Down Expand Up @@ -213,7 +213,7 @@ def test_simple_sequence_id_feature_with_hash_bucket_size(
parsed_feat = seq_feat.parse(input_data)
self.assertEqual(parsed_feat.name, "click_50_seq_id_feat")
np.testing.assert_allclose(parsed_feat.values, np.array(expected_values))
np.testing.assert_allclose(parsed_feat.lengths, np.array(expected_lengths))
np.testing.assert_allclose(parsed_feat.key_lengths, np.array(expected_lengths))
self.assertTrue(
np.allclose(parsed_feat.seq_lengths, np.array(expected_seq_lengths))
)
Expand Down Expand Up @@ -290,7 +290,7 @@ def test_sequence_id_feature_with_num_buckets(
parsed_feat = seq_feat.parse(input_data)
self.assertEqual(parsed_feat.name, "click_50_seq__id_feat")
np.testing.assert_allclose(parsed_feat.values, np.array(expected_values))
np.testing.assert_allclose(parsed_feat.lengths, np.array(expected_lengths))
np.testing.assert_allclose(parsed_feat.key_lengths, np.array(expected_lengths))
self.assertTrue(
np.allclose(parsed_feat.seq_lengths, np.array(expected_seq_lengths))
)
Expand Down Expand Up @@ -320,7 +320,7 @@ def test_sequence_id_feature_with_vocab_list(self):
)
self.assertTrue(
np.allclose(
parsed_feat.lengths, np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
parsed_feat.key_lengths, np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
)
)
np.testing.assert_allclose(parsed_feat.seq_lengths, np.array([6, 1, 4]))
Expand Down
11 changes: 10 additions & 1 deletion tzrec/tests/configs/multi_tower_din_fg_mock.config
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,15 @@ feature_configs {
expression: "user:id_6"
hash_bucket_size: 100
embedding_dim: 16
value_dim: 0
}
}
feature_configs {
id_feature {
feature_name: "id_7"
expression: "user:id_7"
hash_bucket_size: 100
embedding_dim: 16
weighted: true
}
}
feature_configs {
Expand Down Expand Up @@ -338,6 +346,7 @@ model_config {
feature_names: "id_4"
feature_names: "id_5"
feature_names: "id_6"
feature_names: "id_7"
feature_names: "raw_1"
feature_names: "raw_2"
feature_names: "raw_3"
Expand Down
22 changes: 15 additions & 7 deletions tzrec/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,13 +588,21 @@ def build_mock_input_with_fg(
random.random() < 0.5 and feature.inputs[0] not in single_id_fields
)
side, name = feature.side_inputs[0]
inputs[side][name] = IdMockInput(
name,
is_multi=is_multi,
num_ids=feature.num_embeddings,
vocab_list=feature.config.vocab_list,
multival_sep=chr(29),
)
if feature.is_weighted:
inputs[side][name] = MapMockInput(
name,
is_sparse=feature.is_sparse,
num_ids=feature.num_embeddings,
vocab_list=feature.config.vocab_list,
)
else:
inputs[side][name] = IdMockInput(
name,
is_multi=is_multi,
num_ids=feature.num_embeddings,
vocab_list=feature.config.vocab_list,
multival_sep=chr(29),
)
elif type(feature) is RawFeature:
side, name = feature.side_inputs[0]
inputs[side][name] = RawMockInput(
Expand Down

0 comments on commit 2c031d8

Please sign in to comment.