本文通过如下示例为您说明如何配置特征生成配置文件fg.json和模型配置文件config。
示例数据
以下为示例数据,字段包括了常见的数据类型,其中:
ID特征:user_id、item_id
类别特征:gender、category
数值特征:age
Lookup特征:user__kv_category_click_1d
多值特征:tags
文本特征:description
行为序列特征:click_10_seq
样本的label:is_click
Lookup特征是一种通过外部预计算的映射表(如KV存储、缓存或数据库表)来获取特征值的机制。它允许模型在训练或预测时快速查询已预先计算好的特征值,而无需实时计算或复杂的数据处理逻辑。这种方式在推荐系统、点击率预估等场景中常见,例如用户的历史行为统计特征(如用户对某类商品的点击次数)、物品的流行度等。
字段名称 | 示例数据1 | 示例数据2 | 示例数据3 |
request_id | 101 | 102 | 103 |
user_id | 1 | 2 | 3 |
item_id | 4 | 5 | 10 |
event_unix_time | 1672502400 | 1672502400 | 1672502400 |
is_click | 0 | 1 | 1 |
age | 25 | 30 | 22 |
gender | 男 | 女 | 女 |
user__kv_category_click_1d | 电子产品:10家电:1饰品:2 | 电子产品:1家电:5饰品:1 | 电子产品:1家电:2饰品:11 |
category | 电子产品 | 家电 | 饰品 |
tags | 科技电脑便携 | 家居电器冷藏 | 时尚眼镜防晒 |
description | 便携高性能笔记本 | 大容量冷藏冰箱 | 时尚防晒太阳镜 |
click_10_seq | item__item_id:4#item__category:电子产品#user__ts:21041;item__item_id:5#item__category:家电#user__ts:168139;item__item_id:10#item__category:饰品#user__ts:168284 | | |
ds | 20230101 | 20230101 | 20230101 |
特征生成配置文件 fg.json
以下是配置好的fg.json示例:
{
"features": [
{
"feature_name": "user_id",
"feature_type": "id_feature",
"value_type": "String",
"expression": "user:user_id",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false,
"is_multi": false
},
{
"feature_name": "item_id",
"feature_type": "id_feature",
"value_type": "String",
"expression": "item:item_id",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false,
"is_multi": false
},
{
"feature_name": "age",
"feature_type": "raw_feature",
"value_type": "Double",
"expression": "user:age",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false
},
{
"feature_name": "gender",
"feature_type": "id_feature",
"value_type": "String",
"expression": "user:gender",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false,
"is_multi": false
},
{
"feature_name": "user__kv_category_click_1d",
"feature_type": "lookup_feature",
"value_type": "Double",
"map": "user:user__kv_category_click_1d",
"key": "item:category",
"needDiscrete": false,
"needWeighting": false,
"needKey": false,
"default_value": "0",
"combiner": "mean",
"need_prefix": false
},
{
"feature_name": "category",
"feature_type": "id_feature",
"value_type": "String",
"expression": "item:category",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false,
"is_multi": false
},
{
"feature_name": "tags",
"feature_type": "id_feature",
"value_type": "String",
"expression": "item:tags",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false,
"is_multi": true
},
{
"feature_name": "description",
"feature_type": "id_feature",
"value_type": "String",
"expression": "item:description",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false,
"is_multi": true
},
{
"sequence_name": "click_10_seq",
"sequence_column": "click_10_seq",
"sequence_length": 10,
"sequence_delim": ";",
"attribute_delim": "#",
"sequence_table": "item",
"sequence_pk": "user:click_10_seq",
"features": [
{
"feature_name": "item_id",
"feature_type": "id_feature",
"value_type": "String",
"expression": "item:item_id",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false,
"is_multi": false,
"group": "click_10_seq_feature"
},
{
"feature_name": "category",
"feature_type": "id_feature",
"value_type": "String",
"expression": "item:category",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false,
"is_multi": false,
"group": "click_10_seq_feature"
},
{
"feature_name": "ts",
"feature_type": "raw_feature",
"value_type": "Double",
"expression": "user:ts",
"default_value": "-1024",
"combiner": "mean",
"need_prefix": false,
"group": "click_10_seq_feature"
}
]
}
],
"reserves": [
"request_id",
"user_id",
"item_id",
"is_click"
]
}
模型配置文件 config
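以下是配置好的模型配置文件config示例,其中除上述特征外,还包括了组合特征(ComboFeature)和表达式特征(ExprFeature),详情请参见EasyRec文档。注意:示例中部分separator显示为空字符串,实际取值可能是页面上无法显示的不可见分隔符,请以实际数据所使用的分隔符为准。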
train_config {
optimizer_config {
use_moving_average: false
adam_optimizer {
learning_rate {
exponential_decay_learning_rate {
initial_learning_rate: 0.001
decay_steps: 1
decay_factor: 0.5
min_learning_rate: 1e-06
}
}
}
}
num_steps: 1
sync_replicas: true
save_summary_steps: 100
log_step_count_steps: 100
}
eval_config {
metrics_set {
auc {
}
}
}
data_config {
batch_size: 1024
label_fields: "is_click"
shuffle: false
num_epochs: 10000
input_type: OdpsRTPInput
separator: ""
selected_cols: "is_click,features"
input_fields {
input_name: "is_click"
input_type: INT32
default_val: "0"
}
input_fields {
input_name: "user_id"
input_type: STRING
default_val: "-1024"
}
input_fields {
input_name: "item_id"
input_type: STRING
default_val: "-1024"
}
input_fields {
input_name: "age"
input_type: DOUBLE
default_val: "-1024"
}
input_fields {
input_name: "gender"
input_type: STRING
default_val: "-1024"
}
input_fields {
input_name: "user__kv_category_click_1d"
input_type: DOUBLE
default_val: "0"
}
input_fields {
input_name: "category"
input_type: STRING
default_val: "-1024"
}
input_fields {
input_name: "tags"
input_type: STRING
default_val: "-1024"
}
input_fields {
input_name: "description"
input_type: STRING
default_val: "-1024"
}
input_fields {
input_name: "click_10_seq__item_id"
input_type: STRING
}
input_fields {
input_name: "click_10_seq__category"
input_type: STRING
}
input_fields {
input_name: "click_10_seq__ts"
input_type: STRING
}
pai_worker_queue: true
}
feature_configs {
input_names: "user_id"
feature_type: IdFeature
embedding_dim: 8
hash_bucket_size: 48000
separator: ""
combiner: "mean"
}
feature_configs {
input_names: "item_id"
feature_type: IdFeature
embedding_dim: 8
hash_bucket_size: 27000
separator: ""
combiner: "mean"
}
feature_configs {
input_names: "age"
feature_type: RawFeature
embedding_dim: 4
separator: ""
boundaries: 1e-08
boundaries: 10
boundaries: 20
boundaries: 30
boundaries: 40
boundaries: 50
boundaries: 60
}
feature_configs {
input_names: "gender"
feature_type: IdFeature
embedding_dim: 4
hash_bucket_size: 10
separator: ""
combiner: "mean"
}
feature_configs {
input_names: "user__kv_category_click_1d"
feature_type: RawFeature
embedding_dim: 4
separator: ""
boundaries: 1e-08
boundaries: 1.0
boundaries: 2.0
boundaries: 3.0
boundaries: 4.0
boundaries: 5.0
boundaries: 6.0
}
feature_configs {
input_names: "category"
feature_type: IdFeature
embedding_dim: 4
hash_bucket_size: 100
separator: ""
combiner: "mean"
}
feature_configs {
input_names: "tags"
feature_type: TagFeature
embedding_dim: 4
hash_bucket_size: 1000
separator: ""
combiner: "mean"
}
feature_configs {
input_names: "description"
feature_type: SequenceFeature
embedding_dim: 4
hash_bucket_size: 10
separator: ""
sequence_combiner {
text_cnn {
filter_sizes: 2
filter_sizes: 3
filter_sizes: 4
num_filters: 16
num_filters: 8
num_filters: 8
}
}
}
feature_configs {
input_names: "click_10_seq__item_id"
feature_type: SequenceFeature
embedding_dim: 8
hash_bucket_size: 27000
separator: ";"
combiner: "mean"
sub_feature_type: IdFeature
}
feature_configs {
input_names: "click_10_seq__category"
feature_type: SequenceFeature
embedding_dim: 4
hash_bucket_size: 10000
separator: ";"
combiner: "mean"
sub_feature_type: IdFeature
}
feature_configs {
input_names: "click_10_seq__ts"
feature_type: SequenceFeature
embedding_dim: 4
separator: ";"
sub_feature_type: RawFeature
}
feature_configs {
input_names: ["age", "gender"]
feature_name: "combo_age_gender"
feature_type: ComboFeature
embedding_dim: 16
hash_bucket_size: 1000
}
feature_configs {
input_names: "age"
feature_name: "age_satisfy1"
feature_type: ExprFeature
expression: "age>=18"
}
model_config {
model_class: "MultiTower"
feature_groups {
group_name: "all"
feature_names: "user_id"
feature_names: "item_id"
feature_names: "age"
feature_names: "gender"
feature_names: "user__kv_category_click_1d"
feature_names: "category"
feature_names: "tags"
feature_names: "description"
feature_names: "combo_age_gender"
feature_names: "age_satisfy1"
wide_deep: DEEP
sequence_features {
group_name: "click_10_seq"
seq_att_map {
key: "item_id"
key: "category"
hist_seq: "click_10_seq__item_id"
hist_seq: "click_10_seq__category"
hist_seq: "click_10_seq__ts"
}
tf_summary: false
allow_key_search: false
allow_key_transform: true
}
}
embedding_regularization: 5e-06
multi_tower {
towers {
input: "all"
dnn {
hidden_units: 256
hidden_units: 128
}
}
final_dnn {
hidden_units: 64
hidden_units: 32
}
l2_regularization: 1e-06
}
}
export_config {
multi_placeholder: true
}