特征生成和EasyRec配置案例

本文通过如下示例为您说明如何配置特征生成配置文件fg.json和模型配置文件config。

示例数据

以下为示例数据,字段包括了常见的数据类型,其中:

  • ID特征:user_id、item_id

  • 类别特征:gender、category

  • 数值特征:age

  • lookup特征:user__kv_category_click_1d

  • 多值特征:tags

  • 文本特征:description

  • 行为序列特征:click_10_seq

  • 样本的label:is_click

说明

Lookup特征是一种通过外部预计算的映射表(如KV存储、缓存或数据库表)来获取特征值的机制。它允许模型在训练或预测时快速查询已预先计算好的特征值,而无需实时计算或复杂的数据处理逻辑。这种方式在推荐系统、点击率预估等场景中常见,例如用户的历史行为统计特征(如用户对某类商品的点击次数)、物品的流行度等。

字段名称

示例数据1

示例数据2

示例数据3

request_id

101

102

103

user_id

1

2

3

item_id

4

5

10

event_unix_time

1672502400

1672502400

1672502400

is_click

0

1

1

age

25

30

22

gender

user__kv_category_click_1d

电子产品:10家电:1饰品:2

电子产品:1家电:5饰品:1

电子产品:1家电:2饰品:11

category

电子产品

家电

饰品

tags

科技电脑便携

家居电器冷藏

时尚眼镜防晒

description

便携高性能笔记本

大容量冷藏冰箱

时尚防晒太阳镜

click_10_seq

item__item_id:4#item__category:电子产品#user__ts:21041;item__item_id:5#item__category:家电#user__ts:168139;item__item_id:10#item__category:饰品#user__ts:168284

ds

20230101

20230101

20230101

特征生成配置文件 fg.json

以下是配置好的fg.json示例:

{
  "features": [
    {
      "feature_name": "user_id",
      "feature_type": "id_feature",
      "value_type": "String",
      "expression": "user:user_id",
      "default_value": "-1024",
      "combiner": "mean",
      "need_prefix": false,
      "is_multi": false
    },
    {
      "feature_name": "item_id",
      "feature_type": "id_feature",
      "value_type": "String",
      "expression": "item:item_id",
      "default_value": "-1024",
      "combiner": "mean",
      "need_prefix": false,
      "is_multi": false
    },
    {
      "feature_name": "age",
      "feature_type": "raw_feature",
      "value_type": "Double",
      "expression": "user:age",
      "default_value": "-1024",
      "combiner": "mean",
      "need_prefix": false
    },
    {
      "feature_name": "gender",
      "feature_type": "id_feature",
      "value_type": "String",
      "expression": "user:gender",
      "default_value": "-1024",
      "combiner": "mean",
      "need_prefix": false,
      "is_multi": false
    },
    {
      "feature_name": "user__kv_category_click_1d",
      "feature_type": "lookup_feature",
      "value_type": "Double",
      "map": "user:user__kv_category_click_1d",
      "key": "item:category",
      "needDiscrete": false,
      "needWeighting": false,
      "needKey": false,
      "default_value": "0",
      "combiner": "mean",
      "need_prefix": false
    },
    {
      "feature_name": "category",
      "feature_type": "id_feature",
      "value_type": "String",
      "expression": "item:category",
      "default_value": "-1024",
      "combiner": "mean",
      "need_prefix": false,
      "is_multi": false
    },
    {
      "feature_name": "tags",
      "feature_type": "id_feature",
      "value_type": "String",
      "expression": "item:tags",
      "default_value": "-1024",
      "combiner": "mean",
      "need_prefix": false,
      "is_multi": true
    },
    {
      "feature_name": "description",
      "feature_type": "id_feature",
      "value_type": "String",
      "expression": "item:description",
      "default_value": "-1024",
      "combiner": "mean",
      "need_prefix": false,
      "is_multi": true
    },
    {
      "sequence_name": "click_10_seq",
      "sequence_column": "click_10_seq",
      "sequence_length": 10,
      "sequence_delim": ";",
      "attribute_delim": "#",
      "sequence_table": "item",
      "sequence_pk": "user:click_10_seq",
      "features": [
        {
          "feature_name": "item_id",
          "feature_type": "id_feature",
          "value_type": "String",
          "expression": "item:item_id",
          "default_value": "-1024",
          "combiner": "mean",
          "need_prefix": false,
          "is_multi": false,
          "group": "click_10_seq_feature"
        },
        {
          "feature_name": "category",
          "feature_type": "id_feature",
          "value_type": "String",
          "expression": "item:category",
          "default_value": "-1024",
          "combiner": "mean",
          "need_prefix": false,
          "is_multi": false,
          "group": "click_10_seq_feature"
        },
        {
          "feature_name": "ts",
          "feature_type": "raw_feature",
          "value_type": "Double",
          "expression": "user:ts",
          "default_value": "-1024",
          "combiner": "mean",
          "need_prefix": false,
          "group": "click_10_seq_feature"
        }
      ]
    }
  ],
  "reserves": [
    "request_id",
    "user_id",
    "item_id",
    "is_click"
  ]
}

模型配置文件 config

以下是配置好的模型配置文件config示例,其中包含组合特征(ComboFeature)和表达式特征(ExprFeature)。更多配置项的说明,请参见EasyRec文档。

# Training settings: optimizer, step budget, and summary/logging cadence.
train_config {
  optimizer_config {
    use_moving_average: false
    # Adam optimizer with an exponentially decaying learning rate.
    adam_optimizer {
      learning_rate {
        exponential_decay_learning_rate {
          initial_learning_rate: 0.001
          # NOTE(review): decay_steps of 1 with decay_factor 0.5 presumably halves the
          # rate on every step until min_learning_rate is reached — looks like a demo
          # value; confirm before using for real training.
          decay_steps: 1
          decay_factor: 0.5
          min_learning_rate: 1e-06
        }
      }
    }
  }
  # NOTE(review): a single training step looks like a smoke-test value; raise it for
  # real training runs.
  num_steps: 1
  sync_replicas: true
  # Summaries and step-count logs are both emitted every 100 steps.
  save_summary_steps: 100
  log_step_count_steps: 100
}
# Evaluation settings: AUC is the only metric configured.
eval_config {
  metrics_set {
    auc {
    }
  }
}
# Input data settings: batching, label column, and the schema of every field the model reads.
data_config {
  batch_size: 1024
  label_fields: "is_click"
  shuffle: false
  num_epochs: 10000
  input_type: OdpsRTPInput
  # NOTE(review): the separator is shown as an empty string. Feature-generation output is
  # normally delimited by a non-printable character that may not survive rendering or
  # copy-paste — confirm the actual delimiter before using this config.
  separator: ""
  # Columns read from the input table — presumably the label column plus the column that
  # holds the generated features; verify against the training table schema.
  selected_cols: "is_click,features"
  # The label comes first; the remaining fields follow the order of the features in fg.json.
  input_fields {
    input_name: "is_click"
    input_type: INT32
    default_val: "0"
  }
  input_fields {
    input_name: "user_id"
    input_type: STRING
    default_val: "-1024"
  }
  input_fields {
    input_name: "item_id"
    input_type: STRING
    default_val: "-1024"
  }
  input_fields {
    input_name: "age"
    input_type: DOUBLE
    default_val: "-1024"
  }
  input_fields {
    input_name: "gender"
    input_type: STRING
    default_val: "-1024"
  }
  input_fields {
    input_name: "user__kv_category_click_1d"
    input_type: DOUBLE
    default_val: "0"
  }
  input_fields {
    input_name: "category"
    input_type: STRING
    default_val: "-1024"
  }
  input_fields {
    input_name: "tags"
    input_type: STRING
    default_val: "-1024"
  }
  input_fields {
    input_name: "description"
    input_type: STRING
    default_val: "-1024"
  }
  # Sequence sub-features are named <sequence_name>__<feature_name>, are all declared as
  # STRING (including ts), and carry no default_val.
  input_fields {
    input_name: "click_10_seq__item_id"
    input_type: STRING
  }
  input_fields {
    input_name: "click_10_seq__category"
    input_type: STRING
  }
  input_fields {
    input_name: "click_10_seq__ts"
    input_type: STRING
  }
  pai_worker_queue: true
}
# Per-feature model settings for the non-sequence input fields.
# NOTE(review): separator is shown as an empty string in every entry of this group; it is
# presumably a non-printable multi-value delimiter lost in rendering — confirm the actual
# character before reusing these entries.

# user_id: hashed ID embedding.
feature_configs {
  input_names: "user_id"
  feature_type: IdFeature
  embedding_dim: 8
  hash_bucket_size: 48000
  separator: ""
  combiner: "mean"
}
# item_id: hashed ID embedding.
feature_configs {
  input_names: "item_id"
  feature_type: IdFeature
  embedding_dim: 8
  hash_bucket_size: 27000
  separator: ""
  combiner: "mean"
}
# age: numeric feature with explicit boundaries — presumably bucketized at these boundaries
# and each bucket embedded; confirm against the EasyRec feature documentation.
feature_configs {
  input_names: "age"
  feature_type: RawFeature
  embedding_dim: 4
  separator: ""
  boundaries: 1e-08
  boundaries: 10
  boundaries: 20
  boundaries: 30
  boundaries: 40
  boundaries: 50
  boundaries: 60
}
# gender: hashed ID embedding with a small bucket count.
feature_configs {
  input_names: "gender"
  feature_type: IdFeature
  embedding_dim: 4
  hash_bucket_size: 10
  separator: ""
  combiner: "mean"
}
# user__kv_category_click_1d: numeric lookup result, configured the same way as age
# (RawFeature with boundaries).
feature_configs {
  input_names: "user__kv_category_click_1d"
  feature_type: RawFeature
  embedding_dim: 4
  separator: ""
  boundaries: 1e-08
  boundaries: 1.0
  boundaries: 2.0
  boundaries: 3.0
  boundaries: 4.0
  boundaries: 5.0
  boundaries: 6.0
}
# category: hashed ID embedding.
feature_configs {
  input_names: "category"
  feature_type: IdFeature
  embedding_dim: 4
  hash_bucket_size: 100
  separator: ""
  combiner: "mean"
}
# tags: multi-value feature; the per-tag embeddings are combined with "mean".
feature_configs {
  input_names: "tags"
  feature_type: TagFeature
  embedding_dim: 4
  hash_bucket_size: 1000
  separator: ""
  combiner: "mean"
}
# description: text handled as a sequence and combined with a TextCNN.
feature_configs {
  input_names: "description"
  feature_type: SequenceFeature
  embedding_dim: 4
  hash_bucket_size: 10
  separator: ""
  sequence_combiner {
    # Three filter_sizes entries and three num_filters entries — presumably paired by
    # position (size 2 -> 16 filters, 3 -> 8, 4 -> 8); confirm.
    text_cnn {
      filter_sizes: 2
      filter_sizes: 3
      filter_sizes: 4
      num_filters: 16
      num_filters: 8
      num_filters: 8
    }
  }
}
# ID-typed sub-features of the click_10_seq behaviour sequence; sequence elements are
# separated by ";".

# Item IDs in the click history; embedding_dim and hash_bucket_size match the item_id feature.
feature_configs {
  input_names: "click_10_seq__item_id"
  feature_type: SequenceFeature
  embedding_dim: 8
  hash_bucket_size: 27000
  separator: ";"
  combiner: "mean"
  sub_feature_type: IdFeature
}
# Categories in the click history.
# NOTE(review): hash_bucket_size is 10000 here while the standalone category feature uses
# 100 — confirm whether the difference is intentional.
feature_configs {
  input_names: "click_10_seq__category"
  feature_type: SequenceFeature
  embedding_dim: 4
  hash_bucket_size: 10000
  separator: ";"
  combiner: "mean"
  sub_feature_type: IdFeature
}
# Timestamps in the click_10_seq behaviour sequence, handled as raw numeric values;
# sequence elements are separated by ";".
# Declared exactly once: a second identical entry would register the same feature
# name twice.
feature_configs {
  input_names: "click_10_seq__ts"
  feature_type: SequenceFeature
  embedding_dim: 4
  separator: ";"
  sub_feature_type: RawFeature
}
# Derived features built inside the model config from existing input fields.

# Combination (cross) feature over age and gender, exposed as "combo_age_gender".
feature_configs {
  input_names: ["age", "gender"]
  feature_name: "combo_age_gender"
  feature_type: ComboFeature
  embedding_dim: 16
  hash_bucket_size: 1000
}
# Expression feature computed from age with the expression "age>=18", exposed as
# "age_satisfy1".
feature_configs {
  input_names: "age"
  feature_name: "age_satisfy1"
  feature_type: ExprFeature
  expression: "age>=18"
}
# Model definition: a MultiTower model with one feature group ("all") feeding a single
# tower, followed by a final DNN.
model_config {
  model_class: "MultiTower"
  feature_groups {
    group_name: "all"
    # Non-sequence features, including the two derived ones (combo_age_gender, age_satisfy1).
    feature_names: "user_id"
    feature_names: "item_id"
    feature_names: "age"
    feature_names: "gender"
    feature_names: "user__kv_category_click_1d"
    feature_names: "category"
    feature_names: "tags"
    feature_names: "description"
    feature_names: "combo_age_gender"
    feature_names: "age_satisfy1"
    wide_deep: DEEP
    # Behaviour-sequence block for click_10_seq: key lists the target-side features and
    # hist_seq lists the history sequence features they are matched against.
    sequence_features {
      group_name: "click_10_seq"
      seq_att_map {
        # NOTE(review): two key entries but three hist_seq entries (click_10_seq__ts has
        # no matching key) — presumably allow_key_transform below covers the mismatch;
        # confirm against the EasyRec sequence-feature documentation.
        key: "item_id"
        key: "category"
        hist_seq: "click_10_seq__item_id"
        hist_seq: "click_10_seq__category"
        hist_seq: "click_10_seq__ts"
      }
      tf_summary: false
      allow_key_search: false
      allow_key_transform: true
    }
  }
  embedding_regularization: 5e-06
  multi_tower {
    # Single tower over the "all" group: hidden layers of 256 and 128 units.
    towers {
      input: "all"
      dnn {
        hidden_units: 256
        hidden_units: 128
      }
    }
    # Final DNN: hidden layers of 64 and 32 units.
    final_dnn {
      hidden_units: 64
      hidden_units: 32
    }
    l2_regularization: 1e-06
  }
}
# Model export settings.
export_config {
  # NOTE(review): presumably exports one placeholder per input field instead of a single
  # combined input — confirm against the EasyRec export documentation.
  multi_placeholder: true
}