本文将介绍向量检索版两种二进制数据召回结果的处理方式(protobuf和flatbuffers)。
protobuf格式
Maven依赖
<properties>
<grpc.version>1.6.1</grpc.version>
<protobuf.version>3.21.5</protobuf.version>
</properties>
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
<version>${protobuf.version}</version>
</dependency>
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java-util</artifactId>
<version>${protobuf.version}</version>
</dependency>
<dependency>
<groupId>io.grpc</groupId>
<artifactId>grpc-netty</artifactId>
<version>${grpc.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>io.grpc</groupId>
<artifactId>grpc-protobuf</artifactId>
<version>${grpc.version}</version>
<scope>provided</scope>
</dependency>
生成protobuf文件
将Maven依赖引入项目中后,还需生成protobuf相应的文件,才能正常将向量检索版实例召回的结果解析成protobuf格式,生成protobuf文件步骤如下:
安装protobuf环境(protoc编译器),推荐protobuf版本【3.21.5】,与Maven依赖中的protobuf.version保持一致,可以通过命令
protoc --version
查看版本。在项目中定义描述文件,文件后缀为【.proto】,可根据需求自行更改,案例如下:
【Ha3ResultProto.proto】
syntax = "proto2"; package com.searchengine.example.demo.protobuf; option cc_enable_arenas = true; message PBAttrKVPair { optional string key = 1; repeated int64 int64Value = 2; repeated double doubleValue = 3; repeated bytes bytesValue = 4; } enum ValueType { ATTRIBUTE_TYPE = 0; VARIABLE_VALUE_TYPE = 1; } message PBInt64Attribute { optional string key = 1; optional ValueType type = 2; repeated int64 int64Value = 3; repeated uint32 offset = 4; } message PBDoubleAttribute { optional string key = 1; optional ValueType type = 2; repeated double doubleValue = 3; repeated uint32 offset = 4; } message PBBytesAttribute { optional string key = 1; optional ValueType type = 2; repeated bytes bytesValue = 3; repeated uint32 offset = 4; } message SortExprssionMeta { optional bool sortFlag = 1; optional string sortExprName = 2; } message PBSortValues { optional uint32 dimensionCount = 1; repeated double sortValues = 2; repeated SortExprssionMeta sortExprMetas = 3; } message PBKVPair { optional string key = 1; optional bytes value = 2; } message PBResult { optional uint64 totalTime = 1; optional PBHits hits = 2; repeated PBAggregateResults aggResults = 3; repeated PBErrorResult errorResults = 4; optional bytes tracer = 5; optional bool fromCache = 6; optional PBMatchDocs matchDocs = 7; repeated PBMetaMap metaMap = 8; } message PBMatchDocs { optional uint32 numMatchDocs = 1; optional uint32 totalMatchDocs = 2; repeated string clusterNames = 3; repeated uint32 clusterIds = 4; repeated uint32 hashids = 5; repeated uint32 docids = 6; repeated int32 fullIndexVersions = 7; repeated int32 indexVersions = 8; repeated uint64 pkHighers = 9; repeated uint64 pkLowers = 10; repeated uint32 searcherIps = 11; repeated bytes tracers = 12; repeated PBInt64Attribute int64AttrValues = 13; repeated PBDoubleAttribute doubleAttrValues = 14; repeated PBBytesAttribute bytesAttrValues = 15; optional PBSortValues sortValues = 16; } message PBHits { optional uint32 numhits = 1; optional uint32 totalHits = 
2; repeated PBHit hit = 3; repeated PBMetaHitMap metaHitMap = 4; optional double coveredPercent = 5; repeated SortExprssionMeta sortExprMetas = 6; } message PBHit { optional string clusterName = 1; optional uint32 hashid = 2; optional uint32 docid = 3; optional int32 fullIndexVersion = 4; optional int32 indexVersion = 5; optional uint64 pkHigher = 6; optional uint64 pkLower = 7; repeated PBAttrKVPair attributes = 8; repeated PBAttrKVPair variableValues = 9; repeated PBKVPair summary = 10; repeated PBKVPair property = 11; repeated string sortValues = 12; optional bytes tracer = 13; optional uint32 searcherIp = 14; optional string rawPk = 15; optional bytes summaryBytes = 16; } message PBMetaHitMap { optional string metaHitKey = 1; repeated PBKVPair metaHitValue = 2; } message PBAggregateResults { optional string aggregateKey = 1; repeated PBAggregateValue aggregateValue = 2; } message PBAggregateValue { optional string groupValue = 1; repeated PBKVPair funNameResultPair = 2; } message PBErrorResult { optional string partitionId = 1; optional string hostName = 2; optional uint32 errorCode = 3; optional string errorDescription = 4; } message PBMetaMap { optional string metaKey = 1; repeated PBKVPair metaValue = 2; }
在对应protobuf描述文件目录下执行命令
protoc --java_out=./ Ha3ResultProto.proto
。生成的Java文件所在的包路径由描述文件中的package指定(上文示例描述文件中的package为com.searchengine.example.demo.protobuf,请按项目实际包名修改),例如:
package com.aliyun.demo.protobuf
。执行命令后,会自动在指定的package中生成对应的Java文件,以上述为例,会在
com.aliyun.demo.protobuf
包下生成一个Ha3ResultProto.java
的文件,在通过SDK解析向量检索版实例的召回结果时,可直接引用:import com.aliyun.ha3engine.Client; import com.aliyun.ha3engine.models.*; import com.aliyun.tea.TeaException; import com.aliyun.demo.protobuf.Ha3ResultProto; import org.junit.Before; import org.junit.Test; import java.nio.ByteBuffer; import java.util.*; public class DataFormatService { /** * 向量检索版client,暂时支持查询操作 */ private Client client; @Before public void clientInit() throws Exception { /* 初始化向量检索版client */ Config config = new Config(); // API域名,可在实例详情页>API入口 查看 config.setEndpoint(""); // 实例名称,可在实例详情页左上角查看,例:ha-cn-i7*****605 config.setInstanceId(""); // 用户名,可在实例详情页>网络信息 查看 config.setAccessUserName(""); // 密码,可在实例详情页>网络信息 修改 config.setAccessPassWord(""); //公网调用填写httpProxy config.setHttpProxy(""); client = new Client(config); } @Test public void protobufFormat() throws Exception { try { /* 示例: 使用 ha查询串进行搜索. */ SearchRequestModel haQueryRequestModel = new SearchRequestModel(); SearchQuery haRawQuery = new SearchQuery(); haRawQuery.setQuery("query=id:8148508889615505646&&config=start:0,hit:100,format:protobuf&&cluster=general"); haQueryRequestModel.setQuery(haRawQuery); SearchBytesResponseModel haSearchBytesResponseModel = client.SearchBytes(haQueryRequestModel); System.out.println("ha查询串搜索结果:\n" + Arrays.toString(haSearchBytesResponseModel.getBody())); //转换为protobuf格式 Ha3ResultProto.PBResult pbResult = Ha3ResultProto.PBResult.parseFrom(haSearchBytesResponseModel.getBody()); System.out.println("protobuf格式输出结果:\n" + pbResult); } catch (TeaException e) { System.out.println(e.getCode()); System.out.println(e.getMessage()); Map<String, Object> abc = e.getData(); System.out.println(com.aliyun.teautil.Common.toJSONString(abc)); } } }
注意事项
protobuf格式仅适用于ha查询方式,使用时需在查询串的config子句中指定format:protobuf;如不需要protobuf格式,可指定format:json返回普通json结果。
client提供了Search和SearchBytes两种查询方式:Search方式返回的body为String格式数据,SearchBytes方式返回的body为byte[]格式数据。解析protobuf二进制结果需使用SearchBytes方式,该查询方法只能在aliyun-sdk-ha3engine1.3.2版本中使用。
protobuf转换必须使用
aliyun-sdk-ha3engine1.3.2
版本。
flatbuffers格式
Maven依赖
<properties>
<flatbuffers.java.version>2.0.7</flatbuffers.java.version>
</properties>
<dependency>
<groupId>com.google.flatbuffers</groupId>
<artifactId>flatbuffers-java</artifactId>
<version>${flatbuffers.java.version}</version>
</dependency>
生成flatbuffers文件
将Maven依赖引入项目中后,还需生成flatbuffers相应的文件,才能正常将向量检索版实例召回的结果解析成flatbuffers格式,生成flatbuffers文件步骤如下:
安装flatbuffers环境,推荐flatbuffers版本 【2.0.7】,可以通过命令
flatc --version
查看版本。在项目中定义描述文件,文件后缀为【.fbs】,可以根据需求自行更改,案例如下:
【SqlResult.fbs】
include "TwoDimTable.fbs"; namespace com.searchengine.example.demo.protobuf; table SqlErrorResult { partitionId: string (id:0); hostName: string (id:1); errorCode: uint (id:2); errorDescription: string (id:3); } table SqlResult { processTime: double (id:0); rowCount: uint32 (id:1); errorResult: SqlErrorResult (id:2); sqlTable: TwoDimTable (id:3); searchInfo: string (id:4); } root_type SqlResult;
【TwoDimTable.fbs】
namespace com.searchengine.example.demo.protobuf; // multi value table MultiInt8 { value: [byte]; } table MultiInt16 { value: [short]; } table MultiInt32 { value: [int]; } table MultiInt64 { value: [long]; } table MultiUInt8 { value: [ubyte]; } table MultiUInt16 { value: [ushort]; } table MultiUInt32 { value: [uint]; } table MultiUInt64 { value: [ulong]; } table MultiFloat { value: [float]; } table MultiDouble { value: [double]; } table MultiString { value: [string]; } // column base storage table Int8Column { value: [byte]; } table Int16Column { value: [short]; } table Int32Column { value: [int]; } table Int64Column { value: [long]; } table UInt8Column { value: [ubyte]; } table UInt16Column { value: [ushort]; } table UInt32Column { value: [uint]; } table UInt64Column { value: [ulong]; } table FloatColumn { value: [float]; } table DoubleColumn { value: [double]; } table StringColumn { value: [string]; } table MultiInt8Column { value: [MultiInt8]; } table MultiUInt8Column { value: [MultiUInt8]; } table MultiInt16Column { value: [MultiInt16]; } table MultiUInt16Column { value: [MultiUInt16]; } table MultiInt32Column { value: [MultiInt32]; } table MultiUInt32Column { value: [MultiUInt32]; } table MultiInt64Column { value: [MultiInt64]; } table MultiUInt64Column { value: [MultiUInt64]; } table MultiFloatColumn { value: [MultiFloat]; } table MultiDoubleColumn { value: [MultiDouble]; } table MultiStringColumn { value: [MultiString]; } // column type union ColumnType { Int8Column, Int16Column, Int32Column, Int64Column, UInt8Column, UInt16Column, UInt32Column, UInt64Column, FloatColumn, DoubleColumn, StringColumn, MultiInt8Column, MultiInt16Column, MultiInt32Column, MultiInt64Column, MultiUInt8Column, MultiUInt16Column, MultiUInt32Column, MultiUInt64Column, MultiFloatColumn, MultiDoubleColumn, MultiStringColumn, } table Column { name: string; value: ColumnType; } table TwoDimTable { rowCount: uint (id:0); columns: [Column] (id:1); }
在对应flatbuffers描述文件目录下执行命令
flatc --java_out=./ SqlResult.fbs
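。生成的Java文件所在的包路径由描述文件中的【namespace】指定(上文示例描述文件中的namespace为com.searchengine.example.demo.protobuf,请按项目实际包名修改,且SqlResult.fbs与TwoDimTable.fbs需保持一致),例如: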
namespace com.aliyun.demo.flatbuffers
。执行命令后,会自动在namespace指定的包中生成对应的Java文件,以上述为例,会在
com.aliyun.demo.flatbuffers
包下生成若干.java
的文件,在通过SDK解析向量检索版实例的召回结果时,可直接引用:import com.aliyun.ha3engine.Client; import com.aliyun.ha3engine.models.*; import com.aliyun.tea.TeaException; import com.aliyun.demo.flatbuffers.Int64Column; import com.aliyun.demo.flatbuffers.SqlResult; import org.junit.Before; import org.junit.Test; import java.nio.ByteBuffer; import java.util.*; public class DataFormatService { /** * 向量检索版client,暂时支持查询操作 */ private Client client; @Before public void clientInit() throws Exception { /* 初始化向量检索版client */ Config config = new Config(); // API域名,可在实例详情页>API入口 查看 config.setEndpoint(""); // 实例名称,可在实例详情页左上角查看,例:ha-cn-i7*****605 config.setInstanceId(""); // 用户名,可在实例详情页>网络信息 查看 config.setAccessUserName(""); // 密码,可在实例详情页>网络信息 修改 config.setAccessPassWord(""); //公网调用填写httpProxy config.setHttpProxy(""); client = new Client(config); } @Test public void flatBuffersFormat() throws Exception { try { /* 示例 : 使用 sql 查询串进行搜索 */ SearchRequestModel sqlQueryRRequestModel = new SearchRequestModel(); SearchQuery SqlRawQuery = new SearchQuery(); SqlRawQuery.setSql("query=select * from indexTableName&&kvpair=trace:INFO;format:flatbuffers"); sqlQueryRRequestModel.setQuery(SqlRawQuery); SearchBytesResponseModel sqlSearchBytesResponseModel = client.SearchBytes(sqlQueryRRequestModel); System.out.println("sql 查询串搜索结果:\n" + Arrays.toString(sqlSearchBytesResponseModel.getBody())); //转换为flatBuffers格式 SqlResult sqlResult = SqlResult.getRootAsSqlResult(ByteBuffer.wrap(sqlSearchBytesResponseModel.getBody())); /* 指定返回字段,例如返回数据共三个字段id、content和url,需要根据字段类型返回Column类型 此处获取第一个字段id对应的Column,id的类型为int64,所以使用Int64Column接收 若返回字段的类型为String,则使用StringColumn,以此类推 */ Int64Column int64Column = (Int64Column) sqlResult.sqlTable().columns(0).value(new Int64Column()); // 获取字段的名称,例如返回数据共三个字段id、content和url依次返回,则返回name为id String name = sqlResult.sqlTable().columns(0).name(); System.out.println("字段名称=" + name); // 获取字段对应的数据条数 int total = int64Column.valueLength(); System.out.println(name + "字段数据条数=" + total); // 遍历数据 if (total != 0) { for 
(int i = 0; i < total; i++) { // 获取字段的value,例如字段id对应的数据有n条,此处可获取到id对应的第n条数据 long value = int64Column.value(i); System.out.println(name + "字段第" + (i+1) + "条数据=" + value); } } } catch (TeaException e) { System.out.println(e.getCode()); System.out.println(e.getMessage()); Map<String, Object> abc = e.getData(); System.out.println(com.aliyun.teautil.Common.toJSONString(abc)); } } }
注意事项
flatbuffers格式仅适用于sql查询方式,使用时需在查询串的kvpair子句中指定format:flatbuffers;如不需要flatbuffers格式,可指定format:json返回普通json结果。
flatbuffers转换必须使用
aliyun-sdk-ha3engine1.3.2
版本。