本文将介绍向量检索版两种二进制数据召回结果的处理方式(protobuf和flatbuffers)。
protobuf格式
Maven依赖
<properties>
<grpc.version>1.6.1</grpc.version>
<protobuf.version>3.21.5</protobuf.version>
</properties>
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
<version>${protobuf.version}</version>
</dependency>
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java-util</artifactId>
<version>${protobuf.version}</version>
</dependency>
<dependency>
<groupId>io.grpc</groupId>
<artifactId>grpc-netty</artifactId>
<version>${grpc.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>io.grpc</groupId>
<artifactId>grpc-protobuf</artifactId>
<version>${grpc.version}</version>
<scope>provided</scope>
</dependency>
生成protobuf文件
将Maven依赖引入项目中后,还需生成protobuf相应的文件,才能正常将向量检索版实例召回的结果解析成protobuf格式,生成protobuf文件步骤如下:
安装protobuf环境,推荐proto版本 【3.21.5】,可以通过命令
protoc --version
查看版本。
在项目中定义描述文件,文件后缀为【.proto】,可根据需求自行更改,案例如下:
【Ha3ResultProto.proto】
// Header of Ha3ResultProto.proto.
// No java_package option is set, so protoc generates the Java outer class
// (Ha3ResultProto) into the package named by `package` below. It therefore has to
// match the import used by the Java example: com.aliyun.demo.protobuf.Ha3ResultProto.
syntax = "proto2";
package com.aliyun.demo.protobuf;
// C++-only option; it does not influence the generated Java code.
option cc_enable_arenas = true;
// A named attribute carrying repeated int64, double and bytes value lists.
message PBAttrKVPair {
optional string key = 1;
repeated int64 int64Value = 2;
repeated double doubleValue = 3;
repeated bytes bytesValue = 4;
}
// Tag stored in the typed attribute messages below (attribute vs. variable value).
enum ValueType {
ATTRIBUTE_TYPE = 0;
VARIABLE_VALUE_TYPE = 1;
}
// Named int64 values with a ValueType tag and an offset list.
// NOTE(review): offset presumably delimits per-document slices of int64Value - confirm.
message PBInt64Attribute {
optional string key = 1;
optional ValueType type = 2;
repeated int64 int64Value = 3;
repeated uint32 offset = 4;
}
// Same layout as PBInt64Attribute, holding double values.
message PBDoubleAttribute {
optional string key = 1;
optional ValueType type = 2;
repeated double doubleValue = 3;
repeated uint32 offset = 4;
}
// Same layout as PBInt64Attribute, holding bytes values.
message PBBytesAttribute {
optional string key = 1;
optional ValueType type = 2;
repeated bytes bytesValue = 3;
repeated uint32 offset = 4;
}
// Sort expression name plus a boolean flag.
// The spelling "Exprssion" is kept: renaming the message would rename the generated class.
message SortExprssionMeta {
optional bool sortFlag = 1;
optional string sortExprName = 2;
}
// Sort values with their dimension count and the metadata of the sort expressions.
message PBSortValues {
optional uint32 dimensionCount = 1;
repeated double sortValues = 2;
repeated SortExprssionMeta sortExprMetas = 3;
}
// Generic pair of a string key and a bytes value.
message PBKVPair {
optional string key = 1;
optional bytes value = 2;
}
// Root message of a search response; the Java example parses the response body as PBResult.
message PBResult
{
optional uint64 totalTime = 1;
optional PBHits hits = 2;
repeated PBAggregateResults aggResults = 3;
repeated PBErrorResult errorResults = 4;
optional bytes tracer = 5;
optional bool fromCache = 6;
optional PBMatchDocs matchDocs = 7;
repeated PBMetaMap metaMap = 8;
}
// Match-doc data expressed as repeated fields, one field per property.
// NOTE(review): the repeated fields are presumably index-aligned per document - confirm.
message PBMatchDocs
{
optional uint32 numMatchDocs = 1;
optional uint32 totalMatchDocs = 2;
repeated string clusterNames = 3;
repeated uint32 clusterIds = 4;
repeated uint32 hashids = 5;
repeated uint32 docids = 6;
repeated int32 fullIndexVersions = 7;
repeated int32 indexVersions = 8;
repeated uint64 pkHighers = 9;
repeated uint64 pkLowers = 10;
repeated uint32 searcherIps = 11;
repeated bytes tracers = 12;
repeated PBInt64Attribute int64AttrValues = 13;
repeated PBDoubleAttribute doubleAttrValues = 14;
repeated PBBytesAttribute bytesAttrValues = 15;
optional PBSortValues sortValues = 16;
}
// Hit list: hit counters, the individual hits, meta hit maps, covered percentage
// and sort expression metadata.
message PBHits
{
optional uint32 numhits = 1;
optional uint32 totalHits = 2;
repeated PBHit hit = 3;
repeated PBMetaHitMap metaHitMap = 4;
optional double coveredPercent = 5;
repeated SortExprssionMeta sortExprMetas = 6;
}
// A single hit: identifying fields, attribute / variable-value pairs, summary and
// property pairs, sort values and tracer data.
message PBHit {
optional string clusterName = 1;
optional uint32 hashid = 2;
optional uint32 docid = 3;
optional int32 fullIndexVersion = 4;
optional int32 indexVersion = 5;
optional uint64 pkHigher = 6;
optional uint64 pkLower = 7;
repeated PBAttrKVPair attributes = 8;
repeated PBAttrKVPair variableValues = 9;
repeated PBKVPair summary = 10;
repeated PBKVPair property = 11;
repeated string sortValues = 12;
optional bytes tracer = 13;
optional uint32 searcherIp = 14;
optional string rawPk = 15;
optional bytes summaryBytes = 16;
}
// Keyed list of key/value pairs attached to the hit list (PBHits.metaHitMap).
message PBMetaHitMap
{
optional string metaHitKey = 1;
repeated PBKVPair metaHitValue = 2;
}
// Aggregation results for one aggregate key.
message PBAggregateResults
{
optional string aggregateKey = 1;
repeated PBAggregateValue aggregateValue = 2;
}
// One group value together with its function-name / result pairs.
message PBAggregateValue
{
optional string groupValue = 1;
repeated PBKVPair funNameResultPair = 2;
}
// Error entry carrying partition id, host name, error code and description.
message PBErrorResult
{
optional string partitionId = 1;
optional string hostName = 2;
optional uint32 errorCode = 3;
optional string errorDescription = 4;
}
// Keyed list of key/value pairs attached to the whole result (PBResult.metaMap).
message PBMetaMap
{
optional string metaKey = 1;
repeated PBKVPair metaValue = 2;
}
在对应protobuf描述文件目录下执行命令
protoc --java_out=./ Ha3ResultProto.proto
生成Java文件路径可以在描述文件中package指定,例如:
package com.aliyun.demo.protobuf;
执行命令后,会自动在指定的package中生成对应的Java文件,以上述为例,会在
com.aliyun.demo.protobuf
包下生成一个Ha3ResultProto.java
的文件,在通过SDK解析向量检索版实例的召回结果时,可直接引用:
import com.aliyun.ha3engine.Client;
import com.aliyun.ha3engine.models.*;
import com.aliyun.tea.TeaException;
import com.aliyun.demo.protobuf.Ha3ResultProto;
import org.junit.Before;
import org.junit.Test;
import java.nio.ByteBuffer;
import java.util.*;
public class DataFormatService {

    /**
     * Client for the vector retrieval engine; only query operations are supported for now.
     */
    private Client client;

    /**
     * Creates the client from the connection settings before each test.
     *
     * @throws Exception if the client cannot be constructed
     */
    @Before
    public void clientInit() throws Exception {
        Config settings = new Config();
        // API endpoint: shown on the instance details page under "API entry".
        settings.setEndpoint("");
        // Instance name: shown at the top-left of the instance details page, e.g. ha-cn-i7*****605.
        settings.setInstanceId("");
        // User name: shown on the instance details page under "Network information".
        settings.setAccessUserName("");
        // Password: can be changed on the instance details page under "Network information".
        settings.setAccessPassWord("");
        // Set httpProxy when calling over the public network.
        settings.setHttpProxy("");
        client = new Client(settings);
    }

    /**
     * Runs an HA query that requests {@code format:protobuf}, prints the raw response bytes
     * and then prints the response parsed as {@code Ha3ResultProto.PBResult}.
     *
     * @throws Exception on any failure other than a {@code TeaException}, which is printed
     */
    @Test
    public void protobufFormat() throws Exception {
        try {
            // Build the request from an HA query string.
            SearchRequestModel request = new SearchRequestModel();
            SearchQuery query = new SearchQuery();
            query.setQuery("query=id:8148508889615505646&&config=start:0,hit:100,format:protobuf&&cluster=general");
            request.setQuery(query);

            SearchBytesResponseModel response = client.SearchBytes(request);
            System.out.println("ha查询串搜索结果:\n" + Arrays.toString(response.getBody()));

            // Decode the binary body into the generated protobuf message.
            Ha3ResultProto.PBResult parsed = Ha3ResultProto.PBResult.parseFrom(response.getBody());
            System.out.println("protobuf格式输出结果:\n" + parsed);
        } catch (TeaException e) {
            printTeaException(e);
        }
    }

    /**
     * Prints the code, the message and the JSON-serialized data of an SDK exception.
     */
    private static void printTeaException(TeaException e) {
        System.out.println(e.getCode());
        System.out.println(e.getMessage());
        Map<String, Object> errorData = e.getData();
        System.out.println(com.aliyun.teautil.Common.toJSONString(errorData));
    }
}
注意事项
protobuf格式只适用于ha查询方式下使用,使用时需定义format:protobuf,如不需要protobuf格式,可使用普通json,定义format:json即可
client提供了Search和SearchBytes两种查询方式:Search方式返回body为String格式数据,SearchBytes方式返回body为byte[]格式数据;其中SearchBytes查询方法只能在aliyun-sdk-ha3engine 1.3.2版本中使用
protobuf转换必须使用
aliyun-sdk-ha3engine 1.3.2
版本
flatbuffers格式
Maven依赖
<properties>
<flatbuffers.java.version>2.0.7</flatbuffers.java.version>
</properties>
<dependency>
<groupId>com.google.flatbuffers</groupId>
<artifactId>flatbuffers-java</artifactId>
<version>${flatbuffers.java.version}</version>
</dependency>
生成flatbuffers文件
将Maven依赖引入项目中后,还需生成flatbuffers相应的文件,才能正常将向量检索版实例召回的结果解析成flatbuffers格式,生成flatbuffers文件步骤如下:
安装flatbuffers环境,推荐flatbuffers版本 【2.0.7】,可以通过命令
flatc --version
查看版本。
在项目中定义描述文件,文件后缀为【.fbs】,可以根据需求自行更改,案例如下:
【SqlResult.fbs】
// Schema of the binary SQL query response. TwoDimTable comes from the included file.
include "TwoDimTable.fbs";
// NOTE(review): the Java example imports the generated classes from
// com.aliyun.demo.flatbuffers, which differs from this namespace - confirm which one
// is intended and keep it identical in TwoDimTable.fbs.
namespace com.searchengine.example.demo.protobuf;
// Error details: partition id, host name, error code and description.
table SqlErrorResult {
partitionId: string (id:0);
hostName: string (id:1);
errorCode: uint (id:2);
errorDescription: string (id:3);
}
// Root table: process time, row count, error result, the result table and search info.
// The Java example reads it with SqlResult.getRootAsSqlResult(...).
table SqlResult {
processTime: double (id:0);
rowCount: uint32 (id:1);
errorResult: SqlErrorResult (id:2);
sqlTable: TwoDimTable (id:3);
searchInfo: string (id:4);
}
root_type SqlResult;
【TwoDimTable.fbs】
// Column-oriented table used by SqlResult.sqlTable.
namespace com.searchengine.example.demo.protobuf;
// Multi-value wrappers: each table holds one vector of a scalar or string type.
table MultiInt8 { value: [byte]; }
table MultiInt16 { value: [short]; }
table MultiInt32 { value: [int]; }
table MultiInt64 { value: [long]; }
table MultiUInt8 { value: [ubyte]; }
table MultiUInt16 { value: [ushort]; }
table MultiUInt32 { value: [uint]; }
table MultiUInt64 { value: [ulong]; }
table MultiFloat { value: [float]; }
table MultiDouble { value: [double]; }
table MultiString { value: [string]; }
// Column storage: each single-value column table holds one vector of its element type.
table Int8Column { value: [byte]; }
table Int16Column { value: [short]; }
table Int32Column { value: [int]; }
table Int64Column { value: [long]; }
table UInt8Column { value: [ubyte]; }
table UInt16Column { value: [ushort]; }
table UInt32Column { value: [uint]; }
table UInt64Column { value: [ulong]; }
table FloatColumn { value: [float]; }
table DoubleColumn { value: [double]; }
table StringColumn { value: [string]; }
// Multi-value columns: each holds a vector of the matching Multi* wrapper table.
table MultiInt8Column { value: [MultiInt8]; }
table MultiUInt8Column { value: [MultiUInt8]; }
table MultiInt16Column { value: [MultiInt16]; }
table MultiUInt16Column { value: [MultiUInt16]; }
table MultiInt32Column { value: [MultiInt32]; }
table MultiUInt32Column { value: [MultiUInt32]; }
table MultiInt64Column { value: [MultiInt64]; }
table MultiUInt64Column { value: [MultiUInt64]; }
table MultiFloatColumn { value: [MultiFloat]; }
table MultiDoubleColumn { value: [MultiDouble]; }
table MultiStringColumn { value: [MultiString]; }
// Union of every column table above; a Column stores exactly one of them.
// Member order determines the generated type tags, so do not reorder.
union ColumnType {
Int8Column,
Int16Column,
Int32Column,
Int64Column,
UInt8Column,
UInt16Column,
UInt32Column,
UInt64Column,
FloatColumn,
DoubleColumn,
StringColumn,
MultiInt8Column,
MultiInt16Column,
MultiInt32Column,
MultiInt64Column,
MultiUInt8Column,
MultiUInt16Column,
MultiUInt32Column,
MultiUInt64Column,
MultiFloatColumn,
MultiDoubleColumn,
MultiStringColumn,
}
// A named column; value holds one ColumnType member
// (the Java example reads column 0 as an Int64Column).
table Column {
name: string;
value: ColumnType;
}
// Result table: a row count plus the vector of columns.
table TwoDimTable {
rowCount: uint (id:0);
columns: [Column] (id:1);
}
在对应flatbuffers描述文件目录下执行命令
flatc --java -o ./ SqlResult.fbs TwoDimTable.fbs
生成Java文件路径可以在描述文件中【namespace】指定(SqlResult.fbs与TwoDimTable.fbs中的namespace需保持一致,并与Java代码中import的包名相同),例如:
namespace com.aliyun.demo.flatbuffers;
执行命令后,会自动在指定的package中生成对应的Java文件,以上述为例,会在
com.aliyun.demo.flatbuffers
包下生成若干.java
的文件,在通过SDK解析向量检索版实例的召回结果时,可直接引用:
import com.aliyun.ha3engine.Client;
import com.aliyun.ha3engine.models.*;
import com.aliyun.tea.TeaException;
import com.aliyun.demo.flatbuffers.Int64Column;
import com.aliyun.demo.flatbuffers.SqlResult;
import org.junit.Before;
import org.junit.Test;
import java.nio.ByteBuffer;
import java.util.*;
public class DataFormatService {

    /**
     * Client for the vector retrieval engine; only query operations are supported for now.
     */
    private Client client;

    /**
     * Creates the client from the connection settings before each test.
     *
     * @throws Exception if the client cannot be constructed
     */
    @Before
    public void clientInit() throws Exception {
        Config config = new Config();
        // API endpoint: shown on the instance details page under "API entry".
        config.setEndpoint("");
        // Instance name: shown at the top-left of the instance details page, e.g. ha-cn-i7*****605.
        config.setInstanceId("");
        // User name: shown on the instance details page under "Network information".
        config.setAccessUserName("");
        // Password: can be changed on the instance details page under "Network information".
        config.setAccessPassWord("");
        // Set httpProxy when calling over the public network.
        config.setHttpProxy("");
        client = new Client(config);
    }

    /**
     * Runs a SQL query that requests {@code format:flatbuffers}, prints the raw response bytes,
     * decodes the body as {@code SqlResult} and prints every value of the first column.
     *
     * <p>The first column is read as an {@code Int64Column}. The union accessor does not verify
     * the stored column type, so pick the column class that matches the field type
     * (for example {@code StringColumn} for a string field).
     *
     * @throws Exception on any failure other than a {@code TeaException}, which is printed
     */
    @Test
    public void flatBuffersFormat() throws Exception {
        try {
            // Build the request from a SQL query string.
            SearchRequestModel sqlQueryRequestModel = new SearchRequestModel();
            SearchQuery sqlRawQuery = new SearchQuery();
            sqlRawQuery.setSql("query=select * from indexTableName&&kvpair=trace:INFO;format:flatbuffers");
            sqlQueryRequestModel.setQuery(sqlRawQuery);

            SearchBytesResponseModel sqlSearchBytesResponseModel = client.SearchBytes(sqlQueryRequestModel);
            System.out.println("sql 查询串搜索结果:\n" + Arrays.toString(sqlSearchBytesResponseModel.getBody()));

            // Decode the binary body into the generated FlatBuffers root table.
            SqlResult sqlResult = SqlResult.getRootAsSqlResult(ByteBuffer.wrap(sqlSearchBytesResponseModel.getBody()));

            // An absent table accessor returns null and columns(0) is only valid when at least
            // one column exists, so stop here instead of failing on an empty result.
            if (sqlResult.sqlTable() == null || sqlResult.sqlTable().columnsLength() == 0) {
                System.out.println("sql 查询结果中没有可解析的字段数据");
                return;
            }

            // Example: the result has the fields id, content and url. Column 0 is id, whose
            // type is int64, so it is received as an Int64Column.
            Int64Column int64Column = (Int64Column) sqlResult.sqlTable().columns(0).value(new Int64Column());
            // Field name of column 0 ("id" in the example above).
            String name = sqlResult.sqlTable().columns(0).name();
            System.out.println("字段名称=" + name);

            // The union accessor returns null when the column carries no value table.
            if (int64Column == null) {
                System.out.println(name + "字段没有数据");
                return;
            }

            // Number of values stored for this field.
            int total = int64Column.valueLength();
            System.out.println(name + "字段数据条数=" + total);

            // The loop body is simply skipped when total is 0, so no extra emptiness check is needed.
            for (int i = 0; i < total; i++) {
                long value = int64Column.value(i);
                System.out.println(name + "字段第" + (i + 1) + "条数据=" + value);
            }
        } catch (TeaException e) {
            System.out.println(e.getCode());
            System.out.println(e.getMessage());
            Map<String, Object> errorData = e.getData();
            System.out.println(com.aliyun.teautil.Common.toJSONString(errorData));
        }
    }
}
注意事项
flatbuffers格式只适用于sql查询方式下使用,使用时需定义format:flatbuffers,如不需要flatbuffers格式,可使用普通json,定义format:json即可
flatbuffers转换必须使用
aliyun-sdk-ha3engine 1.3.2
版本