向量检索版召回结果解析

本文将介绍向量检索版两种二进制数据召回结果的处理方式(protobuf和flatbuffers)。

protobuf格式

Maven依赖

<!-- Version properties referenced by the dependency declarations below. -->
<properties>
    <grpc.version>1.6.1</grpc.version>
    <protobuf.version>3.21.5</protobuf.version>
</properties>

<!-- protobuf-java artifact; its version is taken from ${protobuf.version}. -->
<dependency>
    <groupId>com.google.protobuf</groupId>
    <artifactId>protobuf-java</artifactId>
    <version>${protobuf.version}</version>
</dependency>
<!-- protobuf-java-util artifact, pinned to the same ${protobuf.version}. -->
<dependency>
    <groupId>com.google.protobuf</groupId>
    <artifactId>protobuf-java-util</artifactId>
    <version>${protobuf.version}</version>
</dependency>
<!-- grpc-netty at ${grpc.version}; "provided" scope means it is not packaged with the project. -->
<dependency>
    <groupId>io.grpc</groupId>
    <artifactId>grpc-netty</artifactId>
    <version>${grpc.version}</version>
    <scope>provided</scope>
</dependency>
<!-- grpc-protobuf at ${grpc.version}, also with "provided" scope. -->
<dependency>
    <groupId>io.grpc</groupId>
    <artifactId>grpc-protobuf</artifactId>
    <version>${grpc.version}</version>
    <scope>provided</scope>
</dependency>

生成protobuf文件

将Maven依赖引入项目中后,还需生成protobuf相应的文件,才能正常将向量检索版实例召回的结果解析成protobuf格式,生成protobuf文件步骤如下:

  1. 安装protobuf环境,推荐proto版本 【3.21.5】,可以通过命令 protoc --version查看版本

  2. 在项目中定义描述文件,文件后缀为【.proto】,可根据需求自行更改,案例如下:

【Ha3ResultProto.proto】

// Schema of the binary (protobuf) search result that the Java sample parses
// with Ha3ResultProto.PBResult.parseFrom.
syntax = "proto2";
// No java_package option is set, so protoc derives the Java package from this
// declaration. It must therefore match the package the Java sample imports
// (com.aliyun.demo.protobuf.Ha3ResultProto).
package com.aliyun.demo.protobuf;
// C++-only option; it has no effect on the generated Java code.
option cc_enable_arenas = true;

// One keyed attribute entry. The value is carried in three parallel repeated
// fields, one per supported primitive kind (int64, double, bytes).
message PBAttrKVPair {
  optional string key = 1;
  repeated int64 int64Value = 2;
  repeated double doubleValue = 3;
  repeated bytes bytesValue = 4;
}

// Tag used by the PB*Attribute messages to say which kind of value a column
// holds: an attribute or a variable value.
enum ValueType {
  ATTRIBUTE_TYPE = 0;
  VARIABLE_VALUE_TYPE = 1;
}

// A named column of int64 values together with its ValueType tag.
// NOTE(review): `offset` presumably marks where each document's slice of
// int64Value begins (for multi-value fields) - confirm against the engine docs.
message PBInt64Attribute {
  optional string key = 1;
  optional ValueType type = 2;
  repeated int64 int64Value = 3;
  repeated uint32 offset = 4;
}

// A named column of double values together with its ValueType tag.
// NOTE(review): `offset` presumably marks where each document's slice of
// doubleValue begins (for multi-value fields) - confirm against the engine docs.
message PBDoubleAttribute {
  optional string key = 1;
  optional ValueType type = 2;
  repeated double doubleValue = 3;
  repeated uint32 offset = 4;
}

// A named column of bytes values together with its ValueType tag.
// NOTE(review): `offset` presumably marks where each document's slice of
// bytesValue begins (for multi-value fields) - confirm against the engine docs.
message PBBytesAttribute {
  optional string key = 1;
  optional ValueType type = 2;
  repeated bytes bytesValue = 3;
  repeated uint32 offset = 4;
}

// Metadata for one sort expression: its name plus a boolean flag.
// The message name is spelled "Exprssion" in this schema; it is left as-is
// because it determines the name of the generated class.
// NOTE(review): the meaning of sortFlag (e.g. ascending vs. descending) is not
// visible here - confirm before relying on it.
message SortExprssionMeta {
  optional bool sortFlag = 1;
  optional string sortExprName = 2;
}

// Sort values for the matched documents, stored as one flat list of doubles
// alongside the metadata of the sort expressions that produced them.
// NOTE(review): sortValues is presumably laid out as dimensionCount values per
// document - confirm.
message PBSortValues {
  optional uint32 dimensionCount = 1;
  repeated double sortValues = 2;
  repeated SortExprssionMeta sortExprMetas = 3;
}

// Generic key/value pair: a string key with an opaque bytes value. Reused by
// the hit, aggregate and meta-map messages in this file.
message PBKVPair {
  optional string key = 1;
  optional bytes value = 2;
}

// Top-level message of a search response; this is the type the Java sample
// decodes with PBResult.parseFrom. It bundles the hits, aggregation results,
// error results, trace data, a from-cache flag, the columnar match docs and
// the meta maps.
message PBResult
{
  optional uint64 totalTime = 1;
  optional PBHits hits = 2;
  repeated PBAggregateResults aggResults = 3;
  repeated PBErrorResult errorResults = 4;
  optional bytes tracer = 5;
  optional bool fromCache = 6;
  optional PBMatchDocs matchDocs = 7;
  repeated PBMetaMap metaMap = 8;
}

// Matched documents in a column-oriented layout: every per-document property
// (cluster id, hash id, doc id, versions, primary-key halves, ...) is a
// separate repeated field rather than a nested per-document message.
// NOTE(review): the repeated fields are presumably index-aligned, i.e. entry i
// of each list belongs to the same document - confirm.
message PBMatchDocs
{
  optional uint32 numMatchDocs = 1;
  optional uint32 totalMatchDocs = 2;
  repeated string clusterNames = 3;
  repeated uint32 clusterIds = 4;
  repeated uint32 hashids = 5;
  repeated uint32 docids = 6;
  repeated int32 fullIndexVersions = 7;
  repeated int32 indexVersions = 8;
  repeated uint64 pkHighers = 9;
  repeated uint64 pkLowers = 10;
  repeated uint32 searcherIps = 11;
  repeated bytes tracers = 12;
  repeated PBInt64Attribute int64AttrValues = 13;
  repeated PBDoubleAttribute doubleAttrValues = 14;
  repeated PBBytesAttribute bytesAttrValues = 15;
  optional PBSortValues sortValues = 16;
}

// Row-oriented hit list: hit counters, the individual PBHit entries, meta-hit
// maps, a coverage figure and the sort-expression metadata.
message PBHits
{
  optional uint32 numhits = 1;
  optional uint32 totalHits = 2;
  repeated PBHit hit = 3;
  repeated PBMetaHitMap metaHitMap = 4;
  optional double coveredPercent = 5;
  repeated SortExprssionMeta sortExprMetas = 6;
}

// A single hit: where the document came from (cluster, hash id, doc id, index
// versions, searcher IP), its primary key (split into pkHigher/pkLower, plus
// the raw form), and its payload (attributes, variable values, summary,
// properties, sort values, trace data).
message PBHit {
  optional string clusterName = 1;
  optional uint32 hashid = 2;
  optional uint32 docid = 3;
  optional int32 fullIndexVersion = 4;
  optional int32 indexVersion = 5;
  optional uint64 pkHigher = 6;
  optional uint64 pkLower = 7;
  repeated PBAttrKVPair attributes = 8;
  repeated PBAttrKVPair variableValues = 9;
  repeated PBKVPair summary = 10;
  repeated PBKVPair property = 11;
  repeated string sortValues = 12;
  optional bytes tracer = 13;
  optional uint32 searcherIp = 14;
  optional string rawPk = 15;
  optional bytes summaryBytes = 16;
}

// One named group of key/value pairs attached to the hit list (PBHits.metaHitMap).
message PBMetaHitMap
{
  optional string metaHitKey = 1;
  repeated PBKVPair metaHitValue = 2;
}

// Result of one aggregation: the aggregate key and one PBAggregateValue per group.
message PBAggregateResults
{
  optional string aggregateKey = 1;
  repeated PBAggregateValue aggregateValue = 2;
}

// One aggregation group: the group value and a list of key/value pairs.
// NOTE(review): judging by the field name, each pair maps an aggregate
// function name to its result - confirm.
message PBAggregateValue
{
  optional string groupValue = 1;
  repeated PBKVPair funNameResultPair = 2;
}

// One error entry: the partition and host it refers to, a numeric error code
// and a textual description.
message PBErrorResult
{
  optional string partitionId = 1;
  optional string hostName = 2;
  optional uint32 errorCode = 3;
  optional string errorDescription = 4;
}

// One named group of key/value pairs attached to the whole result (PBResult.metaMap).
message PBMetaMap
{
  optional string metaKey = 1;
  repeated PBKVPair metaValue = 2;
}

  3. 在protobuf描述文件所在目录下执行命令 protoc --java_out=./ Ha3ResultProto.proto

  4. 生成的Java文件所在的包由描述文件中的package指定,例如:package com.aliyun.demo.protobuf

  5. 执行命令后,会自动在指定的package中生成对应的Java文件,以上述为例,会在com.aliyun.demo.protobuf包下生成一个Ha3ResultProto.java的文件,在通过SDK解析向量检索版实例的召回结果时,可直接引用:

import com.aliyun.ha3engine.Client;
import com.aliyun.ha3engine.models.*;
import com.aliyun.tea.TeaException;
import com.aliyun.demo.protobuf.Ha3ResultProto;
import org.junit.Before;
import org.junit.Test;

import java.nio.ByteBuffer;
import java.util.*;

public class DataFormatService {
	/**
	 * Client for the vector-retrieval-edition instance; this sample only runs queries with it.
	 */
	private Client client;

	/**
	 * Builds the client before each test.
	 * Every empty string below is a placeholder that has to be filled in with real instance data.
	 *
	 * @throws Exception propagated from the SDK calls made here
	 */
	@Before
	public void clientInit() throws Exception {
		Config clientConfig = new Config();

		// API domain name: see "API endpoint" on the instance details page.
		clientConfig.setEndpoint("");
		// Instance name: shown in the top-left corner of the instance details page, e.g. ha-cn-i7*****605.
		clientConfig.setInstanceId("");
		// User name: see "Network information" on the instance details page.
		clientConfig.setAccessUserName("");
		// Password: can be changed under "Network information" on the instance details page.
		clientConfig.setAccessPassWord("");
		// Fill in httpProxy when calling over the public network.
		clientConfig.setHttpProxy("");

		client = new Client(clientConfig);
	}

	/**
	 * Runs an HA query with {@code format:protobuf}, prints the raw response bytes,
	 * then decodes them into {@code Ha3ResultProto.PBResult} and prints the decoded message.
	 * A {@link TeaException} is reported on standard output instead of failing the test.
	 *
	 * @throws Exception any other failure, e.g. from decoding the response body
	 */
	@Test
	public void protobufFormat() throws Exception {
		try {
			// Build the HA query string request.
			SearchQuery query = new SearchQuery();
			query.setQuery("query=id:8148508889615505646&&config=start:0,hit:100,format:protobuf&&cluster=general");
			SearchRequestModel request = new SearchRequestModel();
			request.setQuery(query);

			// SearchBytes returns the body as byte[], which is what the protobuf parser needs.
			SearchBytesResponseModel response = client.SearchBytes(request);
			System.out.println("ha查询串搜索结果:\n" + Arrays.toString(response.getBody()));

			// Decode the binary body into the generated protobuf message.
			Ha3ResultProto.PBResult decoded = Ha3ResultProto.PBResult.parseFrom(response.getBody());
			System.out.println("protobuf格式输出结果:\n" + decoded);
		} catch (TeaException teaError) {
			printTeaError(teaError);
		}
	}

	/**
	 * Prints the code, message and JSON-serialized data of an SDK error to standard output.
	 */
	private static void printTeaError(TeaException teaError) {
		System.out.println(teaError.getCode());
		System.out.println(teaError.getMessage());
		Map<String, Object> errorData = teaError.getData();
		System.out.println(com.aliyun.teautil.Common.toJSONString(errorData));
	}
}

注意事项

  • protobuf格式只适用于ha查询方式下使用,使用时需定义format:protobuf,如不需要protobuf格式,可使用普通json,定义format:json即可

  • client提供了Search和SearchBytes两种查询方式:Search方式返回的body为String格式数据,SearchBytes方式返回的body为byte[]格式数据。解析protobuf二进制结果需要使用SearchBytes,而SearchBytes查询方法只能在aliyun-sdk-ha3engine 1.3.2版本中使用

  • protobuf转换必须使用aliyun-sdk-ha3engine 1.3.2版本

flatbuffers格式

Maven依赖

<!-- Version property referenced by the flatbuffers-java dependency below. -->
<properties>
	<flatbuffers.java.version>2.0.7</flatbuffers.java.version>
</properties>

<!-- flatbuffers-java artifact; its version is taken from ${flatbuffers.java.version}. -->
<dependency>
    <groupId>com.google.flatbuffers</groupId>
    <artifactId>flatbuffers-java</artifactId>
    <version>${flatbuffers.java.version}</version>
</dependency>

生成flatbuffers文件

将Maven依赖引入项目中后,还需生成flatbuffers相应的文件,才能正常将向量检索版实例召回的结果解析成flatbuffers格式,生成flatbuffers文件步骤如下:

  1. 安装flatbuffers环境,推荐flatbuffers版本 【2.0.7】,可以通过命令flatc --version查看版本

  2. 在项目中定义描述文件,文件后缀为【.fbs】,可以根据需求自行更改,案例如下:

【SqlResult.fbs】

// Pulls in the TwoDimTable definition used by SqlResult.sqlTable.
include "TwoDimTable.fbs";

// The namespace becomes the Java package of the generated classes.
// NOTE(review): the Java sample imports them from com.aliyun.demo.flatbuffers,
// so this presumably needs to be changed to that value - together with the
// namespace in TwoDimTable.fbs, since TwoDimTable is referenced unqualified
// from this file. Confirm before generating code.
namespace com.searchengine.example.demo.protobuf;

// Error details of a SQL response: the partition and host they refer to, a
// numeric error code and a textual description. The (id:N) attributes pin each
// field's slot explicitly.
table SqlErrorResult {
      partitionId: string (id:0);
      hostName: string (id:1);
      errorCode: uint (id:2);
      errorDescription: string (id:3);
}

// Root object of a SQL response: a processing-time figure, the row count, the
// error details, the result data as a column-oriented TwoDimTable, and a
// search-info string.
table SqlResult {
      processTime: double (id:0);
      rowCount: uint32 (id:1);
      errorResult: SqlErrorResult (id:2);
      sqlTable: TwoDimTable (id:3);
      searchInfo: string (id:4);
}

// Declares SqlResult as the root table; the Java sample reads a response with
// SqlResult.getRootAsSqlResult.
root_type SqlResult;

【TwoDimTable.fbs】

// The namespace becomes the Java package of the generated classes and must be
// identical to the one in SqlResult.fbs, which references TwoDimTable unqualified.
// NOTE(review): the Java sample imports from com.aliyun.demo.flatbuffers, so
// both files presumably need that namespace - confirm before generating code.
namespace com.searchengine.example.demo.protobuf;

// Multi-value cells: each table wraps a single vector of one scalar (or
// string) element type, so that a column can hold a list of values per entry.
table MultiInt8   { value: [byte];   }
table MultiInt16  { value: [short];  }
table MultiInt32  { value: [int];    }
table MultiInt64  { value: [long];   }
table MultiUInt8  { value: [ubyte];  }
table MultiUInt16 { value: [ushort]; }
table MultiUInt32 { value: [uint];   }
table MultiUInt64 { value: [ulong];  }
table MultiFloat  { value: [float];  }
table MultiDouble { value: [double]; }
table MultiString { value: [string]; }

// Column-based storage for single-value fields: one table per element type,
// each holding the whole column as a vector. The Java sample reads an int64
// field through Int64Column.valueLength() / value(i).
table Int8Column   { value: [byte];   }
table Int16Column  { value: [short];  }
table Int32Column  { value: [int];    }
table Int64Column  { value: [long];   }
table UInt8Column  { value: [ubyte];  }
table UInt16Column { value: [ushort]; }
table UInt32Column { value: [uint];   }
table UInt64Column { value: [ulong];  }
table FloatColumn  { value: [float];  }
table DoubleColumn { value: [double]; }
table StringColumn { value: [string]; }

// Column-based storage for multi-value fields: each column is a vector of the
// corresponding Multi* wrapper table, i.e. a vector of vectors.
table MultiInt8Column   { value: [MultiInt8];   }
table MultiUInt8Column  { value: [MultiUInt8];  }
table MultiInt16Column  { value: [MultiInt16];  }
table MultiUInt16Column { value: [MultiUInt16]; }
table MultiInt32Column  { value: [MultiInt32];  }
table MultiUInt32Column { value: [MultiUInt32]; }
table MultiInt64Column  { value: [MultiInt64];  }
table MultiUInt64Column { value: [MultiUInt64]; }
table MultiFloatColumn  { value: [MultiFloat];  }
table MultiDoubleColumn { value: [MultiDouble]; }
table MultiStringColumn { value: [MultiString]; }

// Union over every concrete column table; Column.value holds exactly one of them.
// Members are listed single-value types first, then multi-value types. Note
// that this order differs from the order in which the Multi*Column tables are
// declared. FlatBuffers derives each member's type tag from its position in
// this list, so the list must not be reordered.
union ColumnType {
      Int8Column,
      Int16Column,
      Int32Column,
      Int64Column,
      UInt8Column,
      UInt16Column,
      UInt32Column,
      UInt64Column,
      FloatColumn,
      DoubleColumn,
      StringColumn,
      MultiInt8Column,
      MultiInt16Column,
      MultiInt32Column,
      MultiInt64Column,
      MultiUInt8Column,
      MultiUInt16Column,
      MultiUInt32Column,
      MultiUInt64Column,
      MultiFloatColumn,
      MultiDoubleColumn,
      MultiStringColumn,
}

// One result column: its field name plus the typed values, stored as a
// ColumnType union. The reader has to know (or check) the concrete type to
// access the values, as the Java sample does with Int64Column.
table Column {
      name: string;
      value: ColumnType;
}

// Column-oriented result table: a row count and the list of columns. This is
// the type of SqlResult.sqlTable.
table TwoDimTable {
      rowCount: uint (id:0);
      columns: [Column] (id:1);
}

  3. 在flatbuffers描述文件所在目录下执行命令 flatc --java -o ./ SqlResult.fbs TwoDimTable.fbs

  4. 生成的Java文件所在的包由描述文件中的【namespace】指定,例如:namespace com.aliyun.demo.flatbuffers

  5. 执行命令后,会自动在指定的package中生成对应的Java文件,以上述为例,会在com.aliyun.demo.flatbuffers包下生成若干.java文件,在通过SDK解析向量检索版实例的召回结果时,可直接引用:

import com.aliyun.ha3engine.Client;
import com.aliyun.ha3engine.models.*;
import com.aliyun.tea.TeaException;
import com.aliyun.demo.flatbuffers.Int64Column;
import com.aliyun.demo.flatbuffers.SqlResult;
import org.junit.Before;
import org.junit.Test;

import java.nio.ByteBuffer;
import java.util.*;

public class DataFormatService {
	/**
	 * Client for the vector-retrieval-edition instance; this sample only runs queries with it.
	 */
	private Client client;


	/**
	 * Builds the client before each test.
	 * Every empty string below is a placeholder that has to be filled in with real instance data.
	 *
	 * @throws Exception propagated from the SDK calls made here
	 */
	@Before
	public void clientInit() throws Exception {
		Config config = new Config();

		// API domain name: see "API endpoint" on the instance details page.
		config.setEndpoint("");
		// Instance name: shown in the top-left corner of the instance details page, e.g. ha-cn-i7*****605.
		config.setInstanceId("");
		// User name: see "Network information" on the instance details page.
		config.setAccessUserName("");
		// Password: can be changed under "Network information" on the instance details page.
		config.setAccessPassWord("");
		// Fill in httpProxy when calling over the public network.
		config.setHttpProxy("");
		client = new Client(config);
	}

	/**
	 * Runs a SQL query with {@code format:flatbuffers}, prints the raw response bytes,
	 * decodes them as a {@code SqlResult} and prints every value of the first result column.
	 * <p>
	 * If the response carries no result table or no columns, a message is printed and the
	 * method returns instead of failing with a {@link NullPointerException}.
	 * A {@link TeaException} is reported on standard output instead of failing the test.
	 *
	 * @throws Exception any other failure raised by the SDK
	 */
	@Test
	public void flatBuffersFormat() throws Exception {
		try {
			// Build the SQL query string request.
			SearchRequestModel sqlQueryRequestModel = new SearchRequestModel();
			SearchQuery sqlRawQuery = new SearchQuery();
			sqlRawQuery.setSql("query=select * from indexTableName&&kvpair=trace:INFO;format:flatbuffers");
			sqlQueryRequestModel.setQuery(sqlRawQuery);

			// SearchBytes returns the body as byte[], which is what the flatbuffers reader needs.
			SearchBytesResponseModel sqlSearchBytesResponseModel = client.SearchBytes(sqlQueryRequestModel);
			System.out.println("sql 查询串搜索结果:\n" + Arrays.toString(sqlSearchBytesResponseModel.getBody()));

			// Wrap the bytes and obtain the flatbuffers root object.
			SqlResult sqlResult = SqlResult.getRootAsSqlResult(ByteBuffer.wrap(sqlSearchBytesResponseModel.getBody()));

			// Generated flatbuffers accessors return null for absent table fields, so check
			// that a table with at least one column exists before reading column 0.
			if (sqlResult.sqlTable() == null || sqlResult.sqlTable().columnsLength() == 0) {
				System.out.println("sql 查询结果中没有可解析的列数据");
				return;
			}

			/*
				Pick the column to read. If the query returns, say, the three fields id, content
				and url, each one must be read through the Column class matching its type.
				Column 0 is the id field here and id is an int64, hence Int64Column; a String
				field would use StringColumn, and so on.
				NOTE: the cast is not checked against the column's actual union type, so the
				class used here must match the real type of the field.
			*/
			Int64Column int64Column = (Int64Column) sqlResult.sqlTable().columns(0).value(new Int64Column());
			// Field name of the column; with fields id, content and url returned in that order this is "id".
			String name = sqlResult.sqlTable().columns(0).name();
			System.out.println("字段名称=" + name);

			// Number of values stored for this field.
			int total = int64Column.valueLength();
			System.out.println(name + "字段数据条数=" + total);

			// Print every value; the loop simply does not run when the column is empty.
			for (int i = 0; i < total; i++) {
				// value(i) is the i-th entry (0-based) of the field; it is printed 1-based below.
				long value = int64Column.value(i);
				System.out.println(name + "字段第" + (i+1) + "条数据=" + value);
			}
		} catch (TeaException e) {
			System.out.println(e.getCode());
			System.out.println(e.getMessage());
			Map<String, Object> errorData = e.getData();
			System.out.println(com.aliyun.teautil.Common.toJSONString(errorData));
		}
	}
}

注意事项

  • flatbuffers格式只适用于sql查询方式下使用,使用时需定义format:flatbuffers,如不需要flatbuffers格式,可使用普通json,定义format:json即可

  • flatbuffers转换必须使用aliyun-sdk-ha3engine 1.3.2版本