在多租户海量索引场景下,自定义路由键是实现用户级数据隔离与精准查询的核心技术。通过将用户标识(例如ID)绑定为路由键,可保证每次查询仅针对目标用户数据,在保障了数据安全性的同时进一步提升查询性能。本文介绍如何使用自定义路由键功能。
前提条件
已安装Python环境,且Python版本为3.6及以上版本。
已安装opensearch-py,且opensearch-py版本为2.6.0及以上版本。
已开通Lindorm向量引擎。
已开通Lindorm搜索引擎。
已将客户端的IP地址加入到Lindorm白名单。
准备工作
在创建和使用向量索引前,您需要通过opensearch-py连接搜索引擎,连接方式如下:
from opensearchpy import OpenSearch, Object
import logging
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# With INFO logging enabled above, raise the 'opensearch' logger to WARNING
# so that its INFO records do not flood the output.
logging.getLogger('opensearch').setLevel(logging.WARNING)
def get_client() -> OpenSearch:
    """Build an OpenSearch client for the search engine endpoint.

    The host, user name and password are placeholders; replace them with the
    connection address and credentials of your own instance.

    Returns:
        An ``OpenSearch`` client configured without SSL or HTTP compression,
        with ``pool_maxsize=128`` and a 30-second timeout.
    """
    endpoint = {"host": "ld-t4n5668xk31ui****.lindorm.aliyuncs.com", "port": 30070}
    credentials = ("<username>", "<password>")
    return OpenSearch(
        hosts=[endpoint],
        http_auth=credentials,
        http_compress=False,
        use_ssl=False,
        pool_maxsize=128,
        timeout=30,  # unit: seconds
    )
其中host、username和password分别为搜索引擎的连接地址、默认用户名和默认密码,如何获取,请参见查看连接信息。
创建索引
仅支持纯向量数据查询
如果索引内数据量在万级以下,建议您使用flat索引。如果数据量为几万或几十万,建议使用hnsw索引。如果数据量达到了百万级别,建议使用ivfpq索引。您也可以根据业务需求,选择使用稀疏向量索引。
在自定义路由键的场景下,主键_id是全局唯一的,因此写入时必须保证_id全局唯一。
创建索引时需指定"knn_routing": true,表示开启自定义路由键功能。对于ivfpq索引,还需设置"meta": {"offline.construction": "true"}。
以下为创建4种索引的示例。
flat routing索引
def create_ugc_flat():
    """Create the ``vector_routing_flat_test`` index with a flat vector field.

    The index has 2 shards and sets ``knn_routing`` to True, which turns on
    the custom routing key feature. ``vector1`` is a 3-dimensional float
    vector using the ``flat`` method with ``l2`` distance and is excluded
    from ``_source``; ``field1`` is a ``long`` attribute.

    Uses the module-level ``client`` (not defined in this snippet;
    presumably the object returned by ``get_client()``).
    """
    vector_field = {
        "type": "knn_vector",
        "dimension": 3,
        "data_type": "float",
        "method": {
            "engine": "lvector",
            "name": "flat",
            "space_type": "l2",
            "parameters": {},
        },
    }
    index_settings = {
        "index": {
            "number_of_shards": 2,
            "knn": True,
            "knn_routing": True,
        }
    }
    index_mappings = {
        "_source": {"excludes": ["vector1"]},
        "properties": {
            "vector1": vector_field,
            "field1": {"type": "long"},
        },
    }
    client.indices.create(
        index='vector_routing_flat_test',
        body={"settings": index_settings, "mappings": index_mappings},
    )
hnsw routing索引
def create_ugc_hnsw():
    """Create the ``vector_routing_hnsw_test`` index with an hnsw vector field.

    The index has 2 shards and sets ``knn_routing`` to True, which turns on
    the custom routing key feature. ``vector1`` is a 3-dimensional vector
    using the ``hnsw`` method (``m=24``, ``ef_construction=500``) with ``l2``
    distance and is excluded from ``_source``; ``field1`` is a ``long``
    attribute.

    Uses the module-level ``client`` (not defined in this snippet;
    presumably the object returned by ``get_client()``).
    """
    hnsw_method = {
        "engine": "lvector",
        "name": "hnsw",
        "space_type": "l2",
        "parameters": {"m": 24, "ef_construction": 500},
    }
    field_mappings = {
        "vector1": {
            "type": "knn_vector",
            "dimension": 3,
            "method": hnsw_method,
        },
        "field1": {"type": "long"},
    }
    request_body = {
        "settings": {
            "index": {"number_of_shards": 2, "knn": True, "knn_routing": True},
        },
        "mappings": {
            "_source": {"excludes": ["vector1"]},
            "properties": field_mappings,
        },
    }
    client.indices.create(index='vector_routing_hnsw_test', body=request_body)
sparse_hnsw routing稀疏向量索引
def create_ugc_sparse_hnsw():
    """Create the ``vector_routing_sparse_test`` sparse-vector index.

    The index has 2 shards and sets ``knn_routing`` to True, which turns on
    the custom routing key feature. ``vector1`` has ``data_type``
    ``sparse_vector`` (no ``dimension`` is given) and uses the
    ``sparse_hnsw`` method (``m=24``, ``ef_construction=200``) with
    ``innerproduct`` similarity; it is excluded from ``_source``.
    ``field1`` is a ``long`` attribute.

    Uses the module-level ``client`` (not defined in this snippet;
    presumably the object returned by ``get_client()``).
    """
    sparse_method = {
        "engine": "lvector",
        "name": "sparse_hnsw",
        "space_type": "innerproduct",
        "parameters": {"m": 24, "ef_construction": 200},
    }
    field_mappings = {
        "vector1": {
            "type": "knn_vector",
            "data_type": "sparse_vector",
            "method": sparse_method,
        },
        "field1": {"type": "long"},
    }
    request_body = {
        "settings": {
            "index": {"number_of_shards": 2, "knn": True, "knn_routing": True},
        },
        "mappings": {
            "_source": {"excludes": ["vector1"]},
            "properties": field_mappings,
        },
    }
    client.indices.create(index='vector_routing_sparse_test', body=request_body)
ivfpq routing索引
在自定义路由键场景下,由于单个路由键的数据量通常较小(例如几十万条甚至更少),其ivfpq参数设置需区别于千万或亿级数据的通用策略。例如用于定义簇数量的参数nlist,在设置时可以遵循每个簇承载1,000~30,000条数据的原则,如果每个路由键的数据量为几千条,可以将nlist设置为2。
def create_ugc_ivfpq():
    """Create the ``vector_routing_ivfpq_test`` index with an ivfpq vector field.

    The index has 4 shards and sets ``knn_routing`` to True, which turns on
    the custom routing key feature. ``vector1`` is a 3-dimensional float
    vector using the ``ivfpq`` method with ``cosinesimil`` similarity and
    carries ``"meta": {"offline.construction": "true"}``, which ivfpq
    indexes require. It is excluded from ``_source``; ``field1`` is a
    ``long`` attribute.

    Uses the module-level ``client`` (not defined in this snippet;
    presumably the object returned by ``get_client()``).
    """
    ivfpq_parameters = {
        "m": 3,  # set to the same value as the vector dimension
        "nlist": 2,
        "centroids_use_hnsw": False,
        "centroids_hnsw_m": 48,
        "centroids_hnsw_ef_construct": 500,
        "centroids_hnsw_ef_search": 200,
    }
    vector_field = {
        "type": "knn_vector",
        "dimension": 3,
        "data_type": "float",
        "meta": {"offline.construction": "true"},
        "method": {
            "engine": "lvector",
            "name": "ivfpq",
            "space_type": "cosinesimil",
            "parameters": ivfpq_parameters,
        },
    }
    request_body = {
        "settings": {
            "index": {"number_of_shards": 4, "knn": True, "knn_routing": True},
        },
        "mappings": {
            "_source": {"excludes": ["vector1"]},
            "properties": {
                "vector1": vector_field,
                "field1": {"type": "long"},
            },
        },
    }
    client.indices.create(index='vector_routing_ivfpq_test', body=request_body)
支持纯向量数据查询和融合查询
如果您需要执行融合查询,创建索引时需在mappings中指定全文检索字段(例如以下示例中的text_field):
def create_ugc_hybrid_search():
    """Create the index used by the hybrid (vector + full-text) queries.

    The index is named ``vector_text_hybridSearch`` because that is the
    index searched by ``query_ugc_hybrid`` and ``query_ugc_hybrid_filter``;
    creating it under any other name would leave those queries pointing at
    an index that does not exist.

    The index has 2 shards and sets ``knn_routing`` to True, which turns on
    the custom routing key feature. Fields:

    * ``vector1`` - 3-dimensional float vector, ``hnsw`` method
      (``m=24``, ``ef_construction=500``), ``l2`` distance, excluded from
      ``_source``.
    * ``text_field`` - ``text`` field analyzed with ``ik_max_word``; this is
      the full-text field that hybrid queries match against.
    * ``field1`` - ``long`` attribute.

    Uses the module-level ``client`` (not defined in this snippet;
    presumably the object returned by ``get_client()``).
    """
    index_body = {
        "settings": {
            "index": {
                "number_of_shards": 2,
                "knn": True,
                "knn_routing": True,
            }
        },
        "mappings": {
            "_source": {
                "excludes": ["vector1"]
            },
            "properties": {
                "vector1": {
                    "type": "knn_vector",
                    "dimension": 3,
                    "data_type": "float",
                    "method": {
                        "engine": "lvector",
                        "name": "hnsw",
                        "space_type": "l2",
                        "parameters": {
                            "m": 24,
                            "ef_construction": 500,
                        },
                    },
                },
                "text_field": {
                    "type": "text",
                    "analyzer": "ik_max_word",
                },
                "field1": {
                    "type": "long"
                },
            },
        },
    }
    # Same index name as the one the hybrid query examples search.
    response = client.indices.create(index='vector_text_hybridSearch', body=index_body)
数据写入
单条写入
以下示例向flat索引vector_routing_flat_test中写入数据,并指定路由值为租户user123。
def write_ugc_index():
    """Write one document into ``vector_routing_flat_test`` for tenant ``user123``.

    The document is indexed with id ``1`` and routing value ``user123``.

    Uses the module-level ``client`` (not defined in this snippet;
    presumably the object returned by ``get_client()``).
    """
    document = {"field1": 1, "vector1": [1.2, 1.3, 1.4]}
    client.index(
        index='vector_routing_flat_test',
        id=1,
        routing='user123',
        body=document,
    )
批量写入
以下示例向hnsw索引vector_routing_hnsw_test中批量写入数据,并分别指定路由值为1和2。
def bulk_write_ugc_index():
    """Bulk-write two documents into ``vector_routing_hnsw_test``.

    Document ``2`` is written with routing value ``1`` and document ``3``
    with routing value ``2``. The payload is newline-delimited JSON: each
    action line is followed by its document line.

    A bulk request can succeed as a whole while individual items fail; the
    response then carries a truthy ``errors`` flag. That case is logged as a
    warning instead of being silently dropped.

    Uses the module-level ``client`` (not defined in this snippet;
    presumably the object returned by ``get_client()``).

    Returns:
        The response of ``client.bulk`` so callers can inspect per-item
        results.
    """
    ndjson_lines = (
        '{ "index" : { "_index" : "vector_routing_hnsw_test", "_id" : "2", "routing": "1"} }',
        '{ "field1" : 2, "vector1": [2.2, 2.3, 2.4]}',
        '{ "index" : { "_index" : "vector_routing_hnsw_test", "_id" : "3", "routing": "2" } }',
        '{ "field1" : 3, "vector1": [3.2, 3.3, 3.4]}',
    )
    # The bulk body must end with a newline.
    operations = "\n".join(ndjson_lines) + "\n"
    response = client.bulk(body=operations)
    if response.get("errors"):
        logging.warning(
            "bulk write to %s reported item-level errors: %s",
            "vector_routing_hnsw_test",
            response.get("items"),
        )
    return response
索引构建
构建ivfpq索引
仅ivfpq索引需手动构建。创建ivfpq索引时需要设置"meta": {"offline.construction": "true"},表示离线索引。
发起构建前务必确保索引已写入足够的数据量,必须大于256条且超过nlist的30倍。
def build_ugc_index():
    """Request a build of field ``vector1`` on ``vector_routing_ivfpq_test``.

    Sends a POST to ``/_plugins/_vector/index/build`` with
    ``removeOldIndex`` set to ``"true"`` and ``ivf_train_only`` set to
    ``"false"``.

    Uses the module-level ``client`` (not defined in this snippet;
    presumably the object returned by ``get_client()``).
    """
    # In real workloads, make sure enough data has been written to the index
    # before starting the build.
    build_request = {
        "indexName": "vector_routing_ivfpq_test",
        "fieldName": "vector1",
        "removeOldIndex": "true",
        "ivf_train_only": "false",
    }
    client.transport.perform_request(
        method="POST",
        url='/_plugins/_vector/index/build',
        body=build_request,
    )
参数说明
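| 参数 | 是否必填 | 说明 |
| --- | --- | --- |
| ivf_train_only | 是 | 无论设置为true还是false,都会利用现有数据训练码本。设置为true时,仅训练码本,不对现有数据生成索引;设置为false时,现存数据会根据训练的码本生成索引数据,且会保留现有的数据。 |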
清理训练数据,保留索引码本
如果将ivf_train_only设置为true,则必须执行该步骤。此时构建操作仅利用现有数据训练码本,不对现有数据生成索引。
其中,reserve_codebook=true为必填项,表示保存索引码本。清理训练数据后需重新写入数据才可以执行纯向量数据查询(knn检索)。
如果ivf_train_only设置为false,现存数据会根据训练的码本生成索引数据,且会保留现有的数据,您可跳过该步骤。
def truncate_reserve_codebook():
    """Truncate ``vector_routing_ivfpq_test`` while keeping its codebook.

    Sends a POST to ``/_truncate/vector_routing_ivfpq_test`` with the query
    parameter ``reserve_codebook=true``, no request body, and a JSON
    ``Content-Type`` header.

    Uses the module-level ``client`` (not defined in this snippet;
    presumably the object returned by ``get_client()``).
    """
    client.transport.perform_request(
        method='POST',
        url='/_truncate/vector_routing_ivfpq_test',
        params={'reserve_codebook': 'true'},
        body=None,
        headers={'Content-Type': 'application/json'},
    )
数据查询
纯向量数据查询
纯向量数据的查询可以通过knn结构实现。
flat routing索引
def query_ugc_flat():
    """Run a knn query on ``vector_routing_flat_test`` for routing ``user123``.

    Requests up to 20 hits (``size`` and ``k`` are both 20) for the query
    vector ``[2.3, 3.3, 4.4]`` on field ``vector1``.

    Uses the module-level ``client`` (not defined in this snippet;
    presumably the object returned by ``get_client()``).

    Returns:
        The response of ``client.search``, so the caller can read the hits.
    """
    knn_clause = {
        "vector1": {
            "vector": [2.3, 3.3, 4.4],
            "k": 20,
        }
    }
    query = {
        "size": 20,
        "query": {"knn": knn_clause},
    }
    return client.search(index='vector_routing_flat_test', body=query, routing='user123')
hnsw routing索引
def query_ugc_hnsw():
    """Run a knn query on ``vector_routing_hnsw_test`` for routing ``1``.

    Requests up to 10 hits (``size`` and ``k`` are both 10) for the query
    vector ``[2.2, 2.3, 2.4]`` on field ``vector1``, passing
    ``ef_search="100"`` through ``ext.lvector``.

    Uses the module-level ``client`` (not defined in this snippet;
    presumably the object returned by ``get_client()``).

    Returns:
        The response of ``client.search``, so the caller can read the hits.
    """
    knn_clause = {
        "vector1": {
            "vector": [2.2, 2.3, 2.4],
            "k": 10,
        }
    }
    query = {
        "size": 10,
        "query": {"knn": knn_clause},
        "ext": {"lvector": {"ef_search": "100"}},
    }
    return client.search(index='vector_routing_hnsw_test', body=query, routing='1')
sparse_hnsw routing稀疏向量索引
def query_ugc_sparse_hnsw():
    """Run a sparse-vector knn query on ``vector_routing_sparse_test``.

    The query vector is given as parallel ``indices`` / ``values`` lists.
    Up to 10 hits are requested (``size`` and ``k`` are both 10) for routing
    value ``1``, passing ``ef_search="100"`` through ``ext.lvector``.

    Uses the module-level ``client`` (not defined in this snippet;
    presumably the object returned by ``get_client()``).

    Returns:
        The response of ``client.search``, so the caller can read the hits.
    """
    sparse_vector = {"indices": [10, 45, 16], "values": [0.5, 0.5, 0.2]}
    query = {
        "size": 10,
        "query": {
            "knn": {
                "vector1": {
                    "vector": sparse_vector,
                    "k": 10,
                }
            }
        },
        "ext": {"lvector": {"ef_search": "100"}},
    }
    return client.search(index='vector_routing_sparse_test', body=query, routing='1')
ivfpq routing索引
def query_ugc_ivfpq():
    """Run a knn query on ``vector_routing_ivfpq_test`` for routing ``1``.

    Requests up to 10 hits (``size`` and ``k`` are both 10) for the query
    vector ``[2.2, 2.3, 2.4]`` on field ``vector1``. ``nprobe``,
    ``reorder_factor`` and ``client_refactor`` are passed through
    ``ext.lvector``.

    Uses the module-level ``client`` (not defined in this snippet;
    presumably the object returned by ``get_client()``).

    Returns:
        The response of ``client.search``, so the caller can read the hits.
    """
    ivfpq_search_options = {
        "nprobe": "2",
        "reorder_factor": "2",
        "client_refactor": "true",
    }
    query = {
        "size": 10,
        "query": {
            "knn": {
                "vector1": {
                    "vector": [2.2, 2.3, 2.4],
                    "k": 10,
                }
            }
        },
        "ext": {"lvector": ivfpq_search_options},
    }
    return client.search(index='vector_routing_ivfpq_test', body=query, routing='1')
融合查询
使用融合查询前请确保您的索引已支持纯向量数据查询和融合查询。
全文向量混合检索
def query_ugc_hybrid():
    """Run a hybrid full-text + vector query on ``vector_text_hybridSearch``.

    The knn clause on ``vector1`` carries a ``filter`` that requires both a
    full-text ``match`` on ``text_field`` and a ``term`` on ``_routing``.
    ``_source`` is disabled and up to 10 hits are requested. The
    ``ext.lvector`` section carries the hybrid-search settings
    (``hybrid_search_type``, ``rrf_rank_constant``,
    ``rrf_knn_weight_factor``).

    Uses the module-level ``client`` (not defined in this snippet;
    presumably the object returned by ``get_client()``).

    Returns:
        The response of ``client.search``, so the caller can read the hits.
    """
    text_and_routing = {
        "bool": {
            "must": [
                {
                    "match": {
                        # replace with the full-text field you want to search
                        "text_field": {
                            "query": "test1 test2"
                        }
                    }
                },
                {
                    "term": {
                        # replace with the routing value passed to
                        # client.search below
                        "_routing": "user123"
                    }
                },
            ]
        }
    }
    query = {
        "size": 10,
        "_source": False,
        "query": {
            "knn": {
                "vector1": {
                    "vector": [2.8, 2.3, 2.4],
                    "filter": {
                        "bool": {
                            "must": [text_and_routing]
                        }
                    },
                    "k": 10,
                }
            }
        },
        "ext": {
            "lvector": {
                "hybrid_search_type": "filter_rrf",
                "rrf_rank_constant": "60",
                "rrf_knn_weight_factor": "0.5",
            }
        },
    }
    return client.search(index='vector_text_hybridSearch', body=query, routing='user123')
向量+全文+属性过滤
def query_ugc_hybrid_filter():
    """Run a vector + full-text + attribute-filter query.

    Searches ``vector_text_hybridSearch``. The knn clause on ``vector1``
    carries a ``filter`` with two required parts: a full-text ``match`` on
    ``text_field`` combined with a ``term`` on ``_routing``, and a ``range``
    filter keeping documents whose ``field1`` is greater than 2.
    ``_source`` is disabled and up to 10 hits are requested. The
    ``ext.lvector`` section carries the hybrid-search settings plus
    ``filter_type="efficient_filter"``.

    Uses the module-level ``client`` (not defined in this snippet;
    presumably the object returned by ``get_client()``).

    Returns:
        The response of ``client.search``, so the caller can read the hits.
    """
    text_and_routing = {
        "bool": {
            "must": [
                {
                    "match": {
                        # replace with the full-text field you want to search
                        "text_field": {
                            "query": "test1 test2"
                        }
                    }
                },
                {
                    "term": {
                        # replace with the routing value passed to
                        # client.search below
                        "_routing": "user123"
                    }
                },
            ]
        }
    }
    attribute_filter = {
        "bool": {
            "filter": [
                {
                    "range": {
                        "field1": {
                            "gt": 2
                        }
                    }
                }
            ]
        }
    }
    query = {
        "size": 10,
        "_source": False,
        "query": {
            "knn": {
                "vector1": {
                    "vector": [2.8, 2.3, 2.4],
                    "filter": {
                        "bool": {
                            "must": [text_and_routing, attribute_filter]
                        }
                    },
                    "k": 10,
                }
            }
        },
        "ext": {
            "lvector": {
                "hybrid_search_type": "filter_rrf",
                "rrf_rank_constant": "60",
                "rrf_knn_weight_factor": "0.5",
                "filter_type": "efficient_filter",
            }
        },
    }
    return client.search(index='vector_text_hybridSearch', body=query, routing='user123')