Azure Cosmos DB NoSQL Vector Store
This notebook gives a quick walkthrough of using AzureCosmosDBNoSqlVectorSearch to perform vector search in LlamaIndex.
If you're opening this notebook on Colab, you will probably need to install LlamaIndex 🦙 first.
In [ ]:
%pip install llama-index-embeddings-openai
%pip install llama-index-llms-azure-openai
%pip install llama-index-vector-stores-azurecosmosnosql
%pip install azure-cosmos
In [ ]:
!pip install llama-index
In [ ]:
import os

from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
Setting up Azure OpenAI
The first step is to configure the LLM and the embedding model. These models will be used to create embeddings for the documents loaded into the database and for LLM completions.
In [ ]:
# Replace the placeholder strings below with your Azure OpenAI resource values.
llm = AzureOpenAI(
    model="AZURE_OPENAI_MODEL",
    deployment_name="AZURE_OPENAI_DEPLOYMENT_NAME",
    azure_endpoint="AZURE_OPENAI_BASE",
    api_key="AZURE_OPENAI_KEY",
    api_version="AZURE_OPENAI_VERSION",
)

embed_model = AzureOpenAIEmbedding(
    model="AZURE_OPENAI_EMBEDDING_MODEL",
    deployment_name="AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME",
    azure_endpoint="AZURE_OPENAI_BASE",
    api_key="AZURE_OPENAI_KEY",
    api_version="AZURE_OPENAI_VERSION",
)
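Rather than hard-coding secrets in the notebook, you may prefer to read them from environment variables. A minimal sketch follows, assuming variables with the names shown are set in your environment; the names are illustrative, not required by the library.
In [ ]:
# A sketch of the same setup with credentials read from environment variables.
# The variable names here are assumptions; use whatever your deployment defines.
llm = AzureOpenAI(
    model=os.environ["AZURE_OPENAI_MODEL"],
    deployment_name=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
    azure_endpoint=os.environ["AZURE_OPENAI_BASE"],
    api_key=os.environ["AZURE_OPENAI_KEY"],
    api_version=os.environ["AZURE_OPENAI_VERSION"],
)

embed_model = AzureOpenAIEmbedding(
    model=os.environ["AZURE_OPENAI_EMBEDDING_MODEL"],
    deployment_name=os.environ["AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME"],
    azure_endpoint=os.environ["AZURE_OPENAI_BASE"],
    api_key=os.environ["AZURE_OPENAI_KEY"],
    api_version=os.environ["AZURE_OPENAI_VERSION"],
)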
In [ ]:
from llama_index.core import Settings

# Register the Azure OpenAI models as the global defaults for LlamaIndex.
Settings.llm = llm
Settings.embed_model = embed_model
Loading documents
In this example we will use the paul_graham essay, loaded with SimpleDirectoryReader.
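If you don't already have the essay on disk, you can fetch it first. The URL below is an assumption pointing at the copy kept in the LlamaIndex GitHub repository; adjust the target path (and the input_files path in the next cell) to match where you keep the file.
In [ ]:
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'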
In [ ]:
documents = SimpleDirectoryReader(
    input_files=[r"\docs\examples\data\paul_graham\paul_graham_essay.txt"]
).load_data()
print("Document ID:", documents[0].doc_id)
Creating the index
Here we establish the connection to Azure Cosmos DB NoSQL and create a vector store index.
In [ ]:
from azure.cosmos import CosmosClient, PartitionKey
from llama_index.vector_stores.azurecosmosnosql import (
    AzureCosmosDBNoSqlVectorSearch,
)
from llama_index.core import StorageContext

# Create the Cosmos DB client (replace the placeholders with your account values).
URI = "AZURE_COSMOSDB_URI"
KEY = "AZURE_COSMOSDB_KEY"
client = CosmosClient(URI, credential=KEY)

# Specify vector store properties. The embedding dimensions must match the
# embedding model configured above (e.g. 3072 for text-embedding-3-large).
indexing_policy = {
    "indexingMode": "consistent",
    "includedPaths": [{"path": "/*"}],
    "excludedPaths": [{"path": '/"_etag"/?'}],
    "vectorIndexes": [{"path": "/embedding", "type": "quantizedFlat"}],
}

vector_embedding_policy = {
    "vectorEmbeddings": [
        {
            "path": "/embedding",
            "dataType": "float32",
            "distanceFunction": "cosine",
            "dimensions": 3072,
        }
    ]
}

partition_key = PartitionKey(path="/id")
cosmos_container_properties_test = {"partition_key": partition_key}
cosmos_database_properties_test = {}

# Create the vector store, letting it create the container for us.
store = AzureCosmosDBNoSqlVectorSearch(
    cosmos_client=client,
    vector_embedding_policy=vector_embedding_policy,
    indexing_policy=indexing_policy,
    cosmos_container_properties=cosmos_container_properties_test,
    cosmos_database_properties=cosmos_database_properties_test,
    create_container=True,
)

# Build the index: this embeds the documents and writes them to Cosmos DB.
storage_context = StorageContext.from_defaults(vector_store=store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)
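Before wiring the index into a query engine, it can help to sanity-check retrieval on its own. The sketch below uses the standard LlamaIndex as_retriever API; the query string and similarity_top_k value are arbitrary choices for illustration.
In [ ]:
# Retrieve the top-3 most similar chunks and print their similarity scores.
retriever = index.as_retriever(similarity_top_k=3)
for node_with_score in retriever.retrieve("What did the author work on?"):
    print(node_with_score.score, node_with_score.node.get_content()[:100])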
Querying the index
Now we can use the index to ask questions.
In [ ]:
query_engine = index.as_query_engine()
response = query_engine.query("What did the author love working on?")
In [ ]:
import textwrap

print(textwrap.fill(str(response), 100))
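The response object also exposes the source nodes the answer was synthesized from, which is useful for verifying that results really came from the Cosmos DB container. A short sketch:
In [ ]:
# Inspect which stored chunks grounded the answer.
for source in response.source_nodes:
    print(source.node_id, source.score)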