Activeloop 深度记忆¶
如何在文档问答系统中实现命中率提升15%以上的RAG效果?
检索增强生成器(RAGs)近期获得了广泛关注。随着高级RAG技术和智能体的出现,它们拓展了RAGs所能实现的潜力边界。然而,若干挑战可能限制RAGs在生产环境中的集成应用。在实施生产级RAG方案时,需要重点考量的核心要素包括准确性(召回率)、成本以及延迟。对于基础用例,OpenAI的Ada模型配合简单的相似性搜索即可获得令人满意的结果。但若需实现更高的搜索准确率或召回率,则可能需要采用高级检索技术。这些方法可能涉及可变数据块大小、多次查询重写等操作,进而可能导致延迟和成本上升。Activeloop推出的Deep Memory功能(面向Deep Lake用户)通过引入微型神经网络层来解决这些问题,该网络层经过训练可将用户查询与语料库中的相关数据精准匹配。虽然该功能会在搜索时引入极小的延迟,但能使检索准确率提升高达27%,同时保持成本效益和使用简便性,无需任何额外的高级RAG技术。
%pip install llama-index-vector-stores-deeplake
%pip install llama-index-llms-openai
import nest_asyncio
import os
import getpass
nest_asyncio.apply()
!pip install deeplake beautifulsoup4 html2text tiktoken openai llama-index python-dotenv
在本教程中,我们将解析 DeepLake 文档,并构建一个能够回答文档相关问题的 RAG 系统。
本教程可分为以下几个部分:
- 数据集创建与导入
- 深度记忆训练
- DeepMemory 评估
- 深度记忆推理
1. 数据集创建与导入¶
让我们使用 BeautifulSoup 解析所有链接,并将其转换为 LlamaIndex 文档:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
def get_all_links(url, timeout=10):
    """Return the absolute URL of every hyperlink on the page at ``url``.

    Args:
        url: Address of the page to scan for ``<a href=...>`` tags.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        A list of absolute URLs in document order (duplicates are kept).
        An empty list is returned, after printing a message, when the page
        cannot be fetched or does not answer with HTTP 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Stay best-effort like the non-200 branch below: report and move on
        # instead of aborting the whole crawl on a network error.
        print(f"Failed to retrieve the page: {url} ({exc})")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve the page: {url}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    # Resolve each non-empty href against the page URL so relative links
    # become absolute.
    return [
        urljoin(url, a["href"])
        for a in soup.find_all("a", href=True)
        if a["href"]
    ]
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer
from llama_index.core import Document
def load_documents(url):
    """Fetch every page linked from ``url`` and return LlamaIndex documents.

    The links are collected with ``get_all_links``, downloaded with
    ``AsyncHtmlLoader``, passed through ``Html2TextTransformer``, and finally
    converted with ``Document.from_langchain_format``.
    """
    page_urls = get_all_links(url)
    raw_pages = AsyncHtmlLoader(page_urls).load()
    text_pages = Html2TextTransformer().transform_documents(raw_pages)
    return [Document.from_langchain_format(page) for page in text_pages]
# Load every page linked from the Deep Lake documentation landing page as
# LlamaIndex documents; `docs` is the corpus used for the rest of the tutorial.
docs = load_documents("https://docs.deeplake.ai/en/latest/")
Fetching pages: 100%|##########| 120/120 [00:13<00:00, 8.70it/s]
len(docs)
120
from llama_index.core.evaluation import generate_question_context_pairs
from llama_index.core import (
VectorStoreIndex,
SimpleDirectoryReader,
StorageContext,
)
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.llms.openai import OpenAI
# Prompt for the OpenAI key without echoing it and expose it through the
# environment.
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API token: ")
# An Activeloop token is needed if you are not signed in using the CLI: `activeloop login -u <USERNAME> -p <PASSWORD>`
os.environ["ACTIVELOOP_TOKEN"] = getpass.getpass(
"Enter your ActiveLoop API token: "
) # Get your API token from https://app.activeloop.ai, click on your profile picture in the top right corner, and select "API Tokens"
token = os.getenv("ACTIVELOOP_TOKEN")
# Open (or create) the Deep Lake dataset that backs the vector store.
vector_store = DeepLakeVectorStore(
dataset_path="hub://activeloop-test/deeplake_docs_deepmemory2",
overwrite=False, # set to True to overwrite the existing dataset
runtime={"tensor_db": True},
token=token,
)
Deep Lake Dataset in hub://activeloop-test/deeplake_docs_deepmemory2 already exists, loading from the storage
def create_modules(vector_store, docs=None, populate_vector_store=True):
    """Build the storage context, nodes and LLM used by the tutorial.

    Args:
        vector_store: Vector store to wrap in a ``StorageContext``.
        docs: Documents to split into nodes. ``None`` (the default) is
            treated as an empty list; a ``None`` sentinel is used instead of
            a mutable ``[]`` default so no list is shared between calls.
        populate_vector_store: When ``False`` no documents are parsed and an
            empty node list is returned.

    Returns:
        A ``(storage_context, nodes, llm)`` tuple.
    """
    if docs is None:
        docs = []

    if populate_vector_store:
        node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
        nodes = node_parser.get_nodes_from_documents(docs)
    else:
        nodes = []

    # By default the node ids are random uuids. To get the same ids on every
    # run, set them from the node's position.
    for idx, node in enumerate(nodes):
        node.id_ = f"node_{idx}"

    llm = OpenAI(model="gpt-4")
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    return storage_context, nodes, llm
# Parse `docs` into nodes and build the storage context and LLM in one call.
(
storage_context,
nodes,
llm,
) = create_modules(
docs=docs,
vector_store=vector_store,
# populate_vector_store=False, # uncomment this line to skip populating the vector store
)
# Build the index over the parsed nodes, stored in the Deep Lake vector store.
vector_index = VectorStoreIndex(nodes, storage_context=storage_context)
# Store-specific options such as `deep_memory` go through
# `vector_store_kwargs`, as in the evaluation and inference cells; a bare
# `deep_memory=True` keyword is not forwarded to the vector store query.
deep_memory_retriever = vector_index.as_retriever(
    similarity_top_k=4, vector_store_kwargs={"deep_memory": True}
)
2. 深度记忆训练¶
上文展示了 deep_memory 的整体工作原理。如你所见,要训练它需要相关性数据、查询语句以及语料数据(即我们想要查询的数据)。语料数据已在上一节中填充完毕;这里我们将生成问题和相关性数据。
- questions:一个字符串列表,其中每个字符串代表一个查询。
- relevance:包含每个问题对应真实答案的链接。可能有多个文档包含给定问题的答案,因此 relevance 的类型是 List[List[tuple[str, float]]],其中外层列表对应各个查询,内层列表对应该查询的相关文档。每个元组由一个 str 和一个 float 组成:字符串表示源文档的 id(对应数据集中的 id 张量),浮点数表示该文档与问题的相关程度。
from llama_index.core.evaluation import (
generate_question_context_pairs,
EmbeddingQAFinetuneDataset,
)
import random
def create_train_test_datasets(
    number_of_samples=600, llm=None, nodes=None, save=False
):
    """Sample nodes and generate train/test question-context datasets.

    Args:
        number_of_samples: Number of nodes to sample. If the corpus holds
            fewer nodes, all of them are used instead of failing inside
            ``random.sample``.
        llm: LLM passed to ``generate_question_context_pairs``.
        nodes: Nodes to sample from; must be non-empty.
        save: When true, both datasets are written as JSON files named after
            ``number_of_samples``.

    Returns:
        A ``(train_qa_dataset, test_qa_dataset)`` tuple built from an 80/20
        split of the sampled nodes.

    Raises:
        ValueError: If ``nodes`` is ``None`` or empty.
    """
    if not nodes:
        raise ValueError("`nodes` must be a non-empty sequence of nodes")

    # random.sample raises when asked for more items than the population
    # holds, so cap the sample size at the corpus size.
    sample_size = min(number_of_samples, len(nodes))
    random_indices = random.sample(range(len(nodes)), sample_size)

    # The first 80% of the shuffled indices train, the remainder test.
    ratio = int(len(random_indices) * 0.8)
    train_nodes = [nodes[i] for i in random_indices[:ratio]]
    test_nodes = [nodes[i] for i in random_indices[ratio:]]

    train_qa_dataset = generate_question_context_pairs(
        train_nodes, llm=llm, num_questions_per_chunk=1
    )
    test_qa_dataset = generate_question_context_pairs(
        test_nodes, llm=llm, num_questions_per_chunk=1
    )

    # [optional] save; file names keep the requested sample count so code
    # that reloads them by that name keeps working.
    if save:
        train_qa_dataset.save_json(
            f"deeplake_docs_{number_of_samples}_train.json"
        )
        test_qa_dataset.save_json(
            f"deeplake_docs_{number_of_samples}_test.json"
        )
    return train_qa_dataset, test_qa_dataset
# Generate question-context pairs for 600 sampled nodes and save both splits
# to JSON (save=True).
train_qa_dataset, test_qa_dataset = create_train_test_datasets(
number_of_samples=600, llm=llm, nodes=nodes, save=True
)
4%|▍ | 19/480 [02:25<1:04:00, 8.33s/it]
# Reload the datasets from the JSON files written by the previous cell.
train_qa_dataset = EmbeddingQAFinetuneDataset.from_json(
"deeplake_docs_600_train.json"
)
test_qa_dataset = EmbeddingQAFinetuneDataset.from_json(
"deeplake_docs_600_test.json"
)
def create_query_relevance(qa_dataset):
    """Convert a llama-index QA dataset to the Deep Memory training format.

    Args:
        qa_dataset: Object exposing ``queries`` (query id -> query text) and
            ``relevant_docs`` (query id -> list of relevant document ids).

    Returns:
        A ``(queries, relevance)`` tuple. ``queries`` is a list of query
        strings and ``relevance[i]`` is a list of ``(doc_id, 1)`` tuples for
        ``queries[i]``.

    Raises:
        KeyError: If a query id has no entry in ``relevant_docs``.
    """
    queries = []
    relevance = []
    # Look up the relevant docs by query id so both lists stay aligned even
    # when the two mappings are ordered differently, and keep every relevant
    # document rather than only the first one.
    for query_id, text in qa_dataset.queries.items():
        queries.append(text)
        relevance.append(
            [(doc_id, 1) for doc_id in qa_dataset.relevant_docs[query_id]]
        )
    return queries, relevance
# Convert both splits into the (queries, relevance) lists passed to Deep
# Memory training and evaluation below.
train_queries, train_relevance = create_query_relevance(train_qa_dataset)
test_queries, test_relevance = create_query_relevance(test_qa_dataset)
train_queries[:3]
['In the context of creating a bounding box tensor in a dataset, explain the significance of the "coords" argument and its keys "type" and "mode". What does the "type" key specify about the bounding box coordinates?', 'Explain the process of creating an intrinsics tensor and appending intrinsics matrices in the context of computer vision. What are the dimensions of the intrinsics parameters and what do they represent? Also, describe the concept of a Segmentation Mask Htype and its role in image processing.', 'In the context of querying for images in the MNIST Train Dataset using `ds.query`, what does the command "select * where labels == 0" signify and what is the expected output?']
train_relevance[:3]
[[('node_788', 1)], [('node_861', 1)], [('node_82', 1)]]
test_queries[:3]
['What are the steps to update the information of keypoints and connections in a tensor, and what types of data can be appended to keypoints?', 'What is the command to create a mesh tensor in DeepLake and what are the supported compressions? Also, explain how to append a ply file containing mesh data to this tensor.', 'What is a Sequence htype in the context of tensors and how does it function as a wrapper for other htypes? Provide examples.']
test_relevance[:3]
[[('node_933', 1)], [('node_671', 1)], [('node_471', 1)]]
from langchain.embeddings.openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
# Start a Deep Memory training job on the training queries and their
# relevance labels; the returned job id is used to check the job's status.
job_id = vector_store.vectorstore.deep_memory.train(
queries=train_queries,
relevance=train_relevance,
embedding_function=embeddings.embed_documents,
)
Starting DeepMemory training job Your Deep Lake dataset has been successfully created!
Preparing training data for deepmemory:
Creating 483 embeddings in 1 batches of size 483:: 100%|██████████| 1/1 [00:03<00:00, 3.67s/it]
DeepMemory training job started. Job ID: 65421a5003888c9ca36c72e8
# Check the status of the training job started above.
vector_store.vectorstore.deep_memory.status(job_id)
This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/adilkhan/deeplake_docs_deepmemory2 -------------------------------------------------------------- | 65421a5003888c9ca36c72e8 | -------------------------------------------------------------- | status | completed | -------------------------------------------------------------- | progress | eta: 12.2 seconds | | | recall@10: 67.01% (+18.56%) | -------------------------------------------------------------- | results | recall@10: 67.01% (+18.56%) | --------------------------------------------------------------
3. DeepMemory 评估¶
太棒了!训练带来了显著的改进!现在,让我们在测试集上评估它的性能。
# Evaluate on the held-out test queries, using the same embedding function
# as in training.
recalls = vector_store.vectorstore.deep_memory.evaluate(
queries=test_queries,
relevance=test_relevance,
embedding_function=embeddings.embed_documents,
)
info Wed Nov 1 09:32:44 2023 GMT Added distance metric `deepmemory_distance`.
Embedding queries took 0.95 seconds
---- Evaluating without Deep Memory ----
Recall@1: 12.5%
Recall@3: 23.3%
Recall@5: 30.8%
Recall@10: 50.8%
Recall@50: 94.2%
Recall@100: 95.8%
---- Evaluating with Deep Memory ----
Recall@1: 11.7%
Recall@3: 27.5%
Recall@5: 40.8%
Recall@10: 65.0%
Recall@50: 96.7%
Recall@100: 98.3%
令人印象深刻!我们在测试集上观察到 Recall@10 提升了约 14 个百分点(从 50.8% 提升到 65.0%)。接下来,让我们使用 RetrieverEvaluator 来检查 MRR(平均倒数排名)和命中率。
import pandas as pd
def display_results(eval_results):
    """Summarise retrieval evaluation results, one row per retriever.

    Args:
        eval_results: Mapping of a display name to an iterable of evaluation
            results. Each result must expose ``metric_vals_dict`` containing
            ``"hit_rate"`` and ``"mrr"`` values.

    Returns:
        A ``pandas.DataFrame`` with the columns ``retrievers``, ``hit_rate``
        and ``mrr``, holding the mean of each metric per retriever. Any
        number of retrievers is supported, including zero.
    """
    rows = []
    for name, eval_result in eval_results.items():
        full_df = pd.DataFrame([er.metric_vals_dict for er in eval_result])
        rows.append(
            {
                "retrievers": name,
                "hit_rate": full_df["hit_rate"].mean(),
                "mrr": full_df["mrr"].mean(),
            }
        )
    # Explicit columns keep the frame's shape stable even with no rows.
    return pd.DataFrame(rows, columns=["retrievers", "hit_rate", "mrr"])
评估深度记忆检索的性能:
from llama_index.core.evaluation import RetrieverEvaluator
deep_memory_retriever = vector_index.as_retriever(
similarity_top_k=10, vector_store_kwargs={"deep_memory": True}
)
dm_retriever_evaluator = RetrieverEvaluator.from_metric_names(
["mrr", "hit_rate"], retriever=deep_memory_retriever
)
dm_eval_results = await dm_retriever_evaluator.aevaluate_dataset(
test_qa_dataset, retriever=dm_retriever_evaluator
)
from llama_index.core.evaluation import RetrieverEvaluator
# Baseline: the same index and top-k, without the Deep Memory option.
naive_retriever = vector_index.as_retriever(similarity_top_k=10)
naive_retriever_evaluator = RetrieverEvaluator.from_metric_names(
["mrr", "hit_rate"], retriever=naive_retriever
)
naive_eval_results = await naive_retriever_evaluator.aevaluate_dataset(
test_qa_dataset, retriever=naive_retriever
)
# Label each result set "with ..." / "without ...". The label template must
# not contain its own "with", otherwise the rows read "with with Deep Memory"
# and "without with Deep Memory".
eval_results = {
    f"{mode} Deep Memory top-10 eval": eval_result
    for mode, eval_result in zip(
        ["with", "without"], [dm_eval_results, naive_eval_results]
    )
}
display_results(eval_results)
| | retrievers | hit_rate | mrr |
|---|---|---|---|
| 0 | with with Deep Memory top-10 eval | 0.650000 | 0.244775 |
| 1 | without with Deep Memory top-10 eval | 0.508333 | 0.215129 |
不仅命中率有所提升,MRR 也有所提高
4. 深度记忆推理¶
# Query engine that passes deep_memory=True to the vector store.
query_engine = vector_index.as_query_engine(
vector_store_kwargs={"deep_memory": True}, llm=llm
)
response = query_engine.query(
"How can you connect your own storage to the deeplake?"
)
print(response)
info Wed Nov 1 11:37:33 2023 GMT Can't find any metric in the dataset. You can connect your own storage to deeplake by using the `connect()` function in the deeplake API.
# Same question with deep_memory=False, for comparison with the cell above.
query_engine = vector_index.as_query_engine(
vector_store_kwargs={"deep_memory": False}, llm=llm
)
response = query_engine.query(
"How can you connect your own storage to the deeplake?"
)
print(response)
The context does not provide information on how to connect your own storage to Deep Lake.
根据我们的观察,若缺乏"深度记忆"机制,模型容易因检索到错误的上下文而产生不准确的结果。