递归检索器 + 节点引用 + Braintrust¶
本指南展示如何通过递归检索遍历节点关系,并基于"引用"获取节点。
节点引用是一个强大的概念。初次执行检索时,您可能希望检索的是引用而非原始文本。您可以让多个引用指向同一个节点。
本指南将探讨节点引用的几种不同用法:
- 分块引用:不同大小的文本块指向更大的文本块
- 元数据引用:摘要和生成的问题指向更大的文本块
我们使用 Braintrust 评估递归检索+节点引用方法的效果。Braintrust 是企业级AI产品开发平台,涵盖评估、提示词调试和数据管理等功能,可帮助企业轻松集成AI技术。
以下是示例评估看板:
%pip install llama-index-llms-openai
%pip install llama-index-readers-file
%load_ext autoreload
%autoreload 2
# NOTE: Replace YOUR_OPENAI_API_KEY with your OpenAI API Key and YOUR_BRAINTRUST_API_KEY with your BrainTrust API key. Do not put it in quotes.
# Signup for Braintrust at https://braintrustdata.com/ and get your API key at https://www.braintrustdata.com/app/braintrustdata.com/settings/api-keys
# NOTE: Replace YOUR_OPENAI_KEY with your OpenAI API Key and YOUR_BRAINTRUST_API_KEY with your BrainTrust API key. Do not put it in quotes.
%env OPENAI_API_KEY=
%env BRAINTRUST_API_KEY=
%env TOKENIZERS_PARALLELISM=true # This is needed to avoid a warning message from Chroma
%pip install -U llama_hub llama_index braintrust autoevals pypdf pillow transformers torch torchvision
加载数据与初始化配置¶
本节将下载Llama 2论文并创建初始节点集(分块大小为1024)。
!mkdir data
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"
from pathlib import Path
from llama_index.readers.file import PDFReader
from llama_index.core.response.notebook_utils import display_source_node
from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import VectorStoreIndex
from llama_index.llms.openai import OpenAI
import json
from llama_index.core import Document
from llama_index.core.embeddings import resolve_embed_model
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import IndexNode

# Read the downloaded PDF into the documents returned by the reader.
loader = PDFReader()
docs0 = loader.load_data(file=Path("./data/llama2.pdf"))

# Join the text of every loaded document into one Document so the splitter
# below works on the paper as a single piece of text.
doc_text = "\n\n".join(part.get_content() for part in docs0)
docs = [Document(text=doc_text)]

# Initial node set, split with chunk_size=1024.
node_parser = SentenceSplitter(chunk_size=1024)
base_nodes = node_parser.get_nodes_from_documents(docs)

# Assign fixed ids ("node-0", "node-1", ...) so that references to a base node
# are stable across runs.
for position, base in enumerate(base_nodes):
    base.id_ = f"node-{position}"

embed_model = resolve_embed_model("local:BAAI/bge-small-en")
llm = OpenAI(model="gpt-3.5-turbo")
基准检索器¶
定义一个基准检索器,它仅通过嵌入相似度获取前 k 个原始文本节点。
# Baseline: index the base nodes directly and fetch the top 2 by similarity.
base_index = VectorStoreIndex(base_nodes, embed_model=embed_model)
base_retriever = base_index.as_retriever(similarity_top_k=2)

# Show what the baseline retrieves for the sample question.
retrievals = base_retriever.retrieve(
    "Can you tell me about the key concepts for safety finetuning"
)
for hit in retrievals:
    display_source_node(hit, source_length=1500)

# Answer the same question with a query engine built on the baseline retriever.
query_engine_base = RetrieverQueryEngine.from_args(base_retriever, llm=llm)
response = query_engine_base.query(
    "Can you tell me about the key concepts for safety finetuning"
)
print(str(response))
分块引用:子分块指向父分块的引用关系¶
本示例展示如何构建一个由较小分块指向较大父分块的引用图。
在查询时,我们会检索较小的分块,但同时会跟随引用获取更大的父分块。这种方式能为合成阶段提供更丰富的上下文信息。
# Split every base node again at several smaller chunk sizes.
sub_chunk_sizes = [128, 256, 512]
sub_node_parsers = [SentenceSplitter(chunk_size=c) for c in sub_chunk_sizes]

all_nodes = []
for base_node in base_nodes:
    for splitter in sub_node_parsers:
        sub_nodes = splitter.get_nodes_from_documents([base_node])
        # Each sub-chunk becomes an IndexNode whose reference is the parent's id.
        sub_inodes = [
            IndexNode.from_text_node(sn, base_node.node_id) for sn in sub_nodes
        ]
        all_nodes.extend(sub_inodes)

    # The base node itself is added too, as an IndexNode referencing its own id.
    original_node = IndexNode.from_text_node(base_node, base_node.node_id)
    all_nodes.append(original_node)

# Id -> node lookup, plus the vector index and retriever built over all nodes.
all_nodes_dict = {n.node_id: n for n in all_nodes}
vector_index_chunk = VectorStoreIndex(all_nodes, embed_model=embed_model)
vector_retriever_chunk = vector_index_chunk.as_retriever(similarity_top_k=2)
# Recursive retriever: starts at the "vector" retriever and uses
# all_nodes_dict as the node lookup table.
retriever_chunk = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever_chunk},
    node_dict=all_nodes_dict,
    verbose=True,
)

# Show what the chunk-reference retriever returns for the sample question.
nodes = retriever_chunk.retrieve(
    "Can you tell me about the key concepts for safety finetuning"
)
for found in nodes:
    display_source_node(found, source_length=2000)

# Answer the same question with a query engine built on this retriever.
query_engine_chunk = RetrieverQueryEngine.from_args(retriever_chunk, llm=llm)
response = query_engine_chunk.query(
    "Can you tell me about the key concepts for safety finetuning"
)
print(str(response))
元数据引用:摘要与生成式问题(指向更大文本块)¶
在本示例中,我们将展示如何定义引用源节点的附加上下文。
这些附加上下文既包含摘要,也包含生成的问题。
在查询时,我们虽然检索较小的文本块,但会遵循指向更大文本块的引用关系。这种方法能为信息合成提供更丰富的上下文。
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import IndexNode
from llama_index.core.extractors import (
SummaryExtractor,
QuestionsAnsweredExtractor,
)
extractors = [
    SummaryExtractor(summaries=["self"], show_progress=True),
    QuestionsAnsweredExtractor(questions=5, show_progress=True),
]

# Run every extractor across the base nodes and merge the results per node, so
# that metadata_dicts[i] is ONE dictionary holding all extracted fields for
# base_nodes[i].
#
# Each extract() call returns one dictionary per node containing only that
# extractor's field. Concatenating those lists (list.extend) would produce an
# entry per (extractor, node) pair, and the code further down, which reads both
# "section_summary" and "questions_this_excerpt_can_answer" from every entry
# and indexes base_nodes by the entry's position, would then fail.
metadata_dicts = [{} for _ in base_nodes]
for extractor in extractors:
    for merged, extracted in zip(metadata_dicts, extractor.extract(base_nodes)):
        merged.update(extracted)
# cache metadata dicts
def save_metadata_dicts(path, dicts=None):
    """Write metadata dictionaries to ``path`` as JSON Lines.

    Each dictionary is serialized with ``json.dumps`` and written on its own
    line. An existing file at ``path`` is overwritten.

    Args:
        path: Destination file path.
        dicts: Iterable of JSON-serializable dictionaries to write. When
            omitted, the module-level ``metadata_dicts`` is written, which is
            what the one-argument call has always done.
    """
    records = metadata_dicts if dicts is None else dicts
    with open(path, "w") as fp:
        fp.writelines(json.dumps(record) + "\n" for record in records)
def load_metadata_dicts(path):
    """Read metadata dictionaries back from a JSON Lines file.

    Args:
        path: Path of a file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r") as fp:
        # Blank lines carry no record and would make json.loads raise, so they
        # are skipped; the file is streamed line by line instead of being read
        # into a list first.
        return [json.loads(line) for line in fp if line.strip()]
# Cache the extracted metadata on disk, then continue from the cached copy.
save_metadata_dicts("data/llama2_metadata_dicts.jsonl")
metadata_dicts = load_metadata_dicts("data/llama2_metadata_dicts.jsonl")

# all_nodes = copies of the source nodes, plus one question node and one
# summary node per metadata entry, each referencing its source node by id.
import copy

all_nodes = copy.deepcopy(base_nodes)
for idx, extracted in enumerate(metadata_dicts):
    question_text = extracted["questions_this_excerpt_can_answer"]
    source_id = base_nodes[idx].node_id
    inode_q = IndexNode(text=question_text, index_id=source_id)
    inode_s = IndexNode(text=extracted["section_summary"], index_id=source_id)
    all_nodes.append(inode_q)
    all_nodes.append(inode_s)

# Id -> node lookup used by the recursive retriever.
all_nodes_dict = {n.node_id: n for n in all_nodes}
## Load index into vector index
from llama_index.core import VectorStoreIndex
from llama_index.llms.openai import OpenAI

llm = OpenAI(model="gpt-3.5-turbo")

# Embed with the same local model as base_index and vector_index_chunk so the
# retrievers are compared on equal footing. Without embed_model, this index
# would fall back to the library's default embedding model instead.
vector_index_metadata = VectorStoreIndex(all_nodes, embed_model=embed_model)
vector_retriever_metadata = vector_index_metadata.as_retriever(
    similarity_top_k=2
)

# Recursive retriever: starts at the "vector" retriever and uses
# all_nodes_dict as the node lookup table.
retriever_metadata = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever_metadata},
    node_dict=all_nodes_dict,
    verbose=True,
)
# Show what the metadata-reference retriever returns for the sample question.
nodes = retriever_metadata.retrieve(
    "Can you tell me about the key concepts for safety finetuning"
)
for found in nodes:
    display_source_node(found, source_length=2000)

# Answer the same question with a query engine built on this retriever.
query_engine_metadata = RetrieverQueryEngine.from_args(
    retriever_metadata,
    llm=llm,
)
response = query_engine_metadata.query(
    "Can you tell me about the key concepts for safety finetuning"
)
print(str(response))
评估¶
我们使用Braintrust评估递归检索+节点引用方法的效果。Braintrust是企业级AI产品开发套件,从评估测试到提示词调试,再到数据管理,它能帮助企业消除AI集成过程中的不确定性和繁琐操作。
我们同时评估了文本块引用和元数据引用两种方式,通过嵌入相似度查找来检索引用节点。我们将这两种方法与直接获取原始节点的基线检索器进行对比,采用命中率(hit-rate)和平均倒数排名(MRR)作为评估指标。
可查看以下示例评估面板:
数据集生成¶
我们首先从文本块集合中生成问题数据集。
from llama_index.core.evaluation import (
generate_question_context_pairs,
EmbeddingQAFinetuneDataset,
)
import nest_asyncio
# Patch asyncio via nest_asyncio before generating the dataset
# (presumably so async code can run inside the notebook's event loop - confirm).
nest_asyncio.apply()
# Generate question/context pairs from the base nodes and save them as JSON.
eval_dataset = generate_question_context_pairs(base_nodes)
eval_dataset.save_json("data/llama2_eval_dataset.json")
# optional: reload the dataset from the JSON file written above
eval_dataset = EmbeddingQAFinetuneDataset.from_json(
"data/llama2_eval_dataset.json"
)
结果对比¶
我们对每个检索器进行了评估,以测量命中率(hit rate)和平均倒数排名(MRR)。
结果表明,带有节点引用(无论是文本块引用还是元数据引用)的检索器,其表现通常优于直接检索原始文本块的方式。
import pandas as pd
# Evaluation-time similarity top k: higher than the similarity_top_k=2 used by
# the demo retrievers earlier in this file.
top_k = 10
def display_results(names, results_arr):
    """Display the mean hit rate and MRR of each retriever as one table.

    Args:
        names: Retriever labels, paired positionally with ``results_arr``.
        results_arr: For each retriever, an iterable of evaluation results;
            every result exposes ``metric_vals_dict`` with the keys
            "hit_rate" and "mrr".

    Returns:
        None. The summary table is rendered with IPython's ``display`` when
        IPython can be imported, and printed otherwise.
    """
    # `display` is only a global inside IPython/Jupyter sessions; import it
    # explicitly and fall back to print so the function also works elsewhere.
    try:
        from IPython.display import display as show
    except ImportError:
        show = print

    hit_rates = []
    mrrs = []
    # zip keeps the original pairing semantics: extra entries on either side
    # are ignored here.
    for _, eval_results in zip(names, results_arr):
        results_df = pd.DataFrame(
            [eval_result.metric_vals_dict for eval_result in eval_results]
        )
        hit_rates.append(results_df["hit_rate"].mean())
        mrrs.append(results_df["mrr"].mean())

    final_df = pd.DataFrame(
        {"retrievers": names, "hit_rate": hit_rates, "mrr": mrrs}
    )
    show(final_df)
让我们定义一些评分函数并设置数据集变量 data。
queries = eval_dataset.queries
relevant_docs = eval_dataset.relevant_docs

# One evaluation case per query: the query text as "input" and the entry of
# relevant_docs stored under the same key as "expected".
data = [
    {"input": question, "expected": relevant_docs[query_id]}
    for query_id, question in queries.items()
]
def hitRateScorer(input, expected, output=None):
    """Score 1 when at least one retrieved id is an expected id, else 0.

    Args:
        input: The query; not used by this scorer.
        expected: Container of relevant node ids.
        output: Iterable of retrieved node ids. ``None`` (the default) is
            treated as "nothing retrieved" instead of raising TypeError.

    Returns:
        1 on a hit, 0 otherwise.
    """
    if output is None:
        return 0
    return 1 if any(node_id in expected for node_id in output) else 0
def mrrScorer(input, expected, output=None):
    """Return the reciprocal rank of the first expected id in ``output``.

    Args:
        input: The query; not used by this scorer.
        expected: Container of relevant node ids.
        output: Retrieved node ids in rank order. ``None`` (the default) is
            treated as "nothing retrieved" instead of raising TypeError.

    Returns:
        ``1 / rank`` (rank counted from 1) of the first id found in
        ``expected``, or 0 when none of the ids is expected.
    """
    if output is None:
        return 0
    for rank, node_id in enumerate(output, start=1):
        if node_id in expected:
            return 1 / rank
    return 0
import braintrust
# Evaluate the chunk retriever
# Use the shared top_k setting defined earlier in this file instead of
# repeating the literal value.
vector_retriever_chunk = vector_index_chunk.as_retriever(
    similarity_top_k=top_k
)
# NOTE(review): all_nodes_dict was last rebound while building the metadata
# nodes, so it is not the dictionary created alongside vector_index_chunk.
# Confirm that looking nodes up by base-node id is all this retriever needs.
retriever_chunk = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever_chunk},
    node_dict=all_nodes_dict,
    verbose=False,
)
def runChunkRetriever(input, hooks):
    """Evaluation task: retrieve with ``retriever_chunk`` and return node ids.

    Args:
        input: Query passed to ``retriever_chunk.retrieve``.
        hooks: Not used by this task.

    Returns:
        The ``node.node_id`` of every retrieved result, in retrieval order.
    """
    return [hit.node.node_id for hit in retriever_chunk.retrieve(input)]
# Run the Braintrust evaluation of the chunk retriever task over `data`,
# scored with the hit-rate and MRR scorers.
# NOTE(review): the name spells "recurisve"; all three evaluations in this
# file use the same spelling, so change them together if it is corrected.
chunkEval = await braintrust.Eval(
name="llamaindex-recurisve-retrievers",
data=data,
task=runChunkRetriever,
scores=[hitRateScorer, mrrScorer],
)
# Evaluate the metadata retriever
# Use the shared top_k setting defined earlier in this file instead of
# repeating the literal value.
vector_retriever_metadata = vector_index_metadata.as_retriever(
    similarity_top_k=top_k
)
retriever_metadata = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever_metadata},
    node_dict=all_nodes_dict,
    verbose=False,
)
def runMetaDataRetriever(input, hooks):
    """Evaluation task: retrieve with ``retriever_metadata`` and return node ids.

    Args:
        input: Query passed to ``retriever_metadata.retrieve``.
        hooks: Not used by this task.

    Returns:
        The ``node.node_id`` of every retrieved result, in retrieval order.
    """
    return [hit.node.node_id for hit in retriever_metadata.retrieve(input)]
# Run the Braintrust evaluation of the metadata retriever task over `data`,
# scored with the hit-rate and MRR scorers.
metadataEval = await braintrust.Eval(
name="llamaindex-recurisve-retrievers",
data=data,
task=runMetaDataRetriever,
scores=[hitRateScorer, mrrScorer],
)
# Evaluate the base retriever
# Use the shared top_k setting defined earlier in this file instead of
# repeating the literal value.
base_retriever = base_index.as_retriever(similarity_top_k=top_k)
def runBaseRetriever(input, hooks):
    """Evaluation task: retrieve with ``base_retriever`` and return node ids.

    Args:
        input: Query passed to ``base_retriever.retrieve``.
        hooks: Not used by this task.

    Returns:
        The ``node.node_id`` of every retrieved result, in retrieval order.
    """
    return [hit.node.node_id for hit in base_retriever.retrieve(input)]
# Run the Braintrust evaluation of the baseline retriever task over `data`,
# scored with the hit-rate and MRR scorers.
baseEval = await braintrust.Eval(
name="llamaindex-recurisve-retrievers",
data=data,
task=runBaseRetriever,
scores=[hitRateScorer, mrrScorer],
)