时效性过滤¶
展示基于时效性加权的节点后处理器功能
In [ ]:
Copied!
import os
# Placeholder value: replace "sk-..." with a real OpenAI API key.
# NOTE(review): this assignment overwrites any OPENAI_API_KEY already set in the environment.
os.environ["OPENAI_API_KEY"] = "sk-..."
import os
# Placeholder value: replace "sk-..." with a real OpenAI API key.
# NOTE(review): this assignment overwrites any OPENAI_API_KEY already set in the environment.
os.environ["OPENAI_API_KEY"] = "sk-..."
In [ ]:
Copied!
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.postprocessor import (
FixedRecencyPostprocessor,
EmbeddingRecencyPostprocessor,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.response.notebook_utils import display_response
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.postprocessor import (
FixedRecencyPostprocessor,
EmbeddingRecencyPostprocessor,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.response.notebook_utils import display_response
将文档解析为节点并存入文档库¶
在本示例中,我们准备了 Paul Graham(PG)同一篇文章的 3 个不同版本。这些版本内容基本相同,唯一的区别在于其中一个段落——该段落说明了作者为 Viaweb 筹集到的资金金额。
V1 版本:5 万美元,V2 版本:3 万美元,V3 版本:1 万美元
V1 版本日期:2020-01-01,V2 版本日期:2020-02-03,V3 版本日期:2022-04-12
这样设计的目的是促使索引优先获取最新的信息(即 V3 版本)。
In [ ]:
Copied!
# load documents
from llama_index.core import StorageContext
def get_file_metadata(file_name: str):
    """Return the date metadata for one versioned essay file.

    The version tag ("v1", "v2" or "v3") is looked up in the file's base
    name only, so a tag that happens to appear in a parent directory cannot
    select the wrong date.

    Args:
        file_name: Path of the file being loaded.

    Returns:
        A dict with a single "date" key holding the version's date string.

    Raises:
        ValueError: If the base name contains none of the version tags.
    """
    import os  # local import keeps the function usable on its own

    base_name = os.path.basename(file_name)
    if "v1" in base_name:
        return {"date": "2020-01-01"}
    elif "v2" in base_name:
        return {"date": "2020-02-03"}
    elif "v3" in base_name:
        return {"date": "2022-04-12"}
    else:
        raise ValueError("invalid file")
# Read the three essay versions; get_file_metadata is passed as the
# per-file metadata callback.
reader = SimpleDirectoryReader(
    file_metadata=get_file_metadata,
    input_files=[
        "test_versioned_data/paul_graham_essay_v1.txt",
        "test_versioned_data/paul_graham_essay_v2.txt",
        "test_versioned_data/paul_graham_essay_v3.txt",
    ],
)
documents = reader.load_data()
# define settings
from llama_index.core import Settings
# Register a SentenceSplitter (chunk_size=512) as the text splitter on Settings.
Settings.text_splitter = SentenceSplitter(chunk_size=512)
# Split the loaded documents into nodes with the splitter registered above.
nodes = Settings.text_splitter.get_nodes_from_documents(documents)
# Put the nodes into a SimpleDocumentStore ...
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)
# ... and wrap that docstore in a StorageContext, which is later passed to the index.
storage_context = StorageContext.from_defaults(docstore=docstore)
# load documents
from llama_index.core import StorageContext
def get_file_metadata(file_name: str):
    """Return the date metadata for one versioned essay file.

    The version tag ("v1", "v2" or "v3") is looked up in the file's base
    name only, so a tag that happens to appear in a parent directory cannot
    select the wrong date.

    Args:
        file_name: Path of the file being loaded.

    Returns:
        A dict with a single "date" key holding the version's date string.

    Raises:
        ValueError: If the base name contains none of the version tags.
    """
    import os  # local import keeps the function usable on its own

    base_name = os.path.basename(file_name)
    if "v1" in base_name:
        return {"date": "2020-01-01"}
    elif "v2" in base_name:
        return {"date": "2020-02-03"}
    elif "v3" in base_name:
        return {"date": "2022-04-12"}
    else:
        raise ValueError("invalid file")
# Read the three essay versions; get_file_metadata is passed as the
# per-file metadata callback.
reader = SimpleDirectoryReader(
    file_metadata=get_file_metadata,
    input_files=[
        "test_versioned_data/paul_graham_essay_v1.txt",
        "test_versioned_data/paul_graham_essay_v2.txt",
        "test_versioned_data/paul_graham_essay_v3.txt",
    ],
)
documents = reader.load_data()
# define settings
from llama_index.core import Settings
# Register a SentenceSplitter (chunk_size=512) as the text splitter on Settings.
Settings.text_splitter = SentenceSplitter(chunk_size=512)
# Split the loaded documents into nodes with the splitter registered above.
nodes = Settings.text_splitter.get_nodes_from_documents(documents)
# Put the nodes into a SimpleDocumentStore ...
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)
# ... and wrap that docstore in a StorageContext, which is later passed to the index.
storage_context = StorageContext.from_defaults(docstore=docstore)
In [ ]:
Copied!
# Show the text of the third loaded document.
# NOTE(review): presumably the v3 essay, given the input_files order — confirm.
third_document = documents[2]
print(third_document.get_text())
# Show the text of the third loaded document.
# NOTE(review): presumably the v3 essay, given the input_files order — confirm.
third_document = documents[2]
print(third_document.get_text())
构建索引¶
In [ ]:
Copied!
# Build the vector index over the parsed nodes, using the storage context
# that wraps the docstore.
index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
)
# Build the vector index over the parsed nodes, using the storage context
# that wraps the docstore.
index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
)
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 84471 tokens
定义时效性后处理器¶
In [ ]:
Copied!
# Fixed recency post-processor, constructed with the library defaults.
# NOTE(review): presumably it reads the "date" metadata key that
# get_file_metadata sets — confirm against the library defaults.
node_postprocessor = FixedRecencyPostprocessor()
# Fixed recency post-processor, constructed with the library defaults.
# NOTE(review): presumably it reads the "date" metadata key that
# get_file_metadata sets — confirm against the library defaults.
node_postprocessor = FixedRecencyPostprocessor()
In [ ]:
Copied!
# Embedding-based recency post-processor, constructed with the library defaults.
# NOTE(review): presumably it also relies on the "date" metadata key — confirm.
node_postprocessor_emb = EmbeddingRecencyPostprocessor()
# Embedding-based recency post-processor, constructed with the library defaults.
# NOTE(review): presumably it also relies on the "date" metadata key — confirm.
node_postprocessor_emb = EmbeddingRecencyPostprocessor()
查询索引¶
In [ ]:
Copied!
# Baseline: top-3 similarity retrieval with no recency post-processing.
query_engine = index.as_query_engine(similarity_top_k=3)
response = query_engine.query(
    "How much did the author raise in seed funding from Idelle's husband"
    " (Julian) for Viaweb?"
)
# Baseline: top-3 similarity retrieval with no recency post-processing.
query_engine = index.as_query_engine(similarity_top_k=3)
response = query_engine.query(
    "How much did the author raise in seed funding from Idelle's husband"
    " (Julian) for Viaweb?"
)
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 1813 tokens INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 22 tokens
In [ ]:
Copied!
# Same question, with the fixed recency post-processor applied to the
# retrieved nodes.
query_engine = index.as_query_engine(
    node_postprocessors=[node_postprocessor],
    similarity_top_k=3,
)
response = query_engine.query(
    "How much did the author raise in seed funding from Idelle's husband"
    " (Julian) for Viaweb?"
)
# Same question, with the fixed recency post-processor applied to the
# retrieved nodes.
query_engine = index.as_query_engine(
    node_postprocessors=[node_postprocessor],
    similarity_top_k=3,
)
response = query_engine.query(
    "How much did the author raise in seed funding from Idelle's husband"
    " (Julian) for Viaweb?"
)
In [ ]:
Copied!
# Same question, with the embedding-based recency post-processor applied to
# the retrieved nodes.
query_engine = index.as_query_engine(
    node_postprocessors=[node_postprocessor_emb],
    similarity_top_k=3,
)
response = query_engine.query(
    "How much did the author raise in seed funding from Idelle's husband"
    " (Julian) for Viaweb?"
)
# Same question, with the embedding-based recency post-processor applied to
# the retrieved nodes.
query_engine = index.as_query_engine(
    node_postprocessors=[node_postprocessor_emb],
    similarity_top_k=3,
)
response = query_engine.query(
    "How much did the author raise in seed funding from Idelle's husband"
    " (Julian) for Viaweb?"
)
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 541 tokens INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 22 tokens
查询索引(底层用法)¶
在本示例中,我们首先通过查询调用获取完整的节点集合,随后将其传递至节点后处理器,最终通过摘要索引合成响应。
In [ ]:
Copied!
from llama_index.core import SummaryIndex
from llama_index.core import SummaryIndex
In [ ]:
Copied!
# Question shared by the retrieval step and the synthesis step that follow.
query_str = "".join(
    (
        "How much did the author raise in seed funding from Idelle's husband",
        " (Julian) for Viaweb?",
    )
)
# Question shared by the retrieval step and the synthesis step that follow.
query_str = "".join(
    (
        "How much did the author raise in seed funding from Idelle's husband",
        " (Julian) for Viaweb?",
    )
)
In [ ]:
Copied!
# Retrieval step: query with response_mode="no_text"; only the retrieved
# source nodes of the response are used afterwards.
query_engine = index.as_query_engine(response_mode="no_text", similarity_top_k=3)
init_response = query_engine.query(query_str)
# Unwrap each source-node entry to its underlying node.
resp_nodes = [source.node for source in init_response.source_nodes]
# Retrieval step: query with response_mode="no_text"; only the retrieved
# source nodes of the response are used afterwards.
query_engine = index.as_query_engine(response_mode="no_text", similarity_top_k=3)
init_response = query_engine.query(query_str)
# Unwrap each source-node entry to its underlying node.
resp_nodes = [source.node for source in init_response.source_nodes]
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 22 tokens
In [ ]:
Copied!
# Synthesis step: build a summary index over just the retrieved nodes and
# answer the question with the fixed recency post-processor applied.
summary_index = SummaryIndex(resp_nodes)
query_engine = summary_index.as_query_engine(node_postprocessors=[node_postprocessor])
response = query_engine.query(query_str)
# Synthesis step: build a summary index over just the retrieved nodes and
# answer the question with the fixed recency post-processor applied.
summary_index = SummaryIndex(resp_nodes)
query_engine = summary_index.as_query_engine(node_postprocessors=[node_postprocessor])
response = query_engine.query(query_str)
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 541 tokens INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens