Setup¶
We load some data and define a very simple RAG query engine for evaluation (using top-k retrieval).
%pip install llama-index-readers-file pymupdf
%pip install llama-index-llms-openai
!mkdir data
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"
mkdir: data: File exists
--2023-09-19 00:05:14--  https://arxiv.org/pdf/2307.09288.pdf
Resolving arxiv.org (arxiv.org)... 128.84.21.199
Connecting to arxiv.org (arxiv.org)|128.84.21.199|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13661300 (13M) [application/pdf]
Saving to: ‘data/llama2.pdf’

data/llama2.pdf     100%[===================>]  13.03M  1.56MB/s    in 9.3s

2023-09-19 00:05:25 (1.40 MB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader

# load the Llama 2 paper
loader = PyMuPDFReader()
documents = loader.load(file_path="./data/llama2.pdf")

from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI

# chunk the documents, build a vector index, and define a query engine over it
llm = OpenAI(model="gpt-4")
node_parser = SentenceSplitter(chunk_size=1024)
nodes = node_parser.get_nodes_from_documents(documents)
index = VectorStoreIndex(nodes)
query_engine = index.as_query_engine(llm=llm)
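To sanity-check the pipeline before generating the dataset, we can issue a quick test query (the question below is an arbitrary example, not from the original notebook):

# hypothetical smoke-test question; any query about the paper will do
response = query_engine.query("What is Llama 2?")
print(str(response))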
Dataset Generation¶
We first go through an exercise of generating a synthetic evaluation dataset. We do this by synthetically generating a set of questions from existing context, and then using a powerful LLM (e.g. GPT-4) with the source context to generate a "ground-truth" answer for each question.
Define Functions¶
We define the functions that we'll use for dataset generation:
from llama_index.core.schema import BaseNode
from llama_index.llms.openai import OpenAI
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import ChatPromptTemplate, PromptTemplate
from typing import Tuple, List
import re
llm = OpenAI(model="gpt-4")
We define generate_answers_for_questions to generate answers for questions given context.
QA_PROMPT = PromptTemplate(
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)
def generate_answers_for_questions(
    questions: List[str], context: str, llm: OpenAI
) -> List[str]:
    """Generate answers for questions given context."""
    answers = []
    for question in questions:
        fmt_qa_prompt = QA_PROMPT.format(
            context_str=context, query_str=question
        )
        response_obj = llm.complete(fmt_qa_prompt)
        answers.append(str(response_obj))
    return answers
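As a quick illustration of using this function on its own (a hypothetical example; the question is made up), we can generate an answer over a single node's content:

# hypothetical usage over the first node's context
sample_context = nodes[0].get_content(metadata_mode="all")
sample_answers = generate_answers_for_questions(
    ["What is Llama 2?"], sample_context, llm
)
print(sample_answers[0])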
We define generate_qa_pairs to generate QA pairs over an entire list of nodes.
QUESTION_GEN_USER_TMPL = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "generate the relevant questions. "
)
QUESTION_GEN_SYS_TMPL = """\
You are a Teacher/ Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided.\
"""
question_gen_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=MessageRole.SYSTEM, content=QUESTION_GEN_SYS_TMPL),
        ChatMessage(role=MessageRole.USER, content=QUESTION_GEN_USER_TMPL),
    ]
)
def generate_qa_pairs(
    nodes: List[BaseNode], llm: OpenAI, num_questions_per_chunk: int = 10
) -> List[Tuple[str, str]]:
    """Generate question/answer pairs over a list of nodes."""
    qa_pairs = []
    for idx, node in enumerate(nodes):
        print(f"Node {idx}/{len(nodes)}")
        context_str = node.get_content(metadata_mode="all")
        fmt_messages = question_gen_template.format_messages(
            num_questions_per_chunk=num_questions_per_chunk,
            context_str=context_str,
        )
        chat_response = llm.chat(fmt_messages)
        raw_output = chat_response.message.content
        result_list = str(raw_output).strip().split("\n")
        # strip leading numbering (e.g. "1.", "2)") from each generated question
        cleaned_questions = [
            re.sub(r"^\d+[\).\s]", "", question).strip()
            for question in result_list
        ]
        answers = generate_answers_for_questions(
            cleaned_questions, context_str, llm
        )
        cur_qa_pairs = list(zip(cleaned_questions, answers))
        qa_pairs.extend(cur_qa_pairs)
    return qa_pairs
qa_pairs
[('What is the main focus of the work described in the document?',
'The main focus of the work described in the document is the development and release of Llama 2, a collection of pretrained and fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters. The fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. The document also provides a detailed description of the approach to fine-tuning and safety improvements of Llama 2-Chat.'),
('What is the range of parameters for the large language models (LLMs) developed in this work?',
'The range of parameters for the large language models (LLMs) developed in this work is from 7 billion to 70 billion.'),
('What is the specific name given to the fine-tuned LLMs optimized for dialogue use cases?',
'The specific name given to the fine-tuned LLMs optimized for dialogue use cases is Llama 2-Chat.'),
('How do the models developed in this work compare to open-source chat models based on the benchmarks tested?',
'The models developed in this work, specifically the fine-tuned LLMs called Llama 2-Chat, outperform open-source chat models on most benchmarks tested.'),
('What are the two key areas of human evaluation mentioned in the document for the developed models?',
'The two key areas of human evaluation mentioned in the document for the developed models are helpfulness and safety.'),
('What is the purpose of providing a detailed description of the approach to fine-tuning and safety improvements of Llama 2-Chat?',
'The purpose of providing a detailed description of the approach to fine-tuning and safety improvements of Llama 2-Chat is to enable the community to build on their work and contribute to the responsible development of Large Language Models (LLMs).'),
('What is the intended benefit for the community from this work?',
'The intended benefit for the community from this work is to enable them to build on the work and contribute to the responsible development of large language models (LLMs). The team provides a detailed description of their approach to fine-tuning and safety improvements of Llama 2-Chat for this purpose.'),
('Who are the corresponding authors of this work and how can they be contacted?',
'The corresponding authors of this work are Thomas Scialom and Hugo Touvron. They can be contacted via email at tscialom@meta.com and htouvron@meta.com respectively.'),
('What is the source of the document and how many pages does it contain?',
'The source of the document is "1" and it contains 77 pages.'),
('Where can the contributions of all the authors be found in the document?',
'The contributions of all the authors can be found in Section A.1 of the document.')]
Get Pairs over Dataset¶
NOTE: this can take a long time. For speed, try inputting a subset of the nodes.
qa_pairs = generate_qa_pairs(
    # nodes[:1],
    nodes,
    llm,
    num_questions_per_chunk=10,
)
[Optional] Define save/load¶
# save
import pickle
pickle.dump(qa_pairs, open("eval_dataset.pkl", "wb"))
# load
import pickle
qa_pairs = pickle.load(open("eval_dataset.pkl", "rb"))
Evaluating Generation¶
In this section we walk through a few approaches for evaluating the generated results. At a high level, we use an "evaluation LLM" to measure the quality of the generated results, in settings both with and without ground-truth labels.
We cover the following evaluation algorithms:
- Correctness: compares the generated answer against the ground-truth answer.
- Faithfulness: evaluates whether a response is faithful to the retrieved contexts (label-free).
Build a Correctness Evaluator¶
The correctness evaluator compares the generated answer to the reference ground-truth answer, given the query. We output a score between 1 and 5, where 1 is the worst and 5 is the best.
We do this via a system and user prompt with a chat interface.
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import ChatPromptTemplate, PromptTemplate
from typing import Dict
CORRECTNESS_SYS_TMPL = """
You are an expert evaluation system for a question answering chatbot.
You are given the following information:
- a user query,
- a reference answer, and
- a generated answer.
Your job is to judge the relevance and correctness of the generated answer.
Output a single score that represents a holistic evaluation.
You must return your response in a line with only the score.
Do not return answers in any other format.
On a separate line provide your reasoning for the score as well.
Follow these guidelines for scoring:
- Your score has to be between 1 and 5, where 1 is the worst and 5 is the best.
- If the generated answer is not relevant to the user query, \
you should give a score of 1.
- If the generated answer is relevant but contains mistakes, \
you should give a score between 2 and 3.
- If the generated answer is relevant and fully correct, \
you should give a score between 4 and 5.
"""
CORRECTNESS_USER_TMPL = """
## User Query
{query}
## Reference Answer
{reference_answer}
## Generated Answer
{generated_answer}
"""
eval_chat_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=MessageRole.SYSTEM, content=CORRECTNESS_SYS_TMPL),
        ChatMessage(role=MessageRole.USER, content=CORRECTNESS_USER_TMPL),
    ]
)
Now that we've defined the prompts, let's define an evaluation function that feeds the prompts to the LLM and parses the output into a dict of results.
from llama_index.llms.openai import OpenAI
def run_correctness_eval(
    query_str: str,
    reference_answer: str,
    generated_answer: str,
    llm: OpenAI,
    threshold: float = 4.0,
) -> Dict:
    """Run correctness eval."""
    fmt_messages = eval_chat_template.format_messages(
        query=query_str,
        reference_answer=reference_answer,
        generated_answer=generated_answer,
    )
    chat_response = llm.chat(fmt_messages)
    raw_output = chat_response.message.content
    # the LLM returns the score on the first line, then its reasoning
    score_str, reasoning_str = raw_output.split("\n", 1)
    score = float(score_str)
    reasoning = reasoning_str.lstrip("\n")
    return {"passing": score >= threshold, "score": score, "reason": reasoning}
Now let's try running this on some sample inputs with a chat model (GPT-4).
llm = OpenAI(model="gpt-4")
# query_str = "What is the range of parameters for the large language models (LLMs) developed in this work?"
# reference_answer = "The range of parameters for the large language models (LLMs) developed in this work is from 7 billion to 70 billion."
query_str = (
    "What is the specific name given to the fine-tuned LLMs optimized for"
    " dialogue use cases?"
)
reference_answer = (
    "The specific name given to the fine-tuned LLMs optimized for dialogue use"
    " cases is Llama 2-Chat."
)
generated_answer = str(query_engine.query(query_str))
print(str(generated_answer))
The fine-tuned Large Language Models (LLMs) optimized for dialogue use cases are specifically called Llama 2-Chat.
eval_results = run_correctness_eval(
    query_str, reference_answer, generated_answer, llm=llm, threshold=4.0
)
display(eval_results)
{'passing': True,
'score': 5.0,
'reason': 'The generated answer is completely relevant to the user query and matches the reference answer in terms of information. It correctly identifies "Llama 2-Chat" as the specific name given to the fine-tuned LLMs optimized for dialogue use cases.'}
Build a Faithfulness Evaluator¶
The faithfulness evaluator judges whether the generated response is faithful to the retrieved contexts.
This is a step up in complexity from the correctness evaluator: the full set of contexts can be quite long and may overflow the context window, so we need a response synthesis strategy that iterates over the contexts in sequence.
We have a corresponding tutorial showing how to build response synthesis from scratch, and there are also out-of-the-box response synthesis modules. In this guide we use the out-of-the-box modules.
EVAL_TEMPLATE = PromptTemplate(
    "Please tell if a given piece of information "
    "is supported by the context.\n"
    "You need to answer with either YES or NO.\n"
    "Answer YES if any of the context supports the information, even "
    "if most of the context is unrelated. "
    "Some examples are provided below. \n\n"
    "Information: Apple pie is generally double-crusted.\n"
    "Context: An apple pie is a fruit pie in which the principal filling "
    "ingredient is apples. \n"
    "Apple pie is often served with whipped cream, ice cream "
    "('apple pie à la mode'), custard or cheddar cheese.\n"
    "It is generally double-crusted, with pastry both above "
    "and below the filling; the upper crust may be solid or "
    "latticed (woven of crosswise strips).\n"
    "Answer: YES\n"
    "Information: Apple pies taste bad.\n"
    "Context: An apple pie is a fruit pie in which the principal filling "
    "ingredient is apples. \n"
    "Apple pie is often served with whipped cream, ice cream "
    "('apple pie à la mode'), custard or cheddar cheese.\n"
    "It is generally double-crusted, with pastry both above "
    "and below the filling; the upper crust may be solid or "
    "latticed (woven of crosswise strips).\n"
    "Answer: NO\n"
    "Information: {query_str}\n"
    "Context: {context_str}\n"
    "Answer: "
)
EVAL_REFINE_TEMPLATE = PromptTemplate(
    "We want to understand if the following information is present "
    "in the context information: {query_str}\n"
    "We have provided an existing YES/NO answer: {existing_answer}\n"
    "We have the opportunity to refine the existing answer "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{context_msg}\n"
    "------------\n"
    "If the existing answer was already YES, still answer YES. "
    "If the information is present in the new context, answer YES. "
    "Otherwise answer NO.\n"
)
NOTE: in the current response synthesizer setup we don't separate out a system and user message for chat endpoints, so we just use our standard llm.complete text-completion endpoint.
Next we define the function below. Since we defined both a standard eval template for a given context and a refine template for subsequent contexts, we implement a "create-and-refine" response synthesis strategy to obtain the answer.
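Conceptually, the create-and-refine loop that the out-of-the-box Refine module runs looks roughly like the sketch below (simplified for intuition only; the actual module also handles prompt packing and chunking):

# simplified create-and-refine sketch, not the actual Refine implementation
def create_and_refine_sketch(
    statement: str, contexts: List[str], llm: OpenAI
) -> str:
    answer = None
    for context in contexts:
        if answer is None:
            # first context: answer from scratch with the base eval template
            prompt = EVAL_TEMPLATE.format(
                query_str=statement, context_str=context
            )
        else:
            # subsequent contexts: refine the existing YES/NO answer
            prompt = EVAL_REFINE_TEMPLATE.format(
                query_str=statement,
                existing_answer=answer,
                context_msg=context,
            )
        answer = str(llm.complete(prompt))
    return answer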
from llama_index.core.response_synthesizers import Refine
from typing import List, Dict
def run_faithfulness_eval(
    generated_answer: str,
    contexts: List[str],
    llm: OpenAI,
) -> Dict:
    """Run faithfulness eval."""
    refine = Refine(
        llm=llm,
        text_qa_template=EVAL_TEMPLATE,
        refine_template=EVAL_REFINE_TEMPLATE,
    )
    response_obj = refine.get_response(generated_answer, contexts)
    response_txt = str(response_obj)
    # the eval templates instruct the LLM to answer YES/NO
    passing = "yes" in response_txt.lower()
    return {"passing": passing, "reason": response_txt}
Let's try it out on some data.
# use the same query_str, and reference_answer as above
# query_str = "What is the specific name given to the fine-tuned LLMs optimized for dialogue use cases?"
# reference_answer = "The specific name given to the fine-tuned LLMs optimized for dialogue use cases is Llama 2-Chat."
response = query_engine.query(query_str)
generated_answer = str(response)
context_list = [n.get_content() for n in response.source_nodes]
eval_results = run_faithfulness_eval(
    generated_answer,
    contexts=context_list,
    llm=llm,
)
display(eval_results)
{'passing': True, 'reason': 'YES'}
Finally, let's run both evaluators over a random sample of our synthetic dataset.
import random

sample_size = 5
qa_pairs_sample = random.sample(qa_pairs, sample_size)
import pandas as pd
def run_evals(qa_pairs: List[Tuple[str, str]], llm: OpenAI, query_engine):
    """Run correctness and faithfulness evals over a list of QA pairs."""
    results_list = []
    for question, reference_answer in qa_pairs:
        response = query_engine.query(question)
        generated_answer = str(response)
        # gather the retrieved contexts for this specific response
        context_list = [n.get_content() for n in response.source_nodes]
        correctness_results = run_correctness_eval(
            question,
            reference_answer,
            generated_answer,
            llm=llm,
            threshold=4.0,
        )
        faithfulness_results = run_faithfulness_eval(
            generated_answer,
            contexts=context_list,
            llm=llm,
        )
        cur_result_dict = {
            "correctness": correctness_results["passing"],
            "faithfulness": faithfulness_results["passing"],
        }
        results_list.append(cur_result_dict)
    return pd.DataFrame(results_list)
evals_df = run_evals(qa_pairs_sample, llm, query_engine)
evals_df["correctness"].mean()
0.4
evals_df["faithfulness"].mean()
0.6
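As a small convenience (not in the original notebook), both pass rates can also be viewed at once:

# column-wise mean over the boolean results
evals_df.mean()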