多模态 GPT4V Pydantic 程序¶
在本笔记中,我们将展示如何通过 LlamaIndex 使用新版 OpenAI GPT4V API 生成结构化数据。用户只需指定一个 Pydantic 对象即可实现。
我们还针对此任务对比了以下大型视觉模型:
- GPT4-V
- Fuyu-8B
- MiniGPT-4
- CogVLM
- Llava-14B
本地下载镜像¶
In [ ]:
Copied!
%pip install llama-index-multi-modal-llms-openai
%pip install llama-index-multi-modal-llms-replicate
%pip install llama-index-multi-modal-llms-openai
%pip install llama-index-multi-modal-llms-replicate
In [ ]:
Copied!
import os
OPENAI_API_KEY = "sk-<your-openai-api-token>"
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
import os
OPENAI_API_KEY = "sk-"
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
In [ ]:
Copied!
REPLICATE_API_TOKEN = "" # Your Relicate API token here
os.environ["REPLICATE_API_TOKEN"] = REPLICATE_API_TOKEN
REPLICATE_API_TOKEN = "" # Your Relicate API token here
os.environ["REPLICATE_API_TOKEN"] = REPLICATE_API_TOKEN
In [ ]:
Copied!
from pathlib import Path
input_image_path = Path("restaurant_images")
if not input_image_path.exists():
Path.mkdir(input_image_path)
from pathlib import Path
input_image_path = Path("restaurant_images")
if not input_image_path.exists():
Path.mkdir(input_image_path)
In [ ]:
Copied!
!wget "https://docs.google.com/uc?export=download&id=1GlqcNJhGGbwLKjJK1QJ_nyswCTQ2K2Fq" -O ./restaurant_images/fried_chicken.png
!wget "https://docs.google.com/uc?export=download&id=1GlqcNJhGGbwLKjJK1QJ_nyswCTQ2K2Fq" -O ./restaurant_images/fried_chicken.png
初始化餐厅的 Pydantic 类¶
In [ ]:
Copied!
from pydantic import BaseModel
class Restaurant(BaseModel):
"""Data model for an restaurant."""
restaurant: str
food: str
discount: str
price: str
rating: str
review: str
from pydantic import BaseModel
class Restaurant(BaseModel):
"""Data model for an restaurant."""
restaurant: str
food: str
discount: str
price: str
rating: str
review: str
加载 OpenAI GPT4V 多模态大语言模型¶
In [ ]:
Copied!
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from llama_index.core import SimpleDirectoryReader
# put your local directory here
image_documents = SimpleDirectoryReader("./restaurant_images").load_data()
openai_mm_llm = OpenAIMultiModal(
model="gpt-4o", api_key=OPENAI_API_KEY, max_new_tokens=1000
)
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from llama_index.core import SimpleDirectoryReader
# put your local directory here
image_documents = SimpleDirectoryReader("./restaurant_images").load_data()
openai_mm_llm = OpenAIMultiModal(
model="gpt-4o", api_key=OPENAI_API_KEY, max_new_tokens=1000
)
绘制图像¶
In [ ]:
Copied!
from PIL import Image
import matplotlib.pyplot as plt
imageUrl = "./restaurant_images/fried_chicken.png"
image = Image.open(imageUrl).convert("RGB")
plt.figure(figsize=(16, 5))
plt.imshow(image)
from PIL import Image
import matplotlib.pyplot as plt
imageUrl = "./restaurant_images/fried_chicken.png"
image = Image.open(imageUrl).convert("RGB")
plt.figure(figsize=(16, 5))
plt.imshow(image)
Out[ ]:
<matplotlib.image.AxesImage at 0x2a5cd06d0>
使用多模态Pydantic程序从GPT4V输出生成餐厅图像的结构化数据¶
In [ ]:
Copied!
from llama_index.core.program import MultiModalLLMCompletionProgram
from llama_index.core.output_parsers import PydanticOutputParser
prompt_template_str = """\
can you summarize what is in the image\
and return the answer with json format \
"""
openai_program = MultiModalLLMCompletionProgram.from_defaults(
output_parser=PydanticOutputParser(Restaurant),
image_documents=image_documents,
prompt_template_str=prompt_template_str,
multi_modal_llm=openai_mm_llm,
verbose=True,
)
from llama_index.core.program import MultiModalLLMCompletionProgram
from llama_index.core.output_parsers import PydanticOutputParser
prompt_template_str = """\
can you summarize what is in the image\
and return the answer with json format \
"""
openai_program = MultiModalLLMCompletionProgram.from_defaults(
output_parser=PydanticOutputParser(Restaurant),
image_documents=image_documents,
prompt_template_str=prompt_template_str,
multi_modal_llm=openai_mm_llm,
verbose=True,
)
In [ ]:
Copied!
response = openai_program()
for res in response:
print(res)
response = openai_program()
for res in response:
print(res)
('restaurant', 'Not Specified')
('food', '8 Wings or Chicken Poppers')
('discount', 'Black Friday Offer')
('price', '$8.73')
('rating', 'Not Specified')
('review', 'Not Specified')
测试 MiniGPT-4、Fuyu-8B、LLaVa-13B 和 CogVLM 模型的 Pydantic 功能¶
In [ ]:
Copied!
from llama_index.multi_modal_llms.replicate import ReplicateMultiModal
from llama_index.multi_modal_llms.replicate.base import (
REPLICATE_MULTI_MODAL_LLM_MODELS,
)
prompt_template_str = """\
can you summarize what is in the image\
and return the answer with json format \
"""
def pydantic_replicate(
model_name, output_class, image_documents, prompt_template_str
):
mm_llm = ReplicateMultiModal(
model=REPLICATE_MULTI_MODAL_LLM_MODELS[model_name],
temperature=0.1,
max_new_tokens=1000,
)
llm_program = MultiModalLLMCompletionProgram.from_defaults(
output_parser=PydanticOutputParser(output_class),
image_documents=image_documents,
prompt_template_str=prompt_template_str,
multi_modal_llm=mm_llm,
verbose=True,
)
response = llm_program()
print(f"Model: {model_name}")
for res in response:
print(res)
from llama_index.multi_modal_llms.replicate import ReplicateMultiModal
from llama_index.multi_modal_llms.replicate.base import (
REPLICATE_MULTI_MODAL_LLM_MODELS,
)
prompt_template_str = """\
can you summarize what is in the image\
and return the answer with json format \
"""
def pydantic_replicate(
model_name, output_class, image_documents, prompt_template_str
):
mm_llm = ReplicateMultiModal(
model=REPLICATE_MULTI_MODAL_LLM_MODELS[model_name],
temperature=0.1,
max_new_tokens=1000,
)
llm_program = MultiModalLLMCompletionProgram.from_defaults(
output_parser=PydanticOutputParser(output_class),
image_documents=image_documents,
prompt_template_str=prompt_template_str,
multi_modal_llm=mm_llm,
verbose=True,
)
response = llm_program()
print(f"Model: {model_name}")
for res in response:
print(res)
使用 Fuyu-8B 实现 Pydantic 结构化输出¶
In [ ]:
Copied!
pydantic_replicate("fuyu-8b", Restaurant, image_documents, prompt_template_str)
pydantic_replicate("fuyu-8b", Restaurant, image_documents, prompt_template_str)
使用 LLaVa-13B 实现 Pydantic 结构化输出¶
In [ ]:
Copied!
pydantic_replicate(
"llava-13b", Restaurant, image_documents, prompt_template_str
)
pydantic_replicate(
"llava-13b", Restaurant, image_documents, prompt_template_str
)
使用 MiniGPT-4 实现 Pydantic 结构化输出¶
In [ ]:
Copied!
pydantic_replicate(
"minigpt-4", Restaurant, image_documents, prompt_template_str
)
pydantic_replicate(
"minigpt-4", Restaurant, image_documents, prompt_template_str
)
使用 CogVLM 实现 Pydantic 结构化输出¶
In [ ]:
Copied!
pydantic_replicate("cogvlm", Restaurant, image_documents, prompt_template_str)
pydantic_replicate("cogvlm", Restaurant, image_documents, prompt_template_str)
观察结果:
- 只有 GPT4-V 能很好地处理这个基于图像的 pydantic 任务
- 其他视觉模型只能输出部分字段
In [ ]:
Copied!
input_image_path = Path("amazon_images")
if not input_image_path.exists():
Path.mkdir(input_image_path)
input_image_path = Path("amazon_images")
if not input_image_path.exists():
Path.mkdir(input_image_path)
In [ ]:
Copied!
!wget "https://docs.google.com/uc?export=download&id=1p1Y1qAoM68eC4sAvvHaiJyPhdUZS0Gqb" -O ./amazon_images/amazon.png
!wget "https://docs.google.com/uc?export=download&id=1p1Y1qAoM68eC4sAvvHaiJyPhdUZS0Gqb" -O ./amazon_images/amazon.png
初始化亚马逊商品 Pydantic 类¶
In [ ]:
Copied!
from pydantic import BaseModel
class Product(BaseModel):
"""Data model for a Amazon Product."""
title: str
category: str
discount: str
price: str
rating: str
review: str
description: str
inventory: str
from pydantic import BaseModel
class Product(BaseModel):
"""Data model for a Amazon Product."""
title: str
category: str
discount: str
price: str
rating: str
review: str
description: str
inventory: str
绘制图像¶
In [ ]:
Copied!
imageUrl = "./amazon_images/amazon.png"
image = Image.open(imageUrl).convert("RGB")
plt.figure(figsize=(16, 5))
plt.imshow(image)
imageUrl = "./amazon_images/amazon.png"
image = Image.open(imageUrl).convert("RGB")
plt.figure(figsize=(16, 5))
plt.imshow(image)
Out[ ]:
<matplotlib.image.AxesImage at 0x17b96e010>
使用多模态 Pydantic 程序从 GPT4V 输出生成亚马逊产品图片的结构化数据¶
In [ ]:
Copied!
amazon_image_documents = SimpleDirectoryReader("./amazon_images").load_data()
prompt_template_str = """\
can you summarize what is in the image\
and return the answer with json format \
"""
openai_program_amazon = MultiModalLLMCompletionProgram.from_defaults(
output_parser=PydanticOutputParser(Product),
image_documents=amazon_image_documents,
prompt_template_str=prompt_template_str,
multi_modal_llm=openai_mm_llm,
verbose=True,
)
amazon_image_documents = SimpleDirectoryReader("./amazon_images").load_data()
prompt_template_str = """\
can you summarize what is in the image\
and return the answer with json format \
"""
openai_program_amazon = MultiModalLLMCompletionProgram.from_defaults(
output_parser=PydanticOutputParser(Product),
image_documents=amazon_image_documents,
prompt_template_str=prompt_template_str,
multi_modal_llm=openai_mm_llm,
verbose=True,
)
In [ ]:
Copied!
response = openai_program_amazon()
for res in response:
print(res)
response = openai_program_amazon()
for res in response:
print(res)
('title', 'Instant Vortex 5.7QT Air Fryer Oven Combo')
('category', 'Kitchen Appliances')
('discount', '20% off')
('price', '$151.20')
('rating', '4.7 out of 5 stars')
('review', '5086 ratings')
('description', '6-in-1 functionality; air fry, broil, bake, roast, reheat, and dehydrate. EvenCrisp Technology for crispy results. Easy to use touchscreen. Dishwasher safe parts. Cooks food faster and with less oil.')
('inventory', 'In stock')
测试 MiniGPT-4、Fuyu-8B、LLaVa-13B 和 CogVLM 模型的 Pydantic 验证¶
使用 Fuyu-8B 实现 Pydantic 结构化输出¶
In [ ]:
Copied!
pydantic_replicate(
"fuyu-8b", Product, amazon_image_documents, prompt_template_str
)
pydantic_replicate(
"fuyu-8b", Product, amazon_image_documents, prompt_template_str
)
使用 MiniGPT-4 实现 Pydantic 结构化输出¶
In [ ]:
Copied!
pydantic_replicate(
"minigpt-4", Product, amazon_image_documents, prompt_template_str
)
pydantic_replicate(
"minigpt-4", Product, amazon_image_documents, prompt_template_str
)
使用 CogVLM-4 实现 Pydantic 结构化输出¶
In [ ]:
Copied!
pydantic_replicate(
"cogvlm", Product, amazon_image_documents, prompt_template_str
)
pydantic_replicate(
"cogvlm", Product, amazon_image_documents, prompt_template_str
)
Model: cogvlm
('title', 'Instant Vortex 5.7QT Air Fryer Oven Combo')
('category', 'Kitchen Appliances')
('discount', '20% off')
('price', '151.00')
('rating', '4.5 stars')
('review', "Amazon's Choice")
('description', 'Instant Vortex 5.7QT Air Fryer Oven Combo, From the Makers of Instant Pot, Customizable Smart Cooking Programs, Digital Touchscreen, Nonstick and Dishwasher Safe Basket, App with over 100 Recipes')
('inventory', 'In stock')
使用 LlaVa-13B 实现 Pydantic 结构化输出¶
In [ ]:
Copied!
pydantic_replicate(
"llava-13b", Product, amazon_image_documents, prompt_template_str
)
pydantic_replicate(
"llava-13b", Product, amazon_image_documents, prompt_template_str
)
Model: llava-13b
('title', 'Instant Vortex 6.5 Qt Air Fryer Oven Combo')
('category', 'Kitchen Appliances')
('discount', '20% off')
('price', '$149.99')
('rating', '4.5 out of 5 stars')
('review', '500+ reviews')
('description', 'The Instant Vortex 6.5 Qt Air Fryer Oven Combo is a versatile and customizable small kitchen appliance that can air fry, bake, roast, broil, and dehydrate. It features a digital touchscreen, non-stick safe basket, and dishwasher safe basket, making it easy to use and clean. With over 1200 recipes, cooking programs, and digital touchscreen, this appliance is perfect for anyone looking to simplify their cooking routine.')
('inventory', 'In Stock')
观察结果:
- 只有 GPT4v、Llava-13B 和 GogVLM 输出了预期字段
- 在这三个模型中,GPT4V 获得的结果最准确。Llava-13B 和 CogVLM 的价格识别有误
初始化 Instagram 广告 Pydantic 类并比较不同多模态大语言模型的性能¶
In [ ]:
Copied!
input_image_path = Path("instagram_images")
if not input_image_path.exists():
Path.mkdir(input_image_path)
input_image_path = Path("instagram_images")
if not input_image_path.exists():
Path.mkdir(input_image_path)
In [ ]:
Copied!
!wget "https://docs.google.com/uc?export=download&id=12ZpBBFkYu-jzz1iz356U5kMikn4uN9ww" -O ./instagram_images/jordan.png
!wget "https://docs.google.com/uc?export=download&id=12ZpBBFkYu-jzz1iz356U5kMikn4uN9ww" -O ./instagram_images/jordan.png
In [ ]:
Copied!
from pydantic import BaseModel
class InsAds(BaseModel):
"""Data model for a Ins Ads."""
account: str
brand: str
product: str
category: str
discount: str
price: str
comments: str
review: str
description: str
from pydantic import BaseModel
class InsAds(BaseModel):
"""Data model for a Ins Ads."""
account: str
brand: str
product: str
category: str
discount: str
price: str
comments: str
review: str
description: str
In [ ]:
Copied!
from PIL import Image
import matplotlib.pyplot as plt
imageUrl = "./instagram_images/jordan.png"
image = Image.open(imageUrl).convert("RGB")
plt.figure(figsize=(16, 5))
plt.imshow(image)
from PIL import Image
import matplotlib.pyplot as plt
imageUrl = "./instagram_images/jordan.png"
image = Image.open(imageUrl).convert("RGB")
plt.figure(figsize=(16, 5))
plt.imshow(image)
Out[ ]:
<matplotlib.image.AxesImage at 0x16a722890>
In [ ]:
Copied!
ins_image_documents = SimpleDirectoryReader("./instagram_images").load_data()
prompt_template_str = """\
can you summarize what is in the image\
and return the answer with json format \
"""
openai_program_ins = MultiModalLLMCompletionProgram.from_defaults(
output_parser=PydanticOutputParser(InsAds),
image_documents=ins_image_documents,
prompt_template_str=prompt_template_str,
multi_modal_llm=openai_mm_llm,
verbose=True,
)
response = openai_program_ins()
for res in response:
print(res)
ins_image_documents = SimpleDirectoryReader("./instagram_images").load_data()
prompt_template_str = """\
can you summarize what is in the image\
and return the answer with json format \
"""
openai_program_ins = MultiModalLLMCompletionProgram.from_defaults(
output_parser=PydanticOutputParser(InsAds),
image_documents=ins_image_documents,
prompt_template_str=prompt_template_str,
multi_modal_llm=openai_mm_llm,
verbose=True,
)
response = openai_program_ins()
for res in response:
print(res)
('account', 'jordansdaily')
('brand', 'Air Jordan')
('product', 'Air Jordan 2')
('category', 'Footwear')
('discount', 'None')
('price', '$175')
('comments', 'Liked by cemm2k and others')
('review', 'Not available')
('description', "Release date November 18th - Air Jordan 2 'Italy'")
In [ ]:
Copied!
pydantic_replicate("fuyu-8b", InsAds, ins_image_documents, prompt_template_str)
pydantic_replicate("fuyu-8b", InsAds, ins_image_documents, prompt_template_str)
In [ ]:
Copied!
pydantic_replicate(
"llava-13b", InsAds, ins_image_documents, prompt_template_str
)
pydantic_replicate(
"llava-13b", InsAds, ins_image_documents, prompt_template_str
)
In [ ]:
Copied!
pydantic_replicate("cogvlm", InsAds, ins_image_documents, prompt_template_str)
pydantic_replicate("cogvlm", InsAds, ins_image_documents, prompt_template_str)
Model: cogvlm
('account', 'jordansdaily')
('brand', 'AIR JORDAN')
('product', '2')
('category', 'ITALY')
('discount', '')
('price', '$175')
('comments', '')
('review', '')
('description', "AIR JORDAN 2 'ITALY' release NOV 18TH $175")
In [ ]:
Copied!
pydantic_replicate(
"minigpt-4", InsAds, ins_image_documents, prompt_template_str
)
pydantic_replicate(
"minigpt-4", InsAds, ins_image_documents, prompt_template_str
)
观察结果:
- 只有 GPT4v 和 GogVLM 输出了预期字段
- 在这两个模型中,GPT4V 获得了更准确的结果