WordLift Vector Store¶
Introduction¶
This notebook demonstrates how to crawl a product website, extract key information, build an SEO-friendly Knowledge Graph (a structured representation of PDPs and PLPs), and leverage it to improve search and user experience.
Key features and libraries:¶
- Web crawling (Advertools)
- Knowledge Graph construction for Product Detail Pages (PDPs) and Product Listing Pages (PLPs) - WordLift
- Product recommendations (WordLift Neural Search)
- Shopping assistant creation (WordLift + LlamaIndex 🦙)
This approach is designed to enhance an e-commerce site's SEO performance and user engagement.
Learn more about how it works here:
- https://www.youtube.com/watch?v=CH-ir1MTAwQ
- https://wordlift.io/academy-entries/mastering-serp-analysis-knowledge-graphs
Authors: Andrea Volpini and David Riccitelli
MIT License. Last updated: July 31, 2024.
Installation¶
!pip install advertools -q
!pip install -U wordlift-client # 🎉 first time on stage 🎉
!pip install rdflib -q
# Standard library imports
import json
import logging
import os
import re
import urllib.parse
from typing import List, Optional
# Third-party imports
import advertools as adv
import pandas as pd
import nest_asyncio
import requests
# RDFLib imports
from rdflib import Graph, Literal, RDF, URIRef
from rdflib.namespace import SDO, Namespace, DefinedNamespace
# WordLift client imports
import wordlift_client
from wordlift_client import Configuration, ApiClient
from wordlift_client.rest import ApiException
from wordlift_client.api.dataset_api import DatasetApi
from wordlift_client.api.entities_api import EntitiesApi
from wordlift_client.api.graph_ql_api import GraphQLApi
from wordlift_client.models.graphql_request import GraphqlRequest
from wordlift_client.models.page_vector_search_query_response_item import (
PageVectorSearchQueryResponseItem,
)
from wordlift_client.models.vector_search_query_request import (
VectorSearchQueryRequest,
)
from wordlift_client.api.vector_search_queries_api import (
VectorSearchQueriesApi,
)
# Asynchronous programming
import asyncio
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Apply nest_asyncio
nest_asyncio.apply()
WORDLIFT_KEY = os.getenv("WORDLIFT_KEY")
OPENAI_KEY = os.getenv("OPENAI_KEY")
Crawling the website with Advertools¶
# Step 1: Define the website structure
# -----------------------------------
# We're working with two types of pages:
# 1. Product Listing Pages (PLP): https://product-finder.wordlift.io/product-category/bags/
# 2. Product Detail Pages (PDP): https://product-finder.wordlift.io/product/1980s-marco-polo-crossbody-bag-in-black/
# The product description can be found at this XPath:
# /html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/div[2]/div[1]/p/text()
# The price is here:
# /html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/p/span/bdi/text()
# The category is here:
# //span[contains(@class, 'breadcrumb')]/a/text()
# Step 2: Set up the crawl
# ------------------------
def crawl_website(url, output_file, num_pages=10):
logger.info(f"Starting crawl of {url}")
adv.crawl(
url,
output_file,
follow_links=True,
custom_settings={
"CLOSESPIDER_PAGECOUNT": num_pages,
"USER_AGENT": "WordLiftBot/1.0 (Maven Project)",
"CONCURRENT_REQUESTS_PER_DOMAIN": 2,
"DOWNLOAD_DELAY": 1,
"ROBOTSTXT_OBEY": False,
},
xpath_selectors={
"product_description": "/html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/div[2]/div[1]/p/text()",
"product_price": "/html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/p/span/bdi/text()",
"product_category": "//span[@class='posted_in']/a/text()",
},
)
logger.info(f"Crawl completed. Results saved to {output_file}")
# Step 3: Analyze URL patterns
# ----------------------------
def analyze_url_patterns(df):
df["page_type"] = df["url"].apply(
lambda x: "PLP"
if "/product-category/" in x
else ("PDP" if "/product/" in x else "Other")
)
logger.info(
f"Found {(df['page_type'] == 'PLP').sum()} PLPs and {(df['page_type'] == 'PDP').sum()} PDPs"
)
return df
# Step 4: Extract page data
# ----------------------------
def extract_page_data(df):
extracted_data = []
for _, row in df.iterrows():
page = {
"url": row["url"],
"title": row["title"],
"page_type": row["page_type"],
"meta_description": row.get("meta_description", ""),
"og_title": row.get("og_title", ""),
"og_description": row.get("og_description", ""),
"h1": ", ".join(row.get("h1", []))
if isinstance(row.get("h1"), list)
else row.get("h1", ""),
"h2": ", ".join(row.get("h2", []))
if isinstance(row.get("h2"), list)
else row.get("h2", ""),
}
if row["page_type"] == "PDP":
page.update(
{
"product_description": ", ".join(
row.get("product_description", [])
)
if isinstance(row.get("product_description"), list)
else row.get("product_description", ""),
"product_price": ", ".join(row.get("product_price", []))
if isinstance(row.get("product_price"), list)
else row.get("product_price", ""),
"product_category": ", ".join(
row.get("product_category", [])
)
if isinstance(row.get("product_category"), list)
else row.get("product_category", ""),
}
)
elif row["page_type"] == "PLP":
# Parse the category from the H1 content
h1_content = (
row.get("h1", [""])[0]
if isinstance(row.get("h1"), list)
else row.get("h1", "")
)
category = (
h1_content.split("@@")[-1]
if "@@" in h1_content
else h1_content.replace("Category: ", "").strip()
)
page["category_name"] = category
extracted_data.append(page)
return pd.DataFrame(extracted_data)
Building the Knowledge Graph with WordLift 🕸¶
# Step 5: Configure the WordLift client
# ----------------------------
# Create a configuration object for the WordLift API client using your WordLift key.
configuration = Configuration(host="https://api.wordlift.io")
configuration.api_key["ApiKey"] = WORDLIFT_KEY
configuration.api_key_prefix["ApiKey"] = "Key"
EXAMPLE_PRIVATE_NS = Namespace("https://ns.example.org/private/")
BASE_URI = "http://data.wordlift.io/[dataset_id]/"
# Step 6: Build the KG and the embeddings
# ----------------------------
async def cleanup_knowledge_graph(api_client):
dataset_api = wordlift_client.DatasetApi(api_client)
try:
# Delete all
await dataset_api.delete_all_entities()
except Exception as e:
print(
"Exception when calling DatasetApi->delete_all_entities: %s\n" % e
)
async def create_entity(entities_api, entity_data):
g = Graph().parse(data=json.dumps(entity_data), format="json-ld")
body = g.serialize(format="application/rdf+xml")
await entities_api.create_or_update_entities(
body=body, _content_type="application/rdf+xml"
)
def replace_url(original_url: str) -> str:
old_domain = "https://product-finder.wordlift.io/"
new_domain = "https://data-science-with-python-for-seo.wordlift.dev/"
if original_url.startswith(old_domain):
return original_url.replace(old_domain, new_domain, 1)
else:
return original_url
def create_entity_uri(url):
parsed_url = urllib.parse.urlparse(url)
path = parsed_url.path.strip("/")
path_parts = path.split("/")
fragment = parsed_url.fragment
if "product" in path_parts:
# It's a product page or product offer
product_id = path_parts[-1] # Get the last part of the path
if fragment == "offer":
return f"{BASE_URI}offer_{product_id}"
else:
return f"{BASE_URI}product_{product_id}"
elif "product-category" in path_parts:
# It's a product listing page (PLP)
category = path_parts[-1] # Get the last part of the path
return f"{BASE_URI}plp_{category}"
else:
# For any other type of page
safe_path = "".join(c if c.isalnum() else "_" for c in path)
if fragment == "offer":
return f"{BASE_URI}offer_{safe_path}"
else:
return f"{BASE_URI}page_{safe_path}"
def clean_price(price_str):
if not price_str or price_str == "N/A":
return None
if isinstance(price_str, (int, float)):
return float(price_str)
try:
# Remove any non-numeric characters except for the decimal point
cleaned_price = "".join(
char for char in str(price_str) if char.isdigit() or char == "."
)
return float(cleaned_price)
except ValueError:
logger.warning(f"Could not convert price: {price_str}")
return None
def create_product_entity(row, dataset_uri):
url = replace_url(row["url"])
product_entity_uri = create_entity_uri(url)
entity_data = {
"@context": "http://schema.org",
"@type": "Product",
"@id": product_entity_uri,
"url": url,
"name": row["title"]
if not pd.isna(row["title"])
else "Untitled Product",
"urn:meta:requestEmbeddings": [
"http://schema.org/name",
"http://schema.org/description",
],
}
if not pd.isna(row.get("product_description")):
entity_data["description"] = row["product_description"]
if not pd.isna(row.get("product_price")):
price = clean_price(row["product_price"])
if price is not None:
# Create offer ID as a sub-resource of the product ID
offer_entity_uri = f"{product_entity_uri}/offer_1"
entity_data["offers"] = {
"@type": "Offer",
"@id": offer_entity_uri,
"price": str(price),
"priceCurrency": "GBP",
"availability": "http://schema.org/InStock",
"url": url,
}
if not pd.isna(row.get("product_category")):
entity_data["category"] = row["product_category"]
custom_attributes = {
key: row[key]
for key in [
"meta_description",
"og_title",
"og_description",
"h1",
"h2",
]
if not pd.isna(row.get(key))
}
if custom_attributes:
entity_data[str(EXAMPLE_PRIVATE_NS.attributes)] = json.dumps(
custom_attributes
)
return entity_data
def create_collection_entity(row, dataset_uri):
url = replace_url(row["url"])
entity_uri = create_entity_uri(url)
entity_data = {
"@context": "http://schema.org",
"@type": "CollectionPage",
"@id": entity_uri,
"url": url,
"name": row["category_name"] or row["title"],
}
custom_attributes = {
key: row[key]
for key in [
"meta_description",
"og_title",
"og_description",
"h1",
"h2",
]
if row.get(key)
}
if custom_attributes:
entity_data[str(EXAMPLE_PRIVATE_NS.attributes)] = json.dumps(
custom_attributes
)
return entity_data
async def build_knowledge_graph(df, dataset_uri, api_client):
entities_api = EntitiesApi(api_client)
for _, row in df.iterrows():
try:
if row["page_type"] == "PDP":
entity_data = create_product_entity(row, dataset_uri)
elif row["page_type"] == "PLP":
entity_data = create_collection_entity(row, dataset_uri)
else:
logger.warning(
f"Skipping unknown page type for URL: {row['url']}"
)
continue
if entity_data is None:
logger.warning(
f"Skipping page due to missing critical data: {row['url']}"
)
continue
await create_entity(entities_api, entity_data)
logger.info(
f"Created entity for {row['page_type']}: {row['title']}"
)
except Exception as e:
logger.error(
f"Error creating entity for {row['page_type']}: {row['title']}"
)
logger.error(f"Error: {str(e)}")
Running the full workflow¶
# ----------------------------
# Main Execution
# ----------------------------
# Global configuration variables
CRAWL_URL = "https://product-finder.wordlift.io/"
OUTPUT_FILE = "crawl_results.jl"
async def main():
# Step 1: Crawl the website
crawl_website(CRAWL_URL, OUTPUT_FILE)
# Step 2: Load the crawled data
df = pd.read_json(OUTPUT_FILE, lines=True)
# Step 3: Analyze URL patterns
df = analyze_url_patterns(df)
# Step 4: Extract page data
pages_df = extract_page_data(df)
async with ApiClient(configuration) as api_client:
# Clean up the existing knowledge graph
try:
await cleanup_knowledge_graph(api_client)
logger.info(f"Knowledge Graph Cleaned Up")
except Exception as e:
logger.error(
f"Failed to clean up the existing Knowledge Graph: {str(e)}"
)
return # Exit if cleanup fails
# Build the new knowledge graph
await build_knowledge_graph(pages_df, CRAWL_URL, api_client)
logger.info("Knowledge graph building completed.")
if __name__ == "__main__":
asyncio.run(main())
Now let's query the products in the Knowledge Graph with GraphQL¶
async def perform_graphql_query(api_client):
graphql_api = GraphQLApi(api_client)
query = """
{
products(rows: 20) {
id: iri
category: string(name:"schema:category")
name: string(name:"schema:name")
description: string(name:"schema:description")
url: string(name:"schema:url")
}
}
"""
request = GraphqlRequest(query=query)
try:
response = await graphql_api.graphql_using_post(body=request)
print("GraphQL Query Results:")
print(json.dumps(response, indent=2))
except Exception as e:
logger.error(f"An error occurred during GraphQL query: {e}")
async with ApiClient(configuration) as api_client:
# Step 6: Perform GraphQL query
await perform_graphql_query(api_client)
logger.info("Knowledge graph building and GraphQL query completed.")
Leveraging the Knowledge Graph¶
Now that we have successfully created a Knowledge Graph for the e-commerce website, complete with product embeddings, we can use it to improve the user experience and add new functionality. The embeddings generated for each product allow us to run semantic similarity searches and build more intelligent systems.
Adding structured data to your web pages¶
In this section we run a quick test of WordLift's Data API. This API injects structured data markup from the Knowledge Graph (KG) into your web pages. Structured data helps search engines understand your content better, potentially earning rich results in search and improving SEO.
This example uses a Knowledge Graph preconfigured for the demo e-commerce website and references a fictitious URL: https://data-science-with-python-for-seo.wordlift.dev.
When calling the WordLift Data API, we simply pass in a URL and receive the corresponding JSON-LD (JavaScript Object Notation for Linked Data). For an e-commerce site, this structured data typically includes product details, prices, and availability.
The get_json_ld_from_url() function below demonstrates the flow: it takes a URL as input and returns the structured data as JSON-LD, ready to be injected into your web page.
def get_json_ld_from_url(url):
# Construct the API URL by prefixing with 'https://api.wordlift.io/data/https/'
api_url = "https://api.wordlift.io/data/https/" + url.replace(
"https://", ""
)
# Make the GET request to the API
response = requests.get(api_url)
# Check if the request was successful
if response.status_code == 200:
# Parse the JSON-LD from the response
json_ld = response.json()
return json_ld
else:
print(f"Failed to retrieve data: {response.status_code}")
return None
def pretty_print_json(json_obj):
# Pretty print the JSON object
print(json.dumps(json_obj, indent=4))
# Let's run a test
url = "https://data-science-with-python-for-seo.wordlift.dev/product/100-pure-deluxe-travel-pack-duo-2/"
json_ld = get_json_ld_from_url(url)
json_ld
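As a small, optional sketch of how this markup might be used (the to_json_ld_script_tag helper below is illustrative, not part of the WordLift API), the JSON-LD can be wrapped in a standard script tag of type application/ld+json and placed in the page's head:
# Hypothetical helper: wrap the JSON-LD returned by get_json_ld_from_url()
# in a <script type="application/ld+json"> tag for injection into a page.
def to_json_ld_script_tag(json_ld_obj):
    if json_ld_obj is None:
        return ""
    return (
        '<script type="application/ld+json">\n'
        + json.dumps(json_ld_obj, indent=2)
        + "\n</script>"
    )
print(to_json_ld_script_tag(json_ld))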
Generating links to similar products with WordLift Neural Search¶
With the product embeddings in place, we can now use WordLift Neural Search to recommend similar products to users. Showing semantically related items increases engagement and can help drive sales.
Unlike traditional keyword matching, semantic similarity considers the context and meaning of product descriptions. This yields more nuanced and accurate recommendations, even when products share no exact keywords.
The get_top_k_similar_urls function defined below implements this: given a product URL, it returns a list of semantically similar products ranked by similarity score.
For example, if a user is viewing a red cotton T-shirt, the function might suggest other cotton T-shirts in different colors, or similarly styled tops in different fabrics. This creates a more intuitive and engaging shopping experience.
By adding this neural search capability, we deliver a more personalized and efficient shopping experience, which can improve user satisfaction and conversion rates.
async def get_top_k_similar_urls(configuration, query_url: str, top_k: int):
request = VectorSearchQueryRequest(
query_url=query_url,
similarity_top_k=top_k,
)
async with wordlift_client.ApiClient(configuration) as api_client:
api_instance = VectorSearchQueriesApi(api_client)
try:
page = await api_instance.create_query(
vector_search_query_request=request
)
return [
{
"url": item.id,
"name": item.text.split("\n")[0],
"score": item.score,
}
for item in page.items
if item.id and item.text
]
except Exception as e:
logger.error(f"Error querying for entities: {e}", exc_info=True)
return None
top_k = 10
url = "https://data-science-with-python-for-seo.wordlift.dev/product/100-mineral-sunscreen-spf-30/"
similar_urls = await get_top_k_similar_urls(
configuration, query_url=url, top_k=top_k
)
print(json.dumps(similar_urls, indent=2))
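For illustration only, here is a hypothetical rendering step (render_similar_products is not part of the WordLift API) that turns the results into a simple related-products HTML block for a product page:
# Hypothetical rendering step: turn the neural-search results into a
# "You may also like" HTML snippet for a product detail page.
def render_similar_products(items):
    if not items:
        return ""
    links = []
    for item in items:
        links.append(
            '  <li><a href="{url}">{name}</a></li>'.format(
                url=item["url"], name=item["name"]
            )
        )
    return '<ul class="similar-products">\n' + "\n".join(links) + "\n</ul>"
print(render_similar_products(similar_urls))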
Building a chatbot for the e-commerce website with LlamaIndex 🦙¶
The Knowledge Graph we have built is the perfect foundation for an intelligent chatbot. LlamaIndex (formerly GPT Index) is a powerful data framework for structuring private or domain-specific data and making it accessible to Large Language Models (LLMs). With LlamaIndex we can create a context-aware chatbot that understands our product catalog and assists customers efficiently.
By combining LlamaIndex with the Knowledge Graph, we can build a chatbot that responds to direct queries. With its understanding of the product catalog, it can:
- Answer questions about product specifications, availability, and pricing
- Provide personalized product recommendations based on customer preferences
- Compare similar products
This approach enables more natural and valuable customer interactions, improving the shopping experience. The chatbot draws on the structured data in the Knowledge Graph, using LlamaIndex to retrieve and present the relevant information efficiently through an LLM.
The next sections show how to integrate LlamaIndex with the Knowledge Graph data and create a chatbot that can intelligently serve e-commerce customers.
Installing LlamaIndex and WordLiftVectorStore 💪¶
%%capture
!pip install llama-index
!pip install -U 'git+https://github.com/wordlift/llama_index.git#egg=llama-index-vector-stores-wordlift&subdirectory=llama-index-integrations/vector_stores/llama-index-vector-stores-wordlift'
!pip install llama-index-embeddings-nomic
# import the necessary modules
from llama_index.vector_stores.wordlift import WordliftVectorStore
from llama_index.core import VectorStoreIndex
Configuring NomicEmbeddings for the query engine¶
Nomic has released v1.5 of its embedding model 🪆🪆🪆, which brings significant improvements to text embeddings. Embeddings convert text into numerical representations that capture semantic meaning, allowing our system to understand and compare the content of queries and documents.
Key features of Nomic v1.5 include:
- Variable-sized embeddings from 64 to 768 dimensions
- Matryoshka learning for nested representations
- A context length extended to 8192 tokens
Because of these capabilities, WordLift already uses NomicEmbeddings, and we now configure LlamaIndex to use the same model when encoding user queries. Using one embedding model across the whole stack ensures better alignment between the Knowledge Graph and query understanding.
You can read more about NomicEmbeddings here.
Get a free API key here.
from llama_index.embeddings.nomic import NomicEmbedding
nomic_api_key = os.getenv("NOMIC_KEY")
embed_model = NomicEmbedding(
api_key=nomic_api_key,
dimensionality=128,
model_name="nomic-embed-text-v1.5",
)
embedding = embed_model.get_text_embedding("Hey Ho SEO!")
len(embedding)
We'll use OpenAI as the default LLM for generating responses, but any other available LLM can be used instead.
# Set the environment variable
os.environ["OPENAI_API_KEY"] = OPENAI_KEY
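As an optional, minimal sketch (the model name below is only an example), the LLM can also be set explicitly through LlamaIndex's Settings; any other supported LLM can be plugged in the same way:
# Optional: explicitly set the LLM used by the query engine.
# The model name is just an example; swap in any LlamaIndex-supported LLM.
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0)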
Now let's set up WordliftVectorStore using the data in our Knowledge Graph.
# Let's configure WordliftVectorStore using our WordLift key
vector_store = WordliftVectorStore(key=WORDLIFT_KEY)
# Create an index from the vector store
index = VectorStoreIndex.from_vector_store(
vector_store, embed_model=embed_model
)
# Create a query engine
query_engine = index.as_query_engine()
query1 = "Can you give me a product similar to the facial puff? Please add the URL also"
result1 = query_engine.query(query1)
print(result1)
# Helper to run a query against the vector store index
def run_query(query):
    # Create an index from the vector store
    index = VectorStoreIndex.from_vector_store(
        vector_store, embed_model=embed_model
    )
    # Create a query engine
    query_engine = index.as_query_engine()
    response = query_engine.query(query)
    return response
# Interactive query loop
while True:
user_query = input("Enter your query (or 'quit' to exit): ")
if user_query.lower() == "quit":
break
result = run_query(user_query)
print(result)
print("\n---\n")