WordLift Vector Store¶
Introduction¶
This notebook demonstrates how to crawl a product website, extract key information, build an SEO-friendly Knowledge Graph (a structured representation of PDPs and PLPs), and leverage it to improve search and user experience.
Key features and libraries:¶
- Web crawling (Advertools)
- Knowledge Graph construction for Product Detail Pages (PDPs) and Product Listing Pages (PLPs) - WordLift
- Product recommendations (WordLift Neural Search)
- Shopping assistant creation (WordLift + LlamaIndex 🦙)
This approach is designed to enhance an e-commerce site's SEO performance and user engagement.
Learn more about how it works here:
- https://www.youtube.com/watch?v=CH-ir1MTAwQ
- https://wordlift.io/academy-entries/mastering-serp-analysis-knowledge-graphs
Authors: Andrea Volpini and David Riccitelli
MIT License. Last updated: July 31, 2024.
Installation¶
!pip install advertools -q
!pip install -U wordlift-client # 🎉 first time on stage 🎉
!pip install rdflib -q
# Standard library imports
import json
import logging
import os
import re
import urllib.parse
from typing import List, Optional
# Third-party imports
import advertools as adv
import pandas as pd
import nest_asyncio
import requests
# RDFLib imports
from rdflib import Graph, Literal, RDF, URIRef
from rdflib.namespace import SDO, Namespace, DefinedNamespace
# WordLift client imports
import wordlift_client
from wordlift_client import Configuration, ApiClient
from wordlift_client.rest import ApiException
from wordlift_client.api.dataset_api import DatasetApi
from wordlift_client.api.entities_api import EntitiesApi
from wordlift_client.api.graph_ql_api import GraphQLApi
from wordlift_client.models.graphql_request import GraphqlRequest
from wordlift_client.models.page_vector_search_query_response_item import (
PageVectorSearchQueryResponseItem,
)
from wordlift_client.models.vector_search_query_request import (
VectorSearchQueryRequest,
)
from wordlift_client.api.vector_search_queries_api import (
VectorSearchQueriesApi,
)
# Asynchronous programming
import asyncio
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Apply nest_asyncio
nest_asyncio.apply()
WORDLIFT_KEY = os.getenv("WORDLIFT_KEY")
OPENAI_KEY = os.getenv("OPENAI_KEY")
Crawling the website with Advertools¶
# Step 1: Define the website structure
# -----------------------------------
# We're working with two types of pages:
# 1. Product Listing Pages (PLP): https://product-finder.wordlift.io/product-category/bags/
# 2. Product Detail Pages (PDP): https://product-finder.wordlift.io/product/1980s-marco-polo-crossbody-bag-in-black/
# The product description can be found at this XPath:
# /html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/div[2]/div[1]/p/text()
# The price is here:
# /html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/p/span/bdi/text()
# The category is here:
# //span[contains(@class, 'breadcrumb')]/a/text()
# Step 2: Set up the crawl
# ------------------------
def crawl_website(url, output_file, num_pages=10):
logger.info(f"Starting crawl of {url}")
adv.crawl(
url,
output_file,
follow_links=True,
custom_settings={
"CLOSESPIDER_PAGECOUNT": num_pages,
"USER_AGENT": "WordLiftBot/1.0 (Maven Project)",
"CONCURRENT_REQUESTS_PER_DOMAIN": 2,
"DOWNLOAD_DELAY": 1,
"ROBOTSTXT_OBEY": False,
},
xpath_selectors={
"product_description": "/html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/div[2]/div[1]/p/text()",
"product_price": "/html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/p/span/bdi/text()",
"product_category": "//span[@class='posted_in']/a/text()",
},
)
logger.info(f"Crawl completed. Results saved to {output_file}")
# Step 3: Analyze URL patterns
# ----------------------------
def analyze_url_patterns(df):
df["page_type"] = df["url"].apply(
lambda x: "PLP"
if "/product-category/" in x
else ("PDP" if "/product/" in x else "Other")
)
logger.info(
f"Found {(df['page_type'] == 'PLP').sum()} PLPs and {(df['page_type'] == 'PDP').sum()} PDPs"
)
return df
# Step 4: Extract page data
# ----------------------------
def extract_page_data(df):
extracted_data = []
for _, row in df.iterrows():
page = {
"url": row["url"],
"title": row["title"],
"page_type": row["page_type"],
"meta_description": row.get("meta_description", ""),
"og_title": row.get("og_title", ""),
"og_description": row.get("og_description", ""),
"h1": ", ".join(row.get("h1", []))
if isinstance(row.get("h1"), list)
else row.get("h1", ""),
"h2": ", ".join(row.get("h2", []))
if isinstance(row.get("h2"), list)
else row.get("h2", ""),
}
if row["page_type"] == "PDP":
page.update(
{
"product_description": ", ".join(
row.get("product_description", [])
)
if isinstance(row.get("product_description"), list)
else row.get("product_description", ""),
"product_price": ", ".join(row.get("product_price", []))
if isinstance(row.get("product_price"), list)
else row.get("product_price", ""),
"product_category": ", ".join(
row.get("product_category", [])
)
if isinstance(row.get("product_category"), list)
else row.get("product_category", ""),
}
)
elif row["page_type"] == "PLP":
# Parse the category from the H1 content
h1_content = (
row.get("h1", [""])[0]
if isinstance(row.get("h1"), list)
else row.get("h1", "")
)
category = (
h1_content.split("@@")[-1]
if "@@" in h1_content
else h1_content.replace("Category: ", "").strip()
)
page["category_name"] = category
extracted_data.append(page)
return pd.DataFrame(extracted_data)
Building the Knowledge Graph with WordLift 🕸¶
# Step 5: Configure the WordLift client
# ----------------------------
# Create a configuration object for the WordLift API client using your WordLift key.
configuration = Configuration(host="https://api.wordlift.io")
configuration.api_key["ApiKey"] = WORDLIFT_KEY
configuration.api_key_prefix["ApiKey"] = "Key"
EXAMPLE_PRIVATE_NS = Namespace("https://ns.example.org/private/")
BASE_URI = "http://data.wordlift.io/[dataset_id]/"
# Step 6: Build the KG and the embeddings
# ----------------------------
async def cleanup_knowledge_graph(api_client):
dataset_api = wordlift_client.DatasetApi(api_client)
try:
# Delete all
await dataset_api.delete_all_entities()
except Exception as e:
print(
"Exception when calling DatasetApi->delete_all_entities: %s\n" % e
)
async def create_entity(entities_api, entity_data):
g = Graph().parse(data=json.dumps(entity_data), format="json-ld")
body = g.serialize(format="application/rdf+xml")
await entities_api.create_or_update_entities(
body=body, _content_type="application/rdf+xml"
)
def replace_url(original_url: str) -> str:
old_domain = "https://product-finder.wordlift.io/"
new_domain = "https://data-science-with-python-for-seo.wordlift.dev/"
if original_url.startswith(old_domain):
return original_url.replace(old_domain, new_domain, 1)
else:
return original_url
def create_entity_uri(url):
parsed_url = urllib.parse.urlparse(url)
path = parsed_url.path.strip("/")
path_parts = path.split("/")
fragment = parsed_url.fragment
if "product" in path_parts:
# It's a product page or product offer
product_id = path_parts[-1] # Get the last part of the path
if fragment == "offer":
return f"{BASE_URI}offer_{product_id}"
else:
return f"{BASE_URI}product_{product_id}"
elif "product-category" in path_parts:
# It's a product listing page (PLP)
category = path_parts[-1] # Get the last part of the path
return f"{BASE_URI}plp_{category}"
else:
# For any other type of page
safe_path = "".join(c if c.isalnum() else "_" for c in path)
if fragment == "offer":
return f"{BASE_URI}offer_{safe_path}"
else:
return f"{BASE_URI}page_{safe_path}"
def clean_price(price_str):
if not price_str or price_str == "N/A":
return None
if isinstance(price_str, (int, float)):
return float(price_str)
try:
# Remove any non-numeric characters except for the decimal point
cleaned_price = "".join(
char for char in str(price_str) if char.isdigit() or char == "."
)
return float(cleaned_price)
except ValueError:
logger.warning(f"Could not convert price: {price_str}")
return None
def create_product_entity(row, dataset_uri):
url = replace_url(row["url"])
product_entity_uri = create_entity_uri(url)
entity_data = {
"@context": "http://schema.org",
"@type": "Product",
"@id": product_entity_uri,
"url": url,
"name": row["title"]
if not pd.isna(row["title"])
else "Untitled Product",
"urn:meta:requestEmbeddings": [
"http://schema.org/name",
"http://schema.org/description",
],
}
if not pd.isna(row.get("product_description")):
entity_data["description"] = row["product_description"]
if not pd.isna(row.get("product_price")):
price = clean_price(row["product_price"])
if price is not None:
# Create offer ID as a sub-resource of the product ID
offer_entity_uri = f"{product_entity_uri}/offer_1"
entity_data["offers"] = {
"@type": "Offer",
"@id": offer_entity_uri,
"price": str(price),
"priceCurrency": "GBP",
"availability": "http://schema.org/InStock",
"url": url,
}
if not pd.isna(row.get("product_category")):
entity_data["category"] = row["product_category"]
custom_attributes = {
key: row[key]
for key in [
"meta_description",
"og_title",
"og_description",
"h1",
"h2",
]
if not pd.isna(row.get(key))
}
if custom_attributes:
entity_data[str(EXAMPLE_PRIVATE_NS.attributes)] = json.dumps(
custom_attributes
)
return entity_data
def create_collection_entity(row, dataset_uri):
url = replace_url(row["url"])
entity_uri = create_entity_uri(url)
entity_data = {
"@context": "http://schema.org",
"@type": "CollectionPage",
"@id": entity_uri,
"url": url,
"name": row["category_name"] or row["title"],
}
custom_attributes = {
key: row[key]
for key in [
"meta_description",
"og_title",
"og_description",
"h1",
"h2",
]
if row.get(key)
}
if custom_attributes:
entity_data[str(EXAMPLE_PRIVATE_NS.attributes)] = json.dumps(
custom_attributes
)
return entity_data
async def build_knowledge_graph(df, dataset_uri, api_client):
entities_api = EntitiesApi(api_client)
for _, row in df.iterrows():
try:
if row["page_type"] == "PDP":
entity_data = create_product_entity(row, dataset_uri)
elif row["page_type"] == "PLP":
entity_data = create_collection_entity(row, dataset_uri)
else:
logger.warning(
f"Skipping unknown page type for URL: {row['url']}"
)
continue
if entity_data is None:
logger.warning(
f"Skipping page due to missing critical data: {row['url']}"
)
continue
await create_entity(entities_api, entity_data)
logger.info(
f"Created entity for {row['page_type']}: {row['title']}"
)
except Exception as e:
logger.error(
f"Error creating entity for {row['page_type']}: {row['title']}"
)
logger.error(f"Error: {str(e)}")
Running the full workflow¶
# ----------------------------
# Main Execution
# ----------------------------
# Global configuration variables
CRAWL_URL = "https://product-finder.wordlift.io/"
OUTPUT_FILE = "crawl_results.jl"
async def main():
# Step 1: Crawl the website
crawl_website(CRAWL_URL, OUTPUT_FILE)
# Step 2: Load the crawled data
df = pd.read_json(OUTPUT_FILE, lines=True)
# Step 3: Analyze URL patterns
df = analyze_url_patterns(df)
# Step 4: Extract page data
pages_df = extract_page_data(df)
async with ApiClient(configuration) as api_client:
# Clean up the existing knowledge graph
try:
await cleanup_knowledge_graph(api_client)
logger.info(f"Knowledge Graph Cleaned Up")
except Exception as e:
logger.error(
f"Failed to clean up the existing Knowledge Graph: {str(e)}"
)
return # Exit if cleanup fails
# Build the new knowledge graph
await build_knowledge_graph(pages_df, CRAWL_URL, api_client)
logger.info("Knowledge graph building completed.")
if __name__ == "__main__":
asyncio.run(main())
Now let's query the products in the Knowledge Graph with GraphQL¶
async def perform_graphql_query(api_client):
graphql_api = GraphQLApi(api_client)
query = """
{
products(rows: 20) {
id: iri
category: string(name:"schema:category")
name: string(name:"schema:name")
description: string(name:"schema:description")
url: string(name:"schema:url")
}
}
"""
request = GraphqlRequest(query=query)
try:
response = await graphql_api.graphql_using_post(body=request)
print("GraphQL Query Results:")
print(json.dumps(response, indent=2))
except Exception as e:
logger.error(f"An error occurred during GraphQL query: {e}")
async with ApiClient(configuration) as api_client:
# Step 6: Perform GraphQL query
await perform_graphql_query(api_client)
logger.info("Knowledge graph building and GraphQL query completed.")
Leveraging the Knowledge Graph¶
Now that we have successfully created a Knowledge Graph for the e-commerce website, complete with product embeddings, we can use it to improve the user experience and add new functionality. The embeddings generated for each product allow us to run semantic similarity searches and build more intelligent systems.
Adding structured data to your web pages¶
In this section we run a quick test of WordLift's Data API. This API injects structured data markup from the Knowledge Graph (KG) into your web pages. Structured data helps search engines understand your content better, potentially earning rich results in search and improving SEO.
This example uses a Knowledge Graph preconfigured for the demo e-commerce website and references a fictitious URL: https://data-science-with-python-for-seo.wordlift.dev.
When calling the WordLift Data API, we simply pass in a URL and receive the corresponding JSON-LD (JavaScript Object Notation for Linked Data). For an e-commerce site, this structured data typically includes product details, prices, and availability.
The get_json_ld_from_url() function below demonstrates the flow: it takes a URL as input and returns the structured data as JSON-LD, ready to be injected into your web page.
def get_json_ld_from_url(url):
# Construct the API URL by prefixing with 'https://api.wordlift.io/data/https/'
api_url = "https://api.wordlift.io/data/https/" + url.replace(
"https://", ""
)
# Make the GET request to the API
response = requests.get(api_url)
# Check if the request was successful
if response.status_code == 200:
# Parse the JSON-LD from the response
json_ld = response.json()
return json_ld
else:
print(f"Failed to retrieve data: {response.status_code}")
return None
def pretty_print_json(json_obj):
# Pretty print the JSON object
print(json.dumps(json_obj, indent=4))
# Let's run a test
url = "https://data-science-with-python-for-seo.wordlift.dev/product/100-pure-deluxe-travel-pack-duo-2/"
json_ld = get_json_ld_from_url(url)
json_ld
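As a small, optional sketch of how this markup might be used (the to_json_ld_script_tag helper below is illustrative, not part of the WordLift API), the JSON-LD can be wrapped in a standard script tag of type application/ld+json and placed in the page's head:
# Hypothetical helper: wrap the JSON-LD returned by get_json_ld_from_url()
# in a <script type="application/ld+json"> tag for injection into a page.
def to_json_ld_script_tag(json_ld_obj):
    if json_ld_obj is None:
        return ""
    return (
        '<script type="application/ld+json">\n'
        + json.dumps(json_ld_obj, indent=2)
        + "\n</script>"
    )
print(to_json_ld_script_tag(json_ld))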
Generating links to similar products with WordLift Neural Search¶
With the product embeddings in place, we can now use WordLift Neural Search to recommend similar products to users. Showing semantically related items increases engagement and can help drive sales.
Unlike traditional keyword matching, semantic similarity considers the context and meaning of product descriptions. This yields more nuanced and accurate recommendations, even when products share no exact keywords.
The get_top_k_similar_urls function defined below implements this: given a product URL, it returns a list of semantically similar products ranked by similarity score.
For example, if a user is viewing a red cotton T-shirt, the function might suggest other cotton T-shirts in different colors, or similarly styled tops in different fabrics. This creates a more intuitive and engaging shopping experience.
By adding this neural search capability, we deliver a more personalized and efficient shopping experience, which can improve user satisfaction and conversion rates.
async def get_top_k_similar_urls(configuration, query_url: str, top_k: int):
request = VectorSearchQueryRequest(
query_url=query_url,
similarity_top_k=top_k,
)
async with wordlift_client.ApiClient(configuration) as api_client:
api_instance = VectorSearchQueriesApi(api_client)
try:
page = await api_instance.create_query(
vector_search_query_request=request
)
return [
{
"url": item.id,
"name": item.text.split("\n")[0],
"score": item.score,
}
for item in page.items
if item.id and item.text
]
except Exception as e:
logger.error(f"Error querying for entities: {e}", exc_info=True)
return None
top_k = 10
url = "https://data-science-with-python-for-seo.wordlift.dev/product/100-mineral-sunscreen-spf-30/"
similar_urls = await get_top_k_similar_urls(
configuration, query_url=url, top_k=top_k
)
print(json.dumps(similar_urls, indent=2))
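For illustration only, here is a hypothetical rendering step (render_similar_products is not part of the WordLift API) that turns the results into a simple related-products HTML block for a product page:
# Hypothetical rendering step: turn the neural-search results into a
# "You may also like" HTML snippet for a product detail page.
def render_similar_products(items):
    if not items:
        return ""
    links = []
    for item in items:
        links.append(
            '  <li><a href="{url}">{name}</a></li>'.format(
                url=item["url"], name=item["name"]
            )
        )
    return '<ul class="similar-products">\n' + "\n".join(links) + "\n</ul>"
print(render_similar_products(similar_urls))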
Building a chatbot for the e-commerce website with LlamaIndex 🦙¶
The Knowledge Graph we have built is the perfect foundation for an intelligent chatbot. LlamaIndex (formerly GPT Index) is a powerful data framework for structuring private or domain-specific data and making it accessible to Large Language Models (LLMs). With LlamaIndex we can create a context-aware chatbot that understands our product catalog and assists customers efficiently.
By combining LlamaIndex with the Knowledge Graph, we can build a chatbot that responds to direct queries. With its understanding of the product catalog, it can:
- Answer questions about product specifications, availability, and pricing
- Provide personalized product recommendations based on customer preferences
- Compare similar products
This approach enables more natural and valuable customer interactions, improving the shopping experience. The chatbot draws on the structured data in the Knowledge Graph, using LlamaIndex to retrieve and present the relevant information efficiently through an LLM.
The next sections show how to integrate LlamaIndex with the Knowledge Graph data and create a chatbot that can intelligently serve e-commerce customers.
Installing LlamaIndex and WordLiftVectorStore 💪¶
%%capture
!pip install llama-index
!pip install -U 'git+https://github.com/wordlift/llama_index.git#egg=llama-index-vector-stores-wordlift&subdirectory=llama-index-integrations/vector_stores/llama-index-vector-stores-wordlift'
!pip install llama-index-embeddings-nomic
# import the necessary modules
from llama_index.vector_stores.wordlift import WordliftVectorStore
from llama_index.core import VectorStoreIndex
Configuring NomicEmbeddings for the query engine¶
Nomic has released v1.5 of its embedding model 🪆🪆🪆, which brings significant improvements to text embeddings. Embeddings convert text into numerical representations that capture semantic meaning, allowing our system to understand and compare the content of queries and documents.
Key features of Nomic v1.5 include:
- Variable-sized embeddings from 64 to 768 dimensions
- Matryoshka learning for nested representations
- A context length extended to 8192 tokens
Because of these capabilities, WordLift already uses NomicEmbeddings, and we now configure LlamaIndex to use the same model when encoding user queries. Using one embedding model across the whole stack ensures better alignment between the Knowledge Graph and query understanding.
You can read more about NomicEmbeddings here.
Get a free API key here.
from llama_index.embeddings.nomic import NomicEmbedding
nomic_api_key = os.getenv("NOMIC_KEY")
embed_model = NomicEmbedding(
api_key=nomic_api_key,
dimensionality=128,
model_name="nomic-embed-text-v1.5",
)
embedding = embed_model.get_text_embedding("Hey Ho SEO!")
len(embedding)
We'll use OpenAI as the default LLM for generating responses, but any other available LLM can be used instead.
# Set the environment variable
os.environ["OPENAI_API_KEY"] = OPENAI_KEY
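As an optional, minimal sketch (the model name below is only an example), the LLM can also be set explicitly through LlamaIndex's Settings; any other supported LLM can be plugged in the same way:
# Optional: explicitly set the LLM used by the query engine.
# The model name is just an example; swap in any LlamaIndex-supported LLM.
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0)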
Now let's set up WordliftVectorStore using the data in our Knowledge Graph.
# Let's configure WordliftVectorStore using our WordLift key
vector_store = WordliftVectorStore(key=WORDLIFT_KEY)
# Create an index from the vector store
index = VectorStoreIndex.from_vector_store(
vector_store, embed_model=embed_model
)
# Create a query engine
query_engine = index.as_query_engine()
query1 = "Can you give me a product similar to the facial puff? Please add the URL also"
result1 = query_engine.query(query1)
print(result1)
# Helper to run a query against the vector store index
def run_query(query):
    # Create an index from the vector store
    index = VectorStoreIndex.from_vector_store(
        vector_store, embed_model=embed_model
    )
    # Create a query engine
    query_engine = index.as_query_engine()
    response = query_engine.query(query)
    return response
# Interactive query loop
while True:
user_query = input("Enter your query (or 'quit' to exit): ")
if user_query.lower() == "quit":
break
result = run_query(user_query)
print(result)
print("\n---\n")