知识图谱构建中的LLM路径提取器对比¶

在本笔记本中，我们将比较来自llama_index的三种不同LLM路径提取器：

SimpleLLMPathExtractor（简单LLM路径提取器）
SchemaLLMPathExtractor（模式LLM路径提取器）
DynamicLLMPathExtractor（动态LLM路径提取器，新增）

我们将使用维基百科页面作为测试数据，并使用Pyvis可视化生成的知识图谱。

安装与导入¶

In [ ]:

Copied!

!pip install llama_index pyvis wikipedia
!pip install llama_index pyvis wikipedia

In [ ]:

Copied!





from llama_index.core import Document, PropertyGraphIndex
from llama_index.core.indices.property_graph import (
    SimpleLLMPathExtractor,
    SchemaLLMPathExtractor,
    DynamicLLMPathExtractor,
)
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

import wikipedia

import os
from llama_index.core import Document, PropertyGraphIndex
from llama_index.core.indices.property_graph import (
    SimpleLLMPathExtractor,
    SchemaLLMPathExtractor,
    DynamicLLMPathExtractor,
)
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

import wikipedia

import os

In [ ]:

Copied!

import nest_asyncio

nest_asyncio.apply()
import nest_asyncio

nest_asyncio.apply()

配置LLM后端¶

In [ ]:

Copied!





os.environ["OPENAI_API_KEY"] = "sk-proj-..."

# Set up global configurations
llm = OpenAI(temperature=0.0, model="gpt-3.5-turbo")

Settings.llm = llm
Settings.chunk_size = 2048
Settings.chunk_overlap = 20
os.environ["OPENAI_API_KEY"] = "sk-proj-..."

# Set up global configurations
llm = OpenAI(temperature=0.0, model="gpt-3.5-turbo")

Settings.llm = llm
Settings.chunk_size = 2048
Settings.chunk_overlap = 20

从维基百科获取原始文本¶

In [ ]:

Copied!





def get_wikipedia_content(title):
    try:
        page = wikipedia.page(title)
        return page.content
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Disambiguation page. Options: {e.options}")
    except wikipedia.exceptions.PageError:
        print(f"Page '{title}' does not exist.")
    return None
def get_wikipedia_content(title):
    try:
        page = wikipedia.page(title)
        return page.content
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Disambiguation page. Options: {e.options}")
    except wikipedia.exceptions.PageError:
        print(f"Page '{title}' does not exist.")
    return None

In [ ]:

Copied!





wiki_title = "Barack Obama"
content = get_wikipedia_content(wiki_title)

if content:
    document = Document(text=content, metadata={"title": wiki_title})
    print(
        f"Fetched content for '{wiki_title}' (length: {len(content)} characters)"
    )
else:
    print("Failed to fetch Wikipedia content.")
wiki_title = "Barack Obama"
content = get_wikipedia_content(wiki_title)

if content:
    document = Document(text=content, metadata={"title": wiki_title})
    print(
        f"Fetched content for '{wiki_title}' (length: {len(content)} characters)"
    )
else:
    print("Failed to fetch Wikipedia content.")

Fetched content for 'Barack Obama' (length: 83977 characters)

1. SimpleLLMPathExtractor¶

In [ ]:

Copied!





kg_extractor = SimpleLLMPathExtractor(
    llm=llm, max_paths_per_chunk=20, num_workers=4
)

simple_index = PropertyGraphIndex.from_documents(
    [document],
    llm=llm,
    embed_kg_nodes=False,
    kg_extractors=[kg_extractor],
    show_progress=True,
)

simple_index.property_graph_store.save_networkx_graph(
    name="./SimpleGraph.html"
)
simple_index.property_graph_store.get_triplets(
    entity_names=["Barack Obama", "Obama"]
)[:5]
kg_extractor = SimpleLLMPathExtractor(
    llm=llm, max_paths_per_chunk=20, num_workers=4
)

simple_index = PropertyGraphIndex.from_documents(
    [document],
    llm=llm,
    embed_kg_nodes=False,
    kg_extractors=[kg_extractor],
    show_progress=True,
)

simple_index.property_graph_store.save_networkx_graph(
    name="./SimpleGraph.html"
)
simple_index.property_graph_store.get_triplets(
    entity_names=["Barack Obama", "Obama"]
)[:5]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting paths from text: 100%|██████████| 11/11 [00:09<00:00,  1.19it/s]

Out[ ]:

[(EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'c4bbe9b8-ccd0-464c-b34c-37ede77f2717'}, name='Obama'),
  Relation(label='Has', source_id='Obama', target_id='Half-sister', properties={'title': 'Barack Obama', 'triplet_source_id': 'bd93d2e0-ab20-4f4c-a412-bb42f93ae56f'}),
  EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'bd93d2e0-ab20-4f4c-a412-bb42f93ae56f'}, name='Half-sister')),
 (EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'c4bbe9b8-ccd0-464c-b34c-37ede77f2717'}, name='Obama'),
  Relation(label='Selected', source_id='Obama', target_id='Joe biden as his vice presidential running mate', properties={'title': 'Barack Obama', 'triplet_source_id': 'bc18ad10-3040-41a8-b595-4dd8ddb31a0b'}),
  EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'bc18ad10-3040-41a8-b595-4dd8ddb31a0b'}, name='Joe biden as his vice presidential running mate')),
 (EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'c4bbe9b8-ccd0-464c-b34c-37ede77f2717'}, name='Obama'),
  Relation(label='Made', source_id='Obama', target_id='First public speech', properties={'title': 'Barack Obama', 'triplet_source_id': '6c89e860-215d-4f5b-8b1c-3183fe71bb6c'}),
  EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '6c89e860-215d-4f5b-8b1c-3183fe71bb6c'}, name='First public speech')),
 (EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'c4bbe9b8-ccd0-464c-b34c-37ede77f2717'}, name='Obama'),
  Relation(label='Banned', source_id='Obama', target_id='New offshore oil and gas drilling', properties={'title': 'Barack Obama', 'triplet_source_id': '62942a1e-18ae-4f45-9c73-ea39934f5519'}),
  EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '62942a1e-18ae-4f45-9c73-ea39934f5519'}, name='New offshore oil and gas drilling')),
 (EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'c4bbe9b8-ccd0-464c-b34c-37ede77f2717'}, name='Obama'),
  Relation(label='Met with', source_id='Obama', target_id='Australian prime minister', properties={'title': 'Barack Obama', 'triplet_source_id': 'c4bbe9b8-ccd0-464c-b34c-37ede77f2717'}),
  EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'c4bbe9b8-ccd0-464c-b34c-37ede77f2717'}, name='Australian prime minister'))]

2. DynamicLLMPathExtractor¶

无初始本体：¶

在此场景中，我们让大语言模型动态定义本体结构，赋予其完全自由以最合适的方式标注节点。

In [ ]:

Copied!





kg_extractor = DynamicLLMPathExtractor(
    llm=llm,
    max_triplets_per_chunk=20,
    num_workers=4,
    # Let the LLM infer entities and their labels (types) on the fly
    allowed_entity_types=None,
    # Let the LLM infer relationships on the fly
    allowed_relation_types=None,
    # LLM will generate any entity properties, set `None` to skip property generation (will be faster without)
    allowed_relation_props=[],
    # LLM will generate any relation properties, set `None` to skip property generation (will be faster without)
    allowed_entity_props=[],
)

dynamic_index = PropertyGraphIndex.from_documents(
    [document],
    llm=llm,
    embed_kg_nodes=False,
    kg_extractors=[kg_extractor],
    show_progress=True,
)

dynamic_index.property_graph_store.save_networkx_graph(
    name="./DynamicGraph.html"
)

dynamic_index.property_graph_store.get_triplets(
    entity_names=["Barack Obama", "Obama"]
)[:5]
kg_extractor = DynamicLLMPathExtractor(
    llm=llm,
    max_triplets_per_chunk=20,
    num_workers=4,
    # Let the LLM infer entities and their labels (types) on the fly
    allowed_entity_types=None,
    # Let the LLM infer relationships on the fly
    allowed_relation_types=None,
    # LLM will generate any entity properties, set `None` to skip property generation (will be faster without)
    allowed_relation_props=[],
    # LLM will generate any relation properties, set `None` to skip property generation (will be faster without)
    allowed_entity_props=[],
)

dynamic_index = PropertyGraphIndex.from_documents(
    [document],
    llm=llm,
    embed_kg_nodes=False,
    kg_extractors=[kg_extractor],
    show_progress=True,
)

dynamic_index.property_graph_store.save_networkx_graph(
    name="./DynamicGraph.html"
)

dynamic_index.property_graph_store.get_triplets(
    entity_names=["Barack Obama", "Obama"]
)[:5]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting and inferring knowledge graph from text: 100%|██████████| 11/11 [00:50<00:00,  4.59s/it]

Out[ ]:

[(EntityNode(label='PERSON', embedding=None, properties={'approval_rating': '63 percent', 'title': 'Barack Obama', 'triplet_source_id': '425eced4-ff34-49c2-b4ce-64ac96bf8d43'}, name='Obama'),
  Relation(label='MOVED_TO', source_id='Obama', target_id='Afghanistan', properties={'action': 'moved to bolster', 'quantity': 'U.S. troop strength in Afghanistan', 'title': 'Barack Obama', 'triplet_source_id': 'ff7b416e-2885-4296-b7e2-156cb3578bb1'}),
  EntityNode(label='COUNTRY', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'ff7b416e-2885-4296-b7e2-156cb3578bb1'}, name='Afghanistan')),
 (EntityNode(label='PERSON', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '5137cb5e-04a8-4a71-bc1d-200783ec4628'}, name='Barack Obama'),
  Relation(label='RECEIVED', source_id='Barack Obama', target_id='Our Great National Parks', properties={'award': 'Primetime Emmy Award', 'category': 'Outstanding Narrator', 'title': 'Barack Obama', 'triplet_source_id': '5137cb5e-04a8-4a71-bc1d-200783ec4628'}),
  EntityNode(label='TV SHOW', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '5137cb5e-04a8-4a71-bc1d-200783ec4628'}, name='Our Great National Parks')),
 (EntityNode(label='PERSON', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '5137cb5e-04a8-4a71-bc1d-200783ec4628'}, name='Barack Obama'),
  Relation(label='PUBLISHED', source_id='Barack Obama', target_id='A Promised Land', properties={'title': 'Barack Obama', 'triplet_source_id': '43848a0a-858e-4552-b820-b8831931f63f'}),
  EntityNode(label='BOOK', embedding=None, properties={'release_date': 'November 17', 'title': 'Barack Obama', 'triplet_source_id': 'caf64843-39ce-4992-9c40-e7b1166af804'}, name='A Promised Land')),
 (EntityNode(label='PERSON', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '5137cb5e-04a8-4a71-bc1d-200783ec4628'}, name='Barack Obama'),
  Relation(label='RECEIVED', source_id='Barack Obama', target_id='Shoah Foundation Institute for Visual History and Education', properties={'award': 'Ambassador of Humanity Award', 'title': 'Barack Obama', 'triplet_source_id': '5137cb5e-04a8-4a71-bc1d-200783ec4628'}),
  EntityNode(label='ORGANIZATION', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '5137cb5e-04a8-4a71-bc1d-200783ec4628'}, name='Shoah Foundation Institute for Visual History and Education')),
 (EntityNode(label='PERSON', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '5137cb5e-04a8-4a71-bc1d-200783ec4628'}, name='Barack Obama'),
  Relation(label='SUPPORTED', source_id='Barack Obama', target_id='payday loan regulations', properties={'title': 'Barack Obama', 'triplet_source_id': '13073b9d-68e7-4973-9f70-bd65912d9604'}),
  EntityNode(label='POLICY', embedding=None, properties={'target': 'low-income workers', 'title': 'Barack Obama', 'triplet_source_id': '13073b9d-68e7-4973-9f70-bd65912d9604'}, name='payday loan regulations'))]

基于初始本体指导的知识图谱抽取：¶

在此场景中，我们对于需要识别的目标具备部分先验知识。已知文章主题与巴拉克·奥巴马相关，因此我们预先定义了一些可能辅助指导大语言模型（LLM）在标注过程中识别实体和关系的要素。这种引导并不能确保LLM必定采用这些预定义要素，仅为其提供参考方向。最终是否采用我们提供的实体和关系，仍由LLM自主决定。

In [ ]:

Copied!





kg_extractor = DynamicLLMPathExtractor(
    llm=llm,
    max_triplets_per_chunk=20,
    num_workers=4,
    allowed_entity_types=["POLITICIAN", "POLITICAL_PARTY"],
    allowed_relation_types=["PRESIDENT_OF", "MEMBER_OF"],
    allowed_relation_props=["description"],
    allowed_entity_props=["description"],
)

dynamic_index_2 = PropertyGraphIndex.from_documents(
    [document],
    llm=llm,
    embed_kg_nodes=False,
    kg_extractors=[kg_extractor],
    show_progress=True,
)

dynamic_index_2.property_graph_store.save_networkx_graph(
    name="./DynamicGraph_2.html"
)
dynamic_index_2.property_graph_store.get_triplets(
    entity_names=["Barack Obama", "Obama"]
)[:5]
kg_extractor = DynamicLLMPathExtractor(
    llm=llm,
    max_triplets_per_chunk=20,
    num_workers=4,
    allowed_entity_types=["POLITICIAN", "POLITICAL_PARTY"],
    allowed_relation_types=["PRESIDENT_OF", "MEMBER_OF"],
    allowed_relation_props=["description"],
    allowed_entity_props=["description"],
)

dynamic_index_2 = PropertyGraphIndex.from_documents(
    [document],
    llm=llm,
    embed_kg_nodes=False,
    kg_extractors=[kg_extractor],
    show_progress=True,
)

dynamic_index_2.property_graph_store.save_networkx_graph(
    name="./DynamicGraph_2.html"
)
dynamic_index_2.property_graph_store.get_triplets(
    entity_names=["Barack Obama", "Obama"]
)[:5]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting and inferring knowledge graph from text: 100%|██████████| 11/11 [00:47<00:00,  4.29s/it]

Out[ ]:

[(EntityNode(label='PERSON', embedding=None, properties={'description': '44th President of the United States', 'title': 'Barack Obama', 'triplet_source_id': 'd286a836-a5ad-43af-b6de-bd43f072512c'}, name='Obama'),
  Relation(label='MOVED_TO', source_id='Obama', target_id='Afghanistan', properties={'description': 'moved to bolster U.S. troop strength', 'title': 'Barack Obama', 'triplet_source_id': '23c1750d-de01-4a75-814e-b56b81b9bbb4'}),
  EntityNode(label='COUNTRY', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '23c1750d-de01-4a75-814e-b56b81b9bbb4'}, name='Afghanistan')),
 (EntityNode(label='POLITICIAN', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '8f9dc0b3-ff33-46e9-ad3f-040755d33fc7'}, name='Barack Obama'),
  Relation(label='ESTABLISHED', source_id='Barack Obama', target_id='White House Task Force to Protect Students from Sexual Assault', properties={'title': 'Barack Obama', 'triplet_source_id': '8af352da-b50d-4043-8002-870991473cf6'}),
  EntityNode(label='ORGANIZATION', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '8af352da-b50d-4043-8002-870991473cf6'}, name='White House Task Force to Protect Students from Sexual Assault')),
 (EntityNode(label='POLITICIAN', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '8f9dc0b3-ff33-46e9-ad3f-040755d33fc7'}, name='Barack Obama'),
  Relation(label='BECAME_CHAIRMAN_OF', source_id='Barack Obama', target_id="Illinois Senate\\'s Health and Human Services Committee", properties={'title': 'Barack Obama', 'triplet_source_id': '5bf11d65-0078-48bb-97b5-109b4469d46a'}),
  EntityNode(label='COMMITTEE', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '5bf11d65-0078-48bb-97b5-109b4469d46a'}, name="Illinois Senate\\'s Health and Human Services Committee")),
 (EntityNode(label='PERSON', embedding=None, properties={'description': '44th President of the United States', 'title': 'Barack Obama', 'triplet_source_id': 'd286a836-a5ad-43af-b6de-bd43f072512c'}, name='Obama'),
  Relation(label='USED', source_id='Obama', target_id='last day in office', properties={'description': 'used phrase "thanks, Obama"', 'title': 'Barack Obama', 'triplet_source_id': 'd286a836-a5ad-43af-b6de-bd43f072512c'}),
  EntityNode(label='EVENT', embedding=None, properties={'description': 'final day in office', 'title': 'Barack Obama', 'triplet_source_id': 'd286a836-a5ad-43af-b6de-bd43f072512c'}, name='last day in office')),
 (EntityNode(label='PERSON', embedding=None, properties={'description': '44th President of the United States', 'title': 'Barack Obama', 'triplet_source_id': 'd286a836-a5ad-43af-b6de-bd43f072512c'}, name='Obama'),
  Relation(label='SAID', source_id='Obama', target_id='34,000 U.S. troops', properties={'description': 'said the U.S. military would reduce the troop level in Afghanistan', 'title': 'Barack Obama', 'triplet_source_id': '23c1750d-de01-4a75-814e-b56b81b9bbb4'}),
  EntityNode(label='MILITARY_FORCE', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '23c1750d-de01-4a75-814e-b56b81b9bbb4'}, name='34,000 U.S. troops'))]

3 - SchemaLLMPathExtractor¶

In [ ]:

Copied!





kg_extractor = SchemaLLMPathExtractor(
    llm=llm,
    max_triplets_per_chunk=20,
    strict=False,  # Set to False to showcase why it's not going to be the same as DynamicLLMPathExtractor
    possible_entities=None,  # USE DEFAULT ENTITIES (PERSON, ORGANIZATION... etc)
    possible_relations=None,  # USE DEFAULT RELATIONSHIPS
    possible_relation_props=[
        "extra_description"
    ],  # Set to `None` to skip property generation
    possible_entity_props=[
        "extra_description"
    ],  # Set to `None` to skip property generation
    num_workers=4,
)

schema_index = PropertyGraphIndex.from_documents(
    [document],
    llm=llm,
    embed_kg_nodes=False,
    kg_extractors=[kg_extractor],
    show_progress=True,
)

schema_index.property_graph_store.save_networkx_graph(
    name="./SchemaGraph.html"
)
schema_index.property_graph_store.get_triplets(
    entity_names=["Barack Obama", "Obama"]
)[:5]
kg_extractor = SchemaLLMPathExtractor(
    llm=llm,
    max_triplets_per_chunk=20,
    strict=False,  # Set to False to showcase why it's not going to be the same as DynamicLLMPathExtractor
    possible_entities=None,  # USE DEFAULT ENTITIES (PERSON, ORGANIZATION... etc)
    possible_relations=None,  # USE DEFAULT RELATIONSHIPS
    possible_relation_props=[
        "extra_description"
    ],  # Set to `None` to skip property generation
    possible_entity_props=[
        "extra_description"
    ],  # Set to `None` to skip property generation
    num_workers=4,
)

schema_index = PropertyGraphIndex.from_documents(
    [document],
    llm=llm,
    embed_kg_nodes=False,
    kg_extractors=[kg_extractor],
    show_progress=True,
)

schema_index.property_graph_store.save_networkx_graph(
    name="./SchemaGraph.html"
)
schema_index.property_graph_store.get_triplets(
    entity_names=["Barack Obama", "Obama"]
)[:5]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting paths from text with schema: 100%|██████████| 11/11 [00:52<00:00,  4.81s/it]

Out[ ]:

[(EntityNode(label='PERSON', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '87af3360-fa63-40c2-8440-f4114a7093fd'}, name='Barack Obama'),
  Relation(label='HAS', source_id='Barack Obama', target_id='References', properties={'title': 'Barack Obama', 'triplet_source_id': '87af3360-fa63-40c2-8440-f4114a7093fd'}),
  EntityNode(label='CONCEPT', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '87af3360-fa63-40c2-8440-f4114a7093fd'}, name='References')),
 (EntityNode(label='PERSON', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '87af3360-fa63-40c2-8440-f4114a7093fd'}, name='Barack Obama'),
  Relation(label='INTERCEPTED', source_id='Barack Obama', target_id='pipe bomb', properties={'title': 'Barack Obama', 'triplet_source_id': 'ada0abff-9671-4156-b06c-bf5067e6d54c'}),
  EntityNode(label='PRODUCT', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'ada0abff-9671-4156-b06c-bf5067e6d54c'}, name='pipe bomb')),
 (EntityNode(label='PERSON', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '87af3360-fa63-40c2-8440-f4114a7093fd'}, name='Barack Obama'),
  Relation(label='HAS', source_id='Barack Obama', target_id='end of 2015', properties={'title': 'Barack Obama', 'triplet_source_id': '2b64d219-d19b-4346-a6a0-4369599af5d1'}),
  EntityNode(label='TIME', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '2b64d219-d19b-4346-a6a0-4369599af5d1'}, name='end of 2015')),
 (EntityNode(label='PERSON', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '87af3360-fa63-40c2-8440-f4114a7093fd'}, name='Barack Obama'),
  Relation(label='GRADUATED_FROM', source_id='Barack Obama', target_id='Columbia University', properties={'title': 'Barack Obama', 'triplet_source_id': '65be5ae1-bc74-43ee-9655-855daf81f74f'}),
  EntityNode(label='ORGANIZATION', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '65be5ae1-bc74-43ee-9655-855daf81f74f'}, name='Columbia University')),
 (EntityNode(label='PERSON', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '87af3360-fa63-40c2-8440-f4114a7093fd'}, name='Barack Obama'),
  Relation(label='EDUCATION', source_id='Barack Obama', target_id='Schools and Universities', properties={'extra_description': 'Attended schools and universities', 'title': 'Barack Obama', 'triplet_source_id': '1f495d28-7df4-44dc-a3e3-bfc6161d3d2d'}),
  EntityNode(label='ORGANIZATION', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '1f495d28-7df4-44dc-a3e3-bfc6161d3d2d'}, name='Schools and Universities'))]

对比分析¶

让我们比较三种提取器的结果：

SimpleLLMPathExtractor：该提取器会创建一个没有预定义模式的基础知识图谱。它可能产生更多样化的关系，但在实体和关系命名上可能缺乏一致性。
DynamicLLMPathExtractor：
- 这个新型提取器结合了SimpleLLMPathExtractor的灵活性和模式提供的初始引导。它能够突破初始实体和关系类型的限制，在保持一定一致性的同时，可能生成丰富多样的图谱。
- 不在输入中提供任何初始实体和关系，可以让LLM完全自由地根据其判断即时推断模式。这将因使用的LLM模型和温度参数而有所不同。
SchemaLLMPathExtractor：通过预定义模式，该提取器会生成结构更明确的图谱。实体和关系被限制在模式指定的范围内，这能带来更高的一致性，但也可能导致图谱完整性不足。即使我们将"strict"设为false，提取的知识图谱也不会反映LLM尝试寻找超出输入模式范围的新实体和类型的努力。

关键观察：¶

SimpleLLMPathExtractor生成的图谱可能拥有最多样化的实体和关系集合
SchemaLLMPathExtractor生成的图谱应该是最一致的，但可能会遗漏许多不符合预定义模式的关系，即使我们没有对模式进行严格验证
DynamicLLMPathExtractor生成的图谱应该能在多样性和一致性之间取得平衡，既能捕获基于模式方法可能遗漏的重要关系，又能保持一定的结构

提取器选择取决于具体用例：¶

当需要进行探索性分析且不关心实体类型时，使用SimpleLLMPathExtractor来捕获RAG应用中广泛的潜在关系
当领域定义明确且需要确保提取知识的一致性时，使用SchemaLLMPathExtractor
当需要在结构和灵活性之间取得平衡时，使用DynamicLLMPathExtractor，它允许模型发现新的实体和关系类型，同时仍提供一些初始引导。如果你想要一个带有标注（类型化）实体的知识图谱，但没有输入模式（或仅部分定义了模式作为起始基础），这个提取器特别有用