自定义嵌入模型¶

LlamaIndex 支持来自 OpenAI、Azure 和 Langchain 的嵌入模型。但如果这些仍不能满足需求，您还可以实现任何自定义的嵌入模型！

以下示例使用 Instructor 嵌入模型（安装/设置说明详见此处），并实现了一个自定义嵌入类。Instructor 嵌入的工作原理是：除了提供待嵌入的文本外，还需要指定文本所属领域的"指令"。当处理非常专业和特定领域的文本嵌入时，这种方法特别有用。

如果您在 Colab 上打开此 Notebook，可能需要安装 LlamaIndex 🦙。

In [ ]:

Copied!

!pip install llama-index
!pip install llama-index

In [ ]:

Copied!

# Install dependencies
# !pip install InstructorEmbedding torch transformers sentence-transformers
# Install dependencies
# !pip install InstructorEmbedding torch transformers sentence-transformers

In [ ]:

Copied!

import openai
import os

os.environ["OPENAI_API_KEY"] = "YOUR_API_KEY"
openai.api_key = os.environ["OPENAI_API_KEY"]
import openai
import os

os.environ["OPENAI_API_KEY"] = "YOUR_API_KEY"
openai.api_key = os.environ["OPENAI_API_KEY"]

自定义嵌入实现¶

In [ ]:

Copied!





from typing import Any, List
from InstructorEmbedding import INSTRUCTOR

from llama_index.core.bridge.pydantic import PrivateAttr
from llama_index.core.embeddings import BaseEmbedding


class InstructorEmbeddings(BaseEmbedding):
    _model: INSTRUCTOR = PrivateAttr()
    _instruction: str = PrivateAttr()

    def __init__(
        self,
        instructor_model_name: str = "hkunlp/instructor-large",
        instruction: str = "Represent a document for semantic search:",
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self._model = INSTRUCTOR(instructor_model_name)
        self._instruction = instruction

    @classmethod
    def class_name(cls) -> str:
        return "instructor"

    async def _aget_query_embedding(self, query: str) -> List[float]:
        return self._get_query_embedding(query)

    async def _aget_text_embedding(self, text: str) -> List[float]:
        return self._get_text_embedding(text)

    def _get_query_embedding(self, query: str) -> List[float]:
        embeddings = self._model.encode([[self._instruction, query]])
        return embeddings[0]

    def _get_text_embedding(self, text: str) -> List[float]:
        embeddings = self._model.encode([[self._instruction, text]])
        return embeddings[0]

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        embeddings = self._model.encode(
            [[self._instruction, text] for text in texts]
        )
        return embeddings
from typing import Any, List
from InstructorEmbedding import INSTRUCTOR

from llama_index.core.bridge.pydantic import PrivateAttr
from llama_index.core.embeddings import BaseEmbedding


class InstructorEmbeddings(BaseEmbedding):
    _model: INSTRUCTOR = PrivateAttr()
    _instruction: str = PrivateAttr()

    def __init__(
        self,
        instructor_model_name: str = "hkunlp/instructor-large",
        instruction: str = "Represent a document for semantic search:",
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self._model = INSTRUCTOR(instructor_model_name)
        self._instruction = instruction

    @classmethod
    def class_name(cls) -> str:
        return "instructor"

    async def _aget_query_embedding(self, query: str) -> List[float]:
        return self._get_query_embedding(query)

    async def _aget_text_embedding(self, text: str) -> List[float]:
        return self._get_text_embedding(text)

    def _get_query_embedding(self, query: str) -> List[float]:
        embeddings = self._model.encode([[self._instruction, query]])
        return embeddings[0]

    def _get_text_embedding(self, text: str) -> List[float]:
        embeddings = self._model.encode([[self._instruction, text]])
        return embeddings[0]

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        embeddings = self._model.encode(
            [[self._instruction, text] for text in texts]
        )
        return embeddings

使用示例¶

In [ ]:

Copied!

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core import Settings
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core import Settings

下载数据¶

In [ ]:

Copied!

!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'

加载文档¶

In [ ]:

Copied!

documents = SimpleDirectoryReader("./data/paul_graham/").load_data()
documents = SimpleDirectoryReader("./data/paul_graham/").load_data()

In [ ]:

Copied!

embed_model = InstructorEmbeddings(embed_batch_size=2)

Settings.embed_model = embed_model
Settings.chunk_size = 512

# if running for the first time, will download model weights first!
index = VectorStoreIndex.from_documents(documents)
embed_model = InstructorEmbeddings(embed_batch_size=2)

Settings.embed_model = embed_model
Settings.chunk_size = 512

# if running for the first time, will download model weights first!
index = VectorStoreIndex.from_documents(documents)

load INSTRUCTOR_Transformer
max_seq_length  512

In [ ]:

Copied!

response = index.as_query_engine().query("What did the author do growing up?")
print(response)
response = index.as_query_engine().query("What did the author do growing up?")
print(response)

The author wrote short stories and also worked on programming, specifically on an IBM 1401 computer in 9th grade. They used an early version of Fortran and had to type programs on punch cards. Later on, they got a microcomputer, a TRS-80, and started programming more extensively, writing simple games and a word processor. They initially planned to study philosophy in college but eventually switched to AI.