knowledge
quickStart
Prepare a knowledge base, store it as vectors, and hand it to the AI.
DB setup
```python
from agno.embedder.google import GeminiEmbedder
from agno.vectordb.lancedb import LanceDb, SearchType

vector_db = LanceDb(
    table_name="recipes",
    uri="tmp/lancedb",
    # search_type=SearchType.vector,
    search_type=SearchType.hybrid,
    embedder=GeminiEmbedder(),
)
```
search_type has three options: hybrid, vector, and keyword.
In theory hybrid should be the better fit, but in practice, who knows.
In the official example, hybrid actually failed to find anything:
```python
from agno.agent import Agent
from agno.models.google import Gemini
from agno.embedder.google import GeminiEmbedder
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
from agno.vectordb.lancedb import LanceDb, SearchType


def test():
    knowledge_base = PDFUrlKnowledgeBase(
        urls=["https://agno-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
        vector_db=LanceDb(
            table_name="recipes",
            uri="tmp/lancedb",
            # search_type=SearchType.vector,
            search_type=SearchType.keyword,
            embedder=GeminiEmbedder(),
        ),
    )
    knowledge_base.load(recreate=True, upsert=True)

    agent = Agent(
        model=Gemini("gemini-2.0-flash-lite"),
        knowledge=knowledge_base,
        search_knowledge=True,
        show_tool_calls=True,
        markdown=True,
    )
    agent.print_response(
        "How do I make chicken and galangal in coconut milk soup", stream=True
    )
```
Switching to vector mode finds it.
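To see which mode actually returns hits, you can also query the table directly instead of going through an agent. A minimal sketch, assuming agno's vector DBs expose a `search(query, limit)` method returning matched documents (the query string is just an example):

```python
from agno.embedder.google import GeminiEmbedder
from agno.vectordb.lancedb import LanceDb, SearchType

# Compare all three search types against the same already-loaded table.
# Assumption: LanceDb.search(query, limit) returns a list of Documents.
for search_type in (SearchType.vector, SearchType.keyword, SearchType.hybrid):
    db = LanceDb(
        table_name="recipes",
        uri="tmp/lancedb",
        search_type=search_type,
        embedder=GeminiEmbedder(),
    )
    results = db.search("chicken galangal coconut milk soup", limit=3)
    print(search_type, "->", len(results), "results")
```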
Knowledge base setup
```python
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase

# ...
knowledge_base = PDFUrlKnowledgeBase(
    urls=["https://agno-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
    vector_db=vector_db,
)
```
Run it!
```python
agent = Agent(
    model=Gemini("gemini-2.0-flash-lite"),
    knowledge=knowledge_base,
    search_knowledge=True,
    show_tool_calls=True,
    markdown=True,
)
agent.print_response(
    "How do I make chicken and galangal in coconut milk soup", stream=True
)
```
Full code
```python
from agno.agent import Agent
from agno.models.google import Gemini
from agno.embedder.google import GeminiEmbedder
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
from agno.vectordb.lancedb import LanceDb, SearchType


def test():
    knowledge_base = PDFUrlKnowledgeBase(
        urls=["https://agno-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
        vector_db=LanceDb(
            table_name="recipes",
            uri="tmp/lancedb",
            # search_type=SearchType.vector,
            search_type=SearchType.keyword,
            embedder=GeminiEmbedder(),
        ),
    )
    knowledge_base.load(recreate=True, upsert=True)

    agent = Agent(
        model=Gemini("gemini-2.0-flash-lite"),
        knowledge=knowledge_base,
        search_knowledge=True,
        show_tool_calls=True,
        markdown=True,
    )
    agent.print_response(
        "How do I make chicken and galangal in coconut milk soup", stream=True
    )
```
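To run the full example as a script, add a standard entry point (assuming the function above is saved in a file you execute directly):

```python
if __name__ == "__main__":
    test()
```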
Knowledge filters
Tagging knowledge with metadata
Load all documents at once
```python
knowledge_base = PDFKnowledgeBase(
    path=[
        {
            "path": "path/to/cv1.pdf",
            "metadata": {
                "user_id": "jordan_mitchell",
                "document_type": "cv",
                "year": 2025,
            },
        },
        # ... more documents ...
    ],
    vector_db=vector_db,
)
knowledge_base.load(recreate=True)
```
Load documents one by one
```python
# Initialize the PDFKnowledgeBase
knowledge_base = PDFKnowledgeBase(
    vector_db=vector_db,
    num_documents=5,
)

# Load first document with user_1 metadata
knowledge_base.load_document(
    path="path/to/cv1.pdf",
    metadata={"user_id": "jordan_mitchell", "document_type": "cv", "year": 2025},
    recreate=True,  # Set to True only for the first run, then set to False
)

# Load second document with user_2 metadata
knowledge_base.load_document(
    path="path/to/cv2.pdf",
    metadata={"user_id": "taylor_brooks", "document_type": "cv", "year": 2025},
)
```
Filtering
Manual filters
Just add knowledge_filters to the agent:
```python
agent = Agent(
    name="KnowledgeFilterAgent",
    # ...
    knowledge_filters={"user_id": "jordan_mitchell"},
)
```
This is matched against the metadata in the knowledge base; only documents that match are searched.
Alternatively, filter on a single query:
```python
agent = Agent(
    knowledge=knowledge_base,
    search_knowledge=True,
)
agent.print_response(
    "Tell me about Jordan Mitchell's experience and skills",
    knowledge_filters={"user_id": "jordan_mitchell"},
    markdown=True,
)
```
Agentic filters
```python
agent = Agent(
    knowledge=knowledge_base,
    search_knowledge=True,
    enable_agentic_knowledge_filters=True,
)
agent.print_response(
    "Tell me about Jordan Mitchell's experience and skills with jordan_mitchell as user id and document type cv",
    markdown=True,
)
```
According to the docs, the agent infers from the input and the knowledge base's metadata which filters to apply automatically, which should make retrieval more efficient.
Combined knowledge base
```python
from agno.knowledge.combined import CombinedKnowledgeBase
from agno.vectordb.pgvector import PgVector
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
from agno.knowledge.website import WebsiteKnowledgeBase
from agno.knowledge.pdf import PDFKnowledgeBase, PDFReader

url_pdf_knowledge_base = PDFUrlKnowledgeBase(
    urls=["pdf_url"],
    # Table name: ai.pdf_documents
    vector_db=PgVector(
        table_name="pdf_documents",
        db_url="postgresql+psycopg://ai:ai@localhost:5532/ai",
    ),
)

website_knowledge_base = WebsiteKnowledgeBase(
    urls=["https://docs.agno.com/introduction"],
    # Number of links to follow from the seed URLs
    max_links=10,
    # Table name: ai.website_documents
    vector_db=PgVector(
        table_name="website_documents",
        db_url="postgresql+psycopg://ai:ai@localhost:5532/ai",
    ),
)

local_pdf_knowledge_base = PDFKnowledgeBase(
    path="data/pdfs",
    # Table name: ai.pdf_documents
    vector_db=PgVector(
        table_name="pdf_documents",
        db_url="postgresql+psycopg://ai:ai@localhost:5532/ai",
    ),
    reader=PDFReader(chunk=True),
)

knowledge_base = CombinedKnowledgeBase(
    sources=[
        url_pdf_knowledge_base,
        website_knowledge_base,
        local_pdf_knowledge_base,
    ],
    vector_db=PgVector(
        # Table name: ai.combined_documents
        table_name="combined_documents",
        db_url="postgresql+psycopg://ai:ai@localhost:5532/ai",
    ),
)
```
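The combined base is then loaded and used like any single knowledge base; a sketch reusing the Agent setup from earlier on this page (the question is a placeholder):

```python
from agno.agent import Agent
from agno.models.google import Gemini

# Load all three sources into the combined table (use recreate=True on the first run).
knowledge_base.load(recreate=False)

agent = Agent(
    model=Gemini("gemini-2.0-flash-lite"),
    knowledge=knowledge_base,
    search_knowledge=True,
    markdown=True,
)
agent.print_response("What is Agno?", stream=True)
```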
Chunking
Fixed-size chunking
```python
from agno.document.chunking.fixed import FixedSizeChunking
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
from agno.vectordb.pgvector import PgVector

db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

knowledge_base = PDFUrlKnowledgeBase(
    urls=["https://agno-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
    vector_db=PgVector(table_name="recipes_fixed_size_chunking", db_url=db_url),
    chunking_strategy=FixedSizeChunking(chunk_size=5000, overlap=0),
)
```
| parameter | type | default | description |
| --- | --- | --- | --- |
| chunk_size | int | 5000 | Maximum size of each chunk |
| overlap | int | 0 | Overlap between chunks, in characters |
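To get a feel for what these parameters do, you can run a strategy on a document directly; a minimal sketch, assuming `Document` is importable from `agno.document` and chunking strategies expose a `chunk(document)` method:

```python
from agno.document import Document  # assumption: Document is exported here
from agno.document.chunking.fixed import FixedSizeChunking

# Split a synthetic document and inspect the resulting chunk sizes.
doc = Document(content="lorem ipsum " * 1000)
chunks = FixedSizeChunking(chunk_size=500, overlap=50).chunk(doc)
for chunk in chunks:
    print(len(chunk.content))
```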
Agentic chunking
Intelligent chunking? A model decides where to split.
```python
from agno.document.chunking.agentic import AgenticChunking
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
from agno.vectordb.pgvector import PgVector

db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

knowledge_base = PDFUrlKnowledgeBase(
    urls=["https://agno-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
    vector_db=PgVector(table_name="recipes_agentic_chunking", db_url=db_url),
    chunking_strategy=AgenticChunking(),
)
```
| parameter | type | default | description |
| --- | --- | --- | --- |
| model | Model | OpenAIChat | Model used to decide chunk boundaries |
| max_chunk_size | int | 5000 | Maximum size of each chunk |
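The default model is OpenAIChat; since the rest of this page uses Gemini, you would presumably swap it in. A sketch, assuming AgenticChunking accepts the same Model objects as Agent:

```python
from agno.document.chunking.agentic import AgenticChunking
from agno.models.google import Gemini

chunking_strategy = AgenticChunking(
    model=Gemini("gemini-2.0-flash-lite"),  # assumption: any agno Model works here
    max_chunk_size=3000,
)
```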
Semantic chunking
Uses an embedder model to split by semantic similarity.
```python
from agno.document.chunking.semantic import SemanticChunking
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
from agno.vectordb.pgvector import PgVector

db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

knowledge_base = PDFUrlKnowledgeBase(
    urls=["https://agno-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
    vector_db=PgVector(table_name="recipes_semantic_chunking", db_url=db_url),
    chunking_strategy=SemanticChunking(),
)
```
| parameter | type | default | description |
| --- | --- | --- | --- |
| embedder | Embedder | OpenAIEmbedder | Embedder used for semantic chunking |
| chunk_size | int | 5000 | Maximum size of each chunk |
| similarity_threshold | float | 0.5 | Similarity threshold used to determine chunk boundaries |
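Likewise, the default embedder is OpenAIEmbedder; to stay on Gemini you would swap it. A sketch, assuming SemanticChunking accepts any agno Embedder:

```python
from agno.document.chunking.semantic import SemanticChunking
from agno.embedder.google import GeminiEmbedder

chunking_strategy = SemanticChunking(
    embedder=GeminiEmbedder(),  # assumption: any agno Embedder works here
    similarity_threshold=0.6,
)
```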
Recursive chunking
Used for large documents; splits them recursively into smaller and smaller chunks.
```python
from agno.document.chunking.recursive import RecursiveChunking
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
from agno.vectordb.pgvector import PgVector

db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

knowledge_base = PDFUrlKnowledgeBase(
    urls=["https://agno-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
    vector_db=PgVector(table_name="recipes_recursive_chunking", db_url=db_url),
    chunking_strategy=RecursiveChunking(),
)
```
| parameter | type | default | description |
| --- | --- | --- | --- |
| chunk_size | int | 5000 | Maximum size of each chunk |
| overlap | int | 0 | Overlap between chunks, in characters |
Document chunking
Chunks by document structure (paragraphs, sections).
```python
from agno.document.chunking.document import DocumentChunking
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
from agno.vectordb.pgvector import PgVector

db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

knowledge_base = PDFUrlKnowledgeBase(
    urls=["https://agno-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
    vector_db=PgVector(table_name="recipes_document_chunking", db_url=db_url),
    chunking_strategy=DocumentChunking(),
)
```
| parameter | type | default | description |
| --- | --- | --- | --- |
| chunk_size | int | 5000 | Maximum size of each chunk |
| overlap | int | 0 | Overlap between chunks, in characters |