knowledge
quickStart
Prepare a knowledge base, store it as vectors, and hand it to the AI.
DB setup
```python
from agno.embedder.google import GeminiEmbedder
from agno.vectordb.lancedb import LanceDb, SearchType

vector_db = LanceDb(
    table_name="recipes",
    uri="tmp/lancedb",
    # search_type=SearchType.vector,
    search_type=SearchType.hybrid,
    embedder=GeminiEmbedder(),
)
```
search_type has three options: hybrid, vector, and keyword.
In theory hybrid should be the better fit, but in practice, who knows.
In the official example, hybrid actually failed to find anything:
```python
from agno.agent import Agent
from agno.models.google import Gemini
from agno.embedder.google import GeminiEmbedder
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
from agno.vectordb.lancedb import LanceDb, SearchType


def test():
    knowledge_base = PDFUrlKnowledgeBase(
        urls=["https://agno-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
        vector_db=LanceDb(
            table_name="recipes",
            uri="tmp/lancedb",
            # search_type=SearchType.vector,
            search_type=SearchType.keyword,
            embedder=GeminiEmbedder(),
        ),
    )
    knowledge_base.load(recreate=True, upsert=True)

    agent = Agent(
        model=Gemini("gemini-2.0-flash-lite"),
        knowledge=knowledge_base,
        search_knowledge=True,
        show_tool_calls=True,
        markdown=True,
    )
    agent.print_response(
        "How do I make chicken and galangal in coconut milk soup", stream=True
    )
```
Switching to vector mode finds it.
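To see which mode actually returns hits, you can also query the table directly instead of going through an agent. A minimal sketch, assuming agno's vector DBs expose a `search(query, limit)` method returning matched documents (the query string is just an example):

```python
from agno.embedder.google import GeminiEmbedder
from agno.vectordb.lancedb import LanceDb, SearchType

# Compare all three search types against the same already-loaded table.
# Assumption: LanceDb.search(query, limit) returns a list of Documents.
for search_type in (SearchType.vector, SearchType.keyword, SearchType.hybrid):
    db = LanceDb(
        table_name="recipes",
        uri="tmp/lancedb",
        search_type=search_type,
        embedder=GeminiEmbedder(),
    )
    results = db.search("chicken galangal coconut milk soup", limit=3)
    print(search_type, "->", len(results), "results")
```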
Knowledge base setup
```python
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase

# ...
knowledge_base = PDFUrlKnowledgeBase(
    urls=["https://agno-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
    vector_db=vector_db,
)
```
Run it!
```python
agent = Agent(
    model=Gemini("gemini-2.0-flash-lite"),
    knowledge=knowledge_base,
    search_knowledge=True,
    show_tool_calls=True,
    markdown=True,
)
agent.print_response(
    "How do I make chicken and galangal in coconut milk soup", stream=True
)
```
Full code
```python
from agno.agent import Agent
from agno.models.google import Gemini
from agno.embedder.google import GeminiEmbedder
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
from agno.vectordb.lancedb import LanceDb, SearchType


def test():
    knowledge_base = PDFUrlKnowledgeBase(
        urls=["https://agno-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
        vector_db=LanceDb(
            table_name="recipes",
            uri="tmp/lancedb",
            # search_type=SearchType.vector,
            search_type=SearchType.keyword,
            embedder=GeminiEmbedder(),
        ),
    )
    knowledge_base.load(recreate=True, upsert=True)

    agent = Agent(
        model=Gemini("gemini-2.0-flash-lite"),
        knowledge=knowledge_base,
        search_knowledge=True,
        show_tool_calls=True,
        markdown=True,
    )
    agent.print_response(
        "How do I make chicken and galangal in coconut milk soup", stream=True
    )
```
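To run the full example as a script, add a standard entry point (assuming the function above is saved in a file you execute directly):

```python
if __name__ == "__main__":
    test()
```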
Knowledge filters
Tagging knowledge with metadata
Load all documents at once
```python
knowledge_base = PDFKnowledgeBase(
    path=[
        {
            "path": "path/to/cv1.pdf",
            "metadata": {
                "user_id": "jordan_mitchell",
                "document_type": "cv",
                "year": 2025,
            },
        },
        # ... more documents ...
    ],
    vector_db=vector_db,
)
knowledge_base.load(recreate=True)
```
Load documents one by one
```python
# Initialize the PDFKnowledgeBase
knowledge_base = PDFKnowledgeBase(
    vector_db=vector_db,
    num_documents=5,
)

# Load first document with user_1 metadata
knowledge_base.load_document(
    path="path/to/cv1.pdf",
    metadata={"user_id": "jordan_mitchell", "document_type": "cv", "year": 2025},
    recreate=True,  # Set to True only for the first run, then set to False
)

# Load second document with user_2 metadata
knowledge_base.load_document(
    path="path/to/cv2.pdf",
    metadata={"user_id": "taylor_brooks", "document_type": "cv", "year": 2025},
)
```
Filtering
Manual filters
Just add knowledge_filters to the agent:
```python
agent = Agent(
    name="KnowledgeFilterAgent",
    # ...
    knowledge_filters={"user_id": "jordan_mitchell"},
)
```
This is matched against the metadata in the knowledge base; only documents that match are searched.
Alternatively, filter on a single query:
```python
agent = Agent(
    knowledge=knowledge_base,
    search_knowledge=True,
)
agent.print_response(
    "Tell me about Jordan Mitchell's experience and skills",
    knowledge_filters={"user_id": "jordan_mitchell"},
    markdown=True,
)
```
Agentic filters
```python
agent = Agent(
    knowledge=knowledge_base,
    search_knowledge=True,
    enable_agentic_knowledge_filters=True,
)
agent.print_response(
    "Tell me about Jordan Mitchell's experience and skills with jordan_mitchell as user id and document type cv",
    markdown=True,
)
```
According to the docs, the agent infers from the input and the knowledge base's metadata which filters to apply automatically, which should make retrieval more efficient.
Combined knowledge base
```python
from agno.knowledge.combined import CombinedKnowledgeBase
from agno.vectordb.pgvector import PgVector
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
from agno.knowledge.website import WebsiteKnowledgeBase
from agno.knowledge.pdf import PDFKnowledgeBase, PDFReader

url_pdf_knowledge_base = PDFUrlKnowledgeBase(
    urls=["pdf_url"],
    # Table name: ai.pdf_documents
    vector_db=PgVector(
        table_name="pdf_documents",
        db_url="postgresql+psycopg://ai:ai@localhost:5532/ai",
    ),
)

website_knowledge_base = WebsiteKnowledgeBase(
    urls=["https://docs.agno.com/introduction"],
    # Number of links to follow from the seed URLs
    max_links=10,
    # Table name: ai.website_documents
    vector_db=PgVector(
        table_name="website_documents",
        db_url="postgresql+psycopg://ai:ai@localhost:5532/ai",
    ),
)

local_pdf_knowledge_base = PDFKnowledgeBase(
    path="data/pdfs",
    # Table name: ai.pdf_documents
    vector_db=PgVector(
        table_name="pdf_documents",
        db_url="postgresql+psycopg://ai:ai@localhost:5532/ai",
    ),
    reader=PDFReader(chunk=True),
)

knowledge_base = CombinedKnowledgeBase(
    sources=[
        url_pdf_knowledge_base,
        website_knowledge_base,
        local_pdf_knowledge_base,
    ],
    vector_db=PgVector(
        # Table name: ai.combined_documents
        table_name="combined_documents",
        db_url="postgresql+psycopg://ai:ai@localhost:5532/ai",
    ),
)
```
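The combined base is then loaded and used like any single knowledge base; a sketch reusing the Agent setup from earlier on this page (the question is a placeholder):

```python
from agno.agent import Agent
from agno.models.google import Gemini

# Load all three sources into the combined table (use recreate=True on the first run).
knowledge_base.load(recreate=False)

agent = Agent(
    model=Gemini("gemini-2.0-flash-lite"),
    knowledge=knowledge_base,
    search_knowledge=True,
    markdown=True,
)
agent.print_response("What is Agno?", stream=True)
```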
Chunking
Fixed-size chunking
```python
from agno.document.chunking.fixed import FixedSizeChunking
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
from agno.vectordb.pgvector import PgVector

db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

knowledge_base = PDFUrlKnowledgeBase(
    urls=["https://agno-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
    vector_db=PgVector(table_name="recipes_fixed_size_chunking", db_url=db_url),
    chunking_strategy=FixedSizeChunking(chunk_size=5000, overlap=0),
)
```
| parameter | type | default | description |
| --- | --- | --- | --- |
| chunk_size | int | 5000 | Maximum size of each chunk |
| overlap | int | 0 | Overlap between chunks, in characters |
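To get a feel for what these parameters do, you can run a strategy on a document directly; a minimal sketch, assuming `Document` is importable from `agno.document` and chunking strategies expose a `chunk(document)` method:

```python
from agno.document import Document  # assumption: Document is exported here
from agno.document.chunking.fixed import FixedSizeChunking

# Split a synthetic document and inspect the resulting chunk sizes.
doc = Document(content="lorem ipsum " * 1000)
chunks = FixedSizeChunking(chunk_size=500, overlap=50).chunk(doc)
for chunk in chunks:
    print(len(chunk.content))
```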
Agentic chunking
Intelligent chunking? A model decides where to split.
```python
from agno.document.chunking.agentic import AgenticChunking
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
from agno.vectordb.pgvector import PgVector

db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

knowledge_base = PDFUrlKnowledgeBase(
    urls=["https://agno-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
    vector_db=PgVector(table_name="recipes_agentic_chunking", db_url=db_url),
    chunking_strategy=AgenticChunking(),
)
```
| parameter | type | default | description |
| --- | --- | --- | --- |
| model | Model | OpenAIChat | Model used to decide chunk boundaries |
| max_chunk_size | int | 5000 | Maximum size of each chunk |
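The default model is OpenAIChat; since the rest of this page uses Gemini, you would presumably swap it in. A sketch, assuming AgenticChunking accepts the same Model objects as Agent:

```python
from agno.document.chunking.agentic import AgenticChunking
from agno.models.google import Gemini

chunking_strategy = AgenticChunking(
    model=Gemini("gemini-2.0-flash-lite"),  # assumption: any agno Model works here
    max_chunk_size=3000,
)
```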
Semantic chunking
Uses an embedder model to split by semantic similarity.
```python
from agno.document.chunking.semantic import SemanticChunking
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
from agno.vectordb.pgvector import PgVector

db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

knowledge_base = PDFUrlKnowledgeBase(
    urls=["https://agno-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
    vector_db=PgVector(table_name="recipes_semantic_chunking", db_url=db_url),
    chunking_strategy=SemanticChunking(),
)
```
| parameter | type | default | description |
| --- | --- | --- | --- |
| embedder | Embedder | OpenAIEmbedder | Embedder used for semantic chunking |
| chunk_size | int | 5000 | Maximum size of each chunk |
| similarity_threshold | float | 0.5 | Similarity threshold used to determine chunk boundaries |
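Likewise, the default embedder is OpenAIEmbedder; to stay on Gemini you would swap it. A sketch, assuming SemanticChunking accepts any agno Embedder:

```python
from agno.document.chunking.semantic import SemanticChunking
from agno.embedder.google import GeminiEmbedder

chunking_strategy = SemanticChunking(
    embedder=GeminiEmbedder(),  # assumption: any agno Embedder works here
    similarity_threshold=0.6,
)
```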
Recursive chunking
Used for large documents; splits them recursively into smaller and smaller chunks.
```python
from agno.document.chunking.recursive import RecursiveChunking
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
from agno.vectordb.pgvector import PgVector

db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

knowledge_base = PDFUrlKnowledgeBase(
    urls=["https://agno-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
    vector_db=PgVector(table_name="recipes_recursive_chunking", db_url=db_url),
    chunking_strategy=RecursiveChunking(),
)
```
| parameter | type | default | description |
| --- | --- | --- | --- |
| chunk_size | int | 5000 | Maximum size of each chunk |
| overlap | int | 0 | Overlap between chunks, in characters |
Document chunking
Chunks by document structure (paragraphs, sections).
```python
from agno.document.chunking.document import DocumentChunking
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
from agno.vectordb.pgvector import PgVector

db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

knowledge_base = PDFUrlKnowledgeBase(
    urls=["https://agno-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
    vector_db=PgVector(table_name="recipes_document_chunking", db_url=db_url),
    chunking_strategy=DocumentChunking(),
)
```
| parameter | type | default | description |
| --- | --- | --- | --- |
| chunk_size | int | 5000 | Maximum size of each chunk |
| overlap | int | 0 | Overlap between chunks, in characters |