From aa3243ebacf705afe4f87b8aedceec22fe6eeba0 Mon Sep 17 00:00:00 2001 From: CAICAIIs <3360776475@qq.com> Date: Wed, 25 Feb 2026 20:17:54 +0800 Subject: [PATCH 1/2] perf: batch metadata query in KB retrieval to fix N+1 problem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace N sequential get_document_with_metadata() calls with a single get_documents_with_metadata_batch() call using SQL IN clause. Benchmark results (local SQLite): - 10 docs: 10.67ms → 1.47ms (7.3x faster) - 20 docs: 26.00ms → 2.68ms (9.7x faster) - 50 docs: 63.87ms → 2.79ms (22.9x faster) --- astrbot/core/knowledge_base/kb_db_sqlite.py | 35 +++++++++++++++++++ .../core/knowledge_base/retrieval/manager.py | 7 ++-- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/astrbot/core/knowledge_base/kb_db_sqlite.py b/astrbot/core/knowledge_base/kb_db_sqlite.py index 39fc72ac85..b6f711af35 100644 --- a/astrbot/core/knowledge_base/kb_db_sqlite.py +++ b/astrbot/core/knowledge_base/kb_db_sqlite.py @@ -256,6 +256,41 @@ async def get_document_with_metadata(self, doc_id: str) -> dict | None: "knowledge_base": row[1], } + async def get_documents_with_metadata_batch( + self, doc_ids: list[str] + ) -> dict[str, dict]: + """批量获取文档及其所属知识库元数据 + + Args: + doc_ids: 文档 ID 列表 + + Returns: + dict: doc_id -> {"document": KBDocument, "knowledge_base": KnowledgeBase} + + """ + if not doc_ids: + return {} + + async with self.get_db() as session: + stmt = ( + select(KBDocument, KnowledgeBase) + .join( + KnowledgeBase, + col(KBDocument.kb_id) == col(KnowledgeBase.kb_id), + ) + .where(col(KBDocument.doc_id).in_(doc_ids)) + ) + result = await session.execute(stmt) + rows = result.all() + + return { + row[0].doc_id: { + "document": row[0], + "knowledge_base": row[1], + } + for row in rows + } + async def delete_document_by_id(self, doc_id: str, vec_db: FaissVecDB) -> None: """删除单个文档及其相关数据""" # 在知识库表中删除 diff --git a/astrbot/core/knowledge_base/retrieval/manager.py b/astrbot/core/knowledge_base/retrieval/manager.py index d406ceabce..219a9659dd 100644 --- a/astrbot/core/knowledge_base/retrieval/manager.py +++ b/astrbot/core/knowledge_base/retrieval/manager.py @@ -142,10 +142,13 @@ async def retrieve( f"Rank fusion took {time_end - time_start:.2f}s and returned {len(fused_results)} results.", ) - # 4. 转换为 RetrievalResult (获取元数据) + # 4. 转换为 RetrievalResult (批量获取元数据) + doc_ids = list({fr.doc_id for fr in fused_results}) + metadata_map = await self.kb_db.get_documents_with_metadata_batch(doc_ids) + retrieval_results = [] for fr in fused_results: - metadata_dict = await self.kb_db.get_document_with_metadata(fr.doc_id) + metadata_dict = metadata_map.get(fr.doc_id) if metadata_dict: retrieval_results.append( RetrievalResult( From e99df3495f4bcb90799a13eef4e42620016cbf8d Mon Sep 17 00:00:00 2001 From: CAICAIIs <3360776475@qq.com> Date: Wed, 25 Feb 2026 21:32:50 +0800 Subject: [PATCH 2/2] refactor: use set[str] param type and chunk IN clause for SQLite safety Address review feedback: - Change doc_ids param from list[str] to set[str] to avoid unnecessary conversion - Chunk IN clause into batches of 900 to stay under SQLite's 999 parameter limit - Remove list() wrapping at call site, pass set directly --- astrbot/core/knowledge_base/kb_db_sqlite.py | 41 +++++++++++-------- .../core/knowledge_base/retrieval/manager.py | 2 +- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/astrbot/core/knowledge_base/kb_db_sqlite.py b/astrbot/core/knowledge_base/kb_db_sqlite.py index b6f711af35..4b9dcf7dd0 100644 --- a/astrbot/core/knowledge_base/kb_db_sqlite.py +++ b/astrbot/core/knowledge_base/kb_db_sqlite.py @@ -257,12 +257,12 @@ async def get_document_with_metadata(self, doc_id: str) -> dict | None: } async def get_documents_with_metadata_batch( - self, doc_ids: list[str] + self, doc_ids: set[str] ) -> dict[str, dict]: """批量获取文档及其所属知识库元数据 Args: - doc_ids: 文档 ID 列表 + doc_ids: 文档 ID 集合 Returns: dict: doc_id -> {"document": KBDocument, "knowledge_base": KnowledgeBase} @@ -271,25 +271,30 @@ async def get_documents_with_metadata_batch( if not doc_ids: return {} + metadata_map: dict[str, dict] = {} + # SQLite 参数上限为 999,分片查询避免超限 + chunk_size = 900 + doc_id_list = list(doc_ids) + async with self.get_db() as session: - stmt = ( - select(KBDocument, KnowledgeBase) - .join( - KnowledgeBase, - col(KBDocument.kb_id) == col(KnowledgeBase.kb_id), + for i in range(0, len(doc_id_list), chunk_size): + chunk = doc_id_list[i : i + chunk_size] + stmt = ( + select(KBDocument, KnowledgeBase) + .join( + KnowledgeBase, + col(KBDocument.kb_id) == col(KnowledgeBase.kb_id), + ) + .where(col(KBDocument.doc_id).in_(chunk)) ) - .where(col(KBDocument.doc_id).in_(doc_ids)) - ) - result = await session.execute(stmt) - rows = result.all() + result = await session.execute(stmt) + for row in result.all(): + metadata_map[row[0].doc_id] = { + "document": row[0], + "knowledge_base": row[1], + } - return { - row[0].doc_id: { - "document": row[0], - "knowledge_base": row[1], - } - for row in rows - } + return metadata_map async def delete_document_by_id(self, doc_id: str, vec_db: FaissVecDB) -> None: """删除单个文档及其相关数据""" diff --git a/astrbot/core/knowledge_base/retrieval/manager.py b/astrbot/core/knowledge_base/retrieval/manager.py index 219a9659dd..1244e18af1 100644 --- a/astrbot/core/knowledge_base/retrieval/manager.py +++ b/astrbot/core/knowledge_base/retrieval/manager.py @@ -143,7 +143,7 @@ async def retrieve( ) # 4. 转换为 RetrievalResult (批量获取元数据) - doc_ids = list({fr.doc_id for fr in fused_results}) + doc_ids = {fr.doc_id for fr in fused_results} metadata_map = await self.kb_db.get_documents_with_metadata_batch(doc_ids) retrieval_results = []