Files
Crumb-Core-v.1/app/routers/document_rag.py
2025-12-24 14:32:55 +01:00

256 lines
8.0 KiB
Python

# app/routers/document_rag.py
"""
Document RAG endpoints for Markdown documentation
Auto-indexes docs from docs/rz-nullfeld and docs/crumbforest
"""
import logging
from typing import Optional, List

from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel, Field

from config import get_settings
from deps import get_db, get_qdrant_client, admin_required
from services.document_indexer import DocumentIndexer
from services.provider_factory import ProviderFactory
router = APIRouter()
class DocumentIndexRequest(BaseModel):
"""Request to index documents."""
category: Optional[str] = Field(None, description="Category to index (rz-nullfeld, crumbforest, or None for all)")
provider: Optional[str] = Field(None, description="Provider to use (defaults to config setting)")
force: bool = Field(False, description="Force re-indexing even if unchanged")
class DocumentIndexResponse(BaseModel):
"""Response from document indexing."""
status: str
total_files: int
indexed: int
unchanged: int
errors: int
categories: dict
class DocumentStatusResponse(BaseModel):
"""Document indexing status."""
categories: dict
@router.post("/index", response_model=DocumentIndexResponse, name="documents_index")
async def index_documents(
request: DocumentIndexRequest,
user = Depends(admin_required)
):
"""
Index Markdown documents from docs/ directories.
Admin-only endpoint for manual re-indexing.
Documents are automatically indexed on startup.
Categories:
- rz-nullfeld: RZ Nullfeld documentation
- crumbforest: Crumbforest documentation
"""
settings = get_settings()
db_conn = get_db()
qdrant_client = get_qdrant_client()
try:
# Use default provider if not specified
provider_name = request.provider or settings.default_embedding_provider
# Create provider
provider = ProviderFactory.create_provider(
provider_name=provider_name,
settings=settings
)
# Create document indexer
indexer = DocumentIndexer(
db_conn=db_conn,
qdrant_client=qdrant_client,
embedding_provider=provider,
docs_base_path="docs"
)
# Index specified category or all
if request.category:
# Index single category
result = indexer.index_category(request.category, force=request.force)
results = {
'categories': {request.category: result},
'total_files': result['total'],
'total_indexed': result['indexed'],
'total_unchanged': result['unchanged'],
'total_errors': result['errors']
}
else:
# Index all categories
results = indexer.index_all_categories(force=request.force)
return DocumentIndexResponse(
status="success",
total_files=results['total_files'],
indexed=results['total_indexed'],
unchanged=results['total_unchanged'],
errors=results['total_errors'],
categories=results['categories']
)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
raise HTTPException(status_code=500, detail=f"Indexing failed: {str(e)}")
finally:
db_conn.close()
@router.get("/status", response_model=DocumentStatusResponse, name="documents_status")
async def get_documents_status(
category: Optional[str] = None,
user = Depends(admin_required)
):
"""
Get document indexing status.
Returns information about indexed documents in each category.
"""
settings = get_settings()
db_conn = get_db()
qdrant_client = get_qdrant_client()
try:
# Get any available provider
available_providers = ProviderFactory.get_available_providers(settings)
if not available_providers:
raise HTTPException(status_code=500, detail="No providers configured")
provider = ProviderFactory.create_provider(
provider_name=available_providers[0],
settings=settings
)
# Create document indexer
indexer = DocumentIndexer(
db_conn=db_conn,
qdrant_client=qdrant_client,
embedding_provider=provider,
docs_base_path="docs"
)
# Get status
status = indexer.get_indexing_status(category=category)
return DocumentStatusResponse(categories=status)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to get status: {str(e)}")
finally:
db_conn.close()
@router.get("/search", name="documents_search")
async def search_documents(
q: str,
category: Optional[str] = None,
limit: int = 5,
provider: Optional[str] = None
):
"""
Search across indexed documents.
Semantic search across RZ Nullfeld and Crumbforest documentation.
"""
settings = get_settings()
db_conn = get_db()
qdrant_client = get_qdrant_client()
try:
from services.rag_service import RAGService
# Use default provider if not specified
if not provider:
provider = settings.default_embedding_provider
print(f"[DEBUG] Creating provider: {provider}")
print(f"[DEBUG] Settings default_embedding_model: {settings.default_embedding_model}")
# Create provider with explicit embedding model (without provider prefix)
embedding_provider = ProviderFactory.create_provider(
provider_name=provider,
settings=settings,
embedding_model="text-embedding-3-small" # Without openai/ prefix
)
print(f"[DEBUG] Provider created: {embedding_provider.__class__.__name__}")
print(f"[DEBUG] Provider embedding_model: {embedding_provider.embedding_model}")
# Determine collection(s) to search
if category:
indexer = DocumentIndexer(
db_conn=db_conn,
qdrant_client=qdrant_client,
embedding_provider=embedding_provider,
docs_base_path="docs"
)
collection_name = indexer.categories.get(category)
if not collection_name:
raise HTTPException(status_code=400, detail=f"Unknown category: {category}")
collections = [collection_name]
else:
# Search all document collections (prefix only, RAGService adds _{locale})
collections = ["docs_rz_nullfeld", "docs_crumbforest"]
all_results = []
for coll_name in collections:
rag_service = RAGService(
db_conn=db_conn,
qdrant_client=qdrant_client,
embedding_provider=embedding_provider,
collection_prefix=coll_name
)
try:
results = rag_service.search_posts(
query=q,
locale="", # Documents are locale-agnostic
limit=limit
)
# Add collection info to results
for r in results:
r['collection'] = coll_name
all_results.extend(results)
except Exception as e:
# Collection might not exist yet
print(f"Error searching {coll_name}: {e}")
continue
# Sort by score and limit
all_results.sort(key=lambda x: x['score'], reverse=True)
all_results = all_results[:limit]
return {
"query": q,
"results": all_results,
"provider": provider
}
except ValueError as e:
print(f"[DEBUG] ValueError in search: {e}")
import traceback
traceback.print_exc()
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
print(f"[DEBUG] Exception in search: {e}")
import traceback
traceback.print_exc()
raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}")
finally:
db_conn.close()