# app/routers/document_rag.py
"""
Document RAG endpoints for Markdown documentation
Auto-indexes docs from docs/rz-nullfeld and docs/crumbforest
"""
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel, Field
from typing import Optional, List

from deps import get_db, get_qdrant_client, admin_required
from config import get_settings
from services.provider_factory import ProviderFactory
from services.document_indexer import DocumentIndexer

router = APIRouter()


class DocumentIndexRequest(BaseModel):
    """Request to index documents."""
    category: Optional[str] = Field(None, description="Category to index (rz-nullfeld, crumbforest, or None for all)")
    provider: Optional[str] = Field(None, description="Provider to use (defaults to config setting)")
    force: bool = Field(False, description="Force re-indexing even if unchanged")


class DocumentIndexResponse(BaseModel):
    """Response from document indexing."""
    status: str
    total_files: int
    indexed: int
    unchanged: int
    errors: int
    categories: dict


class DocumentStatusResponse(BaseModel):
    """Document indexing status."""
    categories: dict


@router.post("/index", response_model=DocumentIndexResponse, name="documents_index")
async def index_documents(
    request: DocumentIndexRequest,
    user = Depends(admin_required)
):
    """
    Index Markdown documents from docs/ directories.

    Admin-only endpoint for manual re-indexing.
    Documents are automatically indexed on startup.

    Categories:
    - rz-nullfeld: RZ Nullfeld documentation
    - crumbforest: Crumbforest documentation
    """
    settings = get_settings()
    db_conn = get_db()
    qdrant_client = get_qdrant_client()

    try:
        # Use default provider if not specified
        provider_name = request.provider or settings.default_embedding_provider

        # Create provider
        provider = ProviderFactory.create_provider(
            provider_name=provider_name,
            settings=settings
        )

        # Create document indexer
        indexer = DocumentIndexer(
            db_conn=db_conn,
            qdrant_client=qdrant_client,
            embedding_provider=provider,
            docs_base_path="docs"
        )

        # Index specified category or all
        if request.category:
            # Index single category
            result = indexer.index_category(request.category, force=request.force)
            results = {
                'categories': {request.category: result},
                'total_files': result['total'],
                'total_indexed': result['indexed'],
                'total_unchanged': result['unchanged'],
                'total_errors': result['errors']
            }
        else:
            # Index all categories
            results = indexer.index_all_categories(force=request.force)

        return DocumentIndexResponse(
            status="success",
            total_files=results['total_files'],
            indexed=results['total_indexed'],
            unchanged=results['total_unchanged'],
            errors=results['total_errors'],
            categories=results['categories']
        )

    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Indexing failed: {str(e)}")
    finally:
        db_conn.close()
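
# A minimal usage sketch (not part of the router itself), kept as a comment so it
# has no runtime effect. It assumes the router is mounted under an "/api/documents"
# prefix and that admin auth is a bearer token; neither is defined in this file.
#
#   import httpx
#
#   resp = httpx.post(
#       "http://localhost:8000/api/documents/index",
#       json={"category": "rz-nullfeld", "force": True},
#       headers={"Authorization": "Bearer <admin-token>"},
#   )
#   resp.raise_for_status()
#   print(resp.json())  # e.g. {"status": "success", "total_files": 12, "indexed": 3, ...}
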
""" settings = get_settings() db_conn = get_db() qdrant_client = get_qdrant_client() try: # Get any available provider available_providers = ProviderFactory.get_available_providers(settings) if not available_providers: raise HTTPException(status_code=500, detail="No providers configured") provider = ProviderFactory.create_provider( provider_name=available_providers[0], settings=settings ) # Create document indexer indexer = DocumentIndexer( db_conn=db_conn, qdrant_client=qdrant_client, embedding_provider=provider, docs_base_path="docs" ) # Get status status = indexer.get_indexing_status(category=category) return DocumentStatusResponse(categories=status) except Exception as e: raise HTTPException(status_code=500, detail=f"Failed to get status: {str(e)}") finally: db_conn.close() @router.get("/search", name="documents_search") async def search_documents( q: str, category: Optional[str] = None, limit: int = 5, provider: Optional[str] = None, user = Depends(admin_required) ): """ Search across indexed documents. Semantic search across RZ Nullfeld and Crumbforest documentation. """ settings = get_settings() db_conn = get_db() qdrant_client = get_qdrant_client() try: from services.rag_service import RAGService # Use default provider if not specified if not provider: provider = settings.default_embedding_provider print(f"[DEBUG] Creating provider: {provider}") print(f"[DEBUG] Settings default_embedding_model: {settings.default_embedding_model}") # Create provider with explicit embedding model (without provider prefix) embedding_provider = ProviderFactory.create_provider( provider_name=provider, settings=settings, embedding_model="text-embedding-3-small" # Without openai/ prefix ) print(f"[DEBUG] Provider created: {embedding_provider.__class__.__name__}") print(f"[DEBUG] Provider embedding_model: {embedding_provider.embedding_model}") # Determine collection(s) to search if category: indexer = DocumentIndexer( db_conn=db_conn, qdrant_client=qdrant_client, embedding_provider=embedding_provider, docs_base_path="docs" ) collection_name = indexer.categories.get(category) if not collection_name: raise HTTPException(status_code=400, detail=f"Unknown category: {category}") collections = [collection_name] else: # Search all document collections (prefix only, RAGService adds _{locale}) collections = ["docs_rz_nullfeld", "docs_crumbforest"] all_results = [] for coll_name in collections: rag_service = RAGService( db_conn=db_conn, qdrant_client=qdrant_client, embedding_provider=embedding_provider, collection_prefix=coll_name ) try: results = rag_service.search_posts( query=q, locale="", # Documents are locale-agnostic limit=limit ) # Add collection info to results for r in results: r['collection'] = coll_name all_results.extend(results) except Exception as e: # Collection might not exist yet print(f"Error searching {coll_name}: {e}") continue # Sort by score and limit all_results.sort(key=lambda x: x['score'], reverse=True) all_results = all_results[:limit] return { "query": q, "results": all_results, "provider": provider } except ValueError as e: print(f"[DEBUG] ValueError in search: {e}") import traceback traceback.print_exc() raise HTTPException(status_code=400, detail=str(e)) except Exception as e: print(f"[DEBUG] Exception in search: {e}") import traceback traceback.print_exc() raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}") finally: db_conn.close()