257 lines
8.1 KiB
Python
257 lines
8.1 KiB
Python
# app/routers/document_rag.py
|
|
"""
|
|
Document RAG endpoints for Markdown documentation
|
|
Auto-indexes docs from docs/rz-nullfeld and docs/crumbforest
|
|
"""
|
|
from fastapi import APIRouter, Depends, HTTPException
|
|
from pydantic import BaseModel, Field
|
|
from typing import Optional, List
|
|
|
|
from deps import get_db, get_qdrant_client, admin_required
|
|
from config import get_settings
|
|
from services.provider_factory import ProviderFactory
|
|
from services.document_indexer import DocumentIndexer
|
|
|
|
|
|
router = APIRouter()
|
|
|
|
|
|
class DocumentIndexRequest(BaseModel):
    """Payload accepted by the document indexing endpoint."""

    # Docs category to index; when left empty the endpoint indexes every category.
    category: Optional[str] = Field(
        default=None,
        description="Category to index (rz-nullfeld, crumbforest, or None for all)",
    )
    # Embedding provider name; when empty the endpoint falls back to the
    # default embedding provider from the settings.
    provider: Optional[str] = Field(
        default=None,
        description="Provider to use (defaults to config setting)",
    )
    # Passed through to the indexer as its ``force`` argument.
    force: bool = Field(
        default=False,
        description="Force re-indexing even if unchanged",
    )
|
|
|
|
|
|
class DocumentIndexResponse(BaseModel):
    """Summary returned by the document indexing endpoint."""

    # Outcome marker; the indexing endpoint sets this to "success".
    status: str
    # Aggregate counters copied from the indexer's result dictionary.
    total_files: int
    indexed: int
    unchanged: int
    errors: int
    # Per-category results, keyed by category name.
    categories: dict
|
|
|
|
|
|
class DocumentStatusResponse(BaseModel):
    """Envelope for the indexing status reported by the status endpoint."""

    # Whatever ``DocumentIndexer.get_indexing_status`` returned for the request.
    categories: dict
|
|
|
|
|
|
@router.post("/index", response_model=DocumentIndexResponse, name="documents_index")
async def index_documents(
    request: DocumentIndexRequest,
    user=Depends(admin_required)
):
    """
    Trigger (re-)indexing of the Markdown documents under docs/.

    Access is gated by the ``admin_required`` dependency. This endpoint is
    the manual trigger; per the original notes, documents are also indexed
    automatically at startup (that code is not part of this module).

    Known categories:
    - rz-nullfeld: RZ Nullfeld documentation
    - crumbforest: Crumbforest documentation

    Args:
        request: Category, provider and force flag for this run.
        user: Injected by ``admin_required``; not otherwise used here.

    Returns:
        DocumentIndexResponse with aggregate counters and per-category results.

    Raises:
        HTTPException: 400 when a ValueError is raised while indexing,
            500 for any other failure.
    """
    settings = get_settings()
    db_conn = get_db()
    qdrant_client = get_qdrant_client()

    try:
        # A provider named in the request wins; otherwise use the configured default.
        embedding_provider = ProviderFactory.create_provider(
            provider_name=request.provider or settings.default_embedding_provider,
            settings=settings,
        )

        indexer = DocumentIndexer(
            db_conn=db_conn,
            qdrant_client=qdrant_client,
            embedding_provider=embedding_provider,
            docs_base_path="docs",
        )

        if not request.category:
            # No category given: let the indexer walk all of them.
            summary = indexer.index_all_categories(force=request.force)
        else:
            # Single category: reshape its result into the same layout that
            # index_all_categories() is read with below.
            category_result = indexer.index_category(
                request.category, force=request.force
            )
            summary = {
                'categories': {request.category: category_result},
                'total_files': category_result['total'],
                'total_indexed': category_result['indexed'],
                'total_unchanged': category_result['unchanged'],
                'total_errors': category_result['errors'],
            }

        return DocumentIndexResponse(
            status="success",
            total_files=summary['total_files'],
            indexed=summary['total_indexed'],
            unchanged=summary['total_unchanged'],
            errors=summary['total_errors'],
            categories=summary['categories'],
        )

    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc))
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Indexing failed: {str(exc)}")
    finally:
        # Always release the DB connection, whether indexing succeeded or not.
        db_conn.close()
|
|
|
|
|
|
@router.get("/status", response_model=DocumentStatusResponse, name="documents_status")
async def get_documents_status(
    category: Optional[str] = None,
    user = Depends(admin_required)
):
    """
    Get document indexing status.

    Builds a DocumentIndexer with the first available provider and returns
    whatever ``get_indexing_status`` reports, optionally for one category.
    Access is gated by the ``admin_required`` dependency.

    Args:
        category: Optional category name forwarded to the indexer.
        user: Injected by ``admin_required``; not otherwise used here.

    Returns:
        DocumentStatusResponse wrapping the indexer's status.

    Raises:
        HTTPException: 500 when no provider is configured or the status
            lookup fails.
    """
    settings = get_settings()
    db_conn = get_db()
    qdrant_client = get_qdrant_client()

    try:
        # Any provider will do here; the indexer just needs one to be constructed.
        available_providers = ProviderFactory.get_available_providers(settings)

        if not available_providers:
            raise HTTPException(status_code=500, detail="No providers configured")

        provider = ProviderFactory.create_provider(
            provider_name=available_providers[0],
            settings=settings
        )

        # Create document indexer
        indexer = DocumentIndexer(
            db_conn=db_conn,
            qdrant_client=qdrant_client,
            embedding_provider=provider,
            docs_base_path="docs"
        )

        # Get status
        status = indexer.get_indexing_status(category=category)

        return DocumentStatusResponse(categories=status)

    except HTTPException:
        # Deliberate HTTP errors raised above must reach the client unchanged;
        # without this the generic handler below would re-wrap them and
        # garble the detail message.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to get status: {str(e)}") from e
    finally:
        # Always release the DB connection, whether the lookup succeeded or not.
        db_conn.close()
|
|
|
|
|
|
@router.get("/search", name="documents_search")
async def search_documents(
    q: str,
    category: Optional[str] = None,
    limit: int = 5,
    provider: Optional[str] = None,
    user = Depends(admin_required)
):
    """
    Search across indexed documents.

    Runs the query against one document collection (when ``category`` is
    given) or against both the RZ Nullfeld and Crumbforest collections,
    merges the hits, sorts them by score and keeps the top ``limit``.
    Access is gated by the ``admin_required`` dependency.

    Args:
        q: Search query text.
        category: Optional category; must be a key of the indexer's
            ``categories`` mapping.
        limit: Maximum number of hits requested per collection and returned
            overall.
        provider: Embedding provider name; defaults to the configured
            default embedding provider.
        user: Injected by ``admin_required``; not otherwise used here.

    Returns:
        Dict with the ``query``, the merged ``results`` and the ``provider``
        name that was used.

    Raises:
        HTTPException: 400 for an unknown category or a ValueError raised
            during the search, 500 for any other failure.
    """
    settings = get_settings()
    db_conn = get_db()
    qdrant_client = get_qdrant_client()

    try:
        import traceback

        from services.rag_service import RAGService

        # Use default provider if not specified
        if not provider:
            provider = settings.default_embedding_provider

        print(f"[DEBUG] Creating provider: {provider}")
        print(f"[DEBUG] Settings default_embedding_model: {settings.default_embedding_model}")

        # Create provider with explicit embedding model (without provider prefix)
        # NOTE(review): the model name is hard-coded regardless of which
        # provider was selected -- confirm this is intended for non-OpenAI providers.
        embedding_provider = ProviderFactory.create_provider(
            provider_name=provider,
            settings=settings,
            embedding_model="text-embedding-3-small"  # Without openai/ prefix
        )

        print(f"[DEBUG] Provider created: {embedding_provider.__class__.__name__}")
        print(f"[DEBUG] Provider embedding_model: {embedding_provider.embedding_model}")

        # Determine collection(s) to search
        if category:
            indexer = DocumentIndexer(
                db_conn=db_conn,
                qdrant_client=qdrant_client,
                embedding_provider=embedding_provider,
                docs_base_path="docs"
            )
            collection_name = indexer.categories.get(category)
            if not collection_name:
                raise HTTPException(status_code=400, detail=f"Unknown category: {category}")
            collections = [collection_name]
        else:
            # Search all document collections (prefix only, RAGService adds _{locale})
            collections = ["docs_rz_nullfeld", "docs_crumbforest"]

        all_results = []

        for coll_name in collections:
            rag_service = RAGService(
                db_conn=db_conn,
                qdrant_client=qdrant_client,
                embedding_provider=embedding_provider,
                collection_prefix=coll_name
            )

            try:
                results = rag_service.search_posts(
                    query=q,
                    locale="",  # Documents are locale-agnostic
                    limit=limit
                )

                # Add collection info to results
                for r in results:
                    r['collection'] = coll_name

                all_results.extend(results)
            except Exception as e:
                # Best effort: a collection might not exist yet, so report
                # the failure and keep searching the remaining ones.
                print(f"Error searching {coll_name}: {e}")
                continue

        # Sort by score and limit
        all_results.sort(key=lambda x: x['score'], reverse=True)
        all_results = all_results[:limit]

        return {
            "query": q,
            "results": all_results,
            "provider": provider
        }

    except HTTPException:
        # Deliberate HTTP errors (e.g. the 400 for an unknown category) must
        # reach the client unchanged; without this the generic handler below
        # would convert them into a 500 "Search failed" response.
        raise
    except ValueError as e:
        print(f"[DEBUG] ValueError in search: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        print(f"[DEBUG] Exception in search: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}") from e
    finally:
        # Always release the DB connection, whether the search succeeded or not.
        db_conn.close()
|