#!/usr/bin/env python3
"""
Startup Indexing Script
Automatically indexes documents on application startup
"""

import sys
import time
import traceback
from contextlib import closing

from deps import get_db, get_qdrant_client
from config import get_settings
from services.provider_factory import ProviderFactory
from services.document_indexer import DocumentIndexer


def _probe_database():
    """Open a database connection, run ``SELECT 1`` and close it again."""
    # closing() guarantees the connection is released even when the cursor
    # or the query raises, so repeated failed probes do not pile up
    # half-open connections.
    with closing(get_db()) as conn:
        with conn.cursor() as cur:
            cur.execute("SELECT 1")


def _probe_qdrant():
    """Ask Qdrant for its collection list as a cheap liveness check."""
    get_qdrant_client().get_collections()


def _wait_until_ready(label, probe, max_attempts, delay):
    """Call ``probe`` until it stops raising or the attempts run out.

    Args:
        label: Service name used in the progress messages.
        probe: Zero-argument callable that raises while the service is down.
        max_attempts: Maximum number of times ``probe`` is called.
        delay: Seconds to sleep between two failed attempts.

    Returns:
        True as soon as one probe succeeds, False when every attempt failed
        (or when ``max_attempts`` allows no attempt at all).
    """
    last_error = None
    for attempt in range(max_attempts):
        try:
            probe()
        except Exception as exc:
            # Deliberately broad: this is a readiness boundary and the
            # concrete driver/client exception types are not known here.
            last_error = exc
            # No point sleeping after the final attempt.
            if attempt < max_attempts - 1:
                time.sleep(delay)
        else:
            print(f"✓ {label} ready")
            return True

    print(f"✗ {label} timeout: {last_error}")
    return False


def wait_for_services(max_attempts=30, delay=1.0):
    """Wait for database and Qdrant to be ready.

    The database is checked first; Qdrant is only probed once the database
    has answered.

    Args:
        max_attempts: Maximum number of probes per service.
        delay: Seconds to wait between failed probes of one service.

    Returns:
        True when both services responded, False otherwise.
    """
    print("⏳ Waiting for services...")

    if not _wait_until_ready("Database", _probe_database, max_attempts, delay):
        return False
    return _wait_until_ready("Qdrant", _probe_qdrant, max_attempts, delay)


def _print_indexing_report(results):
    """Print the per-category breakdown and the overall summary.

    ``results`` is the mapping returned by
    ``DocumentIndexer.index_all_categories``; only the keys read below are
    relied upon.
    """
    for category, cat_result in results['categories'].items():
        print(f"📁 {category}:")
        print(f" Files found: {cat_result['total']}")
        print(f" Indexed: {cat_result['indexed']}")
        print(f" Unchanged: {cat_result['unchanged']}")
        print(f" Errors: {cat_result['errors']}")
        print("")

    print("=" * 60)
    print("Summary:")
    print(f" Total files: {results['total_files']}")
    print(f" Indexed: {results['total_indexed']}")
    print(f" Unchanged: {results['total_unchanged']}")
    print(f" Errors: {results['total_errors']}")
    print("=" * 60)

    if results['total_indexed'] > 0:
        print("✓ Document indexing completed successfully")
    else:
        print("✓ All documents up to date")


def run_document_indexing():
    """Run document indexing on startup.

    Waits for the backing services, picks an embedding provider (the
    configured default when it is available, otherwise the first available
    one) and indexes all document categories under ``docs``.

    Returns:
        True when indexing finished or was skipped because no AI provider
        is configured; False when the services never became ready or
        indexing raised.
    """
    print("=" * 60)
    print("🦉 Crumbforest Document Indexing")
    print("=" * 60)
    print("")

    if not wait_for_services():
        print("✗ Services not ready, skipping document indexing")
        return False

    settings = get_settings()

    available_providers = ProviderFactory.get_available_providers(settings)
    if not available_providers:
        print("⚠️ No AI providers configured")
        print(" Document indexing skipped")
        print(" Configure API keys in compose/.env to enable")
        return True  # Not an error, just no provider

    # Use default provider if available, otherwise first available
    provider_name = settings.default_embedding_provider
    if provider_name not in available_providers:
        provider_name = available_providers[0]

    print(f"✓ Using provider: {provider_name}")
    print("")

    try:
        provider = ProviderFactory.create_provider(
            provider_name=provider_name,
            settings=settings,
        )

        # closing() releases the database connection on every exit path,
        # including failures while building the indexer or while indexing.
        with closing(get_db()) as db_conn:
            indexer = DocumentIndexer(
                db_conn=db_conn,
                qdrant_client=get_qdrant_client(),
                embedding_provider=provider,
                docs_base_path="docs",
            )

            print("📚 Indexing documents...")
            print("")

            results = indexer.index_all_categories(force=False)
            _print_indexing_report(results)

        return True

    except Exception as e:
        # Top-level boundary of the startup script: report and signal
        # failure through the return value instead of crashing.
        print(f"✗ Document indexing failed: {e}")
        traceback.print_exc()
        return False


if __name__ == "__main__":
    success = run_document_indexing()
    sys.exit(0 if success else 1)