# Crumb-Core-v.1/app/startup_indexing.py

#!/usr/bin/env python3
"""
Startup Indexing Script
Automatically indexes documents on application startup
"""
import sys
import time
from deps import get_db, get_qdrant_client
from config import get_settings
from services.provider_factory import ProviderFactory
from services.document_indexer import DocumentIndexer


def _probe_database():
    """Open a database connection, run ``SELECT 1`` and always close it.

    Any exception raised here is treated by the caller as "not ready yet".
    """
    conn = get_db()
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT 1")
    finally:
        # Close even when the query fails, so repeated retries do not leave
        # one dangling connection behind per failed attempt.
        conn.close()


def _probe_qdrant():
    """Request the collection list from Qdrant as a liveness check."""
    get_qdrant_client().get_collections()


def _wait_until_ready(label, probe, max_attempts, delay=1):
    """Call ``probe`` until it stops raising or attempts are exhausted.

    Args:
        label: Service name used in the printed status lines.
        probe: Zero-argument callable; raising any ``Exception`` means
            the service is not ready.
        max_attempts: Maximum number of probe calls.
        delay: Seconds to sleep between failed attempts.

    Returns:
        True as soon as one probe succeeds, False otherwise.
    """
    # Reported when no probe ever ran, so a non-positive max_attempts is
    # never mistaken for a healthy service.
    last_error = "no attempts made (max_attempts < 1)"
    for attempt in range(max_attempts):
        try:
            probe()
        except Exception as e:
            last_error = e
            # No point sleeping after the final attempt.
            if attempt < max_attempts - 1:
                time.sleep(delay)
        else:
            print(f"✓ {label} ready")
            return True

    print(f"✗ {label} timeout: {last_error}")
    return False


def wait_for_services(max_attempts=30):
    """Wait for database and Qdrant to be ready.

    The database is probed first; Qdrant is only probed once the database
    has answered. Each service gets up to ``max_attempts`` tries with a
    one-second pause between failures.

    Args:
        max_attempts: Maximum number of probes per service.

    Returns:
        True when both services responded, False if either one never did.
    """
    print("⏳ Waiting for services...")

    # ``and`` short-circuits, so Qdrant is skipped when the database failed.
    return (
        _wait_until_ready("Database", _probe_database, max_attempts)
        and _wait_until_ready("Qdrant", _probe_qdrant, max_attempts)
    )


def _print_indexing_report(results):
    """Print the per-category breakdown, the summary and the final status.

    Reads ``results['categories']`` (mapping of category name to a dict with
    ``total``/``indexed``/``unchanged``/``errors``) and the ``total_*`` keys.
    """
    for category, cat_result in results['categories'].items():
        print(f"📁 {category}:")
        print(f"   Files found:    {cat_result['total']}")
        print(f"   Indexed:        {cat_result['indexed']}")
        print(f"   Unchanged:      {cat_result['unchanged']}")
        print(f"   Errors:         {cat_result['errors']}")
        print("")

    print("=" * 60)
    print("Summary:")
    print(f"  Total files:      {results['total_files']}")
    print(f"  Indexed:          {results['total_indexed']}")
    print(f"  Unchanged:        {results['total_unchanged']}")
    print(f"  Errors:           {results['total_errors']}")
    print("=" * 60)

    if results['total_indexed'] > 0:
        print("✓ Document indexing completed successfully")
    else:
        print("✓ All documents up to date")


def run_document_indexing():
    """Run document indexing on startup.

    Waits for the backing services, picks an embedding provider (the
    configured default when available, otherwise the first available one),
    indexes every category under ``docs`` and prints a report.

    Returns:
        True when indexing finished or was skipped because no provider is
        configured; False when services were unavailable or indexing raised.
    """
    print("=" * 60)
    print("🦉 Crumbforest Document Indexing")
    print("=" * 60)
    print("")

    # Wait for services
    if not wait_for_services():
        print("✗ Services not ready, skipping document indexing")
        return False

    settings = get_settings()

    # Get available providers
    available_providers = ProviderFactory.get_available_providers(settings)

    if not available_providers:
        print("⚠️  No AI providers configured")
        print("   Document indexing skipped")
        print("   Configure API keys in compose/.env to enable")
        return True  # Not an error, just no provider

    # Use default provider if available, otherwise first available
    provider_name = settings.default_embedding_provider
    if provider_name not in available_providers:
        provider_name = available_providers[0]

    print(f"✓ Using provider: {provider_name}")
    print("")

    try:
        provider = ProviderFactory.create_provider(
            provider_name=provider_name,
            settings=settings
        )

        db_conn = get_db()
        try:
            indexer = DocumentIndexer(
                db_conn=db_conn,
                qdrant_client=get_qdrant_client(),
                embedding_provider=provider,
                docs_base_path="docs"
            )

            # Index all categories
            print("📚 Indexing documents...")
            print("")
            results = indexer.index_all_categories(force=False)

            _print_indexing_report(results)
        finally:
            # Release the connection on failure as well as on success; a
            # failing close() still reaches the handler below.
            db_conn.close()

        return True

    except Exception as e:
        print(f"✗ Document indexing failed: {e}")
        import traceback
        traceback.print_exc()
        return False


if __name__ == "__main__":
    # Exit status mirrors the indexing outcome: 0 on success, 1 on failure.
    exit_code = 0 if run_document_indexing() else 1
    sys.exit(exit_code)