Files
Crumb-Core-v.1/app/startup_indexing.py

139 lines
4.1 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Startup Indexing Script
Automatically indexes documents on application startup
"""
import sys
import time
from deps import get_db, get_qdrant_client
from config import get_settings
from services.provider_factory import ProviderFactory
from services.document_indexer import DocumentIndexer
def _probe_database():
    """Open a database connection, run ``SELECT 1``, and always close it."""
    conn = get_db()
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT 1")
    finally:
        # Close even when the probe query fails; otherwise every failed
        # retry would leave a connection open.
        conn.close()


def _probe_qdrant():
    """Ask Qdrant for its collections; raises if the service is unreachable."""
    get_qdrant_client().get_collections()


def _wait_until_ready(label, probe, max_attempts):
    """Call *probe* up to *max_attempts* times, one second apart.

    Returns True as soon as *probe* completes without raising, and False
    when the final attempt raises. With ``max_attempts <= 0`` no attempt is
    made and True is returned.
    """
    for attempt in range(max_attempts):
        try:
            probe()
        except Exception as e:
            if attempt == max_attempts - 1:
                print(f"✗ {label} timeout: {e}")
                return False
            time.sleep(1)
        else:
            print(f"✓ {label} ready")
            break
    return True


def wait_for_services(max_attempts: int = 30) -> bool:
    """Wait for database and Qdrant to be ready.

    The database is probed first, then Qdrant. Each service gets up to
    ``max_attempts`` probes with a one-second pause after every failed probe
    except the last.

    Args:
        max_attempts: Maximum number of probes per service.

    Returns:
        True when both services answered a probe, False as soon as one of
        them exhausted its attempts (Qdrant is not probed if the database
        never became ready).
    """
    print("⏳ Waiting for services...")
    services = (
        ("Database", _probe_database),
        ("Qdrant", _probe_qdrant),
    )
    for label, probe in services:
        if not _wait_until_ready(label, probe, max_attempts):
            return False
    return True
def _print_indexing_report(results):
    """Print the per-category counters followed by the overall summary."""
    for category, cat_result in results['categories'].items():
        print(f"📁 {category}:")
        print(f" Files found: {cat_result['total']}")
        print(f" Indexed: {cat_result['indexed']}")
        print(f" Unchanged: {cat_result['unchanged']}")
        print(f" Errors: {cat_result['errors']}")
        print("")
    print("=" * 60)
    print("Summary:")
    print(f" Total files: {results['total_files']}")
    print(f" Indexed: {results['total_indexed']}")
    print(f" Unchanged: {results['total_unchanged']}")
    print(f" Errors: {results['total_errors']}")
    print("=" * 60)
    # NOTE(review): the success messages below ignore results['total_errors'];
    # confirm whether per-file errors should change the message.
    if results['total_indexed'] > 0:
        print("✓ Document indexing completed successfully")
    else:
        print("✓ All documents up to date")


def run_document_indexing() -> bool:
    """Run document indexing on startup.

    Waits for the database and Qdrant, picks an embedding provider (the
    configured default when it is available, otherwise the first available
    one), indexes all document categories under ``docs`` without forcing a
    re-index, and prints a report.

    Returns:
        True when indexing ran to completion, or when it was skipped because
        no AI provider is configured. False when the services never became
        ready or when any exception was raised during indexing.
    """
    print("=" * 60)
    print("🦉 Crumbforest Document Indexing")
    print("=" * 60)
    print("")

    # Wait for services
    if not wait_for_services():
        print("✗ Services not ready, skipping document indexing")
        return False

    settings = get_settings()

    # Get available providers
    available_providers = ProviderFactory.get_available_providers(settings)
    if not available_providers:
        print("⚠️ No AI providers configured")
        print(" Document indexing skipped")
        print(" Configure API keys in compose/.env to enable")
        return True  # Not an error, just no provider

    # Use default provider if available, otherwise first available
    provider_name = settings.default_embedding_provider
    if provider_name not in available_providers:
        provider_name = available_providers[0]
    print(f"✓ Using provider: {provider_name}")
    print("")

    try:
        # Create provider
        provider = ProviderFactory.create_provider(
            provider_name=provider_name,
            settings=settings
        )

        # Get connections
        db_conn = get_db()
        try:
            qdrant_client = get_qdrant_client()

            # Create document indexer
            indexer = DocumentIndexer(
                db_conn=db_conn,
                qdrant_client=qdrant_client,
                embedding_provider=provider,
                docs_base_path="docs"
            )

            # Index all categories
            print("📚 Indexing documents...")
            print("")
            results = indexer.index_all_categories(force=False)

            # Print results
            _print_indexing_report(results)
        finally:
            # Release the database connection on failure as well as success;
            # previously it was only closed when indexing succeeded.
            db_conn.close()
        return True
    except Exception as e:
        print(f"✗ Document indexing failed: {e}")
        import traceback
        traceback.print_exc()
        return False
if __name__ == "__main__":
    # Map the indexing outcome onto the process exit status:
    # 0 when run_document_indexing() reports success, 1 otherwise.
    exit_status = 0 if run_document_indexing() else 1
    sys.exit(exit_status)