139 lines
4.1 KiB
Python
Executable File
139 lines
4.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Startup Indexing Script
|
|
Automatically indexes documents on application startup
|
|
"""
|
|
import sys
|
|
import time
|
|
from deps import get_db, get_qdrant_client
|
|
from config import get_settings
|
|
from services.provider_factory import ProviderFactory
|
|
from services.document_indexer import DocumentIndexer
|
|
|
|
|
|
def _probe_database():
    """Open a database connection, run ``SELECT 1`` and always close it."""
    conn = get_db()
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT 1")
    finally:
        # Close even when the cursor/query fails, so failed retries do not
        # pile up half-open connections.
        conn.close()


def _probe_qdrant():
    """Ask Qdrant for its collections to prove that it responds."""
    get_qdrant_client().get_collections()


def _wait_until_ready(label, probe, max_attempts, retry_delay):
    """Call ``probe`` until it succeeds or ``max_attempts`` is exhausted.

    Prints "✓ <label> ready" on success and "✗ <label> timeout: <error>"
    after the final failed attempt. Returns True on success, False on
    timeout.
    """
    for attempt in range(max_attempts):
        try:
            probe()
        except Exception as e:
            if attempt == max_attempts - 1:
                print(f"✗ {label} timeout: {e}")
                return False
            # Only sleep when another attempt follows.
            time.sleep(retry_delay)
        else:
            print(f"✓ {label} ready")
            return True
    # max_attempts <= 0: nothing was probed. The original loops fell through
    # in that case as well, so the service is treated as ready.
    return True


def wait_for_services(max_attempts=30, retry_delay=1):
    """Wait for database and Qdrant to be ready.

    The database is probed first; Qdrant is only probed once the database
    answered.

    Args:
        max_attempts: Maximum number of probes per service.
        retry_delay: Seconds to sleep between failed probes.

    Returns:
        True when both services answered, False as soon as one of them
        exhausted its attempts.
    """
    print("⏳ Waiting for services...")

    if not _wait_until_ready("Database", _probe_database, max_attempts, retry_delay):
        return False

    return _wait_until_ready("Qdrant", _probe_qdrant, max_attempts, retry_delay)
|
|
|
|
|
|
def _print_indexing_report(results):
    """Print the per-category breakdown, the summary and the final status.

    ``results`` is the mapping returned by ``index_all_categories``; only
    the keys read below are accessed.
    """
    for category, cat_result in results['categories'].items():
        print(f"📁 {category}:")
        print(f" Files found: {cat_result['total']}")
        print(f" Indexed: {cat_result['indexed']}")
        print(f" Unchanged: {cat_result['unchanged']}")
        print(f" Errors: {cat_result['errors']}")
        print("")

    print("=" * 60)
    print("Summary:")
    print(f" Total files: {results['total_files']}")
    print(f" Indexed: {results['total_indexed']}")
    print(f" Unchanged: {results['total_unchanged']}")
    print(f" Errors: {results['total_errors']}")
    print("=" * 60)

    if results['total_indexed'] > 0:
        print("✓ Document indexing completed successfully")
    else:
        print("✓ All documents up to date")


def run_document_indexing():
    """Run document indexing on startup.

    Waits for the backing services, picks an embedding provider (the
    configured default when available, otherwise the first available one),
    indexes all document categories and prints a report.

    Returns:
        True when indexing ran, or when it was skipped because no AI
        provider is configured. False when the services did not become
        ready or when indexing raised an exception.
    """
    print("=" * 60)
    print("🦉 Crumbforest Document Indexing")
    print("=" * 60)
    print("")

    # Wait for services
    if not wait_for_services():
        print("✗ Services not ready, skipping document indexing")
        return False

    settings = get_settings()

    # Get available providers
    available_providers = ProviderFactory.get_available_providers(settings)

    if not available_providers:
        print("⚠️ No AI providers configured")
        print(" Document indexing skipped")
        print(" Configure API keys in compose/.env to enable")
        return True  # Not an error, just no provider

    # Use default provider if available, otherwise first available
    provider_name = settings.default_embedding_provider
    if provider_name not in available_providers:
        provider_name = available_providers[0]

    print(f"✓ Using provider: {provider_name}")
    print("")

    try:
        # Create provider
        provider = ProviderFactory.create_provider(
            provider_name=provider_name,
            settings=settings,
        )

        db_conn = get_db()
        try:
            qdrant_client = get_qdrant_client()

            # Create document indexer
            indexer = DocumentIndexer(
                db_conn=db_conn,
                qdrant_client=qdrant_client,
                embedding_provider=provider,
                docs_base_path="docs",
            )

            # Index all categories
            print("📚 Indexing documents...")
            print("")
            results = indexer.index_all_categories(force=False)

            _print_indexing_report(results)
        finally:
            # Release the connection on failure too. This stays inside the
            # outer try so an error raised by close() is still reported
            # below and turns into a False return.
            db_conn.close()

        return True

    except Exception as e:
        print(f"✗ Document indexing failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
|
|
if __name__ == "__main__":
    # Map the boolean outcome to a process exit status: 0 on success, 1 on failure.
    if run_document_indexing():
        exit_status = 0
    else:
        exit_status = 1
    sys.exit(exit_status)
|