Files
Crumb-Core-v.1/trigger_reindex.py
2025-12-24 14:32:55 +01:00

66 lines
2.3 KiB
Python
Executable File

import sys
import os
sys.path.insert(0, '/app')
from deps import get_db, get_qdrant_client
from config import get_settings
from services.provider_factory import ProviderFactory
from services.rag_service import RAGService
from services.document_indexer import DocumentIndexer
def deep_clean() -> None:
    """Wipe all vector collections and SQL tracking, then rebuild every index.

    Destructive maintenance entry point: deletes EVERY Qdrant collection,
    truncates the ``post_vectors`` tracking table, then re-indexes file-based
    documents and database posts from scratch. Intended for dev/fix runs, not
    routine production use.

    Exits with status 1 if any step fails (the original swallowed errors and
    exited 0, which hid failures from cron/CI callers).
    """
    print("🧹 Starting Deep Clean & Re-index...")
    db_conn = get_db()
    qdrant = get_qdrant_client()
    settings = get_settings()
    try:
        # 1. Set up the embedding provider configured as the default.
        provider = ProviderFactory.create_provider(
            provider_name=settings.default_embedding_provider,
            settings=settings,
        )
        print(f"✅ Using provider: {provider.provider_name} ({provider.model_name})")

        # 2. Drop ALL Qdrant collections to clear orphaned vectors.
        # NOTE(review): dangerous in production — this wipes every collection
        # (e.g. "posts_de", "posts_en", "docs_crumbforest"), but for a
        # dev/fix run a clean slate is the point.
        collections = qdrant.get_collections().collections
        for col in collections:
            print(f"🗑️ Deleting collection: {col.name}")
            qdrant.delete_collection(col.name)

        # 3. Clear the SQL-side tracking table so it matches the now-empty
        #    vector store.
        print("🗑️ Clearing post_vectors table...")
        with db_conn.cursor() as cur:
            cur.execute("TRUNCATE TABLE post_vectors")

        # 4. Re-scan and index file-based documents; force=True ensures new
        #    vector IDs are generated rather than reusing stale ones.
        print("📂 Indexing Documents (Files)...")
        indexer = DocumentIndexer(db_conn, qdrant, provider)
        doc_results = indexer.index_all_categories(force=True)
        print(f" Indexed {doc_results['total_indexed']} documents.")

        # 5. Re-index database posts for each supported locale.
        #    (The original mislabeled this as a second "step 4".)
        print("💾 Indexing Posts (SQL)...")
        rag = RAGService(db_conn, qdrant, provider)
        for loc in ["de", "en"]:  # TODO(review): read locales from settings
            res = rag.index_all_posts(locale=loc)
            print(f" Locale {loc}: {res['indexed']} posts indexed.")

        print("✨ Deep Clean Complete!")
    except Exception as e:
        # Top-level boundary: report the failure, then exit non-zero so
        # shell/cron/CI callers can detect it.
        print(f"❌ Error: {e}")
        traceback.print_exc()
        sys.exit(1)
    finally:
        # Always release the DB connection, even on failure.
        db_conn.close()


if __name__ == "__main__":
    deep_clean()