66 lines
2.3 KiB
Python
Executable File
66 lines
2.3 KiB
Python
Executable File
|
|
import sys
|
|
import os
|
|
sys.path.insert(0, '/app')
|
|
|
|
from deps import get_db, get_qdrant_client
|
|
from config import get_settings
|
|
from services.provider_factory import ProviderFactory
|
|
from services.rag_service import RAGService
|
|
from services.document_indexer import DocumentIndexer
|
|
|
|
def deep_clean():
    """Drop all vector data and rebuild it from documents and posts.

    Steps, in order:

    1. Create the embedding provider named by
       ``settings.default_embedding_provider``.
    2. Delete every collection reported by the Qdrant client.
    3. Truncate the ``post_vectors`` SQL tracking table.
    4. Re-index all document categories with ``force=True``.
    5. Re-index posts for the ``de`` and ``en`` locales.

    Exceptions raised by steps 1-5 are printed together with their
    traceback and are not re-raised. The database connection is closed on
    every exit path, including a failure while obtaining the Qdrant client
    or the settings (those failures still propagate to the caller).

    WARNING: destructive. Every collection visible to the Qdrant client is
    deleted, not only the ones this project created.
    """
    print("🧹 Starting Deep Clean & Re-index...")

    db_conn = get_db()
    try:
        # Fetched inside the outer try/finally so that a failure here can no
        # longer leak db_conn; kept outside the inner try/except so such
        # errors still propagate exactly as before.
        qdrant = get_qdrant_client()
        settings = get_settings()

        try:
            # 1. Setup Provider
            provider = ProviderFactory.create_provider(
                provider_name=settings.default_embedding_provider,
                settings=settings
            )
            print(f"✅ Using provider: {provider.provider_name} ({provider.model_name})")

            # 2. Clear Collections (removes orphaned vectors as well).
            # Dangerous on a production instance: nothing is filtered, every
            # collection returned by the client is deleted.
            collections = qdrant.get_collections().collections
            for col in collections:
                print(f"🗑️ Deleting collection: {col.name}")
                qdrant.delete_collection(col.name)

            # 3. Clear SQL Tracking
            # NOTE(review): there is no explicit commit after the TRUNCATE;
            # confirm the driver / autocommit setting persists it.
            print("🗑️ Clearing post_vectors table...")
            with db_conn.cursor() as cur:
                cur.execute("TRUNCATE TABLE post_vectors")

            # 4. Re-scan Documents
            print("📂 Indexing Documents (Files)...")
            indexer = DocumentIndexer(db_conn, qdrant, provider)
            # Force re-index to ensure new IDs are used
            doc_results = indexer.index_all_categories(force=True)
            print(f" Indexed {doc_results['total_indexed']} documents.")

            # 5. Re-index Posts (DB)
            print("💾 Indexing Posts (SQL)...")
            rag = RAGService(db_conn, qdrant, provider)
            # Index for supported locales
            for loc in ["de", "en"]:  # or from config
                res = rag.index_all_posts(locale=loc)
                print(f" Locale {loc}: {res['indexed']} posts indexed.")

            print("✨ Deep Clean Complete!")

        except Exception as e:
            # Top-level boundary of a maintenance script: report and return.
            print(f"❌ Error: {e}")
            import traceback
            traceback.print_exc()
    finally:
        db_conn.close()
|
|
|
|
# Run the clean-up only when this file is executed as a script, not on import.
if __name__ == "__main__":
    deep_clean()
|