# app/services/document_indexer.py
"""
Document Indexer Service

Automatically indexes Markdown documents from docs/ directories.
Tracks changes via file hashes (DSGVO-compliant).
"""

import json
import hashlib
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime

from pymysql import Connection
from pymysql.cursors import DictCursor
from qdrant_client import QdrantClient

from lib.embedding_providers.base import BaseProvider
from services.rag_service import RAGService


class DocumentIndexer:
    """
    Indexes Markdown documents from docs/ directories.
    Tracks changes and only re-indexes modified files.
    """

    def __init__(
        self,
        db_conn: Connection,
        qdrant_client: QdrantClient,
        embedding_provider: BaseProvider,
        docs_base_path: str = "docs"
    ):
        """
        Initialize the document indexer.

        Args:
            db_conn: Database connection
            qdrant_client: Qdrant client
            embedding_provider: Provider for embeddings
            docs_base_path: Base path to the docs directory
        """
        self.db_conn = db_conn
        self.qdrant = qdrant_client
        self.embedding_provider = embedding_provider
        self.docs_base_path = Path(docs_base_path)

        # Supported document categories, mapped to their collection prefixes
        self.categories = {
            "rz-nullfeld": "docs_rz_nullfeld",
            "crumbforest": "docs_crumbforest"
        }

    def get_file_hash(self, file_path: Path) -> str:
        """Calculate the MD5 hash of the file content (change detection only, not security)."""
        with open(file_path, 'rb') as f:
            return hashlib.md5(f.read()).hexdigest()

    def get_post_id(self, file_path: Path) -> int:
        """
        Derive a deterministic integer post ID from the file path.

        Uses MD5 instead of Python's built-in hash(), which is randomized
        per process; the result is folded into the signed 32-bit range for
        database compatibility.
        """
        path_hash = int(hashlib.md5(str(file_path).encode('utf-8')).hexdigest(), 16)
        return path_hash % (2**31)

    def find_markdown_files(self, category: str) -> List[Path]:
        """
        Find all Markdown files in a category directory.

        Args:
            category: Category name (e.g., 'rz-nullfeld')

        Returns:
            List of Markdown file paths
        """
        category_path = self.docs_base_path / category
        if not category_path.exists():
            return []

        # Find all .md files recursively
        return list(category_path.glob("**/*.md"))

    def should_reindex(self, file_path: Path, collection_name: str) -> bool:
        """
        Check whether a file should be re-indexed, based on hash comparison.

        Args:
            file_path: Path to the Markdown file
            collection_name: Qdrant collection name

        Returns:
            True if the file should be re-indexed
        """
        # Calculate the current hash
        current_hash = self.get_file_hash(file_path)

        # Look up the stored hash, keyed by the same deterministic post ID
        # that index_document() writes (not the raw path string, which would
        # never match the integer post_id column).
        with self.db_conn.cursor(DictCursor) as cur:
            cur.execute(
                """
                SELECT file_hash FROM post_vectors
                WHERE post_id = %s AND collection_name = %s AND post_type = 'document'
                """,
                (self.get_post_id(file_path), collection_name)
            )
            result = cur.fetchone()

        # Re-index if no record exists or the hash changed
        if not result:
            return True

        return result['file_hash'] != current_hash
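    # Illustrative sketch (not part of the service API): expected behavior of
    # the hash-based change detection across two runs, assuming a wired-up
    # instance `indexer` and an existing file docs/rz-nullfeld/intro.md:
    #
    #   r1 = indexer.index_document(Path("docs/rz-nullfeld/intro.md"), "rz-nullfeld")
    #   # first run  -> r1['status'] == 'indexed'
    #   r2 = indexer.index_document(Path("docs/rz-nullfeld/intro.md"), "rz-nullfeld")
    #   # second run -> r2['status'] == 'unchanged' (stored MD5 still matches)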
    def index_document(
        self,
        file_path: Path,
        category: str,
        force: bool = False
    ) -> Dict[str, Any]:
        """
        Index a single Markdown document.

        Args:
            file_path: Path to the Markdown file
            category: Category name (e.g., 'rz-nullfeld')
            force: Force re-indexing even if unchanged

        Returns:
            Indexing result dictionary
        """
        collection_name = self.categories.get(category)
        if not collection_name:
            raise ValueError(f"Unknown category: {category}")

        # Check whether re-indexing is needed
        if not force and not self.should_reindex(file_path, collection_name):
            return {
                'file': str(file_path),
                'status': 'unchanged',
                'message': 'File unchanged, skipping'
            }

        # Read the file content
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        if not content.strip():
            return {
                'file': str(file_path),
                'status': 'skipped',
                'message': 'Empty file'
            }

        # Create a RAG service bound to the document-specific collection
        rag_service = RAGService(
            db_conn=self.db_conn,
            qdrant_client=self.qdrant,
            embedding_provider=self.embedding_provider,
            collection_prefix=collection_name
        )

        # Index as a post, using a deterministic ID derived from the file path
        post_id = self.get_post_id(file_path)
        result = rag_service.index_post(
            post_id=post_id,
            title=file_path.stem,  # Filename without extension
            slug=f"{category}/{file_path.stem}",
            locale="",  # Documents are locale-agnostic
            body_md=content,
            extra_payload={
                "file_path": str(file_path),
                "category": category,
                "src": "filesystem"
            },
            force=force
        )

        file_hash = self.get_file_hash(file_path)
        with self.db_conn.cursor(DictCursor) as cur:
            # Ensure the metadata column exists *before* writing to it.
            # (ADD COLUMN IF NOT EXISTS is MariaDB syntax; on plain MySQL the
            # statement fails and the column must be created via migration.)
            try:
                cur.execute(
                    "ALTER TABLE post_vectors ADD COLUMN IF NOT EXISTS metadata JSON NULL"
                )
            except Exception:
                pass  # Column might already exist

            # Update post_vectors to mark the row as a document
            cur.execute(
                """
                UPDATE post_vectors
                SET post_type='document', metadata=%s, file_hash=%s
                WHERE post_id=%s AND collection_name=%s
                """,
                (
                    json.dumps({
                        'category': category,
                        'file_path': str(file_path),
                        'file_name': file_path.name
                    }),
                    file_hash,
                    post_id,
                    result['collection']
                )
            )

            # DSGVO audit log
            cur.execute(
                """
                INSERT INTO audit_log (action, entity_type, entity_id, user_id, metadata)
                VALUES ('document_indexed', 'document', %s, NULL, %s)
                """,
                (
                    post_id,
                    json.dumps({
                        'category': category,
                        'file_path': str(file_path),
                        'provider': self.embedding_provider.provider_name,
                        'chunks': result.get('chunks', 0),
                        'timestamp': datetime.now().isoformat()
                    })
                )
            )

        # Persist the bookkeeping writes (a no-op if autocommit is enabled)
        self.db_conn.commit()

        return {
            'file': str(file_path),
            'status': 'indexed',
            'chunks': result.get('chunks', 0),
            'collection': result['collection'],
            'provider': result.get('provider')
        }
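    # Illustrative sketch: the deterministic post ID keeps the mapping from
    # file path to database row reproducible across processes:
    #
    #   pid1 = indexer.get_post_id(Path("docs/crumbforest/setup.md"))
    #   pid2 = indexer.get_post_id(Path("docs/crumbforest/setup.md"))
    #   assert pid1 == pid2          # stable, unlike Python's randomized hash()
    #   assert 0 <= pid1 < 2**31     # fits a signed 32-bit INT column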
    def index_category(
        self,
        category: str,
        force: bool = False
    ) -> Dict[str, Any]:
        """
        Index all documents in a category.

        Args:
            category: Category name (e.g., 'rz-nullfeld')
            force: Force re-indexing of all files

        Returns:
            Summary of indexing results
        """
        files = self.find_markdown_files(category)

        if not files:
            return {
                'category': category,
                'total': 0,
                'indexed': 0,
                'unchanged': 0,
                'errors': 0,
                'message': f'No markdown files found in docs/{category}/'
            }

        results = {
            'category': category,
            'total': len(files),
            'indexed': 0,
            'unchanged': 0,
            'errors': 0,
            'files': []
        }

        for file_path in files:
            try:
                result = self.index_document(file_path, category, force=force)
                results['files'].append(result)

                if result['status'] == 'indexed':
                    results['indexed'] += 1
                elif result['status'] == 'unchanged':
                    results['unchanged'] += 1
            except Exception as e:
                results['errors'] += 1
                results['files'].append({
                    'file': str(file_path),
                    'status': 'error',
                    'message': str(e)
                })
                print(f"Error indexing {file_path}: {e}")

        return results

    def index_all_categories(self, force: bool = False) -> Dict[str, Any]:
        """
        Index all documents in all categories.

        Args:
            force: Force re-indexing of all files

        Returns:
            Summary of indexing results for all categories
        """
        results = {
            'categories': {},
            'total_files': 0,
            'total_indexed': 0,
            'total_unchanged': 0,
            'total_errors': 0
        }

        for category in self.categories:
            category_result = self.index_category(category, force=force)
            results['categories'][category] = category_result
            results['total_files'] += category_result['total']
            results['total_indexed'] += category_result['indexed']
            results['total_unchanged'] += category_result['unchanged']
            results['total_errors'] += category_result['errors']

        return results

    def get_indexing_status(self, category: Optional[str] = None) -> Dict[str, Any]:
        """
        Get the indexing status for documents.

        Args:
            category: Optional category filter

        Returns:
            Status information per category
        """
        # Resolve which categories to report on; the per-category query is
        # identical either way, so run it in a single loop.
        if category:
            collection_name = self.categories.get(category)
            if not collection_name:
                raise ValueError(f"Unknown category: {category}")
            targets = {category: collection_name}
        else:
            targets = self.categories

        status = {}
        with self.db_conn.cursor(DictCursor) as cur:
            for cat, coll_name in targets.items():
                cur.execute(
                    """
                    SELECT
                        COUNT(*) as count,
                        SUM(chunk_count) as total_vectors,
                        MAX(indexed_at) as last_indexed
                    FROM post_vectors
                    WHERE collection_name=%s AND post_type='document'
                    """,
                    (coll_name,)
                )
                result = cur.fetchone()
                status[cat] = {
                    'collection': coll_name,
                    'documents': result['count'] or 0,
                    'vectors': result['total_vectors'] or 0,
                    'last_indexed': result['last_indexed'].isoformat() if result['last_indexed'] else None
                }

        return status
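

# --- Usage sketch (illustrative only) ----------------------------------------
# Minimal wiring, assuming project details NOT defined in this module: local
# MySQL/MariaDB credentials, a Qdrant instance on localhost:6333, and a
# concrete provider class (the import path below is hypothetical).
#
#   import pymysql
#   from qdrant_client import QdrantClient
#   from lib.embedding_providers.openai import OpenAIProvider  # hypothetical
#
#   conn = pymysql.connect(host="localhost", user="app", password="...",
#                          database="app")
#   indexer = DocumentIndexer(
#       db_conn=conn,
#       qdrant_client=QdrantClient(url="http://localhost:6333"),
#       embedding_provider=OpenAIProvider(),
#       docs_base_path="docs",
#   )
#   summary = indexer.index_all_categories()
#   print(f"{summary['total_indexed']} indexed, "
#         f"{summary['total_unchanged']} unchanged, "
#         f"{summary['total_errors']} errors")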