feat: Fix vector indexing stability, add Gitea linking, enhance admin dashboard

2025-12-07 18:42:38 +01:00
parent 7b300d1ba1
commit 9f2e599846
58 changed files with 12197 additions and 503 deletions

View File

@@ -11,15 +11,19 @@ class ConfigLoader:
def load_config(cls, force_reload: bool = False) -> Dict[str, Any]:
if cls._config is None or force_reload:
try:
# Try to find config in root or app root
paths_to_try = [CONFIG_PATH, os.path.join("..", CONFIG_PATH), os.path.join(os.getcwd(), CONFIG_PATH)]
paths_to_try = [CONFIG_PATH, os.path.join("..", CONFIG_PATH), "/config/crumbforest_config.json"]
found = False
for path in paths_to_try:
if os.path.exists(path):
with open(path, 'r', encoding='utf-8') as f:
cls._config = json.load(f)
found = True
break
try:
with open(path, 'r', encoding='utf-8') as f:
cls._config = json.load(f)
found = True
print(f"Loaded config from {path}")
break
except Exception as e:
print(f"Failed to load config from {path}: {e}")
continue
if not found:
print(f"Warning: {CONFIG_PATH} not found in {paths_to_try}")
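For reference, a minimal sketch of the resulting lookup behaviour, assuming CONFIG_PATH is the module-level filename constant; the helper name below is illustrative, not part of the commit:

import json
import os
from typing import Any, Dict, List, Optional

CONFIG_PATH = "crumbforest_config.json"  # assumed value; the real constant is defined in the module

def try_load_config(paths_to_try: List[str]) -> Optional[Dict[str, Any]]:
    """Walk the candidate paths; a broken file no longer aborts the whole lookup."""
    for path in paths_to_try:
        if not os.path.exists(path):
            continue
        try:
            with open(path, 'r', encoding='utf-8') as f:
                config = json.load(f)
            print(f"Loaded config from {path}")
            return config
        except Exception as e:
            # Unreadable or malformed JSON: log and fall through to the next candidate
            print(f"Failed to load config from {path}: {e}")
    print(f"Warning: {CONFIG_PATH} not found in {paths_to_try}")
    return None

config = try_load_config([CONFIG_PATH, os.path.join("..", CONFIG_PATH), "/config/crumbforest_config.json"])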

View File

@@ -153,12 +153,22 @@ class DocumentIndexer:
)
# Index as post (using file path as ID)
# Use a deterministic hash (MD5) instead of Python's randomized hash()
path_hash = int(hashlib.md5(str(file_path).encode('utf-8')).hexdigest(), 16)
post_id = path_hash % (2**31) # Keep within signed 32-bit int range for DB compatibility
result = rag_service.index_post(
post_id=hash(str(file_path)) % (2**31), # Convert path to int ID
post_id=post_id,
title=file_path.stem, # Filename without extension
slug=f"{category}/{file_path.stem}",
locale="", # Documents are locale-agnostic
body_md=content
body_md=content,
extra_payload={
"file_path": str(file_path),
"category": category,
"src": "filesystem"
},
force=force
)
# Update post_vectors to mark as document type
@@ -179,7 +189,7 @@ class DocumentIndexer:
'file_name': file_path.name
}),
file_hash,
hash(str(file_path)) % (2**31),
post_id,
result['collection']
)
)
@@ -199,7 +209,7 @@ class DocumentIndexer:
VALUES ('document_indexed', 'document', %s, NULL, %s)
""",
(
hash(str(file_path)) % (2**31),
post_id,
json.dumps({
'category': category,
'file_path': str(file_path),
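The key change in this file is the ID derivation. A sketch of it in isolation (the helper name is ours, not the commit's):

import hashlib
from pathlib import Path

def path_to_post_id(file_path: Path) -> int:
    """Stable positive ID for a file path.

    Python's built-in hash() is randomized per process (PYTHONHASHSEED),
    so IDs derived from it change between runs; MD5 does not, which keeps
    post_vectors rows and Qdrant point IDs stable across re-indexing.
    """
    path_hash = int(hashlib.md5(str(file_path).encode('utf-8')).hexdigest(), 16)
    return path_hash % (2**31)  # stay within a signed 32-bit range for the DB

# Same path, same ID, in every process:
assert path_to_post_id(Path("docs/setup.md")) == path_to_post_id(Path("docs/setup.md"))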

View File

@@ -0,0 +1,62 @@
import json
import os
import copy
from typing import Dict, Any, List
def load_characters(lang: str = "de") -> List[Dict[str, Any]]:
"""Load character data for given language."""
if lang not in ["de", "en", "fr"]:
lang = "de"
# Assume we are running from app root
characters_path = os.path.join('static', 'data', f'characters.{lang}.json')
try:
with open(characters_path, 'r', encoding='utf-8') as f:
return json.load(f)
except FileNotFoundError:
# Fallback to German
try:
with open(os.path.join('static', 'data', 'characters.de.json'), 'r', encoding='utf-8') as f:
return json.load(f)
except Exception:
return []
except Exception as e:
print(f"Error loading characters: {e}")
return []
def merge_role_localization(roles: Dict[str, Any], lang: str) -> Dict[str, Any]:
"""
Return a deep copy of roles with localized content merged in.
"""
localized_roles = copy.deepcopy(roles)
localized_list = load_characters(lang)
localized_map = {char.get('id'): char for char in localized_list}
# Legacy ID mapping
legacy_id_map = {
'funkfox': 'fox',
'schraubaer': 'schraubär',
'capacitoby': 'capacitobi',
'taichitaube': 'taichi',
'taichi': 'taichi'
}
for role_id, role in localized_roles.items():
lookup_id = legacy_id_map.get(role_id, role_id)
if lookup_id in localized_map:
l_data = localized_map[lookup_id]
if 'name' in l_data:
role['name'] = l_data['name']
if 'description' in l_data:
role['description'] = l_data['description']
if 'short' in l_data:
role['title'] = l_data['short'] # Use 'short' as a fallback for the title
# An explicit 'title' in the JSON takes precedence and is applied below
if 'title' in l_data:
role['title'] = l_data['title']
if 'system_prompt' in l_data:
role['system_prompt'] = l_data['system_prompt']
return localized_roles
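A usage sketch for the two helpers, assuming an illustrative roles dict; the module name, role ID, and field values below are hypothetical, and only the keys handled in the code above are relied on:

from characters import merge_role_localization  # module name assumed

roles = {
    "funkfox": {
        "name": "FunkFox",
        "title": "Radio tinkerer",
        "description": "Placeholder description",
        "system_prompt": "Placeholder prompt",
    }
}

# merge_role_localization deep-copies the input, maps the legacy ID
# ('funkfox' -> 'fox'), looks the character up in characters.en.json and
# overrides name/description/title/system_prompt where the JSON provides them.
localized = merge_role_localization(roles, lang="en")
print(localized["funkfox"]["name"])  # localized name, if present in the JSON
print(roles["funkfox"]["name"])      # original dict is left untouched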

View File

@@ -79,7 +79,9 @@ class RAGService:
title: str,
slug: str,
locale: str,
body_md: str
body_md: str,
extra_payload: Optional[Dict[str, Any]] = None,
force: bool = False
) -> Dict[str, Any]:
"""
Index a single post into Qdrant.
@@ -90,7 +92,8 @@ class RAGService:
slug: Post slug
locale: Post locale (de, en, etc.)
body_md: Markdown content
extra_payload: Additional metadata to store in Qdrant
force: If True, bypass content hash check
Returns:
Dictionary with indexing results
"""
@@ -99,31 +102,28 @@ class RAGService:
# Check if already indexed and content unchanged
content_hash = self._get_content_hash(body_md or "")
with self.db_conn.cursor(DictCursor) as cur:
cur.execute(
"SELECT file_hash FROM post_vectors WHERE post_id=%s AND collection_name=%s",
(post_id, collection_name)
)
existing = cur.fetchone()
if existing and existing['file_hash'] == content_hash:
return {
'post_id': post_id,
'status': 'unchanged',
'message': 'Post content unchanged, skipping re-indexing'
}
if not force:
with self.db_conn.cursor(DictCursor) as cur:
cur.execute(
"SELECT file_hash FROM post_vectors WHERE post_id=%s AND collection_name=%s",
(post_id, collection_name)
)
existing = cur.fetchone()
if existing and existing['file_hash'] == content_hash:
return {
'post_id': post_id,
'status': 'unchanged',
'message': 'Post content unchanged, skipping re-indexing',
'collection': collection_name
}
# Chunk and embed the content
chunks_with_embeddings = self.embedding_service.chunk_and_embed_post(
post_content=body_md or "",
post_id=post_id,
post_title=title
)
if not chunks_with_embeddings:
return {
'post_id': post_id,
'status': 'error',
'message': 'No content to index'
'status': 'skipped',
'message': 'No content to index',
'collection': collection_name
}
# Prepare points for Qdrant
@@ -135,19 +135,24 @@ class RAGService:
point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{post_id}_{chunk['chunk_position']}"))
vector_ids.append(point_id)
payload = {
'post_id': post_id,
'title': title,
'slug': slug,
'locale': locale,
'content': chunk['content'],
'header': chunk.get('header', ''),
'header_level': chunk.get('header_level', 0),
'chunk_position': chunk['chunk_position']
}
if extra_payload:
payload.update(extra_payload)
points.append(PointStruct(
id=point_id,
vector=chunk['embedding'],
payload={
'post_id': post_id,
'title': title,
'slug': slug,
'locale': locale,
'content': chunk['content'],
'header': chunk.get('header', ''),
'header_level': chunk.get('header_level', 0),
'chunk_position': chunk['chunk_position']
}
payload=payload
))
# Upsert points to Qdrant
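Tying the pieces together, a sketch of how the indexer now drives this API, mirroring the call shown in the DocumentIndexer hunk above; wrapping it in a function is ours, and construction of rag_service is elided:

from pathlib import Path
from typing import Any, Dict

def index_document(rag_service, file_path: Path, category: str, content: str,
                   post_id: int, force: bool = False) -> Dict[str, Any]:
    """Isolated mirror of the DocumentIndexer call shown earlier in this commit."""
    result = rag_service.index_post(
        post_id=post_id,                 # deterministic MD5-derived ID, see the earlier sketch
        title=file_path.stem,
        slug=f"{category}/{file_path.stem}",
        locale="",                       # documents are locale-agnostic
        body_md=content,
        extra_payload={                  # merged into every chunk's Qdrant payload
            "file_path": str(file_path),
            "category": category,
            "src": "filesystem",
        },
        force=force,                     # True bypasses the content-hash short-circuit
    )
    # Every return shape now carries the collection name alongside the status
    print(result["status"], result["collection"])
    return result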