feat: Fix vector indexing stability, add Gitea linking, enhance admin dashboard

2025-12-07 18:42:38 +01:00
parent 7b300d1ba1
commit 9f2e599846
58 changed files with 12197 additions and 503 deletions

View File

@@ -11,15 +11,19 @@ class ConfigLoader:
def load_config(cls, force_reload: bool = False) -> Dict[str, Any]:
if cls._config is None or force_reload:
try:
# Try to find config in root or app root
paths_to_try = [CONFIG_PATH, os.path.join("..", CONFIG_PATH), os.path.join(os.getcwd(), CONFIG_PATH)]
paths_to_try = [CONFIG_PATH, os.path.join("..", CONFIG_PATH), "/config/crumbforest_config.json"]
found = False
for path in paths_to_try:
if os.path.exists(path):
with open(path, 'r', encoding='utf-8') as f:
cls._config = json.load(f)
found = True
break
try:
with open(path, 'r', encoding='utf-8') as f:
cls._config = json.load(f)
found = True
print(f"Loaded config from {path}")
break
except Exception as e:
print(f"Failed to load config from {path}: {e}")
continue
if not found:
print(f"Warning: {CONFIG_PATH} not found in {paths_to_try}")
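For reference, a minimal sketch of the resulting lookup behaviour, assuming CONFIG_PATH is the module-level filename constant; the helper name below is illustrative, not part of the commit:

import json
import os
from typing import Any, Dict, List, Optional

CONFIG_PATH = "crumbforest_config.json"  # assumed value; the real constant is defined in the module

def try_load_config(paths_to_try: List[str]) -> Optional[Dict[str, Any]]:
    """Walk the candidate paths; a broken file no longer aborts the whole lookup."""
    for path in paths_to_try:
        if not os.path.exists(path):
            continue
        try:
            with open(path, 'r', encoding='utf-8') as f:
                config = json.load(f)
            print(f"Loaded config from {path}")
            return config
        except Exception as e:
            # Unreadable or malformed JSON: log and fall through to the next candidate
            print(f"Failed to load config from {path}: {e}")
    print(f"Warning: {CONFIG_PATH} not found in {paths_to_try}")
    return None

config = try_load_config([CONFIG_PATH, os.path.join("..", CONFIG_PATH), "/config/crumbforest_config.json"])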

View File

@@ -153,12 +153,22 @@ class DocumentIndexer:
)
# Index as post (using file path as ID)
# Use a deterministic hash (MD5) instead of Python's randomized hash()
path_hash = int(hashlib.md5(str(file_path).encode('utf-8')).hexdigest(), 16)
post_id = path_hash % (2**31) # Keep within signed 32-bit int range for DB compatibility
result = rag_service.index_post(
post_id=hash(str(file_path)) % (2**31), # Convert path to int ID
post_id=post_id,
title=file_path.stem, # Filename without extension
slug=f"{category}/{file_path.stem}",
locale="", # Documents are locale-agnostic
body_md=content
body_md=content,
extra_payload={
"file_path": str(file_path),
"category": category,
"src": "filesystem"
},
force=force
)
# Update post_vectors to mark as document type
@@ -179,7 +189,7 @@ class DocumentIndexer:
'file_name': file_path.name
}),
file_hash,
hash(str(file_path)) % (2**31),
post_id,
result['collection']
)
)
@@ -199,7 +209,7 @@ class DocumentIndexer:
VALUES ('document_indexed', 'document', %s, NULL, %s)
""",
(
hash(str(file_path)) % (2**31),
post_id,
json.dumps({
'category': category,
'file_path': str(file_path),
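The key change in this file is the ID derivation. A sketch of it in isolation (the helper name is ours, not the commit's):

import hashlib
from pathlib import Path

def path_to_post_id(file_path: Path) -> int:
    """Stable positive ID for a file path.

    Python's built-in hash() is randomized per process (PYTHONHASHSEED),
    so IDs derived from it change between runs; MD5 does not, which keeps
    post_vectors rows and Qdrant point IDs stable across re-indexing.
    """
    path_hash = int(hashlib.md5(str(file_path).encode('utf-8')).hexdigest(), 16)
    return path_hash % (2**31)  # stay within a signed 32-bit range for the DB

# Same path, same ID, in every process:
assert path_to_post_id(Path("docs/setup.md")) == path_to_post_id(Path("docs/setup.md"))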

View File

@@ -0,0 +1,62 @@
import json
import os
import copy
from typing import Dict, Any, List
def load_characters(lang: str = "de") -> List[Dict[str, Any]]:
"""Load character data for given language."""
if lang not in ["de", "en", "fr"]:
lang = "de"
# Assume we are running from app root
characters_path = os.path.join('static', 'data', f'characters.{lang}.json')
try:
with open(characters_path, 'r', encoding='utf-8') as f:
return json.load(f)
except FileNotFoundError:
# Fallback to German
try:
with open(os.path.join('static', 'data', 'characters.de.json'), 'r', encoding='utf-8') as f:
return json.load(f)
except Exception:
return []
except Exception as e:
print(f"Error loading characters: {e}")
return []
def merge_role_localization(roles: Dict[str, Any], lang: str) -> Dict[str, Any]:
"""
Return a deep copy of roles with localized content merged in.
"""
localized_roles = copy.deepcopy(roles)
localized_list = load_characters(lang)
localized_map = {char.get('id'): char for char in localized_list}
# Legacy ID mapping
legacy_id_map = {
'funkfox': 'fox',
'schraubaer': 'schraubär',
'capacitoby': 'capacitobi',
'taichitaube': 'taichi',
'taichi': 'taichi'
}
for role_id, role in localized_roles.items():
lookup_id = legacy_id_map.get(role_id, role_id)
if lookup_id in localized_map:
l_data = localized_map[lookup_id]
if 'name' in l_data:
role['name'] = l_data['name']
if 'description' in l_data:
role['description'] = l_data['description']
if 'short' in l_data:
role['title'] = l_data['short'] # Use 'short' as a fallback for the title
# An explicit 'title' in the JSON takes precedence and is applied below
if 'title' in l_data:
role['title'] = l_data['title']
if 'system_prompt' in l_data:
role['system_prompt'] = l_data['system_prompt']
return localized_roles
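A usage sketch for the two helpers, assuming an illustrative roles dict; the module name, role ID, and field values below are hypothetical, and only the keys handled in the code above are relied on:

from characters import merge_role_localization  # module name assumed

roles = {
    "funkfox": {
        "name": "FunkFox",
        "title": "Radio tinkerer",
        "description": "Placeholder description",
        "system_prompt": "Placeholder prompt",
    }
}

# merge_role_localization deep-copies the input, maps the legacy ID
# ('funkfox' -> 'fox'), looks the character up in characters.en.json and
# overrides name/description/title/system_prompt where the JSON provides them.
localized = merge_role_localization(roles, lang="en")
print(localized["funkfox"]["name"])  # localized name, if present in the JSON
print(roles["funkfox"]["name"])      # original dict is left untouched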

View File

@@ -79,7 +79,9 @@ class RAGService:
title: str,
slug: str,
locale: str,
body_md: str
body_md: str,
extra_payload: Optional[Dict[str, Any]] = None,
force: bool = False
) -> Dict[str, Any]:
"""
Index a single post into Qdrant.
@@ -90,7 +92,8 @@ class RAGService:
slug: Post slug
locale: Post locale (de, en, etc.)
body_md: Markdown content
extra_payload: Additional metadata to store in Qdrant
force: If True, bypass content hash check
Returns:
Dictionary with indexing results
"""
@@ -99,31 +102,28 @@ class RAGService:
# Check if already indexed and content unchanged
content_hash = self._get_content_hash(body_md or "")
with self.db_conn.cursor(DictCursor) as cur:
cur.execute(
"SELECT file_hash FROM post_vectors WHERE post_id=%s AND collection_name=%s",
(post_id, collection_name)
)
existing = cur.fetchone()
if existing and existing['file_hash'] == content_hash:
return {
'post_id': post_id,
'status': 'unchanged',
'message': 'Post content unchanged, skipping re-indexing'
}
if not force:
with self.db_conn.cursor(DictCursor) as cur:
cur.execute(
"SELECT file_hash FROM post_vectors WHERE post_id=%s AND collection_name=%s",
(post_id, collection_name)
)
existing = cur.fetchone()
if existing and existing['file_hash'] == content_hash:
return {
'post_id': post_id,
'status': 'unchanged',
'message': 'Post content unchanged, skipping re-indexing',
'collection': collection_name
}
# Chunk and embed the content
chunks_with_embeddings = self.embedding_service.chunk_and_embed_post(
post_content=body_md or "",
post_id=post_id,
post_title=title
)
if not chunks_with_embeddings:
return {
'post_id': post_id,
'status': 'error',
'message': 'No content to index'
'status': 'skipped',
'message': 'No content to index',
'collection': collection_name
}
# Prepare points for Qdrant
@@ -135,19 +135,24 @@ class RAGService:
point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{post_id}_{chunk['chunk_position']}"))
vector_ids.append(point_id)
payload = {
'post_id': post_id,
'title': title,
'slug': slug,
'locale': locale,
'content': chunk['content'],
'header': chunk.get('header', ''),
'header_level': chunk.get('header_level', 0),
'chunk_position': chunk['chunk_position']
}
if extra_payload:
payload.update(extra_payload)
points.append(PointStruct(
id=point_id,
vector=chunk['embedding'],
payload={
'post_id': post_id,
'title': title,
'slug': slug,
'locale': locale,
'content': chunk['content'],
'header': chunk.get('header', ''),
'header_level': chunk.get('header_level', 0),
'chunk_position': chunk['chunk_position']
}
payload=payload
))
# Upsert points to Qdrant
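Tying the pieces together, a sketch of how the indexer now drives this API, mirroring the call shown in the DocumentIndexer hunk above; wrapping it in a function is ours, and construction of rag_service is elided:

from pathlib import Path
from typing import Any, Dict

def index_document(rag_service, file_path: Path, category: str, content: str,
                   post_id: int, force: bool = False) -> Dict[str, Any]:
    """Isolated mirror of the DocumentIndexer call shown earlier in this commit."""
    result = rag_service.index_post(
        post_id=post_id,                 # deterministic MD5-derived ID, see the earlier sketch
        title=file_path.stem,
        slug=f"{category}/{file_path.stem}",
        locale="",                       # documents are locale-agnostic
        body_md=content,
        extra_payload={                  # merged into every chunk's Qdrant payload
            "file_path": str(file_path),
            "category": category,
            "src": "filesystem",
        },
        force=force,                     # True bypasses the content-hash short-circuit
    )
    # Every return shape now carries the collection name alongside the status
    print(result["status"], result["collection"])
    return result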