feat: Fix vector indexing stability, add Gitea linking, enhance admin dashboard
@@ -11,15 +11,19 @@ class ConfigLoader:
     def load_config(cls, force_reload: bool = False) -> Dict[str, Any]:
         if cls._config is None or force_reload:
             try:
                 # Try to find config in root or app root
-                paths_to_try = [CONFIG_PATH, os.path.join("..", CONFIG_PATH), os.path.join(os.getcwd(), CONFIG_PATH)]
+                paths_to_try = [CONFIG_PATH, os.path.join("..", CONFIG_PATH), "/config/crumbforest_config.json"]
                 found = False
                 for path in paths_to_try:
                     if os.path.exists(path):
-                        with open(path, 'r', encoding='utf-8') as f:
-                            cls._config = json.load(f)
-                        found = True
-                        break
+                        try:
+                            with open(path, 'r', encoding='utf-8') as f:
+                                cls._config = json.load(f)
+                            found = True
+                            print(f"Loaded config from {path}")
+                            break
+                        except Exception as e:
+                            print(f"Failed to load config from {path}: {e}")
+                            continue

                 if not found:
                     print(f"Warning: {CONFIG_PATH} not found in {paths_to_try}")
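The added "/config/crumbforest_config.json" candidate suggests a mounted config directory (for example a Docker volume); a minimal standalone sketch of the resulting lookup order, assuming CONFIG_PATH is the bare filename defined elsewhere in the module:

import os

CONFIG_PATH = "crumbforest_config.json"  # assumed value; the real constant lives in the module

def resolve_config_path():
    # Mirror the fallback order used above: CWD, parent directory, then the mounted /config path.
    candidates = [
        CONFIG_PATH,
        os.path.join("..", CONFIG_PATH),
        "/config/crumbforest_config.json",
    ]
    for path in candidates:
        if os.path.exists(path):
            return path
    return None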
@@ -153,12 +153,22 @@ class DocumentIndexer:
         )

         # Index as post (using file path as ID)
+        # Use a deterministic hash (MD5) instead of Python's randomized hash()
+        path_hash = int(hashlib.md5(str(file_path).encode('utf-8')).hexdigest(), 16)
+        post_id = path_hash % (2**31)  # keep the ID positive and within signed 32-bit range for the DB
+
         result = rag_service.index_post(
-            post_id=hash(str(file_path)) % (2**31),  # Convert path to int ID
+            post_id=post_id,
             title=file_path.stem,  # Filename without extension
             slug=f"{category}/{file_path.stem}",
             locale="",  # Documents are locale-agnostic
-            body_md=content
+            body_md=content,
+            extra_payload={
+                "file_path": str(file_path),
+                "category": category,
+                "src": "filesystem"
+            },
+            force=force
         )

         # Update post_vectors to mark as document type
@@ -179,7 +189,7 @@ class DocumentIndexer:
                     'file_name': file_path.name
                 }),
                 file_hash,
-                hash(str(file_path)) % (2**31),
+                post_id,
                 result['collection']
             )
         )
@@ -199,7 +209,7 @@ class DocumentIndexer:
                VALUES ('document_indexed', 'document', %s, NULL, %s)
                """,
                (
-                    hash(str(file_path)) % (2**31),
+                    post_id,
                    json.dumps({
                        'category': category,
                        'file_path': str(file_path),
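Why the switch matters: Python's built-in hash() is randomized per process (PYTHONHASHSEED), so hash(str(file_path)) yielded a different post_id on every restart, which presumably defeated the content-hash check against post_vectors. The MD5 derivation above is stable; a small standalone illustration of the same arithmetic (not the indexer itself, and the path is hypothetical):

import hashlib

def stable_post_id(file_path: str) -> int:
    # Same derivation as in DocumentIndexer: MD5 of the path, reduced to a positive 31-bit integer.
    digest = hashlib.md5(file_path.encode("utf-8")).hexdigest()
    return int(digest, 16) % (2**31)

# Identical across interpreter restarts, unlike hash("docs/setup.md") % (2**31).
assert stable_post_id("docs/setup.md") == stable_post_id("docs/setup.md")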
app/services/localization.py (new file, 62 lines)
@@ -0,0 +1,62 @@
import json
import os
import copy
from typing import Dict, Any, List

def load_characters(lang: str = "de") -> List[Dict[str, Any]]:
    """Load character data for the given language."""
    if lang not in ["de", "en", "fr"]:
        lang = "de"

    # Assume we are running from the app root
    characters_path = os.path.join('static', 'data', f'characters.{lang}.json')
    try:
        with open(characters_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        # Fall back to German
        try:
            with open(os.path.join('static', 'data', 'characters.de.json'), 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception:
            return []
    except Exception as e:
        print(f"Error loading characters: {e}")
        return []

def merge_role_localization(roles: Dict[str, Any], lang: str) -> Dict[str, Any]:
    """
    Return a deep copy of roles with localized content merged in.
    """
    localized_roles = copy.deepcopy(roles)
    localized_list = load_characters(lang)
    localized_map = {char.get('id'): char for char in localized_list}

    # Legacy ID mapping
    legacy_id_map = {
        'funkfox': 'fox',
        'schraubaer': 'schraubär',
        'capacitoby': 'capacitobi',
        'taichitaube': 'taichi',
        'taichi': 'taichi'
    }

    for role_id, role in localized_roles.items():
        lookup_id = legacy_id_map.get(role_id, role_id)

        if lookup_id in localized_map:
            l_data = localized_map[lookup_id]
            if 'name' in l_data:
                role['name'] = l_data['name']
            if 'description' in l_data:
                role['description'] = l_data['description']
            if 'short' in l_data:
                role['title'] = l_data['short']  # Fall back to 'short' for the title
            if 'title' in l_data:
                role['title'] = l_data['title']  # An explicit 'title' in the JSON takes precedence
            if 'system_prompt' in l_data:
                role['system_prompt'] = l_data['system_prompt']

    return localized_roles
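A hedged usage sketch for the new helper; the roles dict below is a hypothetical shape inferred from the merge logic above, and the localized values depend on what static/data/characters.<lang>.json actually contains:

from app.services.localization import merge_role_localization

# Hypothetical role definition; only the keys the merge touches are shown.
roles = {
    "funkfox": {
        "name": "Funkfox",
        "title": "Radio operator",
        "description": "Explains radio basics.",
        "system_prompt": "You are Funkfox ...",
    }
}

localized = merge_role_localization(roles, lang="en")
# 'funkfox' is looked up under the legacy id 'fox'; if characters.en.json defines
# name/title/description/system_prompt for 'fox', those values replace the defaults.
# The original `roles` dict is left untouched because the helper deep-copies it.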
@@ -79,7 +79,9 @@ class RAGService:
         title: str,
         slug: str,
         locale: str,
-        body_md: str
+        body_md: str,
+        extra_payload: Optional[Dict[str, Any]] = None,
+        force: bool = False
     ) -> Dict[str, Any]:
         """
         Index a single post into Qdrant.
@@ -90,7 +92,8 @@ class RAGService:
             slug: Post slug
             locale: Post locale (de, en, etc.)
             body_md: Markdown content
-
+            extra_payload: Additional metadata to store in Qdrant
+            force: If True, bypass content hash check
         Returns:
             Dictionary with indexing results
         """
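With the extended signature, a caller can attach arbitrary metadata and force a re-embed; a sketch of such a call (the surrounding service wiring and variables are assumed, not shown in this diff):

result = rag_service.index_post(
    post_id=post_id,
    title=file_path.stem,
    slug=f"{category}/{file_path.stem}",
    locale="",                      # documents are locale-agnostic
    body_md=content,
    extra_payload={"file_path": str(file_path), "category": category, "src": "filesystem"},
    force=True,                     # skip the unchanged-hash shortcut and re-embed
)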
@@ -99,31 +102,28 @@ class RAGService:

         # Check if already indexed and content unchanged
         content_hash = self._get_content_hash(body_md or "")
-        with self.db_conn.cursor(DictCursor) as cur:
-            cur.execute(
-                "SELECT file_hash FROM post_vectors WHERE post_id=%s AND collection_name=%s",
-                (post_id, collection_name)
-            )
-            existing = cur.fetchone()
-            if existing and existing['file_hash'] == content_hash:
-                return {
-                    'post_id': post_id,
-                    'status': 'unchanged',
-                    'message': 'Post content unchanged, skipping re-indexing'
-                }
+        if not force:
+            with self.db_conn.cursor(DictCursor) as cur:
+                cur.execute(
+                    "SELECT file_hash FROM post_vectors WHERE post_id=%s AND collection_name=%s",
+                    (post_id, collection_name)
+                )
+                existing = cur.fetchone()
+                if existing and existing['file_hash'] == content_hash:
+                    return {
+                        'post_id': post_id,
+                        'status': 'unchanged',
+                        'message': 'Post content unchanged, skipping re-indexing',
+                        'collection': collection_name
+                    }

         # Chunk and embed the content
         chunks_with_embeddings = self.embedding_service.chunk_and_embed_post(
             post_content=body_md or "",
             post_id=post_id,
             post_title=title
         )

         if not chunks_with_embeddings:
             return {
                 'post_id': post_id,
-                'status': 'error',
-                'message': 'No content to index'
+                'status': 'skipped',
+                'message': 'No content to index',
+                'collection': collection_name
             }

         # Prepare points for Qdrant
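Both early returns now include 'collection', so callers can report a consistent target; a hedged sketch of handling the result (the success-path payload is not shown in this hunk and is only assumed to carry 'collection' as well):

result = rag_service.index_post(post_id=post_id, title=title, slug=slug,
                                locale=locale, body_md=body_md)

status = result["status"]
collection = result.get("collection", "?")
if status == "unchanged":
    print(f"Post {post_id}: hash unchanged in {collection}, skipped")
elif status == "skipped":
    print(f"Post {post_id}: {result['message']}")
else:
    print(f"Post {post_id}: indexed into {collection}")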
@@ -135,19 +135,24 @@ class RAGService:
             point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{post_id}_{chunk['chunk_position']}"))
             vector_ids.append(point_id)

+            payload = {
+                'post_id': post_id,
+                'title': title,
+                'slug': slug,
+                'locale': locale,
+                'content': chunk['content'],
+                'header': chunk.get('header', ''),
+                'header_level': chunk.get('header_level', 0),
+                'chunk_position': chunk['chunk_position']
+            }
+
+            if extra_payload:
+                payload.update(extra_payload)
+
             points.append(PointStruct(
                 id=point_id,
                 vector=chunk['embedding'],
-                payload={
-                    'post_id': post_id,
-                    'title': title,
-                    'slug': slug,
-                    'locale': locale,
-                    'content': chunk['content'],
-                    'header': chunk.get('header', ''),
-                    'header_level': chunk.get('header_level', 0),
-                    'chunk_position': chunk['chunk_position']
-                }
+                payload=payload
             ))

         # Upsert points to Qdrant
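Because extra_payload values are merged into every chunk's payload, they become filterable in Qdrant; a sketch using the qdrant-client filter API (client setup, collection name, category value, and vector size are assumptions, not part of this commit):

from qdrant_client import QdrantClient
from qdrant_client.models import FieldCondition, Filter, MatchValue

client = QdrantClient(host="localhost", port=6333)   # assumed local instance

# Restrict retrieval to chunks indexed from the filesystem by DocumentIndexer.
doc_filter = Filter(must=[
    FieldCondition(key="src", match=MatchValue(value="filesystem")),
    FieldCondition(key="category", match=MatchValue(value="guides")),   # hypothetical category
])

hits = client.search(
    collection_name="posts",        # assumed collection name
    query_vector=[0.0] * 768,       # placeholder; real queries use the embedding service's vectors
    query_filter=doc_filter,
    limit=5,
)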