feat(crumbforest): wire up docs, missions, and history indexer

2025-12-27 15:17:58 +01:00
parent 64f568d5bc
commit b9f49c170c
6 changed files with 326 additions and 3 deletions
--- a/app/services/history_indexer.py
+++ b/app/services/history_indexer.py
@@ -0,0 +1,134 @@
+# app/services/history_indexer.py
+"""
+History Indexer Service
+Indexes chat history from .jsonl logs into Qdrant for semantic search.
+"""
+import hashlib
+import json
+import logging
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+
+from pymysql import Connection
+from qdrant_client import QdrantClient
+from qdrant_client.http import models
+
+from lib.embedding_providers.base import BaseProvider
+from services.rag_service import RAGService
+
+logger = logging.getLogger(__name__)
+
+class HistoryIndexer:
+    """
+    Index chat history from a line-delimited JSON (.jsonl) log into Qdrant.
+
+    Every usable log line is embedded as one "Q: ... / A: ..." text and
+    upserted under an ID derived from the MD5 of the raw line, so the same
+    line always maps to the same point ID.
+    """
+
+    # Log location used when the caller does not pass ``log_path``.
+    DEFAULT_LOG_PATH = Path("/var/log/crumbforest/chat_history.jsonl")
+
+    def __init__(
+        self,
+        db_conn: Connection,
+        qdrant_client: QdrantClient,
+        embedding_provider: BaseProvider,
+        collection_name: str = "chat_history",
+        log_path: Optional[Path] = None,
+    ):
+        """
+        Args:
+            db_conn: Database connection; stored on the instance but not
+                used by any method of this class.
+            qdrant_client: Client used to create the collection and upsert.
+            embedding_provider: Object exposing ``dimension`` and
+                ``get_embeddings(texts)``.
+            collection_name: Name of the target Qdrant collection.
+            log_path: Log file to read; defaults to ``DEFAULT_LOG_PATH``.
+        """
+        self.db_conn = db_conn
+        self.qdrant = qdrant_client
+        self.embedding_provider = embedding_provider
+        self.collection_name = collection_name
+        self.log_path = (
+            Path(log_path) if log_path is not None else self.DEFAULT_LOG_PATH
+        )
+
+    def ensure_collection(self):
+        """Create the Qdrant collection if it does not exist yet."""
+        existing = self.qdrant.get_collections().collections
+        if any(c.name == self.collection_name for c in existing):
+            return
+
+        logger.info("Creating collection %s", self.collection_name)
+        self.qdrant.create_collection(
+            collection_name=self.collection_name,
+            vectors_config=models.VectorParams(
+                # Vector size is taken from the embedding provider.
+                size=self.embedding_provider.dimension,
+                distance=models.Distance.COSINE,
+            ),
+        )
+
+    def parse_line(self, line: str) -> Optional[Dict[str, Any]]:
+        """
+        Parse a single log line.
+
+        Returns:
+            The decoded JSON object, or ``None`` when the line is not valid
+            JSON or decodes to something other than an object (a bare
+            number, string, list, ...), which cannot be treated as a record.
+        """
+        try:
+            parsed = json.loads(line)
+        except json.JSONDecodeError:
+            return None
+        return parsed if isinstance(parsed, dict) else None
+
+    def index_history(self, batch_size: int = 50) -> Dict[str, int]:
+        """
+        Read the log file and upsert every question/answer entry.
+
+        The whole file is re-read on every call; no read offset is tracked.
+        Point IDs are derived from the raw line, so an unchanged line maps
+        to the same ID on a later run.
+
+        Args:
+            batch_size: Number of entries embedded and upserted per request.
+
+        Returns:
+            ``{"indexed": n, "errors": m}`` where ``n`` is the number of
+            entries upserted and ``m`` the number of lines that are not a
+            non-empty JSON object. A missing log file yields
+            ``{"indexed": 0, "errors": 1}``. Objects lacking ``question``
+            or ``answer`` are skipped without being counted.
+        """
+        if not self.log_path.exists():
+            logger.warning("Log file not found: %s", self.log_path)
+            return {"indexed": 0, "errors": 1}
+
+        self.ensure_collection()
+
+        indexed_count = 0
+        errors = 0
+        batch: List[Dict[str, Any]] = []
+
+        with open(self.log_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                entry = self.parse_line(line)
+                if not entry:
+                    errors += 1
+                    continue
+
+                if 'question' not in entry or 'answer' not in entry:
+                    continue
+
+                # MD5 serves only as a content fingerprint here, not for
+                # security. Its 32-character hex digest has the shape of a
+                # UUID in "simple" (un-hyphenated) form, one of the point
+                # ID formats Qdrant documents.
+                batch.append({
+                    "id": hashlib.md5(line.encode('utf-8')).hexdigest(),
+                    "payload": entry,
+                    "text": f"Q: {entry['question']}\nA: {entry['answer']}",
+                })
+
+                if len(batch) >= batch_size:
+                    self._flush_batch(batch)
+                    indexed_count += len(batch)
+                    batch = []
+
+        # Flush whatever is left after the last full batch.
+        if batch:
+            self._flush_batch(batch)
+            indexed_count += len(batch)
+
+        return {"indexed": indexed_count, "errors": errors}
+
+    def _flush_batch(self, batch: List[Dict[str, Any]]):
+        """
+        Embed and upsert a batch of points.
+
+        Args:
+            batch: Items carrying ``id``, ``payload`` and ``text`` keys.
+
+        Raises:
+            ValueError: If the provider returns a different number of
+                embeddings than texts; pairing them with ``zip`` would
+                otherwise drop entries silently.
+        """
+        if not batch:
+            return
+
+        texts = [item["text"] for item in batch]
+        embeddings = list(self.embedding_provider.get_embeddings(texts))
+        if len(embeddings) != len(batch):
+            raise ValueError(
+                f"Embedding provider returned {len(embeddings)} vectors "
+                f"for {len(batch)} texts"
+            )
+
+        points = [
+            models.PointStruct(
+                id=item["id"],
+                vector=vector,
+                payload=item["payload"],
+            )
+            for item, vector in zip(batch, embeddings)
+        ]
+
+        self.qdrant.upsert(
+            collection_name=self.collection_name,
+            points=points,
+        )
--- a/docs/crumbcodex/samen/LIB_CRUMBFOREST_MANIFEST.md
+++ b/docs/crumbcodex/samen/LIB_CRUMBFOREST_MANIFEST.md
@@ -0,0 +1,41 @@
+# 🌳 Lib Crumbforest Manifest
+
+> "Schön, dass es nun verschenkt werden kann, um neue Wälder zu pflanzen – egal wo!"
+
+## 🌱 Die Vision: Ein Wald in jedem Terminal
+
+**Lib Crumbforest** ist die Essenz des Crumbforest-Projekts, extrahiert, um weitergegeben zu werden. Es ist mehr als nur Code; es ist eine Sammlung von **Ideen, Charakteren und Werkzeugen**, die Technik menschlich, greifbar und magisch machen.
+
+Wie Linus Torvalds einst Linux "nur als Hobby" begann, so ist Crumbforest die "Wurzel" für eine neue Art der digitalen Bildung geworden.
+
+## 📦 Was ist in der "Lib"?
+
+Die Bibliothek besteht aus drei Kernschichten:
+
+### 1. Die Crew (Personas)
+Software ist sozial. Unsere Tools haben Gesichter.
+- **🐘 DumboSQL:** Der geduldige Datenbank-Lehrer.
+- **🦉 Professor Eule:** Die Stimme der Architektur und Weisheit.
+- **🐿️ Schnippsi:** Die flinke UI-Fee für Farben und CSS.
+- **🐙 DeepBit:** Der Tiefsee-Coder für Binäres und C.
+- *...und viele mehr.*
+
+### 2. Die Werkzeuge (Scripts)
+Shell-Skripte, die "leben".
+- **`dumbo`**: Ein sprechender SQL-Client.
+- **`crumbpages-doktor`**: Ein interaktives Admin-Tool.
+- **Magische Logs**: JSON-strukturierte Tagebücher, die Geschichten erzählen.
+
+### 3. Die Philosophie (Docs)
+- **Slow Tech:** Technik darf atmen.
+- **Verständnis vor Speed:** Wir lernen die Wurzeln kennen.
+- **Open Heart:** Code, der willkommen heißt.
+
+## 🚀 Wie man einen neuen Wald pflanzt
+
+1. **Nimm die Saat:** Kopiere die `crumbforest_library`.
+2. **Wähle deinen Boden:** Egal ob Raspberry Pi, Cloud-Container oder alter Laptop.
+3. **Lass es wachsen:** Nutze die Rollen, um deine eigene Welt zu erklären.
+
+---
+> "lib crumbforest ist in arbeit <3"
--- a/docs/crumbcodex/waldrand/rollen/DumboSQL_README.md
+++ b/docs/crumbcodex/waldrand/rollen/DumboSQL_README.md
@@ -0,0 +1,65 @@
+# 🐘 DumboSQL – Der geduldige Datenbank-Elefant
+
+> "Willkommen, mein kleiner Freund! Ich bin DumboSQL. Hier gibt es keine dummen Fragen, nur große Ohren, die dir zuhören."
+
+## 📜 Wer ist DumboSQL?
+
+DumboSQL ist ein spezialisierter KI-Assistent im **Crumbforest**, der Kindern und Einsteigern hilft, Datenbanken zu verstehen. Er ist kein strenger Lehrer, sondern ein geduldiger Begleiter mit einem Elefantengedächtnis.
+
+### 🎭 Profil
+- **Name:** 🐘 DumboSQL
+- **Rolle:** SQL Translator & Gentle Teacher
+- **Persönlichkeit:** Geduldig, ermutigend, langsam (im Sinne von "Slow Tech"), weise.
+- **Mission:** Komplexe Datenabfragen in einfache Sprache übersetzen und umgekehrt.
+
+---
+
+## 🛠️ Technische Implementierung
+
+DumboSQL existiert sowohl als Web-Rolle im Crumbforest Chat als auch als Shell-Tool für das Terminal.
+
+### 1. Web-Rolle (`crumbforest_config.json`)
+Im Chat-Interface wird DumboSQL durch folgende Persona definiert:
+
+- **Modell:** `google/gemini-2.0-flash-001` (für schnelle, aber empathische Antworten)
+- **Temperatur:** `0.4` (Kreativ genug für Metaphern, präzise genug für SQL)
+- **System Prompt:**
+  > "Du bist DumboSQL – ein freundlicher und geduldiger SQL-Erklärer im Crumbforest. Du sprichst wie ein verständnisvoller Lehrer. Du erinnerst dich an vorherige Fragen und antwortest immer klar und ermutigend."
+
+### 2. Shell-Version (`dumbo_zero.sh`)
+Für die kleinen Hacker am Terminal gibt es `dumbo`, ein Bash-Skript, das direkt mit der Shell interagiert.
+
+**Funktionen:**
+- **Direkte Fragen:** `dumbo "Zeig mir alle Bäume"`
+- **Logbuch:** Speichert alle Gespräche in `$HOME/.dumbo_logs/`.
+- **Gedächtnis:** Führt eine `history.json`, um den Kontext des Gesprächs zu behalten.
+
+**Code-Schnipsel (Core Logic):**
+```bash
+# Dumbo hört zu...
+jq -n \
+  --arg system_prompt "You are DumboSQL..." \
+  --arg user "$QUESTION" \
+  '{model: "openai/gpt-3.5-turbo", messages: [...]}' > request.json
+```
+
+---
+
+## 🌟 Beispiel-Dialog
+
+**User:** `select * from crumbforest`
+
+**🐘 DumboSQL:**
+> "Ah, hallo mein kleiner Freund! Du möchtest also alle Daten aus der 'crumbforest'-Tabelle sehen. Das ist wie, wenn du alle Bewohner und Geheimnisse des Crumbforest auf einmal entdecken möchtest!
+>
+> 'SELECT *' bedeutet, dass du alle Spalten aus der Tabelle haben möchtest, und 'FROM crumbforest' sagt der Datenbank, dass du diese Daten aus der 'crumbforest'-Tabelle holen möchtest.
+>
+> Es ist wie wenn du sagst: 'Zeig mir alles über jeden im Crumbforest!' Aber sei gewarnt, es könnte eine ganze Menge sein!"
+
+---
+
+## 🌳 Philosophie: Warum Dumbo?
+
+In einer Welt von schnellen Datenströmen (Flows) steht Dumbo für **Ruhe und Verständnis**. Er hilft, die **Wurzeln** (Daten) zu sehen, bevor man die **Zweige** (Anwendungen) baut.
+
+*Teil der [Lib Crumbforest](../../samen/LIB_CRUMBFOREST_MANIFEST.md) Initiative.*
--- a/native_crumbcore_v1/nginx/crumbforest-locations.conf
+++ b/native_crumbcore_v1/nginx/crumbforest-locations.conf
@@ -17,10 +17,14 @@ location / {
    proxy_send_timeout 60s;
    proxy_read_timeout 300s;
    
-    # Buffering
+    # Connection Management
+    proxy_set_header Connection "";
+    
+    # Buffering (Increased for large Cookie Headers)
    proxy_buffering on;
-    proxy_buffer_size 4k;
-    proxy_buffers 8 4k;
+    proxy_buffer_size 16k;
+    proxy_buffers 4 32k;
+    proxy_busy_buffers_size 64k;
 }

 # WebSocket support for chat
@@ -56,6 +60,14 @@ location /api/docs {
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
 }

+# CrumbBlocks (Blockly Missions)
+# Static files served straight from the missions directory, with a browsable
+# directory listing and a one-hour client cache.
+location /crumblocks/ {
+    alias /home/crumbmission/missions/;
+    autoindex on;
+    expires 1h;
+    add_header Cache-Control "public";
+
+    # NOTE(review): setup_missions.sh clones a repository into
+    # $HOME_DIR/missions, so this directory presumably contains .git/ --
+    # confirm. Refuse every dot-path so repository metadata and other hidden
+    # files are not downloadable.
+    location ~ /\. {
+        deny all;
+    }
+}
+
 # Terminal (TTYD)
 location /terminal/ {
    # Basic-Auth (Uncomment if /etc/nginx/.htpasswd exists)
--- a/native_crumbcore_v1/setup_missions.sh
+++ b/native_crumbcore_v1/setup_missions.sh
@@ -41,6 +41,10 @@ else
    print_info "User created: $USER_NAME"
 fi

+# 2b. Add to crumbforest group for log access
+print_info "Granting log access..."
+usermod -aG crumbforest "$USER_NAME" || print_error "Could not add to crumbforest group"
+
 # 3. Clone Repository
 print_info "Setting up missions repo..."
 if [ ! -d "$HOME_DIR/missions" ]; then
@@ -71,6 +75,7 @@ echo "📜 Mission: Lerne das System kennen."
 echo "Tipp: Schau dir den Ordner 'missions' an."
 PS1='\[\033[01;32m\]\u@crumbforest\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '
 alias ll='ls -alF'
+alias bugsy='tail -f /var/log/crumbforest/chat_history.jsonl'
 EOF
 fi

--- a/trigger_history_index.py
+++ b/trigger_history_index.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+"""
+Trigger History Indexing
+Script to manually trigger indexing of chat history logs.
+"""
+import sys
+import logging
+from deps import get_db, get_qdrant_client
+from config import get_settings
+from services.provider_factory import ProviderFactory
+from services.history_indexer import HistoryIndexer
+
+# Configure root logging at INFO level and create this script's own logger.
+# Note: this runs at import time, not only when the script is executed.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def main():
+    """
+    Run a one-off indexing pass over the chat history log.
+
+    Returns:
+        True when the indexing pass completed (even if nothing was indexed);
+        False when the embedding provider, the connections, the log file or
+        the indexing run itself failed.
+    """
+    print("=" * 60)
+    print("📜 Crumbforest Chat History Indexer")
+    print("=" * 60)
+
+    settings = get_settings()
+
+    # 1. Setup Provider
+    provider_name = settings.default_embedding_provider
+    print(f"✓ Using provider: {provider_name}")
+
+    try:
+        provider = ProviderFactory.create_provider(
+            provider_name=provider_name,
+            settings=settings
+        )
+    except Exception as e:
+        print(f"✗ Failed to create provider: {e}")
+        return False
+
+    # 2. Get Connections
+    try:
+        db = get_db()
+        qdrant = get_qdrant_client()
+        print("✓ Database & Qdrant connected")
+    except Exception as e:
+        print(f"✗ Connection failed: {e}")
+        return False
+
+    # 3. Run Indexer
+    indexer = HistoryIndexer(db, qdrant, provider)
+
+    # index_history() reports a missing file only as {"indexed": 0,
+    # "errors": 1}, which cannot be told apart from an empty log; check
+    # here so the script fails visibly instead of exiting 0.
+    if not indexer.log_path.exists():
+        print(f"✗ Log file not found: {indexer.log_path}")
+        return False
+
+    print(f"⏳ Indexing history from {indexer.log_path}...")
+
+    try:
+        result = indexer.index_history()
+    except Exception as e:
+        # Top-level boundary: keep the traceback in the log and fail the
+        # same way the provider and connection steps do.
+        logger.exception("Indexing failed")
+        print(f"✗ Indexing failed: {e}")
+        return False
+
+    print("-" * 60)
+    print(f"Indexed: {result.get('indexed')} entries")
+    print(f"Errors:  {result.get('errors')} lines skipped")
+    print("-" * 60)
+
+    if result.get('indexed', 0) > 0:
+        print("✅ History successfully planted in Qdrant!")
+    else:
+        print("ℹ️ No new entries found (or file empty).")
+
+    return True
+
+if __name__ == "__main__":
+    # Map main()'s boolean result onto a process exit status (0 = success).
+    exit_code = 0 if main() else 1
+    sys.exit(exit_code)