feat(crumbforest): wire up docs, missions, and history indexer
This commit is contained in:
134
app/services/history_indexer.py
Normal file
134
app/services/history_indexer.py
Normal file
@@ -0,0 +1,134 @@
|
||||
# app/services/history_indexer.py
|
||||
"""
|
||||
History Indexer Service
|
||||
Indexes chat history from .jsonl logs into Qdrant for semantic search.
|
||||
"""
|
||||
import hashlib
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional

from pymysql import Connection
from qdrant_client import QdrantClient
from qdrant_client.http import models

from lib.embedding_providers.base import BaseProvider
from services.rag_service import RAGService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class HistoryIndexer:
    """
    Indexes chat history from line-delimited JSON files into Qdrant.

    Every log line that parses to a JSON object containing both a
    ``question`` and an ``answer`` key is embedded and upserted as one point.
    The point ID is the MD5 hex digest of the raw line, so an identical line
    always maps to the same ID.
    """

    def __init__(
        self,
        db_conn: "Connection",
        qdrant_client: "QdrantClient",
        embedding_provider: "BaseProvider",
        collection_name: str = "chat_history",
        log_path: Optional[Path] = None,
    ):
        """
        Store the collaborators used for indexing.

        Args:
            db_conn: Database connection. It is only stored; none of the
                methods in this class read it.
            qdrant_client: Client used to create the collection and upsert
                points.
            embedding_provider: Supplies ``dimension`` (vector size for a new
                collection) and ``get_embeddings(texts)``.
            collection_name: Name of the target Qdrant collection.
            log_path: Location of the ``.jsonl`` log. Defaults to
                ``/var/log/crumbforest/chat_history.jsonl``.
        """
        self.db_conn = db_conn
        self.qdrant = qdrant_client
        self.embedding_provider = embedding_provider
        self.collection_name = collection_name
        if log_path is None:
            log_path = "/var/log/crumbforest/chat_history.jsonl"
        self.log_path = Path(log_path)

    def ensure_collection(self):
        """
        Create the Qdrant collection if it does not exist yet.

        A newly created collection uses cosine distance and a vector size
        taken from ``embedding_provider.dimension``.
        """
        existing = self.qdrant.get_collections().collections
        if any(c.name == self.collection_name for c in existing):
            return

        logger.info("Creating collection %s", self.collection_name)
        self.qdrant.create_collection(
            collection_name=self.collection_name,
            vectors_config=models.VectorParams(
                size=self.embedding_provider.dimension,
                distance=models.Distance.COSINE,
            ),
        )

    def parse_line(self, line: str) -> Optional[Dict[str, Any]]:
        """
        Parse a single log line.

        Returns:
            The decoded JSON object, or ``None`` when the line is not valid
            JSON or decodes to something other than an object (a bare number,
            string, list, ...). ``index_history`` treats the result as a
            dict, so non-object values must not leak through.
        """
        try:
            parsed = json.loads(line)
        except json.JSONDecodeError:
            return None
        return parsed if isinstance(parsed, dict) else None

    @staticmethod
    def _point_id(line: str) -> str:
        """Return the MD5 hex digest of the raw line, used as the point ID."""
        # NOTE(review): relies on Qdrant accepting a 32-hex-digit string as a
        # UUID point ID ("simple" UUID form) - confirm against the deployed
        # server/client version.
        return hashlib.md5(line.encode("utf-8")).hexdigest()

    def index_history(self, batch_size: int = 50) -> Dict[str, int]:
        """
        Read the log file and index its entries.

        No progress is tracked between runs: the whole file is read every
        time and every usable entry is upserted again under its
        content-derived ID.

        Args:
            batch_size: Number of entries embedded and upserted per request.

        Returns:
            ``{"indexed": <entries upserted>, "errors": <lines rejected>}``.
            ``errors`` counts non-blank lines that are not JSON objects.
            Blank lines and objects lacking ``question`` or ``answer`` are
            skipped without being counted. A missing log file yields
            ``{"indexed": 0, "errors": 1}``.
        """
        if not self.log_path.exists():
            logger.warning("Log file not found: %s", self.log_path)
            return {"indexed": 0, "errors": 1}

        self.ensure_collection()

        indexed_count = 0
        errors = 0
        batch: List[Dict[str, Any]] = []

        with open(self.log_path, "r", encoding="utf-8") as f:
            for line in f:
                # A blank line (e.g. a trailing newline) is not a malformed entry.
                if not line.strip():
                    continue

                entry = self.parse_line(line)
                if entry is None:
                    errors += 1
                    continue

                # Only question/answer records are indexed.
                if "question" not in entry or "answer" not in entry:
                    continue

                batch.append({
                    # Hash the raw line (including its newline) so IDs stay
                    # identical to those produced by earlier runs.
                    "id": self._point_id(line),
                    "payload": entry,
                    "text": f"Q: {entry['question']}\nA: {entry['answer']}",
                })

                if len(batch) >= batch_size:
                    self._flush_batch(batch)
                    indexed_count += len(batch)
                    batch = []

        if batch:
            self._flush_batch(batch)
            indexed_count += len(batch)

        return {"indexed": indexed_count, "errors": errors}

    def _flush_batch(self, batch: List[Dict[str, Any]]):
        """
        Embed and upsert a batch of points.

        Args:
            batch: Items with ``id``, ``payload`` and ``text`` keys, as built
                by ``index_history``.
        """
        if not batch:
            return

        texts = [item["text"] for item in batch]
        embeddings = self.embedding_provider.get_embeddings(texts)

        points = [
            models.PointStruct(
                id=item["id"],
                vector=vector,
                payload=item["payload"],
            )
            for item, vector in zip(batch, embeddings)
        ]

        self.qdrant.upsert(
            collection_name=self.collection_name,
            points=points,
        )
|
||||
41
docs/crumbcodex/samen/LIB_CRUMBFOREST_MANIFEST.md
Normal file
41
docs/crumbcodex/samen/LIB_CRUMBFOREST_MANIFEST.md
Normal file
@@ -0,0 +1,41 @@
|
||||
# 🌳 Lib Crumbforest Manifest
|
||||
|
||||
> "Schön, dass es nun verschenkt werden kann, um neue Wälder zu pflanzen – egal wo!"
|
||||
|
||||
## 🌱 Die Vision: Ein Wald in jedem Terminal
|
||||
|
||||
**Lib Crumbforest** ist die Essenz des Crumbforest-Projekts, extrahiert, um weitergegeben zu werden. Es ist mehr als nur Code; es ist eine Sammlung von **Ideen, Charakteren und Werkzeugen**, die Technik menschlich, greifbar und magisch machen.
|
||||
|
||||
Wie Linus Torvalds einst Linux "nur als Hobby" begann, so ist Crumbforest die "Wurzel" für eine neue Art der digitalen Bildung geworden.
|
||||
|
||||
## 📦 Was ist in der "Lib"?
|
||||
|
||||
Die Bibliothek besteht aus drei Kernschichten:
|
||||
|
||||
### 1. Die Crew (Personas)
|
||||
Software ist sozial. Unsere Tools haben Gesichter.
|
||||
- **🐘 DumboSQL:** Der geduldige Datenbank-Lehrer.
|
||||
- **🦉 Professor Eule:** Die Stimme der Architektur und Weisheit.
|
||||
- **🐿️ Schnippsi:** Die flinke UI-Fee für Farben und CSS.
|
||||
- **🐙 DeepBit:** Der Tiefsee-Coder für Binäres und C.
|
||||
- *...und viele mehr.*
|
||||
|
||||
### 2. Die Werkzeuge (Scripts)
|
||||
Shell-Skripte, die "leben".
|
||||
- **`dumbo`**: Ein sprechender SQL-Client.
|
||||
- **`crumbpages-doktor`**: Ein interaktives Admin-Tool.
|
||||
- **Magische Logs**: JSON-strukturierte Tagebücher, die Geschichten erzählen.
|
||||
|
||||
### 3. Die Philosophie (Docs)
|
||||
- **Slow Tech:** Technik darf atmen.
|
||||
- **Verständnis vor Speed:** Wir lernen die Wurzeln kennen.
|
||||
- **Open Heart:** Code, der willkommen heißt.
|
||||
|
||||
## 🚀 Wie man einen neuen Wald pflanzt
|
||||
|
||||
1. **Nimm die Saat:** Kopiere die `crumbforest_library`.
|
||||
2. **Wähle deinen Boden:** Egal ob Raspberry Pi, Cloud-Container oder alter Laptop.
|
||||
3. **Lass es wachsen:** Nutze die Rollen, um deine eigene Welt zu erklären.
|
||||
|
||||
---
|
||||
> "lib crumbforest ist in arbeit <3"
|
||||
65
docs/crumbcodex/waldrand/rollen/DumboSQL_README.md
Normal file
65
docs/crumbcodex/waldrand/rollen/DumboSQL_README.md
Normal file
@@ -0,0 +1,65 @@
|
||||
# 🐘 DumboSQL – Der geduldige Datenbank-Elefant
|
||||
|
||||
> "Willkommen, mein kleiner Freund! Ich bin DumboSQL. Hier gibt es keine dummen Fragen, nur große Ohren, die dir zuhören."
|
||||
|
||||
## 📜 Wer ist DumboSQL?
|
||||
|
||||
DumboSQL ist ein spezialisierter KI-Assistent im **Crumbforest**, der Kindern und Einsteigern hilft, Datenbanken zu verstehen. Er ist kein strenger Lehrer, sondern ein geduldiger Begleiter mit einem Elefantengedächtnis.
|
||||
|
||||
### 🎭 Profil
|
||||
- **Name:** 🐘 DumboSQL
|
||||
- **Rolle:** SQL Translator & Gentle Teacher
|
||||
- **Persönlichkeit:** Geduldig, ermutigend, langsam (im Sinne von "Slow Tech"), weise.
|
||||
- **Mission:** Komplexe Datenabfragen in einfache Sprache übersetzen und umgekehrt.
|
||||
|
||||
---
|
||||
|
||||
## 🛠️ Technische Implementierung
|
||||
|
||||
DumboSQL existiert sowohl als Web-Rolle im Crumbforest Chat als auch als Shell-Tool für das Terminal.
|
||||
|
||||
### 1. Web-Rolle (`crumbforest_config.json`)
|
||||
Im Chat-Interface wird DumboSQL durch folgende Persona definiert:
|
||||
|
||||
- **Modell:** `google/gemini-2.0-flash-001` (für schnelle, aber empathische Antworten)
|
||||
- **Temperatur:** `0.4` (Kreativ genug für Metaphern, präzise genug für SQL)
|
||||
- **System Prompt:**
|
||||
> "Du bist DumboSQL – ein freundlicher und geduldiger SQL-Erklärer im Crumbforest. Du sprichst wie ein verständnisvoller Lehrer. Du erinnerst dich an vorherige Fragen und antwortest immer klar und ermutigend."
|
||||
|
||||
### 2. Shell-Version (`dumbo_zero.sh`)
|
||||
Für die kleinen Hacker am Terminal gibt es `dumbo`, ein Bash-Skript, das direkt mit der Shell interagiert.
|
||||
|
||||
**Funktionen:**
|
||||
- **Direkte Fragen:** `dumbo "Zeig mir alle Bäume"`
|
||||
- **Logbuch:** Speichert alle Gespräche in `$HOME/.dumbo_logs/`.
|
||||
- **Gedächtnis:** Führt eine `history.json`, um den Kontext des Gesprächs zu behalten.
|
||||
|
||||
**Code-Schnipsel (Core Logic):**
|
||||
```bash
|
||||
# Dumbo hört zu...
|
||||
jq -n \
|
||||
--arg system_prompt "You are DumboSQL..." \
|
||||
--arg user "$QUESTION" \
|
||||
'{model: "openai/gpt-3.5-turbo", messages: [...]}' > request.json
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🌟 Beispiel-Dialog
|
||||
|
||||
**User:** `select * from crumbforest`
|
||||
|
||||
**🐘 DumboSQL:**
|
||||
> "Ah, hallo mein kleiner Freund! Du möchtest also alle Daten aus der 'crumbforest'-Tabelle sehen. Das ist wie, wenn du alle Bewohner und Geheimnisse des Crumbforest auf einmal entdecken möchtest!
|
||||
>
|
||||
> 'SELECT *' bedeutet, dass du alle Spalten aus der Tabelle haben möchtest, und 'FROM crumbforest' sagt der Datenbank, dass du diese Daten aus der 'crumbforest'-Tabelle holen möchtest.
|
||||
>
|
||||
> Es ist wie wenn du sagst: 'Zeig mir alles über jeden im Crumbforest!' Aber sei gewarnt, es könnte eine ganze Menge sein!"
|
||||
|
||||
---
|
||||
|
||||
## 🌳 Philosophie: Warum Dumbo?
|
||||
|
||||
In einer Welt von schnellen Datenströmen (Flows) steht Dumbo für **Ruhe und Verständnis**. Er hilft, die **Wurzeln** (Daten) zu sehen, bevor man die **Zweige** (Anwendungen) baut.
|
||||
|
||||
*Teil der [Lib Crumbforest](../../samen/LIB_CRUMBFOREST_MANIFEST.md) Initiative.*
|
||||
@@ -17,10 +17,14 @@ location / {
|
||||
proxy_send_timeout 60s;
|
||||
proxy_read_timeout 300s;
|
||||
|
||||
# Buffering
|
||||
# Connection Management
|
||||
proxy_set_header Connection "";
|
||||
|
||||
# Buffering (Increased for large Cookie Headers)
|
||||
proxy_buffering on;
|
||||
proxy_buffer_size 4k;
|
||||
proxy_buffers 8 4k;
|
||||
proxy_buffer_size 16k;
|
||||
proxy_buffers 4 32k;
|
||||
proxy_busy_buffers_size 64k;
|
||||
}
|
||||
|
||||
# WebSocket support for chat
|
||||
@@ -56,6 +60,14 @@ location /api/docs {
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
}
|
||||
|
||||
# CrumbBlocks (Blockly Missions)
|
||||
location /crumblocks/ {
|
||||
alias /home/crumbmission/missions/;
|
||||
autoindex on;
|
||||
expires 1h;
|
||||
add_header Cache-Control "public";
|
||||
}
|
||||
|
||||
# Terminal (TTYD)
|
||||
location /terminal/ {
|
||||
# Basic-Auth (Uncomment if /etc/nginx/.htpasswd exists)
|
||||
|
||||
@@ -41,6 +41,10 @@ else
|
||||
print_info "User created: $USER_NAME"
|
||||
fi
|
||||
|
||||
# 2b. Add to crumbforest group for log access
|
||||
print_info "Granting log access..."
|
||||
usermod -aG crumbforest "$USER_NAME" || print_error "Could not add to crumbforest group"
|
||||
|
||||
# 3. Clone Repository
|
||||
print_info "Setting up missions repo..."
|
||||
if [ ! -d "$HOME_DIR/missions" ]; then
|
||||
@@ -71,6 +75,7 @@ echo "📜 Mission: Lerne das System kennen."
|
||||
echo "Tipp: Schau dir den Ordner 'missions' an."
|
||||
PS1='\[\033[01;32m\]\u@crumbforest\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '
|
||||
alias ll='ls -alF'
|
||||
alias bugsy='tail -f /var/log/crumbforest/chat_history.jsonl'
|
||||
EOF
|
||||
fi
|
||||
|
||||
|
||||
66
trigger_history_index.py
Executable file
66
trigger_history_index.py
Executable file
@@ -0,0 +1,66 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Trigger History Indexing
|
||||
Script to manually trigger indexing of chat history logs.
|
||||
"""
|
||||
import sys
|
||||
import logging
|
||||
from deps import get_db, get_qdrant_client
|
||||
from config import get_settings
|
||||
from services.provider_factory import ProviderFactory
|
||||
from services.history_indexer import HistoryIndexer
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def main():
    """
    Run one manual indexing pass over the chat history log.

    Creates the configured embedding provider, obtains the database and
    Qdrant handles, runs ``HistoryIndexer.index_history`` and prints a
    summary.

    Returns:
        bool: ``True`` when the indexing pass completed (even if it indexed
        nothing), ``False`` when provider creation, connection setup or the
        indexing run itself raised.
    """
    banner = "=" * 60
    print(banner)
    print("📜 Crumbforest Chat History Indexer")
    print(banner)

    settings = get_settings()

    # 1. Setup Provider
    provider_name = settings.default_embedding_provider
    print(f"✓ Using provider: {provider_name}")

    try:
        provider = ProviderFactory.create_provider(
            provider_name=provider_name,
            settings=settings
        )
    except Exception as e:
        print(f"✗ Failed to create provider: {e}")
        return False

    # 2. Get Connections
    # NOTE(review): the handle returned by get_db() is never closed here -
    # confirm whether it needs explicit cleanup.
    try:
        db = get_db()
        qdrant = get_qdrant_client()
        print("✓ Database & Qdrant connected")
    except Exception as e:
        print(f"✗ Connection failed: {e}")
        return False

    # 3. Run Indexer
    indexer = HistoryIndexer(db, qdrant, provider)
    # Report the path the indexer will actually read instead of repeating it
    # as a literal that could drift out of sync.
    print(f"⏳ Indexing history from {indexer.log_path}...")

    # Handle failures like the earlier steps do, so the script exits with a
    # non-zero status and a readable message rather than a bare traceback.
    try:
        result = indexer.index_history()
    except Exception as e:
        logger.exception("History indexing failed")
        print(f"✗ Indexing failed: {e}")
        return False

    rule = "-" * 60
    print(rule)
    print(f"Indexed: {result.get('indexed')} entries")
    print(f"Errors: {result.get('errors')} lines skipped")
    print(rule)

    if result.get('indexed', 0) > 0:
        print("✅ History successfully planted in Qdrant!")
    else:
        print("ℹ️ No new entries found (or file empty).")

    return True
|
||||
|
||||
if __name__ == "__main__":
    # Translate main()'s boolean outcome into a process exit status:
    # 0 on success, 1 on failure.
    success = main()
    raise SystemExit(0 if success else 1)
|
||||
Reference in New Issue
Block a user