Initial commit: Crumbforest Architecture Refinement v1 (Clean)
This commit is contained in:
155
app/lib/markdown_chunker.py
Normal file
155
app/lib/markdown_chunker.py
Normal file
@@ -0,0 +1,155 @@
|
||||
# app/lib/markdown_chunker.py
|
||||
import re
|
||||
from typing import List, Dict, Any
|
||||
|
||||
|
||||
class MarkdownChunker:
    """Header-aware chunking of Markdown content for CrumbCRM posts.

    Content is first split at Markdown ATX headers (1-6 leading ``#``);
    any resulting section longer than ``chunk_size`` is further split on
    word boundaries with an approximate character overlap between pieces.
    """

    def __init__(self, chunk_size: int = 1000, overlap: int = 200):
        """
        Initialize the chunker.

        Args:
            chunk_size: Maximum size of each chunk in characters (must be > 0)
            overlap: Number of characters to overlap between chunks (must be >= 0)

        Raises:
            ValueError: If chunk_size is not positive or overlap is negative.
        """
        # Validate up front: a non-positive chunk_size would otherwise
        # silently degrade splitting into one-word chunks.
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if overlap < 0:
            raise ValueError("overlap must be non-negative")
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_post_content(
        self,
        content: str,
        post_id: int,
        post_title: str = ""
    ) -> List[Dict[str, Any]]:
        """
        Split post content by headers, then by size if needed.

        Args:
            content: Markdown content of the post
            post_id: Database ID of the post
            post_title: Title of the post (optional); used as the header
                for any content preceding the first Markdown header

        Returns:
            List of chunk dictionaries with keys 'content', 'post_id',
            'header', 'header_level', 'chunk_index' (index within one
            oversized section) and 'chunk_position' (global order).
        """
        chunks: List[Dict[str, Any]] = []
        current_chunk: List[str] = []
        current_header = post_title  # Use post title as default header
        current_level = 0

        for line in content.split('\n'):
            # Markdown ATX headers use 1-6 '#' characters; 7+ hashes are
            # plain text. Bounding the quantifier keeps header_level in
            # the documented 1-6 range.
            header_match = re.match(r'^(#{1,6})\s+(.+)', line)

            if header_match:
                # Flush the section accumulated so far, if any.
                if current_chunk:
                    chunk_text = '\n'.join(current_chunk).strip()
                    if chunk_text:
                        chunks.extend(self._split_large_chunk(
                            chunk_text, post_id, current_header, current_level
                        ))

                # Start a new section at this header.
                current_level = len(header_match.group(1))
                # strip() drops trailing spaces and any '\r' left over
                # from CRLF content.
                current_header = header_match.group(2).strip()
                current_chunk = [line]
            else:
                current_chunk.append(line)

        # Flush the final section (content after the last header, or the
        # whole document when it contains no headers).
        if current_chunk:
            chunk_text = '\n'.join(current_chunk).strip()
            if chunk_text:
                chunks.extend(self._split_large_chunk(
                    chunk_text, post_id, current_header, current_level
                ))

        # Assign the global, sequential position across all sections.
        for idx, chunk in enumerate(chunks):
            chunk['chunk_position'] = idx

        return chunks

    def _split_large_chunk(
        self,
        text: str,
        post_id: int,
        header: str,
        level: int
    ) -> List[Dict[str, Any]]:
        """
        Split chunks that exceed the maximum size.

        Args:
            text: Text content to split
            post_id: Database ID of the post
            header: Current header text
            level: Header level (1-6, or 0 for pre-header content)

        Returns:
            List of chunk dictionaries
        """
        # Fast path: the section already fits in a single chunk.
        if len(text) <= self.chunk_size:
            return [{
                'content': text,
                'post_id': post_id,
                'header': header,
                'header_level': level,
                'chunk_index': 0
            }]

        # Split large sections by words, carrying an overlap of trailing
        # words into each successive chunk for retrieval continuity.
        chunks: List[Dict[str, Any]] = []
        current_chunk: List[str] = []
        current_size = 0
        chunk_index = 0

        for word in text.split():
            word_size = len(word) + 1  # +1 for the joining space

            if current_size + word_size > self.chunk_size and current_chunk:
                # Current chunk is full: emit it.
                chunks.append({
                    'content': ' '.join(current_chunk),
                    'post_id': post_id,
                    'header': header,
                    'header_level': level,
                    'chunk_index': chunk_index
                })

                # Seed the next chunk with trailing words from this one.
                if self.overlap > 0:
                    # Approximate the character overlap in words, assuming
                    # ~10 characters per word (incl. separator); always
                    # carry at least one word.
                    overlap_word_count = max(1, self.overlap // 10)
                    overlap_words = current_chunk[-overlap_word_count:]
                else:
                    overlap_words = []

                current_chunk = overlap_words + [word]
                current_size = sum(len(w) + 1 for w in current_chunk)
                chunk_index += 1
            else:
                current_chunk.append(word)
                current_size += word_size

        # Emit whatever remains after the loop.
        if current_chunk:
            chunks.append({
                'content': ' '.join(current_chunk),
                'post_id': post_id,
                'header': header,
                'header_level': level,
                'chunk_index': chunk_index
            })

        return chunks
|
||||
Reference in New Issue
Block a user