# app/lib/markdown_chunker.py

import re
from typing import List, Dict, Any


class MarkdownChunker:
    """
    Intelligent chunking of Markdown content with header-aware splitting.
    Adapted for CrumbCRM post content.
    """

    def __init__(self, chunk_size: int = 1000, overlap: int = 200):
        """
        Initialize the chunker.

        Args:
            chunk_size: Maximum size of each chunk in characters
            overlap: Number of characters to overlap between chunks
        """
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_post_content(
        self,
        content: str,
        post_id: int,
        post_title: str = ""
    ) -> List[Dict[str, Any]]:
        """
        Split post content by headers, then by size if needed.

        Args:
            content: Markdown content of the post
            post_id: Database ID of the post
            post_title: Title of the post (optional)

        Returns:
            List of chunk dictionaries with content and metadata
        """
        chunks = []
        lines = content.split('\n')
        current_chunk = []
        current_header = post_title  # Use post title as default header
        current_level = 0

        for line in lines:
            # Check if line is an ATX header: one to six '#' plus whitespace.
            # Note: this simple splitter does not track fenced code blocks,
            # so a '#' comment line inside a fence is also treated as a header.
            header_match = re.match(r'^(#{1,6})\s+(.+)', line)

            if header_match:
                # Save previous chunk if it exists
                if current_chunk:
                    chunk_text = '\n'.join(current_chunk).strip()
                    if chunk_text:
                        chunks.extend(self._split_large_chunk(
                            chunk_text, post_id, current_header, current_level
                        ))

                # Start new chunk
                current_level = len(header_match.group(1))
                current_header = header_match.group(2)
                current_chunk = [line]
            else:
                current_chunk.append(line)

        # Handle final chunk
        if current_chunk:
            chunk_text = '\n'.join(current_chunk).strip()
            if chunk_text:
                chunks.extend(self._split_large_chunk(
                    chunk_text, post_id, current_header, current_level
                ))

        # Add sequential index to chunks
        for idx, chunk in enumerate(chunks):
            chunk['chunk_position'] = idx

        return chunks
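
    # Illustrative behavior (walked through by hand, not a stored fixture):
    #
    #   chunk_post_content("# Intro\nHello\n## Detail\nWorld", post_id=7)
    #
    # yields two chunks; the first looks like:
    #
    #   {'content': '# Intro\nHello', 'post_id': 7, 'header': 'Intro',
    #    'header_level': 1, 'chunk_index': 0, 'chunk_position': 0}
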
    def _split_large_chunk(
        self,
        text: str,
        post_id: int,
        header: str,
        level: int
    ) -> List[Dict[str, Any]]:
        """
        Split chunks that exceed the maximum size.

        Args:
            text: Text content to split
            post_id: Database ID of the post
            header: Current header text
            level: Header level (1-6)

        Returns:
            List of chunk dictionaries
        """
        if len(text) <= self.chunk_size:
            return [{
                'content': text,
                'post_id': post_id,
                'header': header,
                'header_level': level,
                'chunk_index': 0
            }]

        # Split large chunks by words with overlap
        chunks = []
        words = text.split()
        current_chunk = []
        current_size = 0
        chunk_index = 0

        for word in words:
            word_size = len(word) + 1  # +1 for the following space

            if current_size + word_size > self.chunk_size and current_chunk:
                # Save current chunk
                chunk_content = ' '.join(current_chunk)
                chunks.append({
                    'content': chunk_content,
                    'post_id': post_id,
                    'header': header,
                    'header_level': level,
                    'chunk_index': chunk_index
                })

                # Start new chunk with overlap, converting the character
                # overlap to an approximate word count (assumes roughly
                # 10 characters per word, including the trailing space).
                if self.overlap > 0:
                    overlap_word_count = max(1, self.overlap // 10)
                    overlap_words = current_chunk[-overlap_word_count:]
                else:
                    overlap_words = []

                current_chunk = overlap_words + [word]
                current_size = sum(len(w) + 1 for w in current_chunk)
                chunk_index += 1
            else:
                current_chunk.append(word)
                current_size += word_size

        # Save final chunk if exists
        if current_chunk:
            chunk_content = ' '.join(current_chunk)
            chunks.append({
                'content': chunk_content,
                'post_id': post_id,
                'header': header,
                'header_level': level,
                'chunk_index': chunk_index
            })

        return chunks
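

# A minimal usage sketch (illustrative, not part of the original module):
# uses a deliberately small chunk_size so the word-level splitting and
# overlap in _split_large_chunk are easy to observe on a short post.
if __name__ == "__main__":
    sample = (
        "# Getting Started\n"
        "CrumbCRM posts are plain Markdown.\n"
        "## Installation\n"
        "Run the installer and follow the prompts.\n"
    )
    chunker = MarkdownChunker(chunk_size=30, overlap=20)
    for chunk in chunker.chunk_post_content(sample, post_id=1, post_title="Demo"):
        print(chunk["chunk_position"], chunk["header"], repr(chunk["content"]))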