# app/lib/markdown_chunker.py
import re
from typing import Any, Dict, List


class MarkdownChunker:
    """
    Intelligent chunking of Markdown content with header-aware splitting.

    Adapted for CrumbCRM post content.
    """

    # ATX headings have at most six '#' characters (CommonMark); anything
    # longer is plain text. Compiled once instead of per input line.
    _HEADER_RE = re.compile(r'^(#{1,6})\s+(.+)')

    def __init__(self, chunk_size: int = 1000, overlap: int = 200):
        """
        Initialize the chunker.

        Args:
            chunk_size: Maximum size of each chunk in characters (must be > 0)
            overlap: Number of characters to overlap between chunks
                (must be >= 0)

        Raises:
            ValueError: If chunk_size is not positive or overlap is negative.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if overlap < 0:
            raise ValueError("overlap must be non-negative")
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_post_content(
        self,
        content: str,
        post_id: int,
        post_title: str = ""
    ) -> List[Dict[str, Any]]:
        """
        Split post content by headers, then by size if needed.

        Args:
            content: Markdown content of the post
            post_id: Database ID of the post
            post_title: Title of the post (optional)

        Returns:
            List of chunk dictionaries with content and metadata. Each dict
            has keys 'content', 'post_id', 'header', 'header_level',
            'chunk_index' (position within its header section) and
            'chunk_position' (sequential position across the whole post).
        """
        chunks: List[Dict[str, Any]] = []
        current_chunk: List[str] = []
        current_header = post_title  # Use post title as default header
        current_level = 0  # 0 = no header seen yet

        def flush() -> None:
            # Emit the accumulated section (if non-empty) as one or more
            # size-bounded chunks. Reads the enclosing-scope state at
            # call time, so it always flushes the *current* section.
            text = '\n'.join(current_chunk).strip()
            if text:
                chunks.extend(self._split_large_chunk(
                    text, post_id, current_header, current_level
                ))

        for line in content.split('\n'):
            header_match = self._HEADER_RE.match(line)
            if header_match:
                # A new section starts: flush the previous one first.
                flush()
                current_level = len(header_match.group(1))
                current_header = header_match.group(2)
                # Keep the header line itself inside the chunk text.
                current_chunk = [line]
            else:
                current_chunk.append(line)

        # Handle the final section.
        flush()

        # Add a document-wide sequential index to the chunks.
        for idx, chunk in enumerate(chunks):
            chunk['chunk_position'] = idx

        return chunks

    def _split_large_chunk(
        self,
        text: str,
        post_id: int,
        header: str,
        level: int
    ) -> List[Dict[str, Any]]:
        """
        Split chunks that exceed the maximum size.

        Args:
            text: Text content to split
            post_id: Database ID of the post
            header: Current header text
            level: Header level (0 when no header applies, otherwise 1-6)

        Returns:
            List of chunk dictionaries. 'chunk_position' is NOT set here;
            the caller assigns it across the whole post.
        """
        def make_chunk(chunk_text: str, index: int) -> Dict[str, Any]:
            # Single place that defines the metadata shape of a chunk.
            return {
                'content': chunk_text,
                'post_id': post_id,
                'header': header,
                'header_level': level,
                'chunk_index': index,
            }

        if len(text) <= self.chunk_size:
            return [make_chunk(text, 0)]

        # Split large text by words with overlap. The overlap is
        # approximated in words assuming ~10 characters per word;
        # loop-invariant, so computed once outside the loop.
        overlap_word_count = max(1, self.overlap // 10) if self.overlap > 0 else 0

        chunks: List[Dict[str, Any]] = []
        current_chunk: List[str] = []
        current_size = 0
        chunk_index = 0

        for word in text.split():
            word_size = len(word) + 1  # +1 for the joining space
            if current_size + word_size > self.chunk_size and current_chunk:
                # Current chunk is full: emit it.
                chunks.append(make_chunk(' '.join(current_chunk), chunk_index))
                # Start the next chunk with a few trailing words for context.
                # Guard the zero case explicitly: list[-0:] would copy the
                # WHOLE list, not an empty slice.
                if overlap_word_count:
                    overlap_words = current_chunk[-overlap_word_count:]
                else:
                    overlap_words = []
                current_chunk = overlap_words + [word]
                current_size = sum(len(w) + 1 for w in current_chunk)
                chunk_index += 1
            else:
                current_chunk.append(word)
                current_size += word_size

        # Save the final partial chunk, if any.
        if current_chunk:
            chunks.append(make_chunk(' '.join(current_chunk), chunk_index))

        return chunks