# app/lib/markdown_chunker.py
import re
from typing import List, Dict, Any


class MarkdownChunker:
    """
    Intelligent chunking of Markdown content with header-aware splitting.
    Adapted for CrumbCRM post content.
    """

    def __init__(self, chunk_size: int = 1000, overlap: int = 200):
        """
        Initialize the chunker.

        Args:
            chunk_size: Maximum size of each chunk in characters
            overlap: Number of characters to overlap between chunks
        """
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_post_content(
        self,
        content: str,
        post_id: int,
        post_title: str = ""
    ) -> List[Dict[str, Any]]:
        """
        Split post content by headers, then by size if needed.

        Args:
            content: Markdown content of the post
            post_id: Database ID of the post
            post_title: Title of the post (optional)

        Returns:
            List of chunk dictionaries with content and metadata
        """
        chunks = []
        lines = content.split('\n')
        current_chunk = []
        current_header = post_title  # Use post title as default header
        current_level = 0

        for line in lines:
            # Check if line is a header
            header_match = re.match(r'^(#+)\s+(.+)', line)
            if header_match:
                # Save previous chunk if it exists
                if current_chunk:
                    chunk_text = '\n'.join(current_chunk).strip()
                    if chunk_text:
                        chunks.extend(self._split_large_chunk(
                            chunk_text, post_id, current_header, current_level
                        ))
                # Start new chunk
                current_level = len(header_match.group(1))
                current_header = header_match.group(2).strip()
                current_chunk = [line]
            else:
                current_chunk.append(line)

        # Handle final chunk
        if current_chunk:
            chunk_text = '\n'.join(current_chunk).strip()
            if chunk_text:
                chunks.extend(self._split_large_chunk(
                    chunk_text, post_id, current_header, current_level
                ))

        # Add sequential index to chunks
        for idx, chunk in enumerate(chunks):
            chunk['chunk_position'] = idx

        return chunks

    def _split_large_chunk(
        self,
        text: str,
        post_id: int,
        header: str,
        level: int
    ) -> List[Dict[str, Any]]:
        """
        Split chunks that exceed the maximum size.

        Args:
            text: Text content to split
            post_id: Database ID of the post
            header: Current header text
            level: Header level (1-6, or 0 for content before the first header)

        Returns:
            List of chunk dictionaries
        """
        if len(text) <= self.chunk_size:
            return [{
                'content': text,
                'post_id': post_id,
                'header': header,
                'header_level': level,
                'chunk_index': 0
            }]

        # Split large chunks by words with overlap
        chunks = []
        words = text.split()
        current_chunk = []
        current_size = 0
        chunk_index = 0

        for word in words:
            word_size = len(word) + 1  # +1 for space
            if current_size + word_size > self.chunk_size and current_chunk:
                # Save current chunk
                chunk_content = ' '.join(current_chunk)
                chunks.append({
                    'content': chunk_content,
                    'post_id': post_id,
                    'header': header,
                    'header_level': level,
                    'chunk_index': chunk_index
                })
                # Start new chunk with overlap
                if self.overlap > 0:
                    # Convert the character overlap into an approximate
                    # word count, assuming roughly 10 characters per word
                    overlap_word_count = max(1, self.overlap // 10)
                    overlap_words = current_chunk[-overlap_word_count:]
                else:
                    overlap_words = []
                current_chunk = overlap_words + [word]
                current_size = sum(len(w) + 1 for w in current_chunk)
                chunk_index += 1
            else:
                current_chunk.append(word)
                current_size += word_size

        # Save final chunk if exists
        if current_chunk:
            chunk_content = ' '.join(current_chunk)
            chunks.append({
                'content': chunk_content,
                'post_id': post_id,
                'header': header,
                'header_level': level,
                'chunk_index': chunk_index
            })

        return chunks
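

# Minimal usage sketch: shows how MarkdownChunker might be driven end to end.
# The sample Markdown, post_id, and post_title below are purely illustrative
# and are not taken from real CrumbCRM data.
if __name__ == "__main__":
    sample_content = (
        "# Getting Started\n"
        "CrumbCRM posts are written in Markdown.\n"
        "\n"
        "## Installation\n"
        "Run the installer and follow the prompts.\n"
    )

    chunker = MarkdownChunker(chunk_size=1000, overlap=200)
    chunks = chunker.chunk_post_content(
        sample_content, post_id=1, post_title="Getting Started"
    )

    # Each chunk carries its section header, header level, and sequential
    # position, which downstream code can store alongside the text.
    for chunk in chunks:
        print(chunk['chunk_position'], chunk['header'], len(chunk['content']))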