# app/lib/markdown_chunker.py

import re
from typing import List, Dict, Any


class MarkdownChunker:
    """
    Intelligent chunking of Markdown content with header-aware splitting.
    Adapted for CrumbCRM post content.
    """

    def __init__(self, chunk_size: int = 1000, overlap: int = 200):
        """
        Initialize the chunker.

        Args:
            chunk_size: Maximum size of each chunk in characters
            overlap: Number of characters to overlap between chunks
        """
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_post_content(
        self,
        content: str,
        post_id: int,
        post_title: str = ""
    ) -> List[Dict[str, Any]]:
        """
        Split post content by headers, then by size if needed.

        Args:
            content: Markdown content of the post
            post_id: Database ID of the post
            post_title: Title of the post (optional)

        Returns:
            List of chunk dictionaries with content and metadata
        """
        chunks = []
        lines = content.split('\n')
        current_chunk = []
        current_header = post_title  # Use post title as default header
        current_level = 0

        for line in lines:
            # Check if line is an ATX header: one to six '#' plus whitespace.
            # Note: this simple splitter does not track fenced code blocks,
            # so a '#' comment line inside a fence is also treated as a header.
            header_match = re.match(r'^(#{1,6})\s+(.+)', line)

            if header_match:
                # Save previous chunk if it exists
                if current_chunk:
                    chunk_text = '\n'.join(current_chunk).strip()
                    if chunk_text:
                        chunks.extend(self._split_large_chunk(
                            chunk_text, post_id, current_header, current_level
                        ))

                # Start new chunk
                current_level = len(header_match.group(1))
                current_header = header_match.group(2)
                current_chunk = [line]
            else:
                current_chunk.append(line)

        # Handle final chunk
        if current_chunk:
            chunk_text = '\n'.join(current_chunk).strip()
            if chunk_text:
                chunks.extend(self._split_large_chunk(
                    chunk_text, post_id, current_header, current_level
                ))

        # Add sequential index to chunks
        for idx, chunk in enumerate(chunks):
            chunk['chunk_position'] = idx

        return chunks
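
    # Illustrative behavior (walked through by hand, not a stored fixture):
    #
    #   chunk_post_content("# Intro\nHello\n## Detail\nWorld", post_id=7)
    #
    # yields two chunks; the first looks like:
    #
    #   {'content': '# Intro\nHello', 'post_id': 7, 'header': 'Intro',
    #    'header_level': 1, 'chunk_index': 0, 'chunk_position': 0}
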
    def _split_large_chunk(
        self,
        text: str,
        post_id: int,
        header: str,
        level: int
    ) -> List[Dict[str, Any]]:
        """
        Split chunks that exceed the maximum size.

        Args:
            text: Text content to split
            post_id: Database ID of the post
            header: Current header text
            level: Header level (1-6)

        Returns:
            List of chunk dictionaries
        """
        if len(text) <= self.chunk_size:
            return [{
                'content': text,
                'post_id': post_id,
                'header': header,
                'header_level': level,
                'chunk_index': 0
            }]

        # Split large chunks by words with overlap
        chunks = []
        words = text.split()
        current_chunk = []
        current_size = 0
        chunk_index = 0

        for word in words:
            word_size = len(word) + 1  # +1 for the following space

            if current_size + word_size > self.chunk_size and current_chunk:
                # Save current chunk
                chunk_content = ' '.join(current_chunk)
                chunks.append({
                    'content': chunk_content,
                    'post_id': post_id,
                    'header': header,
                    'header_level': level,
                    'chunk_index': chunk_index
                })

                # Start new chunk with overlap, converting the character
                # overlap to an approximate word count (assumes roughly
                # 10 characters per word, including the trailing space).
                if self.overlap > 0:
                    overlap_word_count = max(1, self.overlap // 10)
                    overlap_words = current_chunk[-overlap_word_count:]
                else:
                    overlap_words = []

                current_chunk = overlap_words + [word]
                current_size = sum(len(w) + 1 for w in current_chunk)
                chunk_index += 1
            else:
                current_chunk.append(word)
                current_size += word_size

        # Save final chunk if exists
        if current_chunk:
            chunk_content = ' '.join(current_chunk)
            chunks.append({
                'content': chunk_content,
                'post_id': post_id,
                'header': header,
                'header_level': level,
                'chunk_index': chunk_index
            })

        return chunks
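

# A minimal usage sketch (illustrative, not part of the original module):
# uses a deliberately small chunk_size so the word-level splitting and
# overlap in _split_large_chunk are easy to observe on a short post.
if __name__ == "__main__":
    sample = (
        "# Getting Started\n"
        "CrumbCRM posts are plain Markdown.\n"
        "## Installation\n"
        "Run the installer and follow the prompts.\n"
    )
    chunker = MarkdownChunker(chunk_size=30, overlap=20)
    for chunk in chunker.chunk_post_content(sample, post_id=1, post_title="Demo"):
        print(chunk["chunk_position"], chunk["header"], repr(chunk["content"]))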