Initial commit: Crumbforest Architecture Refinement v1 (Clean)
This commit is contained in:
155
app/lib/markdown_chunker.py
Normal file
155
app/lib/markdown_chunker.py
Normal file
@@ -0,0 +1,155 @@
|
||||
# app/lib/markdown_chunker.py
|
||||
import re
|
||||
from typing import List, Dict, Any
|
||||
|
||||
|
||||
class MarkdownChunker:
    """Header-aware chunking of Markdown content for CrumbCRM posts.

    Content is first split at Markdown ATX headers (1-6 leading ``#``);
    any resulting section longer than ``chunk_size`` is further split on
    word boundaries with an approximate character overlap between pieces.
    """

    def __init__(self, chunk_size: int = 1000, overlap: int = 200):
        """
        Initialize the chunker.

        Args:
            chunk_size: Maximum size of each chunk in characters (must be > 0)
            overlap: Number of characters to overlap between chunks (must be >= 0)

        Raises:
            ValueError: If chunk_size is not positive or overlap is negative.
        """
        # Validate up front: a non-positive chunk_size would otherwise
        # silently degrade splitting into one-word chunks.
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if overlap < 0:
            raise ValueError("overlap must be non-negative")
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_post_content(
        self,
        content: str,
        post_id: int,
        post_title: str = ""
    ) -> List[Dict[str, Any]]:
        """
        Split post content by headers, then by size if needed.

        Args:
            content: Markdown content of the post
            post_id: Database ID of the post
            post_title: Title of the post (optional); used as the header
                for any content preceding the first Markdown header

        Returns:
            List of chunk dictionaries with keys 'content', 'post_id',
            'header', 'header_level', 'chunk_index' (index within one
            oversized section) and 'chunk_position' (global order).
        """
        chunks: List[Dict[str, Any]] = []
        current_chunk: List[str] = []
        current_header = post_title  # Use post title as default header
        current_level = 0

        for line in content.split('\n'):
            # Markdown ATX headers use 1-6 '#' characters; 7+ hashes are
            # plain text. Bounding the quantifier keeps header_level in
            # the documented 1-6 range.
            header_match = re.match(r'^(#{1,6})\s+(.+)', line)

            if header_match:
                # Flush the section accumulated so far, if any.
                if current_chunk:
                    chunk_text = '\n'.join(current_chunk).strip()
                    if chunk_text:
                        chunks.extend(self._split_large_chunk(
                            chunk_text, post_id, current_header, current_level
                        ))

                # Start a new section at this header.
                current_level = len(header_match.group(1))
                # strip() drops trailing spaces and any '\r' left over
                # from CRLF content.
                current_header = header_match.group(2).strip()
                current_chunk = [line]
            else:
                current_chunk.append(line)

        # Flush the final section (content after the last header, or the
        # whole document when it contains no headers).
        if current_chunk:
            chunk_text = '\n'.join(current_chunk).strip()
            if chunk_text:
                chunks.extend(self._split_large_chunk(
                    chunk_text, post_id, current_header, current_level
                ))

        # Assign the global, sequential position across all sections.
        for idx, chunk in enumerate(chunks):
            chunk['chunk_position'] = idx

        return chunks

    def _split_large_chunk(
        self,
        text: str,
        post_id: int,
        header: str,
        level: int
    ) -> List[Dict[str, Any]]:
        """
        Split chunks that exceed the maximum size.

        Args:
            text: Text content to split
            post_id: Database ID of the post
            header: Current header text
            level: Header level (1-6, or 0 for pre-header content)

        Returns:
            List of chunk dictionaries
        """
        # Fast path: the section already fits in a single chunk.
        if len(text) <= self.chunk_size:
            return [{
                'content': text,
                'post_id': post_id,
                'header': header,
                'header_level': level,
                'chunk_index': 0
            }]

        # Split large sections by words, carrying an overlap of trailing
        # words into each successive chunk for retrieval continuity.
        chunks: List[Dict[str, Any]] = []
        current_chunk: List[str] = []
        current_size = 0
        chunk_index = 0

        for word in text.split():
            word_size = len(word) + 1  # +1 for the joining space

            if current_size + word_size > self.chunk_size and current_chunk:
                # Current chunk is full: emit it.
                chunks.append({
                    'content': ' '.join(current_chunk),
                    'post_id': post_id,
                    'header': header,
                    'header_level': level,
                    'chunk_index': chunk_index
                })

                # Seed the next chunk with trailing words from this one.
                if self.overlap > 0:
                    # Approximate the character overlap in words, assuming
                    # ~10 characters per word (incl. separator); always
                    # carry at least one word.
                    overlap_word_count = max(1, self.overlap // 10)
                    overlap_words = current_chunk[-overlap_word_count:]
                else:
                    overlap_words = []

                current_chunk = overlap_words + [word]
                current_size = sum(len(w) + 1 for w in current_chunk)
                chunk_index += 1
            else:
                current_chunk.append(word)
                current_size += word_size

        # Emit whatever remains after the loop.
        if current_chunk:
            chunks.append({
                'content': ' '.join(current_chunk),
                'post_id': post_id,
                'header': header,
                'header_level': level,
                'chunk_index': chunk_index
            })

        return chunks
|
||||
Reference in New Issue
Block a user