Source code for kvboost.chunk_registry
"""
ChunkRegistry
=============
Splits text into cacheable chunks and owns the chunk_size policy.
Chunking strategies
-------------------
FIXED : fixed token-count windows (default, predictable)
SEMANTIC : split on paragraph (blank-line) boundaries, approximated in token space; oversized paragraphs fall back to fixed windows (better reuse)
DOCUMENT : treat entire document as one chunk
"""
from __future__ import annotations
import enum
import logging
import re
from typing import List, Tuple
from .models import chunk_id_from_tokens
log = logging.getLogger(__name__)
[docs]
class ChunkStrategy(str, enum.Enum):
    """
    Chunking policy selector.

    Members also subclass ``str``, so each one compares equal to its
    plain string value (for example ``ChunkStrategy.FIXED == "fixed"``).
    """

    # Fixed token-count windows.
    FIXED = "fixed"
    # Split at paragraph boundaries found in the source text.
    SEMANTIC = "semantic"
    # Treat the entire document as a single chunk.
    DOCUMENT = "document"
[docs]
class ChunkRegistry:
    """
    Converts (text, token_ids) into a list of (start, end, sub_token_ids) triples
    according to the configured strategy.

    The registry itself holds no KV state — that lives in KVCacheManager.

    Attributes
    ----------
    chunk_size : maximum number of tokens per chunk (FIXED and SEMANTIC).
    strategy : the ChunkStrategy that ``split`` dispatches on.
    min_chunk_tokens : slices shorter than this are not emitted by the
        FIXED and SEMANTIC strategies.
    """

    # One or more blank lines, i.e. a paragraph break in the source text.
    _PARAGRAPH_BREAK = re.compile(r"\n\n+")

    def __init__(
        self,
        chunk_size: int = 128,
        strategy: ChunkStrategy = ChunkStrategy.FIXED,
        min_chunk_tokens: int = 32,
    ):
        self.chunk_size = chunk_size
        self.strategy = strategy
        self.min_chunk_tokens = min_chunk_tokens

    # ------------------------------------------------------------------
    # Primary API
    # ------------------------------------------------------------------
    def split(
        self, token_ids: List[int], text: str = ""
    ) -> List[Tuple[int, int, List[int]]]:
        """
        Split token_ids into cacheable slices.

        ``text`` is only consulted by the SEMANTIC strategy; when it is
        empty, SEMANTIC behaves exactly like FIXED.

        Returns list of (start, end, slice_token_ids).
        end is exclusive.

        Raises ValueError if ``self.strategy`` is not a known strategy, or
        if a fixed-window split is needed while ``chunk_size`` is not positive.
        """
        if self.strategy == ChunkStrategy.FIXED:
            return self._fixed_split(token_ids)
        if self.strategy == ChunkStrategy.SEMANTIC:
            return self._semantic_split(token_ids, text)
        if self.strategy == ChunkStrategy.DOCUMENT:
            # The whole input is one chunk: neither chunk_size nor
            # min_chunk_tokens applies, and the input list is returned
            # as-is (not copied).
            return [(0, len(token_ids), token_ids)]
        raise ValueError(f"Unknown strategy {self.strategy}")

    def chunk_ids_for(self, token_ids: List[int], text: str = "") -> List[str]:
        """
        Return the chunk_ids (hashes) for each chunk of this token sequence.

        ``text`` is forwarded to ``split``. Under the SEMANTIC strategy pass
        the same text that was (or will be) given to ``split``; otherwise the
        chunk boundaries, and therefore the ids, will not line up.
        """
        return [
            chunk_id_from_tokens(slice_ids)
            for _, _, slice_ids in self.split(token_ids, text)
        ]

    # ------------------------------------------------------------------
    # Strategies
    # ------------------------------------------------------------------
    def _fixed_split(
        self, token_ids: List[int]
    ) -> List[Tuple[int, int, List[int]]]:
        """
        Cut token_ids into consecutive windows of ``chunk_size`` tokens.

        Windows shorter than ``min_chunk_tokens`` are not emitted; with a
        positive chunk_size only the final window can be short.

        Raises ValueError if there are tokens to split and ``chunk_size`` is
        not positive (the window would never advance).
        """
        n = len(token_ids)
        if n == 0:
            return []
        window = self.chunk_size
        if window <= 0:
            raise ValueError(
                f"chunk_size must be a positive integer, got {window!r}"
            )
        chunks = []
        for pos in range(0, n, window):
            end = min(pos + window, n)
            if end - pos >= self.min_chunk_tokens:
                chunks.append((pos, end, token_ids[pos:end]))
        return chunks

    def _semantic_split(
        self, token_ids: List[int], text: str
    ) -> List[Tuple[int, int, List[int]]]:
        """
        Use paragraph (blank-line) boundaries in ``text`` as split points,
        falling back to fixed chunking for paragraphs larger than chunk_size.

        There is no token→char aligner here: each paragraph break's character
        offset is scaled by n_tokens / n_chars to estimate a token offset, so
        the resulting boundaries are approximate.

        Segments shorter than ``min_chunk_tokens`` are not emitted (they are
        not merged with their neighbours). With empty ``text`` this is
        identical to ``_fixed_split``.
        """
        if not text:
            return self._fixed_split(token_ids)

        n_chars = max(len(text), 1)
        n_tokens = len(token_ids)

        # Map each paragraph-break end offset (chars) to an approximate token
        # offset, keeping only offsets strictly inside the sequence.
        cut_points = set()
        for match in self._PARAGRAPH_BREAK.finditer(text):
            offset = int(match.end() / n_chars * n_tokens)
            if 0 < offset < n_tokens:
                cut_points.add(offset)

        boundaries = [0, *sorted(cut_points), n_tokens]
        chunks = []
        for start, end in zip(boundaries, boundaries[1:]):
            segment = token_ids[start:end]
            if len(segment) > self.chunk_size:
                # Paragraph exceeds the size cap: window it, then shift the
                # window-relative offsets back into sequence coordinates.
                chunks.extend(
                    (start + s, start + e, ids)
                    for s, e, ids in self._fixed_split(segment)
                )
            elif len(segment) >= self.min_chunk_tokens:
                chunks.append((start, end, segment))
        return chunks