# Source code for kvboost.chunk_registry

"""
ChunkRegistry
=============
Splits text into cacheable chunks and owns the chunk_size policy.

Chunking strategies
-------------------
FIXED      : fixed token-count windows (default, predictable)
SEMANTIC   : split on paragraph / sentence boundaries (better reuse)
DOCUMENT   : treat entire document as one chunk
"""

from __future__ import annotations

import enum
import logging
import re
from typing import List, Tuple

from .models import chunk_id_from_tokens

log = logging.getLogger(__name__)



# [docs] (Sphinx viewcode link residue; not part of the module)
class ChunkStrategy(str, enum.Enum):
    """
    Selects how ChunkRegistry partitions a token sequence.

    Members subclass ``str``, so each one compares equal to its plain
    string value (e.g. ``ChunkStrategy.FIXED == "fixed"``).
    """

    # Fixed token-count windows (the ChunkRegistry default).
    FIXED = "fixed"
    # Split at paragraph boundaries found in the accompanying text.
    SEMANTIC = "semantic"
    # Treat the entire document as a single chunk.
    DOCUMENT = "document"




# [docs] (Sphinx viewcode link residue; not part of the module)
class ChunkRegistry:
    """
    Converts (text, token_ids) into a list of (start, end, sub_token_ids) triples
    according to the configured strategy.

    The registry itself holds no KV state — that lives in KVCacheManager.
    """

    def __init__(
        self,
        chunk_size: int = 128,
        strategy: ChunkStrategy = ChunkStrategy.FIXED,
        min_chunk_tokens: int = 32,
    ):
        """
        Record the chunking policy; no work is done at construction time.

        chunk_size       : upper bound on tokens per chunk for the FIXED
                           and SEMANTIC strategies.
        strategy         : which ChunkStrategy ``split`` dispatches to.
        min_chunk_tokens : slices shorter than this are left out by the
                           FIXED and SEMANTIC strategies.
        """
        # Which algorithm to run ...
        self.strategy = strategy
        # ... and the size limits it operates under.
        self.min_chunk_tokens = min_chunk_tokens
        self.chunk_size = chunk_size

    # ------------------------------------------------------------------
    # Primary API
    # ------------------------------------------------------------------


    # [docs] (Sphinx viewcode link residue; not part of the module)
    def split(
        self, token_ids: List[int], text: str = ""
    ) -> List[Tuple[int, int, List[int]]]:
        """
        Split token_ids into cacheable slices using the configured strategy.

        token_ids : the token sequence to partition.
        text      : optional source text; it is only forwarded to the
                    SEMANTIC strategy and ignored by the others.

        Returns list of (start, end, slice_token_ids).
        end is exclusive.

        With the DOCUMENT strategy the single slice is a copy of
        token_ids (never the caller's own object), and an empty input
        produces an empty list rather than a zero-length chunk.

        Raises ValueError if self.strategy matches no ChunkStrategy member.
        """
        strategy = self.strategy
        if strategy == ChunkStrategy.FIXED:
            return self._fixed_split(token_ids)
        if strategy == ChunkStrategy.SEMANTIC:
            return self._semantic_split(token_ids, text)
        if strategy == ChunkStrategy.DOCUMENT:
            total = len(token_ids)
            if total == 0:
                # Nothing to cache: avoid emitting a (0, 0, []) chunk.
                return []
            # Slice, like the other strategies do, so that later mutation
            # of the caller's list cannot alter the returned chunk.
            return [(0, total, token_ids[:])]
        raise ValueError(f"Unknown strategy {self.strategy}")



    # [docs] (Sphinx viewcode link residue; not part of the module)
    def chunk_ids_for(
        self, token_ids: List[int], text: str = ""
    ) -> List[str]:
        """
        Return the chunk_ids (hashes) for each chunk of this token sequence.

        token_ids : the token sequence to partition.
        text      : optional source text, forwarded to ``split``.  Pass the
                    same text that is given to ``split`` so that, under the
                    SEMANTIC strategy, the ids describe the same chunks
                    ``split`` returns; when omitted, SEMANTIC falls back to
                    fixed windows.

        One id is produced per slice, in slice order, by applying
        chunk_id_from_tokens to the slice's token ids.
        """
        return [
            chunk_id_from_tokens(slice_ids)
            for _, _, slice_ids in self.split(token_ids, text)
        ]


    # ------------------------------------------------------------------
    # Strategies
    # ------------------------------------------------------------------

    def _fixed_split(
        self, token_ids: List[int]
    ) -> List[Tuple[int, int, List[int]]]:
        chunks = []
        pos = 0
        n = len(token_ids)
        while pos < n:
            end = min(pos + self.chunk_size, n)
            slice_ids = token_ids[pos:end]
            if len(slice_ids) >= self.min_chunk_tokens:
                chunks.append((pos, end, slice_ids))
            pos = end
        return chunks

    def _semantic_split(
        self, token_ids: List[int], text: str
    ) -> List[Tuple[int, int, List[int]]]:
        """
        Use paragraph / double-newline boundaries as split points,
        then fall back to fixed chunking if segments are too large.
        Works purely on token positions by finding sentence-boundary
        tokens (very approximate without a proper aligner).
        For a prototype, we use fixed chunking at paragraph-size
        aligned boundaries.
        """
        # Without a true token→char aligner we approximate:
        # find newline tokens in the sequence by guessing they appear
        # roughly proportional to char positions.  Then enforce chunk_size
        # as the max.
        if not text:
            return self._fixed_split(token_ids)

        # Split text on double newlines → get character offsets
        para_splits = [m.end() for m in re.finditer(r"\n\n+", text)]
        n_chars = max(len(text), 1)
        n_tokens = len(token_ids)

        # Map char offsets to approximate token offsets
        token_splits = sorted(
            set(
                int(cs / n_chars * n_tokens)
                for cs in para_splits
                if 0 < int(cs / n_chars * n_tokens) < n_tokens
            )
        )

        # Merge with chunk_size constraint
        boundaries = [0] + token_splits + [n_tokens]
        chunks = []
        for i in range(len(boundaries) - 1):
            start, end = boundaries[i], boundaries[i + 1]
            sub = token_ids[start:end]
            # If sub-segment is too large, further split fixed
            if len(sub) > self.chunk_size:
                for s, e, sl in self._fixed_split(sub):
                    chunks.append((start + s, start + e, sl))
            elif len(sub) >= self.min_chunk_tokens:
                chunks.append((start, end, sub))
        return chunks