Skip to content

tokenization

decode_vocab(tokenizer, byte2str_fallback='tokenizer')

Convert tokenizer vocabulary into byte and string representations.

Warning

The byte representation is the canonical form. Each element in byte_vocab is a Token object that contains both the token_id and byte_string. The string representation is provided for convenience but may not decode properly for all tokens, especially those containing invalid UTF-8 sequences.

Parameters:

Name Type Description Default
tokenizer

A Hugging Face tokenizer instance

required
byte2str_fallback str

Strategy for converting invalid UTF-8 bytes to strings. Options:

  • 'tokenizer': Use tokenizer's convert_ids_to_tokens (default)
  • 'latin1': Decode using latin1 encoding
  • 'replace': Use Unicode replacement character '�'
'tokenizer'

Returns:

Type Description
tuple

(byte_vocab, str_vocab) where byte_vocab is a list of Token objects and str_vocab is a list of strings

Source code in genlm/backend/tokenization/vocab.py
def decode_vocab(tokenizer, byte2str_fallback="tokenizer"):
    """Convert tokenizer vocabulary into byte and string representations.

    The fallback strategy is validated before the tokenizer is touched. If the
    tokenizer reports ``is_fast``, it is first reloaded by ``name_or_path`` with
    ``use_fast=False``; the fast variant is only loaded if ``get_byte_vocab``
    raises ``ByteVocabError`` on that first attempt.

    Warning:
        The byte representation is the canonical form. Each element in byte_vocab is a Token object that
        contains both the token_id and byte_string. The string representation is provided for convenience
        but may not decode properly for all tokens, especially those containing invalid UTF-8 sequences.

    Args:
        tokenizer: A Hugging Face tokenizer instance
        byte2str_fallback (str): Strategy for converting invalid UTF-8 bytes to strings. Options:\n
            - 'tokenizer': Use tokenizer's `convert_ids_to_tokens` (default)
            - 'latin1': Decode using latin1 encoding
            - 'replace': Use Unicode replacement character '�'

    Returns:
        (tuple): (byte_vocab, str_vocab) where byte_vocab is a list of Token objects
            and str_vocab is a list of strings

    Raises:
        ValueError: If ``byte2str_fallback`` is not one of the supported strategies,
            or if ``get_byte_vocab`` raises ``ByteVocabError`` for both the first
            tokenizer tried and the fast tokenizer.
    """
    if byte2str_fallback not in ("latin1", "tokenizer", "replace"):
        raise ValueError(f"Unknown byte2str_fallback strategy: {byte2str_fallback}")

    # The first attempt always runs against a tokenizer loaded with use_fast=False
    # when the caller handed us a fast one.
    if tokenizer.is_fast:
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer.name_or_path, use_fast=False
        )

    try:
        raw_byte_vocab = get_byte_vocab(tokenizer)
    except ByteVocabError:
        # Second and last attempt: reload the same checkpoint as a fast tokenizer.
        # `tokenizer` is rebound so that bytes_to_strs below uses the tokenizer
        # that actually produced raw_byte_vocab.
        tokenizer = AutoTokenizer.from_pretrained(tokenizer.name_or_path, use_fast=True)
        try:
            raw_byte_vocab = get_byte_vocab(tokenizer)
        except ByteVocabError as e:
            raise ValueError(
                "Could not decode byte representation of token vocabulary "
                f"from tokenizer {tokenizer.name_or_path}"
            ) from e

    # Create Token objects for byte_vocab.
    # Assumption: token_id == position index in the vocabulary. This is relied upon
    # by the trie (idx_to_leaf) and weight arrays (ws[i] corresponds to decode[i]).
    byte_vocab = [
        Token(token_id=i, byte_string=b) for i, b in enumerate(raw_byte_vocab)
    ]
    str_vocab = bytes_to_strs(tokenizer, raw_byte_vocab, byte2str_fallback)

    return byte_vocab, str_vocab

Token

Bases: bytes

A vocabulary token carrying both a token ID and its byte representation.

Subclasses bytes so that existing code using byte operations (b"".join, len, indexing, .decode()) continues to work. Equality and hashing between Token objects use token_id, not byte content. Comparisons between a Token and plain bytes fall back to byte content.

Parameters:

Name Type Description Default
token_id int

The unique identifier for this token in the vocabulary.

required
byte_string bytes

The byte representation of this token.

required
Source code in genlm/backend/tokenization/token.py
class Token(bytes):
    """Vocabulary entry that pairs a token ID with the token's raw bytes.

    The class derives from ``bytes``, so an instance can be used wherever a
    byte string is expected (``b"".join``, ``len``, indexing, ``.decode()``).
    When two Token objects are compared, hashed, or ordered, only their
    ``token_id`` matters; the byte content is ignored. For any other operand
    the comparison methods return ``NotImplemented``, leaving the outcome to
    Python's normal fallback (for plain ``bytes`` that is the inherited
    content comparison).

    Args:
        token_id (int): The unique identifier for this token in the vocabulary.
        byte_string (bytes): The byte representation of this token.

    Raises:
        TypeError: If ``token_id`` is not an ``int`` or ``byte_string`` is not
            ``bytes``.
    """

    def __new__(cls, token_id: int, byte_string: bytes):
        # Reject bad argument types before allocating the bytes object.
        if not isinstance(token_id, int):
            raise TypeError(f"token_id must be an int, got {type(token_id)}")
        if not isinstance(byte_string, bytes):
            raise TypeError(f"byte_string must be bytes, got {type(byte_string)}")
        token = super().__new__(cls, byte_string)
        token.token_id = token_id
        return token

    @property
    def byte_string(self):
        """Plain ``bytes`` copy of this token's content (never a Token)."""
        return bytes(self)

    def __repr__(self):
        return "Token(token_id={}, byte_string={!r})".format(
            self.token_id, bytes(self)
        )

    # ---- Token-vs-Token equality and hashing are keyed on token_id ----

    def __eq__(self, other):
        if not isinstance(other, Token):
            return NotImplemented
        return self.token_id == other.token_id

    def __ne__(self, other):
        # Overridden explicitly alongside __eq__ so that inequality between
        # Tokens is also decided by token_id.
        if not isinstance(other, Token):
            return NotImplemented
        return self.token_id != other.token_id

    def __hash__(self):
        return hash(self.token_id)

    # ---- Token-vs-Token ordering follows token_id ----

    def __lt__(self, other):
        if isinstance(other, Token):
            return self.token_id < other.token_id
        return NotImplemented

    def __le__(self, other):
        if isinstance(other, Token):
            return self.token_id <= other.token_id
        return NotImplemented

    def __gt__(self, other):
        if isinstance(other, Token):
            return self.token_id > other.token_id
        return NotImplemented

    def __ge__(self, other):
        if isinstance(other, Token):
            return self.token_id >= other.token_id
        return NotImplemented

    # ---- Utilities for code that handles Tokens and plain bytes together ----

    @staticmethod
    def as_bytes(x):
        """Return ``x.byte_string`` for a Token; return anything else unchanged."""
        if isinstance(x, Token):
            return x.byte_string
        return x

    @staticmethod
    def is_plain_bytes(x):
        """Return True only when x is a ``bytes`` instance that is not a Token."""
        if not isinstance(x, bytes):
            return False
        return not isinstance(x, Token)

    # ---- Pickle / deepcopy: rebuild through the constructor ----

    def __reduce__(self):
        constructor_args = (self.token_id, bytes(self))
        return (Token, constructor_args)

byte_string property

The byte representation of this token (as plain bytes).

as_bytes(x) staticmethod

Extract byte string from a Token or pass through plain bytes.

Source code in genlm/backend/tokenization/token.py
@staticmethod
def as_bytes(x):
    """Return ``x.byte_string`` for a Token; return anything else unchanged."""
    if isinstance(x, Token):
        return x.byte_string
    return x

is_plain_bytes(x) staticmethod

Check if x is plain bytes (not a Token).

Source code in genlm/backend/tokenization/token.py
@staticmethod
def is_plain_bytes(x):
    """Return True only when x is a ``bytes`` instance that is not a Token."""
    if not isinstance(x, bytes):
        return False
    return not isinstance(x, Token)