Skip to content

cfglm

Fast computation of the posterior distribution over the next word in a WCFG language model.

BoolCFGLM

Bases: LM

Language model interface for Boolean-weighted CFGs.

Uses Earley's algorithm or CKY for inference. The grammar is converted to use Boolean weights if needed, where positive weights become True and zero/negative weights become False.

Parameters:

Name Type Description Default
cfg CFG

The context-free grammar to use

required
alg str

Parsing algorithm to use - either 'earley' or 'cky'

'earley'

Raises:

Type Description
ValueError

If alg is not 'earley' or 'cky'

Source code in genlm/grammar/cfglm.py
class BoolCFGLM(LM):
    """Language model interface for Boolean-weighted CFGs.

    Uses Earley's algorithm or CKY for inference. The grammar is converted to use
    Boolean weights if needed, where positive weights become True and zero/negative
    weights become False.

    Args:
        cfg (CFG): The context-free grammar to use
        alg (str): Parsing algorithm to use - either 'earley' or 'cky'

    Raises:
        ValueError: If alg is not 'earley' or 'cky'
    """

    def __init__(self, cfg, alg="earley"):
        """Initialize a BoolCFGLM.

        Args:
            cfg (CFG): The context-free grammar to use as the language model
            alg (str): Parsing algorithm to use - either 'earley' or 'cky'

        Raises:
            ValueError: If alg is not 'earley' or 'cky'
        """
        # Fail fast: reject an unknown algorithm before doing any of the
        # grammar transformations below.
        if alg not in ("earley", "cky"):
            raise ValueError(f"unrecognized option {alg}")

        # Make sure the grammar's vocabulary contains the EOS symbol.
        if EOS not in cfg.V:
            cfg = add_EOS(cfg, eos=EOS)
        # Map weights into the Boolean semiring: strictly positive -> True.
        if cfg.R != Boolean:
            cfg = cfg.map_values(lambda x: Boolean(x > 0), Boolean)

        # The parser modules are imported lazily, only for the chosen algorithm.
        if alg == "earley":
            from genlm.grammar.parse.earley import Earley

            self.model = Earley(cfg.prefix_grammar)
        else:  # alg == "cky", guaranteed by the validation above
            from genlm.grammar.parse.cky import CKYLM

            self.model = CKYLM(cfg)
        super().__init__(eos=EOS, V=cfg.V)

    def p_next(self, context):
        """Compute the allowed next tokens given a context.

        Every token present in the parser's trimmed next-token weights is
        assigned weight 1 in the returned chart.

        Args:
            context (sequence): The conditioning context

        Returns:
            (Float.chart): The next token weights

        Raises:
            AssertionError: If context contains out-of-vocabulary tokens
        """
        assert set(context) <= self.V, f"OOVs detected: {set(context) - self.V}"
        p = self.model.next_token_weights(self.model.chart(context)).trim()
        return Float.chart({w: 1 for w in p})

    def __call__(self, context):
        """Check if a context is possible under this grammar.

        Args:
            context (sequence): The context to check

        Returns:
            (float): 1.0 if the context has non-zero weight, otherwise 0.0
        """
        return float(super().__call__(context) > 0)

    def clear_cache(self):
        """Clear any cached computations held by the underlying parser."""
        self.model.clear_cache()

    @classmethod
    def from_string(cls, x, semiring=Boolean, **kwargs):
        """Create a BoolCFGLM from a string representation of a grammar.

        Args:
            x (str): The grammar string
            semiring: The semiring for weights (default: Boolean)
            **kwargs: Additional arguments passed to __init__

        Returns:
            (BoolCFGLM): A new language model
        """
        return cls(CFG.from_string(x, semiring), **kwargs)

__call__(context)

Check if a context is possible under this grammar.

Parameters:

Name Type Description Default
context sequence

The context to check

required

Returns:

Type Description
float

1.0 if the context has non-zero weight, otherwise 0.0

Source code in genlm/grammar/cfglm.py
def __call__(self, context):
    """Report whether a context is possible under this grammar.

    Args:
        context (sequence): The context to check

    Returns:
        (float): 1.0 if the parent class's ``__call__`` gives the context a
            value greater than zero, otherwise 0.0
    """
    weight = super().__call__(context)
    is_possible = weight > 0
    return float(is_possible)

__init__(cfg, alg='earley')

Initialize a BoolCFGLM.

Parameters:

Name Type Description Default
cfg CFG

The context-free grammar to use as the language model

required
alg str

Parsing algorithm to use - either 'earley' or 'cky'

'earley'

Raises:

Type Description
ValueError

If alg is not 'earley' or 'cky'

Source code in genlm/grammar/cfglm.py
def __init__(self, cfg, alg="earley"):
    """Initialize a BoolCFGLM.

    Args:
        cfg (CFG): The context-free grammar to use as the language model
        alg (str): Parsing algorithm to use - either 'earley' or 'cky'

    Raises:
        ValueError: If alg is not 'earley' or 'cky'
    """
    # Fail fast: reject an unknown algorithm before doing any of the
    # grammar transformations below.
    if alg not in ("earley", "cky"):
        raise ValueError(f"unrecognized option {alg}")

    # Make sure the grammar's vocabulary contains the EOS symbol.
    if EOS not in cfg.V:
        cfg = add_EOS(cfg, eos=EOS)
    # Map weights into the Boolean semiring: strictly positive -> True.
    if cfg.R != Boolean:
        cfg = cfg.map_values(lambda x: Boolean(x > 0), Boolean)

    # The parser modules are imported lazily, only for the chosen algorithm.
    if alg == "earley":
        from genlm.grammar.parse.earley import Earley

        self.model = Earley(cfg.prefix_grammar)
    else:  # alg == "cky", guaranteed by the validation above
        from genlm.grammar.parse.cky import CKYLM

        self.model = CKYLM(cfg)
    super().__init__(eos=EOS, V=cfg.V)

clear_cache()

Clear any cached computations.

Source code in genlm/grammar/cfglm.py
def clear_cache(self):
    """Drop cached computations by delegating to the underlying parser model."""
    parser = self.model
    parser.clear_cache()

from_string(x, semiring=Boolean, **kwargs) classmethod

Create a BoolCFGLM from a string representation of a grammar.

Parameters:

Name Type Description Default
x str

The grammar string

required
semiring

The semiring for weights (default: Boolean)

Boolean
**kwargs

Additional arguments passed to __init__

{}

Returns:

Type Description
BoolCFGLM

A new language model

Source code in genlm/grammar/cfglm.py
@classmethod
def from_string(cls, x, semiring=Boolean, **kwargs):
    """Build a BoolCFGLM from the textual description of a grammar.

    Args:
        x (str): The grammar string
        semiring: The semiring for weights (default: Boolean)
        **kwargs: Additional arguments passed to __init__

    Returns:
        (BoolCFGLM): A new language model
    """
    grammar = CFG.from_string(x, semiring)
    return cls(grammar, **kwargs)

p_next(context)

Compute next token probabilities given a context.

Parameters:

Name Type Description Default
context sequence

The conditioning context

required

Returns:

Type Description
chart

The next token weights

Raises:

Type Description
AssertionError

If context contains out-of-vocabulary tokens

Source code in genlm/grammar/cfglm.py
def p_next(self, context):
    """Compute the next-token weights that follow a context.

    Every token present in the parser's trimmed next-token weights is given
    weight 1 in the returned chart.

    Args:
        context (sequence): The conditioning context

    Returns:
        (Float.chart): The next token weights

    Raises:
        AssertionError: If context contains out-of-vocabulary tokens
    """
    assert set(context) <= self.V, f"OOVs detected: {set(context) - self.V}"
    parser = self.model
    chart = parser.chart(context)
    allowed = parser.next_token_weights(chart).trim()
    return Float.chart({token: 1 for token in allowed})

add_EOS(cfg, eos=None)

Add an end-of-sequence symbol to a CFG's language.

Transforms the grammar to append the EOS symbol to every string it generates.

Parameters:

Name Type Description Default
cfg CFG

The input grammar

required
eos optional

The end-of-sequence symbol to add. Defaults to ▪.

None

Returns:

Type Description
CFG

A new grammar that generates strings ending in EOS

Raises:

Type Description
AssertionError

If EOS is already in the grammar's vocabulary

Source code in genlm/grammar/cfglm.py
def add_EOS(cfg, eos=None):
    """Add an end-of-sequence symbol to a CFG's language.

    Transforms the grammar to append the EOS symbol to every string it generates.

    Args:
        cfg (CFG): The input grammar
        eos (optional): The end-of-sequence symbol to add. When None, the
            module-level ``EOS`` symbol (documented as ▪) is used.

    Returns:
        (CFG): A new grammar that generates strings ending in EOS

    Raises:
        AssertionError: If EOS is already in the grammar's vocabulary

    """
    # Fresh start symbol for the wrapped grammar.
    S = _gen_nt("<START>")
    new = cfg.spawn(S=S)
    # Compare against None explicitly so that a falsy but deliberate symbol
    # (e.g. 0 or "") is not silently replaced by the default.
    if eos is None:
        eos = EOS
    assert eos not in cfg.V
    new.V.add(eos)
    # New top-level rule: <START> -> (old start symbol) eos, with unit weight.
    new.add(cfg.R.one, S, cfg.S, eos)
    # Carry over every original rule unchanged.
    for r in cfg:
        new.add(r.w, r.head, *r.body)
    return new

locally_normalize(self, **kwargs)

Locally normalize the grammar's rule weights.

Returns a transformed grammar where: (1) the total weight of rules with the same head symbol sums to one, and (2) each derivation's weight is proportional to its weight in the original grammar (it differs only by a multiplicative normalization constant).

Parameters:

Name Type Description Default
**kwargs

Additional arguments passed to self.agenda()

{}

Returns:

Type Description
CFG

A new grammar with locally normalized weights

Source code in genlm/grammar/cfglm.py
def locally_normalize(self, **kwargs):
    """Produce a copy of the grammar with locally normalized rule weights.

    In the returned grammar:
    1. The total weight of rules with the same head symbol sums to one
    2. Each derivation's weight is proportional to the original grammar
       (differs only by a multiplicative normalization constant)

    Rules whose head has a total of zero under ``self.agenda()`` are dropped.

    Args:
        **kwargs: Additional arguments passed to self.agenda()

    Returns:
        (CFG): A new grammar with locally normalized weights
    """
    normalized = self.spawn()
    totals = self.agenda(**kwargs)
    for rule in self:
        head_total = totals[rule.head]
        if head_total == 0:
            # Nothing to normalize against; leave this rule out.
            continue
        scaled = rule.w * totals.product(rule.body)
        normalized.add(scaled / head_total, rule.head, *rule.body)
    return normalized