Saga (p.saga.1.0.0~alpha1.doc.saga.Saga)

Library Overview

Saga consists of four main components:

Tokenization: Fast tokenization with BPE, WordPiece, and custom methods
io: Efficient file I/O utilities for large text corpora
lm: High-level statistical language models (n-grams)
sampling: Advanced text generation with composable processors

All components work together seamlessly but can be used independently.

Quick Start

Simple tokenization and text processing

  open Saga

  (* Basic word tokenization *)
  let tokens = tokenize "Hello, world! How are you?"
  (* Returns: ["Hello"; ","; "world"; "!"; "How"; "are"; "you"; "?"] *)

  (* Character-level tokenization *)
  let chars = tokenize ~method_:`Chars "Hello"
  (* Returns: ["H"; "e"; "l"; "l"; "o"] *)

  (* Batch processing with padding *)
  let batch_ids = encode_batch [ "Hello world"; "Hi there" ] ~pad:true
  (* Returns: padded tensor of token IDs *)

Training a language model

  (* Load training data *)
  let texts = read_lines "training_data.txt"

  (* Create and train a bigram model *)
  let model =
    LM.ngram ~n:2 ~tokenizer:(tokenizer `Words) () |> LM.train texts

  (* Generate new text *)
  let generated = LM.generate model ~num_tokens:20 ~temperature:0.8 ()
  let () = Printf.printf "Generated: %s\n" generated

  (* Evaluate on test data *)
  let test_perplexity = LM.perplexity model "the quick brown fox"
  let () = Printf.printf "Test perplexity: %.2f\n" test_perplexity

Advanced text generation

  (* Create a model function (typically a neural network) *)
  let model_fn token_ids =
    (* Your neural network forward pass *)
    Array.make 50000 0.0 (* Example: uniform logits *)

  (* Configure generation with custom processors *)
  let config =
    Sampler.default
    |> Sampler.with_temperature 0.9
    |> Sampler.with_top_k 40
    |> Sampler.with_repetition_penalty 1.1

  (* Generate with fine-grained control *)
  let result =
    Sampler.generate_text ~model:model_fn
      ~tokenizer:(encode ~vocab:(vocab [ "hello"; "world" ]))
      ~decoder:(decode (vocab [ "hello"; "world" ]))
      ~prompt:"Hello" ~generation_config:config ()

Common Patterns

Text preprocessing pipeline

  let preprocess_texts texts =
    texts
    |> List.map (normalize ~lowercase:true ~collapse_whitespace:true)
    |> List.filter (fun s -> String.length s > 10) (* Filter short texts *)
    |> List.map (tokenize ~method_:`Words)

Model comparison and evaluation

  let compare_models texts test_texts =
    let models =
      [
        ("unigram", LM.ngram ~n:1 ());
        ("bigram", LM.ngram ~n:2 ());
        ("trigram", LM.ngram ~n:3 ~smoothing:0.1 ());
      ]
    in
    List.map
      (fun (name, model) ->
        let trained = LM.train model texts in
        let avg_perp =
          List.map (LM.perplexity trained) test_texts
          |> List.fold_left ( +. ) 0.
          |> fun sum -> sum /. float_of_int (List.length test_texts)
        in
        (name, avg_perp))
      models

Custom tokenization workflows

  let create_code_tokenizer () =
    tokenizer (`Regex {|[a-zA-Z_][a-zA-Z0-9_]*|[0-9]+|[(){}[\].,;]|\S|})
    |> Tokenizer.with_normalizer (normalize ~collapse_whitespace:true)

  let process_code_files filenames =
    filenames |> List.map read_lines |> List.flatten
    |> List.map (Tokenizer.run (create_code_tokenizer ()))

Performance Tips

Use read_lines_lazy for very large files to avoid memory issues
Character-level models work well for small vocabularies (names, short sequences)
Word-level models are better for natural language with large vocabularies
Higher n-gram orders need exponentially more training data
BPE and WordPiece tokenizers handle out-of-vocabulary words better than simple word splitting

Integration with Other Libraries

Saga integrates well with:

Base/Core for functional programming utilities
Nx for tensor operations and neural networks
Dune for build system integration
Standard CSV/JSON libraries for data loading

Tokenization

Fast and flexible tokenization supporting multiple algorithms and custom patterns. Handles everything from simple word splitting to advanced subword tokenization.

Quick Start

  open Saga.Tokenizers
  (* Create a character-level tokenizer *)
  let tokenizer = Tokenizer.create ~model:(Models.chars ()) in
  (* Add special tokens *)
  Tokenizer.add_special_tokens tokenizer [Added_token.create ~content:"." ~special:true ()];
  (* Train on data *)
  Tokenizer.train_from_iterator tokenizer (Seq.of_list names) ~trainer:(Trainers.chars ()) ();
  (* Encode with options *)
  let encoding = Tokenizer.encode tokenizer ~sequence:"hello world" ~add_special_tokens:true () in
  (* Get ids and decode *)
  let ids = Encoding.ids encoding in
  let text = Tokenizer.decode tokenizer ids ~skip_special_tokens:true;

Key Concepts

Tokenizer.t: The main tokenizer instance, configurable with model, normalizer, etc.
Models.t: Core tokenization algorithm (e.g., Chars, BPE).
Encoding.t: Result of encoding, with ids, tokens, offsets, masks, etc., exposed as a record.
Special tokens: Handled via add_special_tokens and encoding options.
All functions handle Unicode correctly via the Unicode module. This API aligns with Hugging Face Tokenizers v0.21 (as of 2025), including support for fast Rust-backed operations where applicable.

Sourcemodule Either : sig ... end

Either type for API compatibility.

Sourcemodule Unicode = Saga_tokenizers.Unicode

Unicode utilities.

Sourcemodule Models = Saga_tokenizers.Models

Sourcemodule Normalizers = Saga_tokenizers.Normalizers

Sourcemodule Pre_tokenizers = Saga_tokenizers.Pre_tokenizers

Sourcemodule Processors = Saga_tokenizers.Processors

Sourcemodule Decoders = Saga_tokenizers.Decoders

Sourcemodule Trainers = Saga_tokenizers.Trainers

Sourcemodule Encoding = Saga_tokenizers.Encoding

Sourcemodule Bpe = Saga_tokenizers.Bpe

Sourcemodule Wordpiece = Saga_tokenizers.Wordpiece

Enums as Polymorphic Variants

Sourcetype direction = [

| `Left
| `Right

]

Padding or truncation direction.

Sourcetype split_delimiter_behavior = [

| `Removed
| `Isolated
| `Merged_with_previous
| `Merged_with_next
| `Contiguous

]

Behavior for splitting delimiters.

Sourcetype strategy = [

| `Longest_first
| `Only_first
| `Only_second

]

Truncation strategy.

Sourcetype prepend_scheme = [

| `Always
| `Never
| `First

]

Prepend scheme for metaspace.

Core Types

Sourcemodule Added_token : sig ... end

Sourcemodule Tokenizer : sig ... end

File I/O

Efficient file I/O utilities optimized for large text corpora and ML workflows.

Sourceval read_lines : ?buffer_size:int -> string -> string list

read_lines ?buffer_size filename efficiently reads all lines from a file.

parameter buffer_size
Size of the read buffer in bytes (default: 65536)

returns
List of lines without trailing newlines

raises Sys_error
if file cannot be opened or read

Features:

Efficient buffered reading for large files
Automatic resource cleanup on errors
Windows/Unix line ending compatibility
Memory-efficient for files with many lines

Sourceval read_lines_lazy : ?buffer_size:int -> string -> string Seq.t

read_lines_lazy ?buffer_size filename returns a lazy sequence of lines.

parameter buffer_size
Size of the read buffer in bytes (default: 65536)

returns
Lazy sequence of lines that are read on-demand

raises Sys_error
if file cannot be opened

Use this for very large files to avoid loading everything into memory. The file is automatically closed when the sequence is fully consumed or when an error occurs.

Sourceval write_lines : ?append:bool -> string -> string list -> unit

write_lines ?append filename lines writes lines to a file.

parameter append
If true, append to existing file (default: false)

parameter filename
Target file path

parameter lines
List of lines to write (newlines are added automatically)

raises Sys_error
if file cannot be written

Language Models

High-level statistical language models with simple training and generation APIs.

Overview

The workflow follows three main steps:

1. Create a model with ngram
2. Train it on text data with train
3. Generate new text with generate, or evaluate with perplexity

All models are immutable - training returns a new model instance.

Quick Start

Train a bigram model on names and generate new ones:

  let names = [ "alice"; "bob"; "charlie"; "diana"; "eve" ]
  let model = Saga.ngram ~n:2 ~tokenizer:(Saga.tokenizer `Chars) ()
  let trained_model = Saga.train model names
  let new_name = Saga.generate trained_model ~num_tokens:10 ()
  (* Returns: "alicia" or similar character-level generation *)

Word-level model with custom settings:

  let texts = [ "the cat sat"; "the dog ran"; "the cat ran" ]

  let model =
    Saga.ngram ~n:3 ~smoothing:0.05 ~min_freq:2
      ~tokenizer:(Saga.tokenizer `Words) ()
    |> Saga.train texts

  let story =
    Saga.generate model ~prompt:"the cat" ~num_tokens:20 ~temperature:0.9 ()

Key Concepts

N-grams

N-gram models predict the next token based on the previous n-1 tokens. Higher n captures more context but requires more training data:

n=1 (unigram): No context, just token frequencies
n=2 (bigram): Depends on 1 previous token
n=3 (trigram): Depends on 2 previous tokens
n=4,5: Higher context, needs lots of data

Smoothing

Smoothing handles unseen token sequences:

Add-k smoothing: Adds small count to all n-grams (default: 0.01)
Higher values = more uniform distribution
Lower values = sharper predictions

Tokenization

Models work on token sequences. Built-in tokenizers:

`Words: Split on whitespace and punctuation
`Chars: Unicode character-level
Custom tokenizers for domain-specific needs

Advanced Usage

Save and load trained models:

  Saga.save trained_model "my_model.bin"

  let loaded_model = Saga.load "my_model.bin"

Evaluate model quality:

  let test_texts = [ "the quick brown fox" ]
  let perplexity = Saga.perplexity trained_model (List.hd test_texts)
  (* Lower perplexity = better model fit *)

Batch evaluation and generation:

  let samples =
    Saga.pipeline model training_texts ~num_samples:50 ~temperature:1.2 ()

  let () =
    List.iter
      (fun (text, perp) ->
        Printf.printf "%s (perplexity: %.2f)\n" text perp)
      samples

Core Types

Sourcetype model

Opaque language model that can be trained and used for generation.

Models are immutable - training operations return new model instances. Supports n-gram models (n=1-5) with potential for extension to other statistical models like Markov chains or probabilistic context-free grammars.

Training

Sourceval train : model -> string list -> model

train model texts trains the model on a list of text strings.

The training process:

1. Tokenizes each text using the model's tokenizer
2. Automatically adds BOS/EOS tokens (tokenizer-specific)
3. Builds or updates the vocabulary
4. Fits the statistical backend (n-gram counts, etc.)

BOS/EOS tokens are tokenizer-aware:

Character tokenizer: Uses "." as both BOS and EOS
Word tokenizer: Uses "<bos>" and "<eos>"
Custom tokenizers: Uses configured special tokens

Returns a new trained model instance (original is unchanged).

  let untrained = Saga.ngram ~n:2 ~tokenizer:(Saga.tokenizer `Words) ()
  let trained = Saga.train untrained [ "hello world"; "world peace" ]
  (* trained model now knows bigrams: <bos>+hello, hello+world, world+<eos>,
     etc. *)

Text Generation

Source

val generate : 
  model ->
  ?num_tokens:int ->
  ?temperature:float ->
  ?top_k:int ->
  ?top_p:float ->
  ?seed:int ->
  ?min_new_tokens:int ->
  ?prompt:string ->
  unit ->
  string

generate model ?num_tokens ?temperature ?top_k ?top_p ?seed ?min_new_tokens ?prompt () generates text from the trained model.

Generation continues until:

Maximum tokens reached (num_tokens)
EOS token generated (unless blocked by min_new_tokens)
Model produces invalid continuation

parameter num_tokens
Maximum tokens to generate (default: 20)

parameter temperature
Sampling randomness: 0.1 = conservative, 2.0 = very random (default: 1.0)

parameter top_k
Keep only top-k most likely tokens, 0 = disabled (default: None)

parameter top_p
Nucleus sampling: keep tokens with cumulative probability ≤ p (default: None)

parameter seed
Random seed for reproducible generation (default: None = random)

parameter min_new_tokens
Block EOS tokens until at least this many tokens generated (default: None)

parameter prompt
Initial text prompt, tokenizer-specific (default: empty = auto BOS)

returns
Generated text as clean, decoded string

The generated text is automatically cleaned:

BOS/EOS tokens removed
Tokenizer-specific post-processing applied
Invalid unicode sequences handled gracefully

Examples:

  (* Conservative, deterministic generation *)
  let result = Saga.generate model ~temperature:0.1 ~num_tokens:10 ()

  (* Creative generation with nucleus sampling *)
  let story =
    Saga.generate model ~temperature:1.2 ~top_p:0.9 ~num_tokens:50 ()

  (* Prompted generation *)
  let completion =
    Saga.generate model ~prompt:"Once upon a time" ~num_tokens:30 ()

  (* Ensure minimum length *)
  let long_text = Saga.generate model ~min_new_tokens:100 ~num_tokens:200 ()

Source

val generate_ids : 
  model ->
  ?num_tokens:int ->
  ?temperature:float ->
  ?top_k:int ->
  ?top_p:float ->
  ?seed:int ->
  ?min_new_tokens:int ->
  ?prompt:string ->
  unit ->
  int list

Same as generate but returns generated token IDs (in the LM's internal vocabulary).

Sourceval decode_ids : model -> int list -> string list

decode_ids model ids converts the LM's token IDs to their string tokens. This is useful to control how to join tokens (e.g., with spaces for word-level models).

Model Evaluation

Sourceval score : model -> string -> float

score model text computes the log-probability of the text sequence under the model.

Returns the sum of log-probabilities for each token in the sequence. More negative values indicate lower probability (worse fit).

The text is automatically tokenized and BOS/EOS tokens added as needed. For unseen n-grams, uses the model's smoothing strategy.

To convert to perplexity manually: exp(-score / token_count)

  let score = Saga.score model "hello world"
  (* Returns: -5.234 (example log-probability; always <= 0) *)

  let manual_perplexity =
    let tokens = (* tokenize "hello world" *) 3 in
    exp (-.score /. float_of_int tokens)

Sourceval perplexity : model -> string -> float

perplexity model text computes perplexity, a standard metric for language model quality.

Perplexity = exp(-average_log_probability)

Lower values indicate better model fit
Perplexity ≈ average branching factor at each step
Perfect prediction gives perplexity = 1.0
Random guessing gives perplexity = vocabulary_size

Text is automatically wrapped with BOS/EOS tokens for proper evaluation.

  let perp1 = Saga.perplexity model "the cat sat"  (* Returns: 12.5 *)
  let perp2 = Saga.perplexity model "xyz qwerty"   (* Returns: 45.2 (worse fit) *)

  (* Lower perplexity = better model *)
  assert (perp1 < perp2)

Sourceval perplexities : model -> string list -> float list

perplexities model texts computes perplexity for multiple texts efficiently.

Equivalent to List.map (perplexity model) texts but may be optimized for batch processing in the future.

  let test_set = [ "the cat sat"; "dogs run fast"; "hello world" ]
  let perps = Saga.perplexities model test_set

  let avg_perp =
    List.fold_left ( +. ) 0. perps /. float_of_int (List.length perps)

  let () = Printf.printf "Average test perplexity: %.2f\n" avg_perp

Model Creation

Source

val ngram : 
  n:int ->
  ?smoothing:float ->
  ?min_freq:int ->
  ?specials:string list ->
  ?tokenizer:Saga_tokenizers.Tokenizer.t ->
  unit ->
  model

ngram ~n ?smoothing ?min_freq ?specials ?tokenizer () creates an n-gram language model.

N-gram models predict tokens based on the previous n-1 tokens. This implementation uses Maximum Likelihood Estimation (MLE) with add-k smoothing, similar to NLTK's approach.

parameter n
Order of the n-gram model (1-5 supported, higher orders possible via custom backends)

parameter smoothing
Add-k smoothing parameter (default: 0.01). Higher values create more uniform distributions

parameter min_freq
Minimum frequency threshold for n-gram inclusion (default: 1)

parameter specials
Special tokens for BOS/EOS, auto-detected from tokenizer if omitted

parameter tokenizer
Tokenizer to use, auto-inferred if omitted (characters for short texts)

Special tokens are tokenizer-aware:

Character tokenizer: ".", "." (period for both BOS and EOS)
Word tokenizer: "<bos>", "<eos>" (explicit start/end markers)
Custom tokenizers: Use provided specials or tokenizer defaults

The returned model is untrained - use train to fit it on data.

Usage Examples

NLTK-style workflow:

  let model =
    Saga.ngram ~n:2 ~tokenizer:(Saga.tokenizer `Chars) ~min_freq:2 ()

  let trained_model = Saga.train model names

  let generated =
    Saga.generate trained_model ~num_tokens:15 ~temperature:0.8 ()

  let quality = Saga.perplexity trained_model "emma" (* Lower = better *)

Fluent chaining style:

  let model = Saga.ngram ~n:2 ~smoothing:0.01 () |> Saga.train names

  let generated_name = Saga.generate model ~num_tokens:20 ~temperature:1.0 ()
  let () = Printf.printf "Generated: %s\n" generated_name

Custom configuration:

  let model =
    Saga.ngram ~n:3 ~smoothing:0.05 ~min_freq:3
      ~specials:[ "<start>"; "<end>" ]
      ~tokenizer:(Saga.tokenizer (`Regex {|\w+|[.,!?]|}))
      ()

Model Persistence

Sourceval save : model -> string -> unit

save model filename saves the trained model to a binary file.

Serializes the complete model state including:

Vocabulary mappings
N-gram counts and statistics
Tokenizer configuration
Model hyperparameters

Use load to restore the model later.

  Saga.save trained_model "my_language_model.bin"

Sourceval load : string -> model

load filename loads a previously saved model from disk.

The loaded model is immediately ready for generation and evaluation without requiring retraining.

raises Sys_error
if file doesn't exist or is corrupted

raises Invalid_argument
if file format is incompatible

  let model = Saga.load "my_language_model.bin"
  let text = Saga.generate model ~num_tokens:50 ()

Convenience Functions

Source

val pipeline : 
  model ->
  string list ->
  ?num_samples:int ->
  ?temperature:float ->
  ?top_k:int ->
  ?top_p:float ->
  ?seed:int ->
  unit ->
  (string * float) list

pipeline model texts ?num_samples ?temperature ?top_k ?top_p ?seed () is a convenience function that trains a model and generates samples with their perplexities.

This function:

1. Trains the model on the provided texts
2. Generates num_samples text samples using the trained model
3. Computes perplexity for each generated sample
4. Returns (generated_text, perplexity) pairs

Useful for quick experimentation and model evaluation.

parameter model
Untrained or partially trained model

parameter texts
Training texts

parameter num_samples
Number of samples to generate (default: 20)

parameter temperature,top_k,top_p,seed
Generation parameters (same as generate)

returns
List of (generated_text, perplexity) pairs, sorted by perplexity (best first)

Usage Examples

Quick model evaluation:

  let names = Saga.IO.read_lines "names.txt"
  let model = Saga.ngram ~n:2 ()

  let samples =
    Saga.pipeline model names ~num_samples:20 ~temperature:1.0 ()

  (* Print best samples (lowest perplexity) *)
  let () =
    List.iter
      (fun (name, perp) ->
        Printf.printf "%s (perplexity: %.2f)\n" name perp)
      (List.take 5 samples)

Parameter exploration:

  let compare_temperatures temps texts =
    List.map (fun temp ->
      let samples =
        Saga.pipeline model texts ~temperature:temp ~num_samples:10 ()
      in
      let avg_perp =
        List.fold_left (fun acc (_, p) -> acc +. p) 0. samples /. 10.
      in
      (temp, avg_perp)
    ) temps

Advanced Text Generation

Modern text generation with composable processors and fine-grained control, designed for integration with neural language models.

Sourcemodule Sampler : sig ... end

Advanced text generation and sampling utilities.

Examples

Quick tokenization

  open Saga

  (* Simple char tokenization *)
  let tok = Tokenizer.create ~model:(Models.chars ())
  let enc = Tokenizer.encode tok ~sequence:(Either.Left "Hello world!") ()
  let ids = Encoding.get_ids enc
  let text = Tokenizer.decode tok (Array.to_list ids) ()

  (* BPE tokenization *)
  let tok = tokenizer (`BPE ("vocab.json", "merges.txt"))
  let batch = encode_batch tok [ "Hello"; "World" ] ~padding:true

Training a language model

  (* Train a bigram model *)
  let texts = [ "The cat sat"; "The dog ran"; "The cat ran" ]
  let tok = tokenizer `Words
  let model = LM.train_ngram ~n:2 tok texts

  (* Generate text *)
  let generated = LM.generate model ~max_tokens:50 ~temperature:0.8 tok
  let () = print_endline generated

Custom tokenizer

  (* Tokenizer with normalization *)
  let tok =
    tokenizer `Words
    |> Tokenizer.with_normalizer (normalize ~lowercase:true)

  (* Regex tokenizer for code *)
  let code_tok = tokenizer (`Regex {|[a-zA-Z_][a-zA-Z0-9_]*|[0-9]+|.|})

End-to-End Name Generator

  let build_name_generator training_file =
    (* Load and train character-level model *)
    let names = read_lines training_file in
    let model =
      LM.ngram ~n:3 ~tokenizer:(tokenizer `Chars) ~smoothing:0.1 ()
      |> LM.train names in

    (* Return generator function *)
    fun ?(temperature=0.8) ?(max_len=12) () ->
      LM.generate model ~num_tokens:max_len ~temperature ()

  (* Generate 10 new names *)
  let gen = build_name_generator "names.txt" in
  List.init 10 (fun _ -> gen ()) |> List.iter (Printf.printf "%s\n")

Advanced Neural Model Integration

  let setup_neural_generation neural_model vocab_file =
    let vocab = vocab_load vocab_file in
    let tokenize_fn text = encode ~vocab text in
    let decode_fn ids = decode vocab ids in

    (* Wrap neural model for Sampler API *)
    let model_fn token_ids =
      let tensor = Nx.of_array1 (Array.of_list token_ids) in
      let logits = neural_model tensor in
      Nx.to_array1 logits
    in

    (* Creative writing configuration *)
    let config =
      Sampler.creative_writing
      |> Sampler.with_max_new_tokens 200
      |> Sampler.with_repetition_penalty 1.15
    in

    let processors =
      [
        Sampler.temperature_warper ~temperature:1.1;
        Sampler.top_p_warper ~p:0.9;
        Sampler.no_repeat_ngram ~ngram_size:3;
      ]
    in

    (* Generate with stopping criteria *)
    Sampler.generate_text ~model:model_fn ~tokenizer:tokenize_fn
      ~decoder:decode_fn ~generation_config:config
      ~logits_processor:processors

Batch Processing Pipeline

  let process_corpus_directory input_dir output_file =
    (* Custom preprocessing pipeline *)
    let preprocess text =
      text |> normalize ~lowercase:true ~collapse_whitespace:true
      |> fun s -> if String.length s > 20 then Some s else None
    in

    Sys.readdir input_dir |> Array.to_list
    |> List.filter (fun f -> Filename.extension f = ".txt")
    |> List.map (fun f -> Filename.concat input_dir f)
    |> List.map read_lines_lazy |> List.to_seq |> Seq.concat
    |> Seq.filter_map preprocess
    |> List.of_seq |> write_lines output_file