Module Saga

Saga - Fast tokenization and text processing for ML in OCaml.

Saga is a comprehensive text processing library for machine learning applications, providing fast tokenization, statistical language models, and modern text generation capabilities. It combines simplicity for common use cases with flexibility for advanced workflows.

Library Overview

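Saga consists of four main components:

  • Tokenization: fast, flexible tokenizers (see the Tokenization section)
  • File I/O: utilities for reading and writing text corpora (see the File I/O section)
  • Language Models: statistical n-gram models with training and generation (see the Language Models section)
  • Advanced Text Generation: composable sampling via the Sampler module (see the Advanced Text Generation section)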

All components work together seamlessly but can be used independently.

Quick Start

Simple tokenization and text processing

  open Saga

  (* Basic word tokenization *)
  let tokens = tokenize "Hello, world! How are you?"
  (* Returns: ["Hello"; ","; "world"; "!"; "How"; "are"; "you"; "?"] *)

  (* Character-level tokenization *)
  let chars = tokenize ~method_:`Chars "Hello"
  (* Returns: ["H"; "e"; "l"; "l"; "o"] *)

  (* Batch processing with padding *)
  let batch_ids = encode_batch [ "Hello world"; "Hi there" ] ~pad:true
  (* Returns: padded tensor of token IDs *)

Training a language model

  (* Load training data *)
  let texts = read_lines "training_data.txt"

  (* Create and train a bigram model *)
  let model =
    LM.ngram ~n:2 ~tokenizer:(tokenizer `Words) () |> LM.train texts

  (* Generate new text *)
  let generated = LM.generate model ~num_tokens:20 ~temperature:0.8 ()
  let () = Printf.printf "Generated: %s\n" generated

  (* Evaluate on test data *)
  let test_perplexity = LM.perplexity model "the quick brown fox"
  let () = Printf.printf "Test perplexity: %.2f\n" test_perplexity

Advanced text generation

  (* Create a model function (typically a neural network) *)
  let model_fn token_ids =
    (* Your neural network forward pass *)
    Array.make 50000 0.0 (* Example: uniform logits *)

  (* Configure generation with custom processors *)
  let config =
    Sampler.default
    |> Sampler.with_temperature 0.9
    |> Sampler.with_top_k 40
    |> Sampler.with_repetition_penalty 1.1

  (* Generate with fine-grained control *)
  let result =
    Sampler.generate_text ~model:model_fn
      ~tokenizer:(encode ~vocab:(vocab [ "hello"; "world" ]))
      ~decoder:(decode (vocab [ "hello"; "world" ]))
      ~prompt:"Hello" ~generation_config:config ()

Common Patterns

Text preprocessing pipeline

  let preprocess_texts texts =
    texts
    |> List.map (normalize ~lowercase:true ~collapse_whitespace:true)
    |> List.filter (fun s -> String.length s > 10) (* Filter short texts *)
    |> List.map (tokenize ~method_:`Words)

Model comparison and evaluation

  let compare_models texts test_texts =
    let models =
      [
        ("unigram", LM.ngram ~n:1 ());
        ("bigram", LM.ngram ~n:2 ());
        ("trigram", LM.ngram ~n:3 ~smoothing:0.1 ());
      ]
    in
    List.map
      (fun (name, model) ->
        let trained = LM.train model texts in
        let avg_perp =
          List.map (LM.perplexity trained) test_texts
          |> List.fold_left ( +. ) 0.
          |> fun sum -> sum /. float_of_int (List.length test_texts)
        in
        (name, avg_perp))
      models

Custom tokenization workflows

  let create_code_tokenizer () =
    tokenizer (`Regex {|[a-zA-Z_][a-zA-Z0-9_]*|[0-9]+|[(){}[\].,;]|\S|})
    |> Tokenizer.with_normalizer (normalize ~collapse_whitespace:true)

  let process_code_files filenames =
    filenames |> List.map read_lines |> List.flatten
    |> List.map (Tokenizer.run (create_code_tokenizer ()))

Performance Tips

Integration with Other Libraries

Saga integrates well with:

Tokenization

Fast and flexible tokenization supporting multiple algorithms and custom patterns. Handles everything from simple word splitting to advanced subword tokenization.

Quick Start

  open Saga.Tokenizers
  (* Create a character-level tokenizer *)
  let tokenizer = Tokenizer.create ~model:(Model.chars ()) in
  (* Add special tokens *)
  Tokenizer.add_special_tokens tokenizer [Added_token.create ~content:"." ~special:true ()];
  (* Train on data *)
  Tokenizer.train_from_iterator tokenizer (Seq.of_list names) ~trainer:(Trainer.chars ()) ();
  (* Encode with options *)
  let encoding = Tokenizer.encode tokenizer ~sequence:"hello world" ~add_special_tokens:true () in
  (* Get ids and decode *)
  let ids = Encoding.ids encoding in
  let text = Tokenizer.decode tokenizer ids ~skip_special_tokens:true in
  text

Key Concepts

Sourcemodule Either : sig ... end

Either type for API compatibility.

Unicode utilities.

Sourcemodule Pre_tokenizers = Saga_tokenizers.Pre_tokenizers

Enums as Polymorphic Variants

type direction = [
  | `Left
  | `Right
]

Padding or truncation direction.

type split_delimiter_behavior = [
  | `Removed
  | `Isolated
  | `Merged_with_previous
  | `Merged_with_next
  | `Contiguous
]

Behavior for splitting delimiters.

type strategy = [
  | `Longest_first
  | `Only_first
  | `Only_second
]

Truncation strategy.

type prepend_scheme = [
  | `Always
  | `Never
  | `First
]

Prepend scheme for metaspace.

Core Types

Sourcemodule Added_token : sig ... end
Sourcemodule Tokenizer : sig ... end

File I/O

Efficient file I/O utilities optimized for large text corpora and ML workflows.

Sourceval read_lines : ?buffer_size:int -> string -> string list

read_lines ?buffer_size filename efficiently reads all lines from a file.

  • parameter buffer_size

    Size of the read buffer in bytes (default: 65536)

  • returns

    List of lines without trailing newlines

  • raises Sys_error

    if file cannot be opened or read

Features:

  • Efficient buffered reading for large files
  • Automatic resource cleanup on errors
  • Windows/Unix line ending compatibility
  • Memory-efficient for files with many lines
Sourceval read_lines_lazy : ?buffer_size:int -> string -> string Seq.t

read_lines_lazy ?buffer_size filename returns a lazy sequence of lines.

  • parameter buffer_size

    Size of the read buffer in bytes (default: 65536)

  • returns

    Lazy sequence of lines that are read on-demand

Use this for very large files to avoid loading everything into memory. The file is automatically closed when the sequence is fully consumed or when an error occurs.

Sourceval write_lines : ?append:bool -> string -> string list -> unit

write_lines ?append filename lines writes lines to a file.

  • parameter append

    If true, append to existing file (default: false)

  • parameter filename

    Target file path

  • parameter lines

    List of lines to write (newlines are added automatically)

Language Models

High-level statistical language models with simple training and generation APIs.

Overview

The workflow follows three main steps: 1. Create a model with ngram 2. Train it on text data with train 3. Generate new text with generate or evaluate with perplexity

All models are immutable - training returns a new model instance.

Quick Start

Train a bigram model on names and generate new ones:

  let names = [ "alice"; "bob"; "charlie"; "diana"; "eve" ]
  let model = Saga.ngram ~n:2 ~tokenizer:(Saga.tokenizer `Chars) ()
  let trained_model = Saga.train model names
  let new_name = Saga.generate trained_model ~num_tokens:10 ()
  (* Returns: "alicia" or similar character-level generation *)

Word-level model with custom settings:

  let texts = [ "the cat sat"; "the dog ran"; "the cat ran" ]

  let model =
    Saga.ngram ~n:3 ~smoothing:0.05 ~min_freq:2
      ~tokenizer:(Saga.tokenizer `Words) ()
    |> Saga.train texts

  let story =
    Saga.generate model ~prompt:"the cat" ~num_tokens:20 ~temperature:0.9 ()

Key Concepts

N-grams

N-gram models predict the next token based on the previous n-1 tokens. Higher n captures more context but requires more training data:

Smoothing

Smoothing handles unseen token sequences:

Tokenization

Models work on token sequences. Built-in tokenizers:

Advanced Usage

Save and load trained models:

  Saga.save trained_model "my_model.bin"

  let loaded_model = Saga.load "my_model.bin"

Evaluate model quality:

  let test_texts = [ "the quick brown fox" ]
  let perplexity = Saga.perplexity trained_model (List.hd test_texts)
  (* Lower perplexity = better model fit *)

Batch evaluation and generation:

  let samples =
    Saga.pipeline model training_texts ~num_samples:50 ~temperature:1.2 ()

  let () =
    List.iter
      (fun (text, perp) ->
        Printf.printf "%s (perplexity: %.2f)\n" text perp)
      samples

Core Types

Sourcetype model

Opaque language model that can be trained and used for generation.

Models are immutable - training operations return new model instances. Supports n-gram models (n=1-5) with potential for extension to other statistical models like Markov chains or probabilistic context-free grammars.

Training

Sourceval train : model -> string list -> model

train model texts trains the model on a list of text strings.

The training process: 1. Tokenizes each text using the model's tokenizer 2. Automatically adds BOS/EOS tokens (tokenizer-specific) 3. Builds or updates the vocabulary 4. Fits the statistical backend (n-gram counts, etc.)

BOS/EOS tokens are tokenizer-aware:

  • Character tokenizer: Uses "." as both BOS and EOS
  • Word tokenizer: Uses "<bos>" and "<eos>"
  • Custom tokenizers: Uses configured special tokens

Returns a new trained model instance (original is unchanged).

  let untrained = Saga.ngram ~n:2 ~tokenizer:(Saga.tokenizer `Words) ()
  let trained = Saga.train untrained [ "hello world"; "world peace" ]
  (* trained model now knows bigrams: <bos>+hello, hello+world, world+<eos>,
     etc. *)

Text Generation

Sourceval generate : model -> ?num_tokens:int -> ?temperature:float -> ?top_k:int -> ?top_p:float -> ?seed:int -> ?min_new_tokens:int -> ?prompt:string -> unit -> string

generate model ?num_tokens ?temperature ?top_k ?top_p ?seed ?min_new_tokens ?prompt () generates text from the trained model.

Generation continues until:

  • Maximum tokens reached (num_tokens)
  • EOS token generated (unless blocked by min_new_tokens)
  • Model produces invalid continuation
  • parameter num_tokens

    Maximum tokens to generate (default: 20)

  • parameter temperature

    Sampling randomness: 0.1 = conservative, 2.0 = very random (default: 1.0)

  • parameter top_k

    Keep only top-k most likely tokens, 0 = disabled (default: None)

  • parameter top_p

    Nucleus sampling: keep tokens with cumulative probability ≤ p (default: None)

  • parameter seed

    Random seed for reproducible generation (default: None = random)

  • parameter min_new_tokens

    Block EOS tokens until at least this many tokens generated (default: None)

  • parameter prompt

    Initial text prompt, tokenizer-specific (default: empty = auto BOS)

  • returns

    Generated text as clean, decoded string

The generated text is automatically cleaned:

  • BOS/EOS tokens removed
  • Tokenizer-specific post-processing applied
  • Invalid unicode sequences handled gracefully

Examples:

  (* Conservative, deterministic generation *)
  let result = Saga.generate model ~temperature:0.1 ~num_tokens:10 ()

  (* Creative generation with nucleus sampling *)
  let story =
    Saga.generate model ~temperature:1.2 ~top_p:0.9 ~num_tokens:50 ()

  (* Prompted generation *)
  let completion =
    Saga.generate model ~prompt:"Once upon a time" ~num_tokens:30 ()

  (* Ensure minimum length *)
  let long_text = Saga.generate model ~min_new_tokens:100 ~num_tokens:200 ()
Sourceval generate_ids : model -> ?num_tokens:int -> ?temperature:float -> ?top_k:int -> ?top_p:float -> ?seed:int -> ?min_new_tokens:int -> ?prompt:string -> unit -> int list

Same as generate but returns generated token IDs (in the LM's internal vocabulary).

Sourceval decode_ids : model -> int list -> string list

decode_ids model ids converts the LM's token IDs to their string tokens. This is useful to control how to join tokens (e.g., with spaces for word-level models).

Model Evaluation

Sourceval score : model -> string -> float

score model text computes the log-probability of the text sequence under the model.

Returns the sum of log-probabilities for each token in the sequence. More negative values indicate lower probability (worse fit).

The text is automatically tokenized and BOS/EOS tokens added as needed. For unseen n-grams, uses the model's smoothing strategy.

To convert to perplexity manually: exp(-score / token_count)

  let score = Saga.score model "hello world"
  (* Returns: -5.234 (example log-probability; values are always <= 0) *)

  let manual_perplexity =
    let tokens = (* tokenize "hello world" *) 3 in
    exp (-.score /. float_of_int tokens)
Sourceval perplexity : model -> string -> float

perplexity model text computes perplexity, a standard metric for language model quality.

Perplexity = exp(-average_log_probability)

  • Lower values indicate better model fit
  • Perplexity ≈ average branching factor at each step
  • Perfect prediction gives perplexity = 1.0
  • Random guessing gives perplexity = vocabulary_size

Text is automatically wrapped with BOS/EOS tokens for proper evaluation.

  let perp1 = Saga.perplexity model "the cat sat"  (* Returns: 12.5 *)
  let perp2 = Saga.perplexity model "xyz qwerty"   (* Returns: 45.2 (worse fit) *)

  (* Lower perplexity = better model *)
  assert (perp1 < perp2)
Sourceval perplexities : model -> string list -> float list

perplexities model texts computes perplexity for multiple texts efficiently.

Equivalent to List.map (perplexity model) texts but may be optimized for batch processing in the future.

  let test_set = [ "the cat sat"; "dogs run fast"; "hello world" ]
  let perps = Saga.perplexities model test_set

  let avg_perp =
    List.fold_left ( +. ) 0. perps /. float_of_int (List.length perps)

  let () = Printf.printf "Average test perplexity: %.2f\n" avg_perp

Model Creation

Sourceval ngram : n:int -> ?smoothing:float -> ?min_freq:int -> ?specials:string list -> ?tokenizer:Saga_tokenizers.Tokenizer.t -> unit -> model

ngram ~n ?smoothing ?min_freq ?specials ?tokenizer () creates an n-gram language model.

N-gram models predict tokens based on the previous n-1 tokens. This implementation uses Maximum Likelihood Estimation (MLE) with add-k smoothing, similar to NLTK's approach.

  • parameter n

    Order of the n-gram model (1-5 supported, higher orders possible via custom backends)

  • parameter smoothing

    Add-k smoothing parameter (default: 0.01). Higher values create more uniform distributions

  • parameter min_freq

    Minimum frequency threshold for n-gram inclusion (default: 1)

  • parameter specials

    Special tokens for BOS/EOS, auto-detected from tokenizer if omitted

  • parameter tokenizer

    Tokenizer to use, auto-inferred if omitted (characters for short texts)

Special tokens are tokenizer-aware:

  • Character tokenizer: ".", "." (period for both BOS and EOS)
  • Word tokenizer: "<bos>", "<eos>" (explicit start/end markers)
  • Custom tokenizers: Use provided specials or tokenizer defaults

The returned model is untrained - use train to fit it on data.

Usage Examples

NLTK-style workflow:

  let model =
    Saga.ngram ~n:2 ~tokenizer:(Saga.tokenizer `Chars) ~min_freq:2 ()

  let trained_model = Saga.train model names

  let generated =
    Saga.generate trained_model ~num_tokens:15 ~temperature:0.8 ()

  let quality = Saga.perplexity trained_model "emma" (* Lower = better *)

Fluent chaining style:

  let model = Saga.ngram ~n:2 ~smoothing:0.01 () |> Saga.train names

  let generated_name = Saga.generate model ~num_tokens:20 ~temperature:1.0 ()
  let () = Printf.printf "Generated: %s\n" generated_name

Custom configuration:

  let model =
    Saga.ngram ~n:3 ~smoothing:0.05 ~min_freq:3
      ~specials:[ "<start>"; "<end>" ]
      ~tokenizer:(Saga.tokenizer (`Regex {|\w+|[.,!?]|}))
      ()

Model Persistence

Sourceval save : model -> string -> unit

save model filename saves the trained model to a binary file.

Serializes the complete model state including:

  • Vocabulary mappings
  • N-gram counts and statistics
  • Tokenizer configuration
  • Model hyperparameters

Use load to restore the model later.

  Saga.save trained_model "my_language_model.bin"
Sourceval load : string -> model

load filename loads a previously saved model from disk.

The loaded model is immediately ready for generation and evaluation without requiring retraining.

  • raises Sys_error

    if file doesn't exist or is corrupted

  let model = Saga.load "my_language_model.bin"
  let text = Saga.generate model ~num_tokens:50 ()

Convenience Functions

Sourceval pipeline : model -> string list -> ?num_samples:int -> ?temperature:float -> ?top_k:int -> ?top_p:float -> ?seed:int -> unit -> (string * float) list

pipeline model texts ?num_samples ?temperature ?top_k ?top_p ?seed () is a convenience function that trains a model and generates samples with their perplexities.

This function: 1. Trains the model on the provided texts 2. Generates num_samples text samples using the trained model 3. Computes perplexity for each generated sample 4. Returns (generated_text, perplexity) pairs

Useful for quick experimentation and model evaluation.

  • parameter model

    Untrained or partially trained model

  • parameter texts

    Training texts

  • parameter num_samples

    Number of samples to generate (default: 20)

  • parameter temperature,top_k,top_p,seed

    Generation parameters (same as generate)

  • returns

    List of (generated_text, perplexity) pairs, sorted by perplexity (best first)

Usage Examples

Quick model evaluation:

  let names = Saga.IO.read_lines "names.txt"
  let model = Saga.ngram ~n:2 ()

  let samples =
    Saga.pipeline model names ~num_samples:20 ~temperature:1.0 ()

  (* Print best samples (lowest perplexity) *)
  let () =
    List.iter
      (fun (name, perp) ->
        Printf.printf "%s (perplexity: %.2f)\n" name perp)
      (List.take 5 samples)

Parameter exploration:

  let compare_temperatures temps texts =
    List.map (fun temp ->
      let samples = Saga.pipeline model texts ~temperature:temp ~num_samples:10 () in
      let avg_perp =
        List.fold_left (fun acc (_, p) -> acc +. p) 0. samples /. 10.
      in
      (temp, avg_perp)
    ) temps

Advanced Text Generation

Modern text generation with composable processors and fine-grained control, designed for integration with neural language models.

Sourcemodule Sampler : sig ... end

Advanced text generation and sampling utilities.

Examples

Quick tokenization

  open Saga

  (* Simple char tokenization *)
  let tok = Tokenizer.create ~model:(Models.chars ())
  let enc = Tokenizer.encode tok ~sequence:(Either.Left "Hello world!") ()
  let ids = Encoding.get_ids enc
  let text = Tokenizer.decode tok (Array.to_list ids) ()

  (* BPE tokenization *)
  let tok = tokenizer (`BPE ("vocab.json", "merges.txt"))
  let batch = encode_batch tok [ "Hello"; "World" ] ~padding:true

Training a language model

  (* Train a bigram model *)
  let texts = [ "The cat sat"; "The dog ran"; "The cat ran" ]
  let tok = tokenizer `Words
  let model = LM.train_ngram ~n:2 tok texts

  (* Generate text *)
  let generated = LM.generate model ~max_tokens:50 ~temperature:0.8 tok
  let () = print_endline generated

Custom tokenizer

  (* Tokenizer with normalization *)
  let tok =
    tokenizer `Words
    |> Tokenizer.with_normalizer (normalize ~lowercase:true)

  (* Regex tokenizer for code *)
  let code_tok = tokenizer (`Regex {|[a-zA-Z_][a-zA-Z0-9_]*|[0-9]+|.|})

End-to-End Name Generator

  let build_name_generator training_file =
    (* Load and train character-level model *)
    let names = read_lines training_file in
    let model =
      LM.ngram ~n:3 ~tokenizer:(tokenizer `Chars) ~smoothing:0.1 ()
      |> LM.train names in

    (* Return generator function *)
    fun ?(temperature=0.8) ?(max_len=12) () ->
      LM.generate model ~num_tokens:max_len ~temperature ()

  (* Generate 10 new names *)
  let () =
    let gen = build_name_generator "names.txt" in
    List.init 10 (fun _ -> gen ()) |> List.iter (Printf.printf "%s\n")

Advanced Neural Model Integration

  let setup_neural_generation neural_model vocab_file =
    let vocab = vocab_load vocab_file in
    let tokenize_fn text = encode ~vocab text in
    let decode_fn ids = decode vocab ids in

    (* Wrap neural model for Sampler API *)
    let model_fn token_ids =
      let tensor = Nx.of_array1 (Array.of_list token_ids) in
      let logits = neural_model tensor in
      Nx.to_array1 logits
    in

    (* Creative writing configuration *)
    let config =
      Sampler.creative_writing
      |> Sampler.with_max_new_tokens 200
      |> Sampler.with_repetition_penalty 1.15
    in

    let processors =
      [
        Sampler.temperature_warper ~temperature:1.1;
        Sampler.top_p_warper ~p:0.9;
        Sampler.no_repeat_ngram ~ngram_size:3;
      ]
    in

    (* Generate with stopping criteria *)
    Sampler.generate_text ~model:model_fn ~tokenizer:tokenize_fn
      ~decoder:decode_fn ~generation_config:config
      ~logits_processor:processors

Batch Processing Pipeline

  let process_corpus_directory input_dir output_file =
    (* Custom preprocessing pipeline *)
    let preprocess text =
      text |> normalize ~lowercase:true ~collapse_whitespace:true
      |> fun s -> if String.length s > 20 then Some s else None
    in

    Sys.readdir input_dir |> Array.to_list
    |> List.filter (fun f -> Filename.extension f = ".txt")
    |> List.map (fun f -> Filename.concat input_dir f)
    |> List.map read_lines_lazy |> List.to_seq |> Seq.concat
    |> Seq.filter_map preprocess
    |> List.of_seq |> write_lines output_file