Saga
Saga - Fast tokenization and text processing for ML in OCaml.
Saga is a comprehensive text processing library for machine learning applications, providing fast tokenization, statistical language models, and modern text generation capabilities. It combines simplicity for common use cases with flexibility for advanced workflows.
Saga consists of four main components:
- tokenizers: Fast and flexible tokenization with multiple algorithms and custom patterns
- io: Efficient file I/O utilities for large text corpora
- lm: High-level statistical language models (n-grams)
- sampling: Advanced text generation with composable processors
All components work together seamlessly but can be used independently.
open Saga
(* Basic word tokenization *)
let tokens = tokenize "Hello, world! How are you?"
(* Returns: ["Hello"; ","; "world"; "!"; "How"; "are"; "you"; "?"] *)
(* Character-level tokenization *)
let chars = tokenize ~method_:`Chars "Hello"
(* Returns: ["H"; "e"; "l"; "l"; "o"] *)
(* Batch processing with padding *)
let batch_ids = encode_batch [ "Hello world"; "Hi there" ] ~pad:true
(* Returns: padded tensor of token IDs *)
(* Load training data *)
let texts = read_lines "training_data.txt"
(* Create and train a bigram model *)
let model =
LM.ngram ~n:2 ~tokenizer:(tokenizer `Words) () |> LM.train texts
(* Generate new text *)
let generated =
LM.generate model ~num_tokens:20 ~temperature:0.8 ()
let () = Printf.printf "Generated: %s\n" generated
(* Evaluate on test data *)
let test_perplexity =
LM.perplexity model "the quick brown fox"
let () = Printf.printf "Test perplexity: %.2f\n" test_perplexity
(* Create a model function (typically a neural network) *)
let model_fn token_ids =
(* Your neural network forward pass *)
Array.make 50000 0.0 (* Example: uniform logits *)
(* Configure generation with custom processors *)
let config =
Sampler.default
|> Sampler.with_temperature 0.9
|> Sampler.with_top_k 40
|> Sampler.with_repetition_penalty 1.1
(* Generate with fine-grained control *)
let result =
Sampler.generate_text ~model:model_fn
~tokenizer:(encode ~vocab:(vocab [ "hello"; "world" ]))
~decoder:(decode (vocab [ "hello"; "world" ]))
~prompt:"Hello" ~generation_config:config ()
let preprocess_texts texts =
texts
|> List.map (normalize ~lowercase:true ~collapse_whitespace:true)
|> List.filter (fun s -> String.length s > 10) (* Filter short texts *)
|> List.map (tokenize ~method_:`Words)
let compare_models texts test_texts =
let models =
[
("unigram", LM.ngram ~n:1 ());
("bigram", LM.ngram ~n:2 ());
("trigram", LM.ngram ~n:3 ~smoothing:0.1 ());
]
in
List.map
(fun (name, model) ->
let trained = LM.train model texts in
let avg_perp =
List.map (LM.perplexity trained) test_texts
|> List.fold_left ( +. ) 0.
|> fun sum -> sum /. float_of_int (List.length test_texts)
in
(name, avg_perp))
models
let create_code_tokenizer () =
tokenizer (`Regex {|[a-zA-Z_][a-zA-Z0-9_]*|[0-9]+|[(){}[\].,;]|\S|})
|> Tokenizer.with_normalizer (normalize ~collapse_whitespace:true)
let process_code_files filenames =
filenames |> List.map read_lines |> List.flatten
|> List.map (Tokenizer.run (create_code_tokenizer ()))
Use read_lines_lazy for very large files to avoid memory issues.
Saga integrates well with:
Fast and flexible tokenization supporting multiple algorithms and custom patterns. Handles everything from simple word splitting to advanced subword tokenization.
open Saga.Tokenizers
(* Create a character-level tokenizer *)
let tokenizer = Tokenizer.create ~model:(Models.chars ()) in
(* Add special tokens *)
Tokenizer.add_special_tokens tokenizer [Added_token.create ~content:"." ~special:true ()];
(* Train on data *)
Tokenizer.train_from_iterator tokenizer (Seq.of_list names) ~trainer:(Trainer.chars ()) ();
(* Encode with options *)
let encoding = Tokenizer.encode tokenizer ~sequence:"hello world" ~add_special_tokens:true () in
(* Get ids and decode *)
let ids = Encoding.ids encoding in
let text = Tokenizer.decode tokenizer ids ~skip_special_tokens:true;
- Tokenizer.t: The main tokenizer instance, configurable with model, normalizer, etc.
- Models.t: Core tokenization algorithm (e.g., Chars, BPE).
- Encoding.t: Result of encoding, with ids, tokens, offsets, masks, etc., exposed as a record.
- [...] add_special_tokens and encoding options.
- [...] Unicode module.
This API aligns with Hugging Face Tokenizers v0.21 (as of 2025), including support for fast Rust-backed operations where applicable.
Unicode utilities.
Padding or truncation direction.
Behavior for splitting delimiters.
Truncation strategy.
Prepend scheme for metaspace.
Efficient file I/O utilities optimized for large text corpora and ML workflows.
read_lines ?buffer_size filename efficiently reads all lines from a file.
Features:
read_lines_lazy ?buffer_size filename returns a lazy sequence of lines.
Use this for very large files to avoid loading everything into memory. The file is automatically closed when the sequence is fully consumed or when an error occurs.
write_lines ?append filename lines writes lines to a file.
High-level statistical language models with simple training and generation APIs.
The workflow follows three main steps:
1. Create a model with ngram
2. Train it on text data with train
3. Generate new text with generate or evaluate with perplexity
All models are immutable - training returns a new model instance.
Train a bigram model on names and generate new ones:
let names = [ "alice"; "bob"; "charlie"; "diana"; "eve" ]
let model = Saga.ngram ~n:2 ~tokenizer:(Saga.tokenizer `Chars) ()
let trained_model = Saga.train model names
let new_name = Saga.generate trained_model ~num_tokens:10 ()
(* Returns: "alicia" or similar character-level generation *)
Word-level model with custom settings:
let texts = [ "the cat sat"; "the dog ran"; "the cat ran" ]
let model =
Saga.ngram ~n:3 ~smoothing:0.05 ~min_freq:2
~tokenizer:(Saga.tokenizer `Words) ()
|> Saga.train texts
let story =
Saga.generate model ~prompt:"the cat" ~num_tokens:20 ~temperature:0.9 ()
N-gram models predict the next token based on the previous n-1 tokens. Higher n captures more context but requires more training data:
Smoothing handles unseen token sequences:
Models work on token sequences. Built-in tokenizers:
- `Words: Split on whitespace and punctuation
- `Chars: Unicode character-level
Save and load trained models:
Saga.save trained_model "my_model.bin"
let loaded_model = Saga.load "my_model.bin"
Evaluate model quality:
let test_texts = [ "the quick brown fox" ]
let perplexity = Saga.perplexity trained_model (List.hd test_texts)
(* Lower perplexity = better model fit *)
Batch evaluation and generation:
let samples =
Saga.pipeline model training_texts ~num_samples:50 ~temperature:1.2 ()
let () =
List.iter
(fun (text, perp) ->
Printf.printf "%s (perplexity: %.2f)\n" text perp)
samples
Opaque language model that can be trained and used for generation.
Models are immutable - training operations return new model instances. Supports n-gram models (n=1-5) with potential for extension to other statistical models like Markov chains or probabilistic context-free grammars.
train model texts trains the model on a list of text strings.
The training process:
1. Tokenizes each text using the model's tokenizer
2. Automatically adds BOS/EOS tokens (tokenizer-specific)
3. Builds or updates the vocabulary
4. Fits the statistical backend (n-gram counts, etc.)
BOS/EOS tokens are tokenizer-aware:
Returns a new trained model instance (original is unchanged).
let untrained = Saga.ngram ~n:2 ~tokenizer:(Saga.tokenizer `Words) ()
let trained = Saga.train untrained [ "hello world"; "world peace" ]
(* trained model now knows bigrams: <bos>+hello, hello+world, world+<eos>,
etc. *)
val generate :
model ->
?num_tokens:int ->
?temperature:float ->
?top_k:int ->
?top_p:float ->
?seed:int ->
?min_new_tokens:int ->
?prompt:string ->
unit ->
string
generate model ?num_tokens ?temperature ?top_k ?top_p ?seed ?min_new_tokens ?prompt () generates text from the trained model.
Generation continues until:
- [...] num_tokens)
- [...] min_new_tokens)
The generated text is automatically cleaned:
Examples:
(* Conservative, deterministic generation *)
let result = Saga.generate model ~temperature:0.1 ~num_tokens:10 ()
(* Creative generation with nucleus sampling *)
let story =
Saga.generate model ~temperature:1.2 ~top_p:0.9 ~num_tokens:50 ()
(* Prompted generation *)
let completion =
Saga.generate model ~prompt:"Once upon a time" ~num_tokens:30 ()
(* Ensure minimum length *)
let long_text = Saga.generate model ~min_new_tokens:100 ~num_tokens:200 ()
val generate_ids :
model ->
?num_tokens:int ->
?temperature:float ->
?top_k:int ->
?top_p:float ->
?seed:int ->
?min_new_tokens:int ->
?prompt:string ->
unit ->
int list
Same as generate but returns generated token IDs (in the LM's internal vocabulary).
decode_ids model ids converts the LM's token IDs to their string tokens. This is useful to control how to join tokens (e.g., with spaces for word-level models).
score model text computes the log-probability of the text sequence under the model.
Returns the sum of log-probabilities for each token in the sequence. More negative values indicate lower probability (worse fit).
The text is automatically tokenized and BOS/EOS tokens added as needed. For unseen n-grams, uses the model's smoothing strategy.
To convert to perplexity manually: exp(-score / token_count)
let score = Saga.score model "hello world"
(* Returns: -5.234 (example negative log-probability) *)
let manual_perplexity =
let tokens = (* tokenize "hello world" *) 3 in
exp (-.score /. float_of_int tokens)
perplexity model text computes perplexity, a standard metric for language model quality.
Perplexity = exp(-average_log_probability)
Text is automatically wrapped with BOS/EOS tokens for proper evaluation.
let perp1 = Saga.perplexity model "the cat sat" (* Returns: 12.5 *)
let perp2 = Saga.perplexity model "xyz qwerty" (* Returns: 45.2 (worse fit) *)
(* Lower perplexity = better model *)
assert (perp1 < perp2)
perplexities model texts computes perplexity for multiple texts efficiently.
Equivalent to List.map (perplexity model) texts but may be optimized for batch processing in the future.
let test_set = [ "the cat sat"; "dogs run fast"; "hello world" ]
let perps = Saga.perplexities model test_set
let avg_perp =
List.fold_left ( +. ) 0. perps
/. float_of_int (List.length perps)
let () = Printf.printf "Average test perplexity: %.2f\n" avg_perp
val ngram :
n:int ->
?smoothing:float ->
?min_freq:int ->
?specials:string list ->
?tokenizer:Saga_tokenizers.Tokenizer.t ->
unit ->
model
ngram ~n ?smoothing ?min_freq ?specials ?tokenizer () creates an n-gram language model.
N-gram models predict tokens based on the previous n-1 tokens. This implementation uses Maximum Likelihood Estimation (MLE) with add-k smoothing, similar to NLTK's approach.
Special tokens are tokenizer-aware:
- ".", "." (period for both BOS and EOS)
- "<bos>", "<eos>" (explicit start/end markers)
The returned model is untrained - use train to fit it on data.
Usage Examples
NLTK-style workflow:
let model =
Saga.ngram ~n:2 ~tokenizer:(Saga.tokenizer `Chars) ~min_freq:2 ()
let trained_model = Saga.train model names
let generated =
Saga.generate trained_model ~num_tokens:15 ~temperature:0.8 ()
let quality = Saga.perplexity trained_model "emma" (* Lower = better *)
Fluent chaining style:
let model = Saga.ngram ~n:2 ~smoothing:0.01 () |> Saga.train names
let generated_name =
Saga.generate model ~num_tokens:20 ~temperature:1.0 ()
let () = Printf.printf "Generated: %s\n" generated_name
Custom configuration:
let model =
Saga.ngram ~n:3 ~smoothing:0.05 ~min_freq:3
~specials:[ "<start>"; "<end>" ]
~tokenizer:(Saga.tokenizer (`Regex {|\w+|[.,!?]|}))
()
save model filename saves the trained model to a binary file.
Serializes the complete model state including:
Use load to restore the model later.
Saga.save trained_model "my_language_model.bin"
load filename loads a previously saved model from disk.
The loaded model is immediately ready for generation and evaluation without requiring retraining.
let model = Saga.load "my_language_model.bin"
let text = Saga.generate model ~num_tokens:50 ()
val pipeline :
model ->
string list ->
?num_samples:int ->
?temperature:float ->
?top_k:int ->
?top_p:float ->
?seed:int ->
unit ->
(string * float) list
pipeline model texts ?num_samples ?temperature ?top_k ?top_p ?seed () is a convenience function that trains a model and generates samples with their perplexities.
This function:
1. Trains the model on the provided texts
2. Generates num_samples text samples using the trained model
3. Computes perplexity for each generated sample
4. Returns (generated_text, perplexity) pairs
Useful for quick experimentation and model evaluation.
Usage Examples
Quick model evaluation:
let names = Saga.IO.read_lines "names.txt"
let model = Saga.ngram ~n:2 ()
let samples =
Saga.pipeline model names ~num_samples:20 ~temperature:1.0 ()
(* Print best samples (lowest perplexity) *)
let () =
List.iter
(fun (name, perp) ->
Printf.printf "%s (perplexity: %.2f)\n" name perp)
(List.take 5 samples)
Parameter exploration:
let compare_temperatures temps texts =
List.map (fun temp ->
let samples = Saga.pipeline model texts ~temperature:temp ~num_samples:10 () in
let avg_perp =
List.fold_left (fun acc (_, p) -> acc +. p) 0. samples /. 10.
in
(temp, avg_perp)
) temps
Modern text generation with composable processors and fine-grained control, designed for integration with neural language models.
open Saga
(* Simple char tokenization *)
let tok = Tokenizer.create ~model:(Models.chars ())
let enc = Tokenizer.encode tok ~sequence:(Either.Left "Hello world!") ()
let ids = Encoding.get_ids enc
let text = Tokenizer.decode tok (Array.to_list ids) ()
(* BPE tokenization *)
let tok = tokenizer (`BPE ("vocab.json", "merges.txt"))
let batch = encode_batch tok [ "Hello"; "World" ] ~padding:true
(* Train a bigram model *)
let texts = [ "The cat sat"; "The dog ran"; "The cat ran" ]
let tok = tokenizer `Words
let model = LM.train_ngram ~n:2 tok texts
(* Generate text *)
let generated =
LM.generate model ~max_tokens:50 ~temperature:0.8 tok
let () = print_endline generated
(* Tokenizer with normalization *)
let tok =
tokenizer `Words
|> Tokenizer.with_normalizer (normalize ~lowercase:true)
(* Regex tokenizer for code *)
let code_tok = tokenizer (`Regex {|[a-zA-Z_][a-zA-Z0-9_]*|[0-9]+|.|})
let build_name_generator training_file =
(* Load and train character-level model *)
let names = read_lines training_file in
let model =
LM.ngram ~n:3 ~tokenizer:(tokenizer `Chars) ~smoothing:0.1 ()
|> LM.train names in
(* Return generator function *)
fun ?(temperature=0.8) ?(max_len=12) () ->
LM.generate model ~num_tokens:max_len ~temperature ()
(* Generate 10 new names *)
let () =
let gen = build_name_generator "names.txt" in
List.init 10 (fun _ -> gen ()) |> List.iter (Printf.printf "%s\n")
let setup_neural_generation neural_model vocab_file =
let vocab = vocab_load vocab_file in
let tokenize_fn text = encode ~vocab text in
let decode_fn ids = decode vocab ids in
(* Wrap neural model for Sampler API *)
let model_fn token_ids =
let tensor = Nx.of_array1 (Array.of_list token_ids) in
let logits = neural_model tensor in
Nx.to_array1 logits
in
(* Creative writing configuration *)
let config =
Sampler.creative_writing
|> Sampler.with_max_new_tokens 200
|> Sampler.with_repetition_penalty 1.15
in
let processors =
[
Sampler.temperature_warper ~temperature:1.1;
Sampler.top_p_warper ~p:0.9;
Sampler.no_repeat_ngram ~ngram_size:3;
]
in
(* Generate with stopping criteria *)
Sampler.generate_text ~model:model_fn ~tokenizer:tokenize_fn
~decoder:decode_fn ~generation_config:config
~logits_processor:processors
let process_corpus_directory input_dir output_file =
(* Custom preprocessing pipeline *)
let preprocess text =
text |> normalize ~lowercase:true ~collapse_whitespace:true
|> fun s -> if String.length s > 20 then Some s else None
in
Sys.readdir input_dir |> Array.to_list
|> List.filter (fun f -> Filename.extension f = ".txt")
|> List.map (fun f -> Filename.concat input_dir f)
|> List.to_seq |> Seq.concat_map read_lines_lazy
|> Seq.filter_map preprocess
|> List.of_seq |> write_lines output_file