Module Tokenizer

type behavior =
  | Remove
  | Isolate
  | Merge_with_previous
  | Merge_with_next
type pattern =
  | Whitespace
  | Dash
  | Bert
  | Regex of Re.re
type action = pattern * Tokenizer__.Norm.behavior
val run : ?encoding:Snowball.encoding -> ?to_lowercase:bool -> action list -> string Seq.t -> string Seq.t

run actions str applies actions on the given str.

val run_on_bstr : ?encoding:Snowball.encoding -> ?to_lowercase:bool -> action list -> Bstr.t -> string Seq.t

run_on_bstr actions bstr applies actions on the given bstr.

NOTE: The advantage of using a bigstring is that you can load a file using Unix.map_file and process the file to obtain a stream of tokens.