Module Stdune.String

Sourcetype t = string
include module type of struct include StringLabels end with type t := t

Strings

Sourceval make : int -> char -> string

make n c is a string of length n with each index holding the character c.

Sourceval init : int -> f:(int -> char) -> string

init n ~f is a string of length n with index i holding the character f i (called in increasing index order).

  • since 4.02
Sourceval empty : string

The empty string.

  • since 4.13
Sourceval length : string -> int

length s is the length (number of bytes/characters) of s.

Sourceval get : string -> int -> char

get s i is the character at index i in s. This is the same as writing s.[i].

Sourceval of_char : char -> string

of_char c is c as a string.

  • since 5.5
Sourceval of_bytes : bytes -> string

Return a new string that contains the same bytes as the given byte sequence.

  • since 4.13
Sourceval to_bytes : string -> bytes

Return a new byte sequence that contains the same bytes as the given string.

  • since 4.13
Sourceval blit : src:string -> src_pos:int -> dst:bytes -> dst_pos:int -> len:int -> unit

Same as Bytes.blit_string which should be preferred.

Concatenating

Note. The Stdlib.(^) binary operator concatenates two strings.

Sourceval concat : sep:string -> string list -> string

concat ~sep ss concatenates the list of strings ss, inserting the separator string sep between each.

Sourceval cat : string -> string -> string

cat s1 s2 concatenates s1 and s2 (s1 ^ s2).

  • since 4.13

Predicates and comparisons

Sourceval starts_with : prefix:string -> string -> bool

starts_with ~prefix s is true if and only if s starts with prefix.

  • since 4.13
Sourceval ends_with : suffix:string -> string -> bool

ends_with ~suffix s is true if and only if s ends with suffix.

  • since 4.13
Sourceval includes : affix:string -> string -> bool

includes ~affix s is true if and only if affix occurs in s.

Note. To test the same affix string multiple times, partially applying the ~affix argument and using the resulting function repeatedly is more efficient.

  • since 5.5
Sourceval contains_from : string -> int -> char -> bool

contains_from s start c is true if and only if c appears in s after position start.

Sourceval rcontains_from : string -> int -> char -> bool

rcontains_from s stop c is true if and only if c appears in s before position stop+1.

Sourceval contains : string -> char -> bool

contains s c is String.contains_from s 0 c.

Extracting substrings

Sourceval sub : string -> pos:int -> len:int -> string

sub s ~pos ~len is a string of length len, containing the substring of s that starts at position pos and has length len.

Splitting strings

Splitting with magnitudes

Sourceval take_first : int -> string -> string

take_first n s are the first n bytes of s. This is s if n >= length s and "" if n <= 0.

  • since 5.5
Sourceval take_last : int -> string -> string

take_last n s are the last n bytes of s. This is s if n >= length s and "" if n <= 0.

  • since 5.5
Sourceval drop_first : int -> string -> string

drop_first n s is s without the first n bytes of s. This is "" if n >= length s and s if n <= 0.

  • since 5.5
Sourceval drop_last : int -> string -> string

drop_last n s is s without the last n bytes of s. This is "" if n >= length s and s if n <= 0.

  • since 5.5
Sourceval cut_first : int -> string -> string * string

cut_first n v is (take_first n v, drop_first n v).

  • since 5.5
Sourceval cut_last : int -> string -> string * string

cut_last n v is (drop_last n v, take_last n v).

  • since 5.5

Splitting with predicates

Sourceval take_first_while : (char -> bool) -> string -> string

take_first_while p s is the first consecutive bytes of s satisfying the predicate p.

  • since 5.5
Sourceval take_last_while : (char -> bool) -> string -> string

take_last_while p s is the last consecutive bytes of s satisfying the predicate p.

  • since 5.5
Sourceval drop_first_while : (char -> bool) -> string -> string

drop_first_while p s is s without the first consecutive bytes of s satisfying the predicate p.

  • since 5.5
Sourceval drop_last_while : (char -> bool) -> string -> string

drop_last_while p s is s without the last consecutive bytes of s satisfying the predicate p.

  • since 5.5
Sourceval cut_first_while : (char -> bool) -> string -> string * string

cut_first_while p s is (take_first_while p s, drop_first_while p s).

  • since 5.5
Sourceval cut_last_while : (char -> bool) -> string -> string * string

cut_last_while p s is (drop_last_while p s, take_last_while p s).

  • since 5.5

Splitting with separators

Note. To split the same sep string multiple times, partially applying the ~sep argument of these functions and using the resulting function repeatedly is more efficient.

Sourceval split_first : sep:string -> string -> (string * string) option

split_first ~sep s is the pair Some (left, right) made of the two (possibly empty) substrings of s that are delimited by the first match of the separator sep in s or None if sep can't be found. Search for sep starts at position 0 and uses find_first.

If sep is "", this is Some ("", s).

The invariant concat sep [left; right] = s holds.

  • since 5.5
Sourceval split_last : sep:string -> string -> (string * string) option

split_last ~sep s is the pair Some (left, right) made of the two (possibly empty) substrings of s that are delimited by the last match of the separator sep in s or None if sep can't be found. Search for sep starts at position length s and uses find_last.

If sep is "", this is Some (s, "").

The invariant concat sep [left; right] = s holds.

  • since 5.5
Sourceval split_all : sep:string -> ?drop:(string -> bool) -> string -> string list

split_all ~sep s is the list of all substrings of s that are delimited by non-overlapping matches of the separator sep or the list [s] if sep can't be found. Search for sep starts at position 0 in increasing indexing order and uses find_all.

Substrings sub for which drop sub is true are not included in the result. drop defaults to Fun.const false.

If sep is "", this is [""; c0; ...; cn; ""] with ci the string of_char s.[i].

The invariant concat sep (split_all ~sep s) = s holds.

  • since 5.5
Sourceval rsplit_all : sep:string -> ?drop:(string -> bool) -> string -> string list

rsplit_all ~sep s is the list of all substrings of s that are delimited by non-overlapping matches of the separator sep or [s] if sep can't be found. Search for sep starts at position length s in decreasing indexing order and uses rfind_all.

Substrings sub for which drop sub is true are not included in the result. drop defaults to Fun.const false.

If sep is "", this is [""; c0; ...; cn; ""] with ci the string of_char s.[i].

The invariant concat sep (rsplit_all ~sep s) = s holds.

  • since 5.5
Sourceval split_on_char : sep:char -> string -> string list

split_on_char ~sep s is the list of all (possibly empty) substrings of s that are delimited by the character sep. If s is empty, the result is the singleton list [""].

The function's result is specified by the following invariants:

  • The list is not empty.
  • Concatenating its elements using sep as a separator returns a string equal to the input (concat (make 1 sep) (split_on_char sep s) = s).
  • No string in the result contains the sep character.
  • since 4.05

Transforming

Sourceval map : f:(char -> char) -> string -> string

map f s is the string resulting from applying f to all the characters of s in increasing order.

  • since 4.00
Sourceval mapi : f:(int -> char -> char) -> string -> string

mapi ~f s is like map but the index of the character is also passed to f.

  • since 4.02
Sourceval fold_left : f:('acc -> char -> 'acc) -> init:'acc -> string -> 'acc

fold_left f x s computes f (... (f (f x s.[0]) s.[1]) ...) s.[n-1], where n is the length of the string s.

  • since 4.13
Sourceval fold_right : f:(char -> 'acc -> 'acc) -> string -> init:'acc -> 'acc

fold_right f s x computes f s.[0] (f s.[1] ( ... (f s.[n-1] x) ...)), where n is the length of the string s.

  • since 4.13
Sourceval trim : string -> string

trim s is s without leading and trailing whitespace. Whitespace characters are: ' ', '\x0C' (form feed), '\n', '\r', and '\t'.

  • since 4.00
Sourceval escaped : string -> string

escaped s is s with special characters represented by escape sequences, following the lexical conventions of OCaml.

All characters outside the US-ASCII printable range [0x20;0x7E] are escaped, as well as backslash (0x5C) and double-quote (0x22).

The function Scanf.unescaped is a left inverse of escaped, i.e. Scanf.unescaped (escaped s) = s for any string s (unless escaped s fails).

Sourceval uppercase_ascii : string -> string

uppercase_ascii s is s with all lowercase letters translated to uppercase, using the US-ASCII character set.

  • since 4.05
Sourceval lowercase_ascii : string -> string

lowercase_ascii s is s with all uppercase letters translated to lowercase, using the US-ASCII character set.

  • since 4.05
Sourceval capitalize_ascii : string -> string

capitalize_ascii s is s with the first character set to uppercase, using the US-ASCII character set.

  • since 4.05
Sourceval uncapitalize_ascii : string -> string

uncapitalize_ascii s is s with the first character set to lowercase, using the US-ASCII character set.

  • since 4.05

Traversing

Sourceval iter : f:(char -> unit) -> string -> unit

iter ~f s applies function f in turn to all the characters of s. It is equivalent to f s.[0]; f s.[1]; ...; f s.[length s - 1]; ().

Sourceval iteri : f:(int -> char -> unit) -> string -> unit

iteri is like iter, but the function is also given the corresponding character index.

  • since 4.00

Finding indices

Sourceval find_first_index : (char -> bool) -> ?start:int -> string -> int option

find_first_index p ~start s is the index of the first character of s that satisfies predicate p at or after the index or position start (defaults to 0).

If start is length s, the result is always None.

  • since 5.5
Sourceval find_last_index : (char -> bool) -> ?start:int -> string -> int option

find_last_index p ~start s is the index of the last character of s that satisfies predicate p at or before the index or position start (defaults to length s).

  • since 5.5
Sourceval index_from_opt : string -> int -> char -> int option

index_from_opt s i c is the index of the first occurrence of c in s after position i (if any).

  • since 4.05
Sourceval rindex_from_opt : string -> int -> char -> int option

rindex_from_opt s i c is the index of the last occurrence of c in s before position i+1 (if any).

  • since 4.05
Sourceval index_opt : string -> char -> int option

index_opt s c is String.index_from_opt s 0 c.

  • since 4.05
Sourceval rindex_opt : string -> char -> int option

rindex_opt s c is String.rindex_from_opt s (length s - 1) c.

  • since 4.05

Finding substrings

Note. To find the same sub string multiple times, partially applying the ~sub argument of these functions and using the resulting function repeatedly is more efficient.

Sourceval find_first : sub:string -> ?start:int -> string -> int option

find_first ~sub ~start s is the starting position of the first occurrence of sub in s at or after the index or position start (defaults to 0).

If sub is "" the result is Some start. The result of the function is always a valid index of s except when sub is "" and start is length s.

  • since 5.5
Sourceval find_last : sub:string -> ?start:int -> string -> int option

find_last ~sub ~start s is the starting position of the last occurrence of sub in s at or before the index or position start (defaults to String.length s).

If sub is "" the result is Some start. The result of the function is always a valid index of s except when sub is "" and start is length s.

  • since 5.5
Sourceval find_all : sub:string -> f:(int -> 'acc -> 'acc) -> ?start:int -> string -> 'acc -> 'acc

find_all ~sub f ~start s acc, starting with acc, folds f by increasing index order over all non-overlapping starting positions of sub in s at or after the index or position start (defaults to 0). The result is acc if sub could not be found in s.

If sub is "", f gets invoked on all positions of s at or after start.

  • since 5.5
Sourceval rfind_all : sub:string -> f:(int -> 'acc -> 'acc) -> ?start:int -> string -> 'acc -> 'acc

rfind_all ~sub f ~start s acc, starting with acc, folds f by decreasing index order over all non-overlapping starting positions of sub in s at or before the index or position start (defaults to String.length s). The result is acc if sub could not be found in s.

If sub is "", f gets invoked on all positions of s at or before start.

  • since 5.5

Replacing substrings

Note. To replace the same sub string multiple times, partially applying the ~sub argument of these functions and using the resulting function repeatedly is more efficient.

Sourceval replace_first : sub:string -> by:string -> ?start:int -> string -> string

replace_first ~sub ~by ~start s is s with the first occurrence of sub at or after the index or position start (defaults to 0) replaced by the string by.

If sub is "", this inserts by at position start.

  • since 5.5
Sourceval replace_last : sub:string -> by:string -> ?start:int -> string -> string

replace_last ~sub ~by ~start s is s with the last occurrence of sub at or before the index or position start (defaults to String.length s) replaced by the string by.

If sub is "", this inserts by at position start.

  • since 5.5
Sourceval replace_all : sub:string -> by:string -> ?start:int -> string -> string

replace_all ~sub ~by ~start s is s with all non-overlapping occurrences of sub at or after the index or position start (defaults to 0) replaced by the string by. Occurrences are found in increasing indexing order.

If sub is "", this inserts by on all positions from start on.

  • since 5.5

Strings and Sequences

Sourceval to_seq : t -> char Seq.t

to_seq s is a sequence made of the string's characters in increasing order.

  • since 4.07
Sourceval to_seqi : t -> (int * char) Seq.t

to_seqi s is like to_seq but also tuples the corresponding index.

  • since 4.07
Sourceval of_seq : char Seq.t -> t

of_seq s is a string made of the sequence's characters.

  • since 4.07

UTF decoding and validations

  • since 4.14

UTF-8

Sourceval get_utf_8_uchar : t -> int -> Uchar.utf_decode

get_utf_8_uchar b i decodes an UTF-8 character at index i in b.

Sourceval is_valid_utf_8 : t -> bool

is_valid_utf_8 b is true if and only if b contains valid UTF-8 data.

UTF-16BE

Sourceval get_utf_16be_uchar : t -> int -> Uchar.utf_decode

get_utf_16be_uchar b i decodes an UTF-16BE character at index i in b.

Sourceval is_valid_utf_16be : t -> bool

is_valid_utf_16be b is true if and only if b contains valid UTF-16BE data.

UTF-16LE

Sourceval get_utf_16le_uchar : t -> int -> Uchar.utf_decode

get_utf_16le_uchar b i decodes an UTF-16LE character at index i in b.

Sourceval is_valid_utf_16le : t -> bool

is_valid_utf_16le b is true if and only if b contains valid UTF-16LE data.

Spellchecking

Sourceval edit_distance : ?limit:int -> t -> t -> int

edit_distance s0 s1 is the number of single character edits (understood as insertion, deletion, substitution, transposition) that are needed to change s0 into s1.

If limit is provided the function returns with limit as soon as it was determined that s0 and s1 have distance of at least limit. This is faster if you have a fixed limit, for example for spellchecking.

The function assumes the strings are UTF-8 encoded and uses Uchar.t for the notion of character. Decoding errors are replaced by Uchar.rep. Normalizing the strings to NFC gives better results.

Note. This implements the simpler Optimal String Alignment (OSA) distance, not the Damerau-Levenshtein distance. With this function "ca" and "abc" have a distance of 3 not 2.

  • since 5.4
Sourceval spellcheck : ?max_dist:(string -> int) -> ((string -> unit) -> unit) -> string -> string list

spellcheck iter_dict s are the strings enumerated by the iterator iter_dict whose edit distance to s is the smallest and at most max_dist s. If multiple corrections are returned their order is as found in iter_dict. The default max_dist s is:

  • 0 if s has 0 to 2 Unicode characters.
  • 1 if s has 3 to 4 Unicode characters.
  • 2 otherwise.

If your dictionary is a list l, a suitable iter_dict is given by (fun yield -> List.iter yield l).

All strings are assumed to be UTF-8 encoded, decoding errors are replaced by Uchar.rep characters.

  • since 5.4

Binary decoding of integers

The functions in this section binary decode integers from strings.

All following functions raise Invalid_argument if the characters needed at index i to decode the integer are not available.

Little-endian (resp. big-endian) encoding means that least (resp. most) significant bytes are stored first. Big-endian is also known as network byte order. Native-endian encoding is either little-endian or big-endian depending on Sys.big_endian.

32-bit and 64-bit integers are represented by the int32 and int64 types, which can be interpreted either as signed or unsigned numbers.

8-bit and 16-bit integers are represented by the int type, which has more bits than the binary encoding. These extra bits are sign-extended (or zero-extended) for functions which decode 8-bit or 16-bit integers and represent them with int values.

Sourceval get_uint8 : string -> int -> int

get_uint8 b i is b's unsigned 8-bit integer starting at character index i.

  • since 4.13
Sourceval get_int8 : string -> int -> int

get_int8 b i is b's signed 8-bit integer starting at character index i.

  • since 4.13
Sourceval get_uint16_ne : string -> int -> int

get_uint16_ne b i is b's native-endian unsigned 16-bit integer starting at character index i.

  • since 4.13
Sourceval get_uint16_be : string -> int -> int

get_uint16_be b i is b's big-endian unsigned 16-bit integer starting at character index i.

  • since 4.13
Sourceval get_uint16_le : string -> int -> int

get_uint16_le b i is b's little-endian unsigned 16-bit integer starting at character index i.

  • since 4.13
Sourceval get_int16_ne : string -> int -> int

get_int16_ne b i is b's native-endian signed 16-bit integer starting at character index i.

  • since 4.13
Sourceval get_int16_be : string -> int -> int

get_int16_be b i is b's big-endian signed 16-bit integer starting at character index i.

  • since 4.13
Sourceval get_int16_le : string -> int -> int

get_int16_le b i is b's little-endian signed 16-bit integer starting at character index i.

  • since 4.13
Sourceval get_int32_ne : string -> int -> int32

get_int32_ne b i is b's native-endian 32-bit integer starting at character index i.

  • since 4.13
Sourceval seeded_hash : int -> t -> int

A seeded hash function for strings, with the same output value as Hashtbl.seeded_hash. This function allows this module to be passed as argument to the functor Hashtbl.MakeSeeded.

  • since 5.0
Sourceval get_int32_be : string -> int -> int32

get_int32_be b i is b's big-endian 32-bit integer starting at character index i.

  • since 4.13
Sourceval get_int32_le : string -> int -> int32

get_int32_le b i is b's little-endian 32-bit integer starting at character index i.

  • since 4.13
Sourceval get_int64_ne : string -> int -> int64

get_int64_ne b i is b's native-endian 64-bit integer starting at character index i.

  • since 4.13
Sourceval get_int64_be : string -> int -> int64

get_int64_be b i is b's big-endian 64-bit integer starting at character index i.

  • since 4.13
Sourceval get_int64_le : string -> int -> int64

get_int64_le b i is b's little-endian 64-bit integer starting at character index i.

  • since 4.13
Sourceval equal : t -> t -> bool
Sourceval compare : t -> t -> Ordering.t
Sourceval hash : t -> int
Sourceval to_dyn : t -> Dyn.t
Sourceval break : t -> pos:int -> t * t
Sourceval is_empty : t -> bool
Sourceval of_list : char list -> t
Sourceval is_prefix : t -> prefix:t -> bool
Sourceval is_suffix : t -> suffix:t -> bool
Sourceval take : t -> int -> t
Sourceval drop : t -> int -> t
Sourceval split_n : t -> int -> t * t
Sourceval drop_prefix : t -> prefix:t -> t option
Sourceval drop_suffix : t -> suffix:t -> t option
Sourceval capitalize : t -> t

These only change ASCII characters

Sourceval uncapitalize : t -> t
Sourceval uppercase : t -> t
Sourceval lowercase : t -> t
Sourceval index : t -> char -> int option
Sourceval index_from : t -> int -> char -> int option
Sourceval rindex : t -> char -> int option
Sourceval rindex_from : t -> int -> char -> int option
Sourceval extract_words : t -> is_word_char:(char -> bool) -> t list
Sourceval extract_comma_space_separated_words : t -> t list
Sourceval extract_blank_separated_words : t -> t list
Sourceval lsplit2 : t -> on:char -> (t * t) option
Sourceval lsplit2_exn : t -> on:char -> t * t
Sourceval rsplit2 : t -> on:char -> (t * t) option
Sourceval split : t -> on:char -> t list
Sourceval split_lines : t -> t list
Sourceval escape_only : char -> t -> t

Escape ONLY one character. escaped also escapes '\n',... and transforms all chars above '~' into '\xxx' which is not suitable for UTF-8 strings.

Sourceval longest : string list -> int

Return the length of the longest string in the list

Sourceval longest_map : 'a list -> f:('a -> string) -> int
Sourceval longest_prefix : t list -> t
Sourceval exists : t -> f:(char -> bool) -> bool
Sourceval for_all : t -> f:(char -> bool) -> bool
Sourceval maybe_quoted : t -> t

maybe_quoted s is s if s doesn't need escaping according to OCaml lexing conventions and sprintf "%S" s otherwise.

(* CR-someday aalekseyev: this function is not great: barely anything "needs escaping according to OCaml lexing conventions", so the condition for whether to add the quote characters ends up being quite arbitrary. *)

Sourceval quoted : t -> t
Sourceval enumerate_and : string list -> string

Produces: "x, y and z"

Sourceval enumerate_or : string list -> string

Produces: "x, y or z"

Sourceval enumerate_one_of : t list -> t

Produces: "One of x, y or z"

Sourceval findi : ?from:int -> string -> f:(char -> bool) -> int option

Find index of first character satisfying f

Sourceval rfindi : ?from:int -> string -> f:(char -> bool) -> int option

Find index of last character satisfying f

Sourcemodule Set : sig ... end
Sourcemodule Map : sig ... end
Sourcemodule Table : Hashtbl.S with type key = t
Sourceval need_quoting : string -> bool

Whether the string needs quoting if it is part of a shell command

Sourceval quote_for_shell : string -> string

quote_for_shell s quotes s using Filename.quote if need_quoting s is true

Sourceval filter_map : string -> f:(char -> char option) -> string