(** Parsing state for a POSIX shell-like word splitter.

    The state machine closely follows the POSIX shell word parsing rules.
    Expansions are disabled except for quote removal and escape handling. *)
type state =
  | Delimiter  (** Between words; currently skipping delimiters. *)
  | Backslash
      (** A backslash was seen outside a word; the next character determines
          whether this begins a word or is ignored (line continuation). *)
  | Unquoted  (** Inside an unquoted word. *)
  | UnquotedBackslash  (** A backslash was seen inside an unquoted word. *)
  | SingleQuoted  (** Inside a single-quoted word. *)
  | DoubleQuoted  (** Inside a double-quoted word. *)
  | DoubleQuotedBackslash
      (** A backslash was seen inside a double-quoted word. *)
  | Comment  (** Inside a comment (everything until newline is ignored). *)

type acc = {
  state : state;
  input : char list;
  words : string list;
  word : char list;
  in_word : bool;
}
(** Accumulator threaded through the parser.

    Invariants:
    - [word] holds the current word being built, in reverse order.
    - [words] holds completed words, in reverse order.
    - [in_word] is true if a word has been started, even if it is empty
      (e.g. from an empty pair of double or single quotes).
    - [input] holds the remaining characters of the input, not including what
      has already been consumed. *)

(** Consume the next character from the input, if any.

    Returns [(None, acc)] with [acc] unchanged when the input is exhausted,
    otherwise [(Some c, acc')] where [acc'] has [c] removed from [input]. *)
let next_char : acc -> char option * acc = function
  | { input = []; _ } as acc -> (None, acc)
  | { input = c :: rest; _ } as acc -> (Some c, { acc with input = rest })

(** Emit the current word into [words], if a word is in progress.

    A word is emitted if either:
    - characters have been accumulated in [word], or
    - [in_word] is true (handles empty quoted words).

    After emission [word] is empty and [in_word] is false. When no word is in
    progress the accumulator is returned unchanged. *)
let emit_word acc =
  if acc.word = [] && not acc.in_word then acc
  else
    (* [word] is stored reversed, so flip it before building the string. *)
    let w = acc.word |> List.rev |> List.to_seq |> String.of_seq in
    { acc with word = []; words = w :: acc.words; in_word = false }

(** Append a character to the current word buffer. *)
let push_char c acc = { acc with word = c :: acc.word }

(** Update the current parser state. *)
let with_state state acc = { acc with state }

(** Mark that a word has been started.

    This is to distinguish the absence of a word from an empty word. *)
let enter_word acc = { acc with in_word = true }

(** Finalize parsing once input is exhausted and return the completed words
    in input order.

    Handles:
    - unterminated quotes (error), including a pending backslash inside a
      double-quoted word
    - trailing backslashes, which are kept as a literal backslash
    - final word emission

    Raises [Failure] if [acc.state] is inside a quoted word. *)
let finish acc =
  match acc.state with
  | SingleQuoted | DoubleQuoted | DoubleQuotedBackslash ->
    (* [DoubleQuotedBackslash] is still inside an open double quote, so it
       is an unterminated quote just like [DoubleQuoted]. *)
    failwith "Missing closing quote"
  | Backslash | UnquotedBackslash ->
    (* Nothing follows the backslash: keep it as a literal character. *)
    let acc = push_char '\\' acc |> emit_word in
    List.rev acc.words
  | Delimiter | Unquoted | Comment -> List.rev (emit_word acc).words

(** Split a command line into words using POSIX shell parsing rules, limited to
    quote removal and escape handling.

    Raises [Failure] on unterminated quotes. *)
let split (s : string) : string list =
  let rec next acc =
    (* [c] is the character being dispatched; [acc] is rebound to the
       accumulator with that character already consumed. *)
    let c, acc = next_char acc in
    (* Append [c] to the current word and stay in the current state. *)
    let push_and_next c = acc |> push_char c |> enter_word |> next in
    (* Append [c] to the current word and switch to [state]. *)
    let push_and_enter c state =
      acc |> push_char c |> with_state state |> enter_word |> next
    in
    (* Switch to [state] without touching the current word. *)
    let enter_state state = acc |> with_state state |> next in
    (* Switch to [state] and mark a word as started, so that an empty quoted
       word is still emitted. *)
    let enter_word_state state =
      acc |> with_state state |> enter_word |> next
    in
    match acc.state with
    | Delimiter -> begin
        match c with
        | None -> finish acc
        | Some '\'' -> enter_word_state SingleQuoted
        | Some '"' -> enter_word_state DoubleQuoted
        | Some '\\' -> enter_state Backslash
        | Some '\t' | Some ' ' | Some '\n' -> next acc
        | Some '#' -> enter_state Comment
        | Some c -> push_and_enter c Unquoted
      end
    | Backslash -> begin
        match c with
        | None -> finish acc
        (* Backslash-newline between words is a line continuation. *)
        | Some '\n' -> enter_state Delimiter
        | Some c -> push_and_enter c Unquoted
      end
    | Unquoted -> begin
        match c with
        (* [finish] emits the pending word itself. *)
        | None -> finish acc
        | Some '\'' -> enter_word_state SingleQuoted
        | Some '"' -> enter_word_state DoubleQuoted
        | Some '\\' -> enter_state UnquotedBackslash
        | Some '\t' | Some ' ' | Some '\n' ->
          acc |> emit_word |> with_state Delimiter |> next
        | Some c -> push_and_next c
      end
    | UnquotedBackslash -> begin
        match c with
        | None -> finish acc
        (* Backslash-newline inside a word is dropped; the word continues. *)
        | Some '\n' -> enter_state Unquoted
        | Some c -> push_and_enter c Unquoted
      end
    | SingleQuoted -> begin
        match c with
        | None -> failwith "unterminated single quoted string"
        | Some '\'' -> enter_state Unquoted
        | Some c -> push_and_next c
      end
    | DoubleQuoted -> begin
        match c with
        | None -> failwith "unterminated double quoted string"
        | Some '"' -> enter_state Unquoted
        | Some '\\' -> enter_word_state DoubleQuotedBackslash
        | Some c -> push_and_next c
      end
    | DoubleQuotedBackslash -> begin
        match c with
        | None -> failwith "parse error"
        | Some '\n' -> enter_word_state DoubleQuoted
        (* Only these characters are escaped by a backslash here; the
           backslash is removed and the character kept. *)
        | Some (('$' | '`' | '"' | '\\') as c) -> push_and_enter c DoubleQuoted
        (* Any other character keeps the backslash literally. *)
        | Some c ->
          acc |> push_char '\\' |> push_char c |> with_state DoubleQuoted
          |> enter_word |> next
      end
    | Comment -> begin
        match c with
        | None -> finish acc
        | Some '\n' -> enter_state Delimiter
        | Some _ -> enter_state Comment
      end
  in
  let input = s |> String.to_seq |> List.of_seq in
  next { state = Delimiter; input; words = []; word = []; in_word = false }