Source file split.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
(** Parsing state for a POSIX shell-like word splitter.

    The state machine closely follows the POSIX shell word parsing rules.
    Expansions are disabled except for quote removal and escape handling. *)
type state =
  | Delimiter  (** Between words; currently skipping delimiters. *)
  | Backslash
      (** A backslash was seen outside a word; the next character determines
          whether this begins a word or is ignored (line continuation). *)
  | Unquoted  (** Inside an unquoted word. *)
  | UnquotedBackslash  (** A backslash was seen inside an unquoted word. *)
  | SingleQuoted  (** Inside a single-quoted word. *)
  | DoubleQuoted  (** Inside a double-quoted word. *)
  | DoubleQuotedBackslash
      (** A backslash was seen inside a double-quoted word. *)
  | Comment  (** Inside a comment (everything until newline is ignored). *)

type acc = {
  state : state;
  input : char list;
  words : string list;
  word : char list;
  in_word : bool;
}
(** Accumulator threaded through the parser.

    Invariants:
    - [word] holds the current word being built, in reverss order.
    - [words] holds completed words, in reverse order.
    - [in_word] is true if a word has been started, even if it is empty. (e.g.
      from "" or '').
    - [input] holds the remaining characters of the input, not including what
      has already been consumed. *)

(** Consume the next character from the input, if any. *)
let next_char =
  (function
   | { input = []; _ } as acc -> (None, acc)
   | { input = c :: rest; _ } as acc -> (Some c, { acc with input = rest })
    : acc -> char option * acc)

(** Emit the current word into [words], if a word is in progress.

    A word is emitted if either:
    - characters have been accumulated in [word], or
    - [in_word] is true (handles empty quoted words) *)
let emit_word acc =
  if acc.word = [] && not acc.in_word then acc
  else
    let w = acc.word |> List.rev |> List.to_seq |> String.of_seq in
    { acc with word = []; words = w :: acc.words; in_word = false }

(** Append a character to the current word buffer. *)
let push_char c acc = { acc with word = c :: acc.word }

(** Update the current parser state. *)
let with_state state acc = { acc with state }

(** Mark that a word has been started.

    This is to distinguish "no word" from "empty word". *)
let enter_word acc = { acc with in_word = true }

(** Finalize parsing once input is exhausted.

    Handles:
    - unterminated quotes (error)
    - trailing backslashes
    - final word emission *)
let finish acc =
  match acc.state with
  | SingleQuoted | DoubleQuoted -> failwith "Missing closing quote"
  | Backslash | UnquotedBackslash ->
      let acc = push_char '\\' acc |> emit_word in
      List.rev acc.words
  | _ -> acc |> emit_word |> fun a -> List.rev a.words

(** Split a command line into words using POSIX shell parsing rules, limited to
    quote removal and escape handling.

    Raises [Failure] on unterminated quotes. *)
let split (s : string) : string list =
  let rec next acc =
    let c, acc = next_char acc in
    let push_and_next c = acc |> push_char c |> enter_word |> next in
    let push_and_enter c state =
      acc |> push_char c |> with_state state |> enter_word |> next
    in
    let enter_state state = acc |> with_state state |> next in
    let enter_word_state state =
      acc |> with_state state |> enter_word |> next
    in
    match acc.state with
    | Delimiter -> begin
        match c with
        | None -> finish acc
        | Some '\'' -> enter_word_state SingleQuoted
        | Some '\"' -> enter_word_state DoubleQuoted
        | Some '\\' -> enter_state Backslash
        | Some '\t' | Some ' ' | Some '\n' -> next acc
        | Some '#' -> enter_state Comment
        | Some c -> push_and_enter c Unquoted
      end
    | Backslash -> begin
        match c with
        | None -> finish acc
        | Some '\n' -> enter_state Delimiter
        | Some c -> push_and_enter c Unquoted
      end
    | Unquoted -> begin
        match c with
        | None -> emit_word acc |> finish
        | Some '\'' -> enter_word_state SingleQuoted
        | Some '\"' -> enter_word_state DoubleQuoted
        | Some '\\' -> enter_state UnquotedBackslash
        | Some '\t' | Some ' ' | Some '\n' ->
            acc |> emit_word |> with_state Delimiter |> next
        | Some c -> push_and_next c
      end
    | UnquotedBackslash -> begin
        match c with
        | None -> acc |> finish
        | Some '\n' -> enter_state Unquoted
        | Some c -> push_and_enter c Unquoted
      end
    | SingleQuoted -> begin
        match c with
        | None -> failwith "unterminated single quoted string"
        | Some '\'' -> enter_state Unquoted
        | Some c -> push_and_next c
      end
    | DoubleQuoted -> begin
        match c with
        | None -> failwith "unterminated double quoted string"
        | Some '\"' -> enter_state Unquoted
        | Some '\\' -> enter_word_state DoubleQuotedBackslash
        | Some c -> push_and_next c
      end
    | DoubleQuotedBackslash -> begin
        match c with
        | None -> failwith "parse error"
        | Some '\n' -> enter_word_state DoubleQuoted
        | Some (('$' | '`' | '"' | '\\') as c) -> push_and_enter c DoubleQuoted
        | Some c ->
            acc |> push_char '\\' |> push_char c |> with_state DoubleQuoted
            |> enter_word |> next
      end
    | Comment -> begin
        match c with
        | None -> finish acc
        | Some '\n' -> enter_state Delimiter
        | Some _ -> enter_state Comment
      end
  in
  let input = s |> String.to_seq |> List.of_seq in
  next { state = Delimiter; input; words = []; word = []; in_word = false }