Source file codegen.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
(* MIT License
 *
 * Copyright (c) 2025 Frédéric Bour
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *)

(** Code generation for the LR matching machine

    This module generates the OCaml code for the LR matching machine from the
    compiled specification and the generated automaton.
    The "machine" is translated to a sparse transition table, a bytecode
    program. An OCaml wrapper is generated to interpret the bytecode and invoke
    the appropriate semantic actions.

    Main components:

    - The [spec] type holds global configuration including the parser name
      and lexer definition.

    - The [output_header], [output_trailer] functions generate the outer
      module structure wrapping the generated code.

    - The [output_rule] function is the core code generation function that:
      - Outputs the bytecode and transition tables for the abstract machine
      - Outputs semantic actions for each branch
      - Outputs wrapper functions glueing the interpreter to user actions

    Implementation details:

    - The [output_table] function formats and outputs the already-compacted
      bytecode and transition table as an OCaml [Lrgrep_runtime.program] record.
      The actual compaction is performed by [Lrgrep_support.compact] inside
      [output_rule] before [output_table] is called.

    - The [output_execute_function] generates a case analysis matching on:
      - The clause number
      - The lookahead token
      - For each branch that accepts the clause, outputs a case that:
        1. Binds registers to captured variables
        2. Executes the semantic action
        3. Returns the result

    - The [bind_capture] function handles different capture types:
      - [Value]: Regular captured values with full type information
      - [Start_loc], [End_loc]: Start/end position captures
      - It handles optional vs required captures based on whether they
        can be undefined

    - The [lookahead_constraint] function generates pattern matching for
      branches with lookahead constraints, ensuring only the right tokens
      trigger each branch.

    - The [output_rule] function proceeds in three steps:
      1. Compacts the state machine via [Lrgrep_support.compact] and outputs
         the bytecode and transition tables
      2. Generates semantic actions with proper variable binding, including
         type recovery for captured values
      3. Outputs a wrapper function to glue the interpreter to user actions
*)

open Fix.Indexing
open Utils
open Misc
open Info
open Spec

type priority = int

type spec = {
  parser_name : string;
  lexer_definition : Syntax.lexer_definition;
}

let print_literal_code cp (loc, txt) =
  if txt <> "" then
    Code_printer.print cp ~loc txt

let output_table (type g r) out rule (machine : (g, r, _, _) Automata.Machine.t)
    (program, table, remap) =
  let print fmt = Code_printer.fmt out fmt in
  print "let lrgrep_program_%s : Lrgrep_runtime.program = {\n"
    rule.Syntax.name;
  print "  registers = %d;\n" machine.register_count;
  print "  initial = %d;\n" (
    match machine.initial with
    | None -> 0
    | Some i -> remap.((i : _ index :> int))
  );
  print "  table = %s;\n" (fst (Lrgrep_support_packer.encode table));
  print "  code = %S;\n" program;
  print "}\n"

let output_wrapper out {Syntax.name; args; _} =
  let args = String.concat " " args in
  Code_printer.fmt out
    "let %s %s _lrgrep_env _lrgrep_lookahead = (\n\
    \  List.find_map\n\
    \    (fun m -> lrgrep_execute_%s %s m _lrgrep_lookahead)\n\
    \    (lrgrep_run lrgrep_program_%s _lrgrep_env)\n\
     )\n"
    name args
    name args
    name

type printer = Code_printer.t option -> unit

let grammar_parameters g =
  let (module Raw) = raw g in
  Raw.Grammar.parameters

let output_header (type g) (g : g grammar) spec : printer = function
  | None -> ()
  | Some out ->
    begin match grammar_parameters g with
      | [] -> ()
      | parameters ->
        Code_printer.print out "module Make";
        List.iter (Code_printer.fmt out "(%s)") parameters;
        Code_printer.fmt out
          "(%s : module type of %s.Make" spec.parser_name spec.parser_name;
        let extract_name name =
          match String.index_opt name ':' with
          | None -> name
          | Some index -> String.sub name 0 index
        in
        List.iter

          (fun param -> Code_printer.fmt out "(%s)" (extract_name param))
          parameters;
        Code_printer.print out ") = struct\n";
    end;
    print_literal_code out spec.lexer_definition.header;
    Code_printer.fmt out
      "include Lrgrep_runtime.Interpreter(%s.MenhirInterpreter)\n"
      spec.parser_name

let output_trailer (type g) (g : g grammar) spec : printer = function
  | None -> ()
  | Some out ->
    Code_printer.print out "\n";
    print_literal_code out spec.lexer_definition.Syntax.trailer;
    match grammar_parameters g with
    | [] -> ()
    | _ -> Code_printer.print out "\nend\n"

let output_rule (type g r) (g : g grammar) {parser_name; _} (rule : Syntax.rule)
    clauses (branches : (g, r) branches) (machine : (g, r, _, _) Automata.Machine.t) : printer =
  function
  | None -> ()
  | Some out ->
    (* Step 1: output bytecode and transition tables *)
    let get_state_for_compaction index =
      let add_match (clause, priority, regs) =
        let cap = branches.br_captures.:(clause) in
        let registers =
          let add_reg cap acc =
            let reg = IndexMap.find_opt cap regs in
            (reg : _ index option :> int option) :: acc
          in
          Array.of_list (List.rev (IndexSet.fold add_reg cap []))
        in
        (clause, priority, registers)
      in
      let add_transition tr acc =
        let label = machine.label.:(tr) in
        let actions = {
          Lrgrep_support.
          move = IndexMap.bindings label.moves;
          store = List.map snd label.captures;
          clear = IndexSet.elements label.clear;
          target = machine.target.:(tr);
          priority = label.priority;
        } in
        (label.filter, actions) :: acc
      in
      {
        Lrgrep_support.
        accept = List.map add_match machine.accepting.:(index);
        halting = machine.unhandled.:(index);
        transitions = IndexSet.fold add_transition machine.outgoing.:(index) [];
      }
    in
    let program = Lrgrep_support.compact (Automata.Machine.states machine) get_state_for_compaction in
    output_table out rule machine program;
    (* Step 2: output semantic actions *)
    let captures_lr1 =
      let map = ref IndexMap.empty in
      let process_transitions (label : _ Automata.Machine.label) =
        map := List.fold_left (fun map (cap, _reg) ->
            IndexMap.update cap (Misc.union_update label.filter) map
          ) !map label.captures
      in
      Vector.iter process_transitions machine.label;
      !map
    in
    let recover_type index =
      try
        let lr1s = IndexMap.find index captures_lr1 in
        let symbols = IndexSet.map (fun lr1 ->
            match Lr1.incoming g lr1 with
            | None -> raise Not_found
            | Some sym -> sym
          ) lr1s
        in
        let typ = IndexSet.fold (fun sym acc ->
            let typ = match Symbol.semantic_value g sym with
              | None -> raise Not_found
              | Some typ -> String.trim typ
            in
            match acc with
            | None -> Some typ
            | Some typ' ->
              if typ <> typ' then raise Not_found;
              acc
          ) symbols None
        in
        match typ with
        | None -> None
        | Some typ -> Some (symbols, typ)
      with Not_found -> None
    in
    let symbol_matcher s =
      (if Symbol.is_terminal g s then "T T_" else "N N_") ^
      Symbol.to_string g ~mangled:true s
    in
    let bind_capture out ~offset index (def, name, (_startpos, _endpos, positions)) =
      (* FIXME: variables should be introduced only if the relevant keyword appear in the action *)
      let is_optional = IndexSet.mem index machine.partial_captures in
      let none = if is_optional then "None" else "assert false" in
      let some x = if is_optional then "Some (" ^ x ^ ")" else x in
      match def with
      | Syntax.Value ->
        let typ = recover_type index in
        Code_printer.fmt out
          "    let %s, _startpos_%s_, _endpos_%s_, _positions_%s_ = match __registers.(%d) with \n\
          \      | Empty -> %s\n\
          \      | Location _ -> assert false\n\
          \      | Value (%s.MenhirInterpreter.Element (%s, %s, startp, endp)%s) ->\n"
          name name name name offset
          (if is_optional then "(None, None, None, None)" else "assert false")
          parser_name
          (if Option.is_none typ then "_" else "st")
          (if Option.is_none typ then "_" else "x")
          (if Option.is_none typ then "as x" else "");
        begin match typ with
          | None -> ()
          | Some (symbols, typ) ->
            Code_printer.fmt out
              "        let x = match %s.MenhirInterpreter.incoming_symbol st with\n"
              parser_name;
            List.iter (fun symbol ->
                Code_printer.fmt out "          | %s -> (x : %s)\n"
                  (symbol_matcher symbol) typ) (IndexSet.elements symbols);
            Code_printer.fmt out
              "          | _ -> assert false\n\
              \        in\n"
        end;
        positions := false;
        Code_printer.fmt out "        (%s, %s, %s, %s)\n"
          (some "x") (some "startp") (some "endp") (some "(startp, endp)");
        Code_printer.fmt out "    in\n";
        Code_printer.fmt out "    let _ = %s in\n" name
      | Start_loc ->
        Code_printer.fmt out
          "    let _startpos_%s_ = match __registers.(%d) with\n\
          \      | Empty -> %s\n\
          \      | Location (p, _) | Value (%s.MenhirInterpreter.Element (_, _, p, _)) -> %s\n\
          \    in\n"
          name offset
          none
          parser_name (some "p")
      | End_loc ->
        Code_printer.fmt out
          "    let _endpos_%s_ = match __registers.(%d) with\n\
          \      | Empty -> %s\n\
          \      | Location (_, p) | Value (%s.MenhirInterpreter.Element (_, _, _, p)) -> %s\n\
          \    in\n"
          name offset
          none
          parser_name (some "p")
    in
    let lookahead_constraint branch =
      match branches.lookaheads.:(branch) with
      | None -> None
      | Some terms ->
        let term_pattern t =
          Terminal.to_string g t ^
          match Terminal.semantic_value g t with
          | None -> ""
          | Some _ -> " _"
        in
        Some (string_concat_map ~wrap:("(",")") "|"
                term_pattern (IndexSet.elements terms))
    in
    let output_execute_function out =
      Code_printer.fmt out
        "let lrgrep_execute_%s %s\n\
        \  (__clause, (__registers : %s.MenhirInterpreter.element Lrgrep_runtime.register_values))\n\
        \  ((token : %s.MenhirInterpreter.token), _startloc_token_, _endloc_token_)\n\
        \  : _ option = match __clause, token with\n"
        rule.name (String.concat " " rule.args)
        parser_name parser_name;
      let output_clause_branches clause brs =
        let captures = clauses.captures.:(clause) in
        Code_printer.fmt out " ";
        (* Identify branches that lead to this action *)
        IndexSet.iter (fun branch ->
            Code_printer.fmt out
              " | %d, %s"
              (Index.to_int branch)
              (Option.value (lookahead_constraint branch) ~default:"_");
          ) brs;
        Code_printer.fmt out " ->\n";
        let vars = Hashtbl.create 7 in
        let captures =
          IndexMap.map begin fun (kind, var) ->
            let refs = match Hashtbl.find_opt vars var with
              | Some refs -> refs
              | None ->
                let refs = (ref false, ref false, ref false) in
                Hashtbl.add vars var refs;
                refs
            in
            (kind, var, refs)
          end captures
        in
        let body =
          match clauses.definitions.:(clause).syntax.action with
          | Unreachable _ -> ""
          | Partial (loc, str) | Total (loc, str) ->
            Misc.rewrite_keywords begin fun pos kw var ->
              match kw with
              (* FIXME Report an error message rather than a failure *)
              | "$startloc" -> Syntax.error pos "$startloc is now called $startpos"
              | "$endloc" ->  Syntax.error pos "$endloc is now called $endpos"
              | "$startpos" | "$endpos" | "$positions" ->
                (* FIXME Check if variable exists *)
                begin match Hashtbl.find_opt vars var with
                  | None -> Syntax.error pos "undefined variable %s" var
                  | Some (rstart, rend, rpos) ->
                    match kw with
                    | "$startpos" -> rstart := true
                    | "$endpos" -> rend := true
                    | "$positions" -> rpos := true
                    | _ -> ()
                end;
                true
              | kw ->
                Syntax.error pos "unknown keyword %S; did you mean $startpos, $endpos or $positions?" kw
            end loc str
        in
        let offset = ref 0 in
        IndexMap.iter
          (fun k v -> bind_capture out ~offset:!offset k v; incr offset)
          captures;
        IndexMap.iter (fun index (_, var, (_, _, positions)) ->
            if !positions then (
              let is_optional = IndexSet.mem index machine.partial_captures in
              if is_optional then
                Code_printer.fmt out
                  "    let _positions_%s_ = match _startpos_%s_, _endpos_%s_ with\n\
                  \      | Some s, Some e -> Some (s, e)\n\
                  \      | _ -> None in\n"
                  var var var
              else
                Code_printer.fmt out
                  "    let _positions_%s_ = (_startpos_%s_, _endpos_%s_) in\n"
                  var var var
            )
          ) captures;
        begin
          match clauses.definitions.:(clause).syntax.action with
          | Unreachable _ ->
            Code_printer.print out "    failwith \"Should be unreachable\"\n"
          | Partial (loc, _) ->
            Code_printer.print out "    (\n";
            Code_printer.fmt out ~loc "%s\n" body;
            Code_printer.print out "    )\n"
          | Total (loc, _) ->
            Code_printer.print out "    Some (\n";
            Code_printer.fmt out ~loc "%s\n" body;
            Code_printer.print out "    )\n"
        end;
        let constrained =
          IndexSet.filter
            (fun branch -> Option.is_some branches.lookaheads.:(branch))
            brs
        in
        if IndexSet.is_not_empty constrained then
          Code_printer.fmt out "  | (%s), _ -> None\n"
            (string_concat_map "|" string_of_index (IndexSet.elements constrained))
      in
      Vector.iteri output_clause_branches branches.of_clause;
      Code_printer.print out "  | _ -> failwith \"Invalid action (internal error or API misuse)\"\n\n"
    in
    output_execute_function out;
    (* Step 3: wrapper to glue interpreter and user actions *)
    output_wrapper out rule