123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415(* MIT License
*
* Copyright (c) 2025 Frédéric Bour
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*)(** Code generation for the LR matching machine
This module generates the OCaml code for the LR matching machine from the
compiled specification and the generated automaton.
The "machine" is translated to a sparse transition table, a bytecode
program. An OCaml wrapper is generated to interpret the bytecode and invoke
the appropriate semantic actions.
Main components:
- The [spec] type holds global configuration including the parser name
and lexer definition.
- The [output_header], [output_trailer] functions generate the outer
module structure wrapping the generated code.
- The [output_rule] function is the core code generation function that:
- Outputs the bytecode and transition tables for the abstract machine
- Outputs semantic actions for each branch
- Outputs wrapper functions glueing the interpreter to user actions
Implementation details:
- The [output_table] function formats and outputs the already-compacted
bytecode and transition table as an OCaml [Lrgrep_runtime.program] record.
The actual compaction is performed by [Lrgrep_support.compact] inside
[output_rule] before [output_table] is called.
- The [output_execute_function] generates a case analysis matching on:
- The clause number
- The lookahead token
- For each branch that accepts the clause, outputs a case that:
1. Binds registers to captured variables
2. Executes the semantic action
3. Returns the result
- The [bind_capture] function handles different capture types:
- [Value]: Regular captured values with full type information
- [Start_loc], [End_loc]: Start/end position captures
- It handles optional vs required captures based on whether they
can be undefined
- The [lookahead_constraint] function generates pattern matching for
branches with lookahead constraints, ensuring only the right tokens
trigger each branch.
- The [output_rule] function proceeds in three steps:
1. Compacts the state machine via [Lrgrep_support.compact] and outputs
the bytecode and transition tables
2. Generates semantic actions with proper variable binding, including
type recovery for captured values
3. Outputs a wrapper function to glue the interpreter to user actions
*)openFix.IndexingopenUtilsopenMiscopenInfoopenSpectypepriority=inttypespec={parser_name:string;lexer_definition:Syntax.lexer_definition;}letprint_literal_codecp(loc,txt)=iftxt<>""thenCode_printer.printcp~loctxtletoutput_table(typegr)outrule(machine:(g,r,_,_)Automata.Machine.t)(program,table,remap)=letprintfmt=Code_printer.fmtoutfmtinprint"let lrgrep_program_%s : Lrgrep_runtime.program = {\n"rule.Syntax.name;print" registers = %d;\n"machine.register_count;print" initial = %d;\n"(matchmachine.initialwith|None->0|Somei->remap.((i:_index:>int)));print" table = %s;\n"(fst(Lrgrep_support_packer.encodetable));print" code = %S;\n"program;print"}\n"letoutput_wrapperout{Syntax.name;args;_}=letargs=String.concat" "argsinCode_printer.fmtout"let %s %s _lrgrep_env _lrgrep_lookahead = (\n\
\ List.find_map\n\
\ (fun m -> lrgrep_execute_%s %s m _lrgrep_lookahead)\n\
\ (lrgrep_run lrgrep_program_%s _lrgrep_env)\n\
)\n"nameargsnameargsnametypeprinter=Code_printer.toption->unitletgrammar_parametersg=let(moduleRaw)=rawginRaw.Grammar.parametersletoutput_header(typeg)(g:ggrammar)spec:printer=function|None->()|Someout->beginmatchgrammar_parametersgwith|[]->()|parameters->Code_printer.printout"module Make";List.iter(Code_printer.fmtout"(%s)")parameters;Code_printer.fmtout"(%s : module type of %s.Make"spec.parser_namespec.parser_name;letextract_namename=matchString.index_optname':'with|None->name|Someindex->String.subname0indexinList.iter(funparam->Code_printer.fmtout"(%s)"(extract_nameparam))parameters;Code_printer.printout") = struct\n";end;print_literal_codeoutspec.lexer_definition.header;Code_printer.fmtout"include Lrgrep_runtime.Interpreter(%s.MenhirInterpreter)\n"spec.parser_nameletoutput_trailer(typeg)(g:ggrammar)spec:printer=function|None->()|Someout->Code_printer.printout"\n";print_literal_codeoutspec.lexer_definition.Syntax.trailer;matchgrammar_parametersgwith|[]->()|_->Code_printer.printout"\nend\n"letoutput_rule(typegr)(g:ggrammar){parser_name;_}(rule:Syntax.rule)clauses(branches:(g,r)branches)(machine:(g,r,_,_)Automata.Machine.t):printer=function|None->()|Someout->(* Step 1: output bytecode and transition tables *)letget_state_for_compactionindex=letadd_match(clause,priority,regs)=letcap=branches.br_captures.:(clause)inletregisters=letadd_regcapacc=letreg=IndexMap.find_optcapregsin(reg:_indexoption:>intoption)::accinArray.of_list(List.rev(IndexSet.foldadd_regcap[]))in(clause,priority,registers)inletadd_transitiontracc=letlabel=machine.label.:(tr)inletactions={Lrgrep_support.move=IndexMap.bindingslabel.moves;store=List.mapsndlabel.captures;clear=IndexSet.elementslabel.clear;target=machine.target.:(tr);priority=label.priority;}in(label.filter,actions)::accin{Lrgrep_support.accept=List.mapadd_matchmachine.accepting.:(index);halting=machine.unhandled.:(index);transitions=IndexSet.foldadd_transitionmachine.outgoing.:(index)[];}inletprogram=Lrgrep_support.compact(Automata.Machine.statesmachine)get_state_for_compactioninoutput_tableoutrulemachineprogram;(* Step 2: output semantic actions *)letcaptures_lr1=letmap=refIndexMap.emptyinletprocess_transitions(label:_Automata.Machine.label)=map:=List.fold_left(funmap(cap,_reg)->IndexMap.updatecap(Misc.union_updatelabel.filter)map)!maplabel.capturesinVector.iterprocess_transitionsmachine.label;!mapinletrecover_typeindex=tryletlr1s=IndexMap.findindexcaptures_lr1inletsymbols=IndexSet.map(funlr1->matchLr1.incomingglr1with|None->raiseNot_found|Somesym->sym)lr1sinlettyp=IndexSet.fold(funsymacc->lettyp=matchSymbol.semantic_valuegsymwith|None->raiseNot_found|Sometyp->String.trimtypinmatchaccwith|None->Sometyp|Sometyp'->iftyp<>typ'thenraiseNot_found;acc)symbolsNoneinmatchtypwith|None->None|Sometyp->Some(symbols,typ)withNot_found->Noneinletsymbol_matchers=(ifSymbol.is_terminalgsthen"T T_"else"N N_")^Symbol.to_stringg~mangled:truesinletbind_captureout~offsetindex(def,name,(_startpos,_endpos,positions))=(* FIXME: variables should be introduced only if the relevant keyword appear in the action *)letis_optional=IndexSet.memindexmachine.partial_capturesinletnone=ifis_optionalthen"None"else"assert false"inletsomex=ifis_optionalthen"Some ("^x^")"elsexinmatchdefwith|Syntax.Value->lettyp=recover_typeindexinCode_printer.fmtout" let %s, _startpos_%s_, _endpos_%s_, _positions_%s_ = match __registers.(%d) with \n\
\ | Empty -> %s\n\
\ | Location _ -> assert false\n\
\ | Value (%s.MenhirInterpreter.Element (%s, %s, startp, endp)%s) ->\n"namenamenamenameoffset(ifis_optionalthen"(None, None, None, None)"else"assert false")parser_name(ifOption.is_nonetypthen"_"else"st")(ifOption.is_nonetypthen"_"else"x")(ifOption.is_nonetypthen"as x"else"");beginmatchtypwith|None->()|Some(symbols,typ)->Code_printer.fmtout" let x = match %s.MenhirInterpreter.incoming_symbol st with\n"parser_name;List.iter(funsymbol->Code_printer.fmtout" | %s -> (x : %s)\n"(symbol_matchersymbol)typ)(IndexSet.elementssymbols);Code_printer.fmtout" | _ -> assert false\n\
\ in\n"end;positions:=false;Code_printer.fmtout" (%s, %s, %s, %s)\n"(some"x")(some"startp")(some"endp")(some"(startp, endp)");Code_printer.fmtout" in\n";Code_printer.fmtout" let _ = %s in\n"name|Start_loc->Code_printer.fmtout" let _startpos_%s_ = match __registers.(%d) with\n\
\ | Empty -> %s\n\
\ | Location (p, _) | Value (%s.MenhirInterpreter.Element (_, _, p, _)) -> %s\n\
\ in\n"nameoffsetnoneparser_name(some"p")|End_loc->Code_printer.fmtout" let _endpos_%s_ = match __registers.(%d) with\n\
\ | Empty -> %s\n\
\ | Location (_, p) | Value (%s.MenhirInterpreter.Element (_, _, _, p)) -> %s\n\
\ in\n"nameoffsetnoneparser_name(some"p")inletlookahead_constraintbranch=matchbranches.lookaheads.:(branch)with|None->None|Someterms->letterm_patternt=Terminal.to_stringgt^matchTerminal.semantic_valuegtwith|None->""|Some_->" _"inSome(string_concat_map~wrap:("(",")")"|"term_pattern(IndexSet.elementsterms))inletoutput_execute_functionout=Code_printer.fmtout"let lrgrep_execute_%s %s\n\
\ (__clause, (__registers : %s.MenhirInterpreter.element Lrgrep_runtime.register_values))\n\
\ ((token : %s.MenhirInterpreter.token), _startloc_token_, _endloc_token_)\n\
\ : _ option = match __clause, token with\n"rule.name(String.concat" "rule.args)parser_nameparser_name;letoutput_clause_branchesclausebrs=letcaptures=clauses.captures.:(clause)inCode_printer.fmtout" ";(* Identify branches that lead to this action *)IndexSet.iter(funbranch->Code_printer.fmtout" | %d, %s"(Index.to_intbranch)(Option.value(lookahead_constraintbranch)~default:"_");)brs;Code_printer.fmtout" ->\n";letvars=Hashtbl.create7inletcaptures=IndexMap.mapbeginfun(kind,var)->letrefs=matchHashtbl.find_optvarsvarwith|Somerefs->refs|None->letrefs=(reffalse,reffalse,reffalse)inHashtbl.addvarsvarrefs;refsin(kind,var,refs)endcapturesinletbody=matchclauses.definitions.:(clause).syntax.actionwith|Unreachable_->""|Partial(loc,str)|Total(loc,str)->Misc.rewrite_keywordsbeginfunposkwvar->matchkwwith(* FIXME Report an error message rather than a failure *)|"$startloc"->Syntax.errorpos"$startloc is now called $startpos"|"$endloc"->Syntax.errorpos"$endloc is now called $endpos"|"$startpos"|"$endpos"|"$positions"->(* FIXME Check if variable exists *)beginmatchHashtbl.find_optvarsvarwith|None->Syntax.errorpos"undefined variable %s"var|Some(rstart,rend,rpos)->matchkwwith|"$startpos"->rstart:=true|"$endpos"->rend:=true|"$positions"->rpos:=true|_->()end;true|kw->Syntax.errorpos"unknown keyword %S; did you mean $startpos, $endpos or $positions?"kwendlocstrinletoffset=ref0inIndexMap.iter(funkv->bind_captureout~offset:!offsetkv;incroffset)captures;IndexMap.iter(funindex(_,var,(_,_,positions))->if!positionsthen(letis_optional=IndexSet.memindexmachine.partial_capturesinifis_optionalthenCode_printer.fmtout" let _positions_%s_ = match _startpos_%s_, _endpos_%s_ with\n\
\ | Some s, Some e -> Some (s, e)\n\
\ | _ -> None in\n"varvarvarelseCode_printer.fmtout" let _positions_%s_ = (_startpos_%s_, _endpos_%s_) in\n"varvarvar))captures;beginmatchclauses.definitions.:(clause).syntax.actionwith|Unreachable_->Code_printer.printout" failwith \"Should be unreachable\"\n"|Partial(loc,_)->Code_printer.printout" (\n";Code_printer.fmtout~loc"%s\n"body;Code_printer.printout" )\n"|Total(loc,_)->Code_printer.printout" Some (\n";Code_printer.fmtout~loc"%s\n"body;Code_printer.printout" )\n"end;letconstrained=IndexSet.filter(funbranch->Option.is_somebranches.lookaheads.:(branch))brsinifIndexSet.is_not_emptyconstrainedthenCode_printer.fmtout" | (%s), _ -> None\n"(string_concat_map"|"string_of_index(IndexSet.elementsconstrained))inVector.iterioutput_clause_branchesbranches.of_clause;Code_printer.printout" | _ -> failwith \"Invalid action (internal error or API misuse)\"\n\n"inoutput_execute_functionout;(* Step 3: wrapper to glue interpreter and user actions *)output_wrapperoutrule