123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529(* MIT License
*
* Copyright (c) 2025 Frédéric Bour
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*)(** LRC (LR with classes) — implementation.
Constructs the LRC graph by iterating over LR1 states and their lookahead
classes, building transition relations, and computing transitive closure
via Tarjan's algorithm. Provides two construction paths: the full LRC
([make]) and a minimized version ([make_minimal]) using Valmari's DFA
minimization algorithm.
Implementation notes:
- [make] allocates LRC states contiguously per LR1 state, using index
offsets to encode the (LR1 state, class index) pairing efficiently.
- [make_minimal] builds the LRC graph on-the-fly during a backward
traversal from LR1 states, then minimizes with Valmari's algorithm.
- [from_entrypoints] computes reachability from a subset of states and
lazily evaluates prefix paths for counterexample generation.
- FIXME: Prefixes are minimal in number of symbols, not in number of
terminals, which is ultimately what we care about when generating
counter-examples. A short prefix can expand to a long sentence.
*)openUtilsopenMiscopenFix.IndexingopenInfotype('g,'n)t={lr1_of:('n,'glr1index)vector;lrcs_of:('glr1,'nindexset)vector;all_wait:'nindexset;all_leaf:'nindexset;all_successors:('n,'nindexset)vector;reachable_from:('n,'nindexset)vector;}(* Number of LRC states in the structure *)letcountt=Vector.lengtht.lr1_of(* Compute the difference between two indices *)letindex_delta(typen)(i:nindex)(j:nindex)=(i:>int)-(j:>int)(* Compute the index of the class of an LRC state *)letclass_indextlrc=index_deltalrc(Option.get(IndexSet.minimumt.lrcs_of.:(t.lr1_of.:(lrc))))moduleN=Unsafe_cardinal()type'gn='gN.t(* Build the LRC structure from grammar and reachability analysis.
Creates one LRC state per (LR1 state, lookahead class) pair, computes
transition relations, and closes them transitively with Tarjan's algorithm. *)letmake(typeg)(g:ggrammar)((moduleReachability):gReachability.t)=(* Compute the total number of LRC states *)letn=letcountlr1=Array.length(Reachability.Classes.for_lr1lr1)inletsum=ref0inIndex.iter(Lr1.cardinalg)(funlr1->sum:=!sum+countlr1);!sumin(* Lift `n` to type-level *)letopenN.Const(structtypet=gletcardinal=nend)in(* Shift an index by a given offset *)letindex_shift(i:nindex)offset=Index.of_intn((i:>int)+offset)in(* Map from LRC states to their corresponding LR1 states *)letlr1_of=Vector.make'n(fun()->Index.of_int(Lr1.cardinalg)0)in(* Map from LR1 states to their corresponding set of LRC states *)letlrcs_of=letcount=ref0inletinit_lr1lr1=letclasses=Reachability.Classes.for_lr1lr1inassert(Array.lengthclasses>0);letfirst=Index.of_intn!countincount:=!count+Array.lengthclasses;letall=refIndexSet.emptyinfori=Array.lengthclasses-1downto0doletlrc=index_shiftfirstiinall:=IndexSet.addlrc!all;Vector.setlr1_oflrclr1done;!allinVector.init(Lr1.cardinalg)init_lr1in(* Map from LR1 states to their first LRC state *)letfirst_lrc_of=Vector.map(funx->Option.get(IndexSet.minimumx))lrcs_ofin(* Set of wait LRC states *)letall_wait=IndexSet.map(Vector.getfirst_lrc_of)(Lr1.waitg)in(* Step timing after computing the LRC set *)stopwatch2"Allocated LRC set (%d elements)"(cardinaln);(* Compute transitions for each LRC state *)letpredecessors=Vector.makenIndexSet.emptyinletprocesslr1=lettgt_first=first_lrc_of.:(lr1)inmatchOption.map(Symbol.descg)(Lr1.incomingglr1)with|None->()|Some(Tt)->predecessors.:(tgt_first)<-IndexSet.map(funtr->letsrc=Transition.sourcegtrinletclasses=Reachability.Classes.for_lr1srcinletclass_index_classe=IndexSet.memtclasseinindex_shiftfirst_lrc_of.:(src)(Misc.array_findiclass_index0classes))(Transition.predecessorsglr1)|Some_->letprocess_transitiontr=letnode=Reachability.Tree.leaftrinletencode=Reachability.Cell.encodenodeinletpre_classes=Reachability.Classes.pre_transitiontrinletpost_classes=Reachability.Classes.post_transitiontrinletcoercion=Reachability.Coercion.infixpost_classes(Reachability.Classes.for_lr1lr1)inletsrc_first=first_lrc_of.:(Transition.sourcegtr)inletpre_classes=Array.lengthpre_classesinletpost_classes=Array.lengthpost_classesinforpost=0topost_classes-1doletreachable=refIndexSet.emptyinforpre=0topre_classes-1doifReachability.Analysis.cost(encode~pre~post)<max_intthenreachable:=IndexSet.add(index_shiftsrc_firstpre)!reachabledone;letreachable=!reachableinArray.iter(funindex->predecessors.@(index_shifttgt_firstindex)<-IndexSet.unionreachable)coercion.forward.(post)doneinIndexSet.rev_iterprocess_transition(Transition.predecessorsglr1)inIndex.rev_iter(Lr1.cardinalg)process;stopwatch2"Computed LRC transitions/predecessors";letall_successors=Misc.relation_reversenpredecessorsinstopwatch2"Computed LRC successors";letreachable_from=Vector.copyall_successorsinTarjan.close_relation(funfi->IndexSet.iterfall_successors.:(i))reachable_from;stopwatch2"Closed LRC successors";letall_leaf=IndexSet.allnin{lr1_of;lrcs_of;all_wait;all_leaf;all_successors;reachable_from}(* Convert an LRC state to a string representation *)letto_stringgtlrc=Printf.sprintf"%s/%d"(Lr1.to_stringgt.lr1_of.:(lrc))(class_indextlrc)(* Convert a set of LRC states to a string representation *)letset_to_stringinfotlrcs=string_of_indexset~index:(to_stringinfot)lrcstype'nentrypoints={reachable:'nindexset;wait:'nindexset;entrypoints:'nindexset;successors:('n,'nindexset)vector;predecessors:('n,'nindexset)vector;some_prefix:'nindex->int*'nindexlist;(** [some_prefix state] returns a prefix to reach [state] from an entrypoint.
The prefix's length and the sequence of states (excluding [state]) are
given, starting from the end.
Thus, [List.rev (state :: some_prefix state)] is a valid prefix. *)}(* Find states reachable from specific states by closing over raw reachability
information *)letfrom_entrypoints(typegn)(g:ggrammar)lrc_graphentrypoints:nentrypoints=(* Set of reachable states *)letreachable=refIndexSet.emptyin(* Compute transitive successors for each state and populate reachable set *)letsuccessors=lettable=Vector.make(countlrc_graph)IndexSet.emptyinlettodo=refentrypointsinletpopulatei=reachable:=IndexSet.addi!reachable;ifIndexSet.is_emptytable.:(i)then(letsucc=lrc_graph.all_successors.:(i)intodo:=IndexSet.unionsucc!todo;Vector.settableisucc;)inwhileIndexSet.is_not_empty!tododolettodo'=!todointodo:=IndexSet.empty;IndexSet.rev_iterpopulatetodo';done;tablein(* Compute predecessors for each state using successors information *)letpredecessors=Misc.relation_reverse(countlrc_graph)successorsin(* Final reachable states *)letreachable=!reachablein(* Wait states that are reachable *)letwait=IndexSet.interlrc_graph.all_waitreachablein(* Compute a prefix to reach each state.
The prefix is represented as (length, path) to avoid reconstructing
the path when only the length is needed for coverage analysis. *)letsome_prefix=lettable=lazy(lettable=Vector.make(countlrc_graph)(0,[])inlettodo=ref[]inletexpandprefixstate=matchVector.gettablestatewith|(0,[])->Vector.settablestateprefix;letlength,prefix=prefixinletprefix=(length+1,state::prefix)inletsucc=successors.:(state)inifIndexSet.is_not_emptysuccthenpushtodo(succ,prefix)|_->()inVector.iteri(funlr1lrcs->ifOption.is_none(Lr1.incomingglr1)thenexpand(0,[])(Option.get(IndexSet.minimumlrcs)))lrc_graph.lrcs_of;letpropagate(succ,prefix)=IndexSet.iter(expandprefix)succinfixpoint~propagatetodo;table)in(funst->Vector.get(Lazy.forcetable)st)in{reachable;wait;entrypoints;successors;predecessors;some_prefix}(* Debugging: verify the LRC automaton is deterministic by traversing all
(LR1 state, class) pairs backward and checking for dead-ends. *)letcheck_deterministic(typeg)(g:ggrammar)((moduleReachability):gReachability.t)=letprefix=Vector.make(Lr1.cardinalg)[]inletrecloopnext=function|[]->ifnot(list_is_emptynext)thenloop[]next|xs::xxs->letx=List.hdxsinletnext=iflist_is_emptyprefix.:(x)then(prefix.:(x)<-xs;IndexSet.fold(funtrnext->(Transition.targetgtr::xs)::next)(Transition.successorsgx)next)elsenextinloopnextxxsinloop[](List.map(funx->[x])(IndexSet.elements(Lr1.entrypointsg)));lettodo=ref[]inlettable=Hashtbl.create7inletvisitpathlr1classes=letkey=(lr1,classes)inletpath=key::pathinletprint_classcl=ifcl==Terminal.allgthen"T"elsestring_concat_map~wrap:("{","}")","(Terminal.to_stringg)(List.rev(IndexSet.elementscl))inletprint(lr1,classes)=letall_classes=Reachability.Classes.for_lr1lr1inPrintf.sprintf"%s@{%s}"(Lr1.to_stringglr1)(string_concat_map", "(funi->print_classall_classes.(i))(IntSet.elementsclasses))inifIntSet.is_emptyclassesthenPrintf.eprintf"Found dead-end: %s\n prefix: %s\n starting classes: %s\n"(string_concat_map" -> "printpath)(Lr1.list_to_stringg(List.rev(List.tlprefix.:(lr1))))(lettop,_=List.hd(List.revpath)instring_concat_map~wrap:("{","}")","print_class(Array.to_list(Reachability.Classes.for_lr1top)))elseifnot(Hashtbl.memtablekey)then(Hashtbl.addtablekey();pushtodopath)inletpropagatepath=let(target,post_class_indices)=List.hdpathinIndexSet.iterbeginfuntr->letsource=Transition.sourcegtrinletnode=Reachability.Tree.leaftrinletencode=Reachability.Cell.encodenodeinletpre_classes=Reachability.Classes.for_lr1sourceinletpre_class_indices=matchTransition.splitgtrwith|Rsh->letterm=Transition.shift_symbolgshinletpre=Utils.Misc.array_findi(fun_cb->IndexSet.memtermcb)0pre_classesinifReachability.Analysis.cost(encode~pre~post:0)<max_intthenIntSet.singletonpreelseIntSet.empty|Lgt->letpost_classes=Reachability.Classes.for_lr1targetinletmid_classes=Reachability.Classes.for_edgegtinIntSet.bindpost_class_indicesbeginfunpost_index->letca=post_classes.(post_index)inmatchUtils.Misc.array_findi(fun_cb->IndexSet.quick_subsetcacb)0mid_classeswith|exceptionNot_found->IntSet.empty|mid_index->IntSet.init_subset0(Array.lengthpre_classes-1)(funpre->Reachability.Analysis.cost(encode~pre~post:mid_index)<max_int)endinvisitpathsourcepre_class_indicesend(Transition.predecessorsgtarget)inIndex.iter(Lr1.cardinalg)(funlr1->letlen=Array.length(Reachability.Classes.for_lr1lr1)iniflen>0thenvisit[]lr1(IntSet.init_interval0(len-1)));fixpoint~propagatetodo;stopwatch2"Determinized Lrc all: %d states"(Hashtbl.lengthtable)moduleMlrc=Unsafe_cardinal()type'gmlrc='gMlrc.t(* Build a minimal LRC structure.
Constructs the graph lazily during backward traversal, then applies
Valmari's DFA minimization to merge equivalent states. *)letmake_minimal(typeg)(g:ggrammar)((moduleReachability):gReachability.t):(g,gmlrc)t=letopenIndexBufferinletmoduleState=Gen.Make()inletmoduleTransitions=Gen.Make()inletstates=State.get_generator()inlettransitions=Transitions.get_generator()inlettable=Hashtbl.create7inlettodo=ref[]inletvisitlr1classes=assert(not(IntSet.is_emptyclasses));letkey=(lr1,classes)inmatchHashtbl.find_opttablekeywith|Someindex->index|None->letindex=Gen.addstateskeyinHashtbl.addtablekeyindex;pushtodoindex;indexinletpropagateindex=let(target,post_class_indices)=Gen.getstatesindexinIndexSet.iterbeginfuntr->letsource=Transition.sourcegtrinletnode=Reachability.Tree.leaftrinletencode=Reachability.Cell.encodenodeinletpre_classes=Reachability.Classes.for_lr1sourceinletpre_class_indices=matchTransition.splitgtrwith|Rsh->letterm=Transition.shift_symbolgshinletpre=Utils.Misc.array_findi(fun_cb->IndexSet.memtermcb)0pre_classesinifReachability.Analysis.cost(encode~pre~post:0)<max_intthenIntSet.singletonpreelseIntSet.empty|Lgt->letpost_classes=Reachability.Classes.for_lr1targetinletmid_classes=Reachability.Classes.for_edgegtinIntSet.bindpost_class_indicesbeginfunpost_index->letca=post_classes.(post_index)inmatchUtils.Misc.array_findi(fun_cb->IndexSet.quick_subsetcacb)0mid_classeswith|exceptionNot_found->IntSet.empty|mid_index->IntSet.init_subset0(Array.lengthpre_classes-1)(funpre->Reachability.Analysis.cost(encode~pre~post:mid_index)<max_int)endinifnot(IntSet.is_emptypre_class_indices)thenletindex'=visitsourcepre_class_indicesinignore(Gen.addtransitions(index,index'))end(Transition.predecessorsgtarget)inletfast_mapsf=letl=IndexSet.fold(funxacc->matchfxwithNone->acc|Somey->y::acc)s[]inIndexSet.of_listlinletvisit_statelr1=letlen=Array.length(Reachability.Classes.for_lr1lr1)iniflen>0thenletset=IntSet.init_interval0(len-1)inSome(visitlr1set)elseNoneinletall_wait=fast_map(Lr1.waitg)visit_stateinfixpoint~propagatetodo;stopwatch2"Determinized Lrc wait: %d states"(Hashtbl.lengthtable);letall_leaf=fast_map(Lr1.allg)visit_stateinfixpoint~propagatetodo;stopwatch2"Determinized Lrc all: %d states"(Hashtbl.lengthtable);(*IndexSet.iter (fun st ->
Printf.eprintf "lrc%d has label %s\n" (Index.to_int st) (Lr1.to_string g (fst (Gen.get states st)))
) all_wait;*)(*Hashtbl.iter (fun (lr1, classes) state ->
if IndexSet.mem lr1 (Lr1.entrypoints g) then (
Printf.eprintf
"LRC%d represent entrypoint %s with classes %s (lr1 has %d classes)\n"
(Index.to_int state)
(Lr1.to_string g lr1)
(string_concat_map ~wrap:("{","}") "," string_of_int (IntSet.elements classes))
(Array.length (Reachability.Classes.for_lr1 lr1))
)
) table;*)letmoduleMin=Valmari.Minimize(structtypet=glr1indexletcompare=Index.compareend)(structletsourcetr=fst(Gen.gettransitionstr)lettargettr=snd(Gen.gettransitionstr)letlabeltr=fst(Gen.getstates(sourcetr))letinitialsf=IndexSet.iterfall_leafletfinalsf=IndexSet.iter(funlr1->matchHashtbl.find_opttable(lr1,IntSet.singleton0)with|None->assertfalse|Somestate->(*Printf.eprintf "Marking entrypoint %d: %s\n" (Index.to_int state) (Lr1.to_string g lr1);*)fstate)(Lr1.entrypointsg)letrefinementsf=IndexSet.iter(funlr1->matchHashtbl.find_opttable(lr1,IntSet.singleton0)with|None->()|Someindex->f(fun~add->addindex))(Lr1.entrypointsg)typestates=State.nletstates=State.ntypetransitions=Transitions.nlettransitions=Transitions.nend)instopwatch2"Minimized deterministic Lrc: %d states"(cardinalMin.states);(* Lr1 of each state *)letlr1_of=Vector.initMin.states(funst->fst(Gen.getstates(Min.represent_statest)))in(* Lrcs of each lr1 state *)letlrcs_of=Vector.make(Lr1.cardinalg)IndexSet.emptyinVector.rev_iteri(funlrclr1->lrcs_of.@(lr1)<-IndexSet.addlrc)lr1_of;(* Sanity check: single lr1 per lrc *)Index.iterState.n(funst->matchMin.transport_statestwith|None->()|Somemst->letst'=Min.represent_statemstinassert(st=st'||fst(Gen.getstatesst)=lr1_of.:(mst)));letall_wait=IndexSet.filter_mapMin.transport_stateall_waitinletall_leaf=IndexSet.filter_mapMin.transport_stateall_leafinletall_successors=Vector.makeMin.statesIndexSet.emptyinIndex.rev_iterMin.transitions(funtr->all_successors.@(Min.targettr)<-IndexSet.add(Min.sourcetr));(* Reachable *)letreachable_from=Vector.copyall_successorsinTarjan.close_relation(funfi->IndexSet.iterfall_successors.:(i))reachable_from;stopwatch2"Minimal Lrc ready";(* Lift `Min.states` to type-level *)letopenMlrc.Eq(structtypet=gtypen=Min.statesletn=Min.statesend)inletRefl=eqin{lr1_of;lrcs_of;all_wait;all_leaf;all_successors;reachable_from}