123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291(* MIT License
*
* Copyright (c) 2026 Frédéric Bour
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
*
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*)(** Generating parse sentences from transitions
This module provides functionality to generate concrete parse examples
(sentences) from sequences of LR states or LR transitions. It's used
for generating counterexamples and debug information.
Core algorithm:
- The algorithm works backwards from the desired parsing outcome to find
a valid sequence of transitions that would produce that outcome.
- [to_transitions] converts a sequence of LR states to a sequence of
transitions that connect those states.
- [to_cells] maps transitions to reduction graph cells, using dynamic
programming to find the minimum-cost path through the reduction graph.
- [expand_cells] recursively expands cells back to the original terminal
symbols that would trigger the reductions.
Key data structures:
- Cells: Represent positions in the reduction graph, encoded as a compact
triple (node, pre_class, post_class) for efficient storage and lookup.
(Pre_class and post_class constrain the lookahead symbols that can precede
and follow)
- The algorithm uses dynamic programming to find minimum-cost paths
through the reduction graph.
Implementation details:
- [to_cells] uses a sophisticated dynamic programming approach where at
each transition, it considers:
- All post_classes of the current node
- For each, all pre_classes that can reach it with finite cost
Then it keeps only the minimal cost paths
- [expand_cells] handles two cases:
- [L tr]: A transition node - either shift (return the terminal) or
goto (recursively solve the subproblem with minimum cost)
- [R (l, r)]: An inner node - decompose into left and right subproblems,
finding solutions that minimize total cost
- The [Break] exception is used to short-circuit when a minimal-cost
solution is found during the exploration of all possible decompositions.
- Nullable reductions need some special care. If a nullable reduction is
possible and the lookahead classes allow it, the algorithm takes that path
instead of the non-nullable one.
*)openUtilsopenFix.IndexingopenInfo(** Find the transition from LR state [x] to LR state [y].
Returns a goto transition if [y] is reached via a non-production,
or a shift transition if [y] is reached via a terminal.
Raises [Invalid_argument] if [y] is an entrypoint.
Raises [Not_found] if there is no transition from [x] to [y]. *)letfind_transition(typeg)(g:ggrammar)xy=matchLr1.incominggywith|None->invalid_arg"Sentence_generation.find_transition: y is an entrypoint"|Somesym->matchSymbol.descgsymwith|Nn->Transition.of_gotog(Transition.find_gotogxn)|T_->(* FIXME: IndexSet.choose raises Not_found if the intersection is empty,
which could happen with an invalid state sequence. Consider using
Transition.find for consistency, or add a more informative error. *)IndexSet.choose(IndexSet.inter(Transition.successorsgx)(Transition.predecessorsgy))(** Convert a sequence of LR states to the initial state and the list of
transitions connecting consecutive states.
Raises [Invalid_argument] if the input list is empty. *)letto_transitionsg=function|[]->invalid_arg"Sentence_generation.to_transitions: empty list"|initial::rest->letfollowxy=(y,find_transitiongxy)inlet_,trs=List.fold_left_mapfollowinitialrestin(initial,trs)(** Map a list of transitions to reduction graph cells, finding the
minimum-cost path through the cost DAG using dynamic programming.
Processes transitions right-to-left. For each transition, iterates all
post_classes and pre_classes, selecting the (pre, post) pair that
minimizes the total cost: [cost(cell) + cost(suffix)].
Returns the list of cells along the minimum-cost path. *)letto_cells(typegcell)(g:ggrammar)((moduleR):(g,cell)Reachability.t_cell)trs=letrecaux=function|[]->[Terminal.allg,0,[]]|x::xs->letcandidates=auxxsinletnode=R.Tree.leafxinletpost_candidates=Array.to_seqi(R.Tree.post_classesnode)|>Seq.filter_map(fun(post,classe)->letcost,tail=List.fold_leftbeginfun(bcost,_asbest)(cclasse,ccost,tail)->ifccost<bcost&&IndexSet.quick_subsetcclasseclassethen(ccost,tail)elsebestend(max_int,[])candidatesinifcost<max_intthenSome(post,cost,tail)elseNone)|>List.of_seqinletencode=R.Cell.encodenodeinletpre_candidates=Array.to_seqi(R.Tree.pre_classesnode)|>Seq.filter_map(fun(pre,classe)->letcost,tail=List.fold_leftbeginfun(bcost,_asbest)(post,cost,tail)->letcell=encode~pre~postinletcost'=R.Analysis.costcellinifcost'<max_int&&cost'+cost<bcostthen(cost'+cost,cell::tail)elsebestend(max_int,[])post_candidatesinifcost<max_intthenSome(classe,cost,tail)elseNone)|>List.of_seqinpre_candidatesinsnd(List.fold_left(fun(bcost,_asbest)(_la,cost,tail')->ifcost<bcostthen(cost,tail')elsebest)(max_int,[])(auxtrs))(** Recursively expand reduction graph cells back to terminal symbols.
Handles two node types from the cost tree:
- [L tr]: A leaf transition — shifts return the terminal symbol;
gotos check for nullable reductions first, then recurse into the
minimum-cost non-nullable reduction equation.
- [R (l, r)]: An inner node — decomposes into left and right
sub-problems via the coercion matrix, recursing into both children
whose combined cost equals the current cost.
Uses a [Break] exception to short-circuit once a minimal-cost
decomposition is found. *)letexpand_cells(typegcell)(g:ggrammar)((moduleR):(g,cell)Reachability.t_cell)cells=letopenRinletexceptionBreakofgterminalindexlistinletrecauxcellacc=letnode,i_pre,i_post=Cell.decodecellinmatchTree.splitnodewith|Ltr->(* The node corresponds to a transition *)beginmatchTransition.splitgtrwith|Rshift->(* It is a shift transition, just shift the symbol *)Transition.shift_symbolgshift::acc|Lgoto->(* It is a goto transition *)leteqn=Tree.goto_equationsgotoinletc_pre=(Tree.pre_classesnode).(i_pre)inletc_post=(Tree.post_classesnode).(i_post)inifnot(IndexSet.is_emptyeqn.nullable_lookaheads)&&IndexSet.quick_subsetc_posteqn.nullable_lookaheads&¬(IndexSet.disjointc_prec_post)then(* If a nullable reduction is possible, don't do anything *)accelse(* Otherwise look at all equations that define the cost of the
goto transition and recursively visit one of minimal cost *)letcurrent_cost=Analysis.costcellin(* FIXME: List.find_map returns the first equation that satisfies
all conditions and has matching cost. If multiple equations have
the same minimal cost, the choice is arbitrary (first in list). *)matchList.find_mapbeginfun(red,node')->ifIndexSet.disjointc_postred.lookaheadthen(* The post lookahead class does not permit reducing this
production *)NoneelsematchTree.pre_classesnode'with|[|c_pre'|]whenIndexSet.disjointc_pre'c_pre->(* The pre lookahead class does not allow entering this
branch. *)None|pre'->(* Visit all lookahead classes, pre and post, and find
the mapping between the parent node and this
sub-node *)letpred_pre_c_pre'=IndexSet.quick_subsetc_pre'c_preinletpred_post_c_post'=IndexSet.quick_subsetc_postc_post'inmatchMisc.array_findipred_pre0pre',Misc.array_findipred_post0(Tree.post_classesnode')with|exceptionNot_found->None|i_pre',i_post'->letcell=Cell.encodenode'~pre:i_pre'~post:i_post'inifAnalysis.costcell=current_costthen(* We found a candidate of minimal cost *)SomecellelseNoneendeqn.non_nullablewith|None->Printf.eprintf"abort, cost = %d\n%!"current_cost;assertfalse|Somecell'->(* Solve the sub-node *)auxcell'accend|R(l,r)->(* It is an inner node.
We decompose the problem into left-hand and right-hand
sub-problems, finding sub-solutions of minimal cost *)letcurrent_cost=Analysis.costcellinletcoercion=Coercion.infix(Tree.post_classesl)(Tree.pre_classesr)inletl_index=Cell.encodelinletr_index=Cell.encoderinbegintryArray.iteri(funi_post_lall_pre_r->letl_cost=Analysis.cost(l_index~pre:i_pre~post:i_post_l)inArray.iter(funi_pre_r->letr_cost=Analysis.cost(r_index~pre:i_pre_r~post:i_post)inifl_cost+r_cost=current_costthen(letacc=aux(r_index~pre:i_pre_r~post:i_post)accinletacc=aux(l_index~pre:i_pre~post:i_post_l)accinraise(Breakacc)))all_pre_r)coercion.Coercion.forward;assertfalsewithBreakacc->accendinList.fold_rightauxcells[](** Generate a terminal sentence from a list of transitions.
Combines [to_cells] and [expand_cells] in a single pipeline. *)letsentence_of_transitions(typeg)(g:ggrammar)((moduleR):gReachability.t)trs=expand_cellsg(moduleR)(to_cellsg(moduleR)trs)(** Generate a terminal sentence from a list of LR states (a parse stack).
Combines [to_transitions], [to_cells], and [expand_cells] in a single
pipeline. The input list must be non-empty (see [to_transitions]). *)letsentence_of_stack(typeg)(g:ggrammar)((moduleR):gReachability.t)lr1s=let_initial,transitions=to_transitionsglr1sinletcells=to_cellsg(moduleR)transitionsinexpand_cellsg(moduleR)cells