123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286(* MIT License
*
* Copyright (c) 2025 Frédéric Bour
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
*
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*)(** Specification structures for LRGrep
This module defines the core data structures that represent the user's
error specification after parsing and resolution: clauses, branches, and
their relationships.
Design:
- A *clause* represents one alternative in the user's grammar definition.
Multiple clauses can be grouped together (e.g., using `%%shortest`).
- A *branch* is a single pattern-action pair. Each clause contains one
or more branches (patterns), each with its own action.
- Key relationships:
- [of_clause] maps each clause to the set of branches in that clause
- [clause] maps each branch to its containing clause
- [priority] tracks the priority chain: branches in the same group
have the same priority, branches in different groups have higher
priority if they appear earlier
- Data structures:
- [branches]: Contains all branch-level information:
- [clause], [pattern], [expr]: The branch's membership, syntax, and
compiled regular expression
- [of_clause]: Mapping from clauses to branches
- [lookaheads]: Optional lookahead constraints for each branch
- [br_captures]: Captured variables for each branch
- [is_total], [is_partial]: Whether branches accept all inputs or
can fail
- [priority]: Priority mapping for branch handling
- The import_rules function transforms the user-facing syntax into these
internal structures:
- Parsing clauses and their actions (total/partial/unreachable)
- Compiling patterns to Expr.t using the [Transl.transl] function
- Computing priority relationships between branches
Tricky implementation details:
- The priority system is complex: branches within a group share priority,
but branches in later groups have lower priority. This enables
specification like:
clause1 : pattern1a | pattern1b (high priority)
clause2 : pattern2a | pattern2b (lower priority)
- The [is_total] vs [is_partial] distinction matters for coverage analysis:
total clauses accept any lookahead, partial clauses have specific
lookahead constraints.
- The [lookaheads] field stores optional lookahead specifications, which
can be terminals or `first(nonterminal)` to match the first symbol
of a nonterminal.
- The [br_captures] vector tracks which variables are captured by each
branch, for register allocation during code generation.
*)openUtilsopenMiscopenFix.IndexingopenInfoopenRegexpmoduleClause=Unsafe_cardinal()type('g,'r)clause=('g*'r)Clause.ttypeclause_def={new_group:bool;shortest:bool;syntax:Syntax.clause;}type('g,'r)clauses={definitions:(('g,'r)clause,clause_def)vector;captures:(('g,'r)clause,(Capture.n,Syntax.capture_kind*string)indexmap)vector;}moduleBranch=Unsafe_cardinal()type('g,'r)branch=('g*'r)Branch.ttype('g,'r)branches={clause:(('g,'r)branch,('g,'r)clauseindex)vector;pattern:(('g,'r)branch,Syntax.pattern)vector;expr:(('g,'r)branch,'gExpr.t)vector;of_clause:(('g,'r)clause,('g,'r)branchindexset)vector;lookaheads:(('g,'r)branch,'gterminalindexsetoption)vector;br_captures:(('g,'r)branch,Capture.nindexset)vector;is_total:('g,'r)branchBoolvector.t;is_partial:('g,'r)branchBoolvector.t;priority:(('g,'r)branch,('g,'r)branchoptindex)vector;}type'g_rule=Rule:('g,'r)clauses*('g,'r)branches->'g_ruleletimport_rule(typeg)(g:ggrammar)(rg:gRedgraph.graph)(indices:gTransl.Indices.t)(trie:gRedgraph.target_trie)(rule:Syntax.rule):g_rule=letopenstructtyperendinletmoduleClauses=Vector.Of_array(structtypea=clause_defletarray=letindex=function|[]->[]|[clause]->[{new_group=true;shortest=false;syntax=clause}]|clauses->List.mapibeginfuni(clause:Syntax.clause)->beginmatchclause.actionwith|Syntax.Total_->()|Syntax.Partial(pos,_)->Syntax.errorpos"%%partial clauses are not supported in a %%shortest group"|Syntax.Unreachablepos->Syntax.errorpos"unreachable \".\" clauses are not supported in a %%shortest group"end;List.iterbeginfunpat->matchpat.Syntax.lookaheadswith|[]->()|(_,pos)::_->Syntax.errorpos"lookahead constraints are not supported in a %%shortest group"endclause.patterns;{new_group=(i=0);shortest=true;syntax=clause}endclausesinArray.of_list(List.concat_mapindexrule.clauses)end)inletmoduleBranches=Vector.Of_array(structtypea=Clauses.nindex*Syntax.pattern*bool*boolletarray=Vector.mapibeginfunclausedef->List.mapi(funipattern->(clause,pattern,i=0&&def.new_group,def.shortest))def.syntax.patternsendClauses.vector|>Vector.to_list|>List.flatten|>Array.of_listend)in(* Branch definitions *)letbranch_count=Vector.lengthBranches.vectorinletclause=Vector.map(fun(c,_,_,_)->c)Branches.vectorinletpattern=Vector.map(fun(_,p,_,_)->p)Branches.vectorinletpriority=letlast=refOpt.noneinVector.mapibeginfunindex(_,_,ng,sh)->ifngthenlast:=Opt.someindex;ifshthenOption.get(Index.pred!last)else!lastendBranches.vectorinletof_clause=letindex=ref0inletimportclause=letcount=List.lengthclause.syntax.patternsinletfirst=Index.of_intbranch_count!indexinindex:=!index+count;letlast=Index.of_intbranch_count(!index-1)inIndexSet.init_intervalfirstlastinVector.mapimportClauses.vectorinletlookaheads=Vector.mapbeginfunpattern->matchpattern.Syntax.lookaheadswith|[]->None|symbols->letlookahead_msg="Lookahead can either be a terminal or `first(nonterminal)'"inletsym_pattern(sym,pos)=matchsymwith|Syntax.Apply("first",[sym])->beginmatchSymbol.descg(Transl.Indices.get_symbolgpossym)with|Tt->lett=Terminal.to_stringgtinSyntax.errorpos"%s; in first(%s), %s is a terminal"lookahead_msgtt|Nn->Nonterminal.firstgnend|Syntax.Name_->beginmatchSymbol.descg(Transl.Indices.get_symbolgpossym)with|Nn->Syntax.errorpos"%s; %s is a nonterminal"lookahead_msg(Nonterminal.to_stringgn)|Tt->IndexSet.singletontend|_->Syntax.errorpos"%s"lookahead_msginSome(List.fold_leftIndexSet.unionIndexSet.empty(List.mapsym_patternsymbols))endpatterninletis_partial=Boolvector.initbranch_countbeginfunbr->matchClauses.vector.:(clause.:(br)).syntax.actionwith|Syntax.Partial_->true|Syntax.Total_->false|Syntax.Unreachablepos->Syntax.warnpos"unreachable clauses \"{.}\" are not yet supported";falseendinletis_total=Boolvector.initbranch_countbeginfunbr->Option.is_nonelookaheads.:(br)&¬(Boolvector.testis_partialbr)endinletbr_captures=Vector.makebranch_countIndexSet.emptyinletexpr=Vector.makebranch_countExpr.emptyin(* Clause definitions *)letclause_count=Vector.lengthClauses.vectorinletcaptures=letgensym=Capture.gensym()inVector.initclause_count@@funclause->letcapture_tbl=Hashtbl.create7inletcapture_def=refIndexMap.emptyinletcapturekindname=letkey=(kind,name)inmatchHashtbl.find_optcapture_tblkeywith|Someindex->index|None->letindex=gensym()inHashtbl.addcapture_tblkeyindex;capture_def:=IndexMap.addindexkey!capture_def;indexinlettranslate_branch(br:Branches.nindex)=letcap,exp=Transl.translgrgindicestrie~capturepattern.:(br).exprinbr_captures.:(br)<-cap;expr.:(br)<-exp;inIndexSet.itertranslate_branchof_clause.:(clause);!capture_defin(* Packing *)letmoduleClauses_def=Clause.Eq(structtypet=g*rtypen=Clauses.nletn=Vector.lengthClauses.vectorend)inletRefl=Clauses_def.eqinletmoduleBranches_def=Branch.Eq(structtypet=g*rtypen=Branches.nletn=Vector.lengthBranches.vectorend)inletRefl=Branches_def.eqinletclauses={definitions=Clauses.vector;captures}inletbranches={clause;pattern;expr;of_clause;lookaheads;br_captures;is_total;is_partial;priority}inRule(clauses,branches)letbranch_countbranches=Vector.lengthbranches.expr