123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430(* MIT License
*
* Copyright (c) 2025 Frédéric Bour
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*)(** Regular expression definitions and operations for LRGrep
This module implements regular expressions used in LRGrep, including
derivation operations for filtering LR states and matching on reductions.
Architecture:
- Capture module: Defines variables that can capture semantic values or
positions during derivation. Each capture is identified by a unique index.
- Reductions module: Represents reduction operations in regular expressions,
tracking which target patterns are being recognized.
- Expr module: The core regular expression structure:
- Set: Match a set of LR states, with optional capture
- Alt: Disjunction of sub-expressions
- Seq: Concatenation of sub-expressions
- Star: Kleene star (repetition)
- Filter: State guard that restricts matching to a set of LR states
- Reduce: Reduction operation
- Usage: A set that tracks which source constructs are exercised,
enabling dead-code warnings for unreachable parts of a specification.
- Label module: A label is a combination of:
- filter: Which LR states match
- captures: Which variables are captured
- usage: Which source constructs are being recognized
- K module: Continuations that appear during derivative computation:
- Accept: Recognition complete
- Done: Reached end of expression (derives to Accept)
- More: Continue with sub-expression
- Reducing: Intermediate states of reduction recognition
- Key operations:
- [derive]: Compute derivatives of regular expressions with respect to LR
states
- [compare]: Compare expressions by unique ID
Implementation details:
- The derivation algorithm handles the complex case of reductions with
nullable and non-nullable parts. It uses Antimirov's derivatives adapted
for LR states.
- The [Reducing] case handles the case where a reduction must be performed
before continuing. It tracks:
- The reduction targets we are looking for
- The current positions in the reduction graph
- The continuation to use when reduction succeeds
- The [Shortest] vs [Longest] policy determines whether the parser should
prefer smaller or larger reductions when there are multiple possible reductions.
This is implemented by ordering the resulting continuations (shortest:
accept first, longest: accept last).
- The usage tracking enables dead-code analysis: expressions that are never
executed can be detected and reported.
*)openFix.IndexingopenUtilsopenMiscopenInfo(** The Capture module defines types and functions for representing variables
captured in regular expressions.
It uses an index type to uniquely identify a capture in an expression. *)moduleCapture:sigtypentypet=nindextypeset=nindexsettype'amap=(n,'a)indexmap(* The gensym is instantiated separately for each expression *)valgensym:unit->unit->nindexend=structincludePositivetypet=nindextypeset=nindexsettype'amap=(n,'a)indexmapletgensym()=letr=ref(-1)infun()->incrr;Index.of_intn!rend(** Reductions represent pattern-match operations in regular expressions.
Each reduction tracks which reduction targets to match, which captures
to bind, and whether to prefer shortest or longest match. *)moduleReductions=structtype'gt={pattern:'gRedgraph.targetindexset;capture:Capture.set;policy:Syntax.quantifier_kind;}letcomparer1r2=ifr1==r2then0elseletc=IndexSet.comparer1.patternr2.patterninifc<>0thencelseletc=IndexSet.comparer1.capturer2.captureincletcmon{capture=_;pattern;policy}=Cmon.record[(*"capture", cmon_indexset capture;*)"pattern_domain",cmon_set_cardinal(*cmon_indexset*)pattern;"policy",Syntax.cmon_quantifier_kindpolicy;]endmoduleExpr=struct(** Integer that serves as a unique id to identify sub-terms.
Thanks to properties of Antimirov's derivatives, no new term is
introduced during derivation. All terms are produced during initial
parsing. *)typeuid=intletuid=letk=ref0infun()->incrk;!ktype'gt={uid:uid;desc:'gdesc;position:Syntax.position;}(** The different constructors of regular expressions *)and'gdesc=|Setof'glr1indexset*Capture.set(** Recognise a set of states, and optionally bind the matching state to
a variable. *)|Altof'gtlist(** [Alt ts] is the disjunction of sub-terms [ts].
[Alt []] represents the empty language. *)|Seqof'gtlist(** [Seq ts] is the concatenation of sub-terms [ts].
[Seq []] represents the empty string {ε}. *)|Starof'gt*Syntax.quantifier_kind(** [Star t qk] is the Kleene star of [t] with quantifier policy [qk]
(shortest or longest match). *)|Filterof'glr1indexset(** Restrict matching to LR(1) states in the given set. *)|ReduceofCapture.set*'gReductions.t(** The reduction operator. The first component is the set of captures
to bind, the second is the reduction specification. *)|UsageofUsage.set(** Dead-code tracking marker. The set records which source constructs
are exercised at this point in the expression. *)(** An empty expression representing the empty language. *)letempty={uid=0;desc=Alt[];position=Lexing.dummy_pos}(** Introduce a new term, allocating a unique ID *)letmakepositiondesc={uid=uid();desc;position}(** Compare two terms *)letcomparet1t2=Int.comparet1.uidt2.uidletcmon?(lr1=cmon_index)t=letrecauxt=matcht.descwith|Set(lr1s,_var)->Cmon.construct"Set"[cmon_indexset~index:lr1lr1s]|Altts->Cmon.constructor"Alt"(Cmon.list_mapauxts)|Seqts->Cmon.constructor"Seq"(Cmon.list_mapauxts)|Star(t,qk)->Cmon.construct"Star"[auxt;Syntax.cmon_quantifier_kindqk]|Filterlr1s->Cmon.constructor"Filter"(cmon_indexset~index:lr1lr1s)|Reduce(_var,r)->Cmon.construct"Reduce"[(*cmon_indexset var;*)Reductions.cmonr]|Usage_->Cmon.constant"Usage"inauxtendmoduleLabel=structtype'gt={filter:'glr1indexset;captures:Capture.set;usage:Usage.set;}letcomparel1l2=ifl1==l2then0elseletc=IndexSet.comparel1.filterl2.filterinifc<>0thencelseIndexSet.comparel1.capturesl2.capturesletfilterlabelfilter=letfilter=IndexSet.interlabel.filterfilterinifIndexSet.is_emptyfilterthenNoneelseSome{labelwithfilter}letunionl1l2={filter=IndexSet.unionl1.filterl2.filter;captures=IndexSet.unionl1.capturesl2.captures;usage=Usage.joinl1.usagel2.usage;}letcapturelabelvarsusage=ifIndexSet.is_emptyvars&&Usage.is_emptyusagethenlabelelse{labelwithcaptures=IndexSet.unionlabel.capturesvars;usage=Usage.joinlabel.usageusage}endmoduleK=structtype'gt=|Accept|Done|Moreof'gExpr.t*'gt|Reducingof{reduction:'gReductions.t;steps:'gRedgraph.stepindexset;next:'gt;}letcmon?lr1?stepk=letrecaux=function|Accept->Cmon.constant"Accept"|Done->Cmon.constant"Done"|More(e,t)->Cmon.construct"More"[Expr.cmon?lr1e;auxt]|Reducing{reduction=_;steps;next}->Cmon.crecord"Reducing"["reduction",Cmon.constant"...";"steps",cmon_indexset?index:stepsteps;"next",auxnext;]inauxkletreccomparet1t2=ift1==t2then0elsematcht1,t2with|Accept,Accept->0|Done,Done->0|More(e1,t1'),More(e2,t2')->letc=Expr.comparee1e2inifc<>0thencelsecomparet1't2'|Reducingr1,Reducingr2->letc=Reductions.comparer1.reductionr2.reductioninifc<>0thencelseletc=IndexSet.comparer1.stepsr2.stepsinifc<>0thencelsecomparer1.nextr2.next|Accept,(More_|Reducing_|Done)->-1|Done,(More_|Reducing_)->-1|(More_|Reducing_|Done),Accept->+1|(More_|Reducing_),Done->+1|More_,Reducing_->-1|Reducing_,More_->+1letintersectings1s2=not(IndexSet.disjoints1s2)letderive(typeg)(_g:ggrammar)(rg:gRedgraph.graph)filterk=letcontinuerlabelnext=match!rwith|(label',next')::r'whennext'==next->r:=(Label.unionlabel'label,next)::r'|r'->r:=(label,next)::r'inletks=ref[]inletrecprocess_reduction_stepmatchingnext_stepsfilter(reduction:_Reductions.t)step=matchRedgraph.followrgstepwith|Advancestep'->next_steps:=IndexMap.updatestep'(union_updatefilter)!next_steps|Switchmap->letmatching'=refIndexSet.emptyinIndexMap.rev_iterbeginfun(lr1,trs)->ifIndexSet.memlr1filterthen(lethas_match=reffalseinList.iterbeginfun(tr:_Redgraph.transition)->ifnot!has_matchthenhas_match:=intersectingtr.reachedreduction.pattern;ifintersectingtr.reachablereduction.patternthenbegin(*let reach = IndexSet.inter reachable reduction.pattern in
Printf.eprintf "continuing to step %d on %s because targets %s are reachable\n"
(step : _ index :> int) (Lr1.to_string g lr1)
(string_of_indexset reach)
;*)process_reduction_stepmatchingnext_steps(IndexSet.singletonlr1)reductiontr.stependendtrs;if!has_matchthenmatching':=IndexSet.addlr1!matching';)endmap;matching:=IndexSet.union!matching'!matchinginletrecprocess_klabel=function|Accept->()|Done->continuekslabelAccept|More(re,next)asself->process_relabelselfnextre.desc|Reducing{reduction;steps;next}->letfilter0=label.filterinletmatching=refIndexSet.emptyinletnext_steps=refIndexMap.emptyinletf=process_reduction_stepmatchingnext_stepslabel.filterreductioninIndexSet.iterfsteps;letpush_matching()=ifIndexSet.is_not_empty!matchingthen(letlabel={labelwithfilter=!matching}inprocess_klabelnext)inletpush_steps()=letlabel=Label.capturelabelreduction.captureUsage.emptyinletnext_steps=!next_steps|>IndexMap.bindings|>List.map(fun(a,b)->(b,a))|>IndexRefine.annotated_partitioninList.iter(fun(filter,steps)->assert(IndexSet.subsetfilterfilter0);letsteps=IndexSet.of_liststepsincontinueks{labelwithfilter}(Reducing{reduction;steps;next});)next_steps;inbeginmatchreduction.policywith|Shortest->push_matching();push_steps()|Longest->push_steps();push_matching()endandprocess_relabelselfnext=function|Set(s,var)->beginmatchLabel.filterlabelswith|None->()|Somelabel->continueks(Label.capturelabelvarUsage.empty)nextend|Altes->List.iter(fune->process_klabel(More(e,next)))es|Star(r,Shortest)->process_klabelnext;process_klabel(More(r,self))|Star(r,Longest)->process_klabel(More(r,self));process_klabelnext|Seqes->process_klabel(List.fold_right(funek->More(e,k))esnext)|Filterfilter->beginmatchLabel.filterlabelfilterwith|None->()|Somelabel'->process_klabel'nextend|Reduce(cap,reduction)->letlabel=Label.capturelabel(IndexSet.unioncapreduction.capture)Usage.emptyinletnext_steps=ref[]inIndexSet.iterbeginfunlr1->letsteps=List.fold_rightbeginfun(tr:_Redgraph.transition)steps->ifintersectingtr.reachablereduction.patternthenIndexSet.addtr.stepstepselsestepsend(Redgraph.initialrglr1)IndexSet.emptyinifIndexSet.is_not_emptystepsthenpushnext_steps(steps,lr1);endlabel.filter;letnext_steps=IndexRefine.annotated_partition!next_stepsinList.iter(fun(steps,filter)->letfilter=IndexSet.of_listfilterincontinueks{labelwithfilter}(Reducing{reduction;steps;next}))next_steps;|Usageset->letlabel=Label.capturelabelIndexSet.emptysetinprocess_klabelnextinletlabel={Label.filter;captures=IndexSet.empty;usage=Usage.empty}inprocess_klabelk;List.rev!ksend