123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459openCore_kernelmoduletypeSymbol=sigtypetvalall:tlistvalto_string:t->stringendmoduletypeScore=sigtypetvalzero:tvalcompare:t->t->intval(<):t->t->boolvalmin:t->t->tvalmax:t->t->tval(+):t->t->tendmoduletypeProfile=sigtypesymboltypettypescorevalscore:t->symbol->scorevalmissing_score:t->scoreendmoduletypeS=sigtypeexpressiontypeautomatontypescoretypeprofiletypesymbolvalprofile:profile->expressionvaldisjunction:expressionlist->expressionvalsequence:expressionlist->expressionvalgap:min_length:int->max_length:int->expression(* val tandem : expression -> expression -> int -> int -> expression *)(* val map : expression -> f:(profile -> profile) -> expression *)valmin_score:expression->scorevalmax_score:expression->scorevalscores:expression->scorearray(* val string_of_expression : expression -> string *)valautomaton:expression->automatonvalscan:(char->symboloption)->automaton->string->(int*score*int)Seq.tendmoduleNucleotide=structtypet=[`A|`C|`G|`T]letall=[`A;`C;`G;`T]letto_string=function|`A->"A"|`C->"C"|`G->"G"|`T->"T"leta=`Aletc=`Cletg=`Glett=`TendmoduleNucleotide_frequency=structtypet=float*float*float*floattypesymbol=Nucleotide.ttypescore=floatletscore(a,c,g,t)b=letf=matchbwith|`A->a|`C->c|`G->g|`T->tinFloat.(100.*f)(* let to_string (a,c,g,t) = Printf.sprintf "(%g,%g,%g,%g)" a c g t *)letmissing_score(a,c,g,t)=Float.((a+c+g+t)/4.)letcomplement(a,c,g,t)=(t,g,c,a)endmoduleNucleotide_IUPAC=structtypet=[|Nucleotide.t|`DisjofNucleotide.tlist]typesymbol=Nucleotide.ttypescore=intletscore(p:t)(b:symbol)=matchpwith|#Nucleotide.tasb'->ifPoly.(b=b')then0else-1|`Disjl->ifList.mem~equal:Poly.(=)lbthen0else-1letmissing_score_=0(* let complement_base = function
* `a -> `t
* | `t -> `a
* | `c -> `g
* | `g -> `c
* | `n -> `n *)(* let complement = function
* | #base as b -> complement_base b
* | `d l -> `d (List.map complement_base l)
*
* let rec tag_base = function
* | `sequence s -> `sequence (List.map tag_base s)
* | `disjunction d -> `disjunction (List.map tag_base d)
* | `gap g as e -> e
* | #base as b -> `base b
* | `d l as e -> `base e
*
* let base_to_string = function
* `a -> "A"
* | `c -> "C"
* | `g -> "G"
* | `t -> "T"
* | `n -> "N"
*
* let to_string = function
* | #base as b -> base_to_string b
* | `d bases -> Printf.sprintf "[%s]" (String.concat "" (List.map base_to_string bases)) *)endmoduleMake(Score:Score)(Symbol:Symbol)(Profile:Profilewithtypescore=Score.tandtypesymbol=Symbol.t)=structtypeexpression=|ProfileofProfile.t|Disjunctionofexpressionlist|Gapofint*int|Sequenceofexpressionlisttypeautomaton=<initial_states:intlist;nb_vertices:int;nb_transitions:int;final_state:int;transitions:(int*[`profileofProfile.t|`any]*int)list;max_state:int>letgap~min_length~max_length=Gap(min_length,max_length)letsequencexs=Sequencexsletdisjunctionxs=Disjunctionxsletprofilep=Profilep(* let rec string_of_expression = function
* `base b -> C.to_string b
* | `disjunction exprz ->
* Printf.sprintf "disjunction(%s)"
* (String.concat "," (List.map string_of_expression exprz))
* | `sequence exprz ->
* Printf.sprintf "sequence(%s)"
* (String.concat "," (List.map string_of_expression exprz))
* | `gap (s,e) -> Printf.sprintf "gap(%d,%d)" s e *)letrecopt_scoreopt=function|Profileprofile->List.foldSymbol.all~init:Score.zero~f:(funcandidatesymbol->optcandidate(Profile.scoreprofilesymbol))|Disjunctionxs->List.foldxs~init:Score.zero~f:(funacce->optacc(opt_scoreopte))|Sequencexs->List.foldxs~init:Score.zero~f:(funacce->Score.(acc+opt_scoreopte))|Gap_->Score.zeroletmin_score=opt_scorePoly.minletmax_score=opt_scorePoly.maxmoduleScoreSet=Set.Make(structincludeScoreletsexp_of_t_=assertfalselett_of_sexp_=assertfalseend)letconvolutionxsys=ScoreSet.foldxs~init:ScoreSet.empty~f:(funaccx->ScoreSet.foldys~init:acc~f:(funaccy->ScoreSet.addaccScore.(x+y)))letrecscores=function|Profileprofile->List.foldSymbol.all~init:ScoreSet.empty~f:(funaccx->ScoreSet.addacc(Profile.scoreprofilex))|Sequenceexprz->List.reduce_exn~f:convolution(List.map~f:scoresexprz)|Disjunctionexprz->(* FIXME j'ai un doute, c'est sans doute une surapproximation dans le cas de la disjonction*)List.reduce_exn~f:ScoreSet.union(List.map~f:scoresexprz)|Gap_->ScoreSet.singletonScore.zeroletscorese=ScoreSet.to_array(scorese)(* The code that follows is adapted from WAPAM *)letrecgen_gapi=function|0->[]|n->(i,`any,i+1)::(gen_gap(i+1)(n-1))letgen_gapvi=letrecauxj=function|0->[]|n->(i,`epsilon,j+1)::(j,`any,j+1)::(aux(j+1)(n-1))inauxiletreccompilationi=function|Profilep->(i+1,[(i,`profilep,i+1)])|Sequences->List.folds~init:(i,[])~f:(fun(i,accu)e->let(o,t)=compilationiein(o,t@accu))|Disjunctions->letforeach_expre(i,accu)=let(o,t)=compilationiein(o+1,(i,t,o)::accu)andconnect_autinputoutput(i,aut,o)accu:(int*[`profileofProfile.t|`epsilon|`any]*int)list=letaut'=(input,`epsilon,i)::(o,`epsilon,output)::autinaut'@accuinletoutput,subaut=List.fold_right~f:foreach_exprs~init:(i+1,[])in(output,List.fold_right~f:(connect_autioutput)subaut~init:[])|Gap(l,u)->(i+u,(gen_gapil)@(gen_gapv(i+l)(u-l)))letepsiloninitialfinalarcs=List.fold_left~f:(funl(i,a,f)->if(f=initial)then(i,a,final)::lelsel)~init:arcsarcsletrecfind_and_removef=function|[]->raiseCaml.Not_found|t::q->if(ft)thent,qelseletx,l=find_and_removefqinx,t::lletcheck_no_epsilonl=List.mapl~f:(function|(i,(`profile_|`anyasp),j)->(i,p,j)|_->assertfalse)letrecenleve_epsilonarcs=trylet(i,_,f),l=find_and_remove(function(_,`epsilon,_)->true|_->false)arcsinenleve_epsilon(epsilonifl)withCaml.Not_found->check_no_epsilonarcsletvertices=List.fold~f:(funaccu(i,_,j)->letaccu'=ifnot(List.mem~equal:Poly.(=)accui)theni::accuelseaccuinifnot(List.mem~equal:Poly.(=)accu'j)thenj::accu'elseaccu')~init:[](* let out_degree g v =
* List.fold_left g ~init:0 ~f:(fun d (i,_,_) ->
* if i = v then d + 1 else d
* ) *)letepsilon_successorsgv=List.map~f:(fun(_,_,v)->v)(List.filter~f:(fun(i,e,_)->v=i&&Poly.(e=`epsilon))g)letrecunionl=function[]->l|h::twhennot(List.mem~equal:Poly.(=)lh)->union(h::l)t|_::t->unionlt(* let rec intersection l = function
* [] -> []
* | h :: t when List.mem ~equal:Poly.( = ) l h -> h :: (intersection l t)
* | _ :: t -> intersection l t *)letreccontainsl=function[]->true|h::twhenList.mem~equal:Poly.(=)lh->containslt|_->falseletequalsll'=containsll'&&containsl'lletepsilon_reachablegv=letrecauxaccuv'=letaccu'=unionaccu(epsilon_successorsgv')inifequalsaccuaccu'thenaccuelseList.fold~f:aux~init:accu'accu'inaux[]v(* to be revived *)(* let rec reverse_complement = function
* `base b -> `base (C.complement b)
* | `disjunction l -> `disjunction (List.map reverse_complement l)
* | `gap (s,e) -> `gap (s,e)
* | `sequence s -> `sequence (List.rev_map reverse_complement s) *)(* NOTE: après suppression des epsilon transitions, les
epsilons-predecesseurs des sommets introduits comme entrée et
sortie des disjonctions deviennent inutiles. On pourrait les
supprimer pour simplifier l'automate *)letautomatonexpr=letfinal_state,transitions=compilation0exprinletepsilon_free=enleve_epsilontransitionsandinitial_states=0::(epsilon_reachabletransitions0)inlettransitions=List.sort~compare:(fun(x,_,y)(x',_,y')->Poly.compare(x,y)(x',y'))epsilon_freeinobjectmethodinitial_states=initial_statesmethodnb_vertices=List.length(verticestransitions)methodnb_transitions=List.lengthtransitionsmethodfinal_state=final_statemethodtransitions=transitionsmethodmax_state=List.foldtransitions~init:(-1)~f:(funaccu(i,_,j)->maxaccu(maxij))endletsetav=fori=0toArray.lengtha-1doa.(i)<-vdonemoduleBScore=structtypet=Bottom|ValueofScore.tlet(<)xy=matchx,ywith|Bottom,Bottom->false|Bottom,Value_->true|Value_,Bottom->false|Valueu,Valuev->Score.(u<v)let(+)xy=matchx,ywith|Bottom,_|_,Bottom->Bottom|Valueu,Valuev->ValueScore.(u+v)letzero=ValueScore.zeroletvalue_exn=function|Bottom->assertfalse|Valuev->vendletscorepc=matchpwith|`any->Score.zero|`profilep->matchcwith|Somec->Profile.scorepc|None->Profile.missing_scorepletrecscan_auxchar(aut:automaton)dnakstateposstate'pos'()=letn=Array.lengthstateinifk>=String.lengthdnathenSeq.Nilelse(setstate'BScore.Bottom;setpos'(-1);letc=chardna.[k]inList.iteraut#initial_states~f:(funj->ifBScore.(state.(j)<zero)then(state.(j)<-BScore.zero;pos.(j)<-k));for_=0ton-1doList.iteraut#transitions~f:(fun(i,p,j)->letvalue=BScore.(state.(i)+Value(scorepc))inifBScore.(state'.(j)<value)then(state'.(j)<-value;pos'.(j)<-pos.(i)));done;letres=k,BScore.value_exnstate'.(aut#final_state),pos'.(aut#final_state)inCons(res,scan_auxcharautdna(k+1)state'pos'statepos))letscancharautdna=letn=aut#max_state+1inletstate=Array.create~len:nBScore.Bottomandstate'=Array.create~len:nBScore.zeroandpos=Array.create~len:n(-1)andpos'=Array.create~len:n0inscan_auxcharautdna0stateposstate'pos'endmodulePSSM=structmoduleM=Make(Float)(Nucleotide)(Nucleotide_frequency)includeMlettandeme1e2min_lengthmax_length=letgap=gap~min_length~max_lengthindisjunction[sequence[e1;gap;e2];sequence[e2;gap;e1]](** Ugly stuff to build pssm out of count matrices *)letpseudo_countskmat=Array.mapmat~f:(Array.map~f:(funi->k+.floati))letlogx=logx/.log2.letlogoddsmat=letforeach_posp=lettotal=Array.fold_right~f:(+.)p~init:0.inArray.map~f:(func->log(c/.total/.0.25))pinArray.map~f:foreach_posmatletcast=function[|a;c;g;t|]->a,c,g,t|_->assertfalseletof_counts?(prior=0.000001)mat=sequenceArray.(to_list(map(logodds(pseudo_countspriormat))~f:(funp->profile(castp))))end