# 1 "src/owl/nlp/owl_nlp_utils.ml"
(*
* OWL - an OCaml numerical library for scientific computing
* Copyright (c) 2016-2017
* Ben Catterall <bpwc2@cam.ac.uk>
* Liang Wang <liang.wang@cl.cam.ac.uk>
*)

(* some useful regular expressions *)

(* Matches one or more consecutive separator bytes: space, tab, common ASCII
   punctuation, and the bytes of the typographic quotes / em dash listed in
   the character class. *)
let regexp_split = Str.regexp "[ \t;,.'!?()’“”\\/&—\\-]+"


(* Return a copy of [x] with twice as many slots; the newly appended slots
   are initialised to the empty array. *)
let _allocate_space x =
  Owl_log.info "allocate more space";
  let l = Array.length x in
  let y = Array.make l [||] in
  Array.append x y


(* Load a text corpus from file [f], one document per line.

   Each line is split on the regular expression " " and every token present
   as a key in [stopwords] is dropped. When [stopwords] is omitted, an empty
   table is used, so nothing is filtered.

   Returns an array of documents, each document being an array of tokens.
   The input channel is closed on every exit path; exceptions other than
   [End_of_file] raised while reading are re-raised after closing it. *)
let load_from_file ?stopwords f =
  Owl_log.info "load text corpus";
  let t = match stopwords with
    | Some t -> t
    | None   -> Hashtbl.create 2
  in
  (* compile the delimiter once instead of once per input line *)
  let delim = Str.regexp " " in
  let x = ref (Array.make (64 * 1024) [||]) in
  let c = ref 0 in  (* number of documents read so far *)
  let w = ref 0 in  (* number of tokens kept so far *)
  let h = open_in f in
  (
    try
      while true do
        (* grow the buffer before the last free slot gets used *)
        if !c = (Array.length !x) - 1 then x := _allocate_space !x;
        let s = Str.split delim (input_line h)
          |> List.filter (fun w -> Hashtbl.mem t w = false)
          |> Array.of_list
        in
        !x.(!c) <- s;
        c := !c + 1;
        w := !w + Array.length s
      done
    with
    | End_of_file -> ()
    | exn         -> close_in_noerr h; raise exn  (* do not leak the channel *)
  );
  close_in h;
  Owl_log.info "load %i docs, %i words" !c !w;
  (* drop the unused tail of the buffer *)
  Array.sub !x 0 !c


(* Split string [s] on the regular expression " " and return the tokens that
   are not keys of [stopwords] as an array. When [stopwords] is omitted,
   nothing is filtered. *)
let load_from_string ?stopwords s =
  let t = match stopwords with
    | Some t -> t
    | None   -> Hashtbl.create 2
  in
  Str.split (Str.regexp " ") s
  |> List.filter (fun w -> Hashtbl.mem t w = false)
  |> Array.of_list


(* Load stopwords from file [f], one entry per line, into a hash table whose
   keys are the lines read; every key is bound to [0] and duplicated lines
   are added only once.

   The input channel is closed on every exit path; exceptions other than
   [End_of_file] raised while reading are re-raised after closing it. *)
let load_stopwords f =
  Owl_log.info "load stopwords";
  let x = Hashtbl.create (64 * 1024) in
  let h = open_in f in
  (
    try
      while true do
        let w = input_line h in
        if Hashtbl.mem x w = false then Hashtbl.add x w 0
      done
    with
    | End_of_file -> ()
    | exn         -> close_in_noerr h; raise exn  (* do not leak the channel *)
  );
  close_in h;
  x


(* return both word->index and index->word hashtbl *)
(* [x] is an array of documents, each an array of words. Indices are assigned
   to the distinct words in [String.compare] order, starting from 0. *)
let build_vocabulary x =
  Owl_log.info "build up vocabulary";
  (* first pass: collect the set of distinct words (values are placeholders) *)
  let w2i = Hashtbl.create (64 * 1024) in
  Array.iter (fun l ->
    Array.iter (fun w ->
      if Hashtbl.mem w2i w = false then Hashtbl.add w2i w 0
    ) l
  ) x;
  (* copy the distinct words into an array and sort them *)
  let y = Array.make (Hashtbl.length w2i) "" in
  let i = ref 0 in
  Hashtbl.iter (fun w _ -> y.(!i) <- w; i := !i + 1) w2i;
  Array.sort String.compare y;
  (* second pass: rebuild word->index and build index->word from sorted order *)
  let i2w = Hashtbl.(create (length w2i)) in
  Hashtbl.reset w2i;
  Array.iteri (fun i w ->
    Hashtbl.add w2i w i;
    Hashtbl.add i2w i w
  ) y;
  w2i, i2w


(* Map every element of [data] through [dict]. [Hashtbl.find] raises
   [Not_found] for an element that is not a key of [dict]. *)
let tokenise dict data = Array.map (Hashtbl.find dict) data


(* Same as [tokenise], applied to every inner array of [data]. *)
let tokenise_all dict data = Array.map (Array.map (Hashtbl.find dict)) data


(* Write vocabulary [x] to file [f] via [Owl_io.marshal_to_file]. *)
let save_vocabulary x f = Owl_io.marshal_to_file x f


(* Read a vocabulary back from file [f] via [Owl_io.marshal_from_file]. *)
let load_vocabulary f = Owl_io.marshal_from_file f


(* Write model [m] to the file named [f] with the suffix ".model" appended. *)
let save_lda_model m f =
  Owl_log.info "save LDA model";
  Owl_io.marshal_to_file m (f ^ ".model")


(* Read a model from the file named [f] with the suffix ".model" appended. *)
let load_lda_model f =
  Owl_log.info "load LDA model";
  Owl_io.marshal_from_file (f ^ ".model")


(* TODO: perform simple processing of the passed in string *)
(* Currently the identity function: returns [s] unchanged. *)
let simple_process s = s


(* ends here *)