Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source
Source file utls.ml
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618(* Copyright (C) 2020, Francois Berenger
Yamanishi laboratory,
Department of Bioscience and Bioinformatics,
Faculty of Computer Science and Systems Engineering,
Kyushu Institute of Technology,
680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)openPrintfmoduleA=BatArraymoduleHt=BatHashtblmoduleL=BatListmoduleLog=Dolog.LogmoduleS=BatStringtypefilename=stringlettapfx=fx;xletfst3(a,_,_)=aletcreate_tmp_filename()=letres=Filename.temp_file""(* no_prefix *)""(* no_suffix *)in(* tap (Log.info "create_tmp_filename: %s") res; *)resletmkfifo(fn:filename):unit=Unix.mkfifofn0o600(* abort if condition is not met *)letenforce(condition:bool)(err_msg:string):unit=ifnotconditionthenfailwitherr_msgletenforce_f(condition:bool)(msg_constr:unit->string):unit=ifnotconditionthenfailwith(msg_constr())letwith_out_file(fn:filename)(f:out_channel->'a):'a=letoutput=open_out_binfninletres=foutputinclose_outoutput;resletwith_temp_out_file(f:filename->'a):'a=lettemp_file=Filename.temp_file""(* no_prefix *)""(* no_suffix *)inletres=ftemp_fileinSys.removetemp_file;resletwith_in_file(fn:filename)(f:in_channel->'a):'a=letinput=open_in_binfninletres=finputinclose_ininput;resletwith_in_file2fn1fn2f=letin1=open_in_binfn1inletin2=open_in_binfn2inletres=fin1in2inclose_inin1;close_inin2;resletwith_in_file3fn1fn2fn3f=letin1=open_in_binfn1inletin2=open_in_binfn2inletin3=open_in_binfn3inletres=fin1in2in3inclose_inin1;close_inin2;close_inin3;resletwith_infile_outfile(in_fn:filename)(out_fn:filename)(f:in_channel->out_channel->'a):'a=letinput=open_in_binin_fninletoutput=open_out_binout_fninletres=finputoutputinclose_ininput;close_outoutput;resletlines_of_file(fn:filename):stringlist=with_in_filefn(funinput->letres,exn=L.unfold_exc(fun()->input_lineinput)inifexn<>End_of_filethenraiseexnelseres)letlines_to_filefnl=with_out_filefn(funout->L.iter(fprintfout"%s\n")l)(* alias *)letstring_list_to_file=lines_to_file(* read 'nlines' from 'input' in_channel *)letread_n_linesnlinesinput=assert(nlines>=0);letrecloopnacc=ifn=0thenL.revaccelseloop(n-1)(input_lineinput::acc)inloopnlines[](* if the first line is a comment (starts with '#'); then we extract
it separately from other lines; else we treat it as any other line *)letmaybe_extract_comment_header(fn:filename):stringoption*stringlist=letall_lines=lines_of_filefninmatchall_lineswith|[]->(None,[])|fst::others->ifBatString.starts_withfst"#"then(Somefst,others)else(None,all_lines)(* call f on lines of file *)letiter_on_lines_of_filefnf=letinput=open_in_binfnintrywhiletruedof(input_lineinput)donewithEnd_of_file->close_ininputletiteri_on_lines_of_filefnf=leti=ref0inletinput=open_in_binfnintrywhiletruedof!i(input_lineinput);incridonewithEnd_of_file->close_ininputletmap_on_file(fn:filename)(f:in_channel->'a):'alist=with_in_filefn(funinput->letres,exn=L.unfold_exc(fun()->finput)inifexn=End_of_filethenreselseraiseexn)(* map f on lines of file *)letmap_on_lines_of_file(fn:filename)(f:string->'a):'alist=with_in_filefn(funinput->letres,exn=L.unfold_exc(fun()->f(input_lineinput))inifexn=End_of_filethenreselseraiseexn)(* Murphy's law: very big files always have problematic lines... *)letmaybe_map_on_lines_of_file(fn:filename)(f:string->'aoption):'alist=letres=ref[]initer_on_lines_of_filefn(funline->matchflinewith|Somex->res:=x::!res|None->Log.warn"line: '%s'"line);L.rev!resletmapi_on_lines_of_file(fn:filename)(f:int->string->'a):'alist=with_in_filefn(funinput->leti=ref0inletres,exn=L.unfold_exc(fun()->letcurr=f!i(input_lineinput)inincri;curr)inifexn=End_of_filethenreselseraiseexn)(* skip 'nb' blocks from file being read *)letskip_blocksnbread_oneinput=ifnb=0then()elselet()=assert(nb>0)infor_=1tonbdoignore(read_oneinput)doneletread_lines=lines_of_fileletwrite_lines(lines:stringlist)(output_fn:filename):unit=with_out_fileoutput_fn(funout->List.iter(fprintfout"%s\n")lines)letoutput_lines=write_lines(* keep only lines that satisfy p *)letfilter_lines_of_filefnp=L.filterp(lines_of_filefn)(* get the first line (stripped) output by given command *)letget_command_output(cmd:string):string=Log.info"get_command_output: %s"cmd;let_stat,output=BatUnix.run_and_readcmdinmatchBatString.split_on_char'\n'outputwith|first_line::_others->first_line|[]->(Log.fatal"get_command_output: no output for: %s"cmd;exit1)(* run the given command in a sub process (in parallel to the current process)
and returns its pid so that we can wait for it later *)letfork_out_cmd(cmd:string):int=Log.info"fork_out_cmd: %s"cmd;matchUnix.fork()with|0->(* child process *)exit(Sys.commandcmd)|-1->(* error *)(Log.fatal"fork_out_cmd: fork failed";exit1)|pid->pid(* return full path of command, if found in PATH, none else *)letcommand_exists(cmd:string):stringoption=letwhere_is_cmd="which "^cmdinifUnix.system(where_is_cmd^" 2>&1 > /dev/null")=Unix.WEXITED0thenSome(get_command_outputwhere_is_cmd)elseNoneletrun_command(cmd:string):unit=Log.info"run_command: %s"cmd;matchUnix.systemcmdwith|Unix.WSIGNALED_->(Log.fatal"run_command: signaled: %s"cmd;exit1)|Unix.WSTOPPED_->(Log.fatal"run_command: stopped: %s"cmd;exit1)|Unix.WEXITEDiwheni<>0->(Log.fatal"run_command: exit %d: %s"icmd;exit1)|Unix.WEXITED_(* i = 0 then *)->()letget_env(env_var:string):stringoption=trySome(Sys.getenvenv_var)withNot_found->None(* look for exe in PATH then given env. var *)letfind_command(exe:string)(env_var:string):stringoption=matchcommand_existsexewith|Somecmd->Somecmd|None->matchget_envenv_varwith|Somecmd->Somecmd|None->(Log.warn"%s not found in PATH; \
put it in your PATH or setup the \
%s env. var. to point to it"exeenv_var;None)letfilename_is_absolutefn=not(Filename.is_relativefn)letrelative_to_absolutefn=ifFilename.is_relativefnthenletcwd=Sys.getcwd()inFilename.concatcwdfnelsefn(* remove the prefix if it is there, or do nothing if it is not *)letremove_string_prefixprfxstr=ifBatString.starts_withstrprfxthenletprfx_len=String.lengthprfxinBatString.tailstrprfx_lenelsestrletstring_contains_non_binary_digit=Str.regexp"[^01]"letstring_contains_only_zeros_or_ones(s:string):bool=not(Str.string_matchstring_contains_non_binary_digits0)letstring_contains_non_digits_non_sep=Str.regexp"[^-0123456789;]"letstring_is_a_list_of_integers(s:string):bool=BatString.starts_withs"["&&BatString.ends_withs"]"&&letchopped=BatString.chop~l:1~r:1sinnot(Str.string_matchstring_contains_non_digits_non_sepchopped0)letmay_apply_optf=function|Somex->Some(fx)|None->Noneletmay_applyf=function|Somex->fx|None->()(* returns true if we could create the file; false else (already there) *)letlock_file_for_writing(fn:filename):bool=tryletfd=Unix.(openfilefn[O_CREAT;O_EXCL;O_WRONLY]0o600)inUnix.closefd;truewithUnix.Unix_error_->false(* compute all distinct pairs ((a,b) = (b,a)) of elements in 'l' *)letall_pairs(l:'alist):('a*'a)list=letpairxys=L.map(funy->(x,y))ysinletrecloopacc=function|[]->acc|x::xs->loop(L.rev_append(pairxxs)acc)xsinloop[]lexceptionEnough_times(* accumulate the result of calling 'f' 'n' times *)letn_timesnf=leti=ref0inBatList.unfold_exc(fun()->if!i=nthenraiseEnough_timeselseletres=f()inincri;res)letpush(x:'a)(l:'alistref):unit=l:=x::!l(* the identity function *)letidx=xletone_or_more_spaces=Str.regexp"[ ]+"letstring_of_floats_arrayfv=letbuff=Buffer.create80inA.iter(funf->Buffer.add_stringbuff(sprintf"%f "f))fv;Buffer.contentsbuff(* enforce filename uses one of the allowed extensions *)letenforce_file_extensionallowed_extsfn=assert(L.exists(BatString.ends_withfn)allowed_exts)(* Pi math constant *)letm_pi=4.0*.atan1.0letprependxxs=xs:=x::!xs(* test (lb <= x <= hb) *)letin_boundslbxhb=x>=lb&&x<=hbletlist_medianf(l:floatlist):float=letxs=A.of_listlinA.sortBatFloat.comparexs;letn=A.lengthxsinifnmod2=1thenxs.(n/2)else0.5*.(xs.(n/2)+.xs.(n/2-1))(*$T list_medianf
list_medianf [1.;2.;3.;4.;5.] = 3.0
list_medianf [1.;2.;3.;4.] = 2.5
*)letstring_of_array?pre:(pre="[|")?sep:(sep=";")?suf:(suf="|]")to_stra=letbuff=Buffer.create80inBuffer.add_stringbuffpre;A.iteri(funix->ifi>0thenBuffer.add_stringbuffsep;Buffer.add_stringbuff(to_strx))a;Buffer.add_stringbuffsuf;Buffer.contentsbuff(* marshal x to file *)letsavefnx=with_out_filefn(funout->Marshal.to_channeloutx[Marshal.No_sharing])(* unmarshal x from file; the file might be gzip, bzip2 or xz compressed *)letrestorefn=ifS.ends_withfn".gz"thenletin_chan=Unix.open_process_in("gunzip -ck "^fn)inMarshal.from_channelin_chanelseifS.ends_withfn".bz2"thenletin_chan=Unix.open_process_in("bunzip2 -ck "^fn)inMarshal.from_channelin_chanelseifS.ends_withfn".xz"thenletin_chan=Unix.open_process_in("unxz -ck "^fn)inMarshal.from_channelin_chanelse(* assume file is not compressed *)with_in_filefn(funinput->Marshal.from_channelinput)letis_oddi=imod2=1letis_eveni=imod2=0letget_first_linefn=with_in_filefninput_line(* like the cut unix command *)letcutdfline=letsplitted=BatString.split_on_chardlineinBatList.atsplittedfletget_ncores()=int_of_string(get_command_output"getconf _NPROCESSORS_ONLN")letint_of_bool=function|true->1|false->0letbool_of_int=function|1->true|0->false|_->assert(false)(* test that 'y' is within 'x' +/- 'epsilon';
i.e. y \in [x - epsilon, x + epsilon] *)letapprox_equalepsilonxy=(y>=x-.epsilon)&&(y<=x+.epsilon)(* proper NaN/nan testing *)letis_nanx=matchclassify_floatxwith|FP_nan->true|_->false(* some statistics *)letfaverage=BatList.favgletfmean=faverage(* population standard deviation *)letstddev(l:floatlist):float=letn,sx,sx2=List.fold_left(fun(n,sx,sx2)x->(n+.1.,sx+.x,sx2+.(x*.x)))(0.,0.,0.)linsqrt((sx2-.(sx*.sx)/.n)/.n)(* stddev [2.; 4.; 4.; 4.; 5.; 5.; 7.; 9.] = 2.0 *)(* elements ranks as floats (in a new array) *)letrank(arr:floatarray):floatarray=letsorted=A.copyarrinA.sortFloat.comparesorted;letn=A.lengtharrinletht=Ht.createninletrank=ref1.0inA.iter(funx->trybegin(* deal with ties *)letranks=Ht.findhtxinHt.replacehtx(!rank::ranks);rank:=!rank+.1.0endwithNot_found->beginHt.addhtx[!rank];rank:=!rank+.1.0end)sorted;letelt2rank=Ht.map(fun_eltranks->L.favgranks)htinA.iteri(funix->A.unsafe_setsortedi(Ht.findelt2rankx))arr;sorted(* (\* unit test for rank *\)
* let () =
* assert(rank [|2.; 4.; 7.; 7.; 12.|] = [|1.0; 2.0; 3.5; 3.5; 5.0|]) *)(* Code comes from Biocaml.Math *)letwilcoxon_rank_sum_to_zarr1arr2=letl1,l2=A.(lengtharr1,lengtharr2)inletranked=rank(A.appendarr1arr2)inletarr1=A.subranked0l1inletl1,l2=Float.(of_intl1,of_intl2)inletsum1=A.fsumarr1inletexpectation=(l1*.(l1+.l2+.1.))/.2.inletvar=(l1*.l2*.((l1+.l2+.1.)/.12.))in(sum1-.expectation)/.(sqrtvar)letcndx=(* Modified from C++ code by David Koppstein. Found from
www.sitmo.com/doc/Calculating_the_Cumulative_Normal_Distribution *)letb1,b2,b3,b4,b5,p,c=0.319381530,-0.356563782,1.781477937,-1.821255978,1.330274429,0.2316419,0.39894228inifx>=0.0thenlett=1./.(1.+.(p*.x))in(1.-.(c*.(exp(-.x*.x/.2.))*.t*.(t*.(t*.(t*.((t*.b5)+.b4)+.b3)+.b2)+.b1)))elselett=1./.(1.-.p*.x)inc*.(exp(-.x*.x/.2.))*.t*.(t*.(t*.(t*.((t*.b5)+.b4)+.b3)+.b2)+.b1)letwilcoxon_rank_sum_to_parr1arr2=(* assumes a two-tailed distribution *)letz=wilcoxon_rank_sum_to_zarr1arr2in2.*.(1.-.(cnd(Float.absz)))letfincr_by(xref:floatref)(dx:float):unit=xref:=!xref+.dxletint_of_digit_char=function|'0'->0|'1'->1|'2'->2|'3'->3|'4'->4|'5'->5|'6'->6|'7'->7|'8'->8|'9'->9|c->failwith(sprintf"Utls.int_of_digit_char: not a digit: %c"c)letchar_of_digit=function|0->'0'|1->'1'|2->'2'|3->'3'|4->'4'|5->'5'|6->'6'|7->'7'|8->'8'|9->'9'|d->failwith(sprintf"Utls.char_of_digit: not a digit: %d"d)letstring_of_digitd=String.make1(char_of_digitd)letmake_pairxy=(x,y)(* sort then count duplicates *)letlist_uniq_countl=letsorted=L.sortcomparelinletgroups=L.group_consecutive(=)sortedinL.map(funl->L.(hdl,lengthl))groups(* coarse grain chronometer *)lettime_itf=letstart=Unix.gettimeofday()inletres=f()inletstop=Unix.gettimeofday()in(stop-.start,res)letceili(x:float):int=int_of_float(ceilx)letcount_lines_of_file(fn:string):int=letcount=ref0initer_on_lines_of_filefn(fun_line->incrcount);!countletlist_rev_sortcmpl=List.sort(funxy->cmpyx)lletlist_really_takenl=lettook=L.takenlinassert(L.lengthtook=n);tookletarray_rand_eltrnga=letn=A.lengthainleti=BatRandom.State.intrngnina.(i)(* [|1;2;3|] [4;5;6] -> [1;2;3;4;5;6] *)letprepend_list_with_arrayal=Array.fold_right(funxacc->x::acc)alletarray_without_elt_atia=letn=(Array.lengtha)-1inletres=Array.makena.(0)inletdest=ref0inforj=0tondoifj<>ithenbeginres.(!dest)<-a.(j);incrdestenddone;resletlist_really_remove_onelx=assert(L.memxl);(* BatList.remove doesn't enforce this *)L.removelxletarray_prepend_to_listal=A.fold_right(funxacc->x::acc)alletstring_array_concatsepa=letbuff=Buffer.create1024inA.iteri(funis->ifi>0thenBuffer.add_stringbuffsep;Buffer.add_stringbuffs)a;Buffer.contentsbuff