Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source
Source file csv.ml
(* csv.ml - comma separated values parser
*
* $Id: csv.ml,v 1.5 2005/02/17 15:51:47 rich Exp $
*)(* The format of CSV files:
*
* Each field starts with either a double quote char or some other
* char. For the some other char case things are simple: just read up
* to the next comma (,) which marks the end of the field.
*
* In the case where a field begins with a double quote char the
* parsing rules are different. Any double quotes are doubled ("") and
* we finish reading when we reach an undoubled quote. eg: "The
* following is a quote: "", and that's all" is the CSV equivalent of
* the following literal field: The following is a quote: ", and that's
* all
*
* "0 is the quoted form of ASCII NUL.
*
* CSV fields can also contain literal newline characters, if
* they are quoted, eg: "This field
* is split over lines" represents a
* single field containing a \n.
*
* Excel will only use the quoting format if a field contains a double
* quote or comma, although there's no reason why Excel couldn't always
* use the quoted format.
*
* The practical upshot of this is that you can't split a line in a CSV
* file just by looking at the commas. You need to parse each field
* separately.
*
* How we represent CSV files:
*
* We load in the whole CSV file at once, and store it internally as a
* 'string list list' type (note that each line in the CSV file can,
* and often will, have different lengths). We then provide simple
* functions to read the CSV file line-by-line, copy it out, or copy a
* subset of it into a matrix.
*)

(* namespace redirection: (mostly to get tail-recursive List functions) *)
open Core
open Poly

(* Thin shims that re-expose a handful of Core functions with positional
 * (non-labelled) argument order, so the code below can keep calling them
 * in the classic stdlib style.  Only the functions listed here are
 * visible through List and String in the rest of this file. *)
include struct
  module List = struct
    open List
    let length = length
    let rev = rev
    let zip_exn = zip_exn
    let map f t = map t ~f
    let iter f t = iter t ~f
    let fold_left f init t = fold_left t ~f ~init
  end

  module String = struct
    open String
    let make = make
    let length = length
    let contains = contains
    (* NOTE(review): presumably here so that the indexed-assignment sugar
     * used on a Bytes buffer in load_rows_inchar resolves to Bytes.set;
     * confirm before removing. *)
    let set = Bytes.set
    let get = get
    let escaped = escaped
    let concat sep xs = concat xs ~sep
  end
end

(* A CSV file: a list of rows, each row being a list of fields. *)
type t = string list list

include Bad_csv

(* Drop the longest prefix of the list whose elements satisfy f. *)
let rec dropwhile f = function
  | [] -> []
  | x :: xs when f x -> dropwhile f xs
  | xs -> xs

(* Number of rows. *)
let lines = List.length

(* Length of the longest row (0 for an empty CSV). *)
let columns csv =
  List.fold_left max 0 (List.map List.length csv)

open State

(* Core parser.  Pulls characters from inchar (which must raise
 * End_of_file when exhausted) and calls f once with every completed
 * row.  separator defaults to a comma.
 *
 * Raises Bad_CSV_file if a non-whitespace character other than a
 * quote, the digit 0, the separator or a newline directly follows
 * the closing quote of a quoted field. *)
let load_rows_inchar ?(separator = ',') f inchar =
  let row = ref [] in                   (* Current row. *)
  let field = ref [] in                 (* Current field. *)
  let state = ref StartField in         (* Current state. *)
  let end_of_field () =
    (* The field is accumulated as a reversed list of chars; copy it
     * into a fresh byte buffer and push the resulting string. *)
    let field_list = List.rev !field in
    let field_len = List.length field_list in
    let field_str = Bytes.create field_len in
    let rec loop i = function
        [] -> ()
      | x :: xs ->
          field_str.[i] <- x;
          loop (i+1) xs
    in
    loop 0 field_list;
    row := (Bytes.unsafe_to_string ~no_mutation_while_string_reachable:field_str) :: !row;
    field := [];
    state := StartField
  in
  let empty_field () =
    row := "" :: !row;
    field := [];
    state := StartField
  in
  let end_of_row () =
    let row_list = List.rev !row in
    f row_list;
    row := [];
    state := StartField
  in
  let rec loop () =
    let c = inchar () in
    if c <> '\r' then (                 (* Always ignore \r characters. *)
      match !state with
          StartField ->                 (* Expecting quote or other char. *)
            if c = '\"' then (
              state := InQuotedField;
              field := []
            ) else if c = separator then (* Empty field. *)
              empty_field ()
            else if c = '\n' then (     (* Empty field, end of row. *)
              empty_field ();
              end_of_row ()
            ) else (
              state := InUnquotedField;
              field := [c]
            )
        | InUnquotedField ->            (* Reading chars to end of field. *)
            if c = separator then       (* End of field. *)
              end_of_field ()
            else if c = '\n' then (     (* End of field and end of row. *)
              end_of_field ();
              end_of_row ()
            ) else
              field := c :: !field
        | InQuotedField ->              (* Reading chars to end of field. *)
            if c = '\"' then
              state := InQuotedFieldAfterQuote
            else
              field := c :: !field
        | InQuotedFieldAfterQuote ->
            if c = '\"' then (          (* Doubled quote. *)
              field := c :: !field;
              state := InQuotedField
            ) else if c = '0' then (    (* Quote-0 is ASCII NUL. *)
              field := '\000' :: !field;
              state := InQuotedField
            ) else if c = separator then (* End of field. *)
              end_of_field ()
            else if c = '\n' then (     (* End of field and end of row. *)
              end_of_field ();
              end_of_row ()
            ) else if Char.is_whitespace c then
              ()                        (* Skip blanks after the end quote. *)
            else
              raise (Bad_CSV_file "Extra data after end quote")
    );                                  (* end of match *)
    loop ()
  in
  try
    loop ()
  with
      End_of_file ->
        (* Any part left to write out? *)
        (match !state with
             StartField ->
               if !row <> [] then
                 ( empty_field (); end_of_row () )
           | InUnquotedField | InQuotedFieldAfterQuote ->
               end_of_field (); end_of_row ()
           | InQuotedField ->
               (* Unterminated quoted field: the pending row is silently
                * discarded (the original error is left disabled). *)
               () (*raise (Bad_CSV_file "Missing end quote after quoted field.")*)
        )

(* Parse a channel, calling f on every row as it is completed. *)
let load_rows ?separator f chan =
  let inchar () =
    match In_channel.input_char chan with
    | None -> raise End_of_file
    | Some c -> c
  in
  load_rows_inchar ?separator f inchar
;;

(* Parse everything produced by inchar and return the rows in order. *)
let load_inchar ?separator inchar =
  let csv = ref [] in
  let f row = csv := row :: !csv in
  load_rows_inchar ?separator f inchar;
  List.rev !csv
;;

(* Parse CSV data held in a string. *)
let load_string ?separator s =
  let pos = ref 0 in
  let len = String.length s in
  let inchar () =
    if !pos = len then raise End_of_file;
    let c = s.[!pos] in
    incr pos;
    c
  in
  load_inchar ?separator inchar
;;

(* Parse a whole channel and return the rows in order. *)
let load_in ?separator chan =
  let csv = ref [] in
  let f row = csv := row :: !csv in
  load_rows ?separator f chan;
  List.rev !csv

(* Load a CSV file by name.  The channel is closed even if parsing
 * raises (for example Bad_CSV_file), so no descriptor is leaked. *)
let load ?separator filename =
  In_channel.with_file filename ~f:(fun chan -> load_in ?separator chan)

(* Remove empty rows and columns from the edges of a CSV.  Each of
 * top, left, right and bottom (all default true) enables trimming
 * on that side.  A cell is empty when it is the empty string; a row
 * is empty when all of its cells are. *)
let trim ?(top=true) ?(left=true) ?(right=true) ?(bottom=true) csv =
  let rec empty_row = function
    | [] -> true
    | x :: _ when x <> "" -> false
    | _ :: xs -> empty_row xs
  in
  let csv = if top then dropwhile empty_row csv else csv in
  let csv =
    if right then
      List.map (fun row ->
                  let row = List.rev row in
                  let row = dropwhile ((=) "") row in
                  let row = List.rev row in
                  row) csv
    else csv in
  let csv =
    if bottom then (
      let csv = List.rev csv in
      let csv = dropwhile empty_row csv in
      let csv = List.rev csv in
      csv
    ) else csv in

  let empty_left_cell =
    function [] -> true | x :: _ when x = "" -> true | _ -> false in
  let empty_left_col =
    List.fold_left (fun a row -> a && empty_left_cell row) true in
  let remove_left_col =
    List.map (function [] -> [] | _ :: xs -> xs) in
  (* True while at least one row still has a cell.  Without this guard
   * the loop below never terminates on an empty CSV or on a CSV whose
   * rows are all empty lists, because their left column always counts
   * as empty and removing it changes nothing. *)
  let has_cells =
    List.fold_left (fun found row -> found || row <> []) false in
  let rec loop csv =
    if has_cells csv && empty_left_col csv then
      loop (remove_left_col csv)
    else csv
  in
  let csv = if left then loop csv else csv in
  csv

(* Pad every row on the right with empty strings so that all rows have
 * as many cells as the longest one. *)
let square csv =
  let columns = columns csv in
  List.map (
    fun row ->
      let n = List.length row in
      let row = List.rev row in
      (* Padding is prepended to the reversed row, i.e. appended to the
       * row once it is reversed back. *)
      let rec loop acc = function
        | 0 -> acc
        | i -> "" :: loop acc (i-1)
      in
      let row = loop row (columns - n) in
      List.rev row
  ) csv

(* Pair each cell of every data row with the matching header name.
 * Rows longer than the header are truncated; shorter rows are padded
 * with empty strings. *)
let associate header data =
  let nr_cols = List.length header in
  let rec trunc = function
    | 0, _ -> []
    | n, [] -> "" :: trunc (n-1, [])
    | n, (x :: xs) -> x :: trunc (n-1, xs)
  in
  List.map (
    fun row ->
      let row = trunc (nr_cols, row) in
      List.zip_exn header row
  ) data

(* Write a CSV through put_string, one row per line.  A field is quoted
 * only if it contains the separator, a double quote or a newline;
 * embedded double quotes are doubled.  separator defaults to a comma. *)
let save_fn ?(separator = ',') put_string csv =
  (* Quote a single CSV field. *)
  let quote_field field =
    if String.contains field separator ||
      String.contains field '\"' ||
      String.contains field '\n'
    then (
      let buffer = Buffer.create 100 in
      Buffer.add_char buffer '\"';
      for i = 0 to (String.length field) - 1 do
        match field.[i] with
            '\"' -> Buffer.add_string buffer "\"\""
          | c    -> Buffer.add_char buffer c
      done;
      Buffer.add_char buffer '\"';
      Buffer.contents buffer
    )
    else
      field in

  let separator = String.make 1 separator in
  List.iter (fun line ->
               put_string (String.concat separator
                             (List.map quote_field line));
               put_string "\n") csv

(* Write a CSV to an output channel. *)
let save_out ?separator chan csv =
  save_fn ?separator (Out_channel.output_string chan) csv
;;

(* Write a CSV to standard output and flush it. *)
let print ?separator csv =
  save_out ?separator Out_channel.stdout csv; Out_channel.flush Out_channel.stdout

(* Save a CSV to the named file.  The channel is closed even if writing
 * raises, so no descriptor is leaked. *)
let save ?separator file csv =
  Out_channel.with_file file ~f:(fun chan -> save_out ?separator chan csv)

(* Write a CSV through output_string as space-padded columns.  Cells are
 * escaped with String.escaped and the CSV is squared first; every cell
 * is followed by at least one space. *)
let save_fn_readable output_string csv =
  (* Escape all the strings in the CSV file first. *)
  let csv = List.map (List.map String.escaped) csv in

  let csv = square csv in

  (* Find the width of each column. *)
  let widths =
    match csv with
      | [] -> []
      | r :: _ ->
          let n = List.length r in
          let lengths = List.map (List.map String.length) csv in
          let max2rows r1 r2 =
            let rp = List.zip_exn r1 r2 in
            List.map (fun ((a : int), (b : int)) -> max a b) rp
          in
          let rec repeat x = function
            | 0 -> []
            | i -> x :: repeat x (i-1)
          in
          List.fold_left max2rows (repeat 0 n) lengths in

  (* Print out each cell at the correct width. *)
  let rec repeat f = function
    | 0 -> ()
    | i -> f (); repeat f (i-1)
  in
  List.iter (
    fun row ->
      let row = List.zip_exn widths row in
      List.iter (
        fun (width, cell) ->
          output_string cell;
          let n = String.length cell in
          repeat (fun () -> output_string " ") (width - n + 1)
      ) row;
      output_string "\n"
  ) csv

(* Column-aligned output to a channel. *)
let save_out_readable oc =
  save_fn_readable (Out_channel.output_string oc)

(* Column-aligned output to standard output. *)
let print_readable = save_out_readable stdout