Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source
Source file csv.ml
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129#1 "src/csv.pp.ml"(* File: csv.pp.ml
Copyright (C) 2005-2009
Richard Jones
email: rjones@redhat.com
Christophe Troestler
email: Christophe.Troestler@umons.ac.be
WWW: http://math.umons.ac.be/anum/software/
This library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License version 2.1 or
later as published by the Free Software Foundation, with the special
exception on linking described in the file LICENSE.
This library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the file
LICENSE for more details. *)(* MOTIVATION. There used to be several solutions to parse CSV files
in OCaml. They did not suit my needs however because:
- The files I need to deal with have a header which does not always
reflect the data structure (say the first row are the names of
neurones but there are two columns per name). In other words I
want to be able to deal with heterogeneous files.
- I do not want to read the the whole file at once (I may but I
just want to be able to choose). Higher order functions like fold
are fine provided the read stops at the line an exception is raised
(so it can be reread again).
- For similarly encoded line, being able to specify once a decoder
and then use a type safe version would be nice.
- Speed is not neglected (we would like to be able to parse a
~2.5Mb file under 0.1 sec on my machine (2GHz Core Duo)).
We follow the CVS format documentation available at
http://www.creativyst.com/Doc/Articles/CSV/CSV01.htm
*)(*
open Csv__Csv_utils
module Header = Csv__Csv_row.Header
module Row = Csv__Csv_row.Row
*)includeCsv_utilsmoduleHeader=Csv_row.HeadermoduleRow=Csv_row.Rowtypet=stringlistlist(*
open Lwt
*)classtypein_obj_channel=objectmethodinput:Bytes.t->int->int->intmethodclose_in:unit->unitendclasstypeout_obj_channel=objectmethodoutput:Bytes.t->int->int->intmethodclose_out:unit->unitend(*
* Input
*)exceptionFailureofint*int*stringletbuffer_len=0x1FFF(* We buffer the input as this allows the be efficient while using
very basic input channels. The drawback is that if we want to use
another tool, there will be data hold in the buffer. That is why
we allow to convert a CSV handle to an object sharing the same
buffer. Because if this, we actually decided to implement the CSV
handle as an object that is coercible to a input-object.
FIXME: This is not made for non-blocking channels. Can we fix it? *)typein_channel={in_chan:(* Lwt_io.input_channel *)in_obj_channel;in_buf:Bytes.t;(* The data in the in_buf is at indexes i s.t. in0 <= i < in1.
Invariant: 0 <= in0 ; in1 <= buffer_len in1 < 0 indicates a
closed channel. *)mutablein0:int;mutablein1:int;mutableend_of_file:bool;(* If we encounter an End_of_file exception, we set this flag to
avoid reading again because we do not know how the channel will
react past an end of file. That allows us to assume that
reading past an end of file will keep raising End_of_file. *)current_field:Buffer.t;(* buffer reused to scan fields *)mutablerecord:stringlist;(* The current record *)mutablerecord_n:int;(* For error messages *)has_header:bool;mutableheader:Header.t;(* Convert the rows on demand (=> do not
pay the price if one does not use that
feature). *)separator:char;backslash_escape:bool;(* Whether \x is considered as an escape *)excel_tricks:bool;fix:bool;(* Whitespace related stripping functions: *)is_space:char->bool;lstrip_buffer:Buffer.t->unit;rstrip_substring:Bytes.t->int->int->string;rstrip_contents:Buffer.t->string;}(*
* CSV input format parsing
*)(* [fill_in_buf_or_Eof chan] refills in_buf if needed (when empty). After
this [in0 < in1] or [in0 = in1 = 0], the latter indicating that
there is currently no bytes to read (for a non-blocking channel).
@raise End_of_file if there are no more bytes to read. *)letfill_in_buf_or_Eofic=ific.end_of_filethen(* Lwt.fail *)raiseEnd_of_fileelseific.in0>=ic.in1thenbeginic.in0<-0;(* Lwt_io.read_into ic.in_chan ic.in_buf 0 buffer_len >>= fun len ->
if len = 0 then (
ic.end_of_file <- true;
Lwt.fail End_of_file
)
else (ic.in1 <- len; Lwt.())
*)tryic.in1<-ic.in_chan#inputic.in_buf0buffer_len;withEnd_of_file->ic.end_of_file<-true;raiseEnd_of_fileend(* else Lwt.() *)(* Add chars to [ic.current_field] from [ic.in_buf.[i]] as long as they
satisfy [predicate]. *)letrecadd_if_satisfyicpredicatei=ifi>=ic.in1then(Buffer.add_subbytesic.current_fieldic.in_bufic.in0(i-ic.in0);ic.in0<-i;fill_in_buf_or_Eofic;add_if_satisfyicpredicate0)elseletc=Bytes.unsafe_getic.in_bufiinifpredicatecthenadd_if_satisfyicpredicate(i+1)else(Buffer.add_subbytesic.current_fieldic.in_bufic.in0(i-ic.in0);ic.in0<-i;(* at char [c]; [i < ic.in1]. *)())letadd_spacesic=add_if_satisfyicic.is_spaceic.in0(* Assume that the current position [ic.in0] is just after the end of
a field. Determine if a subsequent field follows or a new record
must be started. Place the current position at the beginning of
the next field. *)lethas_next_fieldic=assert(ic.in0<ic.in1);letc=Bytes.unsafe_getic.in_bufic.in0inic.in0<-ic.in0+1;ifc='\r'then((* Skip a possible CR *)tryfill_in_buf_or_Eofic;ifBytes.unsafe_getic.in_bufic.in0='\n'thenic.in0<-ic.in0+1;(false)withEnd_of_file->(false))else(c=ic.separator)(* Unquoted field. Read till a delimiter, a newline, or the
end of the file. Skip the next delimiter or newline.
@ [true] if more fields follow, [false] if the record
is complete. *)letrecseek_unquoted_separatorici=ifi>=ic.in1then((* End not found, need to look at the next chunk *)Buffer.add_subbytesic.current_fieldic.in_bufic.in0(i-ic.in0);ic.in0<-i;fill_in_buf_or_Eofic;seek_unquoted_separatoric0)elseletc=Bytes.unsafe_getic.in_bufiinifc=ic.separator||c='\n'||c='\r'then(ifBuffer.lengthic.current_field=0then(* Avoid copying the string to the buffer if unnecessary *)ic.record<-ic.rstrip_substringic.in_bufic.in0(i-ic.in0)::ic.recordelse(Buffer.add_subbytesic.current_fieldic.in_bufic.in0(i-ic.in0);ic.record<-ic.rstrip_contentsic.current_field::ic.record);ic.in0<-i;has_next_fieldic)elseseek_unquoted_separatoric(i+1)letadd_unquoted_fieldic=tryseek_unquoted_separatoricic.in0withEnd_of_file->ic.record<-ic.rstrip_contentsic.current_field::ic.record;(false)letrecexamine_quoted_fieldicfield_noafter_final_quote~after_bad_quotei=ifi>=ic.in1then((* End of field not found, need to look at the next chunk *)Buffer.add_subbytesic.current_fieldic.in_bufic.in0(i-ic.in0);ic.in0<-i;fill_in_buf_or_Eofic;examine_quoted_fieldicfield_noafter_final_quote~after_bad_quote0)elseletc=Bytes.unsafe_getic.in_bufiinifc='\"'then(after_final_quote:=true;(* Save the field so far, without the quote *)Buffer.add_subbytesic.current_fieldic.in_bufic.in0(i-ic.in0);ic.in0<-i+1;(* skip the quote *)(* The field up to [ic.in0] is saved, can refill if needed. *)fill_in_buf_or_Eofic;(* possibly update [ic.in0] *)letc=Bytes.unsafe_getic.in_bufic.in0inifc=ic.separator||c='\n'||c='\r'then(ic.record<-Buffer.contentsic.current_field::ic.record;has_next_fieldic)elseifc='\"'then((* Either a correctly escaped quote or the closing of a badly
escaped one and the closing of the field. In both cases,
the field has a quote. *)Buffer.add_charic.current_field'\"';ic.in0<-ic.in0+1;letlen_field=Buffer.lengthic.current_fieldinadd_spacesic;(* [ic.in0 < ic.in1] or EOF *)letc=Bytes.unsafe_getic.in_bufic.in0inifafter_bad_quote(* ⇒ [ic.fix] *)&&(c=ic.separator||c='\n'||c='\r')then((* space + separator, consider it closes the field. *)ic.record<-Buffer.subic.current_field0len_field::ic.record;has_next_fieldic)else((* Not [after_bad_quote] (e.g. if [ic.fix] is false) or does
not look like the end of a field ⇒ escaped quote (already
added). *)after_final_quote:=false;(* [c] is kept so a quote will be included in the field *)examine_quoted_fieldicfield_noafter_final_quote~after_bad_quoteic.in0))elseific.excel_tricks&&c='0'then((* Supposedly, '"' '0' means ASCII NULL *)after_final_quote:=false;Buffer.add_charic.current_field'\000';ic.in0<-ic.in0+1;(* skip the '0' *)examine_quoted_fieldicfield_noafter_final_quote~after_bad_quoteic.in0)elseific.is_spacec||ic.fixthen((* Either a final quote or a badly escaped one. Keep the
length of the field if it is complete (the normal case) and
add more to the buffer in case it must be kept. *)letlen_field=Buffer.lengthic.current_fieldinBuffer.add_charic.current_field'\"';add_spacesic;(* [ic.in0 < ic.in1] or EOF *)letc=Bytes.unsafe_getic.in_bufic.in0inifc=ic.separator||c='\n'||c='\r'then((* Normal field termination ⇒ save field; after_final_quote=true *)ic.record<-Buffer.subic.current_field0len_field::ic.record;has_next_fieldic)elseific.fixthen((* Badly escaped quote, [ic.current_field] to be continued *)after_final_quote:=false;examine_quoted_fieldicfield_noafter_final_quote~after_bad_quote:(notafter_bad_quote)ic.in0)elseraise(Failure(ic.record_n,field_no,"Non-space char after closing the quoted field")))elseraise(Failure(ic.record_n,field_no,"Bad '\"' in quoted field")))elseific.backslash_escape&&c='\\'then((* Save the field so far, without the backslash: *)Buffer.add_subbytesic.current_fieldic.in_bufic.in0(i-ic.in0);ic.in0<-i+1;(* skip the backslash *)fill_in_buf_or_Eofic;(* possibly update [ic.in0] *)letc=Bytes.unsafe_getic.in_bufic.in0inBuffer.add_charic.current_fieldunescape.(Char.codec);ic.in0<-ic.in0+1;(* skip the char [c]. *)examine_quoted_fieldicfield_noafter_final_quote~after_bad_quoteic.in0)elseexamine_quoted_fieldicfield_noafter_final_quote~after_bad_quote(i+1)letadd_quoted_fieldicfield_no=letafter_final_quote=reffalsein(* preserved through exn *)tryexamine_quoted_fieldicfield_noafter_final_quote~after_bad_quote:falseic.in0withEnd_of_file->(* Add the field even if not closed well *)ic.record<-Buffer.contentsic.current_field::ic.record;if!after_final_quote||ic.fixthen(false)(* = record is complete *)elseraise(Failure(ic.record_n,field_no,"Quoted field closed by end of file"))(* We suppose to be at the beginning of a field. Add the next field
to [record]. @ [true] if more fields follow, [false] if the
record is complete.
Return Failure (if there is a format error) or End_of_file (if
there is not more data to read). *)letadd_next_fieldicfield_no=Buffer.clearic.current_field;tryadd_spacesic;(* Now, in0 < in1 or End_of_file was raised *)letc=Bytes.unsafe_getic.in_bufic.in0inifc='\"'then(ic.in0<-ic.in0+1;Buffer.clearic.current_field;(* remove spaces *)add_quoted_fieldicfield_no)elseific.excel_tricks&&c='='then(ic.in0<-ic.in0+1;(* mark '=' as read *)tryfill_in_buf_or_Eofic;ifBytes.unsafe_getic.in_bufic.in0='\"'then((* Excel trick ="..." to prevent spaces around the field
to be removed. *)ic.in0<-ic.in0+1;(* skip '"' *)add_quoted_fieldicfield_no)else(ic.lstrip_bufferic.current_field;(* remove spaces *)Buffer.add_charic.current_field'=';add_unquoted_fieldic)withEnd_of_file->ic.record<-"="::ic.record;(false))else(ic.lstrip_bufferic.current_field;(* remove spaces *)add_unquoted_fieldic)withEnd_of_file->(* If it is the first field, coming from [next()], the field is
made of spaces. If after the first, we are sure we read a
delimiter before (but maybe the field is empty). Thus add
en empty field. *)ic.record<-""::ic.record;(false)letrecadd_all_record_fieldsic~more_fields~field_no=ifmore_fieldsthen(letmore=add_next_fieldicfield_noinadd_all_record_fieldsic~more_fields:more~field_no:(field_no+1))else()letnextic=ific.in1<0thenraise(Sys_error"Bad file descriptor")else(fill_in_buf_or_Eofic;(* End_of_file means no more records *)ic.record<-[];ic.record_n<-ic.record_n+1;(* the current line being read *)add_all_record_fieldsic~more_fields:true~field_no:1;ic.record<-List.revic.record;(ic.record))letcurrent_recordic=ic.record(*
let rec fold_left ~f ~init:a ic =
Lwt.catch (fun () -> next ic >>= fun r ->
f a r >>= fun a ->
fold_left ~f ~init:a ic)
(function End_of_file -> (a)
| exn -> Lwt.fail exn)
*)letfold_left~f~init:a0ic=leta=refa0intry(* Single "try" block for the whole loop. *)whiletruedoa:=f!a(nextic)done;assertfalsewithEnd_of_file->!a(*
let rec iter ~f ic =
Lwt.catch (fun () -> next ic >>= fun r ->
f r >>= fun () ->
iter ~f ic)
(function End_of_file -> ()
| exn -> Lwt.fail exn)
*)letiter~fic=trywhiletruedof(nextic)done;withEnd_of_file->()letinput_allic=letrecords=fold_left~f:(funlr->(r::l))~init:[]icin(List.revrecords)letfold_right~fica0=(* We to collect all records before applying [f] -- last row first. *)letlr=fold_left~f:(funlr->(r::l))~init:[]icin(* Lwt_list.fold_left_s *)List.fold_left(funar->fra)a0lr(*
* Creating a handle, possibly with header
*)letof_in_obj?(separator=',')?(strip=true)?(has_header=false)?header?(backslash_escape=false)?(excel_tricks=true)?(fix=false)in_chan=ifseparator='\n'||separator='\r'theninvalid_arg"Csv (input): the separator cannot be '\\n' or '\\r'";letic={in_chan=in_chan;in_buf=Bytes.createbuffer_len;in0=0;in1=0;end_of_file=false;current_field=Buffer.create0xFF;record=[];record_n=0;(* => first record numbered 1 *)has_header=has_header||header<>None;header=Header.empty;separator=separator;backslash_escape;excel_tricks=excel_tricks;fix=fix;(* Stripping *)is_space=(ifseparator='\t'thenis_real_spaceelseis_space_or_tab);lstrip_buffer=(ifstripthenBuffer.clearelsedo_nothing);rstrip_substring=(ifstripthenrstrip_substringelseBytes.sub_string);rstrip_contents=(ifstripthenrstrip_contentselseBuffer.contents);}inifhas_headerthen((* Try to initialize headers with the first record that is read. *)tryletnames=nexticinleth=Header.of_namesnamesinleth=matchheaderwith|None->h|Someh0->Header.merge~main:(Header.of_namesh0)hin{icwithheader=h}withEnd_of_file|Failure_->(ic))else((* The channel does not contain a header. *)matchheaderwith|None->(ic)|Someh0->{icwithheader=Header.of_namesh0})(* let of_channel = of_in_obj *)letof_channel?separator?strip?has_header?header?backslash_escape?excel_tricks?fixfh=of_in_obj?separator?strip?has_header?header?backslash_escape?excel_tricks?fix(objectvalfh=fhmethodinputsofslen=tryletr=Pervasives.inputfhsofsleninifr=0thenraiseEnd_of_file;rwithSys_blocked_io->0methodclose_in()=Pervasives.close_infhend)letof_string?separator?strip?has_header?header?backslash_escape?excel_tricks?fixstr=of_in_obj?separator?strip?has_header?header?backslash_escape?excel_tricks?fix(objectvalmutableposition=0methodinputbufofslen=ifposition>=String.lengthstrthenraiseEnd_of_fileelse(letactual=minlen(String.lengthstr-position)inString.blitstrpositionbufofsactual;position<-position+actual;actual)methodclose_in()=()end)letclose_inic=ific.in1>=0thenbeginic.in0<-0;ic.in1<--1;(* Lwt_io.close ic.in_chan *)ic.in_chan#close_in();(* may raise an exception *)endelse()(* *)letto_in_objic=objectvalic=icmethodinputbufofslen=ifofs<0||len<0||ofs+len>Bytes.lengthbuftheninvalid_arg"Csv.to_in_obj#input";ific.in1<0thenraise(Sys_error"Bad file descriptor");fill_in_buf_or_Eofic;letr=minlen(ic.in1-ic.in0)inBytes.blitic.in_bufic.in0bufofsr;ic.in0<-ic.in0+r;rmethodclose_in()=close_inicendletload?separator?strip?backslash_escape?excel_tricks?fixfname=letfh=iffname="-"then(* (Lwt_io.stdin) *)stdinelse(* Lwt_io.open_file ~mode:Lwt_io.Input fname *)open_infnameinletcsv=of_channel?separator?strip?backslash_escape?excel_tricks?fixfhinlett=input_allcsvinclose_incsv;(t)letload_in?separator?strip?backslash_escape?excel_tricks?fixch=letfh=of_channel?separator?strip?backslash_escape?excel_tricks?fixchininput_allfh(* *)(* @deprecated *)letload_rows?separator?strip?backslash_escape?excel_tricks?fixfch=iter~f(of_channel?separator?strip?backslash_escape?excel_tricks?fixch)(*
* Output
*)(* Arrays for backslash-escaping. *)letmust_escape=Array.make256falselet()=List.iter(func->must_escape.(Char.codec)<-true)['\"';'\\';'\000';'\b';'\n';'\r';'\t';'\026']letescape=(* Keep in sync with [unescape]. *)letescape_ofc=matchChar.unsafe_chrcwith|'\000'->'0'(* esape: \0 *)|'\b'->'b'|'\n'->'n'|'\r'->'r'|'\t'->'t'|'\026'->'Z'|c->cinArray.init256escape_of(* FIXME: Rework this part *)typeout_channel={out_chan:(* Lwt_io.output_channel *)out_obj_channel;out_separator:char;out_separator_bytes:Bytes.t;out_backslash_escape:bool;out_excel_tricks:bool;quote_all:bool;}letto_out_obj?(separator=',')?(backslash_escape=false)?(excel_tricks=false)?(quote_all=false)out_chan=ifseparator='\n'||separator='\r'theninvalid_arg"Csv (output): the separator cannot be '\\n' or '\\r'";{out_chan=out_chan;out_separator=separator;out_separator_bytes=Bytes.make1separator;out_backslash_escape=backslash_escape;out_excel_tricks=excel_tricks;quote_all=quote_all;}(* let to_channel = to_out_obj
*)letto_channel?separator?backslash_escape?excel_tricks?quote_allfh=to_out_obj?separator?backslash_escape?excel_tricks?quote_all(objectvalfh=fhmethodoutputsofslen=outputfhsofslen;lenmethodclose_out()=close_outfhend)letto_buffer?separator?backslash_escape?excel_tricks?quote_allbuf=to_out_obj?separator?backslash_escape?excel_tricks?quote_all(objectmethodoutputsofslen=Buffer.add_subbytesbufsofslen;lenmethodclose_out()=()end)letclose_outoc=(* Lwt_io.close oc.out_chan *)oc.out_chan#close_out()letrecreally_outputocsofslen=(*
Lwt_io.write_from oc.out_chan s ofs len >>= fun w ->
*)letw=oc.out_chan#outputsofsleninifw<lenthenreally_outputocs(ofs+w)(len-w)else()letquote_bytes=Bytes.make1'\"'letoutput_quoteoc=really_outputocquote_bytes01letequal_quote_bytes=Bytes.make2'='let()=Bytes.unsafe_setequal_quote_bytes1'\"'letoutput_equal_quoteoc=really_outputocequal_quote_bytes02letnewline_bytes=Bytes.make1'\n'letoutput_newlineoc=really_outputocnewline_bytes01(* Determine whether the string s must be quoted and how many chars it
must be extended to contain the escaped values. Return -1 if there
is no need to quote. It is assumed that the string length [len]
is > 0. *)letmust_quoteocslen=letquote=ref(is_space_or_tab(String.unsafe_gets0)||is_space_or_tab(String.unsafe_gets(len-1)))inletn=ref0infori=0tolen-1doletc=String.unsafe_getsiinifoc.out_backslash_escape&&must_escape.(Char.codec)then((* Must be done first because backslash escaping will be
favored, even for the separator, '\n',... *)quote:=true;incrn)elseifc=oc.out_separator||c='\n'||c='\r'thenquote:=trueelseifc='"'||(oc.out_excel_tricks&&c='\000')then(quote:=true;incrn)done;if!quotethen!nelse-1letneed_excel_trickslen=letc=String.unsafe_gets0inis_space_or_tabc||c='0'||is_space_or_tab(String.unsafe_gets(len-1))(* Do some work to avoid quoting a field unless it is absolutely
required. *)letwrite_escapedocfield=ifString.lengthfield>0thenbeginletlen=String.lengthfieldinletuse_excel_trick=oc.out_excel_tricks&&need_excel_trickfieldlenandn=must_quoteocfieldleninifn<0&¬use_excel_trick&¬oc.quote_allthen(* [really_output] does not mutate the [Bytes.t] argument. *)really_outputoc(Bytes.unsafe_of_stringfield)0lenelse(letfield=ifn<=0thenBytes.unsafe_of_stringfieldelse(* There are some quotes to escape *)lets=Bytes.create(len+n)inletj=ref0infori=0tolen-1doletc=String.unsafe_getfieldiinifoc.out_backslash_escape&&must_escape.(Char.codec)then(Bytes.unsafe_sets!j'\\';incrj;Bytes.unsafe_sets!jescape.(Char.codec);incrj)elseifc='"'then(Bytes.unsafe_sets!j'"';incrj;Bytes.unsafe_sets!j'"';incrj)elseifoc.out_excel_tricks&&c='\000'then(Bytes.unsafe_sets!j'"';incrj;Bytes.unsafe_sets!j'0';incrj)else(Bytes.unsafe_sets!jc;incrj)done;sin(ifuse_excel_trickthenoutput_equal_quoteocelseoutput_quoteoc);really_outputocfield0(Bytes.lengthfield);output_quoteoc)endelseifoc.quote_allthen(output_quoteoc;output_quoteoc)else()letoutput_recordoc=function|[]->output_newlineoc|[f]->write_escapedocf;output_newlineoc|f::tl->write_escapedocf;(* Lwt_list.iter_s *)List.iter(funf->really_outputococ.out_separator_bytes01;write_escapedocf)tl;output_newlineocletoutput_alloct=(* Lwt_list.iter_s *)List.iter(funr->output_recordocr)tletprint?separator?backslash_escape?excel_tricks?quote_allt=letcsv=to_channel?separator?backslash_escape?excel_tricks?quote_all(* Lwt_io.stdout *)stdoutinoutput_allcsvt;(* Lwt_io.flush Lwt_io.stdout *)flushstdout(* *)letsave_out?separator?backslash_escape?excel_trickscht=letcsv=to_channel?separator?backslash_escape?excel_trickschinoutput_allcsvtletsave?separator?backslash_escape?excel_tricks?quote_allfnamet=letch=(* Lwt_io.open_file ~mode:Lwt_io.Output fname *)open_outfnameinletcsv=to_channel?separator?backslash_escape?excel_tricks?quote_allchinoutput_allcsvt;(* Lwt_io.close *)Pervasives.close_outch(*
* Reading rows with headers
*)moduleRows=structletheaderic=Header.namesic.headerletset_header?(replace=false)icnames=leth0=Header.of_namesnamesinic.header<-ifreplacethenh0elseHeader.merge~main:h0ic.headerletcurrentic=Row.makeic.headeric.recordletnextic=letrecord=nexticin(Row.makeic.headerrecord)(* The convenience higher order functions are defined in terms of
[next] in the same way as above. *)(*
let rec fold_left ~f ~init:a ic =
Lwt.catch (fun () -> next ic >>= fun r ->
f a r >>= fun a ->
fold_left ~f ~init:a ic)
(function End_of_file -> (a)
| exn -> Lwt.fail exn)
let rec iter ~f ic =
Lwt.catch (fun () -> next ic >>= fun r ->
f r >>= fun () ->
iter ~f ic)
(function End_of_file -> ()
| exn -> Lwt.fail exn)
*)letfold_left~f~init:a0ic=leta=refa0intrywhiletruedoa:=f!a(nextic)done;assertfalsewithEnd_of_file->!aletiter~fic=trywhiletruedof(nextic)done;withEnd_of_file->()letinput_allic=letrecords=fold_left~f:(funlr->(r::l))~init:[]icin(List.revrecords)letfold_right~fica0=(* We to collect all records before applying [f] -- last row first. *)letlr=fold_left~f:(funlr->(r::l))~init:[]icin(* Lwt_list.fold_left_s *)List.fold_left(funar->fra)a0lrletload?separator?strip?has_header?header?backslash_escape?excel_tricks?fixfname=letfh=iffname="-"then(* (Lwt_io.stdin) *)stdinelse(* Lwt_io.open_file ~mode:Lwt_io.Input fname *)open_infnameinletcsv=of_channel?separator?strip?has_header?header?backslash_escape?excel_tricks?fixfhinlett=input_allcsvinclose_incsv;(t)end#1 "src/csv_memory.ml"(* File: csv_memory.ml
Copyright (C) 2017-
Christophe Troestler <Christophe.Troestler@umons.ac.be>
WWW: http://math.umons.ac.be/an/software/
This library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License version 2.1 or
later as published by the Free Software Foundation, with the special
exception on linking described in the file LICENSE.
This library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the file
LICENSE for more details. *)(* REMARK: This file in copied into csv.ml — instead of being in
csv.ml and including the preprocessed version of csv.pp.ml — in
order for the exception [Failure] to be well qualified when printed
by the default exception handler. *)(*
* Acting on CSV data in memory
*)letlines=List.lengthletcolumnscsv=letm=ref0inList.iter(funrow->m:=max!m(List.lengthrow))csv;!mletrecdropwhilef=function|[]->[]|x::xswhenfx->dropwhilefxs|xs->xsletrecempty_row=function|[]->true|""::xs->empty_rowxs|_::_->falselettrim?(top=true)?(left=true)?(right=true)?(bottom=true)csv=letcsv=iftopthendropwhileempty_rowcsvelsecsvinletcsv=ifrightthenList.map(funrow->letrow=List.revrowinletrow=dropwhile((=)"")rowinletrow=List.revrowinrow)csvelsecsvinletcsv=ifbottomthen(letcsv=List.revcsvinletcsv=dropwhileempty_rowcsvinletcsv=List.revcsvincsv)elsecsvinletand_empty_left_cell(col_empty,one_nonempty_row)=function|[]->col_empty,one_nonempty_row|""::_->col_empty,true|_->false,trueinletempty_left_col=List.fold_leftand_empty_left_cell(true,false)inletremove_left_col=List.map(function[]->[]|_::xs->xs)inletrecloopcsv=letleft_col_empty,one_nonempty_row=empty_left_colcsvinifleft_col_empty&&one_nonempty_rowthenloop(remove_left_colcsv)elsecsvinletcsv=ifleftthenloopcsvelsecsvincsvletsquarecsv=letcolumns=columnscsvinList.map(funrow->letn=List.lengthrowinletrow=List.revrowinletrecloopacc=function|0->acc|i->""::loopacc(i-1)inletrow=looprow(columns-n)inList.revrow)csvletis_squarecsv=letcolumns=columnscsvinList.for_all(funrow->List.lengthrow=columns)csvletrecset_columns~cols=function|[]->[]|r::rs->letrecloopicells=ifi<colsthen(matchcellswith|[]->""::loop(succi)[]|c::cs->c::loop(succi)cs)else[]inloop0r::set_columns~colsrsletrecset_rows~rowscsv=ifrows>0then(matchcsvwith|[]->[]::set_rows~rows:(predrows)[]|r::rs->r::set_rows~rows:(predrows)rs)else[]letset_size~rows~colscsv=set_columns~cols(set_rows~rowscsv)(* from extlib: *)letrecdropn=function|_::lwhenn>0->drop(n-1)l|l->lletsub~r~c~rows~colscsv=letcsv=droprcsvinletcsv=List.map(dropc)csvinletcsv=set_rows~rowscsvinletcsv=set_columns~colscsvincsv(* Compare two rows for semantic equality - ignoring any blank cells
* at the end of each row.
*)letreccompare_row(row1:stringlist)row2=matchrow1,row2with|[],[]->0|x::xs,y::ys->letc=comparexyinifc<>0thencelsecompare_rowxsys|""::xs,[]->compare_rowxs[]|_::_,[]->1|[],""::ys->compare_row[]ys|[],_::_->-1(* Semantic equality for CSV files. *)letreccompare(csv1:t)csv2=matchcsv1,csv2with|[],[]->0|x::xs,y::ys->letc=compare_rowxyinifc<>0thencelsecomparexsys|x::xs,[]->letc=compare_rowx[]inifc<>0thencelsecomparexs[]|[],y::ys->letc=compare_row[]yinifc<>0thencelsecompare[]ys(* Concatenate - arrange left to right. *)letrecconcat=function|[]->[]|[csv]->csv|left_csv::csvs->(* Concatenate the remaining CSV files. *)letright_csv=concatcsvsin(* Set the height of the left and right CSVs to the same. *)letnr_rows=max(linesleft_csv)(linesright_csv)inletleft_csv=set_rows~rows:nr_rowsleft_csvinletright_csv=set_rows~rows:nr_rowsright_csvin(* Square off the left CSV. *)letleft_csv=squareleft_csvin(* Prepend the right CSV rows with the left CSV rows. *)List.map(fun(left_row,right_row)->List.appendleft_rowright_row)(List.combineleft_csvright_csv)lettranspose=(* Suppose the CSV data is presented with the last row first. Then
new rows may be constructed in a tail rec way. We use mutable
rows in order to preserve tail recursiveness. *)(* Return the new 1st row; whether all rows are empty. *)letrecrow_of_1st_coltr_rowempty=function|[]->(tr_row,empty)(* No more rows *)|r::rows->match!rwith|[]->(* Last row empty *)lettr_row=iftr_row=[]thentr_rowelse""::tr_rowinrow_of_1st_coltr_rowemptyrows|a::tl->r:=tl;lettr_row=ifa=""&&tr_row=[]then[]elsea::tr_rowinrow_of_1st_coltr_rowfalserowsinletrectrtr_csvcsv=letrow,empty=row_of_1st_col[]truecsvin(* remove [csv] 1st col *)ifemptythenList.revtr_csvelsetr(row::tr_csv)csvinfuncsv->tr[](List.rev_maprefcsv)letto_arraycsv=Array.of_list(List.mapArray.of_listcsv)letof_arraycsv=List.mapArray.to_list(Array.to_listcsv)letreccombine~headerrow=matchheader,rowwith|[],_->[]|_,[]->List.map(funh->(h,""))header|h0::h,x::r->(h0,x)::combine~header:hrletassociateheaderdata=List.map(funrow->combine~headerrow)dataletmap~fcsv=List.map(funrow->List.map(funel->fel)row)csvletsave_out_readablechancsv=(* Escape all the strings in the CSV file first. *)(* XXX Why are we doing this? I commented it out anyway.
let csv = List.map (List.map String.escaped) csv in
*)(* Find the width of each column. *)letwidths=(* Don't consider rows with only a single element - typically
* long titles.
*)letcsv=List.filter(function[_]->false|_->true)csvin(* Square the CSV file - makes the next step simpler to implement. *)letcsv=squarecsvinmatchcsvwith|[]->[]|row1::rest->letlengths_row1=List.mapString.lengthrow1inletlengths_rest=List.map(List.mapString.length)restinletmax2rowsr1r2=letrp=tryList.combiner1r2withInvalid_argument_->failwith(Printf.sprintf"Csv.save_out_readable: internal \
error: length r1 = %d, length r2 = %d"(List.lengthr1)(List.lengthr2))inList.map(fun((a:int),(b:int))->maxab)rpinList.fold_leftmax2rowslengths_row1lengths_restin(* Print out each cell at the correct width. *)letrecrepeatf=function|0->()|i->f();repeatf(i-1)inList.iter(function|[cell]->(* Single column. *)output_stringchancell;output_charchan'\n'|row->(* Other. *)(* Pair up each cell with its max width. *)letrow=letrecloop=function|([],_)->[]|(_,[])->failwith"Csv.save_out_readable: internal error"|(cell::cells,width::widths)->(cell,width)::loop(cells,widths)inloop(row,widths)inList.iter(fun(cell,width)->output_stringchancell;letn=String.lengthcellinrepeat(fun()->output_charchan' ')(width-n+1))row;output_charchan'\n')csvletprint_readable=save_out_readablestdout