Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source
Page
Library
Module
Module type
Parameter
Class
Class type
Source
fasta.ml
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298
(** Parsing FASTA files *) open! Base (** A record type for FASTA files. {1 Overview} {2 Example 1} If you have a fasta file something like this: {[ >s1 apple pie ACTG actg ]} Then you would get a [record] something like this: {[ (* "s1" *) Fasta.Record.id record;; (* Some "apple pie" *) Fasta.Record.desc record;; (* "ACTGactg" *) Fasta.Record.seq record ]} {2 Example 2} If you have a fasta file something like this: {[ >s1 ACTG actg ]} Then you would get a [record] something like this: {[ (* "s1" *) Fasta.Record.id record;; (* None *) Fasta.Record.desc record;; (* "ACTGactg" *) Fasta.Record.seq record ]} {2 Example 3} To change a part of the [Fasta.Record] use the [with_*] functions. E.g., {[ Fasta.Record.with_id "apple" record ]} would change give you a [t] with the [id] set to ["apple"]. *) module Record : sig (** {1 API} *) type t [@@deriving sexp] val create : id:string -> desc:string option -> seq:string -> t (** [create ~id ~desc ~seq] creates a new [t]. Shouldn't raise as literally any values of the correct type are accepted. *) val to_string : t -> string (** [to_string t] returns a string representation of [t] ready to print to a FASTA output file. *) val to_string_nl : ?nl:string -> t -> string (** [to_string_nl t ~nl] returns a string representation of [t] ready to print to a FASTA output file, including a trailing newline (nl) string. [nl] defaults to ["\n"]. *) val equal : t -> t -> bool (** [equal this other] returns [true] if all fields of two [t]s are the same. *) val ( = ) : t -> t -> bool (** [Fasta.Record.(this = other)] is like [equal this other]. *) val id : t -> string (** [id t] returns the [id] of the [t]. *) val desc : t -> string option (** [desc t] returns the [desc] (description) of the [t]. *) val seq : t -> string (** [seq t] returns the [seq] of the [t]. *) val seq_length : t -> int (** [seq_length t] returns the length of the [seq] of [t]. If you construct a record by hand (e.g., with [create]), and there are spaces or other weird characters in the sequences, they will be counted in the length. E.g., {[ let r = Fasta.Record.create ~id:"apple" ~desc:None ~seq:"a a" in assert (Int.(3 = Fasta.Record.seq_length r)) ]} *) val with_id : string -> t -> t (** [with_id new_id t] returns a [t] with [new_id] instead of the original [id]. *) val with_seq : string -> t -> t (** [with_seq new_seq t] returns a [t] with [new_seq] instead of the original [seq]. *) val with_desc : string option -> t -> t (** [with_desc new_desc t] returns a [t] with [new_desc] instead of the original [desc]. *) val rev : t -> t (** [rev t] returns the reverse of [t]. I.e., the [seq] is reversed reversed. *) val comp : t -> t (** [comp t] returns the complement of [t]. I.e., the [seq] is complemented. Uses IUPAC conventions. Any "base" (char) that isn't part of the IUPAC passes through unchanged. Note that [comp] does not round-trip. *) val rev_comp : t -> t (** [rev_comp t] returns the reverse complement of [t]. I.e., the [seq] is reverse complemented. Uses IUPAC conventions. Any "base" (char) that isn't part of the IUPAC passes through unchanged. Note that [rev_comp] does not round-trip. *) end = struct type t = { id : string; desc : string option; seq : string } [@@deriving sexp] let create ~id ~desc ~seq = { id; desc; seq } let to_string r = match r.desc with | None -> Printf.sprintf ">%s\n%s" r.id r.seq | Some desc -> Printf.sprintf ">%s %s\n%s" r.id desc r.seq let to_string_nl ?(nl = "\n") r = to_string r ^ nl let equal r1 r2 = String.(r1.id = r2.id) && String.(r1.seq = r2.seq) && Option.equal String.equal r1.desc r2.desc let ( = ) = equal let id r = r.id let desc r = r.desc let seq r = r.seq let seq_length r = String.length r.seq let with_id id r = { r with id } let with_seq seq r = { r with seq } let with_desc desc r = { r with desc } let rev t = t |> with_seq (String.rev t.seq) let comp t = t |> with_seq (Utils.complement t.seq) let rev_comp t = t |> with_seq (Utils.rev_complement t.seq) end (** [In_channel] for FASTA records. For more general info, see the [Record_in_channel] module mli file. {1:examples Examples} {2 Return all records in a list} {[ let records = Fasta.In_channel.with_file_records fname ]} {2 Iterating over records} Use the [iter] functions when you need to go over each record and perform some side-effects with them. Print sequence IDs and sequence lengths {[ let () = Fasta.In_channel.with_file_iter_records "sequences.fasta" ~f:(fun record -> let open Fasta.Record in printf "%s => %d\n" (id record) (seq_length record)) ]} Print sequence index, IDs, and sequence lengths. This is like the last example except that we also want to print the index. The first record is 0, the 2nd is 1, etc. {[ let () = Fasta.In_channel.with_file_iteri_records "sequences.fasta" ~f:(fun i record -> let open Fasta.Record in printf "%d: %s => %d\n" (i + 1) (id record) (seq_length record)) ]} {2 Folding over records} If you need to reduce all the records down to a single value, use the [fold] functions. Get total length of all sequences in the file. {[ let total_length = Fasta.In_channel.with_file_fold_records "sequences.fasta" ~init:0 ~f:(fun length record -> length + Fasta.Record.seq_length record) ]} {2:pipelines Pipelines with records} Sometimes you have a "pipeline" of computations that you need to do one after the other on records. In that case, you could the [sequence] functions. Here's a silly example. {[ let () = Fasta.In_channel.with_file name ~f:(fun chan -> Fasta.In_channel.record_sequence chan (* Add sequence index to record description *) |> Sequence.mapi ~f:(fun i record -> let new_desc = match Fasta.Record.desc record with | None -> Some (sprintf "sequence %d" i) | Some old_desc -> Some (sprintf "%s -- sequence %d" old_desc i) in Fasta.Record.with_desc new_desc record) (* Convert all sequence chars to lowercase *) |> Sequence.map ~f:(fun record -> let new_seq = String.lowercase (Fasta.Record.seq record) in Fasta.Record.with_seq new_seq record) (* Print sequences *) |> Sequence.iter ~f:(fun record -> print_endline @@ Fasta.Record.to_string record)) ]} One thing to watch out for though...if you get an exception half way through and you are running side-effecting code like we are here then part of your side effects will have occured and part of them will {i not} have occured. As you can see, if that fasta file has more than one sequence it will hit the [assert false] and blow up. *) module In_channel : sig include Record_in_channel.S with type record := Record.t end = struct module T = struct include Private.Peekable_in_channel type record = Record.t let clean_sequence s = String.filter s ~f:(fun c -> Char.(c <> ' ')) let parse_header_line line = assert (String.is_prefix line ~prefix:">"); match String.split ~on:' ' @@ String.chop_prefix_exn ~prefix:">" @@ String.strip line with (* Empty header lines get id = "" *) | [ id ] -> Record.create ~id ~desc:None ~seq:"" | id :: desc -> Record.create ~id ~desc:(Some (String.concat ~sep:" " desc)) ~seq:"" | [] -> (* String.split should at least give [""]. Should never get here. *) assert false let input_record chan = let rec loop thing = match (peek_char chan, thing) with | None, None -> None | None, Some (header, seq) | Some '>', Some (header, seq) -> let r = parse_header_line header in let seq = String.concat ~sep:"" @@ List.rev seq in Some (Record.with_seq seq r) | Some '>', None -> let line = input_line_exn ~fix_win_eol:true chan in loop (Some (line, [])) | Some _, None -> failwith "Not at a header line, but not currently in a sequence" | Some _, Some (header, seq) -> let line = input_line_exn ~fix_win_eol:true chan in let seq_part = clean_sequence line in loop (Some (header, seq_part :: seq)) in loop None end include T include Record_in_channel.Make (T) end