package bio_io

  1. Overview
  2. Docs

Source file cigar.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
open! Base

(* Limited cigar format used by mmseqs.
   https://github.com/soedinglab/MMseqs2/wiki#alignment-format *)

exception Int_overflow of (int * int) [@@deriving sexp]

let add_exn x y =
  let sum = x + y in
  let sign_x = Int.sign x in
  if Sign.(sign_x = Int.sign y && sign_x <> Int.sign sum) then
    raise (Int_overflow (x, y))
  else sum

exception Exn of string [@@deriving sexp]

(** An operation describing an alignment.

    - [Match]: A non-gap alignment position. May be a mismatch.
    - [Insertion]: Gap in the target/reference sequence.
    - [Deletion]: Gap in the query/read sequence.

    (Note that the drive5 webpage has this backwards. It's possible they have
    reversed definition of query and target.) *)
type op = Match | Insertion | Deletion [@@deriving equal, sexp]

let op_of_char = function
  | 'M' -> Or_error.return Match
  | 'I' -> Or_error.return Insertion
  | 'D' -> Or_error.return Deletion
  | c -> Or_error.errorf "Expected M, D, or I. Got %c." c

let op_to_char = function Match -> 'M' | Insertion -> 'I' | Deletion -> 'D'
let op_to_string = function Match -> "M" | Insertion -> "I" | Deletion -> "D"

module Chunk : sig
  (** Mint a new type here so the lengths are guaranteed to be good. *)

  type t [@@deriving equal, sexp]

  val create : int -> op -> t Or_error.t
  val to_string : t -> string
  val length : t -> int
  val op : t -> op
end = struct
  type t = int * op [@@deriving equal, sexp]

  let create length op =
    if length > 0 then Or_error.return (length, op)
    else Or_error.errorf "Length must be > 0.  Got %d." length

  let to_string (length, op) = Int.to_string length ^ op_to_string op
  let length (length, _op) = length
  let op (_length, op) = op
end

type t = Chunk.t list [@@deriving equal, sexp]

let is_int = function
  | '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' -> true
  | _ -> false

let is_cigar_op = function 'M' | 'D' | 'I' -> true | _ -> false

(* One weird thing is that back to back same ops are allowed. See tests. *)
let of_string_exn s =
  let _, pairs =
    String.fold s ~init:("", []) ~f:(fun (current, all) c ->
        match (is_int c, is_cigar_op c) with
        | true, true ->
            (* Can't be both int and cigar_op *)
            assert false
        | false, false ->
            raise (Exn ("Expected int or operation. Got " ^ Char.to_string c))
        | false, true ->
            (* You can get an Op without a preceding integer. In this case,
               treat the length as 1. See
               https://drive5.com/usearch/manual/cigar.html: "In some CIGAR
               variants, the integer may be omitted if it is 1." *)
            let length =
              if String.(current = "") then 1 else Int.of_string current
            in
            let op = Or_error.ok_exn @@ op_of_char c in
            (* ok_exn okay here as the parsing disallows negative numbers. *)
            let chunk = Or_error.ok_exn @@ Chunk.create length op in
            ("", chunk :: all)
        | true, false -> (current ^ String.of_char c, all))
  in
  List.rev pairs

let of_string s = Utils.try1' ~msg:"Error parsing cigar string" of_string_exn s
let to_string cigar = String.concat ~sep:"" @@ List.map cigar ~f:Chunk.to_string

let alignment_length_exn cigar =
  List.fold cigar ~init:0 ~f:(fun length chunk ->
      add_exn length (Chunk.length chunk))

let alignment_length cigar = Utils.try1 alignment_length_exn cigar

let num_gaps_exn cigar =
  List.fold cigar ~init:0 ~f:(fun acc chunk ->
      match Chunk.op chunk with
      | Insertion | Deletion -> add_exn acc (Chunk.length chunk)
      | Match -> acc)

let num_gaps cigar = Utils.try1 num_gaps_exn cigar

(* This is as in https://doi.org/10.1093/bioinformatics/bty262. Where ungapped
   is the alignment length minus number of gaps. I'm taking that to mean any
   position with a gap in either sequence is not counted. So the ungapped_length
   is the number of matches. *)
let num_matches_exn cigar =
  List.fold cigar ~init:0 ~f:(fun acc chunk ->
      match Chunk.op chunk with
      | Match -> add_exn acc (Chunk.length chunk)
      | Insertion | Deletion -> acc)

let num_matches cigar = Utils.try1 num_matches_exn cigar

let query_length_exn cigar =
  List.fold cigar ~init:0 ~f:(fun length chunk ->
      match Chunk.op chunk with
      | Match | Insertion -> add_exn length (Chunk.length chunk)
      | Deletion -> length)

let query_length cigar = Utils.try1 query_length_exn cigar

let target_length_exn cigar =
  List.fold cigar ~init:0 ~f:(fun length chunk ->
      match Chunk.op chunk with
      | Match | Deletion -> add_exn length (Chunk.length chunk)
      | Insertion -> length)

let target_length cigar = Utils.try1 target_length_exn cigar

(* "Drawing" functions and helpers *)

let op_to_target_draw_char ?(gap = '-') ?(non_gap = '-') op =
  match op with Match | Deletion -> non_gap | Insertion -> gap

let op_to_query_draw_char ?(gap = '-') ?(non_gap = '-') op =
  match op with Match | Insertion -> non_gap | Deletion -> gap

let op_to_op_draw_char = op_to_char

(* "Draw" a cigar string by converting the operations to the proper char
   representation. Takes a [op_to_char_fun] so that you can use it for queries,
   targets, and operation strings by passing the appropriate function. *)
let draw_helper op_to_char_fun cigar =
  String.concat ~sep:""
  @@ List.map cigar ~f:(fun chunk ->
         let c = op_to_char_fun @@ Chunk.op chunk in
         (* WARNING: this can raise an Out of memory error depending on the
            cigar string :) *)
         String.make (Chunk.length chunk) c)

let char_list_to_string cl =
  String.concat ~sep:"" @@ List.map cl ~f:Char.to_string

(** Take a [length] and a string [s], and break it up into [length] size splits.
    The last split will have length <= [length]. *)
let string_splits length s =
  List.map ~f:char_list_to_string @@ List.chunks_of ~length @@ String.to_list s

(* Will raise if strings are different length. *)
let wrap_aligment max_len ~target ~target_label ~query ~query_label ~op
    ~op_label =
  let string_splits' = string_splits max_len in
  (* Use _exn here as the strings passed in should have the same length from the
     caller. *)
  String.concat ~sep:"\n\n"
  @@ List.map3_exn (string_splits' target) (string_splits' query)
       (string_splits' op) ~f:(fun target_split query_split op_split ->
         String.concat ~sep:"\n"
           [
             target_label ^ target_split;
             query_label ^ query_split;
             op_label ^ op_split;
           ])

let draw_exn ?(max_aln_len = 1000) ?(gap = '-') ?(non_gap = 'X') ?(wrap = 60)
    cigar =
  let len = alignment_length_exn cigar in
  assert (len >= 0);
  if len > max_aln_len then ""
  else
    let draw_target = draw_helper @@ op_to_target_draw_char ~gap ~non_gap in
    let draw_query = draw_helper @@ op_to_query_draw_char ~gap ~non_gap in
    let draw_op = draw_helper op_to_op_draw_char in
    let target_label = "t: " in
    let query_label = "q: " in
    let op_label = "o: " in
    let target = draw_target cigar in
    let query = draw_query cigar in
    let op = draw_op cigar in
    wrap_aligment wrap ~target ~target_label ~query ~query_label ~op ~op_label

let draw ?(max_aln_len = 1000) ?(gap = '-') ?(non_gap = 'X') ?(wrap = 60) cigar
    =
  match draw_exn ~max_aln_len ~gap ~non_gap ~wrap cigar with
  | exception exn ->
      Or_error.error "Couldn't calculate alignment length" exn Exn.sexp_of_t
  | result -> Or_error.return result
OCaml

Innovation. Community. Security.