package molenc

  1. Overview
  2. Docs
Molecular encoder/featurizer using rdkit and OCaml

Install

Dune Dependency

Authors

Maintainers

Sources

v16.13.0.tar.gz
sha256=deb4a9f58f49bd9cefb7cf2004ad7ce750aa949655e6f277d4c3e61dfa23c6d6
md5=e90db1862c04f7eb39cf437d33ddf9b3

doc/src/molenc/MSE_mol.ml.html

Source file MSE_mol.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
(* Copyright (C) 2020, Francois Berenger

   Yamanishi laboratory,
   Department of Bioscience and Bioinformatics,
   Faculty of Computer Science and Systems Engineering,
   Kyushu Institute of Technology,
   680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)

(* Multi-Scale-Encoded molecule *)

open Printf

module L = MyList
module Log = Dolog.Log
module String = BatString
module StringMap = BatMap.String

(* A molecule: [name] is its name and [map] associates each feature
   (a string) with an int; read_one fills it with the (feature, count)
   pairs parsed from the molecule's lines. *)
type t = { name: string; map: int StringMap.t }

(* [create name map] builds a molecule from its name and its
   feature-to-count map; both are stored as given. *)
let create name map =
  { name = name; map = map }

(* [get_name m] returns the name stored in molecule [m]. *)
let get_name { name; _ } =
  name

(* [get_map m] returns the feature-to-count map stored in molecule [m]. *)
let get_map { map; _ } =
  map

(* [feat_count_of_string s] parses a line made of a whitespace-free
   feature string followed by an integer count and returns
   (feature, count).
   If scanning fails, the offending line is reported on stderr and the
   exception raised by Scanf is re-raised unchanged. *)
let feat_count_of_string s =
  try Scanf.sscanf s "%s %d" (fun feat count -> (feat, count))
  with exn ->
    (* terminate and flush the message so that it does not run into
       whatever is printed on stderr next (e.g. the exception report) *)
    eprintf "MSE_mol.feat_count_of_string: cannot parse: %s\n%!" s;
    raise exn

(* Construct one molecule from all of its lines, already read from the
   input file: a name line starting with '#', then one
   "feature count" line per feature.
   Fails with [Failure] on an empty list; the name line is asserted to
   start with '#'.
   A feature seen twice is only reported with a warning; the count
   added last replaces the earlier one. *)
let read_one lines =
  match lines with
  | [] -> failwith "MSE_mol.read_one: empty list"
  | header :: feat_lines ->
    (* a molecule starts with a separator line whose first char is '#' *)
    assert (String.get header 0 = '#');
    (* the name is the header without that leading char *)
    let name = String.lchop header in
    let add_feature seen line =
      let feat, count = feat_count_of_string line in
      (* a feature should not already be present; otherwise,
         something went wrong when the molecule was encoded *)
      if StringMap.mem feat seen then
        Log.warn "mol: %s dup feat: %s" name feat;
      StringMap.add feat count seen in
    let map = List.fold_left add_feature StringMap.empty feat_lines in
    create name map

(* Name line (starting with '#') that get_lines has already consumed
   from the input but that belongs to the next molecule; "" when no
   such line is pending. Module-level state, read and updated by
   get_lines. *)
let previous_name = ref ""

(* Raised and caught inside get_lines to leave its reading loop once
   the name line of the next molecule has been read. *)
exception Break

(* Read from [input] the lines of exactly one molecule (i.e. what one
   later call to read_one expects): its name line first, then its
   feature lines in file order.
   The name line of the following molecule, if one is read, is kept in
   [previous_name] for the next call; on end of file [previous_name]
   is reset to "".
   [End_of_file] escapes when no name line is pending and [input] has
   nothing left to read.
   NOTE(review): the pending name line lives in a single module-level
   ref, so calls on different channels presumably must not be
   interleaved -- confirm against callers. *)
let get_lines input =
  if !previous_name = "" then begin
    (* nothing pending: the next line must be a name line *)
    let header = input_line input in
    assert (BatString.starts_with header "#");
    previous_name := header
  end;
  (* lines are accumulated in reverse order, name line included *)
  let rev_lines = ref [!previous_name] in
  let finish () = L.rev !rev_lines in
  try
    while true do
      let curr = input_line input in
      if not (BatString.starts_with curr "#") then
        rev_lines := curr :: !rev_lines
      else begin
        (* name line of the next molecule: remember it and stop *)
        previous_name := curr;
        raise Break
      end
    done;
    assert false (* only here for typing: the loop never ends normally *)
  with
  | Break -> finish ()
  | End_of_file ->
    previous_name := "";
    finish ()

(* [of_lines lines] splits [lines] into molecules and decodes each one
   with read_one; molecules are returned in the order they appear.
   Each molecule is one name line starting with '#' followed by its
   feature lines (possibly none: a name line directly followed by
   another name line yields a molecule with an empty map, as the
   get_lines + read_one path does).
   The assertion fails if a molecule does not start with a name line. *)
let of_lines lines =
  let is_header l = String.starts_with l "#" in
  let rec loop acc ls =
    match ls with
    | [] -> L.rev acc
    | name :: rest when is_header name ->
      (* NOTE(review): feature lines are consed onto the accumulator, so
         presumably read_one receives them in reverse file order; this
         only matters for duplicated features -- confirm against
         MyList.fold_while. *)
      let feat_counts, remaining_mols =
        L.fold_while (fun l -> not (is_header l))
          (fun rev_feats x -> x :: rev_feats) [] rest in
      let mol = read_one (name :: feat_counts) in
      loop (mol :: acc) remaining_mols
    | _ :: _ -> assert false (* feature line without a preceding name line *)
  in
  loop [] lines
OCaml

Innovation. Community. Security.