package molenc

  1. Overview
  2. Docs

Source file mini_mol.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
(* Copyright (C) 2020, Francois Berenger

   Yamanishi laboratory,
   Department of Bioscience and Bioinformatics,
   Faculty of Computer Science and Systems Engineering,
   Kyushu Institute of Technology,
   680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)

(* mini molecule module *)

module A = BatArray
module IntSet = BatSet.Int
module Ht = BatHashtbl
module L = BatList
module StringMap = BatMap.String

type t = { name: string;
           graph: Node.t array;
           diameter: int;
           matrix: int array array }

let get_name m = m.name

let get_graph m = m.graph

let nb_atoms m =
  A.length m.graph

let create name graph diameter matrix =
  { name; graph; diameter; matrix }

let get_typ (m: t) (i: int) =
  Node.get_typ (A.unsafe_get m.graph i)

let get_succs (m: t) (i: int) =
  Node.get_succs (A.unsafe_get m.graph i)

(* list (sorted-uniq-counted) atom types of all atoms
   at given distance from center atom *)
let types_at_distance (center: int) (curr_height: int) (mol: t) =
  let matrix_line = mol.matrix.(center) in
  let unsorted =
    A.fold_lefti (fun acc i x ->
        if x = curr_height then
          (get_typ mol i) :: acc
        else
          acc
      ) [] matrix_line in
  (* layer at 'curr_height' *)
  (curr_height, Utls.list_uniq_count unsorted)

let encode (max_height: int) (mol: t): (Atom_env.t * int) list =
  (* compute atom envs. of given atom up to maximum height allowed *)
  (* we cannot go deeper than 'maxi' on this molecule *)
  let maxi = min max_height mol.diameter in
  let encode_atom (n_i: int): Atom_env.t =
    let depths = L.range 0 `To maxi in
    let layers =
      L.map (fun height ->
          types_at_distance n_i height mol
        ) depths in
    (* non empty layers *)
    L.filter (fun (_h, typs) -> typs <> []) layers
  in
  let nb_atoms = A.length mol.graph in
  let atom_indexes = L.range 0 `To (nb_atoms - 1) in
  (* canonicalize the encoding of the molecule by sorting its atom envs
     and counting duplicates *)
  let atom_envs = L.map encode_atom atom_indexes in
  Utls.list_uniq_count atom_envs

(* encode the molecule to counted atom pairs *)
let atom_pairs (mol: t): (Atom_pair.t * int) list =
  let n = nb_atoms mol in
  assert(n >= 1); (* at least one heavy atom *)
  let max_nb_pairs = max 1 (n * (n - 1) / 2) in
  let pair2count = Ht.create max_nb_pairs in
  for i = 0 to n - 1 do
    let type_i = get_typ mol i in
    for j = i to n - 1 do
      let type_j = get_typ mol j in
      let dist = A.unsafe_get (A.unsafe_get mol.matrix i) j in
      let pair = Atom_pair.create type_i type_j dist in
      let prev_count = Ht.find_default pair2count pair 0 in
      Ht.replace pair2count pair (prev_count + 1)
    done;
  done;
  (* canonicalization will be done later; when the features (string) are
   * converted to feature ids (int) *)
  Ht.bindings pair2count
OCaml

Innovation. Community. Security.