package forester

  1. Overview
  2. Docs
Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source

Source file Tokenizer.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
(*
 * SPDX-FileCopyrightText: 2024 The Forester Project Contributors
 *
 * SPDX-License-Identifier: GPL-3.0-or-later
 *)

(* Local alias for the core content types; the [open struct] keeps [T]
   from leaking into this module's interface. *)
open struct module T = Forester_core.Types end

(* Sets of strings; used below for the stop-word list. *)
module Set = Set.Make(String)

(* Records whether a token was found in an article's frontmatter or
   its mainmatter. *)
type loc =
  | In_frontmatter
  | In_mainmatter

(* Map a frontmatter field tag to its stable positional index, used when
   encoding token paths into an article's frontmatter. *)
let int_of_field_frontmatter field =
  match field with
  | `uri -> 0
  | `title -> 1
  | `dates -> 2
  | `attributions -> 3
  | `taxon -> 4
  | `number -> 5
  | `designated_parent -> 6
  | `source_path -> 7
  | `tags -> 8
  | `metas -> 9

(* Map an article-level field tag to its stable positional index, used
   when encoding token paths into an article. *)
let int_of_field_article field =
  match field with
  | `frontmatter -> 0
  | `mainmatter -> 1
  | `backmatter -> 2

(* A token paired with where in the article it was found.
   NOTE(review): not referenced by the code visible in this file —
   confirm it is used elsewhere before removing. *)
type token = {v: string; loc: loc}

(* Stop words that are excluded from the search index. *)
let common_words =
  "a and be have i in of that the to"
  |> String.split_on_char ' '
  |> Set.of_list

(* Split [string] on runs of non-alphanumeric characters, drop common
   stop words (case-insensitively), and stem the surviving words.
   Stemmed tokens are lowercase. *)
let tokenize string =
  let words = Str.split (Str.regexp "[^a-zA-Z0-9]+") string in
  let stem_unless_common word =
    let lower = String.lowercase_ascii word in
    if Set.mem lower common_words then
      None
    else
      Some (Stemming.stem lower)
  in
  List.filter_map stem_unless_common words

(* Walk a [T.content] tree and emit one [(path, token)] pair per extracted
   token.  [path] is accumulated in *reverse* order (innermost index
   first); [tokenize_article] reverses each path at the end.  [loc]
   records whether we are inside frontmatter or mainmatter. *)
let rec tokenize_content
  : int list -> loc -> T.content -> (int list * string) list
= fun path loc node ->
  match node with
  | T.Content nodes ->
    List.concat @@
      List.mapi
        (fun i node ->
          match node with
          | T.Text s ->
            (* Token [j] of child node [i] gets path [j :: i :: path]. *)
            List.mapi
              (fun j token -> j :: i :: path, token)
              (tokenize s)
          | T.CDATA s ->
            (* CDATA is tokenized exactly like plain text. *)
            List.mapi
              (fun j token -> j :: i :: path, token)
              (tokenize s)
          | T.Xml_elt {content; _} ->
            (* TODO: Consider tokenizing xml_qname *)
            tokenize_content
              (i :: path)
              loc
              content
          | T.Section {frontmatter; mainmatter; _} ->
            (* NOTE(review): the nested mainmatter is tokenized with the
               current [path] (no child index [i] or mainmatter field
               index) and the current [loc] — confirm this is
               intentional. *)
            tokenize_frontmatter
              (int_of_field_article `frontmatter :: path)
              frontmatter @
              tokenize_content path loc mainmatter
          | T.Link {content; _} -> tokenize_content (i :: path) loc content
          | T.KaTeX (_, _) ->
            (* NOTE:
               In order to properly search math, we need to revamp the
               architecture and add more features...*)
            []
          | T.Transclude _
          | T.Contextual_number _
          | T.Artefact _
          | T.Uri _
          | T.Route_of_uri _
          | T.Datalog_script _
          | T.Results_of_datalog_query _ ->
            []
        )
        nodes

(* Tokenize a vertex: only content vertices carry searchable text;
   URI vertices contribute nothing. *)
and tokenize_vertex
  : int list ->
  loc ->
  T.content T.vertex ->
  (int list * string) list
= fun path loc -> function
  | T.Content_vertex c -> tokenize_content path loc c
  | T.Uri_vertex _ -> []

(* Tokenize an attribution by tokenizing its underlying vertex. *)
and tokenize_attribution
  : int list ->
  loc ->
  T.content T.attribution ->
  (int list * string) list
= fun path loc (T.{vertex; _}) ->
  tokenize_vertex path loc vertex

(* Tokenize the searchable frontmatter fields — title, taxon,
   attributions, tags, and metas — in that order.  Title and taxon
   tokens are indexed under their frontmatter field index; paths are
   accumulated in reverse, as in [tokenize_content]. *)
and tokenize_frontmatter
  : int list ->
  T.content T.frontmatter ->
  (int list * string) list
= fun path fm ->
  let {title; attributions; taxon; tags; metas; _} = fm in
  (* Tokenize an optional content field under its frontmatter index;
     an absent field yields no tokens. *)
  let optional_field field content =
    Option.fold
      ~none: []
      ~some:
        (tokenize_content
          (int_of_field_frontmatter field :: path)
          In_frontmatter)
      content
  in
  (* A meta contributes tokens from its key string and its content. *)
  let meta_tokens (key, content) =
    (List.mapi (fun i t -> i :: path, t) @@ tokenize key)
    @ tokenize_content path In_frontmatter content
  in
  optional_field `title title
  @ optional_field `taxon taxon
  @ List.concat_map (tokenize_attribution path In_frontmatter) attributions
  @ List.concat_map (tokenize_vertex path In_frontmatter) tags
  @ List.concat_map meta_tokens metas

(* Tokenize a whole article: frontmatter under index [0], mainmatter
   under index [1].  Paths were built innermost-first, so each one is
   reversed here into root-first order. *)
let tokenize_article : T.content T.article -> (int list * string) list =
  fun {frontmatter; mainmatter; _} ->
    let tokens =
      tokenize_frontmatter [0] frontmatter
      @ tokenize_content [1] In_mainmatter mainmatter
    in
    List.map (fun (path, tok) -> List.rev path, tok) tokens
OCaml

Innovation. Community. Security.