package forester
A tool for tending mathematical forests
Sources
5.0.tar.gz
md5=24f4aed96a8b8af33aba13fba66f1b37
sha512=d36b896aca11858bb4a00fc704c16cc27a1f197bdb3e479d6132fd70f70d67d7158096285cb0b6fb00db14417f0f822cc27fe65d82f0971e42378fd8271ce573
Source file forester.search/Index.ml
(*
 * SPDX-FileCopyrightText: 2024 The Forester Project Contributors
 *
 * SPDX-License-Identifier: GPL-3.0-or-later
 *)

open Forester_core
open Spelll

open struct
  module T = Forester_core.Types
end

(* A set of (token positions, document URI) pairs recording where a token
   occurs. *)
module Ocurrences =
  Set.Make (struct
    type t = int list list * URI.t

    (* FIXME: compares only the URIs, ignoring the token positions. *)
    let compare (_i, x) (_j, y) = URI.compare x y
  end)

type t = {
  index : Ocurrences.t Index.t;  (* Spelll index from token to occurrences *)
  number_of_tokens : int;        (* distinct tokens seen across the corpus *)
  number_of_docs : int;          (* number of indexed articles *)
}

let average_doc_length : t -> float =
  fun {number_of_tokens; number_of_docs; _} ->
  Float.of_int number_of_tokens /. Float.of_int number_of_docs

(* Index a single article; articles without a URI are skipped. *)
let add_one : T.content T.article -> t -> t =
  fun article ({index; number_of_tokens; number_of_docs} as t) ->
  if Option.is_none T.(article.frontmatter.uri) then t
  else
    let tokens_in_article = Tokenizer.tokenize_article article in
    let uri = Option.get T.(article.frontmatter.uri) in
    let new_tokens = ref 0 in
    let new_index =
      List.fold_left
        (fun index (ocurrences, token) ->
           match Index.retrieve_l ~limit:0 index token with
           | [] ->
             (* Unseen token *)
             (* TODO: add to list of ocurrences *)
             let ocurrence = Ocurrences.singleton ([ocurrences], uri) in
             new_tokens := !new_tokens + 1;
             Index.add index token ocurrence
           | [ids] ->
             Index.add index token (Ocurrences.add ([ocurrences], uri) ids)
           | _ ->
             (* We are using limit=0, so this shouldn't happen *)
             assert false)
        index tokens_in_article
    in
    {
      index = new_index;
      number_of_docs = number_of_docs + 1;
      number_of_tokens = number_of_tokens + !new_tokens;
    }

let add : T.content T.article list -> t -> t = List.fold_right add_one

(* Retrieve the occurrences of every token of [term]; [fuzz] is the maximal
   edit distance tolerated when matching tokens against the index. *)
let search : ?fuzz:int -> t -> string -> (int list list * URI.t) list =
  fun ?(fuzz = 0) index term ->
  Tokenizer.tokenize term
  |> List.concat_map (fun str ->
    List.concat_map Ocurrences.to_list
    @@ Index.retrieve_l ~limit:fuzz index.index str)

module BM_25 = struct
  let sum = List.fold_left ( +. ) 0.

  (* Smoothed inverse document frequency; the +1 inside the logarithm keeps
     the result positive. [n] is the number of occurrences retrieved for [q]. *)
  let idf q (index : t) =
    let n = Float.of_int @@ List.length @@ search ~fuzz:0 index q in
    log @@ (Float.of_int index.number_of_docs -. n +. 0.5) /. (n +. 0.5) +. 1.

  let doc_length d =
    Float.of_int @@ List.length @@ Tokenizer.tokenize_article d

  let score : T.content T.article -> string -> t -> float =
    fun d q index ->
    let tokens = Tokenizer.tokenize q in
    assert (List.length tokens > 0);
    let avg_len = average_doc_length index in
    let k_1 = 1.5 in
    let b = 0.75 in
    sum
    @@ List.map
      (fun q_i ->
         (* [num_occurrences] counts occurrences of [q_i] across the whole
            index; it stands in for the per-document term frequency of
            classical BM25. *)
         let num_occurrences =
           Float.of_int @@ List.length @@ search index q_i
         in
         (* The trailing [+. 1.] lower-bounds the per-term factor, as in
            BM25+. *)
         idf q_i index
         *. ((num_occurrences *. (k_1 +. 1.)
              /. (num_occurrences
                  +. k_1 *. (1. -. b +. b *. doc_length d /. avg_len)))
             +. 1.))
      tokens
end

let create articles =
  let index = {index = Index.empty; number_of_docs = 0; number_of_tokens = 0} in
  add articles index

(* Persist the index with [Marshal]; the file is only readable by a binary
   built with a compatible compiler version. *)
let marshal (v : t) filename =
  let oc = open_out_bin filename in
  Fun.protect
    ~finally:(fun () -> close_out oc)
    (fun () -> Marshal.to_channel oc v [])

let unmarshal filename : t =
  let ic = open_in_bin filename in
  Fun.protect
    ~finally:(fun () -> close_in ic)
    (fun () -> Marshal.from_channel ic)
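
For reference, here is the scoring function the BM_25 module computes, written out as a formula. This is a sketch of standard Okapi BM25 with a smoothed idf and a +1 lower bound on the per-term factor; note that in this module f(q_i) is the index-wide occurrence count returned by search, standing in for the classical per-document term frequency:

\[
\mathrm{score}(D, Q) \;=\; \sum_{q_i \in Q} \mathrm{idf}(q_i) \cdot \left( \frac{f(q_i)\,(k_1 + 1)}{f(q_i) + k_1 \left(1 - b + b \, \frac{|D|}{\mathrm{avgdl}}\right)} + 1 \right),
\qquad
\mathrm{idf}(q_i) \;=\; \ln\!\left( \frac{N - n(q_i) + 0.5}{n(q_i) + 0.5} + 1 \right)
\]

where N = number_of_docs, n(q_i) is the number of occurrences retrieved for q_i, |D| = doc_length d, avgdl = average_doc_length, and k_1 = 1.5, b = 0.75 as set in score.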
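A minimal usage sketch of this module's API, assuming a hypothetical value articles : T.content T.article list produced elsewhere by forester's front end (the query strings and the file name search.idx are likewise illustrative):

(* Hypothetical driver: [articles] is assumed to come from forester's own
   front end; only this module's public functions are exercised. *)
let () =
  let index = create articles in
  (* Exact search: every token of the query must match exactly. *)
  let hits = search index "sheaf cohomology" in
  Format.printf "exact hits: %d@." (List.length hits);
  (* Fuzzy search: tolerate up to one edit per token. *)
  let fuzzy_hits = search ~fuzz:1 index "sheaf cohomolgy" in
  Format.printf "fuzzy hits: %d@." (List.length fuzzy_hits);
  (* Rank one article against the query. *)
  Format.printf "score: %f@."
    (BM_25.score (List.hd articles) "sheaf cohomology" index);
  (* Persist the index and load it back. *)
  marshal index "search.idx";
  let (_ : t) = unmarshal "search.idx" in
  ()

Note that score asserts the query tokenizes to at least one token, and unmarshal relies on Marshal, so a saved index should only be read back by the same build that wrote it.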