package wseg
Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source
Page
Library
Module
Module type
Parameter
Class
Class type
Source
Source file wseg.ml
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226
(* * wseg.ml * ----------- * Copyright : (c) 2019, ZAN DoYe <zandoye@gmail.com> * Licence : MIT * * This file is a part of wseg *) open Base open Stdio module Tuple2 = struct let get1 (v, _)= v let get2 (_, v)= v end module Dict = struct module Tree = Trie.Make(String) type entry= string * float type entries= entry list let split_utf8 ?(pos=0) s= let len= String.length s in let rec to_list pos= if pos >= len then [] else let next= CamomileLibrary.UTF8.next s pos in if next > len then [] else String.sub s ~pos ~len:(next - pos) :: (to_list next) in to_list pos let buildEntries rawEntries= let quantity= List.fold ~f:(fun acc (_char, count)-> acc +. count) ~init:0. rawEntries in rawEntries |> List.map ~f:(fun (char, count)-> (char, count /. quantity)) let buildIndex entries= let tree= Tree.create None in List.iter ~f:(fun (token, freq)-> Tree.set tree (split_utf8 token) freq) entries; tree type word= string list * float type chunk= word list let dispConds cd= List.iter ~f:(fun (cl, _freq)-> List.iter ~f:(printf "%s ") cl; Out_channel.newline stdout) cd; Out_channel.newline stdout let dispCands cd= List.iter ~f:(fun (wl:word list)-> List.iter ~f:(fun (cl, _freq)-> List.iter ~f:(Out_channel.output_string stdout) cl; Out_channel.output_string stdout " ") wl; Out_channel.newline stdout) cd; Out_channel.newline stdout let result_of_cand wl= List.map ~f:(fun (cl, _freq)-> String.concat ~sep:"" cl) wl |> String.concat ~sep:"|" let condWord node s= let rec condWord node cl pos= match cl with | []-> [] | c::tl-> match Tree.sub node [c] with | None-> [] | Some node-> match Tree.get node [] with | None-> condWord node tl (pos+1) | Some freq-> ((List.split_n s (pos+1) |> Tuple2.get1), freq) :: (condWord node tl (pos+1)) in condWord node s 0 let candidates wordDict s max= let rec candidates s max= if max > 0 && List.length s > 0 then let words= condWord wordDict s in List.map ~f:(fun word-> let (cl, _freq)= word in let suffix= candidates (List.split_n s (List.length cl) |> Tuple2.get2) (max-1) in match suffix with | []-> [[word]] | _-> List.map ~f:(fun suffix-> word::suffix) suffix) words |> List.concat else [] in candidates (split_utf8 s) max end let length_word (sl, _)= List.fold ~f:(fun acc s-> acc + String.length s) ~init:0 sl let length_chunk chunk= List.fold ~f:(fun acc word-> acc + length_word word) ~init:0 chunk let average chunk= (/.) (List.fold ~f:(fun acc word-> acc + length_word word) ~init:0 chunk |> Float.of_int) (List.length chunk |> Float.of_int) let possibility_mul chunk= List.fold ~f:(fun acc (_, freq)-> acc *. freq) ~init:1. chunk let variance chunk= let avg= average chunk in (/.) (List.fold ~f:(fun acc word-> acc +. (Caml.( ** ) (Float.of_int (length_word word) -. avg) 2.)) ~init:0. chunk ) (List.length chunk |> Float.of_int) module MMSEG = struct let rule1 chunks= let (_len, res)= List.fold ~f:(fun (len, res) chunk-> let len_curr = length_chunk chunk in if len_curr = len then (len, chunk::res) else if len_curr > len then (len_curr, [chunk]) else (len, res) ) ~init:(0, []) chunks in List.rev res let rule2 chunks= let (_avg, res)= List.fold ~f:(fun (avg, res) chunk-> let open Float in let avg_curr = average chunk in if avg_curr = avg then (avg, chunk::res) else if avg_curr > avg then (avg_curr, [chunk]) else (avg, res)) ~init:(0., []) chunks in List.rev res let rule3 chunks= let (_vari, res)= List.fold ~f:(fun (vari, res) chunk-> let open Float in let vari_curr = variance chunk in if vari_curr = vari then (vari, chunk::res) else if vari_curr < vari then (vari_curr, [chunk]) else (vari, res)) ~init:(Float.infinity, []) chunks in List.rev res let rule4 chunks= let (_possibility, res)= List.fold ~f:(fun (possibility, res) chunk-> let open Float in let possibility_curr = possibility_mul chunk in if possibility_curr = possibility then (possibility, chunk::res) else if possibility_curr > possibility then (possibility_curr, [chunk]) else (possibility, res)) ~init:(0., []) chunks in List.rev res let rule_final= List.hd let seg chunks= chunks |> rule1 |> rule2 |> rule3 |> rule4 |> rule_final end module MPSEG = struct end