Source file Stemming.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
(** Exception carrying a string payload.
    NOTE(review): no code in the visible part of this file raises it; confirm
    whether external callers depend on it before removing. *)
exception No_stem of string
(** Step 1a suffix rules (plural endings).

    Every rule is a tuple [(id, suffix, replacement, min_root)]. [match_rule]
    accepts a rule when the word ends in [suffix] and [min_root] is strictly
    smaller than [word_size] of the remaining root, so [-1] imposes no
    minimum. Rules are tried in list order; the first match wins. *)
let rule_list_1a =
  [ (101, "sses", "ss", -1);
    (102, "ies", "i", -1);
    (103, "ss", "ss", -1);
    (104, "s", "", -1) ]
(** Step 1b suffix rules, as [(id, suffix, replacement, min_root)] tuples.
    Rule 105 needs a root whose [word_size] exceeds 0; rules 106 and 107 are
    additionally gated by the [contains_vowel] entry in [rules_criteria]. *)
let rule_list_1b =
  [ (105, "eed", "ee", 0);
    (106, "ed", "", -1);
    (107, "ing", "", -1) ]
(** Clean-up rules that [stem] applies only after rule 106 or 107 has fired:
    restore a trailing "e", collapse a doubled final consonant, or (rule 122,
    empty suffix) append an "e" when the [add_an_e] criterion holds.
    Tuples are [(id, suffix, replacement, min_root)]. *)
let rule_list_1b1 =
  [ (108, "at", "ate", -1);
    (109, "bl", "ble", -1);
    (110, "iz", "ize", -1);
    (111, "bb", "b", -1);
    (112, "dd", "d", -1);
    (113, "ff", "f", -1);
    (114, "gg", "g", -1);
    (115, "mm", "m", -1);
    (116, "nn", "n", -1);
    (117, "pp", "p", -1);
    (118, "rr", "r", -1);
    (119, "tt", "t", -1);
    (120, "ww", "w", -1);
    (121, "xx", "x", -1);
    (122, "", "e", -1) ]
(** Step 1c: turn a final "y" into "i". Rule 123 is gated by the
    [contains_vowel] entry in [rules_criteria]. *)
let rule_list_1c =
  [ (123, "y", "i", -1) ]
(** Step 2 suffix rules, as [(id, suffix, replacement, min_root)] tuples.
    All rules use [min_root = 0], i.e. the root left after removing [suffix]
    must have a [word_size] of at least 1.

    Rule 219 matches "fulness". The previous spelling "fulnes" could never
    fire on words such as "hopefulness", because step 1a (rule 103) keeps a
    trailing "ss" intact. *)
let rule_list_2 =
  [ (203, "ational", "ate", 0);
    (204, "tional", "tion", 0);
    (205, "enci", "ence", 0);
    (206, "anci", "ance", 0);
    (207, "izer", "ize", 0);
    (208, "abli", "able", 0);
    (209, "alli", "al", 0);
    (210, "entli", "ent", 0);
    (211, "eli", "e", 0);
    (213, "ousli", "ous", 0);
    (214, "ization", "ize", 0);
    (215, "ation", "ate", 0);
    (216, "ator", "ate", 0);
    (217, "alism", "al", 0);
    (218, "iveness", "ive", 0);
    (219, "fulness", "ful", 0);
    (220, "ousness", "ous", 0);
    (221, "aliti", "al", 0);
    (222, "iviti", "ive", 0);
    (223, "biliti", "ble", 0) ]
(** Step 3 suffix rules, as [(id, suffix, replacement, min_root)] tuples.
    All rules use [min_root = 0]. *)
let rule_list_3 =
  [ (301, "icate", "ic", 0);
    (302, "ative", "", 0);
    (303, "alize", "al", 0);
    (304, "iciti", "ic", 0);
    (305, "ical", "ic", 0);
    (308, "ful", "", 0);
    (309, "ness", "", 0) ]
(** Step 4 suffix rules, as [(id, suffix, replacement, min_root)] tuples.
    All rules use [min_root = 1], so the remaining root must have a
    [word_size] of at least 2. Rules 423 and 424 strip "ion" only after an
    "s" or "t" by replacing "sion"/"tion" with that single letter. *)
let rule_list_4 =
  [ (401, "al", "", 1);
    (402, "ance", "", 1);
    (403, "ence", "", 1);
    (405, "er", "", 1);
    (406, "ic", "", 1);
    (407, "able", "", 1);
    (408, "ible", "", 1);
    (409, "ant", "", 1);
    (410, "ement", "", 1);
    (411, "ment", "", 1);
    (412, "ent", "", 1);
    (423, "sion", "s", 1);
    (424, "tion", "t", 1);
    (415, "ou", "", 1);
    (416, "ism", "", 1);
    (417, "ate", "", 1);
    (418, "iti", "", 1);
    (419, "ous", "", 1);
    (420, "ive", "", 1);
    (421, "ize", "", 1) ]
(** Step 5a: drop a final "e". Rule 501 requires a root [word_size] above 1;
    rule 502 has no size minimum but is gated by the [remove_an_e] entry in
    [rules_criteria]. *)
let rule_list_5a =
  [ (501, "e", "", 1);
    (502, "e", "", -1) ]
(** Step 5b: reduce a final "ll" to "l" when the root left after removing
    "ll" has a [word_size] above 1. *)
let rule_list_5b =
  [ (503, "ll", "l", 1) ]
(** The rule lists in the order [stem] runs them. [rule_list_1b1] is left out
    on purpose: [stem] applies it itself, right after rule 106 or 107 fires. *)
let all_rules =
  [ rule_list_1a;
    rule_list_1b;
    rule_list_1c;
    rule_list_2;
    rule_list_3;
    rule_list_4;
    rule_list_5a;
    rule_list_5b ]
(** [is_vowel c] is [true] exactly for the lowercase letters a, e, i, o, u.
    Uppercase letters and 'y' are not vowels here; callers treat 'y'
    separately where needed. *)
let is_vowel = function
  | 'a' | 'e' | 'i' | 'o' | 'u' -> true
  | _ -> false
(** [word_size word] counts how many times a vowel is immediately followed by
    a consonant while scanning [word] left to right (the "measure" used by
    the rule tables).

    'y' counts as a vowel only when the previous letter was a consonant; at
    the start of the word or after a vowel it counts as a consonant. *)
let word_size word =
  let n = String.length word in
  (* [prev] is the class of the previously scanned letter; [m] is the number
     of vowel-to-consonant transitions seen so far. *)
  let rec scan i m prev =
    if i >= n then m
    else
      let c = word.[i] in
      match prev with
      | `Start ->
        scan (i + 1) m (if is_vowel c then `Vowel else `Consonant)
      | `Vowel ->
        if is_vowel c then scan (i + 1) m `Vowel
        else scan (i + 1) (m + 1) `Consonant
      | `Consonant ->
        if is_vowel c || c = 'y' then scan (i + 1) m `Vowel
        else scan (i + 1) m `Consonant
  in
  scan 0 0 `Start
(** [ends_with_cvc str] tests whether the last three characters of [str] form
    a consonant-vowel-consonant pattern:
    - the last character is none of a, e, i, o, u, y, w, x;
    - the second-to-last is a vowel or 'y';
    - the third-to-last is not one of a, e, i, o, u.
    Strings shorter than three characters yield [false]. *)
let ends_with_cvc str =
  let n = String.length str in
  let is_vowel_or_y c = c = 'y' || is_vowel c in
  let excluded_final c = c = 'w' || c = 'x' || is_vowel_or_y c in
  n >= 3
  && not (excluded_final str.[n - 1])
  && is_vowel_or_y str.[n - 2]
  && not (is_vowel str.[n - 3])
(** Criterion for rule 122: [true] when [word] has a [word_size] of exactly 1
    and ends in a consonant-vowel-consonant pattern (see [ends_with_cvc]). *)
let add_an_e word =
  match word_size word with
  | 1 -> ends_with_cvc word
  | _ -> false
(** Criterion for rule 502: [true] when [word] has a [word_size] of exactly 1
    and does not end in a consonant-vowel-consonant pattern
    (see [ends_with_cvc]). *)
let remove_an_e word =
  match word_size word with
  | 1 -> not (ends_with_cvc word)
  | _ -> false
(** [contains_vowel str] is [true] when [str] contains a vowel. The first
    character counts only if it is one of a, e, i, o, u; any later character
    also counts when it is 'y'.

    Returns [false] for the empty string (the unguarded [str.[0]] access
    previously raised [Invalid_argument] there). *)
let contains_vowel str =
  let len = String.length str in
  let rec aux idx =
    if idx >= len then false
    (* 'y' acts as a vowel everywhere except in the leading position. *)
    else if is_vowel str.[idx] || (idx > 0 && str.[idx] = 'y') then true
    else aux (succ idx)
  in
  aux 0
(** Extra per-rule predicates, looked up by rule id in [match_rule]. Each
    entry pairs a list of rule ids with the test that the candidate root must
    pass; rules whose id appears nowhere here have no extra condition. *)
let rules_criteria =
  [ ([106; 107; 123], contains_vowel);
    ([122], add_an_e);
    ([502], remove_an_e) ]
(** [match_rule word rule] decides whether [rule] applies to [word].

    The rule [(num, orig, _, min_root)] applies when all of these hold:
    - [word] is strictly longer than the suffix [orig] (non-empty root);
    - [word] ends with [orig];
    - [min_root] is strictly less than [word_size] of the root;
    - the first entry of [rules_criteria] listing [num], if any, accepts the
      root. *)
let match_rule word ((num, orig, _, min_root): int * string * string * int) =
  let suffix_len = String.length orig in
  let root_len = String.length word - suffix_len in
  if root_len <= 0 then false
  else
    let root = String.sub word 0 root_len in
    (* Only the first criteria entry that mentions [num] is consulted. *)
    let rec passes_criteria = function
      | [] -> true
      | (ids, check) :: rest ->
        if List.mem num ids then check root else passes_criteria rest
    in
    String.sub word root_len suffix_len = orig
    && min_root < word_size root
    && passes_criteria rules_criteria
(** [apply_rule word rule] removes as many trailing characters from [word] as
    the rule's suffix [orig] is long and appends the replacement [rep]. It
    does not check that [word] actually ends in [orig]; callers use
    [match_rule] first. If the result would be shorter than two characters,
    [word] is returned unchanged.

    @raise Invalid_argument if [orig] is longer than [word]. *)
let apply_rule word ((_, orig, rep, _): int * string * string * int) =
  let root_len = String.length word - String.length orig in
  let candidate = String.sub word 0 root_len ^ rep in
  match String.length candidate with
  | 0 | 1 -> word
  | _ -> candidate
(** [replace_end word rule_list] applies the first rule of [rule_list] that
    [match_rule] accepts for [word].

    @return [(id, new_word)] for the rule that fired, or [(0, word)] when no
    rule matches. *)
let replace_end word (rule_list : (int * string * string * int) list) =
  match List.find (match_rule word) rule_list with
  | (rule, _, _, _) as matched -> (rule, apply_rule word matched)
  | exception Not_found -> (0, word)
(** [stem in_word] lowercases [in_word] (ASCII only) and runs it through
    every rule list of [all_rules] in order, applying at most one rule per
    list. When rule 106 or 107 fires, the result is additionally passed
    through [rule_list_1b1] before continuing with the next list. *)
let stem in_word =
  let apply_step current rules =
    let rule_id, replaced = replace_end current rules in
    if rule_id = 106 || rule_id = 107 then
      snd (replace_end replaced rule_list_1b1)
    else replaced
  in
  List.fold_left apply_step (String.lowercase_ascii in_word) all_rules
(** [stem_cmp s1 s2] is [true] when [s1] and [s2] have the same stem. *)
let stem_cmp s1 s2 = String.equal (stem s1) (stem s2)
(** [stem_gt s1 s2] is [true] when the stem of [s1] sorts strictly after the
    stem of [s2] in string order. *)
let stem_gt s1 s2 = String.compare (stem s1) (stem s2) > 0
(** [stem_gte s1 s2] is [true] when the stem of [s1] sorts after, or equals,
    the stem of [s2] in string order. *)
let stem_gte s1 s2 = String.compare (stem s1) (stem s2) >= 0
(** [stem_lt s1 s2] is [true] when the stem of [s1] sorts strictly before the
    stem of [s2] in string order. *)
let stem_lt s1 s2 = String.compare (stem s1) (stem s2) < 0
(** [stem_lte s1 s2] is [true] when the stem of [s1] sorts before, or equals,
    the stem of [s2] in string order. *)
let stem_lte s1 s2 = String.compare (stem s1) (stem s2) <= 0