package acgtk

  1. Overview
  2. Docs
Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source

Source file data_lexer.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
(* To annotate each Unicode code point with a comment showing the
   corresponding character, run:

   perl -C -pe 's/(0x([\dA-Fa-f]{1,4}))/sprintf("%s (*%s*)",$1,chr(hex($2)))/eg' FILE

   Add the -i option to change the file in place.

*)


(* Lexing context: which kind of declaration, if any, is currently open. *)
type context =
  | NoContext
  | Signature
  | Lexicon

(* Kinds of brackets whose nesting is tracked while lexing. *)
type brackets =
  | Round
  | Square

(* Stack of the brackets opened and not yet closed, innermost first. Each
   entry pairs the location passed to [add_bracket] with the bracket kind. *)
let brackets = ref []

(* Opening character associated with a bracket kind. *)
let kind_to_char kind =
  match kind with
  | Round -> '('
  | Square -> '['

(* [add_bracket br loc] pushes an opening bracket of kind [br], met at
   location [loc], on top of the stack. *)
let add_bracket br loc =
  let opened = (loc, br) in
  brackets := opened :: !brackets

(* [remove_bracket br loc] pops the innermost open bracket when its kind is
   [br]; [loc] is the location of the closing bracket being lexed.

   Errors are reported through [Errors.LexingErrors.emit]:
   - [UnstartedBracket], at [loc], when no bracket is currently open;
   - [MismatchParentheses], carrying the character of the innermost open
     bracket and located at that opening bracket, when its kind differs
     from [br].

   [brackets] is global mutable state. It is emptied before a mismatch is
   reported so that stale entries do not survive the error and get reported
   again by a later lexing run. *)
let remove_bracket br loc =
  match !brackets with
  | [] -> Errors.(LexingErrors.emit Lexing_l.UnstartedBracket ~loc)
  | (_, k) :: tl when k = br -> brackets := tl
  | (loc, k) :: _ ->
      (* [loc] deliberately shadows the argument: the error points at the
         bracket that was opened, not at the closing one. *)
      brackets := [];
      Errors.(LexingErrors.emit (Lexing_l.MismatchParentheses (kind_to_char k)) ~loc)

(* [check_brackets ()] succeeds when every opened bracket has been closed.
   Otherwise it reports [MismatchParentheses] through
   [Errors.LexingErrors.emit], carrying the character of the innermost open
   bracket and located where that bracket was opened.

   [brackets] is global mutable state. It is emptied before the error is
   reported so that stale entries do not survive the error and get reported
   again by a later lexing run. *)
let check_brackets () =
  match !brackets with
  | [] -> ()
  | (loc, k) :: _ ->
      brackets := [];
      Errors.(LexingErrors.emit (Lexing_l.MismatchParentheses (kind_to_char k)) ~loc)

(* Current lexing context; initially [NoContext]. *)
let ctx = ref NoContext

(* [set c] makes [c] the current lexing context. *)
let set new_ctx = ctx := new_ctx

(* [get_loc buf] is the pair of positions that [Sedlexing.lexing_positions]
   returns for the lexeme last matched on [buf]. *)
let get_loc lexbuf = Sedlexing.lexing_positions lexbuf

(* Unicode subscript characters, given as code point intervals whose
   endpoints are annotated with their glyph:
   - U+1D63 to U+1D6A;
   - U+2080 to U+209C (every code point of the interval is accepted);
   - U+2C7C. *)
let subscripts = [%sedlex.regexp?
    0x1D63 (*ᵣ*) .. 0x1D6A (*ᵪ*)
  | 0x2080 (*₀*) .. 0x209C (*ₜ*)
  | 0x2C7C (*ⱼ*) ]

(* Unicode superscript characters, given as code points and code point
   intervals whose endpoints are annotated with their glyph:
   - U+00B2 to U+00B3, and U+00B9;
   - U+2070 to U+2071;
   - U+2074 to U+207F. *)
let superscripts = [%sedlex.regexp?
    0xB2 (*²*) .. 0xB3 (*³*)
  | 0xB9 (*¹*)
  | 0x2070 (*⁰*) .. 0x2071 (*ⁱ*)
  | 0x2074 (*⁴*) .. 0x207F (*ⁿ*) ]

(* Identifiers: one [id_start] character, then any number of [id_continue]
   characters, then any number of subscript or superscript characters, then
   any number of trailing apostrophes. *)
let id = [%sedlex.regexp?
  id_start, Star id_continue, Star (subscripts | superscripts), Star '\'' ]

(* Characters a symbol can be built from. Unicode ranges are written as
   code point intervals whose endpoints are annotated with their glyph. *)
let symbols_base =
  [%sedlex.regexp?
      (* ASCII punctuation *)
      '|' | '!' | '"' | '#' | '$' | '%' | '&' | '*' | '+' | '-' | '/' | '\''
    | '<' | '>' | '?' | '@' | '^' | '`' | '~' | '\\'
      (* Latin-1 operators *)
    | 0x00AC (*¬*) | 0xD7 (*×*) | 0xB7 (*·*) | 0xF7 (*÷*)
      (* U+2200 to U+22FE, every code point included *)
    | 0x2200 (*∀*) .. 0x22FE (*⋾*)
      (* All arrows from U+2190 to U+21FF except the linear one (U+2192) and
         the intuitionistic one (U+21D2), which the lexer turns into the
         dedicated LIN_ARROW and ARROW tokens. *)
    | 0x2190 (*←*) .. 0x2191 (*↑*)
    | 0x2193 (*↓*) .. 0x21D1 (*⇑*)
    | 0x21D3 (*⇓*) .. 0x21FF (*⇿*)
  ]
  
(* Symbols: one or more [symbols_base] characters, then any number of
   subscript or superscript characters, then optionally an underscore
   followed by any number of [id_continue] characters, then again any number
   of subscript or superscript characters. *)
let symbols = [%sedlex.regexp?
  Plus symbols_base, Star (subscripts | superscripts), Opt ('_', Star (id_continue)), Star (subscripts | superscripts) ]

(* [eat_comment buf n] skips the rest of a comment nested [n] levels deep.
   Once every level is closed ([n = 0]) it resumes normal lexing with
   [lex buf] and returns the token that call produces.

   It reports [UnterminatedComment] when the input ends inside the comment,
   and [Malformed] when sedlex raises [Sedlexing.MalFormed].

   The [try] only wraps the step that consumes one lexeme and computes the
   new nesting depth. The recursive call is made outside of it so that it
   is a genuine tail call: a call placed inside a [try] body is never a tail
   call, which would make the stack grow with the length of the comment. *)
let rec eat_comment buf n =
  if n = 0 then lex buf
  else
    let depth =
      try
        match%sedlex buf with
        | "(*" -> n + 1
        | "*)" -> n - 1
        | eof -> Errors.(LexingErrors.emit Lexing_l.UnterminatedComment ~loc:(get_loc buf))
        | any -> n
        | _ -> assert false
      with
      | Sedlexing.MalFormed -> Errors.(LexingErrors.emit Lexing_l.Malformed ~loc:(get_loc buf))
    in
    eat_comment buf depth

(* [lex buf] returns the next [Data_parser] token read from [buf].

   White space, line breaks and comments are skipped. Lexical errors are
   reported through [Errors.LexingErrors.emit]: [UnstartedComment],
   [BadChar], [Malformed], plus the bracket errors raised by
   [remove_bracket] and [check_brackets].

   Rule order matters: per the sedlex documentation the longest match wins
   and ties go to the earliest rule, so the keyword rules have to stay above
   the [id] rule.

   NOTE(review): the recursive calls below sit inside the [try] body, so
   they are not tail calls; a long run of white space keeps one stack frame
   per skipped lexeme until a token is returned. *)
and lex buf =
  try
    match%sedlex buf with
    (* Skipped input: white space, line breaks, comments. *)
    | white_space -> lex buf
    | '\010' | '\013' | "\013\010" -> lex buf
    | "(*" -> eat_comment buf 1
    | "*)" ->
        Errors.(LexingErrors.emit Lexing_l.UnstartedComment ~loc:(get_loc buf))
    | eof ->
        (* No bracket may still be open at the end of the input. *)
        check_brackets ();
        Data_parser.EOI
    (* [signature], [lexicon] and [nl_lexicon] are keywords only when no
       declaration is open ([NoContext]); they then record the new context.
       In any other context they are returned as plain identifiers. *)
    | "signature" ->
        check_brackets ();
        if !ctx = NoContext then
          let () = set Signature in
          Data_parser.SIG_OPEN (get_loc buf)
        else Data_parser.IDENT (Sedlexing.Utf8.lexeme buf, get_loc buf)
    | "lexicon" ->
        check_brackets ();
        if !ctx = NoContext then
          let () = set Lexicon in
          Data_parser.LEX_OPEN (get_loc buf)
        else Data_parser.IDENT (Sedlexing.Utf8.lexeme buf, get_loc buf)
    | "nl_lexicon" ->
        check_brackets ();
        if !ctx = NoContext then
          let () = set Lexicon in
          Data_parser.NL_LEX_OPEN (get_loc buf)
        else Data_parser.IDENT (Sedlexing.Utf8.lexeme buf, get_loc buf)
    (* The tokens below call [check_brackets] first, so a bracket still open
       when one of them is met is reported as an error. *)
    | '=' ->
        check_brackets ();
        Data_parser.EQUAL (get_loc buf)
    | "<<" ->
        check_brackets ();
        (* Composition resets the context to [NoContext]. *)
        set NoContext;
        Data_parser.COMPOSE (get_loc buf)
    | ';' ->
        check_brackets ();
        Data_parser.SEMICOLON (get_loc buf)
    | ':' ->
        check_brackets ();
        Data_parser.COLON (get_loc buf)
    | ',' -> Data_parser.COMMA (get_loc buf)
    (* Brackets: opening ones are pushed on the bracket stack, closing ones
       must match the innermost open bracket. *)
    | '(' ->
        let loc = get_loc buf in
        add_bracket Round loc;
        Data_parser.LPAREN loc
    | ')' ->
        let loc = get_loc buf in
        remove_bracket Round loc;
        Data_parser.RPAREN loc
    | '[' ->
        let loc = get_loc buf in
        add_bracket Square loc;
        Data_parser.LSQBRACKET loc
    | ']' ->
        let loc = get_loc buf in
        remove_bracket Square loc;
        Data_parser.RSQBRACKET loc
    | '.' -> Data_parser.DOT (get_loc buf)
    | "end" ->
        check_brackets ();
        (* End of a declaration: back to [NoContext]. *)
        set NoContext;
        Data_parser.END_OF_DEC (get_loc buf)
    | "type" ->
        check_brackets ();
        Data_parser.TYPE (get_loc buf)
    | "prefix" ->
        check_brackets ();
        Data_parser.PREFIX (get_loc buf)
    | "infix" ->
        check_brackets ();
        Data_parser.INFIX (get_loc buf)
    | "binder" ->
        check_brackets ();
        Data_parser.BINDER (get_loc buf)
    (* Binders and arrows, each with an ASCII and a Unicode spelling. *)
    | "lambda" | (0x03BB (*λ*), 0x2070 (*⁰*)) (* λ⁰ *) -> Data_parser.LAMBDA0 (get_loc buf)
    | "Lambda" | 0x03BB (*λ*) (* λ *)-> Data_parser.LAMBDA (get_loc buf)
    | "->" | 0x2192 (*→*) (* "→" *) -> Data_parser.LIN_ARROW (get_loc buf)
    | "=>" | 0x21D2 (*⇒*) (* "⇒" *) -> Data_parser.ARROW (get_loc buf)
    | ":=" -> Data_parser.COLON_EQUAL (get_loc buf)
    | id ->
        Data_parser.IDENT (Sedlexing.Utf8.lexeme buf, get_loc buf)
    | '\\', id ->
        (* Backslash-prefixed identifier: the leading backslash (one byte) is
           dropped and the rest is returned as an identifier. *)
        let n = Sedlexing.Utf8.lexeme buf in
        Data_parser.IDENT (String.sub n 1 (String.length n - 1), get_loc buf)
    | symbols ->
        Data_parser.SYMBOL (Sedlexing.Utf8.lexeme buf, get_loc buf)
    | any ->
        (* Any other single character is rejected. *)
        Errors.(LexingErrors.emit (Lexing_l.BadChar (Sedlexing.Utf8.lexeme buf)) ~loc:(get_loc buf))
    (* Final default case of the sedlex match; presumably unreachable since
       [eof] and [any] are handled above. *)
    | _ -> assert false
  with
  | Sedlexing.MalFormed ->
    Errors.(LexingErrors.emit Lexing_l.Malformed ~loc:(get_loc buf))
OCaml

Innovation. Community. Security.