package pfff

  1. Overview
  2. Docs
Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source

Source file parse_cpp.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
(* Yoann Padioleau
 *
 * Copyright (C) 2002-2013 Yoann Padioleau
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License (GPL)
 * version 2 as published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * file license.txt for more details.
 *)
open Common

module Flag = Flag_parsing
module PI = Parse_info
module Stat = Parse_info
module FT = File_type

module Ast = Ast_cpp
module Flag_cpp = Flag_parsing_cpp
module T = Parser_cpp
module TH = Token_helpers_cpp
module Lexer = Lexer_cpp
module Semantic = Parser_cpp_mly_helper

(*****************************************************************************)
(* Prelude *)
(*****************************************************************************)
(* 
 * A heuristic based C/cpp/C++ parser.
 * 
 * See "Parsing C/C++ Code without Pre-Preprocessing - Yoann Padioleau, CC'09"
 * avalaible at http://padator.org/papers/yacfe-cc09.pdf
 *)

(*****************************************************************************)
(* Types *)
(*****************************************************************************)

type toplevels_and_tokens = (Ast.toplevel * Parser_cpp.token list) list

let program_of_program2 xs = 
  xs +> List.map fst

exception Parse_error of Parse_info.info

(*****************************************************************************)
(* Wrappers *)
(*****************************************************************************)
let pr2, _pr2_once = Common2.mk_pr2_wrappers Flag.verbose_parsing

(*****************************************************************************)
(* Error diagnostic *)
(*****************************************************************************)

let error_msg_tok tok = 
  Parse_info.error_message_info (TH.info_of_tok tok)

(*****************************************************************************)
(* Stats on what was passed/commentized  *)
(*****************************************************************************)

let commentized xs = xs +> Common.map_filter (function
  | T.TComment_Pp (cppkind, ii) -> 
      if !Flag_cpp.filter_classic_passed
      then 
        (match cppkind with
        | Token_cpp.CppOther -> 
            let s = PI.str_of_info ii in
            (match s with
            | s when s =~ "KERN_.*" -> None
            | s when s =~ "__.*" -> None
            | _ -> Some (ii.PI.token)
            )
             
        | Token_cpp.CppDirective | Token_cpp.CppAttr | Token_cpp.CppMacro
            -> None
        | Token_cpp.CppMacroExpanded
        | Token_cpp.CppPassingNormal
        | Token_cpp.CppPassingCosWouldGetError 
          -> raise Todo
        )
      else Some (ii.PI.token)
      
  | T.TAny_Action ii ->
      Some (ii.PI.token)
  | _ -> 
      None
 )
  
let count_lines_commentized xs = 
  let line = ref (-1) in
  let count = ref 0 in
  commentized xs +> List.iter (function
    | PI.OriginTok pinfo 
    | PI.ExpandedTok (_,pinfo,_) -> 
        let newline = pinfo.PI.line in
        if newline <> !line
        then begin
          line := newline;
          incr count
        end
    | _ -> ()
  );
  !count


(* See also problematic_lines and parsing_stat.ml *)

(* for most problematic tokens *)
let is_same_line_or_close line tok =
  TH.line_of_tok tok =|= line ||
  TH.line_of_tok tok =|= line - 1 ||
  TH.line_of_tok tok =|= line - 2

(*****************************************************************************)
(* Lexing only *)
(*****************************************************************************)

(* called by parse below *)
let tokens2 file = 
 let table     = Parse_info.full_charpos_to_pos_large file in

 Common.with_open_infile file (fun chan -> 
  let lexbuf = Lexing.from_channel chan in
  try 
    let rec tokens_aux () = 
      let tok = Lexer.token lexbuf in
      (* fill in the line and col information *)
      let tok = tok +> TH.visitor_info_of_tok (fun ii -> 
        { ii with PI.token=
          (* could assert pinfo.filename = file ? *)
          match ii.PI.token with
          |  PI.OriginTok pi ->
             PI.OriginTok (Parse_info.complete_token_location_large file 
                             table pi)
          | PI.ExpandedTok (pi,vpi, off) ->
              PI.ExpandedTok(
                (Parse_info.complete_token_location_large file table pi),vpi, 
                off)
          | PI.FakeTokStr (s,vpi_opt) -> PI.FakeTokStr (s,vpi_opt)
          | PI.Ab -> raise Impossible
      })
      in

      if TH.is_eof tok
      then [tok]
      else tok::(tokens_aux ())
    in
    tokens_aux ()
  with
  | Lexer.Lexical s -> 
    failwith (spf "lexical error %s \n = %s"
                s (PI.error_message file (PI.lexbuf_to_strpos lexbuf)))
  | e -> raise e
 )
   
let tokens a = 
  Common.profile_code "Parse_cpp.tokens" (fun () -> tokens2 a)

(*****************************************************************************)
(* Fuzzy parsing *)
(*****************************************************************************)

let rec multi_grouped_list xs = 
  xs +> List.map multi_grouped

and multi_grouped = function
  | Token_views_cpp.Braces (tok1, xs, (Some tok2)) ->
      Ast_fuzzy.Braces (tokext tok1, multi_grouped_list xs, tokext tok2)
  | Token_views_cpp.Parens (tok1, xs, (Some tok2)) ->
      Ast_fuzzy.Parens (tokext tok1, multi_grouped_list_comma xs, tokext tok2)
  | Token_views_cpp.Angle (tok1, xs, (Some tok2)) ->
      Ast_fuzzy.Angle (tokext tok1, multi_grouped_list xs, tokext tok2)
  | Token_views_cpp.Tok (tok) ->
    (match PI.str_of_info (tokext tok) with
    | "..." -> Ast_fuzzy.Dots (tokext tok)
    | s when Ast_fuzzy.is_metavar s -> Ast_fuzzy.Metavar (s, tokext tok)
    | s -> Ast_fuzzy.Tok (s, tokext tok)
    )
  | _ -> failwith "could not find closing brace/parens/angle"
and tokext tok_extended =
  TH.info_of_tok tok_extended.Token_views_cpp.t
and multi_grouped_list_comma xs =
  let rec aux acc xs =
  match xs with
  | [] ->
      if null acc
      then []
      else [Left (acc +> List.rev +> multi_grouped_list)]
  | (x::xs) ->
      (match x with
      | Token_views_cpp.Tok tok when PI.str_of_info (tokext tok) = "," ->
          let before = acc +> List.rev +> multi_grouped_list in
          if null before
          then aux [] xs
          else (Left before)::(Right (tokext tok))::aux [] xs
      | _ ->
        aux (x::acc) xs
      )
  in
  aux [] xs


(* This is similar to what I did for OPA. This is also similar
 * to what I do for parsing hacks, but this fuzzy AST can be useful
 * on its own, e.g. for a not too bad sgrep/spatch.
 * 
 * note: this is similar to what cpplint/fblint of andrei does? 
 *)
let parse_fuzzy file =
  Common.save_excursion Flag_cpp.sgrep_mode true (fun () ->
  let toks_orig = tokens file in
  let toks = 
    toks_orig +> Common.exclude (fun x ->
      Token_helpers_cpp.is_comment x || Token_helpers_cpp.is_eof x
    )
  in
  let extended = toks +> List.map Token_views_cpp.mk_token_extended in
  Parsing_hacks_cpp.find_template_inf_sup extended;
  let groups = Token_views_cpp.mk_multi extended in
  multi_grouped_list groups, toks_orig
  )

(*****************************************************************************)
(* Extract macros *)
(*****************************************************************************)

(* It can be used to to parse the macros defined in a macro.h file. It 
 * can also be used to try to extract the macros defined in the file 
 * that we try to parse *)
let extract_macros2 file = 
  Common.save_excursion Flag.verbose_lexing false (fun () -> 
    let toks = tokens (* todo: ~profile:false *) file in
    let toks = Parsing_hacks_define.fix_tokens_define toks in
    Pp_token.extract_macros toks
  )
let extract_macros a = 
  Common.profile_code_exclusif "Parse_cpp.extract_macros" (fun () -> 
    extract_macros2 a)

(* less: pass it as a parameter to parse_program instead ? 
 * old: was a ref, but a hashtbl.t is actually already a kind of ref
 *) 
let (_defs : (string, Pp_token.define_body) Hashtbl.t)  = 
  Hashtbl.create 101


(* We used to have also a init_defs_builtins() so that we could use a
 * standard.h containing macros that were always useful, and a macros.h
 * that the user could customize for his own project.
 * But this was adding complexity so now we just have _defs and people
 * can call add_defs to add local macro definitions.
 *)
let add_defs file =
  if not (Sys.file_exists file)
  then failwith (spf "Could not find %s, have you set PFFF_HOME correctly?"
                   file);
  pr2 (spf "Using %s macro file" file);
  let xs = extract_macros file in
  xs +> List.iter (fun (k, v) -> Hashtbl.add _defs k v)

let init_defs file =     
  Hashtbl.clear _defs;
  add_defs file

(*****************************************************************************)
(* Error recovery *)
(*****************************************************************************)
(* see parsing_recovery_cpp.ml *)

(*****************************************************************************)
(* Consistency checking *)
(*****************************************************************************)
(* todo: a parsing_consistency_cpp.ml *)
      
(*****************************************************************************)
(* Helper for main entry point *)
(*****************************************************************************)

(* Hacked lex. This function use refs passed by parse. 
 * 'tr' means 'token refs'. This is used mostly to enable
 * error recovery (This used to do lots of stuff, such as
 * calling some lookahead heuristics to reclassify
 * tokens such as TIdent into TIdent_Typeded but this is
 * now done in a fix_tokens style in parsing_hacks_typedef.ml.
 *)
let rec lexer_function tr = fun lexbuf -> 
  match tr.PI.rest with
  | [] -> (pr2 "LEXER: ALREADY AT END"; tr.PI.current)
  | v::xs -> 
      tr.PI.rest <- xs;
      tr.PI.current <- v;
      tr.PI.passed <- v::tr.PI.passed;

      if !Flag.debug_lexer then pr2_gen v;

      if TH.is_comment v
      then lexer_function (*~pass*) tr lexbuf
      else v

(* was a define ? *)
let passed_a_define tr =
  let xs = tr.PI.passed +> List.rev +> Common.exclude TH.is_comment in
  if List.length xs >= 2 
   then 
      (match Common2.head_middle_tail xs with
      | T.TDefine _, _, T.TCommentNewline_DefineEndOfMacro _ -> true
      | _ -> false
      )
   else begin
     pr2 "WIERD: length list of error recovery tokens < 2 ";
     false
   end

(*****************************************************************************)
(* Main entry point *)
(*****************************************************************************)
(* 
 * note: as now we go in two passes, there is first all the error message of
 * the lexer, and then the error of the parser. It is not anymore
 * interwinded.
 * 
 * !!!This function use refs, and is not reentrant !!! so take care.
 * It uses the _defs global defined above!!!!
 *)
let parse_with_lang ?(lang=Flag_parsing_cpp.Cplusplus) file = 

  let stat = Parse_info.default_stat file in
  let filelines = Common2.cat_array file in

  (* -------------------------------------------------- *)
  (* call lexer and get all the tokens *)
  (* -------------------------------------------------- *)
  let toks_orig = tokens file in

  let toks = 
    try Parsing_hacks.fix_tokens ~macro_defs:_defs lang toks_orig
    with Token_views_cpp.UnclosedSymbol s ->
      pr2 s;
      if !Flag_cpp.debug_cplusplus 
      then raise (Token_views_cpp.UnclosedSymbol s)
      else toks_orig
  in

  let tr = Parse_info.mk_tokens_state toks in
  let lexbuf_fake = Lexing.from_function (fun _buf _n -> raise Impossible) in

  let rec loop () =

    let info = TH.info_of_tok tr.PI.current in
    (* todo?: I am not sure that it represents current_line, cos maybe
     * tr.current partipated in the previous parsing phase, so maybe tr.current
     * is not the first token of the next parsing phase. Same with checkpoint2.
     * It would be better to record when we have a } or ; in parser.mly,
     *  cos we know that they are the last symbols of external_declaration2.
     *)
    let checkpoint = PI.line_of_info info in
    (* bugfix: may not be equal to 'file' as after macro expansions we can
     * start to parse a new entity from the body of a macro, for instance
     * when parsing a define_machine() body, cf standard.h
     *)
    let checkpoint_file = PI.file_of_info info in

    tr.PI.passed <- [];
    (* for some statistics *)
    let was_define = ref false in

    let elem = 
      (try 
          (* -------------------------------------------------- *)
          (* Call parser *)
          (* -------------------------------------------------- *)
          Parser_cpp.toplevel (lexer_function tr) lexbuf_fake
        with e -> 
          if not !Flag.error_recovery 
          then raise (Parse_error (TH.info_of_tok tr.PI.current));

          if !Flag.show_parsing_error then
            (match e with
            (* Lexical is not anymore launched I think *)
            | Lexer.Lexical s -> 
              pr2 ("lexical error " ^s^ "\n =" ^ error_msg_tok tr.PI.current)
            | Parsing.Parse_error -> 
              pr2 ("parse error \n = " ^ error_msg_tok tr.PI.current)
            | Semantic.Semantic (s, _i) -> 
              pr2 ("semantic error " ^s^ "\n ="^ error_msg_tok tr.PI.current)
            | e -> raise e
            );

          let line_error = TH.line_of_tok tr.PI.current in

          let pbline =
            tr.PI.passed
            +> List.filter (is_same_line_or_close line_error)
            +> List.filter TH.is_ident_like
          in
          let error_info =
            (pbline +> List.map (fun tok->PI.str_of_info (TH.info_of_tok tok))),
            line_error 
          in
          stat.Stat.problematic_lines <-
            error_info::stat.Stat.problematic_lines;

          (*  error recovery, go to next synchro point *)
          let (passed', rest') = 
            Parsing_recovery_cpp.find_next_synchro tr.PI.rest tr.PI.passed in
          tr.PI.rest <- rest';
          tr.PI.passed <- passed';

          tr.PI.current <- List.hd passed';

          (* <> line_error *)
          let info = TH.info_of_tok tr.PI.current in
          let checkpoint2 = PI.line_of_info info in 
          let checkpoint2_file = PI.file_of_info info in

          was_define := passed_a_define tr;
          (if !was_define && !Flag_cpp.filter_define_error
           then ()
           else 
              (* bugfix: *)
              (if (checkpoint_file = checkpoint2_file) && checkpoint_file = file
              then PI.print_bad line_error (checkpoint, checkpoint2) filelines
              else pr2 "PB: bad: but on tokens not from original file"
              )
          );

          let info_of_bads = 
            Common2.map_eff_rev TH.info_of_tok tr.PI.passed in 
          
          Some (Ast.NotParsedCorrectly info_of_bads)
      )
    in

    (* again not sure if checkpoint2 corresponds to end of bad region *)
    let info = TH.info_of_tok tr.PI.current in
    let checkpoint2 = PI.line_of_info info in
    let checkpoint2_file = PI.file_of_info info in

    let diffline = 
      if (checkpoint_file = checkpoint2_file) && (checkpoint_file = file)
      then (checkpoint2 - checkpoint) 
      else 0
        (* TODO? so if error come in middle of something ? where the
         * start token was from original file but synchro found in body
         * of macro ? then can have wrong number of lines stat.
         * Maybe simpler just to look at tr.passed and count
         * the lines in the token from the correct file ?
         *)
    in
    let info = List.rev tr.PI.passed in 

    (* some stat updates *)
    stat.Stat.commentized <- 
      stat.Stat.commentized + count_lines_commentized info;
    (match elem with
    | Some (Ast.NotParsedCorrectly _xs) -> 
        if !was_define && !Flag_cpp.filter_define_error
        then stat.Stat.commentized <- stat.Stat.commentized + diffline
        else stat.Stat.bad     <- stat.Stat.bad     + diffline

    | _ -> stat.Stat.correct <- stat.Stat.correct + diffline
    );

    (match elem with
    | None -> []
    | Some xs -> (xs, info):: loop () (* recurse *)
    )
  in
  let v = loop() in
  (v, stat)

let parse2 file =
  match File_type.file_type_of_file file with
  | FT.PL (FT.C _) ->
    (try 
      parse_with_lang ~lang:Flag_cpp.C file
    with _exn ->
      parse_with_lang ~lang:Flag_cpp.Cplusplus file
    )
  | FT.PL (FT.Cplusplus _) ->
    parse_with_lang ~lang:Flag_cpp.Cplusplus file
  | _ -> failwith (spf "not a C/C++ file: %s" file)
    

let parse file  = 
  Common.profile_code "Parse_cpp.parse" (fun () -> 
    try 
      parse2 file
    with Stack_overflow ->
      pr2 (spf "PB stack overflow in %s" file);
      [(Ast.NotParsedCorrectly [], ([]))], {Stat.
        correct = 0;
        bad = Common2.nblines_with_wc file;
        filename = file;
        have_timeout = true;
        commentized = 0;
        problematic_lines = [];
      }
  )

let parse_program file = 
  let (ast2, _stat) = parse file in
  program_of_program2 ast2
OCaml

Innovation. Community. Security.