package pfff

  1. Overview
  2. Docs
Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source

Source file database_light_ml.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
(* Yoann Padioleau
 *
 * Copyright (C) 2010 Facebook
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * version 2.1 as published by the Free Software Foundation, with the
 * special exception on linking described in file license.txt.
 * 
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the file
 * license.txt for more details.
 *)
open Common

module Flag = Flag_parsing
module Db = Database_code
module HC = Highlight_code
module PI = Parse_info

module T = Parser_ml

(*****************************************************************************)
(* Prelude *)
(*****************************************************************************)
(* 
 * Obsolete file: see the -db_of_graph_code option of codegraph and
 * graph_code_cmt.ml for a more complete implementation.
 * 
 * Light database building for OCaml code (mainly used by the codemap
 * semantic code visualizer). We currently abuse the code highlighter
 * to extract the entity defintions, and for the uses we are mainly using
 * the list of tokens ... not the AST.
 *
 * We build the full database in multiple steps as some
 * operations need the information computed globally by the
 * previous step:
 * 
 * - collect all definitions and their files
 * - collect all uses, updating the count number of the
 *   corresponding entity (if it's used in a different file)
 *   as well as the entity->test_files_using_it hash.
 * 
 * Currently many analysis are just lexical-based (yes I know, I am
 * ridiculous) so there is some ambiguity when we find a use such
 * as a function call. We don't always know to which precise entity 
 * it corresponds to.To be precise would require to resolve module name. 
 * Fortunately in my code I don't use 'open' that much and only use the
 * simple alias-module idiom which makes it tractable to
 * identify in practice to which entity a qualified function call refers to.
 * 
 *)

(*****************************************************************************)
(* Types *)
(*****************************************************************************)

(* poor's man id for now. It's quite close to the fullid we have in
 * database_php.ml. 
 *)
type entity_poor_id = 
  Id of (Common.filename * Common2.filepos)

(*****************************************************************************)
(* Helpers *)
(*****************************************************************************)

let is_pleac_file file = 
  let file = String.lowercase_ascii file in
  file =~ ".*pleac*"

(* todo? quite pad specific ... 
 * try detect when use OUnit ?
 *)
let is_test_file file =
  let file = String.lowercase_ascii file in
  (file =~ ".*/test_" || file =~ ".*/unit_")

let is_test_or_pleac_file file = 
  is_test_file file || is_pleac_file file



let entity_poor_id_of_entity e = 
  Id (e.Db.e_file, e.Db.e_pos)

(* give a score per id and then sort and return top k *)
let rank_and_filter_examples_of_use ~root ids entities_arr =
  ids +> List.map (fun id ->
    let file = entities_arr.(id).Db.e_file in
    let file = Filename.concat root file in
    let size = Common2.filesize file in
    
    (* Low means better; so prefer small size and pleac files *)
    let score = 
      size / (if is_pleac_file file then 4 else 1) in
    score, id
  ) 
  +> Common.sort_by_key_lowfirst 
  +> List.map snd

let parse file =
  Common.save_excursion Flag.error_recovery true (fun () ->
  Common.save_excursion Flag.show_parsing_error false (fun () ->
    Parse_ml.parse file 
  ))

(*****************************************************************************)
(* Main entry point *)
(*****************************************************************************)

let compute_database ?(verbose=false) files_or_dirs = 

  let root = Common2.common_prefix_of_files_or_dirs files_or_dirs in
  let root = Common2.chop_dirsymbol root in
  if verbose then pr2 (spf "generating ML db_light with root = %s" root);

  let files = Lib_parsing_ml.find_source_files_of_dir_or_files files_or_dirs in
  let dirs = files +> List.map Filename.dirname +> Common2.uniq_eff in

  (* PHASE 1: collecting definitions *)
  if verbose then pr2 (spf "PHASE 1: collecting definitions");

  let (hdefs: (string, Db.entity) Hashtbl.t) = Hashtbl.create 1001 in

  (* This is used later when one wants to get the first id of a file.
   *
   * This is just because in step2 when we are collecting uses we
   * don't know in which entity we currently are but we know in
   * which file we are and for the good_examples_of_use we really
   * just need to give one of the id in the (supposidely small) test_or_pleac
   * file. 
   * 
   * todo: once we have a real callgraph we will not need this anymore.
   *)
  let (hfile_to_entities: (filename, entity_poor_id) Hashtbl.t) = 
    Hashtbl.create 1001 in

  files +> Console.progress ~show:verbose (fun k -> 
   List.iter (fun file ->
    k();
    let ((ast, toks), _stat) = 
      parse file 
    in
    let ast = 
      match ast with
      (* in database light we do error recovery *)
      | None -> []
      | Some xs -> xs
    in

    (* this is quite similar to what we do in tags_ml.ml *)
    let prefs = Highlight_code.default_highlighter_preferences in

    Highlight_ml.visit_program
        ~lexer_based_tagger:true (* !! *)
        ~tag_hook:(fun info categ -> 
          (* todo: use is_entity_def_category ? *)
          match categ with
          | HC.Entity (_, HC.Def2 _)
          | HC.FunctionDecl _ 
            ->
              let s = PI.str_of_info info in
              let l = PI.line_of_info info in
              let c = PI.col_of_info info in

              let file = Parse_info.file_of_info info +> Common.readable ~root 
              in

              let module_name = Module_ml.module_name_of_filename file in

              let fullpath = Parse_info.file_of_info info in

              (* stuff in mli is ok only where there is no .ml, like
               * for the externals/core/ stuff
               *)
              let (d,b,e) = Common2.dbe_of_filename fullpath in
              if e = "ml" ||
                 (e = "mli" && not (Sys.file_exists
                                      (Common2.filename_of_dbe (d,b, "ml"))))
              then begin
              
              let entity = { Database_code.
                e_name = s;
                e_fullname = spf "%s.%s" module_name s;
                e_file = file;
                e_pos = { Common2.l = l; c };
                e_kind = Common2.some 
                  (Db.entity_kind_of_highlight_category_def categ);
                (* filled in step 2 *)
                e_number_external_users = 0;
                e_good_examples_of_use = [];

                (* TODO once we have a real parser, can at least
                 * set the UseGlobal property.
                 *)
                e_properties = [];
              }
              in
              (* todo? could be more precise and add the Modulename.s
               * in the hash so that we don't need to call
               * Hashtbl.find_all but just Hashtbl.find later ?
               *)
              Hashtbl.add hdefs s entity;

              Hashtbl.add hfile_to_entities file 
                (entity_poor_id_of_entity entity);
              end;
              
          | _ -> ()
        )
        prefs
        (ast, toks)
    )
  );

  (* PHASE 2: collecting uses *)
  if verbose then pr2 (spf "PHASE 2: collecting uses");

  let entities_arr = 
    Common.hash_to_list hdefs +> List.map snd +> Array.of_list
  in

  (* this is useful when we want to add cross-references in the entities
   * such as the good_examples_of_use that reference another Db.entity_id.
   *)
  let (h_id_mldb_to_id_db: (entity_poor_id, Db.entity_id) Hashtbl.t) = 
    Hashtbl.create 1001 in

  entities_arr +> Array.iteri (fun id_db e ->
    let id_mldb = entity_poor_id_of_entity e in
    Hashtbl.add h_id_mldb_to_id_db id_mldb id_db;
  );

  (* todo: could rank later.
   *  so would need a first phase where we collect with
   *   let (hentity_to_test_files_using_it: 
   *    (entity_poor_id, Common.filename) Hashtbl.t) =
   *    Hashtbl.create 101 in
   *  ?
   *  
   * For now the granularity of the goto_example is entity -> 
   * test_files_using_it instead of test_functions_that_use_it
   * because we don't have the full callgraph and different 
   * entities id as in database_php.ml. We could try to identify
   * in which entity a function call is by reusing the highlight/visitor
   * above and tracking the tokens and what was the last entity
   * encountered.
   *)
  let add_good_example_of_use test_file entity =
    let poor_id_opt = Common2.hfind_option test_file hfile_to_entities in
    (match poor_id_opt with
    | None -> pr2 (spf "WEIRD, could not find an entity in %s" test_file)
    | Some poor_id_user ->
        let id_user = Hashtbl.find h_id_mldb_to_id_db poor_id_user in
        (* could do a take_safe 3 but for ocaml I don't think we have
         * any scaling issues
         *)
        entity.Db.e_good_examples_of_use <-
          (id_user :: entity.Db.e_good_examples_of_use);
    )
  in


  files +> Console.progress ~show:verbose (fun k -> 
   List.iter (fun file ->
    k ();

    if file =~ ".*external/" && 
      (* I don't really want pleac files to participate in the
       * e_number_external_users statistics but I want pleac files
       * to participate in the e_good_examples_of_use so have
       * to special case it here. Could introduce a step3 phase ...
       *)
      not (file =~ ".*pleac/")
    then pr2 (spf "skipping external file: %s" file)
    else begin

    let ((_ast, toks), _stat) = parse file in

    let file = Common.readable ~root file in

    (* try to resolve function use more precisely instead of incrementing 
     * all entities that have xxx as a name. Look if the module name
     * match the basename of the file defining the entity.
     * But have to remember the module X = XXX aliases.
     *)
    let hmodule_aliases = Hashtbl.create 11 in

    let toks = toks +> Common.exclude (function
      | T.TCommentSpace _ -> true
      | _ -> false
    )
    in

      (* Only consider Module.xxx. Otherwise names such as 'x', or 'yylex'
       * which are variables or internal functions are considered
       * as having a huge count.
       * 
       *)
      let rec aux_toks toks = 
        match toks with
        | T.Tmodule _
          ::T.TUpperIdent(s, _ii)
          ::T.TEq _
          ::T.TUpperIdent(s2, _ii2)::xs
          ->
            (* we want to transform every occurence of s  into s2,
             * to remove the alias sugar
             *)
            Hashtbl.add hmodule_aliases s s2;
            aux_toks xs

        | T.TUpperIdent(s, _ii)::T.TDot _ii2::T.TLowerIdent(s2, _ii3)::xs ->
          
            Hashtbl.find_all hdefs s2 +> List.iter (fun entity ->
              let file_entity = entity.Db.e_file in

              let final_module_name = 
                if Hashtbl.mem hmodule_aliases s
                then Hashtbl.find hmodule_aliases s
                else s
              in
              let module_entity = 
                let (_d,b,_e) = Common2.dbe_of_filename file_entity in
                String.capitalize_ascii b
              in
              
              if file_entity <> file && final_module_name = module_entity
              then begin
                entity.Db.e_number_external_users <-
                  entity.Db.e_number_external_users + 1;

                if is_test_or_pleac_file file
                then
                  add_good_example_of_use file entity;
              end
            );
            aux_toks xs

        | [] -> ()
        | _x::xs ->
            aux_toks xs
      in
      aux_toks toks;
    end
    )
  );

  (* PHASE 3: adjusting entities *)
  if verbose then pr2 (spf "PHASE 3: adjusting entities");

  entities_arr +> Array.iter (fun e ->
    let ids = e.Db.e_good_examples_of_use in
    e.Db.e_good_examples_of_use <- 
      rank_and_filter_examples_of_use ~root ids entities_arr;
  );

  let dirs = dirs +> List.map (fun s -> Common.readable ~root s) in
  let dirs = Db.alldirs_and_parent_dirs_of_relative_dirs dirs in

  { Db.
    root = root;
    dirs = dirs +> List.map (fun d -> d, 0); (* TODO *)
    files = files +> List.map (fun f -> Common.readable ~root f, 0); (* TODO *)
    entities = entities_arr;
  }
OCaml

Innovation. Community. Security.