Source file tfidf.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
open! Import

let prefix_stratgey s =
  let res = ref [] in
  for i = 1 to String.length s do
    res := String.sub s 0 i :: !res
  done;
  List.rev !res

module Mono
    (Uid : Search_intf.Uid) (Doc : sig
      type t
    end) =
struct
  module TokenMap = Map.Make (String)
  module UidMap = Map.Make (Uid)

  type uid = Uid.t
  type key = Uid.t
  type doc = Doc.t

  type t = {
    mutable indexes : (Doc.t -> string) list;
    mutable documents : (Uid.t * Doc.t) list;
    mutable token_map : stats TokenMap.t;
    mutable cache : float TokenMap.t;
    strategy : string -> string list;
    tokeniser : string -> string list;
    santiser : string -> string;
  }

  and stats = {
    document_occurences : int;
    total_occurences : int;
    mutable uid_map : (doc * int) UidMap.t;
  }

  let pp_uid ppf (s, (_, i)) = Format.fprintf ppf "%s: %i" (Uid.to_string s) i

  let pp_stats ppf stats =
    let uids = UidMap.bindings stats.uid_map in
    Format.fprintf ppf "document: %i, total: %i, uid: %a"
      stats.document_occurences stats.total_occurences
      Format.(pp_print_list pp_uid)
      uids

  let pp_binding ppf (s, stats) = Format.fprintf ppf "%s: %a" s pp_stats stats

  let empty ?(santiser = String.lowercase_ascii) ?(strategy = prefix_stratgey)
      ?(tokeniser = String.split_on_char ' ') () =
    {
      indexes = [];
      documents = [];
      token_map = TokenMap.empty;
      cache = TokenMap.empty;
      strategy;
      santiser;
      tokeniser;
    }

  let get_documents t = t.documents
  let reset_documents t = t.documents <- []

  let pp ppf t =
    let b = TokenMap.bindings t.token_map in
    Format.fprintf ppf "[@.%a@.]@."
      Format.(pp_print_list ~pp_sep:pp_print_newline pp_binding)
      b

  let index t ~uid ~token doc =
    t.cache <- TokenMap.empty;
    let token_data =
      match TokenMap.find_opt token t.token_map with
      | Some stats ->
          { stats with total_occurences = stats.total_occurences + 1 }
      | None ->
          {
            document_occurences = 0;
            total_occurences = 1;
            uid_map = UidMap.empty;
          }
    in
    let token_data, uid_map =
      match UidMap.find_opt uid token_data.uid_map with
      | Some (doc, occurences) ->
          (token_data, UidMap.add uid (doc, occurences + 1) token_data.uid_map)
      | None ->
          ( {
              token_data with
              document_occurences = token_data.document_occurences + 1;
            },
            UidMap.add uid (doc, 1) token_data.uid_map )
    in
    token_data.uid_map <- uid_map;
    t.token_map <- TokenMap.add token token_data t.token_map

  let add_document t (uid : Uid.t) doc =
    t.documents <- (uid, doc) :: t.documents;
    let fields = List.fold_left (fun acc i -> i doc :: acc) [] t.indexes in
    List.iter
      (fun field ->
        List.iter
          (fun token ->
            List.iter
              (fun expanded_token -> index t ~uid ~token:expanded_token doc)
              (t.strategy token))
          (t.tokeniser (t.santiser field)))
      fields

  let add_index t index =
    t.indexes <- index :: t.indexes;
    let docs = t.documents in
    t.documents <- [];
    List.iter (fun (k, v) -> add_document t k v) docs

  let add_indexes t indexes =
    t.indexes <- indexes @ t.indexes;
    let docs = t.documents in
    t.documents <- [];
    List.iter (fun (k, v) -> add_document t k v) docs

  let idf t token docs =
    match TokenMap.find_opt token t.cache with
    | Some i -> i
    | None ->
        let value =
          match TokenMap.find_opt token t.token_map with
          | Some stats -> float_of_int stats.document_occurences
          | None -> 0.
        in
        let i = 1. +. log (float_of_int (List.length docs) /. (1. +. value)) in
        t.cache <- TokenMap.add token i t.cache;
        i

  let tfidf t tokens docs doc =
    let f score token =
      let inverse = idf t token docs in
      let freq =
        match TokenMap.find_opt token t.token_map with
        | None -> 0.
        | Some stats -> (
            let uid = fst doc in
            match UidMap.find_opt uid stats.uid_map with
            | None -> 0.
            | Some (_, v) -> float_of_int v)
      in
      score +. (freq *. inverse)
    in
    List.fold_left f 0. tokens

  let score_cmp v = int_of_float v

  let search t token =
    let tokens = t.santiser token |> t.tokeniser in
    let documents = UidMap.empty in
    let i = ref (-1) in
    let document acc token =
      incr i;
      match TokenMap.find_opt token t.token_map with
      | None -> acc
      | Some stats ->
          let docs = if !i = 0 then stats.uid_map else documents in
          let map =
            UidMap.fold
              (fun uid (doc, _) docs ->
                if !i = 0 then UidMap.add uid doc docs
                else if not (UidMap.mem uid stats.uid_map) then
                  UidMap.remove uid docs
                else docs)
              docs documents
          in
          UidMap.union (fun _ v _ -> Some v) map acc
    in
    let uid_docs = List.fold_left document UidMap.empty tokens in
    let documents = UidMap.bindings uid_docs in
    let tf = tfidf t tokens t.documents in
    List.sort (fun d1 d2 -> score_cmp @@ (tf d2 -. tf d1)) documents
    |> List.map snd
end

(* Type identifiers.
   See http://alan.petitepomme.net/cwn/2015.03.24.html#1 and
   https://erratique.ch/repos/hmap/tree/src/hmap.ml *)
module Generic (U : Search_intf.Uid) = struct
  module TokenMap = Map.Make (String)
  module Witness = Witness

  module Uid = struct
    type 'a witness = { uid : U.t option; tid : 'a Witness.t }

    let tid v = v.tid

    let create () =
      let tid = Witness.make () in
      { uid = None; tid }

    type t = V : 'a witness -> t

    let hide_type k = V k

    let equal (V k0) (V k1) =
      U.compare (Option.get k0.uid) (Option.get k1.uid) = 0

    let compare (V k0) (V k1) =
      U.compare (Option.get k0.uid) (Option.get k1.uid)

    let to_string (V k) = U.to_string (Option.get k.uid)
  end

  type 'v uid = 'v Uid.witness
  type binding = KV : ('v uid * 'v) -> binding
  type doc = binding
  type key = U.t

  module UidMap = Map.Make (Uid)

  type t = {
    mutable indexes : (binding -> string option) list;
    mutable token_map : stats TokenMap.t;
    mutable cache : float TokenMap.t;
    mutable documents : binding UidMap.t;
    strategy : string -> string list;
    tokeniser : string -> string list;
    santiser : string -> string;
  }

  and stats = {
    document_occurences : int;
    total_occurences : int;
    mutable uid_map : (binding * int) UidMap.t;
  }

  let pp_uid ppf (s, (_, i)) = Format.fprintf ppf "%s: %i" (Uid.to_string s) i

  let pp_stats ppf stats =
    let uids = UidMap.bindings stats.uid_map in
    Format.fprintf ppf "document: %i, total: %i, uid: %a"
      stats.document_occurences stats.total_occurences
      Format.(pp_print_list pp_uid)
      uids

  let pp_binding ppf (s, stats) = Format.fprintf ppf "%s: %a" s pp_stats stats

  let empty ?(santiser = String.lowercase_ascii) ?(strategy = prefix_stratgey)
      ?(tokeniser = String.split_on_char ' ') () =
    {
      indexes = [];
      documents = UidMap.empty;
      token_map = TokenMap.empty;
      cache = TokenMap.empty;
      strategy;
      santiser;
      tokeniser;
    }

  let pp ppf t =
    let b = TokenMap.bindings t.token_map in
    Format.fprintf ppf "[@.%a@.]@."
      Format.(pp_print_list ~pp_sep:pp_print_newline pp_binding)
      b

  let index t ~uid ~token doc =
    t.cache <- TokenMap.empty;
    let token_data =
      match TokenMap.find_opt token t.token_map with
      | Some stats ->
          { stats with total_occurences = stats.total_occurences + 1 }
      | None ->
          {
            document_occurences = 0;
            total_occurences = 1;
            uid_map = UidMap.empty;
          }
    in
    let key = Uid.V uid in
    let token_data, uid_map =
      match UidMap.find_opt key token_data.uid_map with
      | Some (doc, occurences) ->
          (token_data, UidMap.add key (doc, occurences + 1) token_data.uid_map)
      | None ->
          ( {
              token_data with
              document_occurences = token_data.document_occurences + 1;
            },
            UidMap.add key (doc, 1) token_data.uid_map )
    in
    token_data.uid_map <- uid_map;
    t.token_map <- TokenMap.add token token_data t.token_map

  let add_document t (type v) (uid : v uid) key (doc : v) =
    let uid = { uid with uid = Some key } in
    let uid_key = Uid.V uid in
    let binding = KV (uid, doc) in
    t.documents <- UidMap.add uid_key binding t.documents;
    let fields = List.fold_left (fun acc i -> i binding :: acc) [] t.indexes in
    List.iter
      (function
        | Some field ->
            List.iter
              (fun token -> index t ~uid ~token binding)
              (t.strategy (t.santiser field))
        | None -> ())
      fields

  let apply (type v) (k : v uid) ~default (f : v -> 'a) =
    let k = k.Uid.tid in
    fun (KV (k', v)) ->
      match Witness.eq k k'.tid with Some Teq -> f v | _ -> default

  let apply_exn (type v) (k : v uid) (f : v -> 'a) =
    let k = k.Uid.tid in
    fun (KV (k', v)) ->
      match Witness.eq k k'.tid with
      | Some Teq -> f v
      | _ -> failwith "Type witnesses were not equal."

  let add_index t k index =
    t.indexes <- apply ~default:None k (fun v -> Some (index v)) :: t.indexes;
    let docs = t.documents in
    t.documents <- UidMap.empty;
    UidMap.iter
      (fun _ (KV (k, v)) -> add_document t k (Option.get k.uid) v)
      docs

  let add_indexes t k indexes =
    let indexes =
      List.map
        (fun index -> apply ~default:None k (fun v -> Some (index v)))
        indexes
    in
    t.indexes <- indexes @ t.indexes;
    let docs = t.documents in
    t.documents <- UidMap.empty;
    UidMap.iter
      (fun _ (KV (k, v)) -> add_document t k (Option.get k.uid) v)
      docs

  type univ = Pack : 'a Witness.witness * 'a -> univ

  let int : int Witness.t = Witness.make ()

  let pack (type u) (module Witness : Witness.Tid with type t = u) x =
    Pack (Witness.Tid, x)

  let idf t token docs =
    match TokenMap.find_opt token t.cache with
    | Some i -> i
    | None ->
        let value =
          match TokenMap.find_opt token t.token_map with
          | Some stats -> float_of_int stats.document_occurences
          | None -> 0.
        in
        let i = 1. +. log (float_of_int (List.length docs) /. (1. +. value)) in
        t.cache <- TokenMap.add token i t.cache;
        i

  let tfidf t tokens docs doc =
    let f score token =
      let inverse = idf t token docs in
      let freq =
        match TokenMap.find_opt token t.token_map with
        | None -> 0.
        | Some stats -> (
            doc |> fun (KV (uid, _doc)) ->
            match UidMap.find_opt (Uid.V uid) stats.uid_map with
            | None -> 0.
            | Some (_, v) -> float_of_int v)
      in
      score +. (freq *. inverse)
    in
    List.fold_left f 0. tokens

  let score_cmp v = int_of_float v

  let search t token =
    let tokens = t.santiser token |> t.tokeniser in
    let documents = UidMap.empty in
    let i = ref (-1) in
    let document acc token =
      incr i;
      match TokenMap.find_opt token t.token_map with
      | None -> acc
      | Some stats ->
          let docs = if !i = 0 then stats.uid_map else documents in
          let map =
            UidMap.fold
              (fun uid (doc, _) docs ->
                if !i = 0 then UidMap.add uid doc docs
                else if not (UidMap.mem uid stats.uid_map) then
                  UidMap.remove uid docs
                else docs)
              docs documents
          in
          UidMap.union (fun _ v _ -> Some v) map acc
    in
    let uid_docs = List.fold_left document UidMap.empty tokens in
    let documents = UidMap.bindings uid_docs |> List.map snd in
    let tf = tfidf t tokens (UidMap.bindings t.documents) in
    List.sort (fun d1 d2 -> score_cmp @@ (tf d2 -. tf d1)) documents
end