1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
open! Import
let prefix_stratgey s =
let res = ref [] in
for i = 1 to String.length s do
res := String.sub s 0 i :: !res
done;
List.rev !res
module Mono
(Uid : Search_intf.Uid) (Doc : sig
type t
end) =
struct
module TokenMap = Map.Make (String)
module UidMap = Map.Make (Uid)
type uid = Uid.t
type key = Uid.t
type doc = Doc.t
type t = {
mutable indexes : (Doc.t -> string) list;
mutable documents : (Uid.t * Doc.t) list;
mutable token_map : stats TokenMap.t;
mutable cache : float TokenMap.t;
strategy : string -> string list;
tokeniser : string -> string list;
santiser : string -> string;
}
and stats = {
document_occurences : int;
total_occurences : int;
mutable uid_map : (doc * int) UidMap.t;
}
let pp_uid ppf (s, (_, i)) = Format.fprintf ppf "%s: %i" (Uid.to_string s) i
let pp_stats ppf stats =
let uids = UidMap.bindings stats.uid_map in
Format.fprintf ppf "document: %i, total: %i, uid: %a"
stats.document_occurences stats.total_occurences
Format.(pp_print_list pp_uid)
uids
let pp_binding ppf (s, stats) = Format.fprintf ppf "%s: %a" s pp_stats stats
let empty ?(santiser = String.lowercase_ascii) ?(strategy = prefix_stratgey)
?(tokeniser = String.split_on_char ' ') () =
{
indexes = [];
documents = [];
token_map = TokenMap.empty;
cache = TokenMap.empty;
strategy;
santiser;
tokeniser;
}
let get_documents t = t.documents
let reset_documents t = t.documents <- []
let pp ppf t =
let b = TokenMap.bindings t.token_map in
Format.fprintf ppf "[@.%a@.]@."
Format.(pp_print_list ~pp_sep:pp_print_newline pp_binding)
b
let index t ~uid ~token doc =
t.cache <- TokenMap.empty;
let token_data =
match TokenMap.find_opt token t.token_map with
| Some stats ->
{ stats with total_occurences = stats.total_occurences + 1 }
| None ->
{
document_occurences = 0;
total_occurences = 1;
uid_map = UidMap.empty;
}
in
let token_data, uid_map =
match UidMap.find_opt uid token_data.uid_map with
| Some (doc, occurences) ->
(token_data, UidMap.add uid (doc, occurences + 1) token_data.uid_map)
| None ->
( {
token_data with
document_occurences = token_data.document_occurences + 1;
},
UidMap.add uid (doc, 1) token_data.uid_map )
in
token_data.uid_map <- uid_map;
t.token_map <- TokenMap.add token token_data t.token_map
let add_document t (uid : Uid.t) doc =
t.documents <- (uid, doc) :: t.documents;
let fields = List.fold_left (fun acc i -> i doc :: acc) [] t.indexes in
List.iter
(fun field ->
List.iter
(fun token ->
List.iter
(fun expanded_token -> index t ~uid ~token:expanded_token doc)
(t.strategy token))
(t.tokeniser (t.santiser field)))
fields
let add_index t index =
t.indexes <- index :: t.indexes;
let docs = t.documents in
t.documents <- [];
List.iter (fun (k, v) -> add_document t k v) docs
let add_indexes t indexes =
t.indexes <- indexes @ t.indexes;
let docs = t.documents in
t.documents <- [];
List.iter (fun (k, v) -> add_document t k v) docs
let idf t token docs =
match TokenMap.find_opt token t.cache with
| Some i -> i
| None ->
let value =
match TokenMap.find_opt token t.token_map with
| Some stats -> float_of_int stats.document_occurences
| None -> 0.
in
let i = 1. +. log (float_of_int (List.length docs) /. (1. +. value)) in
t.cache <- TokenMap.add token i t.cache;
i
let tfidf t tokens docs doc =
let f score token =
let inverse = idf t token docs in
let freq =
match TokenMap.find_opt token t.token_map with
| None -> 0.
| Some stats -> (
let uid = fst doc in
match UidMap.find_opt uid stats.uid_map with
| None -> 0.
| Some (_, v) -> float_of_int v)
in
score +. (freq *. inverse)
in
List.fold_left f 0. tokens
let score_cmp v = int_of_float v
let search t token =
let tokens = t.santiser token |> t.tokeniser in
let documents = UidMap.empty in
let i = ref (-1) in
let document acc token =
incr i;
match TokenMap.find_opt token t.token_map with
| None -> acc
| Some stats ->
let docs = if !i = 0 then stats.uid_map else documents in
let map =
UidMap.fold
(fun uid (doc, _) docs ->
if !i = 0 then UidMap.add uid doc docs
else if not (UidMap.mem uid stats.uid_map) then
UidMap.remove uid docs
else docs)
docs documents
in
UidMap.union (fun _ v _ -> Some v) map acc
in
let uid_docs = List.fold_left document UidMap.empty tokens in
let documents = UidMap.bindings uid_docs in
let tf = tfidf t tokens t.documents in
List.sort (fun d1 d2 -> score_cmp @@ (tf d2 -. tf d1)) documents
|> List.map snd
end
module Generic (U : Search_intf.Uid) = struct
module TokenMap = Map.Make (String)
module Witness = Witness
module Uid = struct
type 'a witness = { uid : U.t option; tid : 'a Witness.t }
let tid v = v.tid
let create () =
let tid = Witness.make () in
{ uid = None; tid }
type t = V : 'a witness -> t
let hide_type k = V k
let equal (V k0) (V k1) =
U.compare (Option.get k0.uid) (Option.get k1.uid) = 0
let compare (V k0) (V k1) =
U.compare (Option.get k0.uid) (Option.get k1.uid)
let to_string (V k) = U.to_string (Option.get k.uid)
end
type 'v uid = 'v Uid.witness
type binding = KV : ('v uid * 'v) -> binding
type doc = binding
type key = U.t
module UidMap = Map.Make (Uid)
type t = {
mutable indexes : (binding -> string option) list;
mutable token_map : stats TokenMap.t;
mutable cache : float TokenMap.t;
mutable documents : binding UidMap.t;
strategy : string -> string list;
tokeniser : string -> string list;
santiser : string -> string;
}
and stats = {
document_occurences : int;
total_occurences : int;
mutable uid_map : (binding * int) UidMap.t;
}
let pp_uid ppf (s, (_, i)) = Format.fprintf ppf "%s: %i" (Uid.to_string s) i
let pp_stats ppf stats =
let uids = UidMap.bindings stats.uid_map in
Format.fprintf ppf "document: %i, total: %i, uid: %a"
stats.document_occurences stats.total_occurences
Format.(pp_print_list pp_uid)
uids
let pp_binding ppf (s, stats) = Format.fprintf ppf "%s: %a" s pp_stats stats
let empty ?(santiser = String.lowercase_ascii) ?(strategy = prefix_stratgey)
?(tokeniser = String.split_on_char ' ') () =
{
indexes = [];
documents = UidMap.empty;
token_map = TokenMap.empty;
cache = TokenMap.empty;
strategy;
santiser;
tokeniser;
}
let pp ppf t =
let b = TokenMap.bindings t.token_map in
Format.fprintf ppf "[@.%a@.]@."
Format.(pp_print_list ~pp_sep:pp_print_newline pp_binding)
b
let index t ~uid ~token doc =
t.cache <- TokenMap.empty;
let token_data =
match TokenMap.find_opt token t.token_map with
| Some stats ->
{ stats with total_occurences = stats.total_occurences + 1 }
| None ->
{
document_occurences = 0;
total_occurences = 1;
uid_map = UidMap.empty;
}
in
let key = Uid.V uid in
let token_data, uid_map =
match UidMap.find_opt key token_data.uid_map with
| Some (doc, occurences) ->
(token_data, UidMap.add key (doc, occurences + 1) token_data.uid_map)
| None ->
( {
token_data with
document_occurences = token_data.document_occurences + 1;
},
UidMap.add key (doc, 1) token_data.uid_map )
in
token_data.uid_map <- uid_map;
t.token_map <- TokenMap.add token token_data t.token_map
let add_document t (type v) (uid : v uid) key (doc : v) =
let uid = { uid with uid = Some key } in
let uid_key = Uid.V uid in
let binding = KV (uid, doc) in
t.documents <- UidMap.add uid_key binding t.documents;
let fields = List.fold_left (fun acc i -> i binding :: acc) [] t.indexes in
List.iter
(function
| Some field ->
List.iter
(fun token -> index t ~uid ~token binding)
(t.strategy (t.santiser field))
| None -> ())
fields
let apply (type v) (k : v uid) ~default (f : v -> 'a) =
let k = k.Uid.tid in
fun (KV (k', v)) ->
match Witness.eq k k'.tid with Some Teq -> f v | _ -> default
let apply_exn (type v) (k : v uid) (f : v -> 'a) =
let k = k.Uid.tid in
fun (KV (k', v)) ->
match Witness.eq k k'.tid with
| Some Teq -> f v
| _ -> failwith "Type witnesses were not equal."
let add_index t k index =
t.indexes <- apply ~default:None k (fun v -> Some (index v)) :: t.indexes;
let docs = t.documents in
t.documents <- UidMap.empty;
UidMap.iter
(fun _ (KV (k, v)) -> add_document t k (Option.get k.uid) v)
docs
let add_indexes t k indexes =
let indexes =
List.map
(fun index -> apply ~default:None k (fun v -> Some (index v)))
indexes
in
t.indexes <- indexes @ t.indexes;
let docs = t.documents in
t.documents <- UidMap.empty;
UidMap.iter
(fun _ (KV (k, v)) -> add_document t k (Option.get k.uid) v)
docs
type univ = Pack : 'a Witness.witness * 'a -> univ
let int : int Witness.t = Witness.make ()
let pack (type u) (module Witness : Witness.Tid with type t = u) x =
Pack (Witness.Tid, x)
let idf t token docs =
match TokenMap.find_opt token t.cache with
| Some i -> i
| None ->
let value =
match TokenMap.find_opt token t.token_map with
| Some stats -> float_of_int stats.document_occurences
| None -> 0.
in
let i = 1. +. log (float_of_int (List.length docs) /. (1. +. value)) in
t.cache <- TokenMap.add token i t.cache;
i
let tfidf t tokens docs doc =
let f score token =
let inverse = idf t token docs in
let freq =
match TokenMap.find_opt token t.token_map with
| None -> 0.
| Some stats -> (
doc |> fun (KV (uid, _doc)) ->
match UidMap.find_opt (Uid.V uid) stats.uid_map with
| None -> 0.
| Some (_, v) -> float_of_int v)
in
score +. (freq *. inverse)
in
List.fold_left f 0. tokens
let score_cmp v = int_of_float v
let search t token =
let tokens = t.santiser token |> t.tokeniser in
let documents = UidMap.empty in
let i = ref (-1) in
let document acc token =
incr i;
match TokenMap.find_opt token t.token_map with
| None -> acc
| Some stats ->
let docs = if !i = 0 then stats.uid_map else documents in
let map =
UidMap.fold
(fun uid (doc, _) docs ->
if !i = 0 then UidMap.add uid doc docs
else if not (UidMap.mem uid stats.uid_map) then
UidMap.remove uid docs
else docs)
docs documents
in
UidMap.union (fun _ v _ -> Some v) map acc
in
let uid_docs = List.fold_left document UidMap.empty tokens in
let documents = UidMap.bindings uid_docs |> List.map snd in
let tf = tfidf t tokens (UidMap.bindings t.documents) in
List.sort (fun d1 d2 -> score_cmp @@ (tf d2 -. tf d1)) documents
end