1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
let error_msgf fmt = Fmt.kstr (fun msg -> Error (`Msg msg)) fmt
open Cmdliner
let language_from_string str =
let algs = Snowball.languages in
let str = String.lowercase_ascii str in
let fn (alg : Snowball.Language.t) = String.equal str (alg :> string) in
match List.find_opt fn algs with
| Some alg -> Ok alg
| None -> error_msgf "Language %S not found" str
let tokenizer_from_string str =
match String.lowercase_ascii str with
| "whitespace" -> Ok Tokenizer.Whitespace
| "dash" -> Ok Dash
| "bert" -> Ok Bert
| _ -> error_msgf "Invalid tokenizer: %S" str
let behavior_from_string str =
match String.lowercase_ascii str with
| "remove" -> Ok Tokenizer.Remove
| "isolate" -> Ok Isolate
| "merge_with_previous" | "merge-with-previous" -> Ok Merge_with_previous
| "merge_with_next" | "merge-with-next" -> Ok Merge_with_next
| _ -> error_msgf "Invalid behavior: %S" str
let re_from_string str =
try Ok (Re.Pcre.regexp str)
with _ -> error_msgf "Invalid regular expression: %S" str
let pp_tokenizer ppf = function
| Tokenizer.Whitespace -> Fmt.string ppf "whitespace"
| Dash -> Fmt.string ppf "dash"
| Bert -> Fmt.string ppf "bert"
| Regex re -> Fmt.pf ppf "re:%a" Re.pp_re re
let pp_behavior ppf = function
| Tokenizer.Remove -> Fmt.string ppf "remove"
| Isolate -> Fmt.string ppf "isolate"
| Merge_with_previous -> Fmt.string ppf "merge-with-previous"
| Merge_with_next -> Fmt.string ppf "merge-with-next"
let language =
let algs = Snowball.languages in
let pp ppf (alg : Snowball.Language.t) = Fmt.string ppf (alg :> string) in
let language = Arg.conv (language_from_string, pp) in
let doc =
let algs =
List.map (fun (alg : Snowball.Language.t) -> (alg :> string)) algs in
let hd, tl = (List.hd algs, List.tl algs) in
let tl = List.rev tl in
Fmt.str
"The language to process. $(tname) is able to handle these languages: %s \
and %s."
(String.concat ", " tl) hd in
let open Arg in
value
& opt language Snowball.porter
& info [ "l"; "language" ] ~doc ~docv:"LANGUAGE"
let encoding =
let open Arg in
let docs = "ENCODINGS" in
let encodings =
[
(Snowball.UTF_8, info [ "utf-8" ] ~doc:"UTF-8 encoding" ~docs);
( Snowball.ISO_8859_1,
info [ "iso-8859-1"; "latin1" ] ~doc:"Latin1 encoding" ~docs );
( Snowball.ISO_8859_2,
info [ "iso-8859-2"; "latin2" ] ~doc:"Latin2 encoding" ~docs );
(Snowball.KOI8_R, info [ "koi8-r" ] ~doc:"KOI8-R encoding" ~docs);
] in
value & vflag Snowball.UTF_8 encodings
let action =
let ( let* ) = Result.bind in
let parser str =
match String.split_on_char ':' str with
| [ tokenizer ] ->
let* tokenizer = tokenizer_from_string tokenizer in
Ok (tokenizer, Tokenizer.Remove)
| ("re" | "RE" | "rE" | "Re") :: behavior :: re ->
let* behavior = behavior_from_string behavior in
let* re = re_from_string (String.concat ":" re) in
Ok (Tokenizer.Regex re, behavior)
| tokenizer :: behavior ->
let behavior = String.concat ":" behavior in
let* tokenizer = tokenizer_from_string tokenizer in
let* behavior = behavior_from_string behavior in
Ok (tokenizer, behavior)
| [] -> assert false in
let pp ppf (tokenizer, action) =
match tokenizer with
| Tokenizer.Regex re -> Fmt.pf ppf "re:%a:%a" pp_behavior action Re.pp_re re
| tokenizer -> Fmt.pf ppf "%a:%a" pp_tokenizer tokenizer pp_behavior action
in
Arg.conv (parser, pp)
let actions =
let doc = "An action to $(i,tokenize) the given document and split words." in
let open Arg in
value
& opt_all action Tokenizer.[ (Whitespace, Remove); (Bert, Remove) ]
& info [ "a"; "action" ] ~doc ~docv:"ACTION"
let () = Logs_threaded.enable ()
let output_options = "OUTPUT OPTIONS"
let verbosity =
let env = Cmd.Env.info "BLAZE_LOGS" in
Logs_cli.level ~docs:output_options ~env ()
let renderer =
let env = Cmd.Env.info "BLAZE_FMT" in
Fmt_cli.style_renderer ~docs:output_options ~env ()
let utf_8 =
let doc = "Allow binaries to emit UTF-8 characters." in
let env = Cmd.Env.info "BLAZE_UTF_8" in
Arg.(value & opt bool true & info [ "with-utf-8" ] ~doc ~env)
let reporter ppf =
let report src level ~over k msgf =
let k _ =
over () ;
k () in
let with_metadata _tags k ppf fmt =
Fmt.kpf k ppf
("[%a]%a[%a]: " ^^ fmt ^^ "\n%!")
Fmt.(styled `Cyan int)
(Stdlib.Domain.self () :> int)
Logs_fmt.pp_header (level, header)
Fmt.(styled `Magenta string)
(Logs.Src.name src) in
msgf @@ fun ? ?tags fmt -> with_metadata header tags k ppf fmt in
{ Logs.report }
let setup_logs utf_8 style_renderer level =
Fmt_tty.setup_std_outputs ~utf_8 ?style_renderer () ;
Logs.set_level level ;
Logs.set_reporter (reporter Fmt.stderr) ;
Option.is_none level
let setup_logs = Term.(const setup_logs $ utf_8 $ renderer $ verbosity)