1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
(** [is_ascii_punctuation c] is [true] exactly when [c] falls in one of the
    four ASCII punctuation/symbol runs: 0x21-0x2F, 0x3A-0x40, 0x5B-0x60 or
    0x7B-0x7E (32 characters in total). Letters, digits, space, control
    characters and every byte above 0x7E yield [false]. *)
let is_ascii_punctuation = function
  | '!' .. '/' | ':' .. '@' | '[' .. '`' | '{' .. '~' -> true
  | _ -> false
(** [is_punctuation uchr] tells whether the Unicode scalar value [uchr] counts
    as punctuation.

    - For ASCII scalars (below U+0080) the answer comes from
      {!is_ascii_punctuation}, which also accepts ASCII symbols that Unicode
      does not place in a P* category.
    - For every other scalar the answer is [true] iff its general category, as
      reported by [Uucp.Gc.general_category], is one of Pc, Pd, Ps, Pe, Pi, Pf
      or Po. *)
let is_punctuation uchr =
  (* The fast path is guarded by an explicit ASCII bound rather than
     [Uchar.is_char]: the latter holds for the whole Latin-1 range
     (U+0000..U+00FF), which would route U+0080..U+00FF to the ASCII-only
     table and wrongly reject Latin-1 punctuation such as U+00A1 or U+00BF. *)
  if Uchar.to_int uchr < 0x80
  then is_ascii_punctuation (Uchar.to_char uchr)
  else
    match Uucp.Gc.general_category uchr with
    | `Pc | `Pd | `Ps | `Pe | `Pi | `Pf | `Po -> true
    | _ -> false
(** [find_matches ?encoding str] wraps [str] in a one-element sequence and
    passes it to [On_utf8.find_matches], with {!is_punctuation} as the [~is]
    predicate.

    [encoding] defaults to [Snowball.UTF_8]; [Snowball.ISO_8859_1] is the only
    other value handled here.

    @raise Invalid_argument for any other [Snowball] encoding. *)
let find_matches ?(encoding = Snowball.UTF_8) str =
  (* Translate the Snowball encoding into the tag understood by [On_utf8],
     rejecting anything that has no counterpart. *)
  let target =
    match encoding with
    | Snowball.UTF_8 -> `UTF_8
    | Snowball.ISO_8859_1 -> `ISO_8859_1
    | unsupported ->
      Fmt.invalid_arg "Unimplemented encoding: %a" Snowball.pp_encoding
        unsupported
  in
  On_utf8.find_matches ~encoding:target ~is:is_punctuation (Seq.return str)
(** [find_matches_on_bstr ?encoding bstr] forwards [bstr] to
    [On_utf8.find_matches_on_bstr], with {!is_punctuation} as the [~is]
    predicate.

    [encoding] defaults to [Snowball.UTF_8]; [Snowball.ISO_8859_1] is the only
    other value handled here.

    @raise Invalid_argument for any other [Snowball] encoding. *)
let find_matches_on_bstr ?(encoding = Snowball.UTF_8) bstr =
  (* Translate the Snowball encoding into the tag understood by [On_utf8],
     rejecting anything that has no counterpart. *)
  let target =
    match encoding with
    | Snowball.UTF_8 -> `UTF_8
    | Snowball.ISO_8859_1 -> `ISO_8859_1
    | unsupported ->
      Fmt.invalid_arg "Unimplemented encoding: %a" Snowball.pp_encoding
        unsupported
  in
  On_utf8.find_matches_on_bstr ~encoding:target ~is:is_punctuation bstr