1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
type normalization = NFC | NFD | NFKC | NFKD
let normalization_to_int = function
| NFC -> 0
| NFD -> 1
| NFKC -> 2
| NFKD -> 3
let lre_cc_res_len_max = 3
let utf8_to_codepoints s =
let decoder = Uutf.decoder ~encoding:`UTF_8 (`String s) in
let cps = ref [] in
let rec loop () =
match Uutf.decode decoder with
| `Uchar u ->
cps := Uchar.to_int u :: !cps;
loop ()
| `End -> Array.of_list (List.rev !cps)
| `Malformed _ ->
cps := 0xFFFD :: !cps;
loop ()
| `Await -> assert false
in
loop ()
let codepoints_to_utf8 cps =
let buf = Buffer.create (Array.length cps * 4) in
let encoder = Uutf.encoder `UTF_8 (`Buffer buf) in
Array.iter
(fun cp ->
let u =
if cp >= 0 && cp <= 0x10FFFF then Uchar.of_int cp else Uchar.rep
in
ignore (Uutf.encode encoder (`Uchar u)))
cps;
ignore (Uutf.encode encoder `End);
Buffer.contents buf
let is_cased c =
let cp = Unsigned.UInt32.of_int (Uchar.to_int c) in
Libunicode.is_cased cp <> 0
let is_case_ignorable c =
let cp = Unsigned.UInt32.of_int (Uchar.to_int c) in
Libunicode.is_case_ignorable cp <> 0
let is_id_start c =
let cp = Unsigned.UInt32.of_int (Uchar.to_int c) in
Libunicode.is_id_start cp <> 0
let is_id_continue c =
let cp = Unsigned.UInt32.of_int (Uchar.to_int c) in
Libunicode.is_id_continue cp <> 0
let is_whitespace c =
let code = Uchar.to_int c in
if code < 256 then
code = 0x20 || code = 0x09 || code = 0x0A || code = 0x0B || code = 0x0C
|| code = 0x0D
else
let cp = Unsigned.UInt32.of_int code in
Libunicode.is_space_non_ascii cp <> 0
let case_conv_char conv_type c =
let cp = Unsigned.UInt32.of_int (Uchar.to_int c) in
let res = Ctypes.CArray.make Ctypes.uint32_t lre_cc_res_len_max in
let res_ptr = Ctypes.CArray.start res in
let count = Libunicode.case_conv res_ptr cp conv_type in
let result = ref [] in
for i = count - 1 downto 0 do
let code = Unsigned.UInt32.to_int (Ctypes.CArray.get res i) in
if code >= 0 && code <= 0x10FFFF then result := Uchar.of_int code :: !result
done;
!result
let uppercase_char c = case_conv_char 0 c
let lowercase_char c = case_conv_char 1 c
let case_conv_string conv_type s =
let cps = utf8_to_codepoints s in
let res = Ctypes.CArray.make Ctypes.uint32_t lre_cc_res_len_max in
let res_ptr = Ctypes.CArray.start res in
let result = Buffer.create (Stdlib.String.length s * 2) in
let encoder = Uutf.encoder `UTF_8 (`Buffer result) in
Array.iter
(fun cp_int ->
let cp = Unsigned.UInt32.of_int cp_int in
let count = Libunicode.case_conv res_ptr cp conv_type in
for i = 0 to count - 1 do
let code = Unsigned.UInt32.to_int (Ctypes.CArray.get res i) in
let u =
if code >= 0 && code <= 0x10FFFF then Uchar.of_int code else Uchar.rep
in
ignore (Uutf.encode encoder (`Uchar u))
done)
cps;
ignore (Uutf.encode encoder `End);
Buffer.contents result
let uppercase s = case_conv_string 0 s
let lowercase s = case_conv_string 1 s
let canonicalize ?(unicode = true) c =
let cp = Unsigned.UInt32.of_int (Uchar.to_int c) in
let is_unicode = if unicode then 1 else 0 in
let result = Libunicode.canonicalize cp is_unicode in
if result >= 0 && result <= 0x10FFFF then Uchar.of_int result else c
let normalize form s =
let cps = utf8_to_codepoints s in
let len = Array.length cps in
if len = 0 then Some ""
else
let src =
Ctypes.CArray.of_list Ctypes.uint32_t
(Array.to_list (Array.map Unsigned.UInt32.of_int cps))
in
let src_ptr = Ctypes.CArray.start src in
let dst_ptr =
Ctypes.allocate
(Ctypes.ptr Ctypes.uint32_t)
(Ctypes.from_voidp Ctypes.uint32_t Ctypes.null)
in
let n_type = normalization_to_int form in
let result_len = Libunicode.normalize src_ptr len n_type dst_ptr in
if result_len < 0 then None
else
let dst = Ctypes.(!@dst_ptr) in
let result_cps = Array.make result_len 0 in
for i = 0 to result_len - 1 do
result_cps.(i) <- Unsigned.UInt32.to_int Ctypes.(!@(dst +@ i))
done;
Libunicode.normalize_free dst;
Some (codepoints_to_utf8 result_cps)