1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
(** [uchar_to_string uchr] is the UTF-8 encoding of the Unicode scalar value
    [uchr], returned as a fresh string of 1 to 4 bytes. *)
let uchar_to_string uchr =
  (* A UTF-8 encoded scalar value never needs more than 4 bytes. *)
  let encoded = Buffer.create 4 in
  Buffer.add_utf_8_uchar encoded uchr ;
  Buffer.contents encoded
(** [find_matches ?encoding ~is seq] decodes the string chunks of [seq]
    (interpreted with [encoding], [`UTF_8] by default) and splits the decoded
    text into a sequence of tokens:

    - every character [u] for which [is u] holds becomes its own token with
      [is_match = true];
    - each maximal run of other characters becomes one token with
      [is_match = false];
    - every malformed byte sequence reported by the decoder becomes its own
      token with [is_match = false], carrying the bytes reported by [Uutf].

    Decoded characters are always re-encoded as UTF-8 in the tokens, whatever
    [encoding] is. Tokens are produced in input order. Chunks are pulled from
    [seq] lazily, only when the decoder asks for more input.

    The returned sequence shares one decoder and one buffer created by this
    call, so it is meant to be traversed once. *)
let find_matches ?(encoding = `UTF_8) ~is seq =
  let decoder = Uutf.decoder ~encoding `Manual in
  (* Accumulates the current run of non-matching characters as UTF-8. *)
  let buf = Buffer.create 0x100 in
  (* Emit the pending non-matching run (if any) as one token in front of
     [rest]; keeps the tokens in input order. *)
  let flush_then rest =
    if Buffer.length buf = 0
    then rest
    else begin
      let pending = { S.str = Buffer.contents buf; is_match = false } in
      Buffer.clear buf ;
      Seq.Cons (pending, fun () -> rest)
    end in
  (* Hand the next non-empty chunk to the decoder and return the remaining
     chunks. Empty chunks are skipped: giving [Uutf.Manual.src] a length of 0
     means "end of input", which would silently drop every later chunk. *)
  let rec refill seq =
    match Seq.uncons seq with
    | Some ("", rest) -> refill rest
    | Some (chunk, rest) ->
        let bytes = Bytes.unsafe_of_string chunk in
        Uutf.Manual.src decoder bytes 0 (Bytes.length bytes) ;
        rest
    | None ->
        (* No more chunks: signal the end of input. *)
        Uutf.Manual.src decoder Bytes.empty 0 0 ;
        Seq.empty in
  let rec go seq () =
    match Uutf.decode decoder with
    | `Await -> go (refill seq) ()
    | `Malformed str ->
        (* Flush the text that precedes the malformed bytes first. *)
        flush_then (Seq.Cons ({ S.str; is_match = false }, go seq))
    | `Uchar uchr when is uchr ->
        let hit = { S.str = uchar_to_string uchr; is_match = true } in
        flush_then (Seq.Cons (hit, go seq))
    | `Uchar uchr ->
        Uutf.Buffer.add_utf_8 buf uchr ;
        go seq ()
    | `End -> flush_then Seq.Nil in
  go seq
(** [find_matches_on_bstr ?encoding ~is bstr] is like a token split over the
    whole contents of [bstr] (interpreted with [encoding], [`UTF_8] by
    default):

    - every character [u] for which [is u] holds becomes its own token with
      [is_match = true];
    - each maximal run of other characters becomes one token with
      [is_match = false];
    - every malformed byte sequence reported by the decoder becomes its own
      token with [is_match = false], carrying the bytes reported by [Uutf].

    Decoded characters are always re-encoded as UTF-8 in the tokens. Tokens
    are produced in input order. [bstr] is fed to the decoder lazily, in
    slices of at most [0x7ff] bytes copied into a scratch buffer.

    The returned sequence shares one decoder and its buffers created by this
    call, so it is meant to be traversed once. *)
let find_matches_on_bstr ?(encoding = `UTF_8) ~is bstr =
  let decoder = Uutf.decoder ~encoding `Manual in
  (* Accumulates the current run of non-matching characters as UTF-8. *)
  let buf = Buffer.create 0x100 in
  (* Scratch area the decoder reads from; refilled on every [`Await]. *)
  let tmp = Bytes.create 0x7ff in
  let total = Bstr.length bstr in
  (* Emit the pending non-matching run (if any) as one token in front of
     [rest]; keeps the tokens in input order. *)
  let flush_then rest =
    if Buffer.length buf = 0
    then rest
    else begin
      let pending = { S.str = Buffer.contents buf; is_match = false } in
      Buffer.clear buf ;
      Seq.Cons (pending, fun () -> rest)
    end in
  let rec go src_off () =
    match Uutf.decode decoder with
    | `Await ->
        if src_off = total
        then begin
          (* Everything was consumed: a length of 0 signals end of input. *)
          Uutf.Manual.src decoder Bytes.empty 0 0 ;
          go src_off ()
        end
        else begin
          let len = Int.min (total - src_off) (Bytes.length tmp) in
          Bstr.blit_to_bytes bstr ~src_off tmp ~dst_off:0 ~len ;
          Uutf.Manual.src decoder tmp 0 len ;
          go (src_off + len) ()
        end
    | `Malformed str ->
        (* Flush the text that precedes the malformed bytes first. *)
        flush_then (Seq.Cons ({ S.str; is_match = false }, go src_off))
    | `Uchar uchr when is uchr ->
        let hit = { S.str = uchar_to_string uchr; is_match = true } in
        flush_then (Seq.Cons (hit, go src_off))
    | `Uchar uchr ->
        Uutf.Buffer.add_utf_8 buf uchr ;
        go src_off ()
    | `End -> flush_then Seq.Nil in
  go 0