1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
open! Core
open Lwt.Syntax
open Lwt.Infix
type location = {
sheet_number: int;
row_number: int;
col_index: int;
}
[@@deriving sexp_of]
type 'a cell_parser = {
string: location -> string -> 'a;
formula: location -> formula:string -> string -> 'a;
error: location -> string -> 'a;
boolean: location -> string -> 'a;
number: location -> string -> 'a;
date: location -> string -> 'a;
null: 'a;
}
type delayed_string = {
location: location;
sst_index: string;
}
[@@deriving sexp_of]
type 'a status =
| Available of 'a
| Delayed of delayed_string
[@@deriving sexp_of]
type 'a row = {
sheet_number: int;
row_number: int;
data: 'a array;
}
[@@deriving sexp_of]
let origin = Date.add_days (Date.create_exn ~y:1900 ~m:(Month.of_int_exn 1) ~d:1) (-2)
let parse_date f = Float.to_int f |> Date.add_days origin
let parse_datetime ~zone f =
let parts = Float.modf f in
let date = Float.Parts.integral parts |> Float.to_int |> Date.add_days origin in
let frac = Float.(Parts.fractional parts * 86400000. |> round) |> Time.Span.of_ms in
let ofday = Time.Ofday.of_span_since_start_of_day_exn frac in
Time.of_date_ofday ~zone date ofday
let xml_parser =
Xml.make_parser Xml.{ accept_html_boolean_attributes = false; accept_unquoted_attributes = false }
let fold_angstrom ~filter_path ~on_match =
let open Angstrom.Buffered in
let rec loop state acc =
match state, acc with
| Done ({ buf; off = pos; len }, node), acc -> (
let acc = Xml.SAX.Stream.folder ~filter_path ~on_match acc node in
match parse xml_parser with
| Partial feed -> (loop [@tailcall]) (feed (`Bigstring (Bigstring.sub_shared buf ~pos ~len))) acc
| state -> (loop [@tailcall]) state acc)
| state -> state
in
let f _entry bs ~len = function
| (_, Error _) as x -> x
| (Fail (_, [], err) as parse), _ -> parse, Error err
| (Fail (_, marks, err) as parse), _ ->
parse, Error (sprintf "%s: %s" (String.concat ~sep:" > " marks) err)
| (Done _ as parse), Ok _ -> parse, Error "Impossible case Done. Please report this bug."
| Partial feed, (Ok _ as acc) -> loop (feed (`Bigstring (Bigstring.sub_shared bs ~pos:0 ~len))) acc
in
Zip.Action.Fold_bigstring { init = parse xml_parser, Ok Xml.SAX.Stream.init; f }
let parse_sheet ~sheet_number push =
let num = ref 0 in
let on_match (el : Xml.DOM.element) =
(match Xml.get_attr el.attrs "r" with
| None -> incr num
| Some s -> (
try
let i = Int.of_string s in
for row_number = !num to i - 2 do
push { sheet_number; row_number; data = [||] }
done;
num := i
with
| _ -> incr num));
push { sheet_number; row_number = !num; data = el.children }
in
fold_angstrom ~filter_path:[ "worksheet"; "sheetData"; "row" ] ~on_match
let parse_string_cell el =
let open Xml.DOM in
match el |> dot "t" with
| Some { text; _ } -> text
| None -> filter_map "r" el ~f:(dot_text "t") |> String.concat_array
let flush_zip_stream zip_stream =
Lwt_stream.filter_map
(function
| Zip.{ filename; _ }, Zip.Data.Fold_bigstring (_, Error msg) ->
Some (sprintf "XLSX Parsing error in %s: %s" filename msg)
| _ -> None)
zip_stream
|> Lwt_stream.to_list
>>= function
| [] -> Lwt.return_unit
| errors -> failwith (String.concat ~sep:". " errors)
module SST = struct
type t = string Lazy.t array
let filter_path = [ "sst"; "si" ]
let zip_entry_filename = "xl/sharedStrings.xml"
let from_zip ~feed =
let q = Queue.create () in
let zip_stream, zip_success =
Zip.stream_files ~feed (function
| { filename = "xl/sharedStrings.xml"; _ } ->
let on_match el = Queue.enqueue q (lazy (parse_string_cell el)) in
fold_angstrom ~filter_path ~on_match
| _ -> Zip.Action.Skip)
in
let sst_p =
let+ () = flush_zip_stream zip_stream in
Queue.to_array q
in
let* () = zip_success in
sst_p
end
let process_file ?only_sheet ~skip_sst ~feed push finalize =
let q = Queue.create () in
let zip_stream, zip_success =
Zip.stream_files ~feed (function
| { filename = "xl/workbook.xml"; _ } -> Skip
| { filename = "xl/sharedStrings.xml"; _ } when skip_sst -> Skip
| { filename = "xl/sharedStrings.xml"; _ } ->
let on_match el = Queue.enqueue q (lazy (parse_string_cell el)) in
fold_angstrom ~filter_path:SST.filter_path ~on_match
| { filename; _ } ->
Option.try_with (fun () -> Scanf.sscanf filename "xl/worksheets/%[sS]heet%d.xml" (fun _ d -> d))
|> Option.bind ~f:(fun i ->
if Option.value_map only_sheet ~default:true ~f:(( = ) i)
then Some (parse_sheet ~sheet_number:i push)
else None)
|> Option.value ~default:Zip.Action.Skip)
in
let result_p =
Lwt.finalize
(fun () -> flush_zip_stream zip_stream)
(fun () ->
finalize ();
Lwt.return_unit)
in
let success =
let* () = zip_success in
result_p
in
let sst_p =
let+ () = result_p in
Queue.to_array q
in
sst_p, success
let resolve_sst_index (sst : SST.t) ~sst_index =
let index = Int.of_string sst_index in
match sst with
| sst when index < Array.length sst && index >= 0 -> Some (force sst.(index))
| _ -> None
let unwrap_status cell_parser (sst : SST.t) (row : 'a status row) =
let data =
Array.map row.data ~f:(function
| Available x -> x
| Delayed { location; sst_index } -> (
match resolve_sst_index sst ~sst_index with
| Some index -> cell_parser.string location index
| None -> cell_parser.null))
in
{ row with data }
let , =
let open Xml.DOM in
let ~null location : element option -> 'a = function
| None -> null
| Some { text; _ } -> extractor location text
in
let { string; formula; error; boolean; number; date; null } location el ty =
match ty with
| None
|Some "n" ->
el |> dot "v" |> extract ~null location number
| Some "d" -> el |> dot "v" |> extract ~null location date
| Some "str" -> (
match el |> dot_text "v" with
| None -> null
| Some s -> formula location s ~formula:(el |> dot_text "f" |> Option.value ~default:""))
| Some "inlineStr" -> (
match dot "is" el with
| None -> null
| Some el -> string location (parse_string_cell el))
| Some "e" -> el |> dot "v" |> extract ~null location error
| Some "b" -> el |> dot "v" |> extract ~null location boolean
| Some t -> failwithf "Unknown data type: %s ::: %s" t (sexp_of_element el |> Sexp.to_string) ()
in
let sst cell_parser location el =
match Xml.get_attr el.attrs "t" with
| Some "s" -> (
match el |> dot "v" with
| None -> cell_parser.null
| Some { text = sst_index; _ } -> (
match resolve_sst_index sst ~sst_index with
| None -> cell_parser.null
| Some resolved -> cell_parser.string location resolved))
| ty -> extract_cell_base cell_parser location el ty
in
let cell_parser location el =
match Xml.get_attr el.attrs "t" with
| Some "s" -> (
match el |> dot "v" with
| None -> Available cell_parser.null
| Some { text = sst_index; _ } -> Delayed { location; sst_index })
| ty -> Available (extract_cell_base cell_parser location el ty)
in
extract_cell_sst, extract_cell_status
let col_cache = String.Table.create ()
let index_of_column s =
let key =
String.take_while s ~f:(function
| 'A' .. 'Z' -> true
| _ -> false)
in
Hashtbl.find_or_add col_cache key ~default:(fun () ->
String.fold key ~init:0 ~f:(fun acc c -> (acc * 26) + Char.to_int c - 64) - 1)
let row_width num_cells data =
let open Xml.DOM in
Xml.get_attr (Array.last data).attrs "r" |> function
| None -> num_cells
| Some r -> max num_cells (index_of_column r + 1)
let parse_row_with_sst sst cell_parser ({ data; sheet_number; row_number } as row) =
let open Xml.DOM in
match Array.length data with
| 0 -> { row with data = [||] }
| num_cells ->
let num_cols = row_width num_cells data in
let new_data = Array.create ~len:num_cols cell_parser.null in
Array.iteri data ~f:(fun i el ->
let col_index = Xml.get_attr el.attrs "r" |> Option.value_map ~default:i ~f:index_of_column in
let v = extract_cell_sst sst cell_parser { col_index; sheet_number; row_number } el in
new_data.(col_index) <- v);
{ row with data = new_data }
let parse_row_without_sst cell_parser ({ data; sheet_number; row_number } as row) =
let open Xml.DOM in
match Array.length data with
| 0 -> { row with data = [||] }
| num_cells ->
let num_cols = row_width num_cells data in
let new_data = Array.create ~len:num_cols (Available cell_parser.null) in
Array.iteri data ~f:(fun i el ->
let col_index = Xml.get_attr el.attrs "r" |> Option.value_map ~default:i ~f:index_of_column in
let v = extract_cell_status cell_parser { col_index; sheet_number; row_number } el in
new_data.(col_index) <- v);
{ row with data = new_data }
let stream_rows ?only_sheet ?(skip_sst = false) ~feed cell_parser =
let stream, push = Lwt_stream.create () in
let finalize () = push None in
let sst_p, processed_p =
process_file ?only_sheet ~skip_sst ~feed
(fun x -> push (Some (parse_row_without_sst cell_parser x)))
finalize
in
stream, sst_p, processed_p
let stream_rows_unparsed ?only_sheet ?(skip_sst = false) ~feed () =
let stream, push = Lwt_stream.create () in
let finalize () = push None in
let sst_p, processed_p = process_file ?only_sheet ~skip_sst ~feed (fun x -> push (Some x)) finalize in
stream, sst_p, processed_p
let stream_rows_buffer ?only_sheet ~feed cell_parser =
let stream, push = Lwt_stream.create () in
let finalize () = push None in
let sst_p, processed_p =
process_file ?only_sheet ~skip_sst:false ~feed (fun x -> push (Some x)) finalize
in
let parsed_stream =
Lwt_stream.map_s
(fun row -> sst_p |> Lwt.map (fun sst -> parse_row_with_sst sst cell_parser row))
stream
in
parsed_stream, processed_p
let yojson_cell_parser : [> `Bool of bool | `Float of float | `String of string | `Null ] cell_parser =
{
string = (fun _location s -> `String (Xml.unescape s));
formula = (fun _location ~formula:_ s -> `String (Xml.unescape s));
error = (fun _location s -> `String (sprintf "#ERROR# %s" s));
boolean = (fun _location s -> `Bool String.(s = "1"));
number = (fun _location s -> `Float (Float.of_string s));
date = (fun _location s -> `String s);
null = `Null;
}