1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
(** *)
(** [dbg_mode] is [true] iff the environment variable XMLDIFF_DEBUG is set
    to exactly the one-character string 1. It is read once, when the module
    is initialised. An unset variable, or any other value, disables debug
    mode. *)
let dbg_mode =
  (* [Sys.getenv_opt] avoids the catch-all handler that the raising
     [Sys.getenv] needed. *)
  match Sys.getenv_opt "XMLDIFF_DEBUG" with
  | Some "1" -> true
  | Some _ | None -> false
(** [dbg msg] prints [msg] followed by a newline on standard error when
    [dbg_mode] is on, and does nothing otherwise. *)
let dbg = if dbg_mode then prerr_endline else ignore
(** [on_dbg f x] runs [f x] only when [dbg_mode] is on; [f] must return
    unit. *)
let on_dbg f x = if dbg_mode then f x
(** Maps keyed by strings (used in this file to index nodes by hash). *)
module Smap = Map.Make(String)
(** Maps keyed by integers (used in this file with node numbers as keys). *)
module Intmap = Map.Make
(struct type t = int let compare (x:int) y = Stdlib.compare x y end)
(** Sets of integers (used in this file for sets of node numbers). *)
module Intset = Set.Make
(struct type t = int let compare (x:int) y = Stdlib.compare x y end)
(** Maps keyed by pairs of strings, ordered lexicographically: the first
    components are compared first and the second components break ties.
    Used in this file for attribute sets, keyed by attribute name. *)
module Nmap = Map.Make (struct
    type t = string * string

    let compare (p1, s1) (p2, s2) =
      let c = String.compare p1 p2 in
      if c <> 0 then c else String.compare s1 s2
  end)
(** Qualified XML name, as defined by [Xmlm]. The functions in this file
    handle it as a pair of strings.
    NOTE(review): per the Xmlm documentation the pair is presumably
    (namespace URI, local name) - confirm. *)
type name = Xmlm.name
(** [string_of_name (prefix, local)] renders a name as [prefix:local], or
    just [local] when the first component is the empty string. *)
let string_of_name (prefix, local) =
  if prefix = "" then local else prefix ^ ":" ^ local
(** [atts_of_list l] builds an attribute map from an association list of
    [(name, value)] pairs. When a name occurs several times, the last
    occurrence in [l] wins. *)
let atts_of_list l =
  let add_binding map (key, value) = Nmap.add key value map in
  List.fold_left add_binding Nmap.empty l
(** Generic shape of an XML tree, parameterized by the type ['a] of the
    children: [`E (name, attributes, children)] is an element and [`D s]
    is character data. *)
type 'a xmlt = [
`E of name * string Nmap.t * 'a list
| `D of string ]
(** Plain XML tree: the children are XML trees themselves. *)
type xmltree = xmltree xmlt
(** XML tree in which every node carries an optional integer identifier. *)
type xmlnode = int option * xmlnode xmlt
(** Label of a node: [Node] for elements, [Text] for character data
    (see [label_of_xml]). *)
type label = Node of string | Text of string
(** Total order on labels: every [Node] label sorts before every [Text]
    label; labels of the same kind are ordered by their string. *)
let compare_label l1 l2 =
  match l1, l2 with
  | Node a, Node b | Text a, Text b -> String.compare a b
  | Node _, Text _ -> -1
  | Text _, Node _ -> 1
(** Maps keyed by node labels, ordered with [compare_label]. *)
module Lmap = Map.Make(struct type t = label let compare = compare_label end)
(** A node of a numbered XML tree. See [t_of_xml] for how the fields are
    filled in. *)
type node = {
(* Index of this node in the [nodes] array of its document. *)
number : int ;
(* Numbers of the child nodes, in document order. *)
children : int array ;
(* Number of the parent node; stays [None] for the root. *)
mutable parent : int option ;
(* The XML subtree rooted at this node. *)
xml : xmltree ;
(* Weight of the subtree, as computed by [weight]. *)
weight : float ;
(* Digest of the subtree, as computed by [hash]. *)
hash : string ;
(* Label used to compare nodes (see [t_of_xml] for cut nodes). *)
label : label ;
(* Position of this node among its siblings, starting at 0. *)
rank : int ;
(* [true] when the cut predicate given to [t_of_xml] accepted this node;
   such a node has no numbered children. *)
is_cut : bool ;
(* Number of the node of the other document this node is matched with,
   if any; set by [match_nodes]. *)
mutable matched : int option ;
}
(** A numbered document, as built by [t_of_xml]. *)
type doc = {
(* Height of the tree, as computed by [t_of_xml]. *)
height: int ;
(* Weight of the root node, i.e. of the last element of [nodes]. *)
w0 : float ;
(* All nodes, indexed by their [number]; the root is the last element. *)
nodes : node array ;
}
(** [file_of_string ~file s] writes [s] to [file], creating or truncating
    it. The channel is closed even when writing fails, so that no file
    descriptor leaks.
    @raise Sys_error if the file cannot be opened or written. *)
let file_of_string ~file s =
  let oc = open_out file in
  match output_string oc s with
  | () -> close_out oc
  | exception e ->
    (* Best-effort close so that the original error is the one reported. *)
    close_out_noerr oc;
    raise e
(** [string_of_atts map] renders the attributes as space-separated
    [name=value] items, each value being printed as a quoted, escaped
    string. Items appear in decreasing key order. *)
let string_of_atts map =
  let render (name, value) =
    Printf.sprintf "%s=%S" (string_of_name name) value
  in
  (* [bindings] is increasing; [rev_map] yields the decreasing order. *)
  String.concat " " (List.rev_map render (Nmap.bindings map))
(** [label_of_xml xml] is [Node tag_name] for an element and [Text s] for
    character data [s]. Attributes and children are ignored. *)
let label_of_xml xml =
  match xml with
  | `E (tag, _, _) -> Node (string_of_name tag)
  | `D s -> Text s
(** [atts_of_map map] lists the attributes as [(name, value)] pairs in
    increasing key order. *)
let atts_of_map map = Nmap.bindings map
(** [string_of_xml ?cut tree] serializes [tree] with Xmlm, without an XML
    declaration. When [cut] is [true] (default [false]) and [tree] is an
    element, its children are dropped before printing. *)
let string_of_xml ?(cut=false) tree =
  let shown =
    if not cut then tree
    else
      match tree with
      | `E (name, atts, _) -> `E (name, atts, [])
      | `D _ -> tree
  in
  let buf = Buffer.create 256 in
  let out =
    Xmlm.make_output ~ns_prefix:(fun uri -> Some uri) ~decl:false (`Buffer buf)
  in
  (* Conversion of our tree nodes to the fragments Xmlm expects. *)
  let to_frag = function
    | `D text -> `Data text
    | `E (tag, atts, subs) -> `El ((tag, atts_of_map atts), subs)
  in
  Xmlm.output_doc_tree to_frag out (None, shown);
  Buffer.contents buf
(** [hash xml] is a digest of the content of [xml].

    Character data and elements are prefixed differently, so a text node
    never collides with an element by construction of the hashed string.

    Elements are marshalled in a canonical form: attribute maps are replaced
    by their sorted binding lists, and sharing is disabled. Marshalling the
    maps themselves would make the digest depend on the internal shape of
    the balanced tree (hence on attribute insertion order) and on physical
    sharing between strings, so two semantically equal elements could get
    different hashes. *)
let hash xml =
  let rec canonical = function
    | `D s -> `D s
    | `E (name, atts, subs) ->
      `E (name, Nmap.bindings atts, List.map canonical subs)
  in
  let s =
    match xml with
    | `D s -> "!" ^ s
    | `E _ -> "<" ^ Marshal.to_string (canonical xml) [Marshal.No_sharing]
  in
  Digest.string s
(** [short_label xml] is a short description of a node for debug output:
    the tag between angle brackets for an element (with its prefix when the
    first name component is not empty), or the escaped first 10 characters
    at most for character data. *)
let short_label xml =
  match xml with
  | `D text ->
    let prefix = String.sub text 0 (min 10 (String.length text)) in
    (* %S escapes the string and wraps it in quotes; drop the quotes. *)
    let quoted = Printf.sprintf "%S" prefix in
    String.sub quoted 1 (String.length quoted - 2)
  | `E ((prefix, local), _, _) ->
    if prefix = "" then "<" ^ local ^ ">"
    else "<" ^ prefix ^ ":" ^ local ^ ">"
(** [xmlnode_of_t t] rebuilds, from the node array [t], the tree rooted at
    the last node, tagging every numbered node with [Some number]. The
    children of a cut node are not numbered: they are taken from the node's
    XML and tagged with [None], recursively.
    @raise Invalid_argument if [t] is empty. *)
let xmlnode_of_t t =
  (* Converts a plain XML tree into an xmlnode without identifiers. *)
  let rec detached xml =
    match xml with
    | `D s -> (None, `D s)
    | `E (tag, atts, subs) -> (None, `E (tag, atts, List.map detached subs))
  in
  let rec build idx =
    let node = t.(idx) in
    match node.xml with
    | `D s -> (Some idx, `D s)
    | `E (tag, atts, xml_subs) ->
      let subs =
        if node.is_cut then List.map detached xml_subs
        else Array.to_list (Array.map build node.children)
      in
      (Some idx, `E (tag, atts, subs))
  in
  build (Array.length t - 1)
(** [weight xml children] is the weight of a node: for character data,
    1 plus the natural logarithm of (1 + length of the text); for an
    element, 1 plus the sum of the weights of [children]. *)
let weight xml children =
  match xml with
  | `E _ ->
    List.fold_left (fun total child -> child.weight +. total) 1. children
  | `D text -> 1. +. log (float (1 + String.length text))
(** [t_of_xml ?cut xml] numbers the nodes of [xml] and returns the
    corresponding document.

    Nodes are numbered in post-order: a node gets its number after all its
    descendants, so the root is the last node of the array.

    When [cut] is given and [cut tag atts children] holds for an element,
    the element is marked [is_cut]: its children are not numbered and its
    label is [Node] of the serialization of the whole element. *)
let t_of_xml =
(* Fold step. The accumulator is (next free number, all nodes created so
   far, siblings created so far at this level, height seen so far at this
   level). *)
let rec iter ?cut (n0, acc, acc_children, h) xml =
let (label, subs, is_cut) =
match xml with
| `D _ -> (label_of_xml xml, [], false)
| `E (tag, atts, l) ->
match cut with
| Some f when f tag atts l -> (Node (string_of_xml xml), [], true)
| _ -> (label_of_xml xml, l, false)
in
(* Number the children first; [n] is then the number of this node. *)
let (n, acc, children, h_children) = List.fold_left (iter ?cut) (n0, acc, [], 0) subs in
(* Children were accumulated in reverse order. *)
let children = List.rev children in
List.iter (fun node -> node.parent <- Some n) children ;
let hash = hash xml in
let weight = weight xml children in
let node =
{ number = n ;
children = Array.of_list (List.map (fun node -> node.number) children) ;
parent = None ;
xml ; label ; hash ; weight ;
(* The rank is the number of siblings already created. Note that
   [List.length] is linear in that number. *)
rank = List.length acc_children ;
is_cut ;
matched = None ;
}
in
(n+1, node :: acc, node :: acc_children, max h (h_children + 1))
in
fun ?cut xml ->
let (_, l, _, h) = iter ?cut (0, [], [], 0) xml in
let t = Array.of_list l in
(* Put every node at the index equal to its number. *)
Array.sort (fun n1 n2 -> n1.number - n2.number) t;
Array.iteri (fun i node ->
assert (i = node.number)
) t;
{ height = h;
nodes = t;
w0 = t.(Array.length t - 1).weight ;
}
(** Edit actions computed by [make_actions]. Below, t1 is the source
    document and t2 the target document.
    - [Replace (n2, i)]: replace node number [i] of t1 by the XML of [n2].
    - [Move (i, parent, new_parent, rank)]: move node [i] of t1 under node
      [new_parent] of t1, at rank [rank]; [parent] is the number, in t2, of
      the parent of the node matched with [i].
    - [MoveRank (i, parent, rank)]: move node [i] of t1 to rank [rank]
      under its own parent [parent].
    - [Insert (n2, i, rank)]: insert the XML of [n2] under node [i] of t1,
      at rank [rank].
    - [Delete n1]: delete node [n1] of t1.
    - [Edit (n1, n2)]: update the text, or the tag and attributes, of [n1]
      with those of [n2]. *)
type operation =
| Replace of node * int
| Move of int * int * int * int
| MoveRank of int * int * int
| Insert of node * int * int
| Delete of node
| Edit of node * node
(** A list of edit actions. *)
type actions = operation list
(** Path designating a node in a tree.
    - [Path_cdata n] designates a character data node; [n] is the number of
      character data siblings to step over before it.
    - [Path_node (name, n, next)] designates an element called [name]; [n]
      is the number of same-named siblings to step over before it. When
      [next] is [Some p], the path continues with [p] among the children of
      that element. *)
type patch_path =
Path_cdata of int
| Path_node of Xmlm.name * int * patch_path option
(** Where to put a node relative to the node designated by a path: as its
    first child, or right after it among its siblings. *)
type position = [ `FirstChild | `After]
(** Operations of a patch, applied at the node designated by a path
    (see [patch_xmlnode]). *)
type patch_operation =
| PInsert of xmltree * position
| PDelete
| PUpdateCData of string
| PUpdateNode of Xmlm.name * string Nmap.t
| PReplace of xmltree
| PMove of patch_path * position
(** A patch: a list of operations, each with the path it applies to. *)
type patch = (patch_path * patch_operation) list
(** [xml_of_source s_source source] parses the Xmlm input [source] (read as
    UTF-8) into an XML tree. [s_source] is used as a prefix of error
    messages.
    @raise Failure with a message starting with [s_source] when Xmlm
    reports a parse error or raises [Invalid_argument]. *)
let xml_of_source s_source source =
  try
    let input =
      Xmlm.make_input ~ns:(fun s -> Some s) ~enc:(Some `UTF_8) source
    in
    let el (tag, atts) childs = `E (tag, atts_of_list atts, childs) in
    let data d = `D d in
    let (_, tree) = Xmlm.input_doc_tree ~el ~data input in
    tree
  with
  | Xmlm.Error ((line, col), error) ->
    failwith
      (Printf.sprintf "%sLine %d, column %d: %s"
         s_source line col (Xmlm.error_message error))
  | Invalid_argument e ->
    failwith (Printf.sprintf "%sInvalid_argument(%s)" s_source e)

(** [xml_of_string s] parses the XML document contained in [s].
    NOTE(review): [s] itself is passed as the error-message prefix, so a
    parse error message starts with the whole input - confirm that this is
    intended.
    @raise Failure on parse error, see [xml_of_source]. *)
let xml_of_string s = xml_of_source s (`String (0, s))
(** [xml_of_file file] parses the XML document stored in [file]. The input
    channel is closed whether parsing succeeds or fails.
    @raise Sys_error if the file cannot be opened.
    @raise Failure on parse error; the message starts with the file name. *)
let xml_of_file file =
  let ic = open_in file in
  let s_source = Printf.sprintf "File %S, " file in
  try
    let tree = xml_of_source s_source (`Channel ic) in
    close_in ic;
    tree
  with
  | exn ->
    close_in ic;
    raise exn
(** [dot_of_t t] renders the document [t] as a Graphviz digraph: one vertex
    per node, labelled with its number, short label and rank, and one edge
    from each node to each of its children. *)
let dot_of_t t =
  let buf = Buffer.create 256 in
  Buffer.add_string buf "digraph g {\nrankdir=TB;\nordering=out;\n";
  let emit_node node =
    Printf.bprintf buf "\"N%d\" [ label=\"%d: %s[%d]\", fontcolor=black ];\n"
      node.number node.number (short_label node.xml) node.rank;
    Array.iter
      (fun child ->
         Printf.bprintf buf "\"N%d\" -> \"N%d\";\n" node.number child)
      node.children
  in
  Array.iter emit_node t.nodes;
  Buffer.add_string buf "}\n";
  Buffer.contents buf
(** [dot_of_xmlnode t] renders the xmlnode tree [t] as a Graphviz digraph.
    Vertices are numbered in visit order starting at 1; each label shows
    the node identifier (an underscore when there is none), its short label
    and its rank among its siblings. The edge from the parent is emitted
    after the whole subtree of the child. *)
let dot_of_xmlnode t =
  let buf = Buffer.create 256 in
  Buffer.add_string buf "digraph g {\nrankdir=TB;\nordering=out;\n";
  let counter = ref 0 in
  let rec visit parent rank (id, xml) =
    incr counter;
    let me = !counter in
    let id_str =
      match id with
      | Some n -> string_of_int n
      | None -> "_"
    in
    Printf.bprintf buf "\"N%d\" [ label=\"%s: %s[%d]\", fontcolor=black ];\n"
      me id_str (short_label xml) rank;
    begin match xml with
      | `E (_, _, subs) -> List.iteri (visit (Some me)) subs
      | `D _ -> ()
    end;
    match parent with
    | Some up -> Printf.bprintf buf "\"N%d\" -> \"N%d\";\n" up me
    | None -> ()
  in
  visit None 0 t;
  Buffer.add_string buf "}\n";
  Buffer.contents buf
(** [dot_of_matches t1 t2] renders both documents as Graphviz clusters
    (vertices of [t2] are prefixed with T, those of [t1] with S) and draws
    a dashed edge from every matched node of [t1] to its match in [t2]. *)
let dot_of_matches t1 t2 =
  let buf = Buffer.create 256 in
  let out fmt = Printf.bprintf buf fmt in
  out "digraph g {\nrankdir=TB;\nordering=out;\n";
  out "subgraph cluster_2 {\n";
  let emit_target node =
    out "\"T%d\" [ label=\"%d: %s[%d]\", fontcolor=black ];\n"
      node.number node.number (short_label node.xml) node.rank;
    Array.iter
      (fun child -> out "\"T%d\" -> \"T%d\";\n" node.number child)
      node.children
  in
  Array.iter emit_target t2.nodes;
  out "}\n";
  out "subgraph cluster_1 {\n";
  let emit_source node =
    out "\"S%d\" [ label=\"%d: %s[%d]\", fontcolor=black ];\n"
      node.number node.number (short_label node.xml) node.rank;
    Array.iter
      (fun child -> out "\"S%d\" -> \"S%d\";\n" node.number child)
      node.children;
    match node.matched with
    | Some j -> out "S%d -> T%d [style=\"dashed\"];\n" node.number j
    | None -> ()
  in
  Array.iter emit_source t1.nodes;
  out "}\n";
  out "}\n";
  Buffer.contents buf
(** [string_of_action a] is a one-line description of the action [a], for
    debug output. XML is printed without children. *)
let string_of_action action =
  let cut_xml node = string_of_xml ~cut:true node.xml in
  match action with
  | Delete n1 ->
    Printf.sprintf "Delete(%d): %s" n1.number (cut_xml n1)
  | Edit (n1, n2) ->
    Printf.sprintf "Edit(%d,%d): %s -> %s" n1.number n2.number
      (cut_xml n1) (cut_xml n2)
  | Insert (n2, i, rank) ->
    Printf.sprintf "Insert (%d, %d, %d): %s" n2.number i rank (cut_xml n2)
  | Replace (n2, i) ->
    Printf.sprintf "Replace (%d, %d): %s" n2.number i (cut_xml n2)
  | Move (i, parent, new_parent, rank) ->
    Printf.sprintf "Move(%d,%d,%d,%d)" i parent new_parent rank
  | MoveRank (i, new_parent, rank) ->
    Printf.sprintf "MoveRank(%d,%d,%d)" i new_parent rank
(** [have_matching_parents nodes1 n1 n2] tells whether the parent of [n1]
    (looked up in [nodes1]) is matched with the parent of [n2]. Two nodes
    without parent are considered to have matching parents. *)
let have_matching_parents nodes1 n1 n2 =
  match n1.parent, n2.parent with
  | Some p1, Some p2 -> nodes1.(p1).matched = Some p2
  | None, None -> true
  | Some _, None | None, Some _ -> false
(** [matching_parent nodes n] is the number of the node matched with the
    parent of [n], where the parent is looked up in [nodes]. It is [None]
    when [n] has no parent or when its parent is not matched. *)
let matching_parent nodes n =
  match n.parent with
  | Some p -> nodes.(p).matched
  | None -> None
(** [add_edit_action acc n1 n2] prepends to [acc] the action needed to turn
    the content of [n1] into the content of [n2], children excluded:
    - two text nodes: an [Edit];
    - two elements, at least one of them cut: a [Replace] of [n1] by [n2];
    - two elements with the same tag and equal attributes: nothing;
    - two other elements: an [Edit].
    Mixing a text node with an element is a programming error. *)
let add_edit_action acc n1 n2 =
  match n1.xml, n2.xml with
  | `D _, `D _ -> Edit (n1, n2) :: acc
  | `E (tag1, atts1, _), `E (tag2, atts2, _) ->
    if n1.is_cut || n2.is_cut then Replace (n2, n1.number) :: acc
    else if tag1 = tag2 && Nmap.equal (=) atts1 atts2 then acc
    else Edit (n1, n2) :: acc
  | `E _, `D _ | `D _, `E _ -> assert false
(** [make_actions t1 t2] derives the list of edit actions from the matches
    recorded in the nodes of [t1] and [t2]. The list is built by consing,
    so it is not in application order; see [sort_actions]. *)
let make_actions t1 t2 =
let nodes1 = t1.nodes in
let nodes2 = t2.nodes in
(* First pass, over t1 from its root: deletions, moves and edits. The
   [rank] component of the accumulator is threaded through but its final
   value is discarded by every caller visible here. *)
let rec f (acc, rank) i =
let n1 = nodes1.(i) in
match n1.matched with
None ->
(* Unmatched node: delete its descendants and the node itself. *)
let (acc, _) = Array.fold_left f (acc, 0) n1.children in
((Delete n1) :: acc, rank + 1)
| Some j ->
let n2 = nodes2.(j) in
let matching_parents = have_matching_parents nodes1 n1 n2 in
let (deleted, acc) =
if matching_parents then
(* Same parent on both sides: at most a change of rank. *)
if n1.rank = n2.rank then
(false, acc)
else
(
let new_parent =
match n1.parent with None -> assert false | Some i -> i
in
(false, MoveRank(n1.number, new_parent, n2.rank) :: acc)
)
else
(
(* Parents differ: move under the t1 node matched with the parent of
   n2, or delete when that parent has no match. *)
match matching_parent nodes2 n2 with
None ->
dbg (Printf.sprintf
"make_actions: missing matching parent, j=%d, parent=%d"
j
(match nodes2.(j).parent with None -> -1 | Some n -> n)
);
(true, (Delete n1 :: acc))
| Some new_parent ->
let parent =
match n2.parent with None -> assert false | Some i -> i
in
(false, (Move(n1.number, parent, new_parent, n2.rank)) :: acc)
)
in
let acc =
if deleted then
acc
else
(* Equal hashes: nothing to edit here and the children are not visited. *)
if n1.hash = n2.hash then
acc
else
(
let acc = add_edit_action acc n1 n2 in
let (acc, _) = Array.fold_left f (acc, 0) n1.children in
acc
)
in
(acc, rank + 1)
in
let (actions, _) = f ([], 0) (Array.length nodes1 - 1) in
(* Second pass, over t2 from its root: insertions. An unmatched node is
   inserted as a whole, so its children are not visited. *)
let rec g acc j =
let n2 = nodes2.(j) in
match n2.matched with
None ->
dbg (Printf.sprintf "Insert %d which is not matched "n2.number);
let new_parent =
match matching_parent nodes2 n2 with
None ->
dbg (Printf.sprintf "no matching parent for t2.(%d)" n2.number);
assert false
| Some i -> i
in
(Insert (n2, new_parent, n2.rank)) :: acc
| Some _ ->
Array.fold_left g acc n2.children
in
let actions = g actions (Array.length nodes2 - 1) in
actions
(** [sort_actions l] sorts actions into application order, with a stable
    sort: all [Edit]s first, then all [Replace]s, then [Move], [MoveRank]
    and [Insert] actions by increasing target rank, and finally [Delete]s
    by decreasing node number. *)
let sort_actions =
  (* Group an action belongs to; groups are applied in increasing order. *)
  let category = function
    | Edit _ -> 0
    | Replace _ -> 1
    | Move _ | MoveRank _ | Insert _ -> 2
    | Delete _ -> 3
  in
  let target_rank = function
    | Move (_, _, _, rank) | MoveRank (_, _, rank) | Insert (_, _, rank) ->
      Some rank
    | Edit _ | Replace _ | Delete _ -> None
  in
  let cmp a1 a2 =
    let c1 = category a1 and c2 = category a2 in
    if c1 < c2 then -1
    else if c1 > c2 then 1
    else
      match a1, a2 with
      | Delete d1, Delete d2 -> d2.number - d1.number
      | _ ->
        match target_rank a1, target_rank a2 with
        | Some rank1, Some rank2 -> rank1 - rank2
        | _ -> 0
  in
  List.sort cmp
(** [build_hash_map t] maps each hash occurring in [t] to the numbers of
    the nodes having that hash, listed in increasing array order. *)
let build_hash_map t =
  let nodes = t.nodes in
  let map = ref Smap.empty in
  (* Going from the last node to the first while consing keeps each list
     in increasing array order. *)
  for i = Array.length nodes - 1 downto 0 do
    let node = nodes.(i) in
    let previous =
      match Smap.find node.hash !map with
      | l -> l
      | exception Not_found -> []
    in
    map := Smap.add node.hash (node.number :: previous) !map
  done;
  !map
(** [get_nth_parent t i level] is the number of the ancestor of node [i]
    located [level] steps up ([level] of 1 or less means the direct
    parent), or [None] when the root is reached before that. *)
let rec get_nth_parent t i level =
  match t.nodes.(i).parent with
  | Some p when level > 1 -> get_nth_parent t p (level - 1)
  | parent -> parent
(** [d_of_node t i] is 1 plus the height of [t] multiplied by the weight of
    node [i] and divided by the weight of the root. It is used in this file
    as a bound on the number of ancestor levels to consider for a node. *)
let d_of_node t i =
  let node = t.nodes.(i) in
  let scaled = float_of_int t.height *. node.weight /. t.w0 in
  1. +. scaled
(** [match_nodes ?fail ?with_subs t1 t2 i j] records that node [i] of [t1]
    and node [j] of [t2] are matched with each other.

    If either node is already matched with a different node, nothing is
    recorded; an assertion fails in that case when [fail] is [true]
    (the default).

    When [with_subs] is [true] (default [false]), the children are matched
    pairwise too, recursively. The node of [t2] must then have at least as
    many children as the node of [t1].
    @raise Invalid_argument if an index is out of bounds. *)
let rec match_nodes ?(fail=true) ?(with_subs=false) t1 t2 i j =
  dbg (Printf.sprintf "matching %d -> %d [with_subs=%B]" i j with_subs);
  let node1 = t1.nodes.(i) in
  let conflict1 =
    match node1.matched with
    | Some j2 -> if j2 <> j then Some j2 else None
    | None -> None
  in
  match conflict1 with
  | Some j2 ->
    dbg (Printf.sprintf "t1.(%d) already matched to t2.(%d)" i j2);
    if fail then assert false
  | None ->
    let node2 = t2.nodes.(j) in
    let conflict2 =
      match node2.matched with
      | Some i2 -> if i2 <> i then Some i2 else None
      | None -> None
    in
    match conflict2 with
    | Some i2 ->
      dbg (Printf.sprintf "t2.(%d) already matched to t1.(%d)" j i2);
      if fail then assert false
    | None ->
      node1.matched <- Some j;
      node2.matched <- Some i;
      if with_subs then
        Array.iteri
          (fun x child1 ->
             match_nodes ~with_subs:true t1 t2 child1 node2.children.(x))
          node1.children
(** [match_ancestors t1 t2 i j] walks up from node [i] of [t1] and node [j]
    of [t2] in parallel, matching the successive parents as long as both
    exist and have equal labels. At most [int_of_float (d_of_node t2 j)]
    levels are climbed. An ancestor already matched elsewhere is left
    untouched, but the walk continues. *)
let match_ancestors t1 t2 i j =
  let max_level = int_of_float (d_of_node t2 j) in
  dbg (Printf.sprintf "match_ancestors: i=%d, j=%d, max_level=%d" i j max_level);
  let rec climb level i j =
    if level <= max_level then begin
      match t1.nodes.(i).parent, t2.nodes.(j).parent with
      | Some p1, Some p2 ->
        if t1.nodes.(p1).label = t2.nodes.(p2).label then begin
          match_nodes ~fail:false t1 t2 p1 p2;
          climb (level + 1) p1 p2
        end
      | None, _ | _, None -> ()
    end
  in
  climb 1 i j
(** [min_list p v l] scans [l] from left to right with an accumulator
    starting at [v]: whenever [p x] is strictly smaller than the
    accumulator, the accumulator becomes [x] (not [p x]). The final
    accumulator is returned.
    NOTE(review): comparing [p x] but keeping [x] looks unusual; this
    function has no caller in the visible source - confirm the intent. *)
let min_list p v l =
  List.fold_left (fun best x -> if p x < best then x else best) v l
(** [best_candidate t1 t2 j cands] chooses, among the nodes [cands] of
    [t1], one whose ancestors agree with those of node [j] of [t2].

    For levels 1, 2, ... while the level is strictly below [d_of_node t2 j],
    each candidate is associated with its ancestor at that level. The
    result is the first candidate, in the current scan order, whose
    ancestor is matched with the ancestor of [j] at the same level.
    Candidates that run out of ancestors are dropped. The result is [None]
    when the level bound is reached or when [j] has no ancestor at the
    current level. *)
let best_candidate t1 t2 j cands =
dbg ("best_candidates "^(String.concat ", " (List.map string_of_int cands)));
let d = d_of_node t2 j in
(* Replaces the ancestor attached to a candidate by its parent; candidates
   without an ancestor are dropped. The result is accumulated in reverse
   order. *)
let map_parent acc = function
(i, None) -> acc
| (i, Some p) -> (i, t1.nodes.(p).parent) :: acc
in
(* Scans the candidates of the current level. The candidates that do not
   fit are kept in [acc] (in reverse scan order) for the next level. *)
let rec find level parent_j acc = function
[] -> iter (level + 1) acc
| (i, None) :: q -> find level parent_j acc q
| (i, Some p) :: q ->
match t1.nodes.(p).matched with
Some j when j = parent_j ->
Some i
| _ ->
find level parent_j ((i, Some p)::acc) q
and iter level cands =
if float level < d then
begin
(* [List.rev] undoes the reversal performed by [map_parent]. *)
let cands = List.rev (List.fold_left map_parent [] cands) in
match get_nth_parent t2 j level with
None -> None
| Some parent_j ->
find level parent_j [] cands
end
else
None
in
(* Each candidate starts as its own ancestor at level 0. *)
iter 1 (List.map (fun i -> (i, Some i)) cands)
(** [candidates hash_t1 t2 j] lists the node numbers recorded in [hash_t1]
    under the hash of node [j] of [t2]; the list is empty when that hash is
    not in the map. *)
let candidates hash_t1 t2 j =
  match Smap.find t2.nodes.(j).hash hash_t1 with
  | found -> found
  | exception Not_found -> []
(** [match_candidate hash_t1 t1 t2 j] picks a not yet matched node of [t1]
    having the same hash as node [j] of [t2]: none gives [None], a single
    one is returned directly, and several are arbitrated by
    [best_candidate]. *)
let match_candidate hash_t1 t1 t2 j =
  let unmatched i = t1.nodes.(i).matched = None in
  match List.filter unmatched (candidates hash_t1 t2 j) with
  | [] -> None
  | [only] -> Some only
  | several -> best_candidate t1 t2 j several
(** [map += (k, v)] adds [v] to the float bound to [k] in [map]; a missing
    binding counts as 0. *)
let (+=) map (k, v) =
  let current =
    match Intmap.find k map with
    | w -> w
    | exception Not_found -> 0.
  in
  Intmap.add k (v +. current) map
(** [match_uniquely_labeled t1 t2 li lj] considers the not yet matched
    nodes among [li] (numbers in [t1]) and [lj] (numbers in [t2]) and
    matches every pair of nodes sharing a label that occurs exactly once on
    each side. *)
let match_uniquely_labeled t1 t2 li lj =
  (* Adds the unmatched nodes of [indices] to [table], which maps a label
     to the pair (numbers in t1, numbers in t2); [push] says which side of
     the pair receives the number. *)
  let collect nodes indices push table =
    Array.fold_left
      (fun table idx ->
         let node = nodes.(idx) in
         match node.matched with
         | Some _ -> table
         | None ->
           let entry =
             match Lmap.find node.label table with
             | e -> e
             | exception Not_found -> ([], [])
           in
           Lmap.add node.label (push idx entry) table)
      table indices
  in
  let table =
    Lmap.empty
    |> collect t1.nodes li (fun i (from1, from2) -> (i :: from1, from2))
    |> collect t2.nodes lj (fun j (from1, from2) -> (from1, j :: from2))
  in
  Lmap.iter
    (fun _ pair ->
       match pair with
       | [i], [j] -> match_nodes t1 t2 i j
       | _ -> ())
    table
(** [match_uniquely_labeled_children t1 t2] visits the nodes of [t2] from
    the last one (the root) down to the first and, for every matched node,
    applies [match_uniquely_labeled] to its children and the children of
    its match in [t1]. *)
let match_uniquely_labeled_children t1 t2 =
  for j = Array.length t2.nodes - 1 downto 0 do
    let nj = t2.nodes.(j) in
    match nj.matched with
    | Some i ->
      match_uniquely_labeled t1 t2 t1.nodes.(i).children nj.children
    | None -> ()
  done
(** [run_phase4 t1 t2] tries to match the still unmatched nodes of [t2]
    through their children.

    For each unmatched node [j] of [t2], taken in increasing number order,
    the unmatched parents (in [t1]) of the nodes matched with the children
    of [j] are collected, each with the total weight of those children's
    matches. The parent with the greatest total weight is matched with [j];
    on a tie, the parent with the smallest number wins.
    [match_uniquely_labeled_children] is then applied. *)
let run_phase4 t1 t2 =
let f j node =
match node.matched with
Some _ -> ()
| None ->
(* Map from candidate parent in t1 to accumulated weight. *)
let parents = Array.fold_left
(fun acc jc ->
match t2.nodes.(jc).matched with
None -> acc
| Some i ->
dbg (Printf.sprintf "%d has a child %d matched to %d" j jc i);
match t1.nodes.(i).parent with
| Some p when t1.nodes.(p).matched = None ->
dbg (Printf.sprintf "%d has a non-matched parent %d" i p);
acc += (p, t1.nodes.(i).weight)
| Some p ->
dbg (Printf.sprintf "[j=%d] i=%d has a parent %d already matched" j i p);
acc
| None -> acc
)
Intmap.empty node.children
in
(* The fold goes through the keys in increasing order and only a strictly
   greater weight replaces the current best; -1 means no candidate. *)
let (parent, _) = Intmap.fold
(fun p w ((acc_parent, acc_w) as acc) ->
if w > acc_w then (p, w) else acc)
parents (-1, -1.0)
in
if parent >= 0 then match_nodes t1 t2 parent j
in
Array.iteri f t2.nodes;
match_uniquely_labeled_children t1 t2
(** Comparison putting heavier nodes first; nodes of equal weight are
    ordered by increasing rank. *)
let order_by_weight n1 n2 =
  let by_weight = Stdlib.compare n2.weight n1.weight in
  if by_weight <> 0 then by_weight else Stdlib.compare n1.rank n2.rank
(** [compute t1 t2] matches the nodes of [t1] and [t2] and returns the
    sorted list of actions turning [t1] into [t2].

    The two roots are matched first. The nodes of [t2] are then taken from
    a queue fed with the children of the root, heaviest first: a node for
    which a candidate with the same hash is found in [t1] is matched with
    it, together with its whole subtree and its ancestors; otherwise its
    children are queued. [run_phase4] completes the matching. In debug
    mode the matches are dumped to /tmp/matches.dot. *)
let compute t1 t2 =
  let pending = Queue.create () in
  let root2 = Array.length t2.nodes - 1 in
  let hash_t1 = build_hash_map t1 in
  match_nodes t1 t2 (Array.length t1.nodes - 1) root2;
  let enqueue_children children =
    let sorted = Array.map (fun c -> t2.nodes.(c)) children in
    Array.sort order_by_weight sorted;
    Array.iter (fun n -> Queue.add n.number pending) sorted
  in
  enqueue_children t2.nodes.(root2).children;
  while not (Queue.is_empty pending) do
    let j = Queue.pop pending in
    if t2.nodes.(j).matched = None then begin
      match match_candidate hash_t1 t1 t2 j with
      | None -> enqueue_children t2.nodes.(j).children
      | Some i ->
        match_nodes ~with_subs:true t1 t2 i j;
        match_ancestors t1 t2 i j
    end
  done;
  run_phase4 t1 t2;
  on_dbg
    (fun () -> file_of_string ~file:"/tmp/matches.dot" (dot_of_matches t1 t2))
    ();
  sort_actions (make_actions t1 t2)
(** Kind of a node, as counted when building paths: an element with a
    given name, or character data. *)
type cur_path = N of Xmlm.name | CData
(** Maps keyed by node kind; used in this file to count, per kind, the
    siblings already seen. *)
module Cur_path = Map.Make (struct type t = cur_path let compare = Stdlib.compare end)
(** [cur_path_get cp map] is the counter bound to [cp] in [map], or 0 when
    there is none. *)
let cur_path_get cp map =
  match Cur_path.find cp map with
  | count -> count
  | exception Not_found -> 0
(** [cur_path_inc cp map] increments the counter bound to [cp] in [map]; a
    missing counter starts at 0. *)
let cur_path_inc cp map = Cur_path.add cp (cur_path_get cp map + 1) map
(** [patch_path_of_cur_path_list l] converts a non-empty list of
    [(kind, index)] steps, outermost step first, into a [patch_path].
    A character data step is only allowed in last position; an empty list
    or a misplaced character data step is a programming error. *)
let patch_path_of_cur_path_list l =
  let rec build = function
    | [] -> None
    | (cp, n) :: rest ->
      let tail = build rest in
      match cp, tail with
      | N name, _ -> Some (Path_node (name, n, tail))
      | CData, None -> Some (Path_cdata n)
      | CData, Some _ -> assert false
  in
  match build l with
  | Some path -> path
  | None -> assert false
(** [string_of_path p] prints a path as slash-separated steps, each step
    being the element name, or CData, followed by its index between
    parentheses. *)
let rec string_of_path path =
  match path with
  | Path_cdata n -> "CData(" ^ string_of_int n ^ ")"
  | Path_node (name, n, next) ->
    let here = string_of_name name ^ "(" ^ string_of_int n ^ ")" in
    (match next with
     | Some rest -> here ^ "/" ^ string_of_path rest
     | None -> here)
(** [path_of_id to_move xmlnode ?rank ?skip i] searches [xmlnode],
    depth-first, for the node whose identifier is [i] and returns
    [(path, position)]:
    - without [rank], [path] designates node [i] and [position] is
      [`After];
    - with [~rank:0], [path] designates node [i] and [position] is
      [`FirstChild];
    - with any other [~rank:n], [path] designates the child of node [i]
      found after stepping over [n-1] counted children, and [position] is
      [`After].

    While looking for that child, children whose identifier is [skip]
    (default -1) are ignored entirely, and children whose identifier is in
    [to_move] are stepped over without counting toward the rank.

    Each step of a path is a kind (element name or character data) with
    the number of preceding siblings of the same kind.

    @raise Failure if no node has identifier [i], or if node [i] does not
    have enough children for [rank]. *)
let path_of_id =
let cp_of_xml = function
`D s -> CData
| `E (name,_,_) -> N name
in
(* [forward] walks a list of siblings; [n] is the number of counted
   siblings still to step over and [cur_path] holds the per-kind counters
   of the siblings already passed. The order of the cases matters. *)
let rec forward to_move xmlnode path cur_path ~skip n = function
(* The skipped node is ignored: it does not even update the counters.
   NOTE(review): presumably because a moved node is removed before being
   re-inserted (see the PMove case of [patch_xmlnode]) - confirm. *)
| (Some i, _) :: q when i = skip ->
forward to_move xmlnode path cur_path ~skip n q
(* No sibling left before reaching the rank: report the failure with the
   path built so far and the counters. *)
| [] ->
let b = Buffer.create 256 in
Buffer.add_string b (string_of_path (patch_path_of_cur_path_list (List.rev path)));
Buffer.add_char b '{';
Cur_path.iter (fun k n -> Printf.bprintf b "%s->%d,"
(match k with N name -> string_of_name name | CData -> "cdata") n) cur_path ;
Buffer.add_char b '}';
on_dbg (fun () -> file_of_string ~file: "/tmp/xmldiff_state.dot" (dot_of_xmlnode xmlnode)) ();
failwith (Printf.sprintf "Invalid rank: %d element missing\npath: %s" (n+1) (Buffer.contents b))
(* A node still to be moved or deleted updates the counters but does not
   count toward the rank. *)
| (Some id, xml) :: q when Intset.mem id to_move ->
let cur_path =
let cp = cp_of_xml xml in
cur_path_inc cp cur_path
in
forward to_move xmlnode path cur_path ~skip n q
(* Rank reached: this sibling is the target. *)
| (_, xml) :: _ when n = 0 ->
let cp = cp_of_xml xml in
(cp, cur_path_get cp cur_path) :: path
| (id, xml) :: q ->
let cur_path =
let cp = cp_of_xml xml in
cur_path_inc cp cur_path
in
forward to_move xmlnode path cur_path ~skip (n-1) q
in
(* [iter] looks for id [i] in one node; [path] is the reversed list of
   steps down to the parent and [cur_path] holds the counters of the
   preceding siblings. [Not_found] signals that [i] is not in this
   subtree. *)
let rec iter to_move xmlnode ~rank ~skip i path cur_path = function
| (Some j, xml) when i = j ->
begin
let cp = cp_of_xml xml in
let path = (cp, cur_path_get cp cur_path) :: path in
let (path, pos) =
match rank with
None -> (path, `After)
| Some 0 -> (path, `FirstChild)
| Some n ->
match xml with
`D _ -> assert false
| `E (_,_,subs) -> (forward to_move xmlnode path Cur_path.empty ~skip (n-1) subs, `After)
in
(patch_path_of_cur_path_list (List.rev path), pos)
end
| (_, `D _) -> raise Not_found
| (_, `E (name, atts, subs)) ->
let cpt = cur_path_get (N name) cur_path in
let path = (N name, cpt) :: path in
(* Counters start afresh among the children. *)
iter_list to_move xmlnode ~rank ~skip i path Cur_path.empty subs
(* [iter_list] tries each sibling in turn, counting the ones that did not
   contain [i]. *)
and iter_list to_move xmlnode ~rank ~skip i path cur_path = function
[] -> raise Not_found
| h :: q ->
try iter to_move xmlnode ~rank ~skip i path cur_path h
with Not_found ->
let cur_path =
let cp = cp_of_xml (snd h) in
cur_path_inc cp cur_path
in
iter_list to_move xmlnode ~rank ~skip i path cur_path q
in
fun to_move xmlnode ?rank ?(skip=(-1)) i ->
try iter to_move xmlnode ~rank ~skip i [] Cur_path.empty xmlnode
with Not_found ->
let msg = "Id "^(string_of_int i)^" not found" in
failwith msg
(** [xmlnode_of_xmltree tree] tags every node of [tree] with [None],
    turning a plain XML tree into an xmlnode without identifiers. *)
let rec xmlnode_of_xmltree tree =
  match tree with
  | `E (name, atts, subs) ->
    (None, `E (name, atts, List.map xmlnode_of_xmltree subs))
  | `D s -> (None, `D s)
(** [string_of_position pos] is the name of the position constructor. *)
let string_of_position pos =
  match pos with
  | `After -> "After"
  | `FirstChild -> "FirstChild"
(** [string_of_patch_operation (path, op)] is a one-line description of a
    patch operation and the path it applies to. XML is printed without
    children and the attributes of a node update are not printed. *)
let string_of_patch_operation (path, op) =
  let target = string_of_path path in
  match op with
  | PDelete -> String.concat "" ["DELETE("; target; ")"]
  | PReplace xmltree ->
    String.concat ""
      ["REPLACE("; target; ", "; string_of_xml ~cut:true xmltree; ")"]
  | PInsert (xmltree, pos) ->
    String.concat ""
      ["INSERT("; target; ", "; string_of_xml ~cut:true xmltree; ", ";
       string_of_position pos; ")"]
  | PUpdateCData s -> Printf.sprintf "UPDATE_CDATA(%s, %S)" target s
  | PUpdateNode (name, _) ->
    Printf.sprintf "UPDATE_NODE(%s, %S, _)" target (string_of_name name)
  | PMove (newpath, pos) ->
    Printf.sprintf "MOVE(%s, %s, %s)"
      target (string_of_path newpath) (string_of_position pos)
(** [string_of_patch l] describes a patch, one operation per line. *)
let string_of_patch l =
  l |> List.map string_of_patch_operation |> String.concat "\n"
(** [remove_xmlnode t path] removes from [t] the node designated by [path]
    and returns [(removed, t')] where [t'] is [t] without that node.
    A path that designates no node, or that designates the root of [t], is
    a programming error (assertion failure). *)
let remove_xmlnode t path =
(* [iter xmls path] looks for the node among the siblings [xmls] and
   returns the removed node with the remaining siblings. The order of the
   cases matters. *)
let rec iter xmls path =
match xmls, path with
(* Character data with index 0: this is the node to remove. *)
((x, `D _) as xml):: q, Path_cdata 0 -> (xml, q)
(* Another character data sibling: one less to step over. *)
| (x, `D s) :: q, Path_cdata n ->
let (removed, xmls) = iter q (Path_cdata (n-1)) in
(removed, (x, `D s) :: xmls)
| ((x, `E (name,atts,subs) as xml) :: q, Path_node (name2, n, next)) when name = name2 ->
if n = 0 then
(* Element reached: remove it, or go on among its children. *)
(match next with
None -> (xml, q)
| Some p ->
let (removed, xmls) = iter subs p in
(removed, [x, `E (name, atts, xmls)] @ q)
)
else
(
let (removed, xmls) = iter q (Path_node (name2, n-1, next)) in
(removed, xml :: xmls)
)
(* Sibling of another kind or name: kept, not counted. *)
| xml :: q, p ->
let (removed, xmls) = iter q p in
(removed, xml :: xmls)
| [], _ -> assert false
in
match iter [t] path with
removed, [t] -> (removed, t)
| _ -> assert false
(** [insert_xmlnode t node path pos] inserts [node] in [t] relative to the
    node designated by [path]: as its first child when [pos] is
    [`FirstChild], or right after it among its siblings when [pos] is
    [`After]. Inserting a first child into character data, a path that
    designates no node, or an insertion after the root are programming
    errors (assertion failures). *)
let insert_xmlnode t node path pos =
(* [iter xmls path] returns the siblings [xmls] with [node] inserted. The
   order of the cases matters. *)
let rec iter xmls path =
match xmls, path with
((x, `D _) as xml):: q, Path_cdata 0 ->
begin
match pos with
`FirstChild -> assert false
| `After -> xml :: node :: q
end
(* Another character data sibling: one less to step over. *)
| (x, `D s) :: q, Path_cdata n ->
(x, `D s) :: iter q (Path_cdata (n-1))
| ((x, `E (name,atts,subs) as xml) :: q, Path_node (name2, n, next)) when name = name2 ->
if n = 0 then
(* Element reached: insert here, or go on among its children. *)
(match next with
None ->
begin
match pos with
`FirstChild -> (x, `E(name,atts,node::subs)) :: q
| `After -> xml :: node :: q
end
| Some p -> [x, `E (name, atts, iter subs p)] @ q
)
else
xml :: iter q (Path_node (name2, n-1, next))
(* Sibling of another kind or name: kept, not counted. *)
| xml :: q, p ->
xml :: iter q p
| [], _ -> assert false
in
match iter [t] path with
[t] -> t
| _ -> assert false
(** [patch_xmlnode t path op] applies the operation [op] at the node of [t]
    designated by [path] and returns the resulting tree.

    A move is performed by removing the node designated by [path], then
    inserting it at the destination path in the resulting tree.

    @raise Failure if a [Failure] occurs while applying the operation; the
    message is extended with a description of the operation. A path that
    designates no node is a programming error (assertion failure). *)
let patch_xmlnode t path op =
try
match op with
PMove (newpath, pos) ->
dbg (string_of_patch_operation (path, op));
let removed, t = remove_xmlnode t path in
dbg ("node removed");
insert_xmlnode t removed newpath pos
| _ ->
(* [apply xml op] returns the list of nodes replacing [xml]. *)
let apply xml op =
match xml, op with
| _, PReplace tree -> [xmlnode_of_xmltree tree]
| _, PInsert (tree, `After) -> [ xml ; xmlnode_of_xmltree tree ]
| (_, `D _), PInsert (_, `FirstChild) -> assert false
| (x, `E (tag,atts,subs)), PInsert (tree, `FirstChild) ->
[ (x, `E (tag, atts, xmlnode_of_xmltree tree :: subs)) ]
| _, PDelete -> []
(* Updates keep the identifier of the node. *)
| (x, _), PUpdateCData s -> [(x, `D s)]
| (x, `D _), PUpdateNode (name, atts) -> [x, `E (name,atts,[])]
| (x, `E (_,_,subs)), PUpdateNode (name, atts) -> [x, `E (name,atts,subs)]
(* Moves are handled above. *)
| (_,_), PMove (_, _) -> assert false
in
(* Same traversal as in [remove_xmlnode]: only siblings of the kind given
   by the path are counted. The order of the cases matters. *)
let rec iter xmls path =
match xmls, path with
((x, `D _) as xml):: q, Path_cdata 0 -> (apply xml op) @ q
| (x, `D s) :: q, Path_cdata n ->
(x, `D s) :: iter q (Path_cdata (n-1))
| ((x, `E (name,atts,subs) as xml) :: q, Path_node (name2, n, next)) when name = name2 ->
if n = 0 then
(match next with
None -> (apply xml op) @ q
| Some p -> [x, `E (name, atts, iter subs p)] @ q
)
else
xml :: iter q (Path_node (name2, n-1, next))
| xml :: q, p ->
xml :: iter q p
| [], _ -> assert false
in
(* The result must be a single tree; e.g. deleting the root is not
   supported. *)
match iter [t] path with
[t] -> t
| _ -> assert false
with
Failure msg ->
failwith ("Error: "^msg^" when applying path:\n"^
(string_of_patch_operation (path, op)))
(** [patch_of_action (t1, to_move, patch) action] translates [action] into
    a patch operation, applies it to the working tree [t1] and returns the
    updated state:
    - the patched tree,
    - [to_move] without the node handled by a move or a deletion,
    - [patch] with the new [(path, operation)] pair consed in front.

    A [MoveRank] that leaves the tree unchanged adds nothing to the patch.
    A [Delete] of a node that is no longer in the tree (for instance
    because one of its ancestors was deleted before) adds nothing either.

    @raise Failure if a path cannot be computed (except for [Delete]) or
    if the operation cannot be applied. *)
let patch_of_action (t1, to_move, patch) action =
  dbg ("patch_of_action: "^string_of_action action);
  dbg ("to_move = "^(String.concat ", " (List.map string_of_int (Intset.elements to_move))));
  match action with
  | Replace (n2, i) ->
    let (path, _) = path_of_id to_move t1 i in
    let op = PReplace n2.xml in
    let t1 = patch_xmlnode t1 path op in
    (t1, to_move, (path, op) :: patch)
  | Move (i, _parent, new_parent, rank) ->
    let (path, _) = path_of_id to_move t1 i in
    let (new_path, pos) = path_of_id to_move t1 ~rank ~skip:i new_parent in
    let op = PMove (new_path, pos) in
    let t1 = patch_xmlnode t1 path op in
    (t1, Intset.remove i to_move, (path, op) :: patch)
  | MoveRank (i, parent, rank) ->
    let (path, _) = path_of_id to_move t1 i in
    let (new_path, pos) = path_of_id to_move t1 ~rank ~skip:i parent in
    let op = PMove (new_path, pos) in
    let t1_patched = patch_xmlnode t1 path op in
    let to_move = Intset.remove i to_move in
    (* Do not record a move that has no effect. *)
    if t1_patched = t1 then (t1, to_move, patch)
    else (t1_patched, to_move, (path, op) :: patch)
  | Insert (n2, i, rank) ->
    let (path, pos) = path_of_id to_move t1 ~rank i in
    let op = PInsert (n2.xml, pos) in
    let t1 = patch_xmlnode t1 path op in
    (t1, to_move, (path, op) :: patch)
  | Delete n1 ->
    let id = n1.number in
    let remaining = Intset.remove id to_move in
    begin
      match path_of_id to_move t1 id with
      (* [path_of_id] reports a missing node with [Failure]; only that is
         tolerated here, so that assertion failures and other unexpected
         exceptions are no longer silently swallowed. *)
      | exception Failure _ -> (t1, remaining, patch)
      | (path, _) ->
        let op = PDelete in
        let t1 = patch_xmlnode t1 path op in
        (t1, remaining, (path, op) :: patch)
    end
  | Edit (n1, n2) ->
    let (path, _) = path_of_id to_move t1 n1.number in
    (* The operation only depends on the kind of the new content. *)
    let op =
      match n2.xml with
      | `D s2 -> PUpdateCData s2
      | `E (name, atts, _) -> PUpdateNode (name, atts)
    in
    let t1 = patch_xmlnode t1 path op in
    (t1, to_move, (path, op) :: patch)
(** [xmltree_of_xmlnode node] drops the identifiers of an xmlnode, giving
    back a plain XML tree. *)
let rec xmltree_of_xmlnode (_, xml) =
  match xml with
  | `E (tag, atts, subs) -> `E (tag, atts, List.map xmltree_of_xmlnode subs)
  | `D s -> `D s
(** [nodes_to_move actions] is the set of the numbers of the nodes that
    [actions] move (with [Move] or [MoveRank]) or delete. *)
let nodes_to_move actions =
  List.fold_left
    (fun acc action ->
       match action with
       | Move (i, _, _, _) | MoveRank (i, _, _) -> Intset.add i acc
       | Delete n -> Intset.add n.number acc
       | Replace _ | Insert _ | Edit _ -> acc)
    Intset.empty actions
(** [patch_of_actions t1 t2 actions] applies [actions] in order to the tree
    of [t1] and returns [(patch, tree)]: the patch operations in
    application order and the resulting XML tree. Every node to move or
    delete must have been handled once all actions are applied (checked by
    an assertion). In debug mode, intermediate states are dumped to files
    under /tmp; [t2] is only used for that. *)
let patch_of_actions t1 t2 actions =
  let initial_to_move = nodes_to_move actions in
  let start = xmlnode_of_t t1.nodes in
  on_dbg
    (fun () ->
       file_of_string ~file: "/tmp/before_patch.dot" (dot_of_xmlnode start))
    ();
  let (patched, remaining, rev_patch) =
    List.fold_left patch_of_action (start, initial_to_move, []) actions
  in
  assert (Intset.is_empty remaining);
  on_dbg
    (fun () ->
       file_of_string ~file: "/tmp/patch_result.dot" (dot_of_xmlnode patched))
    ();
  let result = xmltree_of_xmlnode patched in
  on_dbg
    (fun () ->
       let expected = xmltree_of_xmlnode (xmlnode_of_t t2.nodes) in
       file_of_string ~file: "/tmp/xml1.xml" (string_of_xml result);
       file_of_string ~file: "/tmp/xml2.xml" (string_of_xml expected))
    ();
  (* Operations were consed in front of the patch as they were applied. *)
  (List.rev rev_patch, result)
(** [diff_with_final_tree ?cut xml1 xml2] computes a patch turning [xml1]
    into [xml2] and returns it with the tree obtained by applying it to
    [xml1]. [cut] is passed to [t_of_xml] for both trees. In debug mode,
    both numbered trees are dumped to /tmp/t1.dot and /tmp/t2.dot and the
    actions are printed. *)
let diff_with_final_tree ?cut xml1 xml2 =
  let doc1 = t_of_xml ?cut xml1 in
  let doc2 = t_of_xml ?cut xml2 in
  on_dbg
    (fun () ->
       file_of_string ~file: "/tmp/t1.dot" (dot_of_t doc1);
       file_of_string ~file: "/tmp/t2.dot" (dot_of_t doc2))
    ();
  let actions = compute doc1 doc2 in
  let described = List.map string_of_action actions in
  dbg ("actions=\n  "^(String.concat "\n  " described));
  patch_of_actions doc1 doc2 actions
(** [diff ?cut xml1 xml2] is the patch turning [xml1] into [xml2]; see
    [diff_with_final_tree]. *)
let diff ?cut xml1 xml2 =
  let (patch, _) = diff_with_final_tree ?cut xml1 xml2 in
  patch