Source file encoded_word.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
open Core
open Angstrom

module Let_syntax = struct
  let bind t ~f = t >>= f
  let map t ~f = t >>| f
  let both a b = lift2 Tuple2.create a b
end

let ws = take_while1 Char.is_whitespace

let charset =
  choice
    (* The following might not be an exhaustive list. We can add to this as we encounter
       more cases. *)
    [ string_ci "US-ASCII" >>| const `Ascii
    ; string_ci "UTF-8" >>| const `Utf8
    ; string_ci "ISO-8859-1" >>| const `Latin1
    ; string_ci "ISO-8859-2" >>| const `Latin2
    ]
;;

let encoding : [ `Base64 | `Quoted_printable ] Angstrom.t =
  choice [ string_ci "B" >>| const `Base64; string_ci "Q" >>| const `Quoted_printable ]
;;

let parser_ : string Angstrom.t =
  let%bind () = string "=?" >>| ignore
  and charset = charset
  and () = string "?" >>| ignore
  and encoding = encoding
  and () = string "?" >>| ignore
  and data =
    take_while (function
      | '?' -> false
      | c -> (not (Char.is_whitespace c)) && Char.is_print c)
  and () = string "?=" >>| ignore in
  let%bind data =
    match encoding with
    | `Quoted_printable ->
      (* RFC2047 deviates slightly from common quoted printable.
         In particular
         4.2(2) - Underscore may be used to encode space, and
         4.2(3)- underscore must be encoded.
         This substituion handles that decoding step. *)
      let data = String.substr_replace_all data ~pattern:"_" ~with_:" " in
      let data_bstr, _ =
        Quoted_printable_lexer.decode_quoted_printable
          (String.length data)
          (Lexing.from_string data)
      in
      return (Bigbuffer.contents data_bstr)
    | `Base64 ->
      (match Base64.decode data with
       | Ok data -> return data
       | Error (`Msg msg) -> fail msg)
  in
  match charset with
  | `Ascii
  | `Utf8
  | `Latin1
  | `Latin2
    -> return data
;;

let parser_many : string Angstrom.t =
  many
    (choice
       [ (let%map hd = parser_
          and tl =
            (* RFC2047 6.2 When displaying a particular header field that contains
               multiple 'encoded-word's, any 'linear-white-space' that separates a
               pair of adjacent 'encoded-word's is ignored. *)
            many
              (let%bind (_ : string) = option "" ws in
               parser_)
          in
          hd :: tl)
       ; (let%map c =
            choice
              [ take_while1 (function
                  | '=' -> false
                  | c -> not (Char.is_whitespace c))
              ; string "="
              (* Collapse Line breaks as per
                 RFC822 - 3.1.1 Unfolding is accomplished by regarding CRLF immediately
                 followed by an LWSP-char as equivalent to the LWSP-char.
                 RFC822 - 3.1.3 Rules of (un)folding apply to these (unstructured) fields *)
              ; (let%bind (_ : string) = choice [ string "\r\n"; string "\n" ] in
                 ws)
              (* The RFC is ambiguous on what should happen if there is a lone CRLF, so we
                 ignore those, and treat these as regular white space. The RFC is also
                 ambiguous on how to treat multiple consecutive whitespaces, so we do the
                 conservative thing and leave them exactly as is. *)
              ; ws
              ]
          in
          [ c ])
       ])
  >>| List.concat
  >>| String.concat ~sep:""
;;

let decode str =
  Angstrom.parse_string parser_many str |> Result.map_error ~f:Error.of_string
;;

let%expect_test _ =
  let test str = printf "%S\n" (decode str |> Or_error.ok_exn) in
  test "hello = there?!?";
  [%expect {|
    "hello = there?!?"
    |}];
  test "hello\n there\n\tagain\n   my\nfriend";
  [%expect {|
    "hello there\tagain   my\nfriend"
    |}];
  (* Some test vectors from: https://tools.ietf.org/html/rfc2047 *)
  test "=?ISO-8859-1?Q?a?=";
  [%expect {|
    "a"
    |}];
  test "=?ISO-8859-1?Q?a?= b";
  [%expect {|
    "a b"
    |}];
  test "=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=";
  [%expect {|
    "ab"
    |}];
  test "=?ISO-8859-1?Q?a?=  =?ISO-8859-1?Q?b?=";
  [%expect {|
    "ab"
    |}];
  test "=?ISO-8859-1?Q?a?=\n       =?ISO-8859-1?Q?b?=";
  [%expect {|
    "ab"
    |}];
  test "=?ISO-8859-1?Q?a_b?=";
  [%expect {|
    "a b"
    |}];
  test "=?ISO-8859-1?Q?a?=  =?ISO-8859-1?Q?_b?=";
  [%expect {|
    "a b"
    |}];
  test " =?US-ASCII?Q?Keith_Moore?= <moore@cs.utk.edu>";
  [%expect {| " Keith Moore <moore@cs.utk.edu>" |}];
  test " =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>";
  [%expect {| " Keld J\248rn Simonsen <keld@dkuug.dk>" |}];
  test " =?ISO-8859-1?Q?Andr=E9?= Pirard <PIRARD@vm1.ulg.ac.be>";
  [%expect {| " Andr\233 Pirard <PIRARD@vm1.ulg.ac.be>" |}];
  test
    "=?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=\n\
     =?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=";
  [%expect {| "If you can read this you understand the example." |}];
  (* a UTF8 Example *)
  test "=?UTF-8?B?SGkgVGhlcmVcIQo=?=";
  [%expect {| "Hi There\\!\n" |}]
;;