(* Library for escaping and decoding.
 * Copyright (C) 2003-2004 Merjis Ltd. (http://www.merjis.com/)
 * Copyright (C) 1997 Daniel de Rauglaudre, INRIA
 * Copyright (C) 1998 Jean-Christophe FILLIATRE
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: cgi_escape.ml,v 1.10 2005/01/07 17:36:24 ChriS Exp $
 *)

(* Encoding and decoding according to RFC 2396.
 ***********************************************************************)

exception Hex_of_char

let hex_of_char =
  let code_a = Char.code 'a' - 10
  and code_A = Char.code 'A' - 10 in
  function
  | '0' .. '9' as c -> Char.code c - Char.code '0'
  | 'a' .. 'f' as c -> Char.code c - code_a
  | 'A' .. 'F' as c -> Char.code c - code_A
  | _ -> raise Hex_of_char


let rec decode_range_loop i0 i up s =
  if i0 >= up then i else begin
    match String.unsafe_get s i0 with
    | '+' ->
	String.unsafe_set s i ' ';
	decode_range_loop (succ i0) (succ i) up s
    | '%' when i0 + 2 < up ->
        let i1 = succ i0 in
        let i2 = succ i1 in
        let i0_next =
          try
            let v = hex_of_char(String.unsafe_get s i1) lsl 4
                    + hex_of_char(String.unsafe_get s i2) in
	    String.unsafe_set s i (Char.chr v);
	    succ i2
          with Hex_of_char ->
	    String.unsafe_set s i '%';
	    i1 in
	decode_range_loop i0_next (succ i) up s
    | c ->
	String.unsafe_set s i c;
	decode_range_loop (succ i0) (succ i) up s
  end

let is_space c =  c = ' ' || c = '\t' || c = '\r' || c = '\n'

(* [rm_htspace s] returns the substring [s.[low .. up - 1]] stripped
   of heading and trailing spaces. *)
let rm_htspace =
  let rec trailing_spaces j s = (* assume there is i s.t. s.[i] <> ' ' *)
    if is_space(String.unsafe_get s j) then trailing_spaces (pred j) s
    else j in
  let rec rm_spaces i up s =
    if i >= up then "" else begin
      if is_space(String.unsafe_get s i) then rm_spaces (succ i) up s
      else
        (* s.[i] <> space so trailing_spaces will stop and return j >= i. *)
        String.sub s i (trailing_spaces(pred up) s + 1 - i)
    end in
  fun low up s -> rm_spaces low up s


let decode_range s low up =
  if low >= up then "" else
    let up = decode_range_loop low low up s in
    rm_htspace low up s

(* It uses more memory than the previous version (CVS v1.5) -- the
   string is copied twice -- but is more efficient (and is only
   occasionally needed by the user).  This is not used in this
   library, only [decode_range] is. *)
let decode s =
  decode_range (String.copy s) 0 (String.length s)


(* Use a table lookup for speed. *)
let char_of_hex =
  let hex = [| '0'; '1'; '2'; '3'; '4'; '5'; '6'; '7'; '8'; '9';
	       'A'; 'B'; 'C'; 'D'; 'E'; 'F' |] in
  fun i -> Array.unsafe_get hex i


(* Unreserved characters consist of all alphanumeric chars and the
   following limited set of punctuation marks and symbols: '-' | '_' |
   '.' | '!' | '~' | '*' | '\'' | '(' | ')'.  According to RFC 2396,
   they should not be escaped unless the context requires it. *)
let special_rfc2396 = function
  | ';' | '/' | '?' | ':' | '@' | '&' | '=' | '+' | '$' | ',' (* Reserved *)
  | '\000' .. '\031' | '\127' .. '\255' (* Control chars and non-ASCII *)
  | '<' | '>' | '#' | '%' | '"' (* delimiters *)
  | '{' | '}' | '|' | '\\' | '^' | '[' | ']' | '`' (* unwise *)
      -> true
  | _ -> false
(* ' ' must also be encoded but its encoding '+' takes a single char. *)

let encode_wrt is_special s0 =
  let len = String.length s0 in
  let encoded_length = ref len in
  for i = 0 to len - 1 do
    if is_special(String.unsafe_get s0 i) then
      encoded_length := !encoded_length + 2
  done;
  let s = String.create !encoded_length in
  let rec do_enc i0 i = (* copy the encoded string in s *)
    if i0 < len then begin
      let s0i0 = String.unsafe_get s0 i0 in
      if is_special s0i0 then begin
        let c = Char.code s0i0 in
        let i1 = succ i in
        let i2 = succ i1 in
        String.unsafe_set s i '%';
        String.unsafe_set s i1 (char_of_hex (c lsr 4));
        String.unsafe_set s i2 (char_of_hex (c land 0xF));
        do_enc (succ i0) (succ i2)
      end
      else if s0i0 = ' ' then begin
	String.unsafe_set s i '+';
        do_enc (succ i0) (succ i)
      end
      else begin
        String.unsafe_set s i s0i0;
        do_enc (succ i0) (succ i)
      end
    end in
  do_enc 0 0;
  s


let encode = encode_wrt special_rfc2396



(* Types of escaping performed by the escape function. *)
type escape_t = EscapeNone | EscapeUrl | EscapeHtml | EscapeHtmlTag
              | EscapeHtmlTextarea

let amp_re = Pcre.regexp "&"
let lt_re = Pcre.regexp "<"
let gt_re = Pcre.regexp ">"
let quot_re = Pcre.regexp "\""
let eol_re = Pcre.regexp "\r?\n"

let escape_url = encode

let escape_html_textarea str =
  let str = Pcre.replace ~rex:amp_re ~templ:"&amp;" str in
  let str = Pcre.replace ~rex:lt_re  ~templ:"&lt;" str in
  let str = Pcre.replace ~rex:gt_re  ~templ:"&gt;" str in
  let str = Pcre.replace ~rex:quot_re ~templ:"&quot;" str in
  str

let escape_html str =
  let str = escape_html_textarea str in
  let str = Pcre.replace ~rex:eol_re ~templ:"<br/>" str in
  str

let escape_html_tag str =
  (* XXX Can't deal with complete binary data, and doesn't handle line
     endings in binary data very well either ...  *)
  let str = escape_html_textarea str in
  let str = Pcre.replace ~rex:eol_re ~templ:"&#13;&#10;" str in
  str

let escape = function
  | EscapeNone -> (fun str -> str)
  | EscapeUrl -> encode
  | EscapeHtml -> escape_html
  | EscapeHtmlTag -> escape_html_tag
  | EscapeHtmlTextarea -> escape_html_textarea
