/* Copyright (C) 1999, 2000, 2001 Simon Patarin, INRIA

This file is part of Pandora, the Flexible Monitoring Platform.

Pandora is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

Pandora is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Pandora; see the file COPYING.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.  */

#include <libpandora/global.h>

extern "C" {
#include <stdlib.h>
#include <stdio.h>
#include <libpandora/conf/string.h>
#include <ctype.h>
	   }


#include <libpandora/urlutil.h>
#include <libpandora/error.h>

/* Character-set conversion hooks: identity unless the build overrides them. */
#ifndef TOASCII
#define TOASCII(c) (c)
#define FROMASCII(c) (c)
#endif
/* The <ctype.h> functions require an argument representable as unsigned
   char (or EOF).  A plain `char' holding a byte >= 0x80 may be negative,
   so convert it before calling tolower(). */
#define TOLOWER(c) tolower((unsigned char) (c))

/* Bit of an isAcceptable[] entry that must be set for a character to be
   left unescaped. */
#define MASK	0x8

/* Non-zero when character code `a' lies in 32..127 and its isAcceptable[]
   entry has the MASK bit set.  The argument is evaluated more than once,
   so it must not have side effects. */
#define ACCEPTABLE(a)	((a) >= 32 && (a) < 128 && (isAcceptable[(a) - 32] & MASK))

/* Class bits for the character codes 32..127, indexed by (code - 32).
   Each row covers sixteen codes; the comment at the end of a row lists
   the characters of that row.  The table is only ever read. */
static const char isAcceptable[96] =
{/* 0x0 0x1 0x2 0x3 0x4 0x5 0x6 0x7 0x8 0x9 0xA 0xB 0xC 0xD 0xE 0xF */
    0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xF,0xE,0x0,0xF,0xF,0xC, /* 2x  !"#$%&'()*+,-./   */
    0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0x8,0x0,0x0,0x0,0x0,0x0, /* 3x 0123456789:;<=>?   */
    0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF, /* 4x @ABCDEFGHIJKLMNO   */
    0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0x0,0x0,0x0,0x0,0xF, /* 5x PQRSTUVWXYZ[\]^_   */
    0x0,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF, /* 6x `abcdefghijklmno   */
    0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0xF,0x0,0x0,0x0,0x0,0x0  /* 7x pqrstuvwxyz{|}~DEL */
};

/* Upper-case hexadecimal digits, indexed by nibble value (0..15).  Points
   at a string literal, hence pointer-to-const. */
static const char *hexa = "0123456789ABCDEF";

/* Character that introduces a two-digit hexadecimal escape. */
#define HEX_ESCAPE '%'

/* ------------------------------------------------------------------------- */

/*		Escape undesirable characters using %		HTEscape()
**		-------------------------------------
**
**	Builds a copy of `str' in which every character rejected by
**	ACCEPTABLE() is replaced by '%' followed by two upper-case
**	hexadecimal digits; all other characters are copied unchanged.
**
**	In the tradition of being conservative in what you do and liberal
**	in what you accept, we encode some characters which in fact are
**	allowed in URLs unencoded -- so DON'T use the acceptance table for
**	parsing!
**
**	str	NUL-terminated string to escape; may be NULL.
**
**	Returns a string allocated with xmalloc() that the caller owns, or
**	NULL when `str' is NULL or the allocation fails (the failure is
**	reported through pandora_perror()).
*/
static char *HTEscape(const char *str)
{
  const char *src;
  char *dst;
  char *result;
  size_t unacceptable = 0;

  if (!str) return NULL;

  /* First pass: measure the input and count the characters to escape. */
  for (src = str; *src; src++) {
    unsigned char a = (unsigned char) TOASCII(*src);
    if (!ACCEPTABLE(a))
      unacceptable++;
  }

  /* Every escaped character grows from one byte to three ("%XY"). */
  result = (char *) xmalloc((size_t) (src - str) + 2 * unacceptable + 1);
  if (result == NULL) {
    pandora_perror("HTEscape");
    return NULL;		/* Never write through a failed allocation */
  }

  /* Second pass: copy, expanding the unacceptable characters. */
  for (dst = result, src = str; *src; src++) {
    unsigned char a = (unsigned char) TOASCII(*src);
    if (!ACCEPTABLE(a)) {
      *dst++ = HEX_ESCAPE;	/* Means hex coming */
      *dst++ = hexa[a >> 4];
      *dst++ = hexa[a & 15];
    } else {
      *dst++ = *src;
    }
  }
  *dst = 0;			/* Terminate */
  return result;
}

/* Convert one hexadecimal digit character to its numeric value: '0'..'9'
   give 0..9 and 'A'..'F' give 10..15.  Every other character is treated
   as a lower-case digit (value c - 'a' + 10), so 'a'..'f' give 10..15;
   the argument is not validated. */
static char HTAsciiHexToChar (char c)
{
    int value;

    if (c >= '0' && c <= '9') {
        value = c - '0';
    } else if (c >= 'A' && c <= 'F') {
        value = c - 'A' + 10;
    } else {
        value = c - 'a' + 10;	/* accept small letters just in case */
    }
    return (char) value;
}

/*		Decode %xx escaped characters			HTUnEscape()
**		-----------------------------
**
**	Decodes, in place, every %xy sequence of `str' in which x and y
**	are both hexadecimal digits into the single character with code
**	16x+y.  The string never grows, so no allocation is needed.
**
**	A '%' that is not followed by two hexadecimal digits (including a
**	'%' or "%x" at the very end of the string) is not an escape and is
**	copied through unchanged, so the output never contains bytes that
**	were not produced from the input.
**
**	Note that "%00" decodes to a NUL byte and therefore terminates the
**	resulting C string at that point.
**
**	str	NUL-terminated, writable string; may be NULL.
**
**	Returns `str' (NULL when `str' is NULL).
*/
static char * HTUnEscape (char * str)
{
    char * p = str;		/* read position */
    char * q = str;		/* write position, never ahead of p */

    if (!str) {					      /* Just for safety ;-) */
        return NULL;
    }
    while (*p) {
        /* The && chain stops at the first failing test, so p[2] is only
           read when p[1] is a hex digit and hence not the terminator. */
        if (*p == HEX_ESCAPE
            && isxdigit((unsigned char) p[1])
            && isxdigit((unsigned char) p[2])) {
            char decoded = (char) (HTAsciiHexToChar(p[1]) * 16
                                   + HTAsciiHexToChar(p[2]));
            *q++ = FROMASCII(decoded);
            p += 3;
        } else {
            *q++ = *p++;
        }
    }

    *q = 0;
    return str;
}

/*
**	Tells whether the port found at `port' is the default one for
**	`scheme'.  True only when `access' starts with exactly "<scheme>:"
**	(so "https" does not match "http") and `port' points at
**	":<number>" followed by '/' or the end of the string.
*/
static int HTCanon_isDefaultPort (const char * access, const char * port,
                                  const char * scheme, const char * number)
{
    size_t slen = strlen(scheme);
    size_t nlen = strlen(number);

    if (strncmp(access, scheme, slen) != 0 || access[slen] != ':')
        return 0;
    if (strncmp(port+1, number, nlen) != 0)
        return 0;
    return port[1+nlen] == '/' || port[1+nlen] == '\0';
}

/*
**	Canonicalizes, in place, the host part of the URL stored in
**	`*filename'.  `host' must point into that string, just past the
**	"://" that introduces the host.
**
**	1) The text from the host name (or, when a user-id is present,
**	   from its '@') up to the path is converted to lower-case
**	2) A dot immediately before the port or the path is removed
**	3) The port is chopped off when it is empty (`:') or when it is
**	   the default of the access scheme: `:80' (http), `:70' (gopher),
**	   or `:21' (ftp).  The scheme must match exactly, so e.g.
**	   "https://host:80/" keeps its port.
**
**	Characters are only ever removed, so the string never grows;
**	`*filename' is read (to bound the backward scans) but never
**	reassigned.
**
**	Return:	The position of the path part of the URL after the edits:
**		the first '/' following the host, or the terminating NUL
**		when the URL has no path.
*/
static char * HTCanon (char ** filename, char * host)
{
    char *port;
    char *strptr;
    char *path;
    char *access = host-3;

    while (access>*filename && *(access-1)!='/')       /* Find access method */
        access--;
    if ((path = strchr(host, '/')) == NULL)			/* Find path */
        path = host + strlen(host);
    if ((strptr = strchr(host, '@')) != NULL && strptr<path)	   /* UserId */
        host = strptr;
    if ((port = strchr(host, ':')) != NULL && port>path)      /* Port number */
        port = NULL;

    for (strptr = host; strptr<path; strptr++)	    /* Convert to lower-case */
        *strptr = TOLOWER(*strptr);

    /* Drop a dot that ends the host name.  Everything to its right
       moves one place left, so the port and path positions follow. */
    {
        char *dot = port ? port : path;
        if (dot > *filename && *--dot=='.') {
            char *orig=dot, *dest=dot+1;
            while((*orig++ = *dest++));
            if (port) port--;
            path--;
        }
    }

    /* Chop off port if `:', `:80' (http), `:70' (gopher), or `:21' (ftp) */
    if (port) {
        size_t chop = 0;		/* Characters to delete at `port' */

        if (!*(port+1) || *(port+1)=='/')
            chop = 1;					/* Bare `:' */
        else if (HTCanon_isDefaultPort(access, port, "http", "80") ||
                 HTCanon_isDefaultPort(access, port, "gopher", "70") ||
                 HTCanon_isDefaultPort(access, port, "ftp", "21"))
            chop = 3;					/* `:' + 2 digits */

        if (chop) {
            char *orig=port, *dest=port+chop;
            while((*orig++ = *dest++));
            path -= chop;	       /* The path moved left with the text */
        }
    }
    return path;
}

/*
**  Tells whether a URL is absolute or relative.  The URL is scanned from
**  the left; it is absolute (1) when a ':' is met before any '/', '?' or
**  '#', and relative (0) otherwise.  A NULL or empty URL is relative.
*/
int HTURL_isAbsolute (const char * url)
{
  const char * cursor;

  if (url == NULL)
    return 0;

  for (cursor = url; *cursor != '\0'; cursor++) {
    switch (*cursor) {
    case ':':
      return 1;			/* Scheme delimiter came first */
    case '/':
    case '?':
    case '#':
      return 0;			/* Path, query or fragment came first */
    default:
      break;
    }
  }
  return 0;
}

/*	        Simplify a URI
//		--------------
// Rewrites the URL stored in `*url' in place so that duplicate URIs are
// easier to recognize.  Characters are only ever removed, never added.
//
//  - When the URL contains "://", the scheme in front of the first one is
//    converted to lower-case and the host following the LAST one (several
//    proxies in cascade) is canonicalized by HTCanon(); simplification
//    starts at the path HTCanon() returns.
//  - Otherwise, when the URL contains ":/", simplification starts just
//    after it; failing that it starts at the beginning of the string.
//  - When that starting point begins with "//", its first slash is
//    skipped.  Otherwise, when it begins with "news:", the text after
//    '@' (or after "news:" when there is no '@') is converted to
//    lower-case and nothing else is done.
//
// Within the path the sequence "/xxx/.." is removed, "/./" is replaced
// by "/" and runs of slashes are collapsed:
//
//		/etc/junk/../fred	becomes	/etc/fred
//		/etc/junk/./fred	becomes	/etc/junk/fred
//		/etc//fred		becomes	/etc/fred
//
// The path ends at the first ';' if there is one, else at the first '?',
// else at the first '#', else at the end of the string; text behind that
// boundary is left untouched.  Finally any "/.." left at the very start
// of the path is dropped:
//
//		http://host/../fred	becomes	http://host/fred
//
// NOTE(review): a path that starts without a slash (a relative URL such
// as "../x", or the text after "scheme:/") has no slash in front of its
// first segment, and the "/xxx/.." removal does not treat that case
// specially -- confirm the results before relying on this for such URLs.
//
// Returns: `*url', or NULL when `url' or `*url' is NULL.
*/
static char *HTSimplify (char ** url)
{
    char *path;
    char *p;
    char *end;

    if (!url || !*url)
        return NULL;

    /* Find any scheme name */
    if ((path = strstr(*url, "://")) != NULL) {		   /* Find host name */
        char *newptr;
        char *access = *url;
        while (access<path && (*access=TOLOWER(*access))) access++;
        path += 3;
        while ((newptr = strstr(path, "://")) != NULL)        /* For proxies */
            path = newptr+3;
        path = HTCanon(url, path);       	      /* We have a host name */
    } else if ((path = strstr(*url, ":/")) != NULL) {
        path += 2;
    } else
        path = *url;

    if (*path == '/' && *(path+1)=='/') {	  /* Some URLs start //<foo> */
        path += 1;
    } else if (!strncmp(path, "news:", 5)) {
        char *ptr = strchr(path+5, '@');
        if (!ptr) ptr = path+5;
        while (*ptr) {			    /* Make group or host lower case */
            *ptr = TOLOWER(*ptr);
            ptr++;
        }
        return *url;		      /* Doesn't need to do any more */
    }

    /* Only the path is simplified: stop in front of params/query/fragment */
    if (!((end = strchr(path, ';')) || (end = strchr(path, '?')) ||
          (end = strchr(path, '#'))))
        end = path+strlen(path);

    /* Each removal below shifts the rest of the string to the left, so
       `end' moves left by the number of characters removed.  That keeps
       it on the same boundary character instead of letting the scan run
       on into the query or fragment. */
    p = path;
    while (p<end) {
        if (*p=='/') {
            if (p>*url && *(p+1)=='.' && (*(p+2)=='/' || !*(p+2))) {
                char *orig = p+1;
                char *dest = (*(p+2)!='/') ? p+2 : p+3;
                end -= dest-orig;
                while ((*orig++ = *dest++)); /* Remove a slash and a dot */
            } else if (*(p+1)=='.' && *(p+2)=='.' && (*(p+3)=='/' || !*(p+3))) {
                char *q = p;
                while (q>path && *--q!='/');	       /* prev slash */
                if (strncmp(q, "/../", 4)) {
                    char *orig = q+1;
                    char *dest = (*(p+3)!='/') ? p+3 : p+4;
                    end -= dest-orig;
                    while ((*orig++ = *dest++));	   /* Remove /xxx/.. */
                    p = q;		      /* Start again with prev slash */
                } else
                    p++;
            } else if (*(p+1)=='/') {
                while (*(p+1)=='/') {
                    char *orig=p, *dest=p+1;
                    end -= dest-orig;
                    while ((*orig++ = *dest++));  /* Remove multiple /'s */
                }
            } else
                p++;
        } else
            p++;
    }

    /*
    **  Check for host/../.. kind of things
    */
    while (*path=='/' && *(path+1)=='.' && *(path+2)=='.' &&
           (!*(path+3) || *(path+3)=='/')) {
        char * orig = path;
        char * dest = path+3;
        while ((*orig++ = *dest++));
    }
    return *url;
}

/*
**	Canonicalizes, in place, the URL stored in `*url': the URL is first
**	simplified with HTSimplify(), then its %xx escapes are decoded with
**	HTUnEscape().
**
**	url	Address of the URL string; a NULL `url' is ignored.  A NULL
**		`*url' is harmless, since HTUnEscape() ignores a NULL
**		string.
*/
void url_canonicalize(char **url)
{
  if (!url)
    return;			/* Nothing to read `*url' from */
  HTSimplify(url);
  HTUnEscape(*url);
}
