# HTML/Sloppy.pm
# Copyright (C) 2002-2003 colin z robertson
# 
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

package HTML::Sloppy;

$HTML::Sloppy::VERSION = '0.01'; 

=head1 NAME

HTML::Sloppy - Converts poor-quality HTML to valid XHTML

=head1 SYNOPSIS

  use HTML::Sloppy;
  my $converter = new HTML::Sloppy;
  my $bad_html  = 'Some <b>bad HTML.';
  my $good_html = $converter->as_strict($bad_html);

=head1 DESCRIPTION

HTML::Sloppy converts a fragment of poorly-written HTML (like you might expect
any fallible human to write) into a valid XHTML fragment. The tags that
HTML::Sloppy recognises and the way it deals with them can be adjusted by
specifying a ruleset.

=head1 METHODS

=head2 new()

Creates a new HTML::Sloppy converter object:

  my $converter = new HTML::Sloppy;
  my $converter = new HTML::Sloppy($ruleset);
  my $converter = new HTML::Sloppy($ruleset,$entities);

Called with no arguments, the default ruleset and entities will be used.
Alternatively, a ruleset and an entity list can be supplied as references to a
hash and a list respectively.

=head2 as_strict()

  my $good_html = $converter->as_strict($bad_html);

Given a fragment of bad html, returns a fragment of valid XHTML.

=cut

use strict;
use Carp;
use vars qw($AUTOLOAD);

my %fields = (
	ruleset    => undef,
	entities => undef,
);

sub new {
    my $proto = shift;
    my $class = ref($proto) || $proto;
    my $self  = {
		_permitted => \%fields,
		%fields,
	};
    bless ($self, $class);
	
	my ($ruleset,$entities) = @_;
	
	if (!$ruleset) {
		$ruleset = {
			'' => {
				level            => 'block',
				assume_child     => 'p',
				children         => ['p', 'ul', 'ol', 'blockquote'],
				attributes       => [],
				singleton        => 0,
			},
			p => {
				level            => 'block',
				assume_child     => '',
				children         => ['a', 'b', 'i', 'em', 'strong', 'br'],
				attributes       => [],
				singleton        => 0,
			},
			blockquote => {
				level            => 'block',
				assume_child     => 'p',
				children         => ['p', 'ul', 'ol', 'blockquote', 'br'],
				attributes       => [],
				singleton        => 0,
			},
			li => {
				level            => 'block',
				assume_child     => '',
				children         => ['p', 'ul', 'ol', 'blockquote', 'a', 'b', 'i', 'em', 'strong', 'br'],
				attributes       => [],
				singleton        => 0,
			},
			ul => {
				level            => 'block',
				assume_child     => 'li',
				children         => ['li'],
				attributes       => [],
				singleton        => 0,
			},
			ol => {
				level            => 'block',
				assume_child     => 'li',
				children         => ['li'],
				attributes       => [],
				singleton        => 0,
			},
			em => {
				level            => 'inline',
				assume_child     => '',
				children         => ['a', 'b', 'i', 'em', 'strong', 'br'],
				attributes       => [],
				singleton        => 0,
			},
			strong => {
				level            => 'inline',
				assume_child     => '',
				children         => ['a', 'b', 'i', 'em', 'strong', 'br'],
				attributes       => [],
				singleton        => 0,
			},
			b => {
				level            => 'inline',
				assume_child     => '',
				children         => ['a', 'b', 'i', 'em', 'strong', 'br'],
				attributes       => [],
				singleton        => 0,
			},
			i => {
				level            => 'inline',
				assume_child     => '',
				children         => ['a', 'b', 'i', 'em', 'strong', 'br'],
				attributes       => [],
				singleton        => 0,
			},
			a => {
				level            => 'inline',
				assume_child     => '',
				children         => ['b', 'i', 'em', 'strong', 'br'],
				attributes       => ['href'],
				singleton        => 0,
			},
			br => {
				level            => 'inline',
				assume_child     => '',
				children         => [],
				attributes       => [],
				singleton        => 1,
			},
		}
	}
	$self->ruleset($ruleset);
	
	if (!$entities) {
		$entities = [
			'nbsp',
			'iexcl',
			'cent',
			'pound',
			'curren',
			'yen',
			'brvbar',
			'sect',
			'uml',
			'copy',
			'ordf',
			'laquo',
			'not',
			'shy',
			'reg',
			'macr',
			'deg',
			'plusmn',
			'sup2',
			'sup3',
			'acute',
			'micro',
			'para',
			'middot',
			'cedil',
			'sup1',
			'ordm',
			'raquo',
			'frac14',
			'frac12',
			'frac34',
			'iquest',
			'Agrave',
			'Aacute',
			'Acirc',
			'Atilde',
			'Auml',
			'Aring',
			'AElig',
			'Ccedil',
			'Egrave',
			'Eacute',
			'Ecirc',
			'Euml',
			'Igrave',
			'Iacute',
			'Icirc',
			'Iuml',
			'ETH',
			'Ntilde',
			'Ograve',
			'Oacute',
			'Ocirc',
			'Otilde',
			'Ouml',
			'times',
			'Oslash',
			'Ugrave',
			'Uacute',
			'Ucirc',
			'Uuml',
			'Yacute',
			'THORN',
			'szlig',
			'agrave',
			'aacute',
			'acirc',
			'atilde',
			'auml',
			'aring',
			'aelig',
			'ccedil',
			'egrave',
			'eacute',
			'ecirc',
			'euml',
			'igrave',
			'iacute',
			'icirc',
			'iuml',
			'eth',
			'ntilde',
			'ograve',
			'oacute',
			'ocirc',
			'otilde',
			'ouml',
			'divide',
			'oslash',
			'ugrave',
			'uacute',
			'ucirc',
			'uuml',
			'yacute',
			'thorn',
			'yuml',
			'quot',
			'amp',
			'lt',
			'gt',
			'OElig',
			'oelig',
			'Scaron',
			'scaron',
			'Yuml',
			'circ',
			'tilde',
			'ensp',
			'emsp',
			'thinsp',
			'zwnj',
			'zwj',
			'lrm',
			'rlm',
			'ndash',
			'mdash',
			'lsquo',
			'rsquo',
			'sbquo',
			'ldquo',
			'rdquo',
			'bdquo',
			'dagger',
			'Dagger',
			'permil',
			'lsaquo',
			'rsaquo',
			'euro',
			'fnof',
			'Alpha',
			'Beta',
			'Gamma',
			'Delta',
			'Epsilon',
			'Zeta',
			'Eta',
			'Theta',
			'Iota',
			'Kappa',
			'Lambda',
			'Mu',
			'Nu',
			'Xi',
			'Omicron',
			'Pi',
			'Rho',
			'Sigma',
			'Tau',
			'Upsilon',
			'Phi',
			'Chi',
			'Psi',
			'Omega',
			'alpha',
			'beta',
			'gamma',
			'delta',
			'epsilon',
			'zeta',
			'eta',
			'theta',
			'iota',
			'kappa',
			'lambda',
			'mu',
			'nu',
			'xi',
			'omicron',
			'pi',
			'rho',
			'sigmaf',
			'sigma',
			'tau',
			'upsilon',
			'phi',
			'chi',
			'psi',
			'omega',
			'thetasym',
			'upsih',
			'piv',
			'bull',
			'hellip',
			'prime',
			'Prime',
			'oline',
			'frasl',
			'weierp',
			'image',
			'real',
			'trade',
			'alefsym',
			'larr',
			'uarr',
			'rarr',
			'darr',
			'harr',
			'crarr',
			'lArr',
			'uArr',
			'rArr',
			'dArr',
			'hArr',
			'forall',
			'part',
			'exist',
			'empty',
			'nabla',
			'isin',
			'notin',
			'ni',
			'prod',
			'sum',
			'minus',
			'lowast',
			'radic',
			'prop',
			'infin',
			'ang',
			'and',
			'or',
			'cap',
			'cup',
			'int',
			'there4',
			'sim',
			'cong',
			'asymp',
			'ne',
			'equiv',
			'le',
			'ge',
			'sub',
			'sup',
			'nsub',
			'sube',
			'supe',
			'oplus',
			'otimes',
			'perp',
			'sdot',
			'lceil',
			'rceil',
			'lfloor',
			'rfloor',
			'lang',
			'rang',
			'loz',
			'spades',
			'clubs',
			'hearts',
			'diams',
		]
	}
	$self->entities($entities);
	
	return $self;
}

sub as_strict {
	my $self = shift;
	my ($input) = @_;
	my @tokens = tokenise($input);
	my @stack = ('');
	my $output;
	foreach my $token(@tokens) {
		#warn "processing: $token\n";
		if (type($token) eq 'start') {
			if ($self->ruleset->{tag_name($token)}->{level} eq 'inline') {
				if (($token =~ /\S/) && $self->ruleset->{$stack[-1]}->{assume_child}) {
					my $child = $self->ruleset->{$stack[-1]}->{assume_child};
					push @stack, $child;
					$output .= "<" . $child . ">";
				}
			}
			if ($self->allowed_in_stack(tag_name($token),\@stack)) {
				$output .= $self->close_to_allowed_point(tag_name($token),\@stack);
				if ($self->ruleset->{tag_name($token)}->{singleton}) {
					$output .= $self->format_tag($token);
				} else {
					push @stack, tag_name($token);
					$output .= $self->format_tag($token);
				}
			}
		} elsif (type($token) eq 'end') {
			if (member(tag_name($token),@stack)) {
				$output .= close_stack(tag_name($token),\@stack);
			} else {
				#warn tag_name($token) . " not in stack\n"
			}
		} elsif (type($token) eq 'text') {
			if (($token =~ /\S/) && $self->ruleset->{$stack[-1]}->{assume_child}) {
				my $child = $self->ruleset->{$stack[-1]}->{assume_child};
				push @stack, $child;
				$output .= "<" . $child . ">";
			}
			$output .= $self->escape_special_chars($token);
		}
	}
	$output .= $self->close_remaining(\@stack);
	return $output;
}

sub close_stack {
	my ($target,$stack_ref) = @_;
	my $output;
	while (1) {
		my $tag_name = pop @{$stack_ref};
		$output .= "</" . $tag_name . ">";
		return $output if ($tag_name eq $target);
	}
	#warn "close_stack: $target not found in stack\n";
}

sub close_remaining {
	my ($target,$stack_ref) = @_;
	my $output = "";
	while (1) {
		my $tag_name = pop @{$stack_ref};
		return $output if $tag_name eq '';
		$output .= "</" . $tag_name . ">";
	}
}

sub close_to_allowed_point {
	my $self = shift;
	my ($element,$stack_ref) = @_;
	my $output = '';
	#return $output unless $self->allowed_in_stack($element,$stack_ref);
	while (1) {
		my $stack_item = @{$stack_ref}[-1];
		my @allowed = @{$self->ruleset->{$stack_item}->{children}};
		return $output if member($element,@allowed);
		$output .= "</" . $stack_item . ">";
		pop @{$stack_ref};
	}
}

sub parent_block {
	my $self = shift;
	my @stack = @_;
	foreach my $item(reverse(@stack)) {
		if ($self->ruleset->{$item}->{level} eq 'block') {
			return $item;
		}
	}
	#warn "parent_block called with stack (" . join(", ",@stack) . ") is returning null string\n";
	return '';
}

sub allowed_in_parent_block {
	my $self = shift;
	my ($element,@stack) = @_;
	my $parent_block = $self->parent_block(@stack);
	my @allowed = @{$self->ruleset->{$parent_block}->{children}};
	return member($element,@allowed);
}

sub allowed_in_stack {
	my $self = shift;
	my ($element,$stack_ref) = @_;
	my @stack = @{$stack_ref};
	#warn "allowed_in_stack called with element: $element\nstack: " . join(", ",@stack) . "\n";
	foreach my $item(reverse(@stack)) {
		my @allowed = @{$self->ruleset->{$item}->{children}};
		if (member($element,@allowed)) {
			#warn "true\n";
			return 1;
		}
	}
	#warn "false\n";
	return 0;
}

sub member {
	my ($value, @list) = @_;
	foreach my $item(@list) {
		if ($value eq $item) {
			return 1;
		}
	}
	return 0;
}

sub tokenise {
	my ($html) = @_;
	my @tokens;
	while ($html ne "") {
		# match at the beginning of the string
		# a tag or some text
		# and replace with nothing
		if ($html =~ s/ ^( <([^>"]*("[^"]*")*)*> | [^<]+ ) //x) {
			push(@tokens,$1);
        } else {
        	# the string contains something that won't match, so put it on the
            # end and give up.
        	push(@tokens,$html);
            $html = "";
        }
	}
	return @tokens;
}

sub type {
	my ($tag) = @_;
	if ($tag =~ /^<.*>$/) {
		if ($tag =~ m:^</:) {
			return "end";
		} else {
			return "start";
		}
	} else {
		return "text";
	}
}

sub tag_name {
	my ($tag) = @_;
	if ($tag =~ /^<\/?(\w*).*>$/) {
		return lc($1);
	}
}

sub format_tag {
	my $self = shift;
	my ($tag) = @_;
	my $att_string = "";
	my %atts = attributes($tag);
	foreach my $name (keys(%atts)) {
		if (member($name,@{$self->ruleset->{tag_name($tag)}->{attributes}})) {
			$att_string .= " $name=\"" . $self->escape_special_chars($atts{$name}) . "\"";
		}
	}
	my $closing_slash = "";
	if ($self->ruleset->{tag_name($tag)}->{singleton}) {
		$closing_slash = " /";
	}
	return "<" . tag_name($tag) . $att_string . $closing_slash . ">";
}

sub escape_special_chars {
	my $self = shift;
	my ($string) = @_;
	# Have to be a bit careful about ampersands. If they're not part of an
	# entity construction that we know about then escape them, otherwise leave
	# them.
	# There's more to be done with numeric references. For a start we should
	# really be checking the range. Also, in XML, the uppercase X to indicate
	# hexadecimal is illegal. It must be lower case.
	my $numeric = '|#([xX][0-9a-fA-F]{1,8}|[0-9]{1,10})';
	my $entities_re = join('|', @{$self->entities}) . $numeric;
	$string =~ s/&(?!($entities_re);)/&amp;/ig;
	# The rest are simple.
	$string =~ s/</&lt;/g;
	$string =~ s/>/&gt;/g;
	$string =~ s/"/&quot;/g;
	return $string;
}

sub attributes {
	my ($tag) = @_;
	my (%attributes);
	# extract just attributes section
	$tag =~ s/^<\S*(.*)>$/$1/;
	while ($tag ne "") {
		# match at the beginning of the string
		# an attribute (surrounded by optional white space)
		# and replace with nothing
		$tag =~ s/ ^\s*( ([^\s=]*)(\s*=\s*("[^"]*"|\S*))? )\s* //x;
		my $attribute = $1;
		my $name = lc($2);
		my $value;
		if ($attribute =~ /^[^\s=]*\s*=\s*"?(.*?)"?$/) {
			$value = $1;
		} else {
			$value = "yes";
		}
		$attributes{$name} = $value;
	}
	return %attributes;
}

sub AUTOLOAD {
	my $self = shift;
	my $type = ref($self) or croak "$self is not an object";

	my $name = $AUTOLOAD;
	$name =~ s/.*://;   # strip fully-qualified portion
	if ($name eq 'DESTROY') { return }

	unless (exists $self->{_permitted}->{$name} ) {
		croak "Can't access `$name' field in class $type";
	}

	if (@_) {
		return $self->{$name} = shift;
	} else {
		return $self->{$name};
	}
}

1;
