package ASD::Indexer;

#     This file is part of asd.
    
#     asd is free software; you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation; either version 2 of the License, or
#     (at your option) any later version.

#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.

#     You should have received a copy of the GNU General Public License
#     along with this program; if not, write to the Free Software
#     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

#     asd 0.2 Copyright 2004 Antonini Daniele <arpeda@gmail.com>

use strict;
use warnings;

use ASD::Indexer::Parsing ':all';
use ASD::Indexer::Stemmer 'porter';

require 5.005;
require Exporter;

our @ISA = qw(Exporter);

# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.

# This allows declaration	use ASD::Indexer ':all';
# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
# will save memory.
our %EXPORT_TAGS = ( 'all' => [ qw( 
	indexing_of_man_page_with_occurrence indexing_of_man_page_without_occurrence stampa_hash
) ] );

our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );

our @EXPORT = qw();

our $VERSION = '0.01';
# Preloaded methods go here.


# Stop list: words that _filter_with_stop_list() removes from the dictionary
# of a man page.  Only the presence of a key matters (the lookup is done with
# exists), so every value is simply 1.
# Besides ordinary English words the table holds many one- and two-letter
# tokens (which _filter_with_stop_list() also drops because of their length),
# a couple of numbers ("39", "10") and web related terms ("html", "www", ...).
# NOTE(review): "z" is listed twice (harmless in a hash literal) and "NULL" is
# the only upper-case key, so it can only match text that has not been
# lower-cased -- confirm whether that is intended.
my %stop_list = (
 "a" => 1, "ii" => 1, "about" => 1, "above" => 1, "according" => 1, "across" => 1, "39" => 1, "actually" => 1, "ad" => 1, 
"adj" => 1, "ae" => 1, "af" => 1, "after" => 1, "afterwards" => 1, "ag" => 1, "again" => 1, "against" => 1, "ai" => 1, "al" => 1, "all" => 1,
 "almost" => 1, "alone" => 1, "along" => 1, "already" => 1, "also" => 1, "although" => 1, "always" => 1, "am" => 1, "among" => 1, "amongst" => 1,
 "an" => 1, "and" => 1, "another" => 1, "any" => 1, "anyhow" => 1, "anyone" => 1, "anything" => 1, "anywhere" => 1, "ao" => 1, "aq" => 1, "ar" => 1,
 "are" => 1, "aren" => 1, "aren't" => 1, "around" => 1, "arpa" => 1, "as" => 1, "at" => 1, "au" => 1, "aw" => 1, "az" => 1, "b" => 1, "ba" => 1,
"bb" => 1, "bd" => 1, "be" => 1, "became" => 1, "because" => 1, "become" => 1, "becomes" => 1, "becoming" => 1, "been" => 1, "before" => 1, 
"beforehand" => 1, "begin" => 1, "beginning" => 1, "behind" => 1, "being" => 1, "below" => 1, "beside" => 1, "besides" => 1, "between" => 1, 
"beyond" => 1, "bf" => 1, "bg" => 1, "bh" => 1, "bi" => 1, "billion" => 1, "bj" => 1, "bm" => 1, "bn" => 1, "bo" => 1, "both" => 1, "br" => 1,
"bs" => 1, "bt" => 1, "but" => 1, "buy" => 1, "bv" => 1, "bw" => 1, "by" => 1, "bz" => 1, "c" => 1, "ca" => 1, "can" => 1, "can't" => 1, "cannot" => 1,
"caption" => 1, "cc" => 1, "cd" => 1, "cf" => 1, "cg" => 1, "ch" => 1, "ci" => 1, "ck" => 1, "cl" => 1, "click" => 1, "cm" => 1, "cn" => 1, "co" => 1,
"co." => 1, "com" => 1, "copy" => 1, "could" => 1, "couldn" => 1, "couldn't" => 1, "cr" => 1, "cs" => 1, "cu" => 1, "cv" => 1, "cx" => 1, "cy" => 1,
"cz" => 1, "d" => 1, "de" => 1, "did" => 1, "didn" => 1, "didn't" => 1, "dj" => 1, "dk" => 1, "dm" => 1, "do" => 1, "does" => 1, "doesn" => 1,
"doesn't" => 1, "don" => 1, "don't" => 1, "down" => 1, "during" => 1, "dz" => 1, "e" => 1, "each" => 1, "ec" => 1, "edu" => 1, "ee" => 1, "eg" => 1, 
"eh" => 1, "eight" => 1, "eighty" => 1, "either" => 1, "else" => 1, "elsewhere" => 1, "end" => 1, "ending" => 1, "enough" => 1, "er" => 1, "es" => 1, 
"et" => 1, "etc" => 1, "even" => 1, "ever" => 1, "every" => 1, "everyone" => 1, "everything" => 1, "everywhere" => 1, "except" => 1, "f" => 1,
"few" => 1, "fi" => 1, "fifty" => 1, "find" => 1, "first" => 1, "five" => 1, "fj" => 1, "fk" => 1, "fm" => 1, "fo" => 1, "for" => 1, "former" => 1,
"formerly" => 1, "forty" => 1, "found" => 1, "four" => 1, "fr" => 1, "free" => 1, "from" => 1, "further" => 1, "fx" => 1, "g" => 1, "ga" => 1, "gb" => 1,
"gd" => 1, "ge" => 1, "get" => 1, "gf" => 1, "gg" => 1, "gh" => 1, "gi" => 1, "gl" => 1, "gm" => 1, "gmt" => 1, "gn" => 1, "go" => 1, "gov" => 1, 
"gp" => 1, "gq" => 1, "gr" => 1, "gs" => 1, "gt" => 1, "gu" => 1, "gw" => 1, "gy" => 1, "h" => 1, "had" => 1, "has" => 1, "hasn" => 1, "hasn't" => 1,
"have" => 1, "haven" => 1, "haven't" => 1, "he" => 1, "he'd" => 1, "he'll" => 1, "he's" => 1, "help" => 1, "hence" => 1, "her" => 1, "here" => 1, 
"here's" => 1, "hereafter" => 1, "hereby" => 1, "herein" => 1, "hereupon" => 1, "hers" => 1, "herself" => 1, "him" => 1, "himself" => 1, "his" => 1, 
"hk" => 1, "hm" => 1, "hn" => 1, "home" => 1, "homepage" => 1, "how" => 1, "however" => 1, "hr" => 1, "ht" => 1, "htm" => 1, "html" => 1, "http" => 1,
"hu" => 1, "hundred" => 1, "i" => 1, "i'd" => 1, "i'll" => 1, "i'm" => 1, "i've" => 1, "i.e." => 1, "id" => 1, "ie" => 1, "if" => 1, "il" => 1, "im" => 1, 
"in" => 1, "inc" => 1, "inc." => 1, "indeed" => 1, "information" => 1, "instead" => 1, "int" => 1, "into" => 1, "io" => 1, "iq" => 1, "ir" => 1, "is" => 1,
"isn" => 1, "isn't" => 1, "it" => 1, "it's" => 1, "its" => 1, "itself" => 1, "j" => 1, "je" => 1, "jm" => 1, "jo" => 1, "join" => 1, "jp" => 1, "k" => 1,
"ke" => 1, "kg" => 1, "kh" => 1, "ki" => 1, "km" => 1, "kn" => 1, "kp" => 1, "kr" => 1, "kw" => 1, "ky" => 1, "kz" => 1, "l" => 1, "la" => 1, "last" => 1,
"later" => 1, "latter" => 1, "lb" => 1, "lc" => 1, "least" => 1, "less" => 1, "let" => 1, "let's" => 1, "li" => 1, "like" => 1, "likely" => 1, "lk" => 1,
"ll" => 1, "lr" => 1, "ls" => 1, "lt" => 1, "ltd" => 1, "lu" => 1, "lv" => 1, "ly" => 1, "m" => 1, "ma" => 1, "made" => 1, "make" => 1, "makes" => 1,
"many" => 1, "maybe" => 1, "mc" => 1, "md" => 1, "me" => 1, "meantime" => 1, "meanwhile" => 1, "mg" => 1, "mh" => 1, "microsoft" => 1, "might" => 1,
"mil" => 1, "million" => 1, "miss" => 1, "mk" => 1, "ml" => 1, "mm" => 1, "mn" => 1, "mo" => 1, "more" => 1, "moreover" => 1, "most" => 1, "mostly" => 1,
"mp" => 1, "mq" => 1, "mr" => 1, "mrs" => 1, "ms" => 1, "msie" => 1, "mt" => 1, "mu" => 1, "much" => 1, "must" => 1, "mv" => 1, "mw" => 1, "mx" => 1, 
"my" => 1, "myself" => 1, "mz" => 1, "n" => 1, "na" => 1, "namely" => 1, "nc" => 1, "ne" => 1, "neither" => 1, "net" => 1, "netscape" => 1, 
"never" => 1, "nevertheless" => 1, "new" => 1, "next" => 1, "nf" => 1, "ng" => 1, "ni" => 1, "nine" => 1, "ninety" => 1, "nl" => 1, "no" => 1, 
"nobody" => 1, "none" => 1, "nonetheless" => 1, "noone" => 1, "nor" => 1, "not" => 1, "nothing" => 1, "now" => 1, "nowhere" => 1, "np" => 1, "nr" => 1, 
"nu" => 1, "nz" => 1, "o" => 1, "of" => 1, "off" => 1, "often" => 1, "om" => 1, "on" => 1, "once" => 1, "one" => 1, "one's" => 1, "only" => 1, "onto" => 1, 
"or" => 1, "org" => 1, "other" => 1, "others" => 1, "otherwise" => 1, "our" => 1, "ours" => 1, "ourselves" => 1, "out" => 1, "over" => 1, "overall" => 1, 
"own" => 1, "p" => 1, "pa" => 1, "page" => 1, "pe" => 1, "per" => 1, "perhaps" => 1, "pf" => 1, "pg" => 1, "ph" => 1, "pk" => 1, "pl" => 1, "pm" => 1, 
"pn" => 1, "pr" => 1, "pt" => 1, "pw" => 1, "py" => 1, "q" => 1, "qa" => 1, "r" => 1, "rather" => 1, "re" => 1, "recent" => 1, "recently" => 1, 
"reserved" => 1, "ring" => 1, "ro" => 1, "ru" => 1, "rw" => 1, "s" => 1, "sa" => 1, "same" => 1, "sb" => 1, "sc" => 1, "sd" => 1, "se" => 1, "seem" => 1, 
"seemed" => 1, "seeming" => 1, "seems" => 1, "seven" => 1, "seventy" => 1, "several" => 1, "sg" => 1, "sh" => 1, "she" => 1, "she'd" => 1, "she'll" => 1, 
"she's" => 1, "should" => 1, "shouldn" => 1, "shouldn't" => 1, "si" => 1, "since" => 1, "site" => 1, "six" => 1, "sixty" => 1, "sj" => 1, "sk" => 1, 
"sl" => 1, "sm" => 1, "sn" => 1, "so" => 1, "some" => 1, "somehow" => 1, "someone" => 1, "something" => 1, "sometime" => 1, "sometimes" => 1, 
"somewhere" => 1, "sr" => 1, "st" => 1, "still" => 1, "stop" => 1, "su" => 1, "such" => 1, "sv" => 1, "sy" => 1, "sz" => 1, "t" => 1, "taking" => 1, 
"tc" => 1, "td" => 1, "ten" => 1, "text" => 1, "tf" => 1, "tg" => 1, "test" => 1, "th" => 1, "than" => 1, "that" => 1, "that'll" => 1, "that's" => 1, 
"the" => 1, "their" => 1, "them" => 1, "themselves" => 1, "then" => 1, "thence" => 1, "there" => 1, "there'll" => 1, "there's" => 1, "thereafter" => 1, 
"thereby" => 1, "therefore" => 1, "therein" => 1, "thereupon" => 1, "these" => 1, "they" => 1, "they'd" => 1, "they'll" => 1, "they're" => 1, 
"they've" => 1, "thirty" => 1, "this" => 1, "those" => 1, "though" => 1, "thousand" => 1, "three" => 1, "through" => 1, "throughout" => 1, "thru" => 1,
"thus" => 1, "tj" => 1, "tk" => 1, "tm" => 1, "tn" => 1, "to" => 1, "together" => 1, "too" => 1, "toward" => 1, "towards" => 1, "tp" => 1, "tr" => 1, 
"trillion" => 1, "tt" => 1, "tv" => 1, "tw" => 1, "twenty" => 1, "two" => 1, "tz" => 1, "u" => 1, "ua" => 1, "ug" => 1, "uk" => 1, "um" => 1, 
"under" => 1, "unless" => 1, "unlike" => 1, "unlikely" => 1, "until" => 1, "up" => 1, "upon" => 1, "us" => 1, "use" => 1, "used" => 1, "using" => 1, 
"uy" => 1, "uz" => 1, "v" => 1, "va" => 1, "vc" => 1, "ve" => 1, "very" => 1, "vg" => 1, "vi" => 1, "via" => 1, "vn" => 1, "vu" => 1, "w" => 1, 
"was" => 1, "wasn" => 1, "wasn't" => 1, "we" => 1, "we'd" => 1, "we'll" => 1, "we're" => 1, "we've" => 1, "web" => 1, "webpage" => 1, "website" => 1, 
"welcome" => 1, "well" => 1, "were" => 1, "weren" => 1, "weren't" => 1, "wf" => 1, "what" => 1, "what'll" => 1, "what's" => 1, "whatever" => 1, 
"when" => 1, "whence" => 1, "whenever" => 1, "where" => 1, "whereafter" => 1, "whereas" => 1, "whereby" => 1, "wherein" => 1, "whereupon" => 1, 
"wherever" => 1, "whether" => 1, "which" => 1, "while" => 1, "whither" => 1, "who" => 1, "who'd" => 1, "who'll" => 1, "who's" => 1, "whoever" => 1, 
"NULL" => 1, "whole" => 1, "whom" => 1, "whomever" => 1, "whose" => 1, "why" => 1, "will" => 1, "with" => 1, "within" => 1, "without" => 1, "won" => 1, 
"won't" => 1, "would" => 1, "wouldn" => 1, "wouldn't" => 1, "ws" => 1, "www" => 1, "x" => 1, "y" => 1, "ye" => 1, "yes" => 1, "yet" => 1, "you" => 1, 
"you'd" => 1, "you'll" => 1, "you're" => 1, "you've" => 1, "your" => 1, "yours" => 1, "yourself" => 1, "yourselves" => 1, "yt" => 1, "yu" => 1, "z" => 1, 
"za" => 1, "zm" => 1, "zr" => 1, "10" => 1, "z" => 1 );

# Debug helper: print every element of the referenced array on its own
# line, wrapped in "_{" and "}_".
#   $array - reference to the array to dump
sub stampa_array {
    my ($array) = @_;

    foreach my $element ( @{$array} ) {
        print '_{' . $element . "}_\n";
    }
}

# Debug helper: print a dictionary of occurrence lists, one line per key,
# with the keys in sorted order.
#   $hash - reference to a hash whose values are array references
# Every line shows the key, the number of occurrences and the occurrences
# themselves, each followed by a single space.
sub stampa_hash {
    my ($hash) = @_;

    for my $key ( sort keys %{$hash} ) {
        my $occurrences = $hash->{$key};
        my $how_many    = @{$occurrences};

        print "_{key: $key, $how_many occurrenze: ";
        print "$_ " for @{$occurrences};
        print "}_\n";
    }
}

## The two public entry points of the Indexer module
##
# Build the index of one man page, keeping the list of positions of every
# word.
#   $ref_man_page   - reference to the array of text lines of the page
#   $man_page_title - title of the page
# Returns a reference to a hash mapping each stemmed word to the array of
# its occurrences, sorted in ascending numeric order.
sub indexing_of_man_page_with_occurrence {
    my ( $ref_man_page, $man_page_title ) = @_;

    # Remove the tags.  The & form is kept on purpose for this imported
    # routine, so that the call is parsed exactly as before.
    &clean_man_page($ref_man_page);

    # Split the text into single words: word => [ positions ].
    my %occurrences_of =
        _split_man_page_into_words_with_list_occurence($ref_man_page);

    # Filter the words with the stop list.
    _filter_with_stop_list( \%occurrences_of, \%stop_list );

    # Process the page title and add its words to the dictionary.
    _add_man_page_title_to_dictionary_with_occurrence( \%occurrences_of,
        $man_page_title );

    # Stem the words; the unstemmed dictionary is emptied right afterwards.
    my %stemmed = _word_stemming_with_occurrence( \%occurrences_of );
    %occurrences_of = ();

    # Sort every list of occurrences.
    _sort_occurrence( \%stemmed );

    return \%stemmed;
}

# Build the index of one man page, keeping only a counter for every word.
#   $ref_man_page   - reference to the array of text lines of the page
#   $man_page_title - title of the page
# Returns a reference to a hash mapping each stemmed word to its count.
sub indexing_of_man_page_without_occurrence {
    my ( $ref_man_page, $man_page_title ) = @_;

    # Remove the tags.  The & form is kept on purpose for this imported
    # routine, so that the call is parsed exactly as before.
    &clean_man_page($ref_man_page);

    # Split the text into single words: reference to word => count.
    my $ref_count_of =
        _split_man_page_into_words_without_list_occurence($ref_man_page);

    # Filter the words with the stop list.
    _filter_with_stop_list( $ref_count_of, \%stop_list );

    # Process the page title and add its words to the dictionary.
    _add_man_page_title_to_dictionary_without_occurrence( $ref_count_of,
        $man_page_title );

    # Stem the words; the unstemmed dictionary is emptied right afterwards.
    my %stemmed = _word_stemming_without_occurrence($ref_count_of);
    %{$ref_count_of} = ();

    return \%stemmed;
}


# Sort, in place, every occurrence list of the dictionary in ascending
# numeric order.
#   $ref_man_page - reference to a hash whose values are array references
sub _sort_occurrence {
    my ($ref_man_page) = @_;

    for my $occurrences ( values %{$ref_man_page} ) {
        my @ascending = sort { $a <=> $b } @{$occurrences};
        @{$occurrences} = @ascending;
    }
}


##     Stemming
##
# Stem every word of a dictionary of occurrence lists.
#   $ref_man_page - reference to a hash: word => reference to the array of
#                   its occurrences
# Returns (as a list) a hash: stem => array of occurrences.  Words sharing
# the same stem have their occurrences merged.  A compound word (one that
# contains ':', '-' or '_') is indexed both as a whole and through each of
# its parts, every part receiving the occurrences of the compound.
# Words and parts for which porter() returns a false value are skipped.
sub _word_stemming_with_occurrence {
    my $ref_man_page = shift;

    my %word_stemmed = ();

    while ( my ( $word, $ref_array_of_occurrence ) = each %$ref_man_page ) {

        # The word itself first, then the parts of a compound word.  The
        # grep drops the empty pieces produced by leading or repeated
        # separators.
        my @candidates = ($word);
        if ( $word =~ /[:\-_]/ ) {
            push @candidates, grep { length } split( /[:\-_ ]+/, $word );
        }

        foreach my $candidate (@candidates) {

            # Stem once and reuse the result, instead of calling the
            # stemmer a second time just to test it.
            my $stem = porter($candidate);
            next unless $stem;

            push @{ $word_stemmed{$stem} }, @$ref_array_of_occurrence;
        }
    }

    return %word_stemmed;
}

# Stem every word of a dictionary of counters.
#   $ref_man_page - reference to a hash: word => count
# Returns (as a list) a hash: stem => count.  Words sharing the same stem
# have their counts added up.  A compound word (one that contains ':', '-'
# or '_') is counted both as a whole and through each of its parts, every
# part receiving the count of the compound.
# Words and parts for which porter() returns a false value are skipped.
sub _word_stemming_without_occurrence {
    my $ref_man_page = shift;

    my %word_stemmed = ();

    while ( my ( $word, $value ) = each %$ref_man_page ) {

        # The word itself first, then the parts of a compound word.  The
        # grep drops the empty pieces produced by leading or repeated
        # separators.
        my @candidates = ($word);
        if ( $word =~ /[:\-_]/ ) {
            push @candidates, grep { length } split( /[:\-_ ]+/, $word );
        }

        foreach my $candidate (@candidates) {

            # Stem once and reuse the result, instead of calling the
            # stemmer a second time just to test it.
            my $stem = porter($candidate);
            next unless $stem;

            $word_stemmed{$stem} += $value;
        }
    }

    return %word_stemmed;
}


##    Elaborate Title
##
# Add the words of the page title to a dictionary of counters.
#   $ref_dict - reference to the hash word => count, updated in place
#   $string   - title of the man page; nothing is added when it is undefined
# The title is normalised (parenthesised suffix removed, punctuation turned
# into spaces, A-Z lower-cased), then the stem of every word is counted once.
# A compound word (containing ':', '-' or '_') is counted both as a whole
# and through each of its parts.  Stems that are false are never added.
sub _add_man_page_title_to_dictionary_without_occurrence {
    my $ref_dict = shift;
    my $string = shift;

    return unless defined $string;

    $string =~ s/\(\w*\)//g;       # drop suffixes such as "(1)" or "(3pm)"
    $string =~ s/[^\w:-]/ /g;      # keep only word characters, ':' and '-'
    $string =~ s/ - / /g;          # a free-standing dash is not a word
    $string =~ tr/A-Z/a-z/;

    # split ' ' ignores leading, trailing and repeated spaces.
    foreach my $word ( split ' ', $string ) {

        # The word itself first, then the parts of a compound word.
        my @candidates = ($word);
        if ( $word =~ /[:\-_]/ ) {
            push @candidates, grep { length } split( /[:\-_]+/, $word );
        }

        foreach my $candidate (@candidates) {
            my $stem = porter($candidate);
            $ref_dict->{$stem}++ if $stem;
        }
    }

    return;
}

# Add the words of the page title to a dictionary of occurrence lists.
#   $ref_dict - reference to the hash word => array of occurrences, updated
#               in place
#   $string   - title of the man page; nothing is added when it is undefined
# The title is normalised (parenthesised suffix removed, punctuation turned
# into spaces, A-Z lower-cased), then the stem of every word receives the
# occurrence "0".  A compound word (containing ':', '-' or '_') is recorded
# both as a whole and through each of its parts.  Stems that are false are
# never added.
sub _add_man_page_title_to_dictionary_with_occurrence {
    my $ref_dict = shift;
    my $string = shift;

    return unless defined $string;

    $string =~ s/\(\w*\)//g;       # drop suffixes such as "(1)" or "(3pm)"
    $string =~ s/[^\w:-]/ /g;      # keep only word characters, ':' and '-'
    $string =~ s/ - / /g;          # a free-standing dash is not a word
    $string =~ tr/A-Z/a-z/;

    # split ' ' ignores leading, trailing and repeated spaces.
    foreach my $word ( split ' ', $string ) {

        # The word itself first, then the parts of a compound word.
        my @candidates = ($word);
        if ( $word =~ /[:\-_]/ ) {
            push @candidates, grep { length } split( /[:\-_]+/, $word );
        }

        foreach my $candidate (@candidates) {
            my $stem = porter($candidate);
            push @{ $ref_dict->{$stem} }, "0" if $stem;
        }
    }

    return;
}


##    Split text into word
##
# Split the lines of a man page into words, remembering where each word
# occurs.
#   $ref_man_page - reference to the array of text lines
# Returns (as a list) a hash: word => array of positions.  Positions are
# counted from 1 across the whole page; every piece produced by splitting
# a line on a single space counts, empty pieces included.
sub _split_man_page_into_words_with_list_occurence {
    my ($ref_man_page) = @_;

    my %positions_of;
    my $counter = 0;

    foreach my $line ( @{$ref_man_page} ) {
        foreach my $token ( split / /, $line ) {
            push @{ $positions_of{$token} }, ++$counter;
        }
    }

    return %positions_of;
}

# Split the lines of a man page into words and count them.
#   $ref_man_page - reference to the array of text lines
# Returns a reference to a hash: word => number of times it appears.  Lines
# are split on a single space, so empty pieces are counted as well.
sub _split_man_page_into_words_without_list_occurence {
    my ($ref_man_page) = @_;

    my %count_of;

    foreach my $line ( @{$ref_man_page} ) {
        $count_of{$_}++ for split / /, $line;
    }

    return \%count_of;
}


##    Filter words with stop-list
##
# Remove, in place, the uninteresting words from a dictionary.
#   $ref_text_man_page - reference to the hash whose keys are the words
#   $ref_stop_list     - reference to the hash whose keys are the stop words
# A word is deleted when it
#   - is a key of the stop list,
#   - is at most two characters long,
#   - starts with '-', '_', '+', '%', '*', '?' or '^',
#   - contains '@',
#   - contains the same lower-case letter three times in a row (four times
#     for 'w'; 'u' is not checked, exactly as in the original letter list),
#   - starts with a digit followed by a word character,
#   - contains "..", a '.' followed by a non-word character, or a '.'
#     between two word characters.
sub _filter_with_stop_list {
    my $ref_text_man_page = shift;
    my $ref_stop_list = shift;

    # Walk a snapshot of the keys, so that deleting entries can never
    # disturb the traversal.
    foreach my $key ( keys %$ref_text_man_page ) {

        my $is_noise =
               exists $ref_stop_list->{$key}
            || length($key) <= 2
            || $key =~ /^\-/
            || $key =~ /@/
            # Same letter three times in a row.  This replaces the long
            # per-letter alternation, whose 'd' branch was garbled and did
            # not act as a quantifier.
            || $key =~ /([a-tvx-z])\1\1/
            || $key =~ /w{4,}/
            || $key =~ /^_|^\d\w|^\d+\.\w|\.{2,}|^[+%*?^]|\.\W|\w\.\w/;

        delete $ref_text_man_page->{$key} if $is_noise;
    }

    return;
}

1;
__END__

=head1 NAME

ASD::Indexer - Perl extension for ASD. It is a collection of functions

=head1 SYNOPSIS

  use ASD::Indexer ':all';

=head1 DESCRIPTION

Simple collection of functions for ASD.

=head2 EXPORT

None by default.

=head1 SEE ALSO

Mention other useful documentation such as the documentation of
related modules or operating system documentation (such as man pages
in UNIX), or any relevant external documentation such as RFCs or
standards.

If you have a mailing list set up for your module, mention it here.

If you have a web site set up for your module, mention it here.

=head1 AUTHOR

A. U. Thor antonini.daniele( at )gmail( dot )com

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2005 by A. U. Thor

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.6 or,
at your option, any later version of Perl 5 you may have available.


=cut
