#!/usr/bin/perl -w

#    This file is part of asd.
#    
#    asd is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#    asd 0.1 Copyright 2004 Antonini Daniele <arpeda@gmail.com>

use strict;
use Fcntl ':mode';
use File::stat;

# Return the distinct values of the referenced array, sorted as strings.
#   $ref_array - reference to the list of values to de-duplicate
sub remove_dup
{
    my $ref_array = shift;

    # Hash keys are unique, so a slice assignment collapses the duplicates.
    my %seen;
    @seen{ @$ref_array } = ();

    return sort keys %seen;
}

=head1 @dir

The directories to analyse, taken from the MANPATH environment variable.
Empty path components are dropped, duplicates are removed and the list is
sorted.

=cut

# Read the variable straight from %ENV instead of spawning a shell to echo
# it; an unset MANPATH simply gives an empty list.
my $tmp_dir = defined $ENV{MANPATH} ? $ENV{MANPATH} : '';

# A leading, trailing or doubled ':' produces an empty component; an empty
# directory name would make the indexer scan the current directory.
my @dir = grep { length } split( /:/, $tmp_dir );
@dir = remove_dup( \@dir );

=head1 @man_page_section

The man page sections to analyse.  NOTE(review): the code visible in this
file never reads this list (the main loop declares its own array of the
same name); confirm before relying on it.

=cut

my @man_page_section = ( '0p', '1', '1p', '2', '3', '4', '5', '6', '7', '8', '9', 'n' );

=head1 @man_page_field_to_delete

The man page headings whose sections can be dropped without losing
information.  remove_man_page_section matches them against the C<.SH> lines.

=cut

my @man_page_field_to_delete = (
    'SYNOPSIS',
    'COPYRIGHT',
    'SEE ALSO',
    'EXAMPLE',
    'AUTHOR',
    'REPORTING BUG',
    'KEYWORDS',
);

=head1 Porter stemmer state

Tables and pattern strings used by the word stemming function (stem).

see also http://www.tartarus.org/~martin/PorterStemmer
Revision 1

initialise() fills them in and has to be called once before stem() is used.

=cut

my ( %step2list, %step3list );
my ( $c, $v, $C, $V, $mgr0, $meq1, $mgr1, $_v );

sub initialise
{
    # Step 2: suffix => replacement.
    %step2list = (
        ational => 'ate',
        tional  => 'tion',
        enci    => 'ence',
        anci    => 'ance',
        izer    => 'ize',
        bli     => 'ble',
        alli    => 'al',
        entli   => 'ent',
        eli     => 'e',
        ousli   => 'ous',
        ization => 'ize',
        ation   => 'ate',
        ator    => 'ate',
        alism   => 'al',
        iveness => 'ive',
        fulness => 'ful',
        ousness => 'ous',
        aliti   => 'al',
        iviti   => 'ive',
        biliti  => 'ble',
        logi    => 'log',
    );

    # Step 3: suffix => replacement.
    %step3list = (
        icate => 'ic',
        ative => '',
        alize => 'al',
        iciti => 'ic',
        ical  => 'ic',
        ful   => '',
        ness  => '',
    );

    $c = '[^aeiou]';            # consonant
    $v = '[aeiouy]';            # vowel
    $C = $c . '[^aeiouy]*';     # consonant sequence
    $V = $v . '[aeiou]*';       # vowel sequence

    my $optional_C = '^(' . $C . ')?';    # every measure pattern starts with [C]

    $mgr0 = $optional_C . $V . $C;                           # [C]VC... is m>0
    $meq1 = $optional_C . $V . $C . '(' . $V . ')?' . '$';   # [C]VC[V] is m=1
    $mgr1 = $optional_C . $V . $C . $V . $C;                 # [C]VCVC... is m>1
    $_v   = $optional_C . $v;                                # vowel in stem
}

# Reduce an English word to its stem, following the steps of the Porter
# stemming algorithm.
#   argument - the word to stem
#   returns  - the stemmed word; words shorter than 3 characters are
#              returned unchanged
# The patterns built from $mgr0, $meq1, $mgr1, $_v, $C and $v carry the /o
# flag, so those variables are interpolated only once: initialise() has to
# run before the first call.
sub stem
{  my ($stem, $suffix, $firstch);
   my $w = shift;
   if (length($w) < 3) { return $w; } # length at least 3
   # now map initial y to Y so that the patterns never treat it as vowel:
   $w =~ /^./; $firstch = $&;
   if ($firstch =~ /^y/) { $w = ucfirst $w; }

   # Step 1a: "sses" -> "ss", "ies" -> "i", and drop a final "s" that is
   # not preceded by another "s"
   if ($w =~ /(ss|i)es$/) { $w=$`.$1; }
   elsif ($w =~ /([^s])s$/) { $w=$`.$1; }
   # Step 1b: "eed" -> "ee" when the part before it has m>0; otherwise
   # strip "ed"/"ing" when the remaining stem contains a vowel, then tidy up
   if ($w =~ /eed$/) { if ($` =~ /$mgr0/o) { chop($w); } }
   elsif ($w =~ /(ed|ing)$/)
   {  $stem = $`;
      if ($stem =~ /$_v/o)
      {  $w = $stem;
         # "at", "bl", "iz" get their final "e" back
         if ($w =~ /(at|bl|iz)$/) { $w .= "e"; }
         # a doubled letter other than a vowel, y, l, s or z loses one copy
         elsif ($w =~ /([^aeiouylsz])\1$/) { chop($w); }
         # a word that is exactly consonant sequence + vowel + consonant
         # (the last one not w, x or y) gets an "e" appended
         elsif ($w =~ /^${C}${v}[^aeiouwxy]$/o) { $w .= "e"; }
      }
   }
   # Step 1c: final "y" -> "i" when the stem contains a vowel
   if ($w =~ /y$/) { $stem = $`; if ($stem =~ /$_v/o) { $w = $stem."i"; } }

   # Step 2: replace the suffix through %step2list when the stem has m>0
   if ($w =~ /(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/)
   { $stem = $`; $suffix = $1;
     if ($stem =~ /$mgr0/o) { $w = $stem . $step2list{$suffix}; }
   }

   # Step 3: replace the suffix through %step3list when the stem has m>0

   if ($w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/)
   { $stem = $`; $suffix = $1;
     if ($stem =~ /$mgr0/o) { $w = $stem . $step3list{$suffix}; }
   }

   # Step 4: remove the suffix when the stem has m>1; "ion" is removed
   # only when it follows "s" or "t"

   if ($w =~ /(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/)
   { $stem = $`; if ($stem =~ /$mgr1/o) { $w = $stem; } }
   elsif ($w =~ /(s|t)(ion)$/)
   { $stem = $` . $1; if ($stem =~ /$mgr1/o) { $w = $stem; } }


   #  Step 5: drop a final "e" when the stem has m>1, or m=1 and is not a
   #  consonant-vowel-consonant word; reduce a final "ll" to "l" when m>1

   if ($w =~ /e$/)
   { $stem = $`;
     if ($stem =~ /$mgr1/o or
         ($stem =~ /$meq1/o and not $stem =~ /^${C}${v}[^aeiouwxy]$/o))
        { $w = $stem; }
   }
   if ($w =~ /ll$/ and $w =~ /$mgr1/o) { chop($w); }

   # and turn initial Y back to y
   if ($firstch =~ /^y/) { $w = lcfirst $w; }
   return $w;
}

=head1 remove_man_page_section

Removes from the man page the sections that can be ignored by the analysis.

A section starts at a line beginning with C<.SH> and runs up to the next
such line (or the end of the page).  It is removed, heading included, when
its heading matches one of the entries of @man_page_field_to_delete.  Lines
before the first heading are always kept.

The array referenced by the argument is modified in place.

=cut

sub remove_man_page_section
{
    my $ref_man_page = shift;

    my @kept;
    my $inside_dropped_section = 0;

    foreach my $line ( @$ref_man_page )
    {
        if ( $line =~ /^(\.SH)/ )    # this line is the title of a section
        {
            # The heading decides what happens to every line up to the next heading.
            $inside_dropped_section =
                grep { $line =~ /$_/ } @man_page_field_to_delete;
        }

        push( @kept, $line ) unless $inside_dropped_section;
    }

    @$ref_man_page = @kept;
    return;
}

=head1 convert_man_page

Turns the lines of a man page into a hash of <word, occurrences>.

The function:

    - skips every line that starts with a '.' or a '\'
    - removes the inline escapes (a backslash plus the two following
      characters) and the newlines
    - removes every sequence of digits
    - turns every non-word character into a space
    - lower-cases the letters A-Z

The argument is a reference to the array of lines, which is left untouched.
The empty string is never counted as a word.

=cut

sub convert_man_page
{
    my ($ref_man_page) = shift;
    my %word_man_page = ();

    # Start from '' so that appending does not warn under -w.
    my $string_man_page = '';

    foreach my $line ( @$ref_man_page )
    {
        next if $line =~ /^(\.|\\)/;    # request or escape line: not text
        $string_man_page .= " $line";
    }

    $string_man_page =~ s/\\..|\n//g;    # inline escapes and newlines
    $string_man_page =~ s/[0-9]+//g;     # sequences of digits
    $string_man_page =~ s/\W/\ /g;       # punctuation and brackets
    $string_man_page =~ tr/A-Z/a-z/;     # lower case only

    # split ' ' skips the leading blank and treats a run of blanks as one
    # separator, so no empty word ends up in the hash.
    foreach my $word ( split( ' ', $string_man_page ) )
    {
        $word_man_page{$word} += 1;
    }

    return %word_man_page;
}

=head1 filter_with_stop_list

Filters the words of a man page in place.  A word is deleted when it:

    - appears in the stop list
    - has a length <= 2
    - starts with '-' or '_'
    - contains a '@'
    - contains the same character at least 3 times in a row

Arguments: a reference to the <word, occurrences> hash to filter and a
reference to the stop list hash (only its keys are looked at).

=cut

sub filter_with_stop_list
{
    my $ref_text_man_page = shift;
    my $ref_stop_list     = shift;

    # keys() gives a snapshot of the words, so deleting while looping is
    # safe; the stop list is used through its reference and never copied.
    foreach my $key ( keys %$ref_text_man_page )
    {
        my $unwanted = exists $ref_stop_list->{$key}
            || length($key) <= 2
            || $key =~ /^[-_]/
            || $key =~ /@/
            || $key =~ /(.)\1\1/;    # any character repeated 3 times in a row

        delete $ref_text_man_page->{$key} if $unwanted;
    }

    return;
}

=head1 word_stemming

Every word of the referenced <word, occurrences> hash is stemmed (reduced
to its morphological root with stem) and stored in a new hash, where the
occurrences of words sharing the same root are added together.

Returns the new hash; the input hash is not modified.

=cut

sub word_stemming
{
    my $ref_man_page = shift;
    my %word_stemmed;

    foreach my $word ( keys %$ref_man_page )
    {
        my $root = stem($word);
        $word_stemmed{$root} += $ref_man_page->{$word};
    }

    return %word_stemmed;
}

=head1 load_stop_list

Loads the stop list from the given file, one word per line.

Returns a hash whose keys are the words (the value is always 1).  When the
file cannot be read a warning is printed and the hash is empty.

=cut

sub load_stop_list
{
    my $filename = shift;
    my %stop_list;

    # Read the file directly: no shell involved, so any file name works.
    my $fh;
    unless ( open( $fh, '<', $filename ) )
    {
        warn "cannot read stop list '$filename': $!\n";
        return %stop_list;
    }

    while ( my $word = <$fh> )
    {
        chomp $word;    # removes only a line ending, never a real character
        $stop_list{$word} = 1;
    }
    close $fh;

    return %stop_list;
}

# Extract the title of a man page from its NAME section and register it.
#   $ref_man_page      - reference to the array of lines of the man page; on
#                        success the NAME section is spliced out of it
#   $ref_hash_man_page - reference to the <title, id> hash of known man pages
#   $ref_index         - reference to the highest id handed out so far; it is
#                        incremented when a new title is stored
#   $section           - section label added to the title as "($section)"
# Returns the title, built as "name ($section) - description" (or
# "name ($section)" when no " - " separator is found); returns "" when no
# usable NAME section is found or the title is already in the hash.
sub get_name
{
    my $ref_man_page = shift;
    my $ref_hash_man_page = shift;
    my $ref_index = shift;
    my $section = shift;

    my $value = undef;
    my $string = "";
    my $start_name = -1;
    my $end_name = -1;
    my $trovato = 0;
    my @tmp = undef;
    my @tmp2 = undef;
    my $dot = 0; #whether the man page name has to be shortened (never read below)

    for my $i ( 0..scalar( @$ref_man_page )-1 )
    {
	#remember the indexes where the NAME section starts and ends
	#(the commas inside the character classes are literal members of the class)
	if ( $$ref_man_page[$i] =~ /^\.[S,s][H,h,S,s]|^\.PP|^\.iX/ )
	{
	    if( $$ref_man_page[$i] =~ /NAME/ )
	    {
		$start_name = $i+1;
		$trovato = 1;
		next;
	    }
	    
	    if ( $trovato )
	    {
		$end_name = $i-1;
		last;
	    }
	}

    }

    #take the NAME section and process it
    #NOTE(review): when no heading follows NAME, $end_name stays -1, the range
    #below is empty and "" is returned - confirm this is intended
    if ( $trovato )
    {
	for my $i( $start_name .. $end_name )
	{
	    $string = $string." ".$$ref_man_page[$i];
	}
	
	if( $string )
	{
	    #replace the listed roff requests, font escapes and every whitespace
	    #character with a space, then normalise the dashes and the spacing
	    $string =~ s/^\...|\.Vb 1|\.Ve|\.N[m,d]|\.IR|\.IP|\.br|\\f[I,P]|\.LP|\s|\\fB|\\fR|\.B/\ /g ;
	    $string =~ s/\\-/-/g;
	    $string =~ s/--/-/;
	    $string =~ s/\\\(em/-/g;
	    $string =~ s/\ {2,}/\ /g;
	    $string =~ s/^\ //;

	    #proper handling of the PHP man pages: the text starts with ".TP 15",
	    #the first word is the name and the rest becomes the description
	    if ($string =~ /^\.TP\ 15/)
	    {
		my $name = undef;

		$string =~ s/\.TP\ 15\ //;

		#print "\n_{$string}_\n";

		@tmp = split( /\ /, $string );
		$name = shift @tmp;
		#print "_{$name}_.\n\n";
		$string = $name." -";
		
		foreach my $i ( @tmp )
		{
		    $string .= " ".$i;
		}
		#print "\n_{$string}_\n";
	    }

	    #@tmp = (names, description), @tmp2 = the comma separated names
	    @tmp = split( /\ -\ /, $string, 2);
	    @tmp2 = split( /,/, $tmp[0]);

#	    $dot = 1 if ( $#tmp2 );

	    #more than two names: keep only the first and the last one
	    if ( scalar(@tmp2) > 2 )
	    {
		$string = $tmp[1];
		$string = (shift @tmp2).", ..., ".(pop @tmp2)." - ".pop @tmp; #overwrites the assignment just above
	    }

	    $string =~ s/\\&//;
	    $string =~ s/^\ {1,}//;
	    $string =~ s/\ {2,}/\ /g;
	    $string =~ s/\"//g;

	    #add the section to the output
	    @tmp = split( /\ -\ /, $string );

	    if ($#tmp > 0)
	    {
		$string = $tmp[0]." ($section) - ".$tmp[1];
	    } else {
		$string = $tmp[0]." ($section)";
	    }

	    #store the string in the dictionary of man pages
	    if( $string )
	    {
		if ( exists $$ref_hash_man_page{$string} )
		{
		    #the man page has already been analysed
		    return "";
		} else {
		    $value = ++ $$ref_index ;
		    $$ref_hash_man_page{$string} = $value;
		}

		#remove the NAME section (heading line included) so that it is not processed twice
		splice( @$ref_man_page, $start_name-1, ($end_name-$start_name+2) );
		
	    } else {
		$string = "";
	    }

	    #return push( @$ref_array_man_page, $string )-1 if( $string );
	    return $string;
	}
    }
    return "";
}

# Look up the id stored for a man page title.
#   $ref_hash - reference to the <title, id> hash
#   $string   - the title to look for
# Returns the stored id, or -1 when the title is not a key of the hash.
sub get_id
{
    my ( $ref_hash, $string ) = @_;

    return exists $ref_hash->{$string} ? $ref_hash->{$string} : -1;
}

=head1 remove_comment

Cleans the man page by removing comments and empty lines.

A line is removed when it starts with C<.\">, or consists of a lone '.'
followed by a newline, or is just a newline.  The array referenced by the
argument is modified in place.

=cut

sub remove_comment
{
    my $ref_man_page = shift;

    # Keep only the lines that are neither comments nor empty.
    @$ref_man_page = grep { $_ !~ /^\.\\\"|^\.\n|^\n/ } @$ref_man_page;

    return;
}


# Split a man page title into lower-case words and count them.
#   $string      - the title of the man page
#   $ref_dict    - reference to the <word, occurrences> hash to update
#   $man_page_id - accepted but not used by this function
sub elaborate_man_page_name
{
    my ( $string, $ref_dict, $man_page_id ) = @_;

    # Normalise the copy of the title: every edit below works on $_.
    for ( $string )
    {
        s/-/\ /g;             # dashes become separators
        s/\W/\ /g;            # so does any other non-word character
        s/[0-9]+|\n//g;       # numbers are dropped
        s/\ {2,}/\ /g;        # runs of spaces collapse to one
        tr/[A-Z]/[a-z]/;      # lower case (the brackets map to themselves)
        s/^\ //g;             # no leading space
    }

    $ref_dict->{$_} += 1 foreach split( /\ /, $string );
}

# Append a man page id to the list stored in the dictionary under the
# section name; the list is created on first use.
#   $section        - the key to file the man page under
#   $ref_dictionary - reference to the hash of <key, array of ids>
#   $man_page_id    - the id to append
# Returns the number of ids in the list after the append.
sub add_section_to_dictionary
{
    my ( $section, $ref_dictionary, $man_page_id ) = @_;

    my $ids = $ref_dictionary->{$section} ||= [];

    return push( @$ids, $man_page_id );
}

# Record in the dictionary that a man page contains each of the given words.
#   $ref_hash_dict     - reference to the hash of <word, array of ids>
#   $ref_hash_man_page - reference to a hash whose keys are the words of
#                        the man page (the values are not used)
#   $man_page_id       - the id appended, as a string, to each word's list
sub create_dict
{
    my ( $ref_hash_dict, $ref_hash_man_page, $man_page_id ) = @_;

    push @{ $ref_hash_dict->{$_} }, "$man_page_id"
        for keys %$ref_hash_man_page;
}

=head1 read_man_page_file

Reads the man_page.asd file and initialises the man page id counter.

Every row has the form "<id> <title>".  Arguments: the file name and a
reference to the id counter, which is raised to the highest id found in the
file.  Returns the hash of <title, id>; the hash is empty when the file
does not exist or cannot be read.

=cut

sub read_man_page_file
{
    my $file_man_page = shift;
    my $ref_indice    = shift;
    my %hash_man_page = ();

    # A missing file only means that nothing has been indexed yet, so it is
    # not reported.
    open( my $fh, '<', $file_man_page ) or return %hash_man_page;

    while ( my $row = <$fh> )
    {
        chomp $row;    # strips a line ending only, even on the last row

        my ( $id, $title ) = split( /\ /, $row, 2 );    # split the row in 2
        next unless defined $title;                     # empty or malformed row

        $id = int($id);
        $hash_man_page{$title} = $id;
        $$ref_indice = $id if ( $id > $$ref_indice );
    }
    close $fh;

    return %hash_man_page;
}

=head1 read_dictionary

Reads the dictionary.asd file.

Every row has the form "<word> <id> <id> ...".  Returns a hash that maps
each word to a reference to the array of its ids; the hash is empty when
the file does not exist or cannot be read.  Rows without ids are skipped.

=cut

sub read_dictionary
{
    my $file_dictionary = shift;
    my %dictionary = ();

    # A missing file only means that no dictionary has been written yet.
    open( my $fh, '<', $file_dictionary ) or return %dictionary;

    # Every row is processed, including the only row of a one-line file.
    while ( my $row = <$fh> )
    {
        chomp $row;
        $row =~ s/\ $//;

        my ( $word, $ids ) = split( /\ /, $row, 2 );    # split the row in 2
        next unless defined $ids;                       # no ids on this row

        push( @{ $dictionary{$word} }, split( /\ /, $ids ) );
    }
    close $fh;

    return %dictionary;
}


#main program

=head1 %inverted_file

Hash meant to have the words found in the man pages as keys and, as value,
an array of the man pages containing that word.  It is only declared here:
the code visible in this file never fills or reads it.

=cut

my %inverted_file;

=head1 %stop_list

Hash (a hash only to make look-ups fast) of the terms that have to be
discarded because they are too common and therefore carry little
information.  Loaded from stop-list.txt.

=cut

my %stop_list = load_stop_list( "stop-list.txt" );

=head1 %dictionary

Hash with a word as key and, as value, an array of the ids of the man pages
containing it.  Loaded from dictionary.asd.

=cut

my %dictionary = read_dictionary( "dictionary.asd" );

=head1 %man_page_hash

Hash with the title of a man page as key and its id as value, loaded from
man_page.asd.  $indice holds the highest id in use.

=cut

my $indice = 0;
my %man_page_hash = read_man_page_file( "man_page.asd", \$indice );

# Id of the man page being processed and the per-directory / overall counters.
my $man_page_id;
my $file_elaborated   = 0;
my $man_page_analized = 0;

# The stemmer patterns have to exist before the first word is stemmed.
initialise();

# List the entries of a directory sorted by name and without dot files,
# like a plain `ls`.  Returns an empty list when the directory cannot be
# opened.
sub _list_directory
{
    my $directory = shift;

    opendir( my $dh, $directory ) or return;
    my @entries = sort grep { !/^\./ } readdir($dh);
    closedir($dh);

    return @entries;
}

# Read a man page file, decompressing it when its last extension asks for it.
#   $path      - full path of the file
#   $extension - the last dot-separated part of the file name
# Returns a reference to the array of lines (empty when the file cannot be
# read), or undef when the extension is not one the indexer can handle.
sub _read_man_page
{
    my ( $path, $extension ) = @_;
    my ( $fh, $opened );

    # The list form of open runs the decompressor without a shell, so any
    # character in the file name is safe.
    if ( $extension =~ /gz/ )
    {
        $opened = open( $fh, '-|', 'zcat', $path );
    }
    elsif ( $extension =~ /bz2/ )
    {
        $opened = open( $fh, '-|', 'bzcat', $path );
    }
    elsif ( $extension =~ /\d/ )
    {
        $opened = open( $fh, '<', $path );
    }
    else
    {
        return;
    }

    unless ( $opened )
    {
        warn "cannot read $path: $!\n";
        return [];
    }

    my @lines = <$fh>;
    close $fh;

    return \@lines;
}

# Walk every directory of @dir, every "man" sub-directory in it and every
# file in those, and index the man pages that are not yet in %man_page_hash.
foreach my $man_page_dir ( @dir )
{
    # the sections to analyse: the entries whose name contains "man"
    my @section_dirs = grep { /man/ } _list_directory( $man_page_dir );

    foreach my $section ( @section_dirs )
    {
        my $current_dir = $man_page_dir."/".$section;
        my @man_page_to_examine = _list_directory( $current_dir );

        print "Elaboro $current_dir ";
        $file_elaborated = 0;

        foreach my $current_man_page ( @man_page_to_examine )
        {
            my %text_of_man_page = ();
            my %man_page_stemmed = ();
            my $man_page_name    = undef;

            # a symbolic link is not processed
            my $path = $current_dir."/".$current_man_page;
            next if ( -l $path );

            # the last extension tells how the file has to be read
            my @basename  = split( /\./, $current_man_page );
            my $ref_lines = _read_man_page( $path, $basename[$#basename] );

            unless ( defined $ref_lines )
            {
                print "non posso processare $current_dir/$current_man_page\n";
                next;
            }
            my @man_page = @$ref_lines;

            # general clean-up of the man page: comments and empty lines go
            remove_comment( \@man_page );

            # the section is read from the file name (its second
            # dot-separated part)
            my $section_man_page = defined $basename[1] ? $basename[1] : '';

            # extract the NAME field and store the title in the hash of man pages
            $man_page_name = get_name( \@man_page, \%man_page_hash, \$indice, $section_man_page );

            # a man page already in the db gives no title, hence no valid id;
            # a new one gets the id just assigned to it
            $man_page_id = get_id( \%man_page_hash, $man_page_name );

            if ( $man_page_id > 0 )
            {
                $file_elaborated++;

                # drop the sections of the man page that are surely not needed
                remove_man_page_section( \@man_page );

                # turn the man page into a hash of words to analyse
                %text_of_man_page = convert_man_page( \@man_page );

                # drop the common words and the strings that surely are not words
                filter_with_stop_list( \%text_of_man_page, \%stop_list );

                # add the words of the man page name
                elaborate_man_page_name( $man_page_name, \%text_of_man_page, $man_page_id );

                # %text_of_man_page now holds the "important" words: stem each of them
                %man_page_stemmed = word_stemming( \%text_of_man_page );

                # add the terms to the dictionary
                create_dict( \%dictionary, \%man_page_stemmed, $man_page_id );

                add_section_to_dictionary( $section_man_page, \%dictionary, $man_page_id );
            }
        }

        $man_page_analized += $file_elaborated;
        print " .... processate $file_elaborated man_page\n";
    }
}

print "Elaborate $man_page_analized man page\n";

# write the results to file
my $posizione;    # only referenced by the disabled inverted-file code further down

# man_page.asd: one "<id> <title>" row per man page, sorted by title.
# A failure to write is fatal: otherwise the index would be lost silently.
open( my $fh_man_page, ">", "man_page.asd" )
    or die "cannot write man_page.asd: $!\n";
foreach my $word ( sort keys %man_page_hash )
{
    print {$fh_man_page} $man_page_hash{$word}." ".$word."\n";
}
close( $fh_man_page )
    or die "cannot close man_page.asd: $!\n";

=head1
open FILE_DICTIONARY, ">", "dictionary.asd";
open INVERTED_FILE, ">", "inverted-file.asd";
foreach my $word( sort keys %dictionary )
{
    $posizione = tell INVERTED_FILE;
    print FILE_DICTIONARY "$word ".$posizione."\n";
    for my $id( 0 .. $#{$dictionary{$word}} )
    {
	print INVERTED_FILE " $dictionary{$word}[$id]";
    }
    print INVERTED_FILE "\n";
}
close INVERTED_FILE;
close FILE_DICTIONARY;

=cut

# dictionary.asd: one "<word> <id> <id> ..." row per word, sorted by word.
# A failure to write is fatal: otherwise the dictionary would be lost silently.
open( my $fh_dictionary, ">", "dictionary.asd" )
    or die "cannot write dictionary.asd: $!\n";
foreach my $word ( sort keys %dictionary )
{
    print {$fh_dictionary} join( " ", $word, @{ $dictionary{$word} } ), "\n";
}
close( $fh_dictionary )
    or die "cannot close dictionary.asd: $!\n";
