#!/usr/bin/ruby

require 'token'
require 'hashdb'
require 'gmp'

# Scores one tokenized file against a learned token database and keeps
# per-token details so that a textual report and a histogram can be printed
# afterwards.
#
# Relies on three collaborators declared outside this file:
# * Token  - Token.new(file).get_all returns the token table of the file
#            (indexed by token; the value is used here as a repetition count,
#            presumably the number of occurrences - confirm in Token).
# * Hashdb - getScore(token) returns the pair (bad, good) stored for a token.
# * GMP::F - arbitrary-precision float used for the product of probabilities.
class Stats

  ## Constructor ##
  # file     :: handed to Token.new to obtain the token table.
  # basename :: handed to Hashdb.new (opened with mode 'r+').
  def initialize(file, basename)
    @content_file = file
    @hash = Token.new(@content_file).get_all

    # Report columns, indexed by token, filled in by getProbabilityFile.
    @value = Hash.new(0)
    @proba = Hash.new(0)
    @fw    = Hash.new(0)
    @ch    = Hash.new(0)

    @db = Hashdb.new(basename, 'r+')

    # Ten buckets of width 0.1 over the token weights, plus the total weight.
    @tabstat = Array.new(10, 0)
    @nbtokens = 0

    # Totals stored under a reserved key; judging by the key name these are
    # the numbers of bad and good pages learned so far.
    @BAD, @GOOD = @db.getScore("~~nombre-pages~~")

    @x = 0.5    # weight given to a token the database knows nothing about
    @s = 1      # strength of that prior when blended with observed data
    @dev = 0.4  # weights strictly between @dev and 1 - @dev are ignored

    @min = 30   # files with fewer distinct tokens than this are not scored
    @info = Hash.new(0)
  end

  ## Returns the probability that the page is bad ##
  # tokens :: pass "true" (or true) to print every token with its count.
  #
  # Returns a number between 0 and 1, or the String
  # "fichier non pris en compte car trop petit" when the file holds fewer
  # than @min distinct tokens.
  #
  # Side effects: refills @value, @proba, @fw, @ch, @tabstat and @nbtokens,
  # which getInfo and getHisto read.
  def getProbabilityFile(tokens)
    return "fichier non pris en compte car trop petit" if @hash.length < @min

    # Start from a clean histogram so that calling this method twice does not
    # double every count.
    @tabstat = Array.new(10, 0)
    @nbtokens = 0

    verbose = (tokens == "true" || tokens == true)
    tabRes = []

    @hash.keys.each do |t|
      bad, good = @db.getScore(t)
      v = @hash[t]
      print("#{t} : #{v}\n") if verbose

      proba, fw = tokenWeight(bad, good)

      # Clamp so that a weight of exactly 1.0 still lands in the last bucket.
      bucket = [[(fw * 10).floor, 0].max, 9].min
      @tabstat[bucket] += v
      @nbtokens += v

      # Only weights far enough from 0.5 take part in the final score; each
      # one counts as many times as the token's value.
      significant = !(fw > @dev && fw < (1 - @dev))
      (1..v).each { tabRes << fw } if significant

      @value[t] = v.to_s
      @proba[t] = proba.to_s
      @fw[t]    = fw
      @ch[t]    = significant ? "+" : "-"
    end

    # Fisher combination of the retained weights, once on p and once on 1 - p.
    h = fisherTail(tabRes)
    s = fisherTail(tabRes.map { |p| 1 - p })

    (1 + h - s) / 2
  end

  ## Returns the simple probability of a token ##
  # Share of the token's sightings that were bad. The caller must ensure
  # bad + good is not zero.
  def getSimpleProbability(bad, good)
    1.0 * bad / (bad + good)
  end

  ## Returns the Graham probability of a token ##
  # Each count is first scaled by the size of its class (@BAD / @GOOD).
  # A class of size zero contributes a ratio of 0 instead of NaN; when both
  # ratios are zero the neutral value @x is returned.
  def getGrahamProbability(bad, good)
    bad_ratio  = classRatio(bad, @BAD)
    good_ratio = classRatio(good, @GOOD)
    total = bad_ratio + good_ratio
    return @x if total == 0

    bad_ratio / total
  end

  ## Returns the most complete probability of a token ##
  # Same as the Graham probability, with each side additionally weighted by
  # the share of its class among all learned pages. Zero-sized classes are
  # handled as in getGrahamProbability.
  def getComplexProbability(bad, good)
    pages = @BAD + @GOOD
    bad_term  = classRatio(@BAD, pages) * classRatio(bad, @BAD)
    good_term = classRatio(@GOOD, pages) * classRatio(good, @GOOD)
    total = bad_term + good_term
    return @x if total == 0

    bad_term / total
  end

  ## Upper tail of the chi-square distribution ##
  # chi :: the chi-square statistic (Float or GMP::F).
  # df  :: degrees of freedom, expected to be even.
  #
  # Evaluates exp(-m) * sum(m**i / i!, i = 0 .. df/2 - 1) with m = chi / 2
  # and caps the result at 1.0. The range below is exclusive on purpose:
  # including i = df/2 would give the tail for df + 2 degrees of freedom.
  def chi2P(chi, df)
    m = chi / 2.0
    # GMP::F carries its own exp; plain Floats go through Math.exp.
    sum = term = (m.respond_to?(:exp) ? (-m).exp : Math.exp(-m))
    (1...(df / 2)).each do |i|
      term *= m / i
      sum += term
    end
    sum < 1.0 ? sum : 1.0
  end

  # Returns one formatted line per analysed token (token, value, raw
  # probability, weight, "+"/"-" flag), ordered by increasing weight and then
  # by token so that the order is deterministic.
  def getInfo
    ordered = @fw.sort_by { |token, weight| [weight, token] }
    ordered.map do |token, weight|
      "#{token.ljust(35)} #{@value[token].ljust(5)} #{@proba[token].ljust(20)} #{weight.to_s.ljust(20)} #{@ch[token]}"
    end
  end

  # Returns ten lines, one per weight bucket, each showing the bucket bounds
  # in percent, its count and a bar with one '#' per whole percent of
  # @nbtokens. Bars are empty when nothing has been counted yet.
  def getHisto
    (0..9).map do |i|
      count = @tabstat[i]
      percent = @nbtokens == 0 ? 0 : (100.0 * count / @nbtokens).floor
      "#{(i * 10).to_s.ljust(3)} - #{((i + 1) * 10).to_s.rjust(3)} #{count.to_s.rjust(4)} : " + "#" * percent
    end
  end

  private

  # Returns [raw probability, smoothed weight] for one token. A token with no
  # sightings gets probability 0 and the neutral weight @x.
  def tokenWeight(bad, good)
    seen = bad + good
    return [0, @x] if seen == 0

    # getSimpleProbability and getComplexProbability are drop-in alternatives.
    proba = getGrahamProbability(bad, good)
    [proba, (@s * @x + seen * proba) / (@s + seen)]
  end

  # count / total as a Float, or 0.0 when total is zero.
  def classRatio(count, total)
    total == 0 ? 0.0 : 1.0 * count / total
  end

  # chi2P of -2 * ln(product of values) with 2 * values.length degrees of
  # freedom; the product is kept in a GMP::F.
  def fisherTail(values)
    product = values.inject(GMP::F.new(1)) { |acc, p| acc * p }
    chi2P(-2 * product.log, 2 * values.length)
  end

end
