/*
    humanzip - compresses text files in a human readable way
    Copyright (C) 2007 Matthew Strait

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
*/
using namespace std;

#include <fstream>
#include <iostream>
#include <string>
#include <getopt.h>
#include <vector>
#include <iomanip>
#include <sys/stat.h>
#include "humanzip.h"

#define TABLESIZE 55
const string utftable[TABLESIZE] = { 

"ă", "ḇ", "č", "đ", "ę", "ḟ", "ġ", "ħ", "ĩ", "ĵ", "ķ", "ŀ", "ṁ", 
"ń", "ō", "ṕ", "ƍ", "ŕ", "ş", "ŧ", "ů", "↺", "ŵ", "ж", "ŷ", "ž",

"æ", "ḃ", "ç", "ð", "ɘ", "ϕ", "ǥ", "ȟ", "ḭ", "ʝ", "ǩ", "ł", "ḿ", 
"ṋ", "ø", "Ƿ", "ƣ", "ṟ", "ƨ", "θ", "ű", "ṽ", "ẉ", "ϫ", "ȳ", "ƶ",

"œ", "ƌ", "å"
};

const string UTFtable[TABLESIZE] = { 
"Ă", "Ḇ", "Č", "Đ", "Ę", "Ḟ", "Ġ", "Ħ", "Ĩ", "Ĵ", "Ķ", "Ŀ", "Ṁ", 
"Ń", "Ō", "Ṕ", "ℚ", "Ŕ", "Ş", "Ŧ", "Ů", "℣", "Ŵ", "Ж", "Ŷ", "Ž",

"Æ", "Ḃ", "Ç", "Ð", "Ǝ", "Φ", "Ǥ", "Ȟ", "Ḭ", "ℷ", "Ǩ", "Ł", "Ḿ", 
"Ṋ", "Ø", "₱", "Ƣ", "Ṟ", "Ƨ", "Θ", "Ű", "Ṽ", "Ẉ", "Ϫ", "Ȳ", "Ƶ",

"Œ", "Ƌ", "Å"
};

#define BLACKSIZE 35
const string blacklist[BLACKSIZE] = {
"hers",		"they",		"them",		"mine",		"ours",
"whom",		"when",		"that",		"from",		"with",
"this",		"were",		"will",		"been",		"have",
"what",		"your",		"into",
						"which",	"where",
"would",	"could",	"these",	"there",	"about",
"might",	"their",
				"theirs",	"should",	"didn't"
"of the",	"in the",	"for the",	"to the",	"with the"
};

struct candidate{
  string s;
  int words;
  int goodness;
};

static int useblacklist = 1;
static unsigned int nabbrs = 10;
static int maxwords = 1;
static int verbose = 0; // output detailed progress information?
static int quiet = 0; // output only errors?
static int keeporig = 0; // keep original files?
static int tostdout = 0; // write to stdout?
static int overwrite = 0; // overwrite existing files?

static void handle_cmdline(vector<string> & filenames, int argc, char ** argv)
{
  int done = 0;
  const char * opts = "fcqvkl:n:ah?";
  while(!done){
    char c;
    switch(c = getopt(argc, argv, opts))
    {
      case -1:
        done = 1;
        break;
      case 'f':
        overwrite = 1;
        break;
      case 'c':
        keeporig = 1;
        tostdout = 1;
        break;
      case 'k':
        keeporig = 1;
        break;
      case 'v':
        verbose = 1;
        break;
      case 'q':
        quiet = 1;
        break;
      case 'a':
        useblacklist = 0;
        break;
      case 'l':
        maxwords = atoi(optarg);
        if(maxwords < 1){
          cerr << "Max words can't be less than one.\n";
          exit(1);
        }
        if(maxwords > 256) cerr << "Max words is very large! Are you sure?\n";
        break;
      case 'n':
        nabbrs = atoi(optarg);
        if(nabbrs > TABLESIZE){
          cerr << "Can only handle " << TABLESIZE << " abbreviations\n";
          nabbrs = TABLESIZE;
        }
        if(nabbrs < 1){
          cerr << "Invalid number of abbreviations\n";
          exit(1);
        }
        break;
      case 'h':
      case '?':
      default:
        cerr << "humanzip v" << HUMANVERSION << " © 2007 Matthew Strait\n"
        << "humanzip comes with ABSOLUTELY NO WARRANTY. This is free software\n"
        << "and you may redistribute it under the terms of the GPLv2.\n"
        << "\n"
        << "Basic syntax: humanzip file1 [file2]\n"
        << "\n"
        << "If no file is specified, humanzip reads from stdin.\n"
        << "The compressed version is written to, e.g., file1.hz\n"
        << "or stdout if the input was stdin. Options:\n"
        << "\n"
        << "-a\tDo not use the blacklist of common English words and phrases.\n"
        << "-n #\tUse at most # abbreviations, max "<< TABLESIZE
        << ", default " << nabbrs << "." << endl
        << "-l #\tLook for abbreviations of at most # words, default " 
        << maxwords << "." << endl	
        << "-k\tKeep the uncompressed files instead of deleting them.\n"
        << "-c\tSend output to stdout and keep uncompressed files.\n"
        << "-f\tForce overwriting of existing output files.\n"
        << "-v\tBe verbose.\n"
        << "-q\tBe quiet.  Output only errors.\n"
        << "-h\tPrint this help and exit.\n"
        << "\n";
        exit(1);
        break;
    }
  }

  for(int i = optind; i < argc; i++)
    filenames.push_back(argv[i]);

  if(filenames.size() == 0)
    filenames.push_back("/dev/stdin");
}

// Allows for a sort by alphabet (doesn't matter which way)
// and then by position in the file (does matter than it's accending)
static int pf(phrase a, phrase b)
{
  int foo = a.s.compare(b.s);

  if(foo < 0) return 0;
  else if(foo > 0) return 1;
  else return(a.pos < b.pos);
}

static int checkascii(const string & s)
{
  for(unsigned int i = 0; i < s.size(); i++)
    if(s[i] < 0)
      return 0;
  return 1;
}

// We want to catch words that appear in the document in both their signular
// and plural forms and recognize them as the same word.
static void handleplural(string & word, string & delims,
  const vector<string> & dictionary)
{
  // Word of 1 or 2 letters can't be plurals ending in 's' or "es".
  if(word.size() < 3) return;

  // If it doesn't end in 's', it's too hard to handle.
  if(word[word.size()-1] != 's') return;

  // if the word isn't in the dictionary, forget about it.
  if(!binary_search(dictionary.begin(), dictionary.end(), word)) return;

  // if taking the 's' off the end yields another valid word, assume that
  // it's a plural
  string wordwithouts = word.substr(0, word.size()-1);
    if(binary_search(dictionary.begin(), dictionary.end(), wordwithouts)){
    word.erase(word.size()-1);
    delims = 's' + delims;
    return;
  }

  // if that didn't work and it doesn't end in "es", it's too hard to handle.
  if(word[word.size()-2] != 'e') return;

  // if taking the "es" off the end yields another word, assume plural
  string wordwithoutes = word.substr(0, word.size()-2);
  if(binary_search(dictionary.begin(), dictionary.end(), wordwithoutes)){
    word.erase(word.size()-1);
    word.erase(word.size()-1);
    delims = "es" + delims;
    return;
  }
}

/* reads file into two vectors, one of words and one of delimeters.
For instance a file with the content "hello, this is a test." would produce
words={"hello",	"this",	"is",	"a",	"test"}
delims={"", 	", ",	" ",	" ",	" ",	"."}
delims *always* has one more element than words.
Returns number of characters read. */
int readfile(vector<string> & words, vector<string> & delims,
 const string & filename)
{
  int charsin = 0;

  ifstream infile(filename.c_str());
  if(!infile.is_open()){
    cerr << "Couldn't open " << filename << " skipping.\n";
    return -1;
  }

  vector<string> dictionary;
  ifstream dict("/usr/share/dict/words");
  if(!dict.is_open()){
    cerr<<"Couldn't open /usr/share/dict/words. Won't try to handle plurals.\n";
  }
  else{
    string temp;
    while(dict >> temp)
      dictionary.push_back(temp);
    dict.close();
  }

  string trailingdelims;
  while(1){
    string word, leadingspace;
    char c;

    // First get whitespace and punctuation, if any.
    while(isspace(infile.peek()) || ispunct(infile.peek())){
      infile.get(c);
      leadingspace += c;
    }

    // store this along with any puctuation from the end of the last word
    delims.push_back(trailingdelims + leadingspace);
    charsin += (trailingdelims + leadingspace).size();

    // Then get non-delims with possible trailing punctuation.
    // If there aren't any, we're done.
    if(!(infile >> word)) break;

    if(!checkascii(word)){
      cerr << "In " << filename << ", got non-ascii string " << word 
           << ". Cannot operate on non-ascii, skipping file.\n";
      return -1;
    }

    //now strip off and store the punctuation at the end of the word
    trailingdelims = "";
    while(ispunct(word[word.size()-1])){
      trailingdelims = word[word.size()-1] + trailingdelims;
      word.erase(word.size()-1);
    }

    //now figure out if words that end in "s" or "es" are plural and, if
    //so, move the "s"/"es" to the delims.
    handleplural(word, trailingdelims, dictionary);

    words.push_back(word);
    charsin += word.size();
  }

  infile.close();
  return charsin;
}

// strings are sorted by position when they get here. Returns number of times
// that the phrase in question appears not overlapping.  e.g. if the phrase
// is "happy happy" and the file contains "happy happy happy happy happy", this 
// returns 2.
static int nnonoverlapping(const vector<phrase> & phrases, int first, int last)
{
  int length = phrases[first].words;
  int count = 1; // first one is obviously ok
  unsigned int firstvalidposition = phrases[first].pos + length;
  for(int i = first+1; i <= last; i++){
    if(phrases[i].pos >= firstvalidposition){
      count++;
      firstvalidposition = phrases[i].pos + length;
    }
  }
  return count;
}

// Take a vector of strings and make it into a vector of ordered pairs
// of unique strings and number of times that string occured in the input.
static void sortuniqc(vector<candidate> & cands, vector<phrase> & p)
{
  sort(p.begin(), p.end(), pf); // this is the slow part

  for(unsigned int i = 0; i < p.size();){
    candidate sct;
    sct.s = p[i].s;
    sct.words = p[i].words;

    int first = i;
    do{
      i++;
    }while(i < p.size() && sct.s == p[i].s);

    sct.goodness = nnonoverlapping(p, first, i-1); // goodness now = # reps
    if(sct.goodness > 1)
      cands.push_back(sct); // make very loose cut
  }
}

// Sanitizes strings by converting line breaks into paragraph symbols.
static string convertlinebreaks(const string & s)
{
  string answer;
  for(unsigned int i = 0; i < s.size(); i++){
    if(s[i] == '\n')      answer += "¶";
    else if(s[i] == '\r') answer += "⁋";
    else                  answer += s[i];
  }
  return answer;
}

// Assign values to how good a candidate replacement is
static void weight(candidate & cand)
{
  // how many characters will be saved?
  cand.goodness *= (cand.s.size()-1);

  // subtract from this the space used by the key
  // this approximately finds space usage by assuming 80 character columns
  cand.goodness -= ((6 + cand.s.size())/80 + 1)*80;

  // If the string to be replaced is less than 4 characters long
  // do not use it.
  if(cand.s.size() < 4){
    cand.goodness = 0;
    return;
  }

  // If the string is on the black list, do not use it.
  if(useblacklist){
    for(int j = 0; j < BLACKSIZE; j++){
      if(cand.s == blacklist[j]){
        cand.goodness = 0;
        break;
      }
    }
  }
  //cerr<<"Tested: "<<cand.goodness<< "\t\""<<convertlinebreaks(cand.s) << "\"\n";
}

// The idea is to try longer strings first.  Otherwise if one replacement
// is a subset of another, you end up not getting to use the longer one.
static int rtf(searchreplace a, searchreplace b)
{
  return(a.words > b.words);
}

// Given the text of the file, a starting position and a number of words,
// returns a string consisting of the words starting at that position, 
// including the delimiters between them
string yoink(const vector<string> & words, const vector<string> & delims,
  unsigned int pos, unsigned int len)
{
  string answer;

  if(pos + len - 1 >= words.size()) { cerr << "Bad yoink!"; return ""; }

  for(unsigned int i = 0; i < len; i++){
    answer += words[pos+i];
    if(i != len-1) answer += delims[pos+i+1];
  }

  //cerr << "Yoinked: " << answer << endl;

  return answer;
}

static int doreplacement(ofstream & outfile, vector<searchreplace> & rtab, 
  const vector<string> & words, const vector<string> & delims)
{
  string s;
  int charsout = 0;

  sort(rtab.begin(), rtab.end(), rtf);

  if(verbose){
    for(unsigned int i = 0; i < rtab.size(); i++)
      cerr << "\nSearching for: " << rtab[i].search << " " << rtab[i].timesused;
    cerr << endl;
  }

  outfile << delims[0]; // always exists, never part of a replacement
  charsout += delims[0].size();

  for(unsigned int w = 0; w < words.size();){
    int gotit = 0;
    for(unsigned int r = 0; r < rtab.size(); r++){
      // can't match because the search string is longer than the remaining text
      if(rtab[r].words > words.size() - w) continue;
      
      string text = yoink(words, delims, w, rtab[r].words);
      
      if(text == rtab[r].search){
        outfile << convertlinebreaks(rtab[r].replacelower);
        gotit = 1;
        w += rtab[r].words;
        rtab[r].timesused++;
        break;
      }
      if(lowerfirstletter(text) == rtab[r].search){
        outfile << convertlinebreaks(rtab[r].replaceupper);
        gotit = 1;
        w += rtab[r].words;
        rtab[r].timesused++;
        break;
      }
    }
    if(!gotit){ 
      outfile << words[w];
      charsout += words[w].size();
      w++;
    }
    else
      charsout++;

    outfile << delims[w];
    charsout += delims[w].size();
  }

  return charsout;
}

static int finalcheckandprintkey(ofstream & outfile, vector<searchreplace> & rt)
{
  int charsout = 13; // ĦŨӍĄŅŽȈƤǷΈÐ\n\n
  outfile << "ĦŨӍĄŅŽȈƤǷΈÐ\n"; //used as a magic number for trivially zipped files
  for(vector<searchreplace>::iterator i = rt.begin(); i != rt.end(); i++){
    if((*i).timesused > 1){
      // The key looks nicer with an upper case first letter
      outfile << (*i).replacelower << "/" << (*i).replaceupper << " - " 
              << raisefirstletter(convertlinebreaks((*i).search)) << "\n";
      charsout += (*i).search.size() + 7;
    }
    else{
      if(verbose) cerr << "Eliminated: " << (*i).search << endl;
      rt.erase(i);
      i--;
    }
  }
  outfile << endl; // end marker.  Also looks nice.
  return charsout;
}

// cands is sorted to put the best replacements at the front
static void makereplacementtable(vector<searchreplace> & replacementtable,
  const vector<candidate> & cands)
{
  searchreplace temp;
  temp.timesused = 0; // set when the replacement is used

  vector<bool> utftableusage(TABLESIZE, 0);

  for(unsigned int i=0; replacementtable.size()<nabbrs && i<cands.size(); i++){
    if(cands[i].goodness < 1) break;

    temp.search = cands[i].s;
    temp.words  = cands[i].words;

    // Choose the abbreviation such that it is mnumonic, if possible.
    // First try to make the abbreviation match the first letter, then
    // the second, then punt.  There are two tables for each letter.
    int gotit = 0;
    int choice[4] = { temp.search[0] - 'a', temp.search[0] - 'a' + 26,
                      temp.search[1] - 'a', temp.search[1] - 'a' + 26 };

    for(int c = 0; c < 4; c++){
      if(choice[c] < 0 || choice[c] >= TABLESIZE)
        continue; // typically happens if temp.search[0|1] isn't a letter
      if(!utftableusage[choice[c]]){
        temp.replacelower = utftable[choice[c]];
        temp.replaceupper = UTFtable[choice[c]];
        utftableusage[choice[c]] = 1;
        gotit = 1;
        break;
      }
    }

    if(!gotit){
      int lastavailable = TABLESIZE-1;
      while(utftableusage[lastavailable]){
        lastavailable--;
        if(lastavailable < 0){ cerr << "NOT REACHED: table exhausted\n";}
      }
      temp.replacelower = utftable[lastavailable];
      temp.replaceupper = UTFtable[lastavailable];
      utftableusage[lastavailable] = 1;
    }

    replacementtable.push_back(temp);
  }
}

static void lowerfirstletter(vector<phrase> & phrases)
{
  for(unsigned int i = 0; i < phrases.size(); i++)
    if(phrases[i].s.size() > 0)
      phrases[i].s[0] = tolower(phrases[i].s[0]);
}

// rip off of bzip2 output
static void printstats(float charsin, float charsout, const string & filename)
{
  if(quiet) return;

  cerr << "  " << filename << ":\t" << setprecision(3)
       << charsin/charsout << ":1,  " << (1-charsout/charsin)*100
       << "% saved, " << int(charsin) << " in, " << int(charsout) << " out.\n";
}

vector<phrase> makephrases(const vector<string> & words, 
  const vector<string> & delims, int len)
{
  vector<phrase> phrases;
  int nwords = words.size();

  for(int iword = 0; iword < nwords-len+1; iword++){
    phrase ph;
    ph.words = len;
    ph.pos = iword;
    for(int word = 0; word < len; word++){
      ph.s += words[iword+word];
      if(word != len-1) ph.s += delims[iword+word+1];
    }
    phrases.push_back(ph);
  }
  return phrases;
}

static int wcf(candidate a, candidate b)
{
  return(a.goodness > b.goodness);
}


static int writecompressed(vector<searchreplace> & replacementtable,
  const vector<string> & words, const vector<string> & delims, 
  const string & filename)
{
  if(verbose) cerr << "\nOutputting compressed text...";

  string outfilename;
  if(filename == "/dev/stdin" || tostdout) outfilename = "/dev/stdout";
  else outfilename = filename + ".hz";

  ofstream outfile(outfilename.c_str());
  if(!outfile.is_open()){
    cerr << "Couldn't open " << outfilename << " for writing!\n";
    return -1;
  }

  ofstream devnull("/dev/null");
  if(!devnull.is_open()){
    cerr << "Couldn't open /dev/null for writing!\n";
    return -1;
  }

  // dry run to eliminate unused abbreviations
  if(verbose) cerr << "dry run:\n";
  doreplacement(devnull, replacementtable, words, delims);

  int charsout = finalcheckandprintkey(outfile, replacementtable);
  if(verbose) cerr << "Ok, for real this time:\n";
  charsout += doreplacement(outfile, replacementtable, words, delims);
  outfile.close();

  if(!keeporig){
    if(verbose) cerr << "Deleting " << filename << endl;
    unlink(filename.c_str());
  }
  if(verbose) cerr << "\n";
  return charsout;
}

// Checks if we're ok with writing out the compressed file: returns 1 on 
// sucess, 0 on failure. As a side effect, this creates an empty output file.
static int checkoutfile(const string & filename)
{
  if(tostdout || filename == "/dev/stdin")
    return 1; // stdout had better always be ok

  string outfilename = filename + ".hz";

  // if the file already exists and we're not willing to ovewrite it, fail
  struct stat buffer;
  if(!overwrite && (-1 != stat(outfilename.c_str(), &buffer))){
    cerr << "Output file " << outfilename << " already exists, skipping.\n";
    return 0;
  }

  // if the file does not exist, or it does and we're willing to overwrite
  // it, check if we can actually write to it
  ofstream outfile(outfilename.c_str());
  if(!outfile.is_open()){
    cerr << "Couldn't open " << outfilename << " for writing!\n";
    return 0;
  }
  else{
    outfile.close();
    return 1;
  }
}

// Returns 1 on failure, 0 on sucess
static int doit(const string & filename)
{
  vector<string> words, delims;
  vector<searchreplace> replacementtable;
  int charsin = 0, charsout = 0;

  if(verbose) cerr << "Reading file...";
  if(-1 == (charsin = readfile(words, delims, filename))) return 1;

  if(!checkoutfile(filename)) return 1;

  if(verbose) cerr << "\nMaking and judging abbreviations";
  vector<candidate> cands; // master candidates
  for(int len = 1; len <= maxwords; len++){
    vector<phrase> phrases = makephrases(words, delims, len);
    vector<candidate> candcands; // temporary candidates
    lowerfirstletter(phrases);
    sortuniqc(candcands, phrases);
    for(unsigned int i = 0; i < candcands.size(); i++){
      weight(candcands[i]); // not too bad
      if(candcands[i].goodness > 0) // keep only vaugly reasonable ones
        cands.push_back(candcands[i]);
    }
    cerr << ".";
  }

  // sort master array of candidates
  if(verbose) cerr << "\nSorting candidates...";
  sort(cands.begin(), cands.end(), wcf); // slow, but not as bad as the above

  if(verbose) cerr << "\nMaking replacement table...";
  makereplacementtable(replacementtable, cands); // fast

  if(-1 == (charsout = writecompressed(replacementtable,words,delims,filename)))
    return 1;
  printstats(charsin, charsout, filename);
  return 0;
}

int main(int argc, char ** argv)
{
  vector<string> filenames;
  handle_cmdline(filenames, argc, argv);

  int nfailed = 0;

  for(unsigned int i = 0; i < filenames.size(); i++)
    nfailed += doit(filenames[i]);

  return nfailed;
}
