#! /usr/bin/ruby -wEUTF-8:UTF-8

# Copyright (C) 2012 Charles Atkinson
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA

# Purpose: copies files of configured types from the configured source 
#   directory tree(s) to the configured collation directory, preserving 
#   relative path names, hardlinking any duplicates, setting the mtime to
#   the earliest of the duplicates.

# Object call tree
#    +
#    |
#    +-- Initialise
#    |   |
#    |   +-- InitialiseParameters
#    |   |
#    |   +-- ParseConfigFile (library method)
#    |   |
#    |   +-- ParseCommandLine
#    |   |   |
#    |   |   +-- Usage
#    |   |
#    |   +-- NormaliseParameters
#    |   |
#    |   +-- LogParameters
#    |   |
#    |   +-- CheckParameters
#    |   |
#    |   +-- ConnectToDB
#    |   |
#    |   +-- CreateTables
#    |
#    +-- CollateFiles
#    |   |
#    |   +-- ProcessFile
#    |
#    +-- Finalise

require 'English'
require 'getoptlong'
require 'open3'
require 'pg'

require './CollatedFile'
require './Log'
require './SourceFile'
require './ruby_db_lib'
require './ruby_lib'


# Method definitions
# ~~~~~~~~~~~~~~~~~~
# (in alphabetical order)

# Finds the files to collate by running the OS find command and passes each
# path found to ProcessFile.
#
# Reads from $parameters:
#   "SourceRootDirs"      - directory trees to search
#   "SourcesToExclude"    - directory patterns to prune (find's -iwholename)
#   "ExtensionsToInclude" - file name suffixes to match (find's -iname)
#
# Every line find writes to stderr is logged at ERROR level.
def CollateFiles( )

  find_command = BuildFindCommand( )
  if find_command == nil then return end

  $log.write( Log::DEBUG, "find_command: #{find_command }" )

  # Run the OS find command.  The block form of popen3 closes all the pipes
  # and waits for find to terminate when the block is left, however it is left.
  Open3.popen3( find_command ) do | stdin, stdout, stderr, _wait_thread |
    # find reads nothing from stdin
    stdin.close

    # Bytes read from each stream that do not yet form a complete record.
    # String.new gives binary (ASCII-8BIT) strings, which is what
    # read_nonblock returns.
    pending = { stdout => String.new, stderr => String.new }
    open_streams = [ stderr, stdout ]

    # Service whichever stream has data so find can never block on a full
    # stderr pipe while this script is waiting on stdout (or vice versa)
    until open_streams.empty?
      readable, = IO.select( open_streams )
      readable.each do | stream |
        at_eof = false
        begin
          pending[ stream ] << stream.read_nonblock( 65536 )
        rescue IO::WaitReadable
          next
        rescue EOFError
          at_eof = true
          open_streams.delete( stream )
        end

        if stream.equal?( stdout )
          # -print0 output: each path is terminated by a NUL byte
          TakeFindRecords( pending[ stream ], "\0", at_eof ).each do | record |
            path = record.chomp( "\0" )
            # Strings read from the pipe are ASCII-8BIT (binary).  Set the
            # encoding as UTF-8 (which Ruby's Find.find does) so the path
            # can be used!
            if path != "" then ProcessFile( path.force_encoding( "UTF-8" ) ) end
          end
        else
          # Error messages are logged one line at a time (newline included)
          TakeFindRecords( pending[ stream ], "\n", at_eof ).each do | line |
            $log.write( Log::ERROR, line.force_encoding( "UTF-8" ) )
          end
        end
      end
    end
  end

end


# Builds the shell command line for the OS find command from $parameters.
# Returns the command string, or nil (after logging an ERROR) when there are
# no source directories or no extensions to search for.
def BuildFindCommand( )
  source_dirs = $parameters[ "SourceRootDirs" ]
  excludes = $parameters[ "SourcesToExclude" ]
  extensions = $parameters[ "ExtensionsToInclude" ]

  # Without any starting directory GNU find would search the current
  # directory; without any extension the expression would be malformed
  if source_dirs.empty?
    $log.write( Log::ERROR, "No source root directories configured; nothing to collate" )
    return nil
  end
  if extensions.empty?
    $log.write( Log::ERROR, "No file name extensions configured; nothing to collate" )
    return nil
  end

  words = [ "find" ]
  source_dirs.each { | dir | words << ShellEscape( dir ) }

  if ! excludes.empty?
    prune_tests = excludes.map { | dir | "-iwholename #{ ShellEscape( dir ) }" }
    words << "\\( -type d \\( #{ prune_tests.join( ' -o ' ) } \\) -prune \\) -o"
  end

  name_tests = extensions.map do | ext |
    # The pattern is single-quoted for the shell, so any single quote inside
    # it is written as '\'' (block form: "\'" is special in a gsub string)
    quoted_ext = ext.to_s.gsub( "'" ) { "'\\''" }
    "-iname '*#{ quoted_ext }'"
  end
  words << "-type f \\( #{ name_tests.join( ' -o ' ) } \\) -print0"

  return words.join( " " )
end


# Removes and returns, as an array, every complete record at the start of
# buffer.  A record ends with delimiter, which is kept on the record.
# When flush is true any unterminated remainder is returned as a final record.
def TakeFindRecords( buffer, delimiter, flush )
  records = []
  while ( cut = buffer.index( delimiter ) )
    records << buffer.slice!( 0, cut + 1 )
  end
  if flush && ! buffer.empty?
    records << buffer.slice!( 0, buffer.length )
  end
  return records
end


# Logs the end-of-run summary, closes the database connection (if one was
# opened) and the log, then terminates the program.
#
# exitcode - exit status requested by the caller.  A requested status of 0
#            becomes 1 when the log has counted any warnings or errors.
# msg      - optional message; only the first one given is used and it is
#            logged at INFO level before anything else.
#
# Does not return: it always ends by calling exit.
def Finalise( exitcode, *msg )
  # TODO: might be nice to have a quiet option, eg for use with command line errors

  # Log any optional message
  $log.write( Log::INFO, msg.first ) unless msg.empty?

  # Close the database connection
  if $databaseConnected
    $log.write( Log::INFO, "Closing database connection" )
    $conn.close
  end

  # Final logging: what changed in the database ...
  db_summary = [
    "Database record changes:",
    "  Collated files created: #{ $num_collated_files_created }",
    "  Collated file mtime adjustments: #{ $num_mtime_adjustments }",
    "  Collated paths created: #{ $num_collated_paths_created }",
    "  Source files created: #{ $num_source_files_created }"
  ].join( "\n" )
  $log.write( Log::INFO, db_summary )

  # ... and what changed in the collation directory tree
  tree_summary = [
    "Collation directory tree changes:",
    "  Paths created by copying: #{ $num_files_copied }",
    "  Paths created by linking: #{ $num_collated_paths_linked }"
  ].join( "\n" )
  $log.write( Log::INFO, tree_summary )

  # Report warnings; any warning turns a zero exit status into 1
  warning_count = $log.n_warnings
  if warning_count > 0
    warning_text = \
      if warning_count == 1
        "There was one warning"
      else
        "There were #{ warning_count } warnings"
      end
    $log.write( Log::WARN, warning_text )
    exitcode = 1 if exitcode == 0
  end

  # Report errors; any error turns a zero exit status into 1
  error_count = $log.n_errors
  if error_count > 0
    error_text = \
      if error_count == 1
        "There was one error"
      else
        "There were #{ error_count } errors"
      end
    $log.write( Log::ERROR, error_text )
    exitcode = 1 if exitcode == 0
  end

  $log.write( Log::INFO, "#{ File.basename( $0 ) }: exiting with exitcode #{ exitcode }" )
  $log.close

  # Bye!
  exit exitcode
end


# Prepares everything the rest of the script needs, in this order:
# default parameters, config file, command line, logging, the globals that
# Finalise reports, signal traps, parameter normalisation/logging/checking,
# and finally the database connection and tables.
#
# Sets the globals $log, $databaseConnected and the $num_* counters.
# Problems with the command line, the config file, the log file or the
# parameters are reported through $log; according to the note below,
# logging a FATAL message calls Finalise.
def Initialise
  # Disable common traps until logging and initialisation required for Finalise( )
  # is completed
  trap( "INT" ) { }
  trap( "HUP" ) { }
  trap( "QUIT" ) { }
  trap( "TERM" ) { }

  # Set default parameters
  InitialiseParameters( )

  # Parse any config file
  # Must do now so config file settings can be overridden by command line
  x = ARGV.index( "--config" )
  if x != nil && ARGV[ x + 1 ] != nil
    config_file_error_msg = ParseConfigFile( ARGV[ x + 1 ], $parameters.keys )
  else
    config_file_error_msg = ''
  end

  # Parse command line
  # Must do now in case "--help" is given or there are any logging options
  # Save the options and arguments because GetoptLong in ParseCommandLine will empty ARGV :-(
  opts_and_args = ARGV.join( ' ' )
  cmd_line_error_msg = ParseCommandLine( )

  # Set up logging
  # If the log file cannot be opened, fall back to logging on stdout so the
  # failure can be reported through the log like every other start-up error
  now = Time.now.strftime( '%y-%m-%d@%H:%M:%S' )
  log_file_error_msg = ''
  log_fd = $stdout
  timestamps = false
  if $parameters[ "LogToFile" ]
    # TODO: pass path to $Log.new when it can accept
    begin
      log_fd = File.open( $parameters[ "LogPath" ], 'w' )
      timestamps = true
    rescue SystemCallError => e
      log_file_error_msg = "Unable to open log file #{ $parameters[ "LogPath" ] }: #{ e.message }"
    end
  end
  $log = Log.new( log_fd, $parameters[ "LogLevel" ], timestamps )

  # Initialisation required before Finalise can be called
  # (logging FATAL messages calls Finalise)
  $databaseConnected = false
  $num_collated_files_created = 0
  $num_collated_paths_created = 0
  $num_collated_paths_deleted = 0
  $num_collated_paths_linked = 0
  $num_files_copied = 0
  $num_mtime_adjustments = 0
  $num_source_files_created = 0

  # Report any command line, log file or config errors
  if cmd_line_error_msg != ''
    $log.write( Log::ERROR, cmd_line_error_msg )
    Usage( "not verbose" )
    Finalise( 1 )
  end
  if log_file_error_msg != ''
    $log.write( Log::FATAL, log_file_error_msg )
  end
  if config_file_error_msg != ''
    $log.write( Log::FATAL, config_file_error_msg )
  end

  # Set common traps
  trap( "HUP" ) { Finalise( 129, "Received signal HUP" ) }
  trap( "INT" ) { Finalise( 130, "Received signal INT" ) }
  trap( "QUIT" ) { Finalise( 131, "Received signal QUIT" ) }
  trap( "TERM" ) { Finalise( 143, "Received signal TERM" ) }

  # Log startup message
  $log.write( Log::INFO, \
    "#{ File.basename( $0 ) } started at #{ now } by " + \
    "#{ $0 } #{ opts_and_args }" \
  )

  # Normalise, log and check $parameters
  # The database password is masked while the parameters are logged; the
  # ensure guarantees the real password is put back even if logging raises
  NormaliseParameters( )
  db_pwd = $parameters[ "Database" ][ "password" ]
  $parameters[ "Database" ][ "password" ] = "<not logged>"
  begin
    LogParameters( )
  ensure
    $parameters[ "Database" ][ "password" ] = db_pwd
  end
  error_msg = CheckParameters( )
  if error_msg != ''
    $log.write( Log::FATAL, "Parameter error(s):" + error_msg )
  end

  # Initialisation for database actions
  # TODO: use at_exit to ensure connection dropped?
  ConnectToDB( )
  $databaseConnected = true
  CreateTables( )
end


# Parses the command line in ARGV, storing what it finds in $parameters:
#   --config   sets "ConfigFile"
#   --log      sets "LogToFile" to true and, if an argument is given, "LogPath"
#   --loglevel sets "LogLevel" from a single letter (D, I, W, E or F)
#   --help     prints verbose usage and exits with status 0
#
# Returns '' when the command line is valid, otherwise a message beginning
# "Command line error(s):" followed by one indented line for each problem.
def ParseCommandLine( )
  # Options that require an argument are marked OPTIONAL_ARGUMENT so this
  # script can handle missing arguments itself
  opts = GetoptLong.new(
    [ '--config', GetoptLong::OPTIONAL_ARGUMENT ],
    [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
    [ '--log', GetoptLong::OPTIONAL_ARGUMENT ],
    [ '--loglevel', GetoptLong::OPTIONAL_ARGUMENT ]
  )
  # GetoptLong's own errors (such as an unrecognised option) are rescued
  # below and returned to the caller, so stop it printing them to stderr too
  opts.quiet = true

  # Option arguments that may later be changed are duplicated
  # (strings in ARGV[ ] are frozen)
  error_msg = ''
  begin
    opts.each \
    do |opt, arg|
      case opt
        when "--config"
          if arg != ''
            $parameters[ "ConfigFile" ] = arg.dup
          else
            error_msg += "\n  '--config' argument missing"
          end
        when "--help"
          Usage( "verbose" )
          exit( 0 )
        when "--log"
          $parameters[ "LogToFile" ] = true
          if arg != ''
            $parameters[ "LogPath" ] = arg.dup
          end
        when "--loglevel"
          case arg
            when 'D'
              $parameters[ "LogLevel" ] = "DEBUG"
            when 'I'
              $parameters[ "LogLevel" ] = "INFO"
            when 'W'
              $parameters[ "LogLevel" ] = "WARN"
            when 'E'
              $parameters[ "LogLevel" ] = "ERROR"
            when 'F'
              $parameters[ "LogLevel" ] = "FATAL"
            when ''
              error_msg += "\n  '--loglevel' argument missing"
            else
              error_msg += "\n  Invalid '--loglevel' argument: '#{ arg }'"
          end
        else
          error_msg += "\n  Invalid option, '#{ opt }'"
      end
    end
  rescue GetoptLong::Error => e
    # Without this an unrecognised option would abort the script with an
    # uncaught exception instead of the usual error report and usage message
    error_msg += "\n  #{ e.message }"
  end
  if ARGV.length != 0
    error_msg += "\n  Invalid argument(s) after options and their arguments: '#{ ARGV.join( ' ' ) }'"
  end
  if error_msg != ''
    error_msg = "Command line error(s):" + error_msg
  end
  return error_msg
end
  

# Collates one source file.
#
# source_path - path of the source file, as reported by find
#
# Steps, in order:
#   1. Skip the file if it no longer exists, is smaller than
#      $parameters[ "MinimumFileSize" ] or has an invalidly encoded path.
#   2. Create the SourceFile and CollatedFile objects for it.
#   3. Derive the equivalent path under $parameters[ "CollationRootDir" ],
#      after removing $parameters[ "LeadingDirsToStrip" ] matches.
#   4. If the equivalent path does not exist, create it by hard linking to an
#      existing collated path for the same file or else by copying the source.
#   5. Record the new collated file and/or path in the database.
#   6. Give the collated file the source file's mtime if that is earlier.
#
# Updates the $num_* counters.  Returns nothing useful.
def ProcessFile( source_path )

  $log.write( Log::INFO, "Source path: " + source_path )

  # Ensure file exists
  # (File.exist? rather than File.exists?, which Ruby 3.2 removed)
  if ! File.exist?( source_path )
    $log.write( Log::WARN, "File does not exist" )
    return
  end

  # Ignore some files
  if File.size( source_path ) < $parameters[ "MinimumFileSize" ]
    $log.write( Log::INFO, "Ignored: smaller than MinimumFileSize" )
    return
  end
  if ! source_path.valid_encoding?
    $log.write( Log::ERROR, "Ignored: invalid encoding in path name #{ source_path }. Manual renaming may fix this problem" )
    return
  end

  # Instantiate a containing SourceFile and ensure in DB
  begin
    source_file = SourceFile.new( source_path )
  rescue => e
    $log.write( Log::ERROR, "SourceFile.new failed: #{ e.message }" )
    return
  end

  # Strip leading directories
  # (this is intended to be used to strip uninformative leading directories)
  stripped_path = "#{ source_path }"
  $parameters[ "LeadingDirsToStrip" ].each \
  do |regexp|
    stripped_path.sub!( regexp, '' )
  end
  stripped_path.sub!( %r|^/|, '' )    # Ensure no leading /

  # Generate equivalent path
  equivalent_path = $parameters[ "CollationRootDir" ] + stripped_path
  $log.write( Log::DEBUG, "Collated path: #{ equivalent_path }" )

  # Instantiate a containing CollatedFile
  begin
    collated_file = CollatedFile.new( source_file.md5, source_file.sha1, \
      source_file.path \
    )
  rescue => e
    $log.write( Log::ERROR, "CollatedFile.new failed: #{ e.message }" )
    return
  end

  # Fix file name extension if it does not match the MIME type
  if collated_file.mime_type =~ /^text\/rtf/ && ! ( equivalent_path =~ /\.rtf$/ )
    $log.write( Log::INFO, "Changing extension to match MIME type" )
    # Replace the extension of the final path component only; when that
    # component has no extension, append one.  (Dots in directory names
    # must not be mistaken for the start of the extension.)
    equivalent_path = equivalent_path.sub( %r{(\.[^./]*)?\z}, ".rtf" )
    collated_file.path = equivalent_path
  end

  # Ensure equivalent path exists and note required DB action(s)
  copy_source_file = false
  link_collated_file = false
  if File.exist?( equivalent_path )
    $log.write( Log::INFO, \
      "Equivalent path already exists: #{ equivalent_path }" \
    )
  else
    if collated_file.in_db
      # LookupCollatedPathByInode deletes from DB any paths that do not exist
      path_to_link = LookupCollatedPathByInode( collated_file.inode )
      if path_to_link != nil
        link_collated_file = true
      else
        copy_source_file = true
      end
    else
      copy_source_file = true
    end
  end

  insert_collated_file = false
  insert_collated_path = false
  if copy_source_file
    $log.write( Log::INFO, "Collated file does not exist; copying source file to collation as:" + \
      "\n  #{ equivalent_path }"
    )
    exitstatus, message = CopyFile( source_file.path, equivalent_path )
    if exitstatus == 0
      $num_files_copied += 1
      insert_collated_file = true
      insert_collated_path = true
    else
      $log.write( Log::FATAL, "Failed to copy file: #{ message }" )
    end
  elsif link_collated_file
    $log.write( Log::INFO, "A collated path already exists for this file; hard linking to it:" + \
      "\n  Existing path: #{ path_to_link }" + \
      "\n  New path: #{ equivalent_path }"
    )
    exitstatus, message = CreateHardLink( path_to_link, equivalent_path )
    if exitstatus == 0
      # Only count and record the link when it was actually created
      $num_collated_paths_linked += 1
      inode, unused = GetInodeAndMtime( equivalent_path )
      if inode == nil; return end
      insert_collated_path = true
    else
      $log.write( Log::FATAL, "Failed to create hard link: #{ message }" )
    end
  end

  # DB actions as required
  if insert_collated_file
    inode, mtime = GetInodeAndMtime( equivalent_path )
    if inode == nil; return end
    # Log the inode of the file just copied, not the collated_file's
    # previous (not yet updated) inode
    $log.write( Log::INFO, \
      "Creating DB record for collated file" + \
      " (inode #{ inode })" \
    )
    collated_file.inode = inode
    collated_file.mtime = mtime
    collated_file.insert_into_db
    $num_collated_files_created += 1
  end
  if insert_collated_path
    $log.write( Log::INFO, "Creating DB record for collated path #{ equivalent_path }" )
    InsertCollatedPathIntoDB( inode, equivalent_path )
    $num_collated_paths_created += 1
  end

  # Ensure collated file has earliest known mtime
  if ( source_file.mtime < collated_file.mtime )
    $log.write( Log::INFO, "Source file has earlier mtime than collated file; changing mtime on file and in DB:" + \
      "\n  Path: #{ equivalent_path }"
    )
    File.utime( source_file.mtime, source_file.mtime, equivalent_path )
    collated_file.mtime = source_file.mtime
    collated_file.update_db
    $num_mtime_adjustments += 1
  end
end


# Writes the usage message to the log at INFO level.
#
# verbosity - "verbose" adds a description of every option to the one-line
#             summary; any other value gives the summary only
#
# If $log has not been set up yet, a default log on stdout (no timestamps,
# level from $parameters[ "LogLevel" ]) is created first.
def Usage( verbosity )
  # If logging not set up, set up default logging
  # This is required when "--help" is given on the command line
  if $log.nil?
    log_fd = $stdout
    timestamps = false
    $log = Log.new( log_fd, $parameters[ "LogLevel" ], timestamps )
  end

  # Display usage
  $log.write( Log::INFO, "Usage: #{File.basename( $0 )} " + \
    "[--config config_file] [--help] [--log [log_file]] [--loglevel level]" \
  )
  if verbosity == "verbose"
    $log.write( Log::INFO, \
      "  --config: names the configuration file" + \
      "\n  --help: print this help message and exit" + \
      "\n  --log" + \
      "\n    log_file given: specify the log_file" + \
      "\n    log_file not given: log to the default log file" + \
      "\n  --loglevel: set lowest log level messages to log.  In order:" + \
      "\n    D for debug" + \
      "\n    I for information" + \
      "\n    W for warning" + \
      "\n    E for error" + \
      "\n    F for fatal" \
    )
  end
end


# Execute
# ~~~~~~~
# Program entry point.  The three calls must stay in this order:
# Initialise( ) creates $log, the counters and the database connection that
# the other two rely on; CollateFiles( ) then passes every file found to
# ProcessFile( ); Finalise( 0 ) logs the summary and exits the program (it
# changes the requested exit code 0 to 1 if any warnings or errors were logged).
Initialise( )
CollateFiles( )
Finalise( 0 )
