#!/usr/bin/perl -w
# Author:   $Author: merkosh $
# Revision: $Rev: 70 $
############################################################################
#    Copyright (C) 2005 by Uwe Mayer                                       #
#    merkosh@hadiko.de                                                     #
#                                                                          #
#    This program is free software; you can redistribute it and/or modify  #
#    it under the terms of the GNU General Public License as published by  #
#    the Free Software Foundation; either version 2 of the License, or     #
#    (at your option) any later version.                                   #
#                                                                          #
#    This program is distributed in the hope that it will be useful,       #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of        #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         #
#    GNU General Public License for more details.                          #
#                                                                          #
#    You should have received a copy of the GNU General Public License     #
#    along with this program; if not, write to the                         #
#    Free Software Foundation, Inc.,                                       #
#    59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             #
############################################################################

#---------------------------------------------------------#
# scan IMDB for a number of fields on a movie title       #
#---------------------------------------------------------#

#-- imports --------------------------------------------------------------------
use URI::Escape;
use LWP::UserAgent;
use HTTP::Request;
use HTML::Entities;
use File::Basename;

use LMCTools;

#-- display help screen --------------------------------------------------------
# Show the usage text when no argument was given or when one of the
# arguments is exactly "-h" or "--help".  The pattern is anchored so that a
# title which merely contains "-h" (e.g. "Die-hard") is searched for instead
# of triggering the help screen.
if (!@ARGV || grep { /\A(?:--help|-h)\z/ } @ARGV) {
  # the text goes to STDERR so that STDOUT stays reserved for result records
  print STDERR <<HELP;
IMDB-en.pl  \$Rev\$  (c)  2005-01-04  by Uwe Mayer

Search us.imdb.com for descriptive information on a movie title.

Synopsis: IMDB-en.pl [-h|--help] <title>|<URL>

     -h     --help     this screen
     <title>           search for <title> on imdb and return either the
                       the information or a list of matches
     <URL>             get information from this URL

The URL is distinguished from the title by the prefix \'http://\'. If
your title happens to have this prefix you\'re busted. ;)

If your internet connection needs a proxy server set the environment
variable "http_proxy" to the appropriate url 
(i.e. http_proxy=http://proxy.somehost.com:8080)

HELP
  # exit() without an argument ends the script with status 0
  exit();

}


#-- helper functions -----------------------------------------------------------

#--
# given a comment line, returns a reference to a list of properly formatted "aka <name>" strings
sub unfiddleAka($){
  # Purpose: collect every alias that follows an "aka <em>" marker in $text.
  # Param:   $text - comment text of a search hit (may be undef or empty)
  # Returns: reference to an array of "aka <name>" strings, in order of
  #          appearance; the array is empty when no alias is present.
  # Note:    the ($) prototype is kept unchanged for existing callers.
  my ($text) = @_;
  my @result = ();
  return \@result unless (defined $text);

  while ((my $pos = index($text, "aka <em>")) != -1){
    # step one character past the start of the marker so that the next
    # index() call finds the following marker and not this one again
    $text = substr($text, $pos+1);
    # the alias is the first quoted string that is directly followed by a
    # tag; only push when the match succeeded, otherwise $1 would still
    # hold the value of an earlier match (or be undefined)
    push @result, "aka $1" if ($text =~ /\"(.+?)\"\</);
  }
  return \@result;
}



#-- parse arguments ------------------------------------------------------------
# The first command line argument is either a URL (recognised by a leading
# 'http://') or a movie title; exactly one of $URL / $title receives it,
# the other one stays an empty string.
my $argument = $ARGV[0];
my ($title, $URL) = (index($argument, 'http://') == 0)
  ? ("", $argument)
  : ($argument, "");


#-- scan imdb ------------------------------------------------------------------
# host that relative result links are resolved against, and the query URL
# to which the URI-escaped title is appended
our $baseURL = 'http://us.imdb.com';
our $searchURL = $baseURL . '/Tsearch?title=';


#-- title ----------------------------------------------------------------------
if ($title) {
  # Search for $title and print the hits of every result section as
  # "status: list" records, then exit.  When the result page contains no
  # parsable hit, the search URL is stored in $URL instead so that the URL
  # branch further down tries to read the page as a details page.
  # getPage() is not defined in this file (presumably LMCTools); it is used
  # here as returning an object with a content() method.

  #-- get list of titles
  my $page = getPage($searchURL.uri_escape($title))->content();

  my @sections = ("Popular Titles", "Titles (Exact Matches)",
                  "Titles (Partial Matches)");
  my %match = ();            # section name => list of {URL, Text, Comment}
  my $totalResults = 0;

  #-- parse search (possible) section
  foreach my $sec (@sections){
    $match{$sec} = [];

    # every section header is looked up from the start of the page; the
    # offsets used while walking the entries below are kept in their own
    # variable so they cannot leak into the lookup of the next section
    my $secPos = index($page, $sec);
    next if ($secPos == -1);
    my $text = substr($page, $secPos);

    # the header announces the number of entries; without it there is no
    # trustworthy count, so the section is left empty instead of looping
    # over whatever an earlier match left behind in $1
    next unless ($text =~ /Displaying (\d+) Result/);
    my $count = $1;

    for (my $i = 0; $i < $count; $i++){
      # stop when fewer entries than announced are present; otherwise the
      # same text would be parsed again and again
      my $itemPos = index($text, "<li>");
      last if ($itemPos == -1);
      $text = substr($text, $itemPos+1);

      # captures: link, title, "(year)" and the trailing comment text.
      # The pattern is unanchored, so once it fails it also fails on every
      # shorter remainder of $text: give up on this section.
      last unless ($text =~ /href\=\"(\/title\/.+?\/)\?.+?\"\>(.+?)\<\/a\> +(\(.+?\))(.*?)\<\/li\>/);
      my ($link, $name, $year, $rest) = ($1, $2, $3, $4);

      push @{$match{$sec}}, {URL => $link,
                             Text => decode_entities("$name $year"),
                             Comment => &unfiddleAka(decode_entities($rest))};
      $totalResults++;
    }
  }


  #-- output Results
  if ($totalResults == 0){
    # try to interpret page as page with details
    $URL = $searchURL.uri_escape($title);
  } else {
    print "status: list\n";

    foreach my $sec (@sections){
      # the section line is printed even when the section has no entries
      print "section: $sec\n";

      foreach my $rec (@{$match{$sec}}){
        print "title: $rec->{Text}\n";
        print "url: $baseURL$rec->{URL}\n";
        foreach my $desc (@{$rec->{Comment}}){
          print "description: $desc\n";
        }
        print "\n";
      }
    }
    exit();
  }
}


#-- URL ------------------------------------------------------------------------
if ($URL){
  # Fetch the details page at $URL, extract the known fields into %data and
  # print them as "status: details" records.  Nothing is printed when no
  # field besides the url could be extracted.
  #
  # getPage(), find() and strip() are not defined in this file (presumably
  # LMCTools).  NOTE(review): from the way it is used here, find() appears
  # to return the text starting at the given marker, or a false value when
  # the marker is missing; the optional third argument presumably skips
  # past the marker - confirm against LMCTools.  The calls keep their
  # original '&' form because possible prototypes are not visible here.
  #
  # Every field is only stored when its pattern really matched: after a
  # failed match $1 still holds the capture of an earlier successful match.

  #-- get main page
  my $response = getPage($URL);
  my $page = $response->content();
  # when querying for a list and receiving details the url
  # of the page has changed: update this
  $URL = $response->base();

  # fields we are looking out for:
  my %data = ();
  # - originalTitle
  # - year
  # - picture
  # - director
  # - category
  # - actors
  # - rating
  # - url
  # - length
  # - country
  # - language
  # - description
  # - comments

  # the following are not available
  # - translated title
  # - producer

  # original title
  my $tmp = &find($page, '<strong class="title">');
  $data{'originalTitle'} = $1 if ($tmp =~ /\>([^\<]+) *\</);

  # year
  $tmp = &find($tmp, '<small>');
  $data{year} = $1 if ($tmp =~ /href\=.+?\>(\d+)\</);

  # picture
  $tmp = &find($page, ' alt="cover" ');
  # if image found and its source address can be read
  if ($tmp && $tmp =~ /src=\"(.+?)\"/){
    my $imageURL = $1;
    # keep the file name extension of the image (if any) for the local copy
    my $suffix = ($imageURL =~ /(\..{0,3})$/) ? $1 : '';
    # pick the first /tmp/temp_NNNNN<suffix> name that is not in use yet
    my $count = 0;
    my $tempfile;
    do {
      $tempfile = sprintf "/tmp/temp_%05d%s", $count++, $suffix;
    } while (-e $tempfile);
    # the picture is only reported when it was really written to disk
    if (open(my $fh, '>', $tempfile)){
      binmode $fh;               # image data must not be translated
      print {$fh} getPage($imageURL)->content();
      if (close $fh){
        $data{picture} = $tempfile;
      } else {
        warn "could not finish writing $tempfile: $!\n";
      }
    } else {
      warn "could not open $tempfile for writing: $!\n";
    }
  }

  # director
  $tmp = &find($page, 'Directed by</b>');
  $data{director} = $1 if ($tmp =~ /href\=.+?\>([^\<]+?)\</);

  # category
  $tmp = &find($page, '<b class="ch">Genre:</b>');
  my @category = ();
  while ($tmp){
    $tmp = &find($tmp, '<a href="/Sections/Genres/', 1);
    last unless ($tmp);
    push @category, $1 if ($tmp =~ /\>(.+?)\</);
  }
  $data{category} = join(", ", @category) if (scalar @category);

  # actors: one "<name> - <role>" entry per cast row
  $tmp = &find($page, '<b class="blackcatheader">Cast overview');
  my @actors = ();
  while ($tmp){
    $tmp = &find($tmp, '<a href="/name/', 1);
    last unless ($tmp);
    push @actors, "$1 - $2"
      if ($tmp =~ /\>(.+?)\<.+?\.\.\.\. .+?\"\>(.+?)\<\/td\>/);
  }
  $data{actors} = \@actors if (scalar @actors);

  # rating
  $tmp = &find($page, '<b class="ch">User Rating:</b>');
  # if no rating available (first 100 char contain magic text)
  if (substr($tmp, 0, 100) !~ /awaiting \d votes/) {
    $tmp = &find($tmp, '<b>');
    $data{rating} = $1 if ($tmp =~ /([\d\.]+)/);
  }

  # url: cut down to ".../title/<id>/" when the address has that form,
  # otherwise keep the address the page was fetched from
  $URL = $1 if ($URL =~ /^(.+?\/title\/.+?\/)/);
  $data{url} = $URL;

  # length
  $tmp = &find($page, '<b class="ch">Runtime:</b>');
  $data{length} = $1 if ($tmp =~ /(\d+) min/);

  # country
  $tmp = &find($page, '<b class="ch">Country:</b>');
  $data{country} = $1 if ($tmp =~ /href\=\".+?\"\>(.+?)\</);

  # language (stored and printed under the key "languages")
  $tmp = &find($page, '<b class="ch">Language:</b>');
  $data{languages} = $1 if ($tmp =~ /href\=\".+?\"\>(.+?)\</);

  # description: every paragraph of the separate plot summary page
  $tmp = &getPage($URL.'plotsummary')->content();
  my @summary = ();
  while ($tmp){
    $tmp = &find($tmp, '<p class="plotpar">', 1);
    last unless ($tmp);
    if ($tmp =~ /\>(.+?)\<\/p\>/s){
      push @summary, &strip(decode_entities($1));
      # records are line based, so line breaks inside a text become <br>
      $summary[-1] =~ s/\n/<br>/g;
    }
  }
  $data{description} = \@summary if (scalar @summary);

  # comments: first paragraph after the author line of the user comment
  $tmp = &find($page, '<a name="comment">User Comments:</a>');
  $tmp = &find($tmp, '<small>Author:</small>');
  $tmp = &find($tmp, '<p>');
  if ($tmp =~ /\>(.+?)\<\/p\>/s && $1) {
      $data{comments} = &strip(decode_entities($1));
      $data{comments} =~ s/\n/<br>/g;
  }


  #-- output
  # nothing found
  exit if ((scalar keys(%data) == 1) && (exists $data{url}));

  # return normal status
  print "status: details\n";

  foreach my $k (keys(%data)){
    # description and actors hold list references: one line per element
    if ($k =~ /description|actors/){
      foreach (@{$data{$k}}){
        print "$k: $_\n";
      }
    }
    else {
      print "$k: $data{$k}\n";
    }
    print "\n";
  }


}
