#!/usr/local/bin/perl -w
# Time-stamp: <2005/03/16, 00:45:43 (EST), maverick, scrape.pl>
# Warning: Please use this script responsibly!

use strict;

# packages
use WWW::Mechanize;

# set this to 1, or 2 for more output
my $debug = 0;

# autoflush
$| = 1;

# Read the command line: album URL, username, password and an optional
# debug level that overrides the default set above.
my $albumurl = $ARGV[0];
my $username = $ARGV[1];
my $password = $ARGV[2];
$debug = $ARGV[3] || $debug;

# The three positional arguments are required.  Test definedness and length
# rather than truth, so that a password of "0" is not rejected.
die "Usage: scrape.pl albumurl username password [debuglevel]\n"
    unless defined $password && length $password;

# Pull the numeric album id out of the "id=NNN" part of the URL and stop
# right away when there is none, instead of building a bogus index URL.
my ($albumid) = $albumurl =~ /id=(\d+)/;
die "Cannot find an album id (id=NNN) in [$albumurl].\n"
    unless defined $albumid;

# Entry page of the album; later pages are reached by following links.
my $indexurl =
    'http://www.imagestation.com/album/pictures.html?id=' . $albumid;

# Number of pages visited so far; prefixes the debug dump file names.
my $stage = 0;

# Startup banner: program name and version, plus the debug level when
# debugging is switched on, framed by one blank line above and below.
my @banner = (
    "Imagestation Scraper by Maverick Woo\n",
    "== Version 2005-03-15 ==\n",
);
push @banner, "== Debug level $debug ==\n" if $debug;
print "\n", @banner, "\n";

# Create the browser object and have it present one of WWW::Mechanize's
# predefined browser user-agent strings.
my $mech = WWW::Mechanize->new();
$mech->agent_alias('Windows Mozilla');

# Stage: fetch the album entry page.
$stage++;
print "Connecting to album $albumid...\n";
$mech->get($indexurl);
# Check explicitly: a WWW::Mechanize without autocheck does not die on an
# HTTP error, and the following steps would then work on an error page.
die "Cannot fetch [$indexurl] (HTTP status " . $mech->status() . ").\n"
    unless $mech->success();
print "@ Front page\n";

# Debug level 1 and up: keep a copy of the page for inspection.
if ($debug) {
    my $dumpfile = $stage . '-entry.html';
    open my $dump, '>', $dumpfile
        or die "Cannot open file [$dumpfile]: $!\n";
    print {$dump} $mech->content();
    close $dump or die "Cannot write file [$dumpfile]: $!\n";
}

# Debug level 2 and up: list the text of every link on the page.
if ($debug >= 2) {
    print "Available links:\n";
    foreach my $link ($mech->links()) {
        # A link may carry no text at all; print an empty label for it
        # instead of triggering an "uninitialized value" warning.
        my $text = $link->text();
        print "-> ", (defined $text ? $text : ''), "\n";
    }
}

# Stage: follow the "click here" link to the login page and fill in the
# credentials.  The form is only filled in here; it is submitted later.
$stage++;
# follow_link returns undef when no link matches; stop with a clear message
# rather than continuing on the wrong page.
$mech->follow_link(text => "click here")
    or die "Cannot find the \"click here\" link on the album entry page.\n";
die "Cannot load the login page (HTTP status " . $mech->status() . ").\n"
    unless $mech->success();
print "@ Log in\n";

# Debug level 1 and up: keep a copy of the page for inspection.
if ($debug) {
    my $dumpfile = $stage . '-login.html';
    open my $dump, '>', $dumpfile
        or die "Cannot open file [$dumpfile]: $!\n";
    print {$dump} $mech->content();
    close $dump or die "Cannot write file [$dumpfile]: $!\n";
}

# Select the login form by name; without it set_fields has nothing to fill.
$mech->form_name('mainForm')
    or die "Cannot find the login form [mainForm] on the login page.\n";
$mech->set_fields(
    'username' => $username,
    'password' => $password,
);

# Stage: submit the login form.  The response is expected to be the ssl
# redirect page.
$stage++;
$mech->submit();
# Check explicitly: a WWW::Mechanize without autocheck does not die on an
# HTTP error.
die "Cannot submit the login form (HTTP status " . $mech->status() . ").\n"
    unless $mech->success();
print "@ Redirect\n";

# Debug level 1 and up: keep a copy of the page for inspection.
if ($debug) {
    my $dumpfile = $stage . '-redirect.html';
    open my $dump, '>', $dumpfile
        or die "Cannot open file [$dumpfile]: $!\n";
    print {$dump} $mech->content();
    close $dump or die "Cannot write file [$dumpfile]: $!\n";
}

# Debug level 2 and up: list the text of every link on the page.
if ($debug >= 2) {
    print "Available links:\n";
    foreach my $link ($mech->links()) {
        # A link may carry no text at all; print an empty label for it
        # instead of triggering an "uninitialized value" warning.
        my $text = $link->text();
        print "-> ", (defined $text ? $text : ''), "\n";
    }
}

# Stage: follow the "here" link of the redirect page; this should lead to
# the first index page, whose title carries the album name.
$stage++;
# follow_link returns undef when no link matches (for instance when the
# login was refused and the expected page did not come back).
$mech->follow_link(text => "here")
    or die "Cannot find the \"here\" link on the redirect page.\n";
die "Cannot load the album page (HTTP status " . $mech->status() . ").\n"
    unless $mech->success();

# The album name is whatever follows "Album: " in the page title.  Fall
# back to a fixed name when the page has no title or an unexpected one, so
# that the directory name built from it is still well formed.
my $pagetitle = $mech->title();
my ($title) = defined $pagetitle ? ($pagetitle =~ /^Album: (.+)/) : ();
$title = 'untitled' unless defined $title;
print "@ Album [$title]\n";

# Debug level 1 and up: keep a copy of the page for inspection.
if ($debug) {
    my $dumpfile = $stage . '-coverpage.html';
    open my $dump, '>', $dumpfile
        or die "Cannot open file [$dumpfile]: $!\n";
    print {$dump} $mech->content();
    close $dump or die "Cannot write file [$dumpfile]: $!\n";
}

# massage title in case it contains special characters for path
$title =~ s/\?//g;              # no ? on windows
$title =~ s{/}{-}g;             # no / (usually from dates, so use -)
$title =~ s/\\/_/g;             # no \
$title =~ tr/:*"<>|/_/;         # the other characters windows rejects

# Stage: open the full index, the page that references every picture.
$stage++;
# follow_link returns undef when no link matches; stop with a clear message
# rather than scanning the wrong page for pictures.
$mech->follow_link(text => "Index")
    or die "Cannot find the \"Index\" link on the album page.\n";
die "Cannot load the index page (HTTP status " . $mech->status() . ").\n"
    unless $mech->success();
print "@ Index";

# Debug level 1 and up: keep a copy of the page for inspection.
if ($debug) {
    my $dumpfile = $stage . '-index.html';
    open my $dump, '>', $dumpfile
        or die "Cannot open file [$dumpfile]: $!\n";
    print {$dump} $mech->content();
    close $dump or die "Cannot write file [$dumpfile]: $!\n";
}

# Keep every line of the page that mentions "sraid"; the thumbnail urls are
# looked for in these lines later.  The count completes the "@ Index" line
# printed above.
my @thumburls = grep { /sraid/ } split /\n/, $mech->content();
my $numpics = scalar(@thumburls);
print ", with $numpics pic(s)\n";

# Download directory: "<title> <albumid>" below the current directory.
# An existing directory is reused so that a rerun can resume; any other
# failure to create it is fatal here, where the cause is still obvious.
my $pathname = './' . $title . ' ' . $albumid;
unless (-d $pathname) {
    mkdir($pathname)
        or die "Cannot create directory [$pathname]: $!\n";
}

# Download loop: find the thumbnail url in each candidate line, derive the
# url of the original-size picture from it and save that picture under a
# sequential file name.  Files that already exist are skipped, so the script
# can simply be rerun after a failure.
my $counter = 0;                # pictures seen; also numbers the files
my $success = 0;                # pictures downloaded in this run
my $failure = 0;                # pictures that could not be downloaded
my $skipped = 0;                # pictures already present on disk
print "@ Downloading\n";
foreach my $line (@thumburls) {

    # Capture what lies between "http" and ".thumb.jpg".  The match may not
    # cross quotes or whitespace and is non-greedy, so it stays inside one
    # url even when the line holds several.
    my ($prefix) = $line =~ /http([^"'\s]+?)\.thumb\.jpg/;
    next unless $prefix;

    # compute filename
    $counter++;
    my $filename = $pathname . sprintf('/%08d.jpg', $counter);

    # either skip or download
    print "  => $filename";
    if (-e $filename) {
        $skipped++;
        print " [skipping]";
    } else {
        sleep(1);               # sleeping seems important
        my $origurl = 'http' . $prefix . '.orig.jpg';

        # A WWW::Mechanize with autocheck enabled dies on a failed request.
        # Trap that, so one bad picture is counted as a failure instead of
        # aborting the whole run.
        my $fetched = eval { $mech->get($origurl); $mech->success() };
        if ($fetched) {
            # Write to a temporary name and rename when complete: a
            # truncated file under the final name would be skipped by
            # every later run.
            my $partial = $filename . '.part';
            open my $out, '>', $partial
                or die "Cannot open file [$partial]: $!\n";
            binmode $out;
            print {$out} $mech->content()
                or die "Cannot write file [$partial]: $!\n";
            close $out
                or die "Cannot write file [$partial]: $!\n";
            rename($partial, $filename)
                or die "Cannot rename [$partial] to [$filename]: $!\n";
            $success++;
            print " OK";
        } else {
            $failure++;
            print " Failed";
        }
    }
    print "\n";
}

# Final report: number of files downloaded, number skipped (only when any
# were), and a hint to rerun when some downloads failed.
my $summary = "Finished downloading $success file(s)";
$summary .= ", skipped $skipped file(s)" if $skipped;
print $summary, ".\n";
if ($failure) {
    print "\n$failure download(s) failed. Rerun.\n";
}

# end