#!/usr/bin/perl -w
# webawk: translate an awk program with a2p, then graft a web-mirroring
# harness (LWP fetch loop, link extraction, local save) onto the generated
# Perl and print the result.
use strict;

my(
    $base_url,       # root URL to mirror
    $base_path,      # local directory to save mirrored pages under
    $program_line,   # one line of a2p's output
    $proxy,          # "--proxy" if proxy use was requested
    $verbose,
    $program,        # the code to eval
    $tmpin,          # the file a2p reads
    $tmpout,         # the file a2p writes
    @nargs,          # new args for a2p
    @vargs,          # temporary holder of variables
);

END {
    # Best-effort cleanup; guard against dying before the names are set.
    unlink grep { defined } $tmpin, $tmpout;   # XXX: don't check failure
    close(STDOUT) || die "$0: can't close stdout: $!\n";
    $? = 1 if $? == 255;                       # from die
}

sub usage {
    warn "$0: @_\n" if @_;
    die "usage: $0 [ -F fs ] [ [-v] var=value ] [ 'prog' | -f progfile ] [ file ... ]\n";
}

usage() unless @ARGV;

# the following dance is needed to avoid explicit pipes or even
# redirections for process control on broken soi-disant operating systems.
open(SAVE_OUT, ">&STDOUT") || die "can't save stdout: $!";
die unless defined fileno SAVE_OUT;

# Try the current directory first, then /tmp (3-arg open; the 2-arg form
# in the original allowed mode injection via the filename).
open(TMPIN, '>', $tmpin = "a2pin.$$")
    || open(TMPIN, '>', $tmpin = "/tmp/a2pin.$$")
    || die "can't find a tmp input file";
open(TMPOUT, '+>', $tmpout = "a2pout.$$")
    || open(TMPOUT, '+>', $tmpout = "/tmp/a2pout.$$")
    || die "can't find a tmp output file";

# This needs to be improved: assumes first two options are start url and
# local path to save mirrored pages; next two are optionally
# --proxy and --verbose
$base_url  = shift @ARGV;
$base_path = shift @ARGV;
usage("start URL and local path are required")
    unless defined $base_url and defined $base_path;

$_       = shift @ARGV;
$proxy   = "";
$verbose = 0;
# defined() guards: with fewer than three arguments $_ is undef and the
# eq comparisons would warn under -w (and undef would be unshifted back).
if (defined($_) and $_ eq "--proxy")   { $proxy = $_;   $_ = shift @ARGV; }
if (defined($_) and $_ eq "--verbose") { $verbose = 1;  $_ = shift @ARGV; }
unshift @ARGV, $_ if defined $_;

$base_path =~ s/^\~/$ENV{HOME}/;

# Tom's awk-option handler; webawk-specific options have been removed above
while (@ARGV) {
    $_ = $ARGV[0];
    if (s/^-//) {
        if (s/^F//) {                       # -F fs  (field separator)
            unless (length) { shift; $_ = $ARGV[0]; }
            push(@nargs, "-F$_");
            shift;
            next;
        }
        elsif (s/^v// || /^\w+=/) {         # -v var=value
            unless (length) { shift; $_ = $ARGV[0]; }
            push(@vargs, $_);
            shift;
            next;
        }
        elsif (s/^f//) {                    # -f progfile
            unless (length) { shift; $_ = shift; }
            push(@nargs, $_);
            last;
        }
        elsif (s/^-//) {
            if (length) { usage("Long options not supported") }
            shift;
            next;
        }
        else {
            usage("unknown flag: -$_");
        }
    }
    else {
        # XXX: is it a program or an expression?
        if (/^\w+=/) {
            push(@vargs, $_);
            shift;
            next;
        }
        else {
            # a literal program on the command line: stash it in the
            # tmp file so a2p can read it
            print TMPIN "$_\n";
            shift;
            push @nargs, $tmpin;
            last;
        }
    }
}
unshift @ARGV, @vargs;                      # put back var=val statements

close(TMPIN) || die "can't close $tmpin: $!";
open(STDOUT, ">&TMPOUT") || die "can't dup to $tmpout: $!";
$| = 1;
system 'a2p', @nargs;
if ($?) {
    die "Couldn't run a2p (wait status == $?)";
}
die "empty program" unless -s TMPOUT;
die "empty program" unless -s $tmpout;
seek(TMPOUT, 0, 0) || die "can't rewind $tmpout: $!";

# modify the resulting Perl code:
# first, add webawk module requirements and subroutines.
# BUG FIX: the <TMPOUT> filehandle was missing from the readline below,
# so a2p's output was never read.
while ($program_line = <TMPOUT>) {
    if ($program_line eq "#!/usr/bin/perl\n") {
        $program = $program_line . add_headers();
    }
    # replace the main line-based loop with a link-based one
    elsif ($program_line eq "while (<>) {\n") {
        $program .= add_initialization();
        $program .= add_subroutines();
        $program .= add_loop();
    }
    else {
        $program .= $program_line;
    }
}
close(TMPOUT) || die "can't close $tmpout: $!";
open(STDOUT, ">&SAVE_OUT") || die "can't restore stdout: $!";

# eval qq{
#     no strict;
#     local \$^W = 0;
#     $program;
# };
print $program;
# NOTE(review): $@ is only set by the eval above, which is commented out;
# this check is dead until the eval is re-enabled.
if ($@) {
    die "Couldn't compile and execute awk-to-perl program: $@\n";
}
exit 0;

# Module requirements prepended to the generated program.
sub add_headers() {
    my $lines;
    $lines = q(
use LWP::UserAgent;
require HTML::Parse;
require HTML::FormatText;
use HTML::Entities;
use Getopt::Long;
use File::Path;
use HTTP::Date qw(time2str str2time);
);
    return $lines;
}

# Initialization code for the generated program: bakes in the command-line
# settings and seeds the link queue with the start URL.
sub add_initialization() {
    my $lines;
    $lines = qq(
\$base_url  = "$base_url";
\$base_path = "$base_path";
\$verbose   = $verbose;
);
    $lines .= q(
push @ancestors, "command line";
push @links, $base_url;
);
    # If given URL doesn't end with a slash (i.e. is file instead of dir)
    # then remove non-slash characters
    $lines .= q(
$base_url =~ s,[^/]+$,, if ($base_url =~ m,[^/]$,);
);
    $lines .= q(
$ua = new LWP::UserAgent;
$ua->agent("WebAWK/0.1 " . $ua->agent);
);
    if ($proxy) {
        $lines .= q(
$ua->env_proxy;
);
    }
    return $lines;
}

# The link-driven main loop that replaces a2p's "while (<>)".
sub add_loop() {
    my $lines;
    $lines =
    # quote with bang since mismatched brackets in text we're adding (no
    # close brace to match while; that'll be added later from a2p output)
    q!
while (($url, $linked_from) = get_next_url()) {
    $save_as = get_filename($url, $base_url);
    ($content, $need_save, $mimetype, $res_base) =
        get_next_page($url, $base_url, $save_as);
    if ($content eq "") { next; }
    $_ = $url;
!;
    return $lines;
}

# Support subroutines injected into the generated program.
sub add_subroutines() {
    my $lines;
    # uses global hash "history"
    # also uses global lists since I can't figure out how to pass the
    # lists as local vars :-/
    $lines = q(
sub get_next_url()
{
    my ($url, $linked_from);
    while ($url = shift @links) {
        $linked_from = shift @ancestors;
        # skip this link if already visited
        next if (exists $history{$url});
        $history{$url} = 1;
        return ($url, $linked_from);
    }
    return ();
}

sub get_filename($$)    # ??? must fix for "external" urls
{
    my ($url, $base_url) = @_;
    my $rel_url = $url;
    $rel_url =~ s,^$base_url,,;     # whack off the base
    # if null or ending in a slash, concatenate "index.html"
    $rel_url .= "/index.html" if ($rel_url =~ m,/$, or $rel_url eq "");
    print "    relative URL is $rel_url\n" if ($verbose);
    my $filename = "$base_path/$rel_url";
    # replace doubled slashes with singles
    $filename =~ s,//,/,g;
    return $filename;
}

sub get_next_page($$$)
{
    my ($url, $base_url, $save_as) = @_;
    my $req = new HTTP::Request 'GET' => $url;
    my $need_save = 1;
    # BUG FIX: these were one comma-joined statement, leaving $mimetype
    # undeclared.
    my $content  = "";
    my $mimetype = "unknown";

    # Modify request to only check for changes if file exists
    if (($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
         $atime,$mtime,$ctime,$blksize,$blocks) = stat($save_as)) {
        $req->header('If-Modified-Since', time2str($mtime));
        print "    If-Modified-Since: ", time2str($mtime), "\n" if ($verbose);
    }

    # Send the request; simple error checking
    my $res = $ua->request($req);
    if (not $res->is_success) {
        print "Request unsuccessful; request was for $url\n";
        return "";
    }
    if ($res->content =~ m/\<(moved|please update)\>/) {
        print STDERR "Getting $url\n" if (not $verbose);
        print STDERR "  Error: page moved!\n";
        print STDERR "  Linked from: $linked_from\n";
        return "";
    }

    if ($res->code == 304) {    # File unchanged code
        $need_save = 0;
        # If html file, need to read it in so we can check the links later
        # Shouldn't REALLY rely on extension, but this is "good enough" for now
        if ($save_as =~ m/\.html?$/) {
            # BUG FIX: was "open FILE, $save_as || die" -- the || bound to
            # $save_as, so failures were never caught (and 2-arg open).
            open(FILE, '<', $save_as) or die "can't open $save_as: $!";
            read FILE, $content, 10 * 1024 * 1024;  # 10 MB should be enough
            close FILE;
            $mimetype = "text/html";
        }
    }
    else {
        $content  = $res->content;
        $mimetype = $res->header('Content-type');
    }
    return ($content, $need_save, $mimetype, $res->base);
}

# subroutines below use global variables. my excuse for this
# laziness is so the "awk" programs can call them w/o arguments
sub save()
{
    # if file unchanged since last visit, don't bother
    if (not $need_save) { return(); }

    # create directory, if needed
    my $path = $save_as;
    $path =~ s,/[^/]+$,,;
    mkpath($path);

    # NOTE(review): this link-rewriting loop was garbled in the original
    # source; reconstructed from the surviving join('<',...) and the
    # "=index.html$1" fragment to make directory links point at the saved
    # index.html -- TODO confirm against upstream webawk.
    my @tags = split(/</, $content);
    for my $tag (@tags) {
        $tag =~ s,(href="[^"]*/)",${1}index.html",gi;
    }
    $content = join('<', @tags);

    # BUG FIX: was 'open FILE,">$save_as" || die' -- same precedence bug
    # as above, plus 2-arg open.
    open(FILE, '>', $save_as) or die "can't write $save_as: $!";
    print FILE $content;
    close FILE;
}

sub add_links()
{
    my $html = HTML::Parse::parse_html($content);
    print STDERR "  Content-Type: $mimetype\n" if ($verbose);
    if ($mimetype =~ m,^text/html,) {
        for ( @{ $html->extract_links } ) {
            my($link, $elem) = @$_;
            my $tag = uc $elem->tag;
            #print "  Link: $link -> \n";
            $link = new URI::URL $link, $res_base;
            #print $link->abs->as_string,"\n";
            #print STDERR "    found ",$link->abs->as_string,"\n";
            my $Link = $link->abs->as_string;
            $Link =~ s/#.*$//;
            if ($Link =~ m/$base_url/ or $checklinks) {
                next if ($Link =~ m/^(mailto|news):/i);
                # ??? to make depth-first, replace push with unshift
                # hmm no have to think about this
                push @links, $Link;
                push @ancestors, $url;
            }
        }
    }
}
);
    return $lines;
}