#!/usr/bin/perl -w
# webawk: translate an awk program with a2p, then graft a web-mirroring
# harness (LWP fetch loop, link extraction, local save) onto the generated
# Perl and print the result.
use strict;

my(
    $base_url,       # root URL to mirror
    $base_path,      # local directory to save mirrored pages under
    $program_line,   # one line of a2p's output
    $proxy,          # "--proxy" if proxy use was requested
    $verbose,
    $program,        # the code to eval
    $tmpin,          # the file a2p reads
    $tmpout,         # the file a2p writes
    @nargs,          # new args for a2p
    @vargs,          # temporary holder of variables
);

END {
    # Best-effort cleanup; guard against dying before the names are set.
    unlink grep { defined } $tmpin, $tmpout;   # XXX: don't check failure
    close(STDOUT) || die "$0: can't close stdout: $!\n";
    $? = 1 if $? == 255;                       # from die
}

sub usage {
    warn "$0: @_\n" if @_;
    die "usage: $0 [ -F fs ] [ [-v] var=value ] [ 'prog' | -f progfile ] [ file ... ]\n";
}

usage() unless @ARGV;

# the following dance is needed to avoid explicit pipes or even
# redirections for process control on broken soi-disant operating systems.
open(SAVE_OUT, ">&STDOUT") || die "can't save stdout: $!";
die unless defined fileno SAVE_OUT;

# Try the current directory first, then /tmp (3-arg open; the 2-arg form
# in the original allowed mode injection via the filename).
open(TMPIN, '>', $tmpin = "a2pin.$$")
    || open(TMPIN, '>', $tmpin = "/tmp/a2pin.$$")
    || die "can't find a tmp input file";
open(TMPOUT, '+>', $tmpout = "a2pout.$$")
    || open(TMPOUT, '+>', $tmpout = "/tmp/a2pout.$$")
    || die "can't find a tmp output file";

# This needs to be improved: assumes first two options are start url and
# local path to save mirrored pages; next two are optionally
# --proxy and --verbose
$base_url  = shift @ARGV;
$base_path = shift @ARGV;
usage("start URL and local path are required")
    unless defined $base_url and defined $base_path;

$_       = shift @ARGV;
$proxy   = "";
$verbose = 0;
# defined() guards: with fewer than three arguments $_ is undef and the
# eq comparisons would warn under -w (and undef would be unshifted back).
if (defined($_) and $_ eq "--proxy")   { $proxy = $_;   $_ = shift @ARGV; }
if (defined($_) and $_ eq "--verbose") { $verbose = 1;  $_ = shift @ARGV; }
unshift @ARGV, $_ if defined $_;

$base_path =~ s/^\~/$ENV{HOME}/;

# Tom's awk-option handler; webawk-specific options have been removed above
while (@ARGV) {
    $_ = $ARGV[0];
    if (s/^-//) {
        if (s/^F//) {                       # -F fs  (field separator)
            unless (length) { shift; $_ = $ARGV[0]; }
            push(@nargs, "-F$_");
            shift;
            next;
        }
        elsif (s/^v// || /^\w+=/) {         # -v var=value
            unless (length) { shift; $_ = $ARGV[0]; }
            push(@vargs, $_);
            shift;
            next;
        }
        elsif (s/^f//) {                    # -f progfile
            unless (length) { shift; $_ = shift; }
            push(@nargs, $_);
            last;
        }
        elsif (s/^-//) {
            if (length) { usage("Long options not supported") }
            shift;
            next;
        }
        else {
            usage("unknown flag: -$_");
        }
    }
    else {
        # XXX: is it a program or an expression?
        if (/^\w+=/) {
            push(@vargs, $_);
            shift;
            next;
        }
        else {
            # a literal program on the command line: stash it in the
            # tmp file so a2p can read it
            print TMPIN "$_\n";
            shift;
            push @nargs, $tmpin;
            last;
        }
    }
}
unshift @ARGV, @vargs;                      # put back var=val statements

close(TMPIN) || die "can't close $tmpin: $!";
open(STDOUT, ">&TMPOUT") || die "can't dup to $tmpout: $!";
$| = 1;
system 'a2p', @nargs;
if ($?) {
    die "Couldn't run a2p (wait status == $?)";
}
die "empty program" unless -s TMPOUT;
die "empty program" unless -s $tmpout;
seek(TMPOUT, 0, 0) || die "can't rewind $tmpout: $!";

# modify the resulting Perl code:
# first, add webawk module requirements and subroutines.
# BUG FIX: the <TMPOUT> filehandle was missing from the readline below,
# so a2p's output was never read.
while ($program_line = <TMPOUT>) {
    if ($program_line eq "#!/usr/bin/perl\n") {
        $program = $program_line . add_headers();
    }
    # replace the main line-based loop with a link-based one
    elsif ($program_line eq "while (<>) {\n") {
        $program .= add_initialization();
        $program .= add_subroutines();
        $program .= add_loop();
    }
    else {
        $program .= $program_line;
    }
}
close(TMPOUT) || die "can't close $tmpout: $!";
open(STDOUT, ">&SAVE_OUT") || die "can't restore stdout: $!";

# eval qq{
#     no strict;
#     local \$^W = 0;
#     $program;
# };
print $program;
# NOTE(review): $@ is only set by the eval above, which is commented out;
# this check is dead until the eval is re-enabled.
if ($@) {
    die "Couldn't compile and execute awk-to-perl program: $@\n";
}
exit 0;

# Module requirements prepended to the generated program.
sub add_headers() {
    my $lines;
    $lines = q(
use LWP::UserAgent;
require HTML::Parse;
require HTML::FormatText;
use HTML::Entities;
use Getopt::Long;
use File::Path;
use HTTP::Date qw(time2str str2time);
);
    return $lines;
}

# Initialization code for the generated program: bakes in the command-line
# settings and seeds the link queue with the start URL.
sub add_initialization() {
    my $lines;
    $lines = qq(
\$base_url  = "$base_url";
\$base_path = "$base_path";
\$verbose   = $verbose;
);
    $lines .= q(
push @ancestors, "command line";
push @links, $base_url;
);
    # If given URL doesn't end with a slash (i.e. is file instead of dir)
    # then remove non-slash characters
    $lines .= q(
$base_url =~ s,[^/]+$,, if ($base_url =~ m,[^/]$,);
);
    $lines .= q(
$ua = new LWP::UserAgent;
$ua->agent("WebAWK/0.1 " . $ua->agent);
);
    if ($proxy) {
        $lines .= q(
$ua->env_proxy;
);
    }
    return $lines;
}

# The link-driven main loop that replaces a2p's "while (<>)".
sub add_loop() {
    my $lines;
    $lines =
    # quote with bang since mismatched brackets in text we're adding (no
    # close brace to match while; that'll be added later from a2p output)
    q!
while (($url, $linked_from) = get_next_url()) {
    $save_as = get_filename($url, $base_url);
    ($content, $need_save, $mimetype, $res_base) =
        get_next_page($url, $base_url, $save_as);
    if ($content eq "") { next; }
    $_ = $url;
!;
    return $lines;
}

# Support subroutines injected into the generated program.
sub add_subroutines() {
    my $lines;
    # uses global hash "history"
    # also uses global lists since I can't figure out how to pass the
    # lists as local vars :-/
    $lines = q(
sub get_next_url()
{
    my ($url, $linked_from);
    while ($url = shift @links) {
        $linked_from = shift @ancestors;
        # skip this link if already visited
        next if (exists $history{$url});
        $history{$url} = 1;
        return ($url, $linked_from);
    }
    return ();
}

sub get_filename($$)    # ??? must fix for "external" urls
{
    my ($url, $base_url) = @_;
    my $rel_url = $url;
    $rel_url =~ s,^$base_url,,;     # whack off the base
    # if null or ending in a slash, concatenate "index.html"
    $rel_url .= "/index.html" if ($rel_url =~ m,/$, or $rel_url eq "");
    print "    relative URL is $rel_url\n" if ($verbose);
    my $filename = "$base_path/$rel_url";
    # replace doubled slashes with singles
    $filename =~ s,//,/,g;
    return $filename;
}

sub get_next_page($$$)
{
    my ($url, $base_url, $save_as) = @_;
    my $req = new HTTP::Request 'GET' => $url;
    my $need_save = 1;
    # BUG FIX: these were one comma-joined statement, leaving $mimetype
    # undeclared.
    my $content  = "";
    my $mimetype = "unknown";

    # Modify request to only check for changes if file exists
    if (($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
         $atime,$mtime,$ctime,$blksize,$blocks) = stat($save_as)) {
        $req->header('If-Modified-Since', time2str($mtime));
        print "    If-Modified-Since: ", time2str($mtime), "\n" if ($verbose);
    }

    # Send the request; simple error checking
    my $res = $ua->request($req);
    if (not $res->is_success) {
        print "Request unsuccessful; request was for $url\n";
        return "";
    }
    if ($res->content =~ m/\<(moved|please update)\>/) {
        print STDERR "Getting $url\n" if (not $verbose);
        print STDERR "  Error: page moved!\n";
        print STDERR "  Linked from: $linked_from\n";
        return "";
    }

    if ($res->code == 304) {    # File unchanged code
        $need_save = 0;
        # If html file, need to read it in so we can check the links later
        # Shouldn't REALLY rely on extension, but this is "good enough" for now
        if ($save_as =~ m/\.html?$/) {
            # BUG FIX: was "open FILE, $save_as || die" -- the || bound to
            # $save_as, so failures were never caught (and 2-arg open).
            open(FILE, '<', $save_as) or die "can't open $save_as: $!";
            read FILE, $content, 10 * 1024 * 1024;  # 10 MB should be enough
            close FILE;
            $mimetype = "text/html";
        }
    }
    else {
        $content  = $res->content;
        $mimetype = $res->header('Content-type');
    }
    return ($content, $need_save, $mimetype, $res->base);
}

# subroutines below use global variables. my excuse for this
# laziness is so the "awk" programs can call them w/o arguments
sub save()
{
    # if file unchanged since last visit, don't bother
    if (not $need_save) { return(); }

    # create directory, if needed
    my $path = $save_as;
    $path =~ s,/[^/]+$,,;
    mkpath($path);

    # NOTE(review): this link-rewriting loop was garbled in the original
    # source; reconstructed from the surviving join('<',...) and the
    # "=index.html$1" fragment to make directory links point at the saved
    # index.html -- TODO confirm against upstream webawk.
    my @tags = split(/</, $content);
    for my $tag (@tags) {
        $tag =~ s,(href="[^"]*/)",${1}index.html",gi;
    }
    $content = join('<', @tags);

    # BUG FIX: was 'open FILE,">$save_as" || die' -- same precedence bug
    # as above, plus 2-arg open.
    open(FILE, '>', $save_as) or die "can't write $save_as: $!";
    print FILE $content;
    close FILE;
}

sub add_links()
{
    my $html = HTML::Parse::parse_html($content);
    print STDERR "  Content-Type: $mimetype\n" if ($verbose);
    if ($mimetype =~ m,^text/html,) {
        for ( @{ $html->extract_links } ) {
            my($link, $elem) = @$_;
            my $tag = uc $elem->tag;
            #print "  Link: $link -> \n";
            $link = new URI::URL $link, $res_base;
            #print $link->abs->as_string,"\n";
            #print STDERR "    found ",$link->abs->as_string,"\n";
            my $Link = $link->abs->as_string;
            $Link =~ s/#.*$//;
            if ($Link =~ m/$base_url/ or $checklinks) {
                next if ($Link =~ m/^(mailto|news):/i);
                # ??? to make depth-first, replace push with unshift
                # hmm no have to think about this
                push @links, $Link;
                push @ancestors, $url;
            }
        }
    }
}
);
    return $lines;
}