#!/usr/bin/perl

use strict;
use Getopt::Long;

# Command-line options, keyed by the first name in each spec below:
#   r - how many levels of links to follow (0 = just the given page)
#   l - list mode: gettext() prints each URL it fetches and the page text
#       itself is not printed
#   v - verbose: enables the diagnostics written by dprint()
#   g - "go" pattern: text is emitted starting at a line matching it
#   s - "stop" pattern: text stops being emitted at a line matching it
my %opt=();
my $usage = "Usage: web2txt [-l] [-r n] [-v] [-g go -s stop] url\n";

# GetOptions returns false (having already warned on STDERR) when it meets an
# unknown option or a malformed value; stop instead of running with a
# half-parsed command line.
my $parsed = GetOptions(\%opt,
		   'r=n',
		   'l|list',
		   'v|verbose',
		   'g|go=s',
		   's|stop=s');
if (!$parsed){
	print STDERR $usage;
	exit 2;
}

my ($url) = @ARGV;
# URLs already fetched; shared with gettext() so no page is fetched twice.
my %urls=();
if (!$url){
	# A missing URL is a usage error: report it on STDERR with a non-zero
	# status so callers and pipelines can tell the run did nothing.
	print STDERR $usage;
	exit 1;
}
# No -r given (or -r 0): do not follow links.
$opt{'r'} ||= 0;

my @text = gettext($url,$opt{'r'},$opt{'g'},$opt{'s'});
# In list mode the URLs were already printed by gettext(); suppress the text.
print @text if (!$opt{'l'});

# Write a diagnostic message to STDOUT, but only when the verbose option
# ($opt{'v'}) is set. The message is printed verbatim, so the caller supplies
# any trailing newline.
sub dprint($){
	my $message = $_[0];
	if ($opt{'v'}){
		print $message;
	}
}

# Fetch a page as plain text with `lynx -dump` and return its lines,
# optionally following the links lynx lists in its "References" section.
#
# Arguments:
#   $url     - address to fetch
#   $recurse - remaining link depth; when true, every numbered entry in the
#              "References" section is fetched too, with depth $recurse - 1
#   $go      - optional regex; when set, lines are withheld until one matches
#              it (the matching line itself is emitted)
#   $stop    - optional regex; once emitting, a line matching it turns output
#              off again (the matching line is not emitted); a later $go
#              match turns it back on
#
# Returns a list of text lines, beginning with a "\n\n" separator, for this
# page followed by those of any pages reached through recursion. Returns an
# empty list when $url was already fetched (tracked in the file-level %urls).
#
# Side effects: records $url in %urls, prints $url when $opt{'l'} is set,
# emits a diagnostic through dprint(), and runs an external lynx process.
#
# No prototype is declared: the sub takes up to four arguments, and a
# prototype would only change how calls are parsed, not validate them.
sub gettext {
	my ($url,$recurse,$go,$stop) = @_;
	my $newrecurse = $recurse?$recurse-1:0;

	dprint("Getting '$url': Recurse='$recurse',go='$go', stop='$stop'\n");

	# Don't print stuff twice! The check comes before the -l listing so a URL
	# linked from several pages is listed (and fetched) only once.
	return if $urls{$url};
	$urls{$url}=1;
	print "$url\n" if $opt{'l'};

	# Without a "go" pattern everything is emitted from the first line.
	my $printing = $go ? 0 : 1;

	# Compile the patterns once rather than on every line.
	my $go_re   = $go   ? qr/$go/   : undef;
	my $stop_re = $stop ? qr/$stop/ : undef;

	# Run lynx through a list-form pipe so the URL is passed as a single
	# argument and never seen by a shell: recursive URLs come from fetched
	# pages and must not be able to inject shell commands.
	my @output;
	if (open(my $lynx, '-|', 'lynx', '-dump', $url)){
		@output = <$lynx>;
		close($lynx);
	}
	else{
		warn "web2txt: cannot run lynx for '$url': $!\n";
	}

	my @ret=("\n\n");
	my $inrefs=0;
	foreach my $line (@output){
		if ($go_re && !$printing && $line =~ $go_re){
			$printing=1;
		}
		if ($stop_re && $printing && $line =~ $stop_re){
			$printing=0;
		}
		# lynx appends the link targets under a "References" heading.
		if ($line =~ /^References$/){
			# Not recursing: the link list is of no interest, so stop here.
			last if !$recurse;
			$inrefs=1;
			next;
		}
		if (!$inrefs){
			push @ret,$line if $printing;
			next;
		}
		# Reference entries look like "   12. http://host/path"; the anchor
		# keeps unnumbered lines in that section from being taken for links.
		if (my ($newurl) = $line =~ /^\s*\d+\.\s*(.*)$/){
			next if $newurl eq '';
			next if ( $newurl =~ /mailto/);
			next if ( $newurl =~ /javascript/);
			push @ret, gettext($newurl,$newrecurse,$go,$stop);
		}
	}
	return @ret;
}
