#!/bin/perl
#
# Bradley Huffaker		(07/31/98)
# To create a data file based on the connections between html documents
# 

use Socket;

# Crawl parameters.  Both can be overridden on the command line:
#
#     script [start-url [depth]]
#
# Without arguments the historical defaults below are used.
# $start_file stays a package global because PrintFVL reads it to mark
# the root node.
$start_file = (@ARGV > 0 && length($ARGV[0]))
	? $ARGV[0]
	: "http://caida.org/Tools/Otter/index.html";
$depth = (@ARGV > 1 && $ARGV[1] =~ /^\d+$/) ? $ARGV[1] : 2;

# ProcessURL appends "index.html" to URLs ending in "/".  Do the same
# here so the node name PrintFVL compares against $start_file matches.
$start_file .= "index.html" if ($start_file =~ /\/$/);

ProcessURL($start_file,$depth);
PrintFVL();

# ProcessURL($url, $depth)
#
# Fetches $url with a plain HTTP/1.0 GET over the shared SOCK handle,
# registers it as a graph node and, while $depth is greater than zero,
# follows the <a href> / <img src> targets found in the body (at most
# one per input line), recursing with $depth-1.
#
# Package globals written:
#   %reached, %htmlReached   URLs that have already been visited
#   %broken                  URLs that could not be parsed or contacted
#   %url2server, %url2format values of the Server: / Content-Type: headers
#   %mapping                 redirect source => redirect target (301 only)
#   @urls, %urls             node list in discovery order / membership
#   %links                   "from\0to" => number of occurrences
#   %link2type               matched tag prefix => last URL seen with it
#
# The return value carries no information.
sub ProcessURL
{
	my ($url,$depth) = @_;
	$url .= "index.html" if ($url=~/\/$/);
	return if ($reached{$url} || 0 > $depth);

	# Mark the URL before any recursion so that redirect loops and
	# back-links can never fetch the same document twice.
	$reached{$url} = 1;
	$htmlReached{$url} = 1;

	# The port and the path are both optional; "http://host" must not
	# lose the last character of the host name to the path.
	my ($type,$server,$port,$path) =
		($url=~/^([^\:&^\/]+)\:\/\/([^\:|\/]+)(?:\:(\d*))?(.*)/);
	$type = defined($type) ? lc($type) : "";

	if ($type eq "")
	{
		# Not a scheme://host URL at all, so there is nothing to connect to.
		$broken{$url} = 1;
		return;
	}
	if ($type ne "http")
	{
		# Only plain HTTP can be fetched through this socket code.
		# The original stored an undefined $format here; the scheme is
		# the only format information available for such a resource.
		$url2format{$url} = $type;
		return;
	}

	$port = 80 if (!$port);
	$path = "/".$path if ($path !~ /^\//);
	my $authority = ($port == 80) ? $server : "$server:$port";

	# Opens a socket to the server and sets SOCK to the handle
	if (!open_socket($port,$server))
	{
		$broken{$url} = 1;
		return;
	}

	# HTTP lines end in CRLF; the Host header is needed by servers that
	# host several sites on a single address.
	print SOCK "GET $path HTTP/1.0\r\n";
	print SOCK "Host: $authority\r\n";
	print SOCK "User-Agent: Mozilla/3.01Gold (X11; U; SunOS 5.5.1 sun4u)\r\n";
	print SOCK "\r\n";

	# Read the response headers up to the blank separator line.  The
	# status is lexical so a previous request can never leak its code
	# into this one.
	my $httpRequest = 0;
	my $location;
	while (defined(my $line = <SOCK>))
	{
		last if ($line=~/^\r?\n?$/);
		if ($line=~/^HTTP\/\S+\s+(\d+)/)
		{
			$httpRequest = $1;
		}
		elsif ($line=~/^Server\:\s*(\S+)/i)
		{
			$url2server{$url} = $1;
		}
		elsif ($line=~/^Content-Type\:\s*(\S+)/i)
		{
			$url2format{$url} = $1;
		}
		elsif ($line=~/^Location\:\s*(http\:\/\/\S+)/i)
		{
			$location = $1;
		}
	}

	# Same patterns as before, written case-insensitively:
	# $1 is the tag prefix, $2 the referenced URL.
	my $href_re = qr/(\<a\s+href)=\"*([^\"&^\>]+)\"*.*\>/i;
	my $src_re  = qr/(\<img\s+src)=\"*([^\"&^\>]+)\"*.*\>/i;

	# Captures the redirects
	if ($httpRequest == 301)
	{
		# The target is the last anchor in the response body; the
		# Location header is used only when the body names none.
		my $target;
		while (defined(my $row = <SOCK>))
		{
			$target = $2 if ($row =~ $href_re);
		}
		# SOCK is shared, so release it before the recursive fetch.
		close SOCK;
		$target = $location if (!$target);

		if ($target)
		{
			$target .= "index.html" if ($target =~/\/$/);
			if ($target ne $url)
			{
				$mapping{$url} = $target;
				ProcessURL($target,$depth);
			}
		}
		return;
	}

	if (!$urls{$url})
	{
		push @urls,$url;
		$urls{$url} = 1;
	}

	if ($depth == 0)
	{
		# Leaf of the crawl: the body is not needed.
		close SOCK;
		return;
	}

	# Base locations for relative links, computed once so that no link
	# can change how the following ones are resolved.
	(my $dir = $path) =~ s/[?#].*$//;	# drop query / fragment
	$dir =~ s/[^\/]*$//;			# drop the document name
	(my $parent = $dir) =~ s/[^\/]+\/$//;	# one directory up

	my @next_hop;
	while (defined(my $row = <SOCK>))
	{
		my $link;
		if ($row =~ $href_re || $row =~ $src_re)
		{
			$link2type{$1} = $2;
			$link = $2;
		}
		next if (!$link);

		if ($link eq "..")
		{
			$link = "http://".$authority.$parent;
		}
		elsif ($link eq ".")
		{
			$link = $url;
		}
		elsif ($link !~ /^[A-Za-z][A-Za-z0-9+.\-]*\:/)
		{
			# No scheme: relative to this document's directory
			# unless it starts at the server root.
			$link = $dir.$link
				if ($link !~ /^\//);
			$link = "http://".$authority.$link;
		}
		$link .= "index.html" if ($link=~/\/$/);
		$links{"$url\0$link"}++;

		if (!$urls{$link})
		{
			# Every link endpoint becomes a node, but it is only
			# queued for fetching if it has not been visited yet.
			push @urls,$link;
			$urls{$link} = 1;
			push @next_hop,$link if (!$reached{$link});
		}
	}

	close SOCK;
	foreach my $next (@next_hop)
	{
		ProcessURL($next,$depth-1);
	}
	return;
}

# PrintFVL()
#
# Writes the collected graph to STDOUT in this order:
#
#     t <number of nodes>
#     T <number of links>
#     ? <index> <url>[ r]        one line per node, " r" marks the root
#     l <index> <from> <to>      one line per link, endpoints as node indices
#
# Reads the package globals @urls, %links, %mapping and $start_file.
# URLs that appear as a key of %mapping (redirect sources) are not nodes;
# links touching them are re-pointed at the redirect target.
# Fills the package globals @links (sorted link keys) and %node2index.
sub PrintFVL
{
	# Follows a chain of redirects to its end; %seen stops a cycle.
	my $resolve = sub {
		my ($name) = @_;
		my %seen;
		while (defined($name) && $mapping{$name} && !$seen{$name}++)
		{
			$name = $mapping{$name};
		}
		return $name;
	};

	# Number the nodes first so that the counts printed in the header
	# describe exactly what is printed below them.
	my @nodes = grep { !$mapping{$_} } @urls;
	%node2index = ();
	my $node_index = 0;
	foreach my $node (@nodes)
	{
		$node2index{$node} = $node_index++;
	}

	# Sorted for reproducible output.  A link is kept only when both
	# endpoints resolve to a numbered node.
	@links = sort keys %links;
	my @edges;
	foreach my $link (@links)
	{
		my ($from,$to) = map { $resolve->($_) } split(/\0/,$link,2);
		next if (!defined($from) || !defined($to));
		next if (!exists($node2index{$from}) || !exists($node2index{$to}));
		push @edges,[$node2index{$from},$node2index{$to}];
	}

	# The root is where the start URL finally ended up.
	my $root = $resolve->($start_file);
	$root = "" if (!defined($root));

	print STDOUT "t ",scalar(@nodes),"\n";
	print STDOUT "T ",scalar(@edges),"\n";

	foreach my $node (@nodes)
	{
		print STDOUT "? $node2index{$node} $node";
		print STDOUT " r" if ($root eq $node);
		print STDOUT "\n";
	}

	my $link_index = 0;
	foreach my $edge (@edges)
	{
		my ($from,$to) = @$edge;
		print STDOUT "l $link_index $from $to\n";
		$link_index++;
	}
	return;
}

# open_socket($port, $hostname)
#
# Opens a TCP connection to $hostname on the package-global handle SOCK
# and turns on autoflush for it.
#
#   $port      numeric port, or a service name looked up with getservbyname
#   $hostname  host name or dotted-quad address
#
# Returns 1 when SOCK is connected, 0 on any failure (unknown service,
# missing or unresolvable host, socket or connect error).  It never dies,
# so the caller can simply record the URL as broken.
sub open_socket
{
        my ($port, $hostname) = @_;

        if (defined($port) && $port =~ /\D/)
        {
                $port = getservbyname($port, 'tcp');
        }
        return 0 if (!$port);

        return 0 if (!defined($hostname) || $hostname eq '');

        # inet_aton accepts names as well as addresses and returns undef
        # when the host cannot be resolved; sockaddr_in would croak on
        # that undef, so check it here.
        my $iaddr = inet_aton($hostname);
        return 0 if (!defined($iaddr));

        my $paddr = sockaddr_in($port, $iaddr);
        my $proto = getprotobyname('tcp');

        socket(SOCK, PF_INET, SOCK_STREAM, $proto) || return 0;
        if (!connect(SOCK, $paddr))
        {
                close(SOCK);
                return 0;
        }

        # $| applies to the currently selected handle: select SOCK just
        # long enough to enable autoflush, then restore the previous
        # default output handle.
        my $previous = select(SOCK);
        $| = 1;
        select($previous);

        return 1;
}

