//------------
// Introduction to Programming Using Java: An Object-Oriented Approach
// Arnow/Weiss
//------------

import java.io.*;
import java.util.*;
import java.net.*;

// Collects the set of web pages that are reachable from a starting URL and
// whose URLs contain a given "site string".
public class WebSurveyor {

    // Initialize a WebSurveyor object capable of giving a set of WebPages
    // reachable from startingUrl, all on the site defined by siteString.
    // If startingUrl itself does not contain siteString, the set stays empty.
    public WebSurveyor(String startingUrl, String siteString) throws IOException {
        wpList = new Set();
        urlList = new Set();
        if (startingUrl.indexOf(siteString) == -1) {
            return;
        }
        WebPage wp = new WebPage(startingUrl);
        wpList.addElement(wp);
        urlList.addElement(startingUrl);
        survey(wp, siteString);
    }

    // Examine the web pages of the site defined by siteString and reachable
    // from the web page wp. Every URL is recorded in urlList before it is
    // probed, so a page is visited at most once even when links form a cycle.
    private void survey(WebPage wp, String siteString) throws IOException {
        Enumeration e = wp.getLinks().elements();
        while (e.hasMoreElements()) {
            String url = (String) e.nextElement();
            if (url.indexOf(siteString) == -1 || urlList.contains(url)) {
                continue;
            }
            WebPage wp2 = new WebPage(url);
            urlList.addElement(url);
            if (!wp2.isBad()) {
                wpList.addElement(wp2);
                survey(wp2, siteString);
            }
        }
    }

    // Return the set of WebPage objects found by the survey.
    Set getPages() {
        return wpList;
    }

    private Set wpList;   // set of existing web pages whose URLs contain
                          // the siteString and that are reachable from
                          // the initial URL given to WebSurveyor
    private Set urlList;  // set of URLs, good or bad, that have already
                          // been surveyed

    // Print the bad links of wp, one per line, under a heading.
    private static void showBad(WebPage wp, Set badset) {
        System.out.println("Bad pages of "+wp.getURL()+":");
        Enumeration ebad = badset.elements();
        while (ebad.hasMoreElements()) {
            System.out.println(ebad.nextElement());
        }
    }

    // Usage: java WebSurveyor startingUrl siteString
    // Surveys the site and prints the bad links of every page found.
    public static void main(String[] args) throws IOException {
        if (args.length < 2) {
            // Without this check a missing argument ends in an
            // ArrayIndexOutOfBoundsException instead of a usable message.
            System.err.println("Usage: java WebSurveyor <startingUrl> <siteString>");
            return;
        }
        String startingUrl = args[0];
        String siteString = args[1];
        WebSurveyor ws = new WebSurveyor(startingUrl, siteString);
        Enumeration e = ws.getPages().elements();
        while (e.hasMoreElements()) {
            WebPage wp = (WebPage) e.nextElement();
            Set badset = wp.getBadLinks();
            if (!badset.isEmpty()) {
                showBad(wp, badset);
            }
        }
    }
}

// A web page identified by its URL. The page is re-read from its URL every
// time links are requested; nothing is cached.
class WebPage {

    public WebPage(String url) {
        this.url = url;
    }

    // Return true if this page's own URL looks like a bad link.
    public boolean isBad() throws IOException {
        return isBad(this.url);
    }

    // Return a set of all the HTTP HREFs in this web page.
    public Set getLinks() throws IOException {
        Set links = new Set();
        HttpReader hr = new HttpReader(url);
        try {
            String link = hr.readLine();
            while (link != null) {
                // NOTE(review): '=' is replaced by a NUL character before the
                // link is stored. The reason is not visible in this file; the
                // behavior is kept unchanged -- confirm whether it is wanted.
                if (link.indexOf("=") != -1) {
                    link = link.replace('=', '\0');
                }
                links.addElement(link);
                link = hr.readLine();
            }
        } finally {
            hr.close();
        }
        return links;
    }

    // Return a set of all the HTTP HREFs in this web page that are bad links.
    public Set getBadLinks() throws IOException {
        Set links = new Set();
        HttpReader hr = new HttpReader(url);
        try {
            String link = hr.readLine();
            while (link != null) {
                if (isBad(link)) {
                    links.addElement(link);
                }
                link = hr.readLine();
            }
        } finally {
            hr.close();
        }
        return links;
    }

    public String getURL() {
        return url;
    }

    // Return true if url cannot be found, or if the first line read from it
    // mentions 404, 403, "NOT FOUND" or "FORBIDDEN" (case-insensitive).
    // An empty response (no first line) is not considered bad.
    // Other I/O failures are still reported as IOException.
    private boolean isBad(String url) throws IOException {
        WebReader wr;
        try {
            wr = new WebReader(url);
        } catch (FileNotFoundException e) {
            // Opening the stream of a missing resource fails with this
            // exception; that is exactly the "bad link" case being looked for.
            return true;
        }
        try {
            String first = wr.readLine();
            if (first == null) {
                return false;
            }
            // Fixed locale: default-locale upper-casing can turn 'i' into a
            // dotted capital I, which would hide "FORBIDDEN".
            String s = first.toUpperCase(Locale.ENGLISH);
            return s.indexOf("404") != -1
                || s.indexOf("403") != -1
                || s.indexOf("NOT FOUND") != -1
                || s.indexOf("FORBIDDEN") != -1;
        } finally {
            wr.close();
        }
    }

    private String url;

    // Usage: java WebPage url
    // Prints every link found in the page.
    public static void main(String[] a) throws Exception {
        WebPage wp = new WebPage(a[0]);
        Set links = wp.getLinks();
        Enumeration e = links.elements();
        while (e.hasMoreElements()) {
            System.out.println(wp.getURL()+"has this link:"+e.nextElement());
        }
    }
}

// Reads the content behind a URL one line at a time.
class WebReader {

    public WebReader(String url) throws IOException {
        URL u = new URL(url);
        URLConnection uC = u.openConnection();
        // getInputStream() instead of casting getContent(): getContent()
        // returns an Object whose class depends on the content type and the
        // protocol handler, so the cast could fail with ClassCastException.
        InputStream ins = uC.getInputStream();
        br = new BufferedReader(new InputStreamReader(ins));
    }

    // Return the next line of content, or null at end of input.
    public String readLine() throws IOException {
        return br.readLine();
    }

    // Release the underlying stream.
    public void close() throws IOException {
        br.close();
    }

    private BufferedReader br;
}

// Reads the links of a page and keeps only those containing "http:".
class HttpReader {

    public HttpReader(String urlString) throws IOException {
        lr = new LinkReader(urlString);
    }

    // Return the next HTTP HREF in complete URL form, or null when there are
    // no more. Only links containing "http:" (any case) pass the filter, so
    // for example "https:" links are skipped.
    public String readLine() throws IOException {
        String line = lr.readLine();
        while (line != null
               && line.toUpperCase(Locale.ENGLISH).indexOf("HTTP:") == -1) {
            line = lr.readLine();
        }
        return line;
    }

    // Release the underlying stream.
    public void close() throws IOException {
        lr.close();
    }

    private LinkReader lr;
}

// Reads the HREFs of a page and completes relative ones into full URLs,
// using the page's own URL as the base.
class LinkReader {

    public LinkReader(String urlString) throws IOException {
        hr = new HrefReader(urlString);
        url = new URL(urlString);
        host = url.getHost();
        resource = url.getFile();
        port = url.getPort();
        protocol = url.getProtocol();

        // Both directories default to the root so that they are never null
        // (a null would be concatenated into links as the text "null").
        directory = "/";
        parentDirectory = "/";
        // The path is used rather than the file part: the file part includes
        // the query string, and a '/' inside a query is not a directory.
        String path = url.getPath();
        int k = path.lastIndexOf("/");
        if (k != -1) {
            String dir = path.substring(0, k);
            int p = dir.lastIndexOf("/");
            if (p != -1) {
                parentDirectory = path.substring(0, p) + "/";
            }
            directory = dir + "/";
        }

        // Keep an explicit port; getPort() is -1 when the URL names none.
        base = protocol + "://" + host + (port == -1 ? "" : ":" + port);
    }

    // Return the next HREF of the page in complete URL form, or null when
    // there are no more.
    public String readLine() throws IOException {
        String link = hr.readLine();
        if (link == null) {
            return null;
        }

        // Already absolute: only make sure a bare host ends with '/'.
        int k = link.indexOf("://");
        if (k != -1) {
            return link.indexOf("/", k + 3) == -1 ? link + "/" : link;
        }

        // A ':' preceded by no '.' is taken to be a scheme prefix
        // (for example "mailto:"); such links are returned untouched.
        k = link.indexOf(":");
        if (k != -1 && link.substring(0, k).indexOf(".") == -1) {
            return link;
        }

        if (link.equals("..")) {
            return base + parentDirectory;
        }
        if (link.startsWith("../")) {
            return base + parentDirectory + link.substring(3);
        }
        if (link.startsWith("/")) {
            return base + link;
        }
        return base + directory + link;
    }

    // Release the underlying stream.
    public void close() throws IOException {
        hr.close();
    }

    private HrefReader hr;          // The HrefReader to read HREFs from
    private URL url;                // The URL of this page
    private String host,            // The host portion of this URL
                   resource,        // The resource portion of this URL
                   protocol,        // The protocol portion of this URL
                   directory,       // The resource's directory, with trailing '/'
                   parentDirectory, // The directory of the resource's
                                    // directory, with trailing '/'
                   base;            // protocol://host[:port]
    private int port;               // The port number of this URL, or -1
}

// Reads the tags of a page and returns the value of each HREF attribute.
class HrefReader {

    public HrefReader(String url) throws IOException {
        tr = new TagReader(url);
    }

    // Return index of HREF, href, HrEf and so on in s, or -1 if absent.
    // The search runs on s itself so the index is always valid for s.
    private int hrefIndex(String s) {
        for (int i = 0; i + 4 <= s.length(); i++) {
            if (s.regionMatches(true, i, "HREF", 0, 4)) {
                return i;
            }
        }
        return -1;
    }

    // Return the part of s before the first occurrence of x, with spaces
    // trimmed; if x does not occur, return s trimmed.
    private String trimFrom(String s, String x) {
        int k = s.indexOf(x);
        if (k != -1) {
            return s.substring(0, k).trim();
        }
        return s.trim();
    }

    // Return the part of s after the first occurrence of x, with spaces
    // trimmed; if x does not occur, return s trimmed.
    private String trimUpThrough(String s, String x) {
        int k = s.indexOf(x);
        if (k != -1) {
            // Everything AFTER x; substring(k, k + x.length()) would be x itself.
            return s.substring(k + x.length()).trim();
        }
        return s.trim();
    }

    // Return the value of the next HREF attribute found in the page, without
    // surrounding quotes and cut at the first '#' or '?'; null when the page
    // has no more tags containing "href".
    public String readLine() throws IOException {
        String tag = tr.readLine();
        while (tag != null && hrefIndex(tag) == -1) {
            tag = tr.readLine();
        }
        if (tag == null) {
            return null;
        }
        int k = hrefIndex(tag);
        tag = tag.substring(k + 1).trim();
        tag = trimUpThrough(tag, "=");   // drop "ref =", keep the value onward
        tag = trimFrom(tag, " ");        // drop any following attributes
        if (tag.indexOf("\"") != -1) {
            tag = trimUpThrough(tag, "\"");
            tag = trimFrom(tag, "\"");
        }
        if (tag.indexOf("'") != -1) {
            tag = trimUpThrough(tag, "'");
            tag = trimFrom(tag, "'");
        }
        tag = trimFrom(tag, "#");
        return trimFrom(tag, "?");
    }

    // Release the underlying stream.
    public void close() throws IOException {
        tr.close();
    }

    private TagReader tr;   // The TagReader to read tags from
}

// Reads a page and returns the text of its tags, one tag per call.
class TagReader {

    public TagReader(String url) throws IOException {
        wr = new WebReader(url);
        line = null;
    }

    // Return the text between the next '<' and the '>' that follows it, or
    // null when the input holds no further complete tag. A tag may span
    // several input lines; they are joined with a single space.
    public String readLine() throws IOException {
        if (line == null) {
            line = wr.readLine();
        }
        // Skip input that opens no tag.
        while (line != null && line.indexOf("<") == -1) {
            line = wr.readLine();
        }
        if (line == null) {
            return null;
        }

        // Discard text before the tag, including any stray '>' in it.
        line = line.substring(line.indexOf("<"));

        // Pull in more lines until the tag is closed.
        while (line.indexOf(">") == -1) {
            String nextLine = wr.readLine();
            if (nextLine == null) {
                line = null;   // unterminated tag at end of input
                return null;
            }
            line = line + " " + nextLine;
        }

        int tagEnd = line.indexOf(">");
        String tag = line.substring(1, tagEnd);
        line = line.substring(tagEnd + 1);
        return tag;
    }

    // Release the underlying stream.
    public void close() throws IOException {
        wr.close();
    }

    private String line;    // The unprocessed part of the most recent line
                            // read from wr
    private WebReader wr;   // The WebReader to read lines from
}

// A collection without duplicates (as decided by equals), backed by a Vector.
// Elements keep the order in which they were first added.
class Set {

    public Set() {
        vector = new Vector();
    }

    public Set(int n) {
        vector = new Vector(n);
    }

    public boolean isEmpty() {
        return vector.isEmpty();
    }

    public int size() {
        return vector.size();
    }

    public boolean contains(Object o) {
        return vector.contains(o);
    }

    // Add o unless an equal element is already present.
    public void addElement(Object o) {
        if (!contains(o)) {
            vector.addElement(o);
        }
    }

    // Return a new Set holding the same elements (the elements themselves
    // are not copied).
    public Object clone() {
        Set destSet = new Set();
        Enumeration e = vector.elements();
        while (e.hasMoreElements()) {
            destSet.addElement(e.nextElement());
        }
        return destSet;
    }

    // Return a new Set with the elements of s followed by those elements of
    // this set that s does not contain.
    public Set union(Set s) {
        Set unionSet = (Set) s.clone();
        Enumeration e = vector.elements();
        while (e.hasMoreElements()) {
            unionSet.addElement(e.nextElement());
        }
        return unionSet;
    }

    // Return a new Set with the elements of this set that s also contains.
    public Set intersection(Set s) {
        Set interSet = new Set();
        Enumeration e = this.vector.elements();
        while (e.hasMoreElements()) {
            Object elem = e.nextElement();
            if (s.contains(elem)) {
                interSet.addElement(elem);
            }
        }
        return interSet;
    }

    public Enumeration elements() {
        return vector.elements();
    }

    // Print every element followed by a space.
    public void print(PrintStream ps) {
        Enumeration e = vector.elements();
        while (e.hasMoreElements()) {
            ps.print(e.nextElement().toString());
            ps.print(" ");
        }
    }

    private Vector vector;
}