/**
 * A simple class that recognizes the robots.txt protocol.
 * Several small methods cooperate to fetch the file, extract its
 * 'Disallow' rules and answer whether a path may be visited.
 *
 * Classroom demonstration.
 *
 * Fall 2006.
 *
 * Xiannong Meng
 */
import java.io.*;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.util.*;

public class Robots {

    // Data members
    private String robotContent = "";             // content of the robots.txt (never null)
    private URL thisUrl;                          // the URL that connectAndRead() fetches
    private String[] lookUpTable = new String[0]; // the recorded 'Disallow' path prefixes
    private int sizeOfTable;                      // number of valid entries in lookUpTable

    /**
     * Constructor: initialize the URL object with the incoming URL.
     * No network access happens here.
     *
     * If the string is not a well-formed URL, a message is printed to
     * standard error and the JVM exits with status 1.
     *
     * @param inUrl the URL to read the robots.txt content from
     */
    public Robots(String inUrl) {
        try {
            thisUrl = new URL(inUrl);
        } catch (MalformedURLException e) {
            System.err.println("Malformed URL");
            System.exit(1);
        }
        sizeOfTable = 0;
    }

    /**
     * Connect to the URL given to the constructor and read the whole
     * response body into memory. The URL is fetched exactly as given;
     * no "/robots.txt" is appended.
     *
     * On an I/O failure, or on any response code other than 200, a message
     * is printed to standard error and the JVM exits with status 1.
     *
     * @throws ClassCastException if the URL does not yield an HTTP(S) connection
     */
    public void connectAndRead() {
        HttpURLConnection uC = null;
        int responseCode = -1;

        try {
            // connect to the site
            uC = (HttpURLConnection) thisUrl.openConnection();
            System.out.println("port " + thisUrl.getPort());
            responseCode = uC.getResponseCode();
        } catch (IOException e) {
            System.err.println("IO excption caught when openning connection!");
            System.exit(1);
        }

        if (responseCode != HttpURLConnection.HTTP_OK) {
            System.err.println("Server didn't response positively!");
            System.exit(1);
        }

        // Accumulate into a fresh builder so the content never starts with
        // the text "null" and repeated calls do not append to stale data.
        StringBuilder content = new StringBuilder();
        try (BufferedReader remote = new BufferedReader(
                new InputStreamReader(uC.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = remote.readLine()) != null) {
                content.append(line).append('\n');
            }
        } catch (IOException ex) {
            System.err.println("IO excption caught when read!");
            System.exit(1);
        }
        robotContent = content.toString();
    }

    /**
     * Build the look-up table from robots.txt content.
     *
     * The content is parsed one line at a time. Anything after a '#'
     * is treated as a comment. Only 'Disallow' entries (case-insensitive)
     * are recorded; every other field is ignored, and rules are pooled
     * regardless of which 'User-agent' section they appear in. A 'Disallow'
     * with an empty value is skipped, so it never blocks every path.
     *
     * @param inContent the robots.txt in a string format; null is treated
     *                  as empty content
     */
    public void build(String inContent) {
        robotContent = (inContent == null) ? "" : inContent;

        // A growable list removes any dependency on a line count gathered
        // elsewhere, so this method is safe to call on its own.
        List<String> disallowed = new ArrayList<String>();
        for (String line : robotContent.split("\\r?\\n|\\r")) {
            String value = disallowValueOf(line);
            if (value != null) {
                disallowed.add(value);
            }
        }

        lookUpTable = disallowed.toArray(new String[0]);
        sizeOfTable = lookUpTable.length;
        sortTable(); // sort the look-up table
    }

    /**
     * Extract the path of a 'Disallow' rule from a single line.
     *
     * @param rawLine one line of robots.txt content
     * @return the disallowed path prefix, or null if the line is not a
     *         'Disallow' rule or its value is empty
     */
    private static String disallowValueOf(String rawLine) {
        String line = rawLine;

        int commentStart = line.indexOf('#');
        if (commentStart >= 0) {
            line = line.substring(0, commentStart);
        }

        int separatorLoc = line.indexOf(':');
        if (separatorLoc <= 0) {
            return null; // no "field:" part on this line
        }

        String field = line.substring(0, separatorLoc).trim();
        if (!field.equalsIgnoreCase("disallow")) {
            return null;
        }

        // The value is the first whitespace-delimited token after the colon;
        // this also handles "Disallow:/path" written without a space.
        StringTokenizer st = new StringTokenizer(line.substring(separatorLoc + 1));
        return st.hasMoreTokens() ? st.nextToken() : null;
    }

    /**
     * Sort the valid part of the table in natural string order.
     */
    private void sortTable() {
        Arrays.sort(lookUpTable, 0, sizeOfTable);
    }

    /**
     * Check to see if any of the paths in the list is a prefix of the
     * incoming 'path'.
     *
     * @param list the table to check against
     * @param path the path to check
     *
     * @return true if 'path' starts with an entry of 'list', false otherwise.
     */
    private boolean isPartOf(String[] list, String path) {
        if (list == null) {
            return false;
        }
        int limit = Math.min(sizeOfTable, list.length);
        for (int i = 0; i < limit; i++) {
            if (path.startsWith(list[i])) {
                return true;
            }
        }
        return false;
    }

    /**
     * Check to see if a given path is allowed to visit. Before any
     * table has been built, every path is allowed.
     *
     * @param path the path to check; surrounding whitespace is ignored
     * @return true if this is allowed to visit, false otherwise.
     */
    public boolean isAllowedToVisit(String path) {
        return !isPartOf(lookUpTable, path.trim());
    }

    /**
     * Wrapper to 'build': builds the table from the content most
     * recently read by connectAndRead() or passed to build().
     */
    public void buildProtocol() {
        build(this.robotContent);
    }

    /**
     * A simple test program.
     *
     * @param args the URL to read and the path to test
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            System.err.println("usage: java Robots url test-path");
            System.exit(1);
        }

        Robots rb = new Robots(args[0].trim());
        rb.connectAndRead();
        rb.buildProtocol();

        // a few simple tests
        if (rb.isAllowedToVisit(args[1].trim())) {
            System.out.println(args[1] + " is allowed to visit");
        } else {
            System.out.println(args[1] + " is not allowed to visit");
        }
    }
}