/*
 * HTMLLinkExtractor.java
 */
//package seme.net;

import java.util.Vector;
import java.util.StringTokenizer;
import java.util.ArrayList;
import java.net.URL;
import java.net.MalformedURLException;
import java.io.InputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

/**
 * Written by Tim Macinta 1997
 * Distributed under the GNU Public License
 * (a copy of which is enclosed with the source).
 *
 * This LinkExtractor can extract URLs from HTML files.
 *
 * Modified by Xiannong Meng to fix the finite state machine
 * to recognize urls containing white spaces. April 2005
 *
 * Modified by Xiannong Meng to put it into package seme.net. May 2006
 */
//public class HTMLLinkExtractor implements LinkExtractor {
public class HTMLLinkExtractor {

    Vector urls = new Vector(6, 9); // list of URLs found so far
    int next_url = 0;               // index of the next URL to return
    int url_count = 0;              // number of URLs collected
    URL base = null;                // base URL used to resolve relative links

    // Parser states used by step():
    //   0   - outside any tag
    //   '<' - just saw '<'; waiting to see whether this is an <a...> tag
    //   'a' - saw "<a"; waiting for whitespace after the tag name
    //   '+' - in the whitespace run after the tag name
    //   '-' - accumulating the tag's attribute text (whitespace dropped)

    /**
     * Creates a new HTMLLinkExtractor that will enumerate all the
     * URLs in the given "cache_file".
     *
     * @param cache_file file containing the HTML to scan
     * @param base_url   base URL used to resolve relative links
     * @throws IOException if the file cannot be read
     */
    public HTMLLinkExtractor(File cache_file, URL base_url) throws IOException {
        this.base = base_url;
        InputStream in = new FileInputStream(cache_file);
        try {
            int state = 0;
            StringBuffer sb = new StringBuffer();
            int i = in.read();
            while (i >= 0) {
                state = step(state, i, sb);
                i = in.read();
            }
            // Input may end inside a tag; analyze whatever was accumulated.
            if (sb.length() > 0) analyze(sb.toString());
        } finally {
            in.close(); // close even if read()/analyze() throws
        }
    }

    /**
     * Creates a new HTMLLinkExtractor that will enumerate all the
     * URLs in the given string.
     *
     * @param thisPage HTML text to scan
     * @param base_url base URL used to resolve relative links
     * @throws IOException declared for symmetry with the File-based
     *                     constructor; not actually thrown here
     */
    public HTMLLinkExtractor(String thisPage, URL base_url) throws IOException {
        this.base = base_url;
        int pLength = thisPage.length();
        if (pLength == 0) return;
        char[] in = thisPage.toCharArray();
        int state = 0;
        StringBuffer sb = new StringBuffer();
        // NOTE: the original read-ahead loop never fed the final character
        // of the page through the state machine; this loop covers them all.
        for (int c = 0; c < pLength; c++) {
            state = step(state, in[c], sb);
        }
        if (sb.length() > 0) analyze(sb.toString());
    }

    /**
     * Feeds one input character through the tag-scanning state machine.
     * When a complete "&lt;a ...&gt;"-style tag has been seen, analyze()
     * is invoked on the accumulated tag text and the buffer is cleared.
     *
     * @param state current parser state (see the state list above)
     * @param i     next input character
     * @param sb    buffer accumulating the current tag's text
     * @return the new parser state
     */
    private int step(int state, int i, StringBuffer sb) {
        switch (state) {
        case 0:
            if (i == '<') state = '<';
            break;
        case '<':
            if (i == '>') {
                state = 0;
                analyze(sb.toString());
                sb.setLength(0);
            } else if (i == 'a' || i == 'A') {
                state = 'a';
                sb.append((char) i);
            }
            break;
        case 'a':
            // Only whitespace after "a" confirms an anchor tag; any other
            // character (including '>') is ignored, as in the original.
            if (Character.isWhitespace((char) i)) {
                state = '+';
                sb.append((char) i);
            }
            break;
        case '+':
            if (!Character.isWhitespace((char) i)) {
                state = '-';
                sb.append((char) i);
            }
            break;
        case '-':
            if (i == '>') {
                state = 0;
                analyze(sb.toString());
                sb.setLength(0);
            } else if (!Character.isWhitespace((char) i)) {
                // Whitespace inside the tag is dropped so URLs broken
                // across lines are still recognized.
                sb.append((char) i);
            }
            break;
        }
        return state;
    }

    /**
     * Analyzes "param", which should be the contents between a '<' and
     * a '>', and adds any URLs that are found to the list of URLs.
     */
    public void analyze(String param) {
        StringTokenizer st = new StringTokenizer(param);
        if (st.countTokens() < 2) return; // need a tag name plus attributes
        String first_word = st.nextToken().toLowerCase();
        if (first_word.equals("a")) {
            analyzeAnchor(st.nextToken("")); // remainder of the tag text
        } else if (first_word.equals("frame")) {
            analyzeFrame(st.nextToken(""));
        } else if (first_word.equals("base")) {
            extractBase(st.nextToken(""));
        }
    }

    /** Analyzes the anchor tag text and records its href value, if any. */
    void analyzeAnchor(String anchor) {
        String href = extract(anchor, "href");
        if (href == null) return;
        try {
            addURL(new URL(base, href));
        } catch (MalformedURLException e) {
            // "mailto:" links are expected to be unresolvable on some
            // platforms; anything else is reported.
            if (!href.startsWith("mailto:")) {
                e.printStackTrace();
            }
        }
    }

    /** Analyzes the frame tag text and records its src value, if any. */
    void analyzeFrame(String frame) {
        String src = extract(frame, "src");
        if (src == null) return;
        try {
            addURL(new URL(base, src));
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }

    /** Extracts the base URL from the base tag and updates "base". */
    void extractBase(String b) {
        String b2 = extract(b, "href");
        if (b2 != null) {
            try {
                base = new URL(base, b2);
            } catch (MalformedURLException e) {
                e.printStackTrace();
            }
        }
    }

    /** Adds "url" to the list of URLs. */
    public void addURL(URL url) {
        urls.addElement(url);
        url_count++;
    }

    /** Returns true while the enumeration has URLs left to return. */
    public boolean hasMoreElements() {
        return url_count != next_url;
    }

    /** Returns the next URL in the enumeration. */
    public Object nextElement() {
        Object ob = urls.elementAt(next_url);
        next_url++;
        return ob;
    }

    /** Resets this enumeration. */
    public void reset() {
        next_url = 0;
    }

    /**
     * Returns the value in "line" associated with "key", or null if
     * "key" is not found.  For instance, if line were
     * {@code a href="blah blah blah"} and "key" were "href" this method
     * would return "blah blah blah".  Keys are case insensitive.
     *
     * @param line tag text to search
     * @param key  attribute name (case insensitive)
     * @return the attribute's value, or null if absent or malformed
     */
    String extract(String line, String key) {
        try {
            key = key.toLowerCase();
            String lower_case = line.toLowerCase();
            int i = lower_case.indexOf(key);
            if (i < 0) return null;
            i += key.length();
            if (line.charAt(i) != '=') return null;
            i++;
            int i2;
            if (line.charAt(i) == '"') {
                // Quoted value: everything up to the closing quote, or to
                // end-of-string when the quote is unterminated.
                i++;
                i2 = line.indexOf('"', i);
                if (i2 < 0) {
                    return line.substring(i);
                } else {
                    return line.substring(i, i2);
                }
            } else {
                // Unquoted value: everything up to the next whitespace.
                // (The original advanced the wrong index variable here and
                // always failed for unquoted attribute values.)
                int targ = line.length();
                for (i2 = i; i2 < targ; i2++) {
                    if (Character.isWhitespace(line.charAt(i2))) break;
                }
                return line.substring(i, i2);
            }
        } catch (StringIndexOutOfBoundsException e) {
            // Malformed tag (e.g. key at end of line); treat as absent.
        }
        return null;
    }

    /**
     * An ad-hoc fix that appends a trailing '/' when the URL appears to
     * name a host with no path, then collapses duplicated slashes.
     * Only intended to work for "href" values.
     *
     * @param inUrl the URL string to fix
     * @return the adjusted URL string
     */
    static public String fixUrl(String inUrl) {
        int docLoc = inUrl.lastIndexOf('.');
        int protLoc = inUrl.indexOf("://");
        int slashLoc = -1;
        if (protLoc > 0)
            slashLoc = inUrl.indexOf('/', protLoc + 3);
        else
            slashLoc = inUrl.indexOf('/');
        if (slashLoc < 0 || (slashLoc > 0 && docLoc < slashLoc)) // no path part
            inUrl = inUrl + '/';
        inUrl = removeExtraSlash(inUrl);
        return inUrl;
    }

    /**
     * Removes extra slashes "/" in the given url.
     *
     * @param inStr an incoming url.  Precondition: inStr is a valid url.
     * @return the fixed url without any duplicated slashes; the "://"
     *         after the protocol is left untouched.
     */
    static public String removeExtraSlash(String inStr) {
        int slashLoc = inStr.indexOf("://");
        int l = inStr.length();
        int hold = slashLoc + 3; // start scanning just past "://"
        slashLoc = inStr.indexOf("//", hold);
        while (slashLoc > 0 && slashLoc < l) {
            hold = slashLoc;
            while (hold < l && inStr.charAt(hold) == '/') hold++;
            if (hold >= l) {
                // URL ends in a run of slashes; keep exactly one.
                inStr = inStr.substring(0, slashLoc + 1);
                break;
            } else
                inStr = inStr.substring(0, slashLoc + 1) + inStr.substring(hold);
            slashLoc = inStr.indexOf("//", hold);
            l = inStr.length();
        }
        return inStr;
    }

    /**
     * Print a given url in its completion, protocol://domain/file
     *
     * @param theUrl is a valid URL object.
     * Precondition: a valid URL object is given.
     * Postcondition: The given url is printed.
     */
    static public void printAUrl(URL theUrl) {
        if (theUrl.getPort() > 0)
            System.out.println(theUrl.getProtocol() + "://" + theUrl.getHost()
                               + ":" + theUrl.getPort() + theUrl.getFile());
        else if (theUrl.getProtocol().compareToIgnoreCase("mailto") == 0)
            System.out.println(theUrl.getProtocol() + ":" + theUrl.getFile());
        else
            System.out.println(theUrl.getProtocol() + "://" + theUrl.getHost()
                               + theUrl.getFile());
    }

    /**
     * Print all urls held currently in the link extractor.
     *
     * Precondition: the link extractor holds a set of valid urls.
     * Postcondition: these urls are printed.
     */
    public void printAllUrls() {
        int count = urls.size();
        for (int i = 0; i < count; i++) {
            URL aUrl = (URL) urls.elementAt(i);
            printAUrl(aUrl);
        }
    }

    /** Return the set of urls held by this link extractor. */
    public Vector getAllUrls() {
        return urls;
    }

    /*
     * A simple test program.  argv[0] is a base URL in the form
     * proto://host.domain/dir/file; argv[1] is a local HTML file to scan.
     */
    static public void main(String[] argv) throws Exception {
        URL aUrl = new URL(argv[0]);
        File aFile = new File(argv[1]);
        InputStream in = new FileInputStream(aFile);
        // StringBuffer avoids the O(n^2) cost of String += in a byte loop.
        StringBuffer page = new StringBuffer();
        try {
            int c = in.read();
            while (c >= 0) {
                page.append((char) c);
                c = in.read();
            }
        } finally {
            in.close();
        }
        HTMLLinkExtractor htmlExtractor = new HTMLLinkExtractor(page.toString(), aUrl);
        System.out.println("count of urls : " + htmlExtractor.urls.size());
        htmlExtractor.printAllUrls();
    }
}