import java.util.Vector;
import java.util.StringTokenizer;
import java.util.ArrayList;
import java.net.URL;
import java.net.MalformedURLException;
import java.io.InputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

/**
 * Extracts URLs from HTML files (anchor, frame and base tags).
 *
 * Written by Tim Macinta 1997.
 * Distributed under the GNU General Public License
 * (a copy of which is enclosed with the source).
 *
 * Modified by Xiannong Meng (April 2005) to fix the finite state machine
 * to recognize urls containing white spaces.
 * Revised 2013-04-27: changed the use of Vector to ArrayList.
 */
public class HTMLLinkExtractor {

    /** URLs found so far, in document order (read directly by main()). */
    ArrayList<URL> urls = new ArrayList<URL>();
    /** Index of the next URL handed out by nextElement(). */
    int next_url = 0;
    /** Number of URLs collected; kept equal to urls.size(). */
    int url_count = 0;
    /** Base URL for resolving relative links; updated by a base tag. */
    URL base = null;

    /**
     * Creates a new HTMLLinkExtractor that will enumerate all the
     * URLs in the given "cache_file".
     *
     * @param cache_file the HTML file to scan
     * @param base_url   the url base for this page (for relative links)
     * @throws IOException if the file cannot be read
     */
    public HTMLLinkExtractor(File cache_file, URL base_url) throws IOException {
        this.base = base_url;
        InputStream in = new FileInputStream(cache_file);
        try {
            StringBuilder sb = new StringBuilder();
            int state = 0;
            for (int c = in.read(); c >= 0; c = in.read()) {
                state = step(state, c, sb);
            }
            // Flush a tag left open at end of input.
            if (sb.length() > 0) analyze(sb.toString());
        } finally {
            in.close();  // close even if read()/analyze() throws (was leaked)
        }
    }

    /**
     * Creates a new HTMLLinkExtractor that will enumerate all the
     * URLs in the given string.
     *
     * @param thisPage the web page being processed
     * @param base_url the url base for this web page (i.e., relative base)
     * @throws IOException never thrown; kept for interface compatibility
     */
    public HTMLLinkExtractor(String thisPage, URL base_url) throws IOException {
        this.base = base_url;
        StringBuilder sb = new StringBuilder();
        int state = 0;
        // BUG FIX: the original loop ("i = in[c++]; while (c < pLength)")
        // read the last character but never fed it to the state machine.
        for (int c = 0; c < thisPage.length(); c++) {
            state = step(state, thisPage.charAt(c), sb);
        }
        if (sb.length() > 0) analyze(sb.toString());
    }

    /**
     * Advances the tag-scanning state machine by one character.
     * States (values double as mnemonic characters):
     *   0   - outside any tag
     *   '<' - just saw '<', waiting for a tag name
     *   'a' - tag name starts with 'a'/'A' (possible anchor/base/frame... tag)
     *   '+' - in the whitespace after the tag name
     *   '-' - accumulating attribute text; whitespace is dropped here so
     *         urls containing spaces still tokenize as one attribute
     *
     * @param state current state
     * @param i     next input character
     * @param sb    accumulator for the current tag's text; passed to
     *              analyze() and cleared when '>' closes the tag
     * @return the next state
     */
    private int step(int state, int i, StringBuilder sb) {
        switch (state) {
        case 0:
            if (i == '<') state = '<';
            break;
        case '<':
            if (i == '>') {
                state = 0;
                analyze(sb.toString());
                sb.setLength(0);
            } else if (i == 'a' || i == 'A') {
                state = 'a';
                sb.append((char) i);
            }
            break;
        case 'a':
            if (Character.isWhitespace((char) i)) {
                state = '+';
                sb.append((char) i);
            }
            break;
        case '+':
            if (!Character.isWhitespace((char) i)) {
                state = '-';
                sb.append((char) i);
            }
            break;
        case '-':
            if (i == '>') {
                state = 0;
                analyze(sb.toString());
                sb.setLength(0);
            } else if (!Character.isWhitespace((char) i)) {
                sb.append((char) i);
            }
            break;
        }
        return state;
    }

    /**
     * Analyzes "param", which should be the contents between a '<' and a '>',
     * and adds any URLs that are found to the list of URLs.
     * Only "a", "frame" and "base" tags are recognized.
     *
     * @param param The parameter section
     */
    public void analyze(String param) {
        StringTokenizer st = new StringTokenizer(param);
        if (st.countTokens() < 2) return;  // need a tag name plus attributes
        String first_word = st.nextToken().toLowerCase();
        // nextToken("") returns the rest of the string untokenized
        if (first_word.equals("a")) {
            analyzeAnchor(st.nextToken(""));
        } else if (first_word.equals("frame")) {
            analyzeFrame(st.nextToken(""));
        } else if (first_word.equals("base")) {
            extractBase(st.nextToken(""));
        }
    }

    /**
     * Analyzes the anchor tag: resolves its href against the current base
     * and records the resulting URL.
     *
     * @param anchor The anchor text to be analyzed.
     */
    void analyzeAnchor(String anchor) {
        String href = extract(anchor, "href");
        if (href == null) return;
        try {
            addURL(new URL(base, href));
        } catch (MalformedURLException ignored) {
            // Links java.net.URL cannot parse (e.g. unusual schemes) are
            // deliberately skipped rather than aborting the whole scan.
        }
    }

    /**
     * Analyzes the frame tag: resolves its src against the current base
     * and records the resulting URL.
     *
     * @param frame The 'frame' tag being analyzed
     */
    void analyzeFrame(String frame) {
        String src = extract(frame, "src");
        if (src == null) return;
        try {
            addURL(new URL(base, src));
        } catch (MalformedURLException ignored) {
            // Unparseable frame sources are skipped, matching analyzeAnchor.
        }
    }

    /**
     * Extracts the base URL from the base tag and makes it the new
     * resolution base for subsequent relative links.
     *
     * @param b Base tag, currently only works for 'href'
     */
    void extractBase(String b) {
        String b2 = extract(b, "href");
        if (b2 != null) {
            try {
                base = new URL(base, b2);
            } catch (MalformedURLException ignored) {
                // A malformed base leaves the previous base in effect.
            }
        }
    }

    /**
     * Adds "url" to the list of URLs.
     *
     * @param url The url to be added
     */
    public void addURL(URL url) {
        urls.add(url);
        url_count++;
    }

    /**
     * Check to see if there are more elements.
     *
     * @return true if the list has more elements, false otherwise
     */
    public boolean hasMoreElements() {
        return url_count != next_url;
    }

    /**
     * Return the next element in the list.
     *
     * @return The next element in the list
     */
    public Object nextElement() {
        Object ob = urls.get(next_url);
        next_url++;
        return ob;
    }

    /**
     * Resets this enumeration.
     */
    public void reset() {
        next_url = 0;
    }

    /**
     * Returns the value in "line" associated with "key", or null if "key"
     * is not found. For instance, if line were {@code a href="blah blah blah"}
     * and "key" were "href" this method would return {@code blah blah blah}.
     *
     * Keys are case insensitive.
     *
     * @param line The source data to work with
     * @param key  The pattern to look for
     * @return the attribute value, or null when the key is absent/malformed
     */
    String extract(String line, String key) {
        line = line.replace('\'', '\"');  // some sites use ' instead of "
        try {
            key = key.toLowerCase();
            String lower_case = line.toLowerCase();
            int i = lower_case.indexOf(key);
            if (i < 0) return null;
            i += key.length();
            if (line.charAt(i) != '=') return null;
            i++;
            int i2;
            if (line.charAt(i) == '"') {
                i++;
                i2 = line.indexOf('"', i);
                // Unterminated quote: take everything to the end of the line.
                return (i2 < 0) ? line.substring(i) : line.substring(i, i2);
            }
            // Unquoted value: runs up to the first whitespace character.
            // BUG FIX: the original advanced 'i' instead of 'i2' in this
            // loop, so unquoted values always yielded an empty/invalid span.
            for (i2 = i; i2 < line.length(); i2++) {
                if (Character.isWhitespace(line.charAt(i2))) break;
            }
            return line.substring(i, i2);
        } catch (StringIndexOutOfBoundsException e) {
            // Key sits at the very end of the line with no value.
            return null;
        }
    }

    /**
     * Add '/' to a url whose ending is not a file.
     * Then remove the extra '/'s.
     * This is an ad-hoc fix, only works for "href".
     *
     * @param inUrl The url to examine
     * @return The fixed url
     */
    static public String fixUrl(String inUrl) {
        int docLoc = inUrl.lastIndexOf('.');
        int protLoc = inUrl.indexOf("://");
        int slashLoc;
        // Look for the first path slash after the protocol separator.
        if (protLoc > 0) slashLoc = inUrl.indexOf('/', protLoc + 3);
        else slashLoc = inUrl.indexOf('/');
        // No path, or the last '.' belongs to the host, not a file name.
        if (slashLoc < 0 || (slashLoc > 0 && docLoc < slashLoc)) {
            inUrl = inUrl + '/';
        }
        return removeExtraSlash(inUrl);
    }

    /**
     * Remove extra '/'s in a url (outside the protocol separator).
     *
     * @param inStr The url to examine
     * @return The fixed url
     */
    static public String removeExtraSlash(String inStr) {
        int slashLoc = inStr.indexOf("://");
        int l = inStr.length();
        int hold = slashLoc + 3;  // start scanning after "://"
        slashLoc = inStr.indexOf("//", hold);
        while (slashLoc > 0 && slashLoc < l) {
            hold = slashLoc;
            // Skip the whole run of consecutive slashes.
            while (hold < l && inStr.charAt(hold) == '/') hold++;
            if (hold >= l) {
                // Trailing slashes: keep exactly one and stop.
                inStr = inStr.substring(0, slashLoc + 1);
                break;
            } else {
                inStr = inStr.substring(0, slashLoc + 1) + inStr.substring(hold);
            }
            slashLoc = inStr.indexOf("//", hold);
            l = inStr.length();
        }
        return inStr;
    }

    /**
     * Print the string form of a url for a given url object.
     *
     * @param theUrl The url to be printed
     */
    static public void printAUrl(URL theUrl) {
        if (theUrl.getPort() > 0)
            System.out.println(theUrl.getProtocol() + "://" + theUrl.getHost()
                               + ":" + theUrl.getPort() + theUrl.getFile());
        else if (theUrl.getProtocol().compareToIgnoreCase("mailto") == 0)
            System.out.println(theUrl.getProtocol() + ":" + theUrl.getFile());
        else
            System.out.println(theUrl.getProtocol() + "://" + theUrl.getHost()
                               + theUrl.getFile());
    }

    /**
     * Print all urls in the list.
     */
    public void printAllUrls() {
        int count = urls.size();
        for (int i = 0; i < count; i++) {
            printAUrl(urls.get(i));
        }
    }

    /**
     * Return all urls from the list.
     *
     * @return url list
     */
    public ArrayList<URL> getAllUrls() {
        return urls;
    }

    /*
     * Simple test drive.
     */
    static public void main(String[] argv) throws Exception {
        if (argv.length != 2) {
            System.err.println("usage: java HTMLLinkExtractor [url] [file-to-process]");
            System.exit(1);
        }
        // argv[0] should be in the form of proto://host.domain/dir/file
        URL aUrl = new URL(argv[0]);
        File aFile = new File(argv[1]);
        InputStream in = new FileInputStream(aFile);
        StringBuilder page = new StringBuilder();  // was O(n^2) String +=
        try {
            for (int c = in.read(); c >= 0; c = in.read()) {
                page.append((char) c);
            }
        } finally {
            in.close();  // was never closed
        }
        HTMLLinkExtractor htmlExtractor = new HTMLLinkExtractor(page.toString(), aUrl);
        System.out.println("count of urls : " + htmlExtractor.urls.size());
        htmlExtractor.printAllUrls();
    }
}