import java.util.Vector;
import java.util.StringTokenizer;
import java.util.ArrayList;
import java.net.URL;
import java.net.MalformedURLException;
import java.io.InputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
/** Written by Tim Macinta 1997
* Distributed under the GNU Public License
* (a copy of which is enclosed with the source).
*
* This LinkExtractor can extract URLs from HTML files.
*
*
* Revised or added comments to some sections of code.
*
* Modified by Xiannong Meng to fix the finite state machine
* to recognize urls containing white spaces.
* April 2005
*
* Revised 2013-04-27
* Change the use of Vector to ArrayList
*/
//public class HTMLLinkExtractor implements LinkExtractor {
public class HTMLLinkExtractor {
ArrayList
*
* @param thisPage The web page being processed
* @param base_url The url base for this web page (i.e., relative base)
*/
/**
 * Scans an HTML page character by character and passes the collected text
 * of each candidate tag to {@code analyze}.
 *
 * The scan is a five-state machine:
 *   TEXT         - outside any tag; waits for '<'.
 *   TAG_OPEN     - inside a tag; waits for an 'a'/'A' (start collecting) or
 *                  for '>' (tag ended).
 *   TAG_NAME     - an 'a'/'A' was seen; waits for the whitespace that
 *                  separates the tag name from its attributes.
 *   BEFORE_ATTRS - skips the run of whitespace after the tag name.
 *   IN_ATTRS     - collects attribute text with ALL whitespace removed, so
 *                  that URLs containing white space are kept as one token;
 *                  '>' ends the tag and triggers {@code analyze}.
 *
 * Differences from the previous implementation:
 *   - every character of the page is examined (the last one used to be read
 *     but never dispatched, truncating an unterminated trailing tag);
 *   - '>' now also closes the tag in TAG_NAME and BEFORE_ATTRS, so tags such
 *     as {@code <a>} or {@code <abbr>} no longer swallow the text that
 *     follows them;
 *   - a null page is treated like an empty one instead of throwing a
 *     NullPointerException.
 *
 * @param thisPage The web page being processed; null or empty means there
 *                 is nothing to scan
 * @param base_url The url base for this web page (i.e., relative base)
 * @throws IOException declared by the original signature; presumably
 *                     propagated from {@code analyze} - confirm against
 *                     that method
 */
public HTMLLinkExtractor(String thisPage, URL base_url) throws IOException {
    this.base = base_url;
    if (thisPage == null || thisPage.length() == 0) {
        return;
    }

    // State codes keep the numeric values used by the original machine.
    final int TEXT = 0;
    final int TAG_OPEN = '<';
    final int TAG_NAME = 'a';
    final int BEFORE_ATTRS = '+';
    final int IN_ATTRS = '-';

    // Local, single-threaded buffer: StringBuilder is sufficient.
    StringBuilder tag = new StringBuilder();
    int state = TEXT;
    final int pageLength = thisPage.length();

    for (int pos = 0; pos < pageLength; pos++) {
        final char ch = thisPage.charAt(pos);
        switch (state) {
            case TEXT:
                if (ch == '<') {
                    state = TAG_OPEN;
                }
                break;

            case TAG_OPEN:
                if (ch == '>') {
                    // Nothing is buffered in this state, so analyze
                    // receives an empty string (unchanged behaviour).
                    state = TEXT;
                    analyze(tag.toString());
                    tag.setLength(0);
                } else if (ch == 'a' || ch == 'A') {
                    state = TAG_NAME;
                    tag.append(ch);
                }
                break;

            case TAG_NAME:
                if (ch == '>') {
                    // Tag closed before any attribute text was collected:
                    // drop the buffered letter and go back to plain text.
                    state = TEXT;
                    tag.setLength(0);
                } else if (Character.isWhitespace(ch)) {
                    // Keep the separator between the tag name and the
                    // attributes; it is the only whitespace retained.
                    state = BEFORE_ATTRS;
                    tag.append(ch);
                }
                break;

            case BEFORE_ATTRS:
                if (ch == '>') {
                    // Only the tag letter and a separator are buffered;
                    // there are no attributes to analyze.
                    state = TEXT;
                    tag.setLength(0);
                } else if (!Character.isWhitespace(ch)) {
                    state = IN_ATTRS;
                    tag.append(ch);
                }
                break;

            case IN_ATTRS:
                if (ch == '>') {
                    state = TEXT;
                    analyze(tag.toString());
                    tag.setLength(0);
                } else if (!Character.isWhitespace(ch)) {
                    // Whitespace inside the attribute text is dropped on
                    // purpose (URLs containing white space).
                    tag.append(ch);
                }
                break;

            default:
                // Unreachable: state only ever holds one of the codes above.
                break;
        }
    }

    // The page ended inside a tag: analyze whatever was collected so far.
    if (tag.length() > 0) {
        analyze(tag.toString());
    }
}
/**
* Analyzes "param", which should be the contents between a '<' and a '>',
* and adds any URLs that are found to the list of URLs.