/*
 * HTMLLinkExtractor.java
 */
//package seme.net;
import java.util.Vector;
import java.util.StringTokenizer;
import java.util.ArrayList;
import java.net.URL;
import java.net.MalformedURLException;
import java.io.InputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
/** Written by Tim Macinta 1997
* Distributed under the GNU Public License
* (a copy of which is enclosed with the source).
*
* This LinkExtractor can extract URLs from HTML files.
*
* modified by Xiannong Meng to fix the finite state machine
* to recognize urls containing white spaces.
* April 2005
* modified by Xiannong Meng to put it into package seme.net; May 2006
*/
//public class HTMLLinkExtractor implements LinkExtractor {
public class HTMLLinkExtractor {
Vector
* Keys are case insensitive.
*/
/**
 * Extracts the value of an HTML attribute from a line of markup.
 *
 * The attribute name is matched case-insensitively.  The value may be
 * double-quoted ("...") or bare; a bare value ends at the first
 * whitespace character (or at the end of the line).
 *
 * @param line the line of HTML to search
 * @param key  the attribute name (e.g. "href")
 * @return the attribute value, or null when the attribute is absent
 *         or the surrounding markup is malformed
 */
String extract(String line, String key) {
    try {
        key = key.toLowerCase();
        String lowerLine = line.toLowerCase();
        int i = lowerLine.indexOf(key);
        if (i < 0) return null;
        i += key.length();
        // The attribute name must be immediately followed by '='.
        if (line.charAt(i) != '=') return null;
        i++;
        int i2;
        if (line.charAt(i) == '"') {
            // Quoted value: everything up to the closing quote,
            // or to end-of-line when the quote is never closed.
            i++;
            i2 = line.indexOf('"', i);
            if (i2 < 0) {
                return line.substring(i);
            } else {
                return line.substring(i, i2);
            }
        } else {
            // Bare value: starts at i2, ends at the first whitespace (or EOL).
            int targ = line.length();
            for (i2 = i; i < targ; i++) {
                if (Character.isWhitespace(line.charAt(i))) break;
            }
            // BUG FIX: was substring(i, i2), which is backwards -- i2 marks
            // the start of the value and i its end, so the original always
            // threw (and silently swallowed) StringIndexOutOfBoundsException
            // here and returned null for every unquoted attribute value.
            return line.substring(i2, i);
        }
    } catch (StringIndexOutOfBoundsException ignored) {
        // Malformed markup near end-of-line (e.g. attribute name with no
        // '=' before the line ends); treat the same as "not found".
    }
    return null;
}
/**
 * Ad-hoc normalization for an "href" URL: appends a trailing '/' when the
 * URL appears to name a site rather than a document (no path slash at all,
 * or the last '.' falls before the first path slash), then collapses any
 * duplicated slashes.  Only intended for "href" attribute values.
 *
 * @param inUrl the raw URL string
 * @return the normalized URL
 */
static public String fixUrl(String inUrl)
{
    final int lastDot = inUrl.lastIndexOf('.');
    final int schemeAt = inUrl.indexOf("://");
    // Look for the first path slash, skipping the "://" separator if present.
    final int firstSlash = (schemeAt > 0)
            ? inUrl.indexOf('/', schemeAt + 3)
            : inUrl.indexOf('/');
    String fixed = inUrl;
    if (firstSlash < 0 || (firstSlash > 0 && lastDot < firstSlash)) {
        // No document part detected -- terminate the site URL with '/'.
        fixed = fixed + '/';
    }
    return removeExtraSlash(fixed);
}
/**
 * Collapses each run of consecutive '/' characters appearing after the
 * protocol separator ("://") down to a single '/'.
 *
 * @param inStr a URL string (assumed valid)
 * @return the URL with duplicated slashes removed
 *
 * Precondition: inStr is a valid url.
 * Postcondition: extra slashes are removed from the url.
 */
static public String removeExtraSlash(String inStr)
{
    String url = inStr;
    // Start scanning just past "://" so the protocol separator survives.
    // (With no "://" present, indexOf yields -1 and scanning starts at 2.)
    int scanFrom = url.indexOf("://") + 3;
    int run = url.indexOf("//", scanFrom);
    while (run > 0 && run < url.length())
    {
        // Locate the end of this run of slashes.
        int end = run;
        while (end < url.length() && url.charAt(end) == '/')
            end++;
        if (end >= url.length())
        {
            // Trailing run: keep a single slash and stop.
            url = url.substring(0, run + 1);
            break;
        }
        // Keep one slash, drop the rest of the run.
        url = url.substring(0, run + 1) + url.substring(end);
        // NOTE: the next search deliberately resumes from the pre-collapse
        // index `end`, matching the original method's scanning behavior.
        run = url.indexOf("//", end);
    }
    return url;
}
/**
 * Print a given url in its completion, protocol://domain/file.
 * A mailto url prints as protocol:file; an explicit port, when present,
 * is included after the host.
 *
 * @param theUrl is a valid URL object.
 *
 * Precondition: a valid URL object is given.
 * Postcondition: the given url is printed to standard output.
 */
static public void printAUrl(URL theUrl)
{
    final String protocol = theUrl.getProtocol();
    final String rendered;
    if (theUrl.getPort() > 0)
    {
        rendered = protocol + "://" + theUrl.getHost() + ":"
                 + theUrl.getPort() + theUrl.getFile();
    }
    else if (protocol.equalsIgnoreCase("mailto"))
    {
        // mailto urls have no host part worth printing.
        rendered = protocol + ":" + theUrl.getFile();
    }
    else
    {
        rendered = protocol + "://" + theUrl.getHost() + theUrl.getFile();
    }
    System.out.println(rendered);
}
/**
 * Print all urls held currently in the link extractor, in storage order.
 *
 * Precondition: the link extractor holds a set of valid urls.
 * Postcondition: these urls are printed to standard output.
 */
public void printAllUrls()
{
    // `urls` is presumably populated elsewhere in this class -- each
    // element is expected to be a java.net.URL.
    for (Object entry : urls)
    {
        printAUrl((URL) entry);
    }
}
/**
* Return the set of urls held by this link extractor.
*/
public Vector