/**
 * Some utility methods to help work with URLs.
 *
 * <p>An instance is an immutable holder for the components of one URL
 * (protocol, host, port, directory and file name). The static methods
 * validate, split, normalize and complete URL strings.
 *
 * @author Xiannong Meng
 * @date April 2005, revised October 2006 for CSCI 335.
 */
public class UrlUtil {

    /** Port assumed when a URL does not name one. */
    private static final int DEFAULT_PORT = 80;

    /** Protocol assumed when a URL does not name one. */
    private static final String DEFAULT_PROTOCOL = "http";

    /** File name appended to URLs that end in a directory ('/'). */
    private static final String DEFAULT_FILE = "index-test.html";

    /** Strings shorter than this are rejected by {@link #isValidUrl(String)}. */
    private static final int MIN_URL_LENGTH = 10;

    // Data members.
    private final String thisHost;
    private final int thisPort;
    private final String thisDir;
    private final String thisProt;
    private final String thisFile;

    /**
     * Default constructor: no host, port 80, protocol "http", directory "/"
     * and the default file name.
     */
    public UrlUtil() {
        thisHost = null;
        thisPort = DEFAULT_PORT;
        thisProt = DEFAULT_PROTOCOL;
        thisDir = "/";
        thisFile = DEFAULT_FILE;
    }

    /**
     * Alternate constructor; stores the given components as they are.
     *
     * @param host       host name
     * @param port       port number
     * @param currentDir directory part of the path
     * @param prot       protocol (scheme), e.g. "http"
     * @param fileName   file part of the path
     */
    public UrlUtil(String host, int port, String currentDir, String prot,
                   String fileName) {
        thisHost = host;
        thisPort = port;
        thisDir = currentDir;
        thisProt = prot;
        thisFile = fileName;
    }

    // access methods

    /** @return the protocol (scheme) of this URL */
    public String getProtocol() {
        return thisProt;
    }

    /** @return the host of this URL */
    public String getHost() {
        return thisHost;
    }

    /** @return the directory part of this URL's path */
    public String getDir() {
        return thisDir;
    }

    /** @return the port of this URL */
    public int getPort() {
        return thisPort;
    }

    /** @return the file part of this URL's path */
    public String getFileName() {
        return thisFile;
    }

    /**
     * Check to see if this is a valid url format. We don't care about the
     * specific protocols, hostnames, port numbers and directories, as long
     * as the format is one of
     * <pre>
     *   proto://host:port/dir/file
     *   mailto:someone@somewhere
     *   somedir/somefile            (without protocol, port, host)
     * </pre>
     * Strings shorter than 10 characters are always rejected.
     *
     * @param inUrl the string to check; {@code null} is treated as invalid
     * @return {@code true} if the string matches one of the formats above
     */
    public static boolean isValidUrl(String inUrl) {
        if (inUrl == null || inUrl.length() < MIN_URL_LENGTH) {
            return false;
        }

        // first to see if it is a mailto (length check above makes the
        // substring safe)
        if ("mailto".equalsIgnoreCase(inUrl.substring(0, 6))
                && inUrl.indexOf("@") > 6) {
            return true;
        }

        // then check the rest: if not relative, it has to have a protocol
        // and something after the "//"
        int colonLoc = inUrl.indexOf(":");
        int slashLoc = inUrl.lastIndexOf("//");
        if (colonLoc > 0 && colonLoc < slashLoc
                && slashLoc < inUrl.length() - 2) {
            return true;
        }

        // neither ':' nor "//" present: relative url
        return colonLoc < 0 && slashLoc < 0;
    }

    /**
     * Some URLs come with extra slashes, e.g.
     * {@code http://www.host.com//dir/file}; collapse every run of slashes
     * that follows the "://" separator into a single slash.
     *
     * <p>The search starts three characters after the index of "://". When
     * the string has no "://" that index is -1, so the search starts at
     * offset 2 (kept for compatibility with earlier behavior).
     *
     * @param inStr the URL to clean up; must not be {@code null}
     * @return the URL with repeated slashes collapsed
     */
    public static String removeExtraSlash(String inStr) {
        int searchFrom = inStr.indexOf("://") + 3;
        int firstDouble = inStr.indexOf("//", searchFrom);
        if (firstDouble < 0) {
            return inStr; // nothing to collapse
        }

        int len = inStr.length();
        StringBuilder out = new StringBuilder(len);
        out.append(inStr, 0, firstDouble);

        // Single pass over the remainder: drop any '/' that directly follows
        // a '/' already copied from this region.
        boolean prevWasSlash = false;
        for (int i = firstDouble; i < len; i++) {
            char c = inStr.charAt(i);
            boolean isSlash = (c == '/');
            if (!(isSlash && prevWasSlash)) {
                out.append(c);
            }
            prevWasSlash = isSlash;
        }
        return out.toString();
    }

    /**
     * Given an incomplete url (partial, relative ...), construct a complete
     * one from the given url information.
     *
     * <p>If {@code inUrl} contains a ':' after its first character, the text
     * before it is taken as the protocol. A supported protocol means the URL
     * is absolute and its own components replace {@code thisUrlInfor}; an
     * unsupported protocol yields {@code null}. Otherwise {@code inUrl} is
     * treated as a path relative to {@code thisUrlInfor}. The section tag
     * ('#...') is removed, the default file name is appended when the result
     * ends with '/', and extra slashes are collapsed.
     *
     * @param inUrl        the url to complete; must not be {@code null}
     * @param thisUrlInfor host, port, protocol and dir used to resolve a
     *                     relative {@code inUrl}
     * @return the completed url, or {@code null} if the protocol is not
     *         supported
     */
    public static String fixUrl(String inUrl, UrlUtil thisUrlInfor) {
        String path;

        String thisProto = extractProtocol(inUrl);
        if (thisProto == null) {
            path = inUrl; // relative url
        } else if (isValidProtocol(thisProto)) {
            // not a relative url: revise thisUrlInfor from the url itself
            thisUrlInfor = UrlUtil.extractUrlInfor(inUrl);
            path = thisUrlInfor.getDir() + thisUrlInfor.getFileName();
        } else {
            return null;
        }

        StringBuilder completed = new StringBuilder();
        completed.append(thisUrlInfor.getProtocol()).append("://");
        completed.append(thisUrlInfor.getHost());
        if (thisUrlInfor.getPort() != DEFAULT_PORT) {
            completed.append(':').append(thisUrlInfor.getPort());
        }
        completed.append(convertRelativePath(path, thisUrlInfor));
        String completedUrl = completed.toString();

        // remove the section tag '#'
        int sectionLoc = completedUrl.lastIndexOf("#");
        if (sectionLoc > 0) {
            completedUrl = completedUrl.substring(0, sectionLoc);
        }

        // attach the default file name if the url ends with '/'
        if (completedUrl.endsWith("/")) {
            completedUrl += DEFAULT_FILE;
        }

        // remove the annoying '//' in the middle of a url
        return removeExtraSlash(completedUrl);
    }

    /**
     * Resolve a path against the directory of {@code thisUrlInfor} and
     * collapse its "/./" and "/../" segments.
     *
     * <p>Precondition: {@code rPath} is a path that doesn't have a protocol
     * and a host. A path that does not start with '/' is appended to
     * {@code thisUrlInfor.getDir()} first. A "/../" with no preceding
     * directory is simply dropped.
     *
     * @param rPath        the path to convert
     * @param thisUrlInfor supplies the base directory for relative paths
     * @return the converted path
     */
    public static String convertRelativePath(String rPath,
                                             UrlUtil thisUrlInfor) {
        if (!rPath.startsWith("/")) {
            // a relative path: anchor it at the current directory
            rPath = thisUrlInfor.getDir() + rPath;
        }

        // collapse all occurrences of "/./"
        int i = rPath.indexOf("/./");
        while (i >= 0) {
            rPath = rPath.substring(0, i) + rPath.substring(i + 2);
            i = rPath.indexOf("/./");
        }

        // collapse all occurrences of "/../" (by removing preceding directory)
        i = rPath.indexOf("/../");
        while (i >= 0) {
            int i2 = rPath.lastIndexOf('/', i - 1);
            if (i2 < 0) {
                i2 = i; // no preceding directory to remove
            }
            rPath = rPath.substring(0, i2) + rPath.substring(i + 3);
            i = rPath.indexOf("/../");
        }

        return rPath;
    }

    /**
     * Extract the protocol part of a URL.
     *
     * @param inUrl the url to inspect; must not be {@code null}
     * @return the text before the first ':' when that ':' is not the first
     *         character, otherwise {@code null}
     */
    public static String extractProtocol(String inUrl) {
        int loc = inUrl.indexOf(":");
        // loc > 0: the url includes a protocol
        return (loc > 0) ? inUrl.substring(0, loc) : null;
    }

    /**
     * Check to see if this is a valid protocol. This one only responds to
     * 'http' (case-insensitive). It can easily add others.
     *
     * @param inProto the protocol name; must not be {@code null}
     * @return {@code true} if the protocol is supported
     */
    public static boolean isValidProtocol(String inProto) {
        return inProto.equalsIgnoreCase("http");
    }

    /**
     * Extract each of the components out of a url string. The 'inUrl' is
     * expected to have the form of {@code http://host.domain:port/dir}.
     *
     * <p>Missing components take defaults: protocol "http", port 80,
     * directory "/" and file "/". A port is recognized when a ':' follows the
     * start of the host and at most five characters separate it from the
     * first '/' (or from the end of the string when there is no '/'). If
     * that text is not a number, the default port is used and the text stays
     * part of the host.
     *
     * @param inUrl the url to split; must not be {@code null}
     * @return a new {@code UrlUtil} holding the extracted components
     */
    public static UrlUtil extractUrlInfor(String inUrl) {
        String localProt = DEFAULT_PROTOCOL;
        int hostLoc = 0;
        int protLoc = inUrl.indexOf("://");
        if (protLoc >= 0) {
            hostLoc = protLoc + 3;
            localProt = inUrl.substring(0, protLoc);
        }

        int portLoc = inUrl.indexOf(':', hostLoc);
        int dirLoc = inUrl.indexOf('/', hostLoc);
        // The host[:port] part ends at the first '/' or, when the url has no
        // path at all, at the end of the string.
        int authorityEnd = (dirLoc >= 0) ? dirLoc : inUrl.length();

        String localHost = inUrl.substring(hostLoc, authorityEnd);
        int localPort = DEFAULT_PORT;
        if (portLoc >= 0 && authorityEnd > portLoc
                && authorityEnd <= portLoc + 6) {
            try {
                localPort = Integer.parseInt(
                        inUrl.substring(portLoc + 1, authorityEnd).trim());
                localHost = inUrl.substring(hostLoc, portLoc);
            } catch (NumberFormatException ignored) {
                // Not a numeric port: keep the default port and leave the
                // whole host[:text] as the host rather than failing.
            }
        }

        String localDir = "/";
        String localFile = "/";
        if (dirLoc >= 0) {
            if (inUrl.endsWith("/")) {
                localDir = inUrl.substring(dirLoc);
            } else {
                // need to split off the trailing file name, e.g. /in.html
                int lastSlash = inUrl.lastIndexOf('/');
                localDir = inUrl.substring(dirLoc, lastSlash + 1);
                localFile = inUrl.substring(lastSlash + 1);
            }
        }

        return new UrlUtil(localHost, localPort, localDir, localProt,
                           localFile);
    }

    /**
     * Print the protocol, host, port and directory of this URL to standard
     * output, one per line.
     */
    public void print() {
        System.out.println("prot: " + thisProt);
        System.out.println("host: " + thisHost);
        System.out.println("port: " + thisPort);
        System.out.println("dir : " + thisDir);
    }

    /**
     * Some simple sanity check.
     *
     * @param argvs exactly one argument: the url to fix
     */
    public static void main(String[] argvs) {
        String sampleUrl =
            "http://www.eg.bucknell.edu:1234/home/dir/index.html";
        UrlUtil infor = UrlUtil.extractUrlInfor(sampleUrl);
        infor.print();

        // examples of input urls
        // http://www.cnn.com/level1/level2/../index.html
        // http://www.cnn.com/level1/level2/../../index.html
        // http://www.cnn.com/../../index.html
        if (argvs.length != 1) {
            System.err.println("usage: java UrlUtil url");
            System.exit(1);
        }

        sampleUrl = "http://polaris.bucknell.edu/hello";
        infor = UrlUtil.extractUrlInfor(sampleUrl);
        infor.print();

        sampleUrl = "ftp://polaris.bucknell.edu";
        infor = UrlUtil.extractUrlInfor(sampleUrl);
        infor.print();

        sampleUrl = "http://www.eg.bucknell.edu:1234/home/dir/index.html";
        infor = UrlUtil.extractUrlInfor(sampleUrl);
        infor.print();

        String samplePath = "./level1/../level2/path3/../path4/./index.html";
        System.out.println("new url:"
                           + convertRelativePath(samplePath, infor));
        samplePath = "level1/level2/../index.html";
        System.out.println("new url:"
                           + convertRelativePath(samplePath, infor));
        samplePath = "/level1/../level2/index.html";
        System.out.println("new url:"
                           + convertRelativePath(samplePath, infor));

        System.out.println("orig url: " + argvs[0]);
        System.out.println("fixed url: " + fixUrl(argvs[0], infor));
    }
}