//package ca.carleton.scs.text;

/**
 * Porter's algorithm to canonicalize an English word. The
 * algorithm removes suffix morphemes for plurals, participles, etc.
 *
 * This implementation is adapted from the CNIDR freeWAIS and
 * Harvest implementations of Porter's algorithm.
 *
 * All suffixes and vowels are compared against lower-case literals, so
 * upper-case letters never match a rule and never count as vowels.
 *
 * @author Darcy Quesnel
 * @version 1998 February
 */
public class Porter {

    /*
     * Every rule table below is a list of {suffix, replacement} pairs.
     * replaceEnd() applies only the FIRST pair whose suffix matches, so the
     * order of the pairs within a table is significant.
     */

    /* Step 1a: plurals. */
    private static final String[][] STEP_1A_RULES = {
        {"sses", "ss"}, {"ies", "i"}, {"ss", "ss"}, {"s", ""}
    };

    /* Step 1b, participles 0. */
    private static final String[][] STEP_1B0_RULES = {
        {"eed", "ee"}
    };

    /* Step 1b, participles 1. */
    private static final String[][] STEP_1B1_RULES = {
        {"ed", ""}, {"ing", ""}
    };

    /* Step 1b, participles 2: tidy up the stem left by participles 1. */
    private static final String[][] STEP_1B2_RULES = {
        {"at", "ate"}, {"bl", "ble"}, {"is", "ise"}, {"iz", "ize"},
        {"bb", "b"}, {"dd", "d"}, {"ff", "f"}, {"gg", "g"},
        {"mm", "m"}, {"nn", "n"}, {"pp", "p"}, {"rr", "r"},
        {"tt", "t"}, {"ww", "w"}, {"xx", "x"}
    };

    /* Step 1b, participles 3: the empty suffix always matches, so this appends an e. */
    private static final String[][] STEP_1B3_RULES = {
        {"", "e"}
    };

    /* Step 1c: change some y's to i's. */
    private static final String[][] STEP_1C_RULES = {
        {"y", "i"}
    };

    /* Step 2: double and triple suffixes. */
    private static final String[][] STEP_2_RULES = {
        {"ational", "ate"}, {"tional", "tion"}, {"enci", "ence"},
        {"anci", "ance"}, {"iser", "ise"}, {"izer", "ize"},
        {"abli", "able"}, {"alli", "al"}, {"entli", "ent"},
        {"eli", "e"}, {"ousli", "ous"}, {"isation", "ise"},
        {"ization", "ize"}, {"ation", "ate"}, {"ator", "ate"},
        {"alism", "al"}, {"iveness", "ive"}, {"fulness", "ful"},
        {"ousness", "ous"}, {"aliti", "al"}, {"iviti", "ive"},
        {"biliti", "ble"}
    };

    /* Step 3: more double and triple suffixes. */
    private static final String[][] STEP_3_RULES = {
        {"icate", "ic"}, {"ative", ""}, {"alise", "al"}, {"alize", "al"},
        {"iciti", "ic"}, {"ful", ""}, {"ness", ""}
    };

    /* Step 4: single suffixes on polysyllables. */
    private static final String[][] STEP_4_RULES = {
        {"al", ""}, {"ance", ""}, {"ence", ""}, {"er", ""}, {"ic", ""},
        {"able", ""}, {"ible", ""}, {"ant", ""}, {"ement", ""},
        {"ment", ""}, {"ent", ""}, {"sion", "s"}, {"tion", "t"},
        {"ou", ""}, {"ism", ""}, {"ate", ""}, {"iti", ""}, {"ous", ""},
        {"ive", ""}, {"ise", ""}, {"ize", ""}
    };

    /* Step 5a: remove some final e's. */
    private static final String[][] STEP_5A_RULES = {
        {"e", ""}
    };

    /* Step 5b: remove some final double l's. */
    private static final String[][] STEP_5B_RULES = {
        {"ll", "l"}
    };

    /* Class not to be instantiated. */
    private Porter() {
    }

    /* Tells whether c is one of a, e, i, o, u. */
    private static boolean isPlainVowel(char c) {
        return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u';
    }

    /* Tells whether c is one of a, e, i, o, u, y. */
    private static boolean isVowelOrY(char c) {
        return isPlainVowel(c) || c == 'y';
    }

    /**
     * Counts the number of syllables in a word. Disregards an initial
     * consonant and a trailing vowel: a syllable is counted at every
     * position where a vowel (a, e, i, o, u or y) is immediately followed
     * by a non-vowel.
     *
     * @param word the String whose syllables are counted
     * @return the number of syllables in the given String (0 for an empty
     *         or one-character String)
     */
    public static int countSyllables(String word) {
        int syllables = 0;
        for (int i = 0; i < word.length() - 1; i++) {
            if (isVowelOrY(word.charAt(i)) && !isVowelOrY(word.charAt(i + 1))) {
                syllables++;
            }
        }
        return syllables;
    }

    /**
     * Recognizes a vowel in a given word. The letters a, e, i, o and u
     * always count. A y counts when it directly follows a character that
     * is not one of a, e, i, o, u, or when it is directly followed by a
     * character that is not a vowel or y.
     *
     * @param word the String in which to search for a vowel
     * @return whether the given String contains a vowel
     */
    public static boolean containsVowel(String word) {
        int length = word.length();
        for (int i = 0; i < length; i++) {
            char current = word.charAt(i);
            if (isPlainVowel(current)) {
                return true;
            }
            if (i + 1 < length) {
                char next = word.charAt(i + 1);
                if (next == 'y') {
                    // y preceded by something other than a, e, i, o, u.
                    return true;
                }
                if (current == 'y' && !isVowelOrY(next)) {
                    // y followed by a consonant.
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Recognizes a trailing consonant-vowel-consonant (CVD). The
     * last consonant doesn't include w, x, or y.
     *
     * In effect the test is: the last character is a consonant other than
     * w, x or y, and the character before it is a vowel (a, e, i, o, u or
     * y). The leading consonant of the pattern is not separately enforced
     * (so "at" is accepted); this matches the historical behavior of this
     * method, which step 1b and step 5a of {@link #canonicalize} both rely
     * on. Strings shorter than two characters, including the empty String,
     * are never CVD.
     *
     * @param word the String in which to search for CVD
     * @return whether the given String contains CVD
     */
    public static boolean containsCVD(String word) {
        int length = word.length();
        if (length < 2) {
            // Nothing to inspect; previously the empty String caused a
            // StringIndexOutOfBoundsException here.
            return false;
        }
        char last = word.charAt(length - 1);
        if (isVowelOrY(last) || last == 'w' || last == 'x') {
            return false;
        }
        return isVowelOrY(word.charAt(length - 2));
    }

    /**
     * Applies the first appropriate rule to the given word, without any
     * requirement on the number of syllables in the stem.
     *
     * @param word the word to be transformed
     * @param rules the {suffix, replacement} rules with which to transform
     *        the word
     * @return the transformed word
     */
    protected static String replaceEnd(String word, String rules[][]) {
        return replaceEnd(word, rules, -1);
    }

    /**
     * Applies the first appropriate rule to the given word. Ensures
     * the stem has at least the given number of syllables before
     * applying. Only the first rule whose suffix matches is considered:
     * if its stem is too short the word is returned unchanged and later
     * rules are not tried.
     *
     * @param word the word to be transformed
     * @param rules the {suffix, replacement} rules with which to transform
     *        the word
     * @param syllables the number of syllables the stem must have
     * @return the transformed word
     */
    protected static String replaceEnd(String word, String rules[][],
                                       int syllables) {
        for (int i = 0; i < rules.length; i++) {
            String suffix = rules[i][0];
            if (word.endsWith(suffix)) {
                String stem =
                    word.substring(0, word.length() - suffix.length());
                if (syllables <= countSyllables(stem)) {
                    return stem + rules[i][1];
                }
                return word;
            }
        }
        return word;
    }

    /**
     * Canonicalizes an English word. (This is the main exported method of
     * this class.)
     *
     * @param word the String to be canonicalized; expected in lower case
     * @return the canonical version of the given String
     */
    public static String canonicalize(String word) {
        // The steps are numbered similar to previous implementations
        // and, apparently, Porter's original article.

        // Step 1a
        if (word.endsWith("s")) {
            word = replaceEnd(word, STEP_1A_RULES);
        }

        // Step 1b
        if (word.endsWith("eed")) {
            word = replaceEnd(word, STEP_1B0_RULES, 1);
        } else if (word.endsWith("ed") || word.endsWith("ing")) {
            String stem = replaceEnd(word, STEP_1B1_RULES);
            // Only strip the participle ending when a vowel remains in the
            // stem; otherwise words such as "sing", "bed" or "ed" would be
            // reduced to "s", "b" or the empty String.
            if (containsVowel(stem)) {
                word = replaceEnd(stem, STEP_1B2_RULES);
                if (containsCVD(word)) {
                    word = replaceEnd(word, STEP_1B3_RULES);
                }
            }
        }

        // Step 1c
        if (word.endsWith("y")
                && containsVowel(word.substring(0, word.length() - 1))) {
            word = replaceEnd(word, STEP_1C_RULES);
        }

        // Step 2
        word = replaceEnd(word, STEP_2_RULES, 1);

        // Step 3
        word = replaceEnd(word, STEP_3_RULES, 1);

        // Step 4. The table is applied twice, so a second suffix exposed by
        // the first pass is also removed ("electrical" -> "electric" ->
        // "electr").
        word = replaceEnd(word, STEP_4_RULES, 2);
        word = replaceEnd(word, STEP_4_RULES, 2);

        // Step 5a
        if (word.endsWith("e")) {
            word = replaceEnd(word, STEP_5A_RULES, 2);
            if (word.endsWith("e")
                    && containsCVD(word.substring(0, word.length() - 1))) {
                word = replaceEnd(word, STEP_5A_RULES);
            }
        }

        // Step 5b
        if (word.endsWith("ll")) {
            word = replaceEnd(word, STEP_5B_RULES, 1);
        }

        return word;
    }
}