#!/usr/local/bin/perl -w

## (c) Steffi Bruninghaus, 1999
##     University of Pittsburgh
## 
## This is a straight-forward Perl implementation of Porter's stemming 
## algorithm. Given a word from the command line, the program prints the
## stem to standard out. The algorithm is not perfect, though - international
## becomes intern, for instance. 

## Thanks a lot to Armel Asseling for finding and correcting two errors!



$word = lc($ARGV[0]);
print "Word before stemming: $word \n";

&calculate_m;
&step_1;
&calculate_m;
&step_2;
&calculate_m; 
&step_3;
&calculate_m; 
&step_4;
&calculate_m; 
&step_5;
&calculate_m; 
&step_6;
&step_7; 

print "Word after stemming: $word \n";
       

sub calculate_m {

    if ($word =~ 
	/(([aeiou]+|y[aeiou]*)([^aeiou]+|y[^aeiouy]*)){4,}/) 
      {
	  $m = 4; 
      }
    elsif
      ($word =~ 
       /(([aeiou]+|y[aeiou]*)([^aeiou]+|y[^aeiouy]*)){3,}/)
        {
	    $m = 3; 
        }
    elsif 
      ($word =~	       
       /(([aeiou]+|y[aeiou]*)([^aeiou]+|y[^aeiouy]*)){2,}/)
	{
	    $m = 2; 
	}
    elsif  
      ($word =~	       
       /(([aeiou]+|y[aeiou]*)([^aeiou]+|y[^aeiouy]*)){1,}/)
        {				
	    $m = 1; 
	}		       
    else {
	$m = 0; 
    }
}


sub step_1 {
    if ($word =~ /sses$/) {
	$word =~ s/sses$/ss/;
    }			 
    elsif ($word =~ /ies$/) {
	$word =~ s/ies$/i/;  
    }		       
    elsif ($word =~ /ss$/) {
    }
    elsif ($word =~ /s$/) {
	$word =~ s/s$//; 
    }
    elsif (($word =~ /^(.*?)eed$/) && ($m > 2)) {
	$word =~ s/eed$/ee/;
    }
    elsif ((($word =~ /(.)(.)ed$/) && ($1 eq $2)) ||
	   ($word =~ /(.)+([aeiou]|y)(.)+ed$/)) {
	$word =~ s/ed$/D/; 
    }
    elsif  ((($word =~ /(.)(.)ing$/) && ($1 eq $2)) ||
	    ($word =~ /(.)+([aeiou]|y)(.)+ing$/)) {
	$word =~ s/ing$/D/;
    }
    elsif ($word =~ /(.)+([aeiou]|y)(.)+ing$/) {
	$word =~ s/ing$/S/; 
    }	
    elsif ($word =~ /pezrl(.)+([aeiou]|y)(.)+ed$/) {
	$word =~ s/ed$/S/; 
    }	
    elsif ($word =~ /(.)+([aeiou]|y)(.)+y$/) {
	$word =~ s/y/i/; 
    }
}    


sub step_2 {
    if ($word =~ /atS/) {
	$word =~ s/atS/ate/;
    }
    elsif ($word =~ /blS/) {
	$word =~ s/blS/ble/;
    }
    elsif ($word =~ /[^aeiouslz]yD/ ||
	   $word =~ /[^szl][aeiou]D/) {
	$word =~ s/.D//;
    }
    elsif ($word =~ /[^aeiou][aeiouy][^aeiouyxw]S/ 
	   && $m == 1) {
	$word =~ s/S/e/;       
    }
    elsif ($word =~ /S/) {
	$word =~ s/S//; 
    }
    elsif ($word =~ /D/) {
	$word =~ s/D//; 
    }
}
	    

sub step_3 {
	    
    if ($word =~ /ational$/ && $m >= 4) {
	$word =~ s/ational$/ate/; 
    }
    elsif ($word =~ /tional$/ && $m >= 3) {
	$word =~ s/tional$/tion/;
    }
    elsif ($word =~ /enci$/ && $m >= 2) {
	$word =~ s/enci/eence/; 
    }
    elsif ($word =~ /anci$/ && $m >= 2) {
	$word =~ s/anci$/ance/; 
    }
    elsif ($word =~ /izer$/ && $m >= 2) {
	$word =~ s/izer$/ize/; 
    }
    elsif ($word =~ /abli$/ && $m >= 2) {
	$word =~ s/abli/able/; 
    }
    elsif ($word =~ /alli$/ && $m >= 2) {
	$word =~ s/alli$/al/; 
    }			
    elsif ($word =~ /entli$/ && $m >= 2) {
	$word =~ s /entli$/ent/;
    }
    elsif ($word =~ /eli$/ && $m >= 2) {
	$word =~ s/eli$/e/; 
    }
    elsif ($word =~ /ousli$/ && $m >= 2) {
	$word =~ s/ousli$/ous/;
    }
    elsif ($word =~ /ization$/ && $m >= 4) {
	$word =~ s/ization$/ize/;
    }
    elsif ($word =~ /ation$/ && $m >= 3) {
	$word =~ s/ation$/ate/;
    }
    elsif ($word =~ /ator$/ && $m >= 3) {
	$word =~ s/ator$/ate/;
    }
    elsif ($word =~ /alism$/ && $m >= 3) {
	$word =~ s/alism$/al/;
    }
    elsif ($word =~ /iveness$/ && $m >= 4) {
	$word =~ s/iveness$/ive/;
    }
    elsif ($word =~ /fulness$/ && $m >= 3) {
	$word =~ s/fulness$/ful/;
    }
    elsif ($word =~ /ousness$/ && $m >= 3) {
	$word =~ s/ousness$/ous/;
    }
    elsif ($word =~ /aliti$/ && $m >= 3) {
	$word =~ s/aliti$/al/;
    }
    elsif ($word =~ /iviti$/ && $m >= 3) {
	$word =~ s/iviti$/ive/;
    }
    elsif ($word =~ /biliti$/ && $m >= 3) {
	$word =~ s/biliti$/ble/;
    }
}			


sub step_4 {		      

    if ($word =~ /icate$/ && $m >= 3) {
	$word =~ s/icate$/ic/;
    }
    elsif ($word =~ /ative$/ && $m >= 3) {
	$word =~ s/ative$//;
    }
    elsif ($word =~ /alize$/ && $m >= 3) {
	$word =~ s/alize$/al/;
    }
    elsif ($word =~ /iciti$/ && $m >= 3) {
	$word =~ s/iciti$/ic/;
    }
    elsif ($word =~ /ical$/ && $m >= 3) {
	$word =~ s/ical$/ic/;
    }
    elsif ($word =~ /ful$/ && $m >= 2) {
	$word =~ s/ful$//;
    }
    elsif ($word =~ /ness$/ && $m >= 2) {
	$word =~ s/ness$//;
    }
    
}		       


sub step_5 {
    if ($word =~ /al$/ && $m >= 3) {
	$word =~ s/al$//;
    }
    elsif ($word =~ /ance$/ && $m >= 3) {
	$word =~ s/ance$//;
    }
    elsif ($word =~ /ence$/ && $m >= 3) {
	$word =~ s/ence$//;
    }
    elsif ($word =~ /er$/ && $m >= 3) {
	$word =~ s/er$//;
    }
    elsif ($word =~ /ic$/ && $m >= 3) {
	$word =~ s/ic$//;
    }
    elsif ($word =~ /able$/ && $m >= 3) {
	$word =~ s/able$//;
    }
    elsif ($word =~ /ible$/ && $m >= 3) {
	$word =~ s/ible$//;
    }
    elsif ($word =~ /ant$/ && $m >= 3) {
	$word =~ s/ant$//;
    }
    elsif ($word =~ /ement$/ && $m >= 4) {
	$word =~ s/ement$//;
    }
    elsif ($word =~ /ment$/ && $m >= 3) {
	$word =~ s/ment$//;
    }
    elsif ($word =~ /[s|t]ion$/ && $m >= 3) {
	$word =~ s/(.)ion$//;
    }
    elsif ($word =~ /ou$/ && $m >= 2) {
	$word =~ s/ou$//;
    }
    elsif ($word =~ /ism$/ && $m >= 3) {
	$word =~ s/ism$//;
    }
    elsif ($word =~ /ate$/ && $m >= 3) {
	$word =~ s/ate$//;
    }
    elsif ($word =~ /iti$/ && $m >= 3) {
	$word =~ s/iti$//;
    }  
    elsif ($word =~ /ous$/ && $m >= 3) {
	$word =~ s/ous$//;
    }
    elsif ($word =~ /ive$/ && $m >= 3) {
	$word =~ s/ive$//;
    }
    elsif ($word =~ /ize$/ && $m >= 3) {
	$word =~ s/ize$//;
    }
}


sub step_6 {
	    
    if ($word =~ /e$/ && $m >= 2) {
	$word =~ s/e$//;
    }
    elsif ($word =~ /[^aeiou][aeiouy][^aeiouwyx]e$/ 
	   && $m == 1) {
	$word =~ s/e$//;
    }
    elsif ($word =~ /ll$/ && $m >= 2) {
	$word =~ s/l$//;
    }			
}


sub step_7 {
    if ($word eq "discussion") {
	$word = "discuss"; 
    }
    if ($word eq "protection") {
	$word = "protect"; 
    }
    if ($word eq "conclusion") {
	$word = "conclud"; 
    }
}

1;

