# wordget3 - applies a stoplist which eliminates high-frequency (useless for query)
# words from the string before extracting the words.

use MakeRegex;

$regex = stoplist2regex("stop2.txt");
print "regex: $regex\n";

@text = <DATA>;
$text = join(" ",@text); 
print "\n\ntext before: $text\n\n";

preprocess(\$text);

print "ALL WORDS:*******************************************\n";
($unique,$wdups,$count) = wordget(\$text);
print "unique: $unique, wdups: $wdups\n";

print "\nALL EXCEPT STOPLISTED WORDS**************************\n";
($unique,$wdups,$count) = wordget_stop(\$text,\$regex);

print "\n\nunique: $unique, wdups: $wdups\n";
# sort the hash by value, print out sorted hash
foreach $word (sort numerically keys %$count) {  
   print "$count->{$word} : $word\n";               
}
exit;

sub preprocess {
   my($text_ref) = @_;
   $$text_ref =~ tr/A-Z/a-z/;         # convert to lower case
   $$text_ref =~ tr/.,:;!?"(){}'/ /d; # strip out punctuation
   $$text_ref =~ s/--/ /g;
}

sub wordget_stop { 
   # extract all words except those on a stoplist from a string into a hash and count them
   my($text_ref,$regex_ref) = @_;
   my(%count,$word,$unique,$wdups,$regex);
   $regex = $$regex_ref;   

   # get rid of the stoplisted words using the regex
   $$text_ref =~ s/ /  /g;
   $$text_ref =~ s/\s$regex\s/  /gc;
   print "text after: $$text_ref\n";

   # split line into words 
   # count words, put count into hash (word is key, count is value)
   foreach $word (split(" ",$$text_ref))  {  
      $count{$word}++;     
   }
   $unique = scalar(keys(%count));
   $wdups = 0;
   foreach $word (keys %count) {  
      $wdups += $count{$word};               
   }
   return ($unique,$wdups,\%count);
}

sub wordget { 
   # extract all words into a hash and count them
   my($text_ref) = @_;
   my(%count,$word,$unique,$wdups);

   # split line into words 
   # count words, put count into hash (word is key, count is value)
   foreach $word (split(" ",$$text_ref))  {  
      $count{$word}++;     
   }
   $unique = scalar(keys(%count));
   $wdups = 0;
   foreach $word (keys %count) {  
      $wdups += $count{$word};               
   }
   return ($unique,$wdups,\%count);
}

sub numerically {        
   # compare two words numerically in decreasing order            
   $count->{$b} <=> $count->{$a};       
}

sub stoplist2regex {
   my($stoplist) = @_;
   my($regex,@list,$word);

   open(STOP,$stoplist) or die "Can't open stoplist: $stoplist\n";
   @list = <STOP>;
   foreach $word (@list) {
      #print "before:*$word*\n";
      trimword($word);
      #print "after:*$word*\n";
   }
   $regex = MakeRegex::make_regex(@list);
   return $regex;
}


sub trimword {
   #remove excess characters before or after a word 
   chomp($_[0]);
   $_[0] =~ s/^\W+//;
   $_[0] =~ s/\W+$//;
}

__DATA__
'Stop, stop!' said the old woman in a loud whisper.  'Will she be
buried to-morrow, or next day, or to-night?  I laid her out; and
I must walk, you know.  Send me a large cloak: a good warm one: 
for it is bitter cold.  We should have cake and wine, too, before
we go!  Never mind; send some bread--only a loaf of bread and a
cup of water.  Shall we have some bread, dear?' she said eagerly: