# wordget3 - applies a stoplist which eliminates high-frequency (useless for query) # words from the string before extracting the words. use MakeRegex; $regex = stoplist2regex("stop2.txt"); print "regex: $regex\n"; @text = ; $text = join(" ",@text); print "\n\ntext before: $text\n\n"; preprocess(\$text); print "ALL WORDS:*******************************************\n"; ($unique,$wdups,$count) = wordget(\$text); print "unique: $unique, wdups: $wdups\n"; print "\nALL EXCEPT STOPLISTED WORDS**************************\n"; ($unique,$wdups,$count) = wordget_stop(\$text,\$regex); print "\n\nunique: $unique, wdups: $wdups\n"; # sort the hash by value, print out sorted hash foreach $word (sort numerically keys %$count) { print "$count->{$word} : $word\n"; } exit; sub preprocess { my($text_ref) = @_; $$text_ref =~ tr/A-Z/a-z/; # convert to lower case $$text_ref =~ tr/.,:;!?"(){}'/ /d; # strip out punctuation $$text_ref =~ s/--/ /g; } sub wordget_stop { # extract all words except those on a stoplist from a string into a hash and count them my($text_ref,$regex_ref) = @_; my(%count,$word,$unique,$wdups,$regex); $regex = $$regex_ref; # get rid of the stoplisted words using the regex $$text_ref =~ s/ / /g; $$text_ref =~ s/\s$regex\s/ /gc; print "text after: $$text_ref\n"; # split line into words # count words, put count into hash (word is key, count is value) foreach $word (split(" ",$$text_ref)) { $count{$word}++; } $unique = scalar(keys(%count)); $wdups = 0; foreach $word (keys %count) { $wdups += $count{$word}; } return ($unique,$wdups,\%count); } sub wordget { # extract all words into a hash and count them my($text_ref) = @_; my(%count,$word,$unique,$wdups); # split line into words # count words, put count into hash (word is key, count is value) foreach $word (split(" ",$$text_ref)) { $count{$word}++; } $unique = scalar(keys(%count)); $wdups = 0; foreach $word (keys %count) { $wdups += $count{$word}; } return ($unique,$wdups,\%count); } sub numerically { # compare two words numerically in decreasing order $count->{$b} <=> $count->{$a}; } sub stoplist2regex { my($stoplist) = @_; my($regex,@list,$word); open(STOP,$stoplist) or die "Can't open stoplist: $stoplist\n"; @list = ; foreach $word (@list) { #print "before:*$word*\n"; trimword($word); #print "after:*$word*\n"; } $regex = MakeRegex::make_regex(@list); return $regex; } sub trimword { #remove excess characters before or after a word chomp($_[0]); $_[0] =~ s/^\W+//; $_[0] =~ s/\W+$//; } __DATA__ 'Stop, stop!' said the old woman in a loud whisper. 'Will she be buried to-morrow, or next day, or to-night? I laid her out; and I must walk, you know. Send me a large cloak: a good warm one: for it is bitter cold. We should have cake and wine, too, before we go! Never mind; send some bread--only a loaf of bread and a cup of water. Shall we have some bread, dear?' she said eagerly: