#!/usr/bin/perl

# A simple script to parse a simple html file
# to an ASCII file. (if japanese text is included,
# make sure that you use 'jperl', rather than 'perl')

# I'm reading the whole file! So, if it's a huge file
# you'll run out of memory --- but no one's going to
# write a .html file that big, right?
use strict;
use warnings;

# Named entities that are turned back into a literal character.
# Every other well-formed entity (&copy;, &#169;, &#xA9; ...) is
# simply dropped, because plain ASCII output has no place for it.
my %char_for = (
    'lt'   => '<',     # less than
    'gt'   => '>',     # greater than
    'amp'  => '&',     # ampersand
    'nbsp' => ' ',     # no-break space becomes a plain space
);

# html_to_text($html)
#
#      Takes a string of simple HTML and returns it as plain text:
#      the tags are removed and the entities are decoded or dropped.
#      An undefined argument gives back an empty string.
sub html_to_text {
    my ($html) = @_;
    return '' unless defined $html;

    # Delete the tags: everything that starts with '<' and ends with
    # the next '>', so <tag baby> or <tag whatever-?llll> goes away.
    # This has to happen BEFORE the entities are decoded, otherwise
    # text written as &lt;b&gt; would be mistaken for a real tag.
    $html =~ s/<[^>]+>//g;

    # Handle the entities in ONE pass, so that the output of one
    # replacement is never scanned again: '&amp;lt;' must come out
    # as the text '&lt;', not be decoded twice or thrown away.
    #
    # Only a real entity is matched: '&', an optional '#', letters or
    # digits, then ';'. A stray '&' as in "AT&T rocks; really" is
    # left alone instead of eating the text up to the next ';'.
    $html =~ s/&(#?[A-Za-z0-9]+);/exists $char_for{$1} ? $char_for{$1} : ''/ge;

    return $html;
}

# Read the whole input at once. The lines are joined with nothing in
# between -- interpolating the array into a string ("@lines") would
# put a space in front of every line after the first.
my $whole_file = join '', <>;

print html_to_text($whole_file), "\n";
