use strict;
# ---------------------- Read word file
open(IN,"rawkey.txt");
print "Opened file\n";
my $readdata='';
my %wordcount=();
my %wordindex=();
my %related=();
my $line;
while ($line=<IN>) {
$readdata.=$line;
}
close(IN);
undef $line;
# ---------------------- Get all words and index them
my $index=0;
map {
map {
my $word=$_;
if ($word) {
$word=~tr/A-Z/a-z/;
$wordcount{$word}++;
unless (defined $wordindex{$word}) {
$index++;
$wordindex{$word}=$index;
}
}
} split(/\W+/,$_);
} split(/(\n|\r)+/,$readdata);
print "Built word index\n";
# ---------------------- Find related words
map {
my $line=$_;
if ($line) {
my @words=split(/\W+/,$line);
map {
my $word=$_;
if ($word) {
$word=~tr/A-Z/a-z/;
my $twordidx=$wordindex{$word};
if ($twordidx) {
my %alreadyrelated=();
if (defined $related{$twordidx}) {
map {
my ($idx,$count)=split(/\:/,$_);
$alreadyrelated{$idx}=$count;
} split(/\,/,$related{$twordidx});
}
map {
my $rword=$_;
$rword=~tr/A-Z/a-z/;
if (($rword) && ($rword ne $word)) {
my $rwordidx=$wordindex{$rword};
if (defined $alreadyrelated{$rwordidx}) {
$alreadyrelated{$rwordidx}++;
} else {
$alreadyrelated{$rwordidx}=1;
}
}
} @words;
$related{$twordidx}='';
map {
$related{$twordidx}.=$_.':'.$alreadyrelated{$_}.',';
} keys %alreadyrelated;
chop $related{$twordidx};
} else {
print "Warning! Unknown word: ".$word;
}
}
} @words;
}
} split(/(\n|\r)+/,$readdata);
print "Built hash of related words\n";
# ---------------------- Output
open(OUT,">thesaurus.dat");
map {
my $wordidx=$wordindex{$_};
print OUT $_.'@'.$wordidx.'@'.$wordcount{$_}.'@'.$related{$wordidx}.
"\n";
} sort keys %wordindex;
close(OUT);
print "Wrote thesaurus file\n";
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>