#!/usr/bin/perl -w
#$Id: make_collection,v 1.10 2007/11/29 23:57:40 joshr Exp $
## Copyright 2004-2007 Josh Rabinowitz
# script to create random collections for swish-e from a file like /usr/dict/words (one word per line)
use strict;
use warnings;
use Getopt::Long;
use XML::Simple; # for $xmlsimple->escape_value()
use File::Basename;
use GetDictionaryWords;
use NotRand qw(not_rand);
# ---------------------------------------------------------------------------
# File-level configuration.  Most of these are defaults that main() lets the
# user override from the command line via GetOptions().
# ---------------------------------------------------------------------------

# Script name without its directory part; used as a prefix in diagnostics.
my $prog = basename($0);

# Dict file with words. One word per line.
# Note that the F10 and OSX dictionaries are much larger.
my $dict = 'data/C020-words-txt/words-linux-fc1.txt';    # 45,000 word dictionary (408K)

# Each generated file gets between min and max words (inclusive).
my $min_words_per_file = 100;
my $max_words_per_file = 100;

# Number of files to generate.
my $num_files = 1000;    # 0 means one file for each word in dictionary

my $num_words;           # should be scalar(@words)

# Directory that receives the generated files.
my $base_dir = "";       # empty base_dir means be an -S prog external program

# In randommode, words are randomly chosen, otherwise words are taken
# sequentially from the dict.
my $randommode = 1;

my $filetype = "xml";    # type of file to create. can also be 'html' or 'txt'

# When true, main() prints extra diagnostics.
my $verbose = 0;

# Progress reporting: when $progress is true, main() prints a progress line
# to STDERR at most once every $progress_seconds seconds; $lastprogresstime
# holds the time() of the most recent progress line.
my $progress         = 0;
my $progress_seconds = 60;
my $lastprogresstime = 0;

# Fraction (0..1) of words that main() passes through misspell(); 0 disables.
my $misspell_fraction = 0;

# Shared XML::Simple instance, used for $xmlsimple->escape_value().
# Direct method-call syntax is used rather than the ambiguous indirect-object
# form "new XML::Simple()".
my $xmlsimple = XML::Simple->new();
# Usage()
#   Returns the multi-line help text as a single string.  The current values
#   of $min_words_per_file, $max_words_per_file and $num_files are
#   interpolated into the text.  Callers in this file pass the result to
#   die().
sub Usage {
    my @help_lines = (
        "make_collection: [--dict=words.txt] [--base_dir=/your/location]\n",
        " [--min_words_per_file=$min_words_per_file] [--max_words_per_file=$max_words_per_file] [--num_files=$num_files]\n",
        " [--misspell-fraction=0.1]\n",
        " [--verbose] [--filetype=(txt|html|xml)] [--(no)randommode]:\n",
        " Makes a set of (possibly random) xml, html, or txt files based on a dict.\n",
        " If you dont set a --base_dir, then it outputs data like a swish-e prog.\n",
    );

    # Lines already carry their own newlines, so join with nothing between.
    return join( '', @help_lines );
}
# Script entry point: run main(), which is defined immediately below.
main();

# main()
#   Parses the command-line options, loads the word list from $dict, and
#   generates $num_files documents built from dictionary words.  Each
#   document is either written to "$base_dir/<n>.$filetype" or, when no
#   --base_dir was given, passed to simple_swishe_progify() so the script
#   behaves as a swish-e external program.
#
#   Dies on: bad options or an unknown --filetype (with the usage text),
#   max_words_per_file < min_words_per_file, an empty dictionary, or any
#   failure while writing an output file.
#
#   All diagnostics are written to STDERR so they can never mix with the
#   document stream produced in external-program mode.
sub main {
    GetOptions(
        "min_words_per_file=i" => \$min_words_per_file,
        "max_words_per_file=i" => \$max_words_per_file,
        "num_files=i"          => \$num_files,
        "base_dir=s"           => \$base_dir,
        "dict=s"               => \$dict,
        "randommode!"          => \$randommode,
        "filetype=s"           => \$filetype,
        "verbose!"             => \$verbose,
        "misspell-fraction=f"  => \$misspell_fraction,
        # $progress is tested in the loop below but previously had no option,
        # which made the progress report unreachable.
        "progress!"            => \$progress,
    ) || die Usage();

    die "$prog: Error: Filetype '$filetype' not understood\n" . Usage()
        unless $filetype =~ /^(txt|xml|html?)$/i;

    warn "$prog: Warning: No --base_dir option, running as swish-e external program\n"
        if $verbose && !$base_dir;

    my $parser = choose_parser($filetype);    # XML2, HTML2 or TXT

    # Equal values are fine (they are the defaults); only max < min is an error.
    if ($max_words_per_file < $min_words_per_file) {
        die "$prog: Error: max_words_per_file ($max_words_per_file) must not be "
          . "smaller than min_words_per_file ($min_words_per_file)\n";
    }

    # get_dictionary_words() returns a ref to the word list followed by a ref
    # to a count hash; only the word list is needed here.
    my ($words) = GetDictionaryWords::get_dictionary_words($dict);

    # Without this guard an empty word list would surface later as a cryptic
    # "Illegal modulus zero" in sequential mode.
    die "$prog: Error: no words loaded from dictionary '$dict'\n"
        unless $words && @$words;
    my $dict_size = scalar @$words;

    if ($num_files == 0) {
        $num_files = $dict_size;
        print STDERR "$prog: set num_files to $num_files\n" if $verbose;
    }
    print STDERR "$0: Outputting $num_files files...\n" if $ENV{TEST_VERBOSE};

    # Running count of words consumed across all files; in sequential mode it
    # selects the next dictionary word (wrapping around at the end).
    my $wordcounter = 0;

    # STDERR, not STDOUT: in external-program mode STDOUT carries the documents.
    print STDERR "$prog: Creating files...\n" if $verbose;

    for my $i (0 .. $num_files - 1) {
        if ($i && $progress && time() - $lastprogresstime >= $progress_seconds) {
            my $percent = sprintf("%1.1f", $i / $num_files * 100);
            print STDERR "$prog: $filetype: on file $i of $num_files ($percent%)\n";
            $lastprogresstime = time();
        }

        # Choose how many words will be in this file (min..max inclusive).
        my $this_file_words =
            int( not_rand( $max_words_per_file - $min_words_per_file + 1 ) )
            + $min_words_per_file;

        my $doc = "";
        for my $j (1 .. $this_file_words) {
            # The counter advances for every slot, even when the slot is
            # skipped below because no word was found.
            my $word_index = $wordcounter++;

            # Choose the next word, either (not) randomly, or sequentially.
            my $toadd = $randommode
                ? $words->[ not_rand($dict_size) ]
                : $words->[ $word_index % $dict_size ];
            next unless defined $toadd;

            if ($misspell_fraction && not_rand(10_000) / 10_000 < $misspell_fraction) {
                $toadd = misspell($toadd);
            }

            $doc .= $toadd;
            $doc .= ( $j % 7 ) ? " " : "\n";    # line break after every 7th slot
        }

        # Wrap the raw text according to the parser type.
        if ($parser =~ /^XML2?$/) {
            $doc = simple_xmlify($doc);
        } elsif ($parser =~ /^HTML2?$/) {
            $doc = simple_htmlify( extract_title($doc), $doc );    # title, content
        } else {
            $doc = simple_txtify($doc);
        }

        if ($base_dir) {
            my $path = "$base_dir/$i.$filetype";
            open(my $outfile, ">", $path)
                or die "$prog: Couldn't open $path: $!\n";
            print {$outfile} $doc
                or die "$prog: Couldn't write to $path: $!\n";
            # Buffered write errors only surface at close, so check it too.
            close($outfile)
                or die "$prog: Couldn't close $path: $!\n";
            print STDERR "$prog: created $path...\n" if $verbose && $i % 1000 == 0;
        } else {
            # Act like a swish-e external program. This prints directly to stdout.
            simple_swishe_progify( $parser, "$i.$filetype", $doc, time() );
        }
    }
}
# simple_xmlify( $text )
#   One block of text in xml.  Returns $text run through
#   $xmlsimple->escape_value() (to handle ampersands and other characters
#   that need escaping), preceded by two newlines and followed by three.
#
#   Original author's note: this has only been exercised with ISO-8859-1;
#   other encodings should be tested.
#   NOTE(review): the wrapper strings here consist solely of newlines -- if an
#   XML declaration or enclosing element was intended, confirm against the
#   upstream copy of this script.
sub simple_xmlify {
    my ($text) = @_;

    my $escaped = $xmlsimple->escape_value($text);

    return join( '', "\n\n", $escaped, "\n\n\n" );
}
# simple_txtify( $text )
#   One block of text in txt: returns $text with a single newline appended.
sub simple_txtify {
    my ($text) = @_;
    return "$text\n";
}
# one block of text, with a title, in html
sub simple_htmlify {
my ($title, $content) = @_;
my $html = <
$title
$content