-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathlexbuild
38 lines (35 loc) · 1.06 KB
/
lexbuild
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/usr/bin/perl
# Copyright (C) 2004 Kevin P. Scannell
# This is free software; see the file COPYING for copying conditions. There is
# NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
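# Reads a plain text lexicon from standard input; words with frequency
# 0 and 1 have already been removed and handled separately.
# Writes the remaining lines, in input order, to up to 5 files of
# approximately equal size: lexicon0.txt through lexicon4.txt, ranging from
# highest to lowest frequency. A file boundary is postponed while the next
# line starts with the same space-terminated first field as the previous one.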
use strict;
use warnings;

# Number of output files the input is divided into
# (lexicon0.txt .. lexicon4.txt).
my $NUM_FILES = 5;

# Opens lexicon<N>.txt for raw-byte writing and returns the filehandle.
# Dies if the file cannot be created.
sub _open_lexicon_file {
    my ($filenum) = @_;
    open(my $fh, ">:bytes", "lexicon$filenum.txt")
        or die "Could not open output file: $!\n";
    return $fh;
}

# The whole input is held in memory because the total line count is needed
# before the first split point can be computed.  Lines are treated as raw
# bytes on both input and output, so no encoding translation takes place.
binmode STDIN, ":bytes";
my @everything = <STDIN>;
my $total = scalar @everything;

my $linenum = 0;
my $filenum = 0;
my $prev;
my $out = _open_lexicon_file($filenum);

for my $line (@everything) {
    $linenum++;

    # Split threshold for the current file.  It is computed directly as
    # (k+1)*n/5 rather than by repeatedly adding n/5, so the last threshold
    # is exactly n and floating-point drift can never open a sixth file.
    my $stop = ($filenum + 1) * $total / $NUM_FILES;

    if ($linenum > $stop && defined $prev) {
        # Reduce the previous line to its first field plus the trailing
        # space (or leave it whole if it contains no space).
        (my $head = $prev) =~ s/ .*/ /s;

        # Only start a new file when the current line does NOT begin with
        # that field, so lines sharing a first field stay in the same file.
        # index() does a literal comparison: lexicon text containing regex
        # metacharacters must not be interpreted as a pattern.
        if (index($line, $head) != 0) {
            close($out) or die "Could not close output file: $!\n";
            $filenum++;
            $out = _open_lexicon_file($filenum);
        }
    }

    print {$out} $line or die "Could not write to output file: $!\n";
    $prev = $line;
}

# Buffered write errors only surface at close, so check it.
close($out) or die "Could not close output file: $!\n";
exit 0;