-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path_enrich.pl
95 lines (71 loc) · 2.38 KB
/
_enrich.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
use Getopt::Std;
binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");
use Encode;
my $usage;
{
$usage = <<"_USAGE_";
_enrich.pl version 1.0.1
This script enriches lines based on the first tab-delimited column of that line with values from a lexicon file in a new column of the output file.
Optional arguments are currently only outputting the help message.
Usage: _enrich.pl [optional args] -l <LEXICON> <IN_FILE>
Options and arguments:
-h print this [h]elp message and quit
-l [l]exicon file (required)
<IN_FILE> A text file one category per line, only text up to the first tab is used for lexicon lookup
Copyright 2014, Amir Zeldes
This program is free software. You may copy or redistribute it under
the same terms as Perl itself.
_USAGE_
}
### OPTIONS BEGIN ###
%opts = ();
getopts('hl:',\%opts) or die $usage;
#help
if ($opts{h} || (@ARGV == 0)) {
print $usage;
exit;
}
if (!($lexicon = $opts{l}))
{$lexicon = "lexicon.txt";}
### OPTIONS END ###
open(FLH,"$lexicon");
@array = <FLH>;
close(FLH);
foreach $ar (@array)
{
$ar = decode_utf8( $ar );
if ($ar =~ /^(.+)\t(.+)\n/)
{
$entry = decode_utf8($1);
$trans = $2;
$lex{decode_utf8($entry)} .= $trans;
$trans =~ s/^[ \t]+//g;
$trans =~ s/[ \t]+$//g;
}
}
while($ar = <>)
{
$ar = decode_utf8( $ar );
$ar =~ s/\n//g;
if ($ar =~ /^([^\t]+)/)
{
if (exists $lex{$1}) {$mykey=$1;}
elsif (exists $lex{"*" . substr(decode_utf8($1), -4)}) {$mykey="*" . substr(decode_utf8($1), -4);}
elsif (exists $lex{"*" . substr(decode_utf8($1), -3)}) {$mykey="*" . substr(decode_utf8($1), -3);}
elsif (exists $lex{substr(decode_utf8($1), 0,5) . "*"}) {$mykey=substr(decode_utf8($1), 0,5) . "*";}
elsif (exists $lex{substr(decode_utf8($1), 0,4) . "*"}) {$mykey=substr(decode_utf8($1), 0,4) . "*";}
elsif (exists $lex{substr(decode_utf8($1), 0,3) . "*"}) {$mykey=substr(decode_utf8($1), 0,3) . "*";}
elsif (exists $lex{substr(decode_utf8($1), 0,2) . "*"}) {$mykey=substr(decode_utf8($1), 0,2) . "*";}
else {$mykey='';}
if ($mykey ne ''){
{print $ar . "\t" . $lex{$mykey} . "\n";}
}
else
{print $ar ."\n";}
}
else
{
print $ar."\n";
}
}