-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathenwn2gawn.pl
135 lines (126 loc) · 3.81 KB
/
enwn2gawn.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/perl
use strict;
use warnings;
# %wnga maps a WordNet sense_key to an array ref of Irish (ga) words.
# See makewn2ga.pl for how wn2ga.txt is produced.
# Each line of wn2ga.txt has the form "sense_key|word,word,...".
my %wnga;
open(my $wnga_fh, "<:utf8", "wn2ga.txt") or die "Could not open wn2ga.txt: $!\n";
while (my $line = <$wnga_fh>) {
    chomp $line;
    my ($sk, $focail) = $line =~ /^([^|]+)\|(.+)$/;
    # Skip lines without the expected shape rather than storing an entry
    # under an undefined key.
    next unless defined $sk;
    $wnga{$sk} = [ split /,/, $focail ];
}
close $wnga_fh;
# %gafreq maps a word (as written in roget.txt) to its count.
# Each line of roget.txt has the form "<count> <word>".
my %gafreq;
open(my $roget_fh, "<:utf8", "roget.txt") or die "Could not open roget.txt: $!\n";
while (my $line = <$roget_fh>) {
    chomp $line;
    my ($cnt, $word) = $line =~ /^([0-9]+) (.+)$/;
    # Skip lines without the expected shape rather than storing an
    # undefined count under an undefined key.
    next unless defined $word;
    $gafreq{$word} = $cnt;
}
close $roget_fh;
# just need index.sense for the adjective lookups; ASCII!
# %adjlookup maps "lemma|synset_offset" to the full sense_key, but only for
# sense keys whose ss_type field is 5 (the code %pos_codes assigns to 's').
my %adjlookup;
open(my $senseindex_fh, "<", "index.sense") or die "Could not open index.sense: $!\n";
while (my $line = <$senseindex_fh>) {
    chomp $line;
    # Line layout: sense_key synset_offset sense_number tag_count;
    # only the first two fields are needed here.
    my ($sense_key, $offset) = $line =~ /^([^ ]+) ([0-9]{8}) ([0-9]+) ([0-9]+)$/;
    next unless defined $sense_key;
    # sense_key layout: lemma%ss_type:lex_filenum:lex_id:...
    my ($lemma, $ss_type) = $sense_key =~ /^([^%]+)%([1-5]):([0-9][0-9]):([0-9][0-9])/;
    next unless defined $ss_type;
    if ($ss_type == 5) {
        $adjlookup{"$lemma|$offset"} = $sense_key;
    }
}
close $senseindex_fh;
# Numeric ss_type code for each one-letter part of speech found in the
# data files; used for generating sense keys correctly.
my %pos_codes;
@pos_codes{qw(n v a r s)} = qw(1 2 3 4 5);

# Score of each Irish word for the synset currently being processed;
# filled in by process_data_file and read by my_sort.
my %irish_words;
# Sort comparator over keys of %irish_words: larger score first, and keys
# with equal scores in ascending string order.
sub my_sort {
    my ($score_a, $score_b) = @irish_words{$a, $b};
    return $a cmp $b if $score_a == $score_b;
    return $score_b <=> $score_a;
}
# Convert one WordNet data file into its Irish counterpart.
#
# Parameter:
#   $file - name of the input data file (e.g. 'dataplus.noun').
#
# The output file name is $file with the first "plus" removed and "ga-"
# prepended (e.g. 'ga-data.noun'); it is written as UTF-8.
#
# For every synset line, the English words are replaced by the Irish words
# that %wnga lists for their sense keys.  A synset with no Irish words
# produces no output line.  Each output line is:
#
#   synset_offset ss_type irish_word_count word1 word2 ... <rest>
#
# where <rest> is whatever followed the word list on the input line
# (pointer count, pointers and gloss), passed through unchanged, and the
# words are ordered by my_sort (highest score first).
#
# Reads the file-level hashes %wnga, %gafreq, %adjlookup and %pos_codes, and
# uses %irish_words as per-synset scratch space shared with my_sort.
# Dies if either file cannot be opened or the output cannot be closed.
sub process_data_file
{
    my ($file) = @_;

    # Princeton WordNet data.* files are ASCII only
    open(my $in, "<", $file) or die "Could not open $file: $!\n";
    my $outputfile = $file;
    $outputfile =~ s/plus//;
    $outputfile =~ s/^/ga-/;
    open(my $out, ">:utf8", $outputfile) or die "Could not open $outputfile: $!\n";

    LINE:
    while (my $line = <$in>) {
        chomp $line;
        # Lines beginning with a space are not synset records.
        next LINE if $line =~ /^ /;

        my ($synset_offset, $lex_filenum, $ss_type, $w_cnt, $rest) =
            $line =~ /^([0-9]{8}) ([0-9][0-9]) ([nvasr]) ([0-9a-f][0-9a-f]) (.+)$/;
        next LINE unless defined $synset_offset;

        my $decimal_words = hex($w_cnt);    # word count is two hex digits
        my $pos = $ss_type;
        $pos = 'a'   if ($pos eq 's');
        $pos = 'adv' if ($pos eq 'r');

        %irish_words = ();

        WORD:
        for my $word_index (1 .. $decimal_words) {
            # Consume one "lemma lex_id " pair from the front of $rest.
            # Check the substitution so stale captures are never used.
            unless ($rest =~ s/^([^ ]+) ([0-9a-z]) //) {
                warn "$file line $.: malformed word list, skipping synset\n";
                next LINE;
            }
            my ($lemma, $lex_id_hex) = ($1, $2);
            $lemma =~ s/\([a-z]+\)$//;  # (s) or (a) only: "syntactic marker"

            my $sense_key;
            if ($ss_type eq 's') {
                $sense_key = $adjlookup{lc($lemma) . "|$synset_offset"};
            }
            else {
                # for non-adjs, rebuild the sense_key just from data in data.*
                my $ss_num_type = $pos_codes{$ss_type};
                my $lex_id = sprintf("%02d", hex($lex_id_hex));
                $sense_key = lc($lemma) . '%' . $ss_num_type . ':' . $lex_filenum . ':' . $lex_id . '::';
            }

            # No sense key, or no Irish words for it: nothing to add.
            # Looking the entry up first also avoids autovivifying an empty
            # array in %wnga for every unknown sense key.
            next WORD unless defined $sense_key;
            my $ga_words = $wnga{$sense_key} or next WORD;

            foreach my $entry (@{$ga_words}) {
                my $ir = $entry;    # work on a copy; leave %wnga untouched
                $ir =~ s/\+/\/$pos+/ unless ($ir =~ / /);   # to match gafreq
                $irish_words{$ir}++;
            }
        }

        my $icount = scalar keys %irish_words;
        next LINE if $icount == 0;

        # Turn the raw counts into scores: count * 12 / tot, plus
        # log(frequency + 1) when the word is in %gafreq.  Entries look like
        # "key+tot+..."; tot is presumably a per-word total computed by
        # makewn2ga.pl -- TODO confirm.
        foreach my $i (keys %irish_words) {
            my ($freqkey, $tot) = $i =~ m/^([^+]+)\+([0-9]+)\+/;
            unless (defined $tot && $tot > 0) {
                warn "$file line $.: cannot weight '$i', keeping raw count\n";
                next;
            }
            $irish_words{$i} *= 12;     # tune up as gafreq corpus grows!
            $irish_words{$i} /= $tot;
            if (exists($gafreq{$freqkey})) {
                $irish_words{$i} += log($gafreq{$freqkey} + 1);
            }
        }

        print {$out} "$synset_offset $ss_type $icount ";
        foreach my $i (sort my_sort keys %irish_words) {
            my $shown = $i;
            $shown =~ s/ /_/g;
            $shown =~ s/\/[^+]+\+/+/;   # drop the "/pos" tag added above
            print {$out} "$shown ";
        }
        print {$out} "$rest\n";
    }

    close($in);
    # Buffered write errors only surface when the handle is closed.
    close($out) or die "Could not close $outputfile: $!\n";
}
# Convert each part-of-speech data file in turn.  A named loop variable is
# used on purpose so that $_ is left alone while the files are processed.
for my $data_file ('dataplus.adj', 'dataplus.adv', 'dataplus.noun', 'dataplus.verb') {
    process_data_file($data_file);
}