-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_10cv.py
96 lines (84 loc) · 2.72 KB
/
run_10cv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
import getopt
import sys
import time
import os
import datetime
import random
import re
import glob
import subprocess
"""
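Run lemmatiser.py on every cross-validation fold and print the average scores.

Version 1.00, 2016-10-25

Example session (split the data into 10 folds first, then run this script):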
python3 split_10.py -f hdt_Books_forFrog.col -p10
95948 9594
[9594, 9594, 9594, 9594, 9594, 9594, 9594, 9594, 9594, 9602]
OUTPUT: hdt_Books_forFrog.col.cv00
OUTPUT: hdt_Books_forFrog.col.cv01
...
OUTPUT: hdt_Books_forFrog.col.cv08
OUTPUT: hdt_Books_forFrog.col.cv09
python3 run_10cv.py -f 'hdt_Books_forFrog.col.cv??'
['hdt_Books_forFrog.col.cv00', 'hdt_Books_forFrog.col.cv01', 'hdt_Books_forFrog.col.cv02', 'hdt_Books_forFrog.col.cv03', 'hdt_Books_forFrog.col.cv04', 'hdt_Books_forFrog.col.cv05', 'hdt_Books_forFrog.col.cv06', 'hdt_Books_forFrog.col.cv07', 'hdt_Books_forFrog.col.cv08', 'hdt_Books_forFrog.col.cv09']
...[output]...
hdt_Books_forFrog.col.cv09-stats.txt:
# Correct (lemmatised only, no unknowns) 98.26
98.26
# Correct (lcount) 88.8
88.8
Average correct_lcount 89.34
Average correct_lemmatised 98.65
"""
# The patterns are hard-coded for the "Correct" lines that the lemmatiser
# writes to its "<fold>-stats.txt" file, for example:
#   hdt_Books_forFrog.col.cv02-stats.txt:
#   # Correct (lemmatised only, no unknowns) 98.61
#   # Correct (lcount) 89.37
LCOUNT_PATTERN = re.compile(r"# Correct \(lcount\) ([\d.]+)", re.UNICODE)
LEMMATISED_PATTERN = re.compile(
    r"# Correct \(lemmatised only, no unknowns\) ([\d.]+)", re.UNICODE
)


def parse_files(argv):
    """Return the sorted list of files selected by ``-f``/``--files``.

    argv -- command-line arguments without the program name.

    The option value is a glob pattern (quote it on the shell so that this
    script expands it). Prints the getopt error message and exits with
    status 1 when the command line cannot be parsed.
    """
    try:
        opts, _ = getopt.getopt(argv, "f:", ["files="])
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(1)
    files = []
    for opt, value in opts:
        # getopt reports long options WITHOUT the trailing "=", so the name
        # to compare against is "--files", not "--files=".
        if opt in ("-f", "--files"):
            files = sorted(glob.glob(value))
        else:
            # Explicit raise instead of `assert False`: asserts vanish
            # under `python -O`.
            raise AssertionError("unhandled option")
    return files


def run_lemmatiser(files):
    """Run lemmatiser.py on every file, echoing its output minus SKIP lines.

    A failing run is reported on stderr but does not stop the remaining
    folds from being processed.
    """
    for path in files:
        cmd = ["python3", "lemmatiser.py", "-f", path, "-o", path]
        print("\n--------\n")
        print(" ".join(cmd))
        result = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        for line in result.stdout.split("\n"):
            if "SKIP" not in line:
                print(line)
        if result.returncode != 0:
            # Previously stderr and the exit status were captured and then
            # thrown away, hiding every lemmatiser failure.
            print(
                "lemmatiser.py exited with status", result.returncode,
                "for", path, file=sys.stderr,
            )
            if result.stderr:
                print(result.stderr, file=sys.stderr)


def collect_scores(files):
    """Read the scores from each file's "<file>-stats.txt".

    Prints every line containing "Correct" (as `grep Correct` would) and
    returns a pair of lists ``(lcount_scores, lemmatised_scores)`` holding
    the percentages that were found. Unreadable stats files and files
    without a score are reported on stderr and contribute nothing.
    """
    lcount_scores = []
    lemmatised_scores = []
    for path in files:
        stats_path = path + "-stats.txt"
        print(stats_path + ":")
        try:
            with open(stats_path, encoding="utf-8", errors="replace") as handle:
                lines = [ln.rstrip("\n") for ln in handle if "Correct" in ln]
        except OSError as err:
            print("cannot read", stats_path + ":", err, file=sys.stderr)
            continue
        found_before = len(lcount_scores) + len(lemmatised_scores)
        for line in lines:
            print(line)
            match = LCOUNT_PATTERN.search(line)
            if match:
                lcount_scores.append(float(match.group(1)))
            match = LEMMATISED_PATTERN.search(line)
            if match:
                lemmatised_scores.append(float(match.group(1)))
        if len(lcount_scores) + len(lemmatised_scores) == found_before:
            print("no scores found in", stats_path, file=sys.stderr)
    return lcount_scores, lemmatised_scores


def mean_score(scores):
    """Return the mean of *scores* rounded to 2 decimals, or None if empty."""
    if not scores:
        return None
    return round(sum(scores) / len(scores), 2)


def main(argv=None):
    """Run the lemmatiser on all folds and print the average scores.

    argv -- argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Returns the process exit status: 0 on success, 1 when no input file
    matched the pattern.
    """
    if argv is None:
        argv = sys.argv[1:]
    files = parse_files(argv)
    print(files)
    if not files:
        # Without this guard the averaging below would divide by zero.
        print("No input files matched; nothing to do.", file=sys.stderr)
        return 1

    run_lemmatiser(files)
    print("\n----------------\n")

    lcount_scores, lemmatised_scores = collect_scores(files)
    # Average over the scores actually found, so that a missing stats file
    # does not silently drag the average down.
    for label, scores in (
        ("Average correct_lcount", lcount_scores),
        ("Average correct_lemmatised", lemmatised_scores),
    ):
        average = mean_score(scores)
        print(label, "n/a" if average is None else average)
    return 0


if __name__ == "__main__":
    sys.exit(main())