-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathroot_and_suffix_separator.py
92 lines (71 loc) · 3.17 KB
/
root_and_suffix_separator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# -*- coding: utf-8 -*-
import jpype as jp
import re
import string
import csv
## Zemberek: Ambiguity Resolution Example
# Documentation: https://github.com/ahmetaa/zemberek-nlp/tree/master/morphology#ambiguity-resolution
# Java Code Example: https://github.com/ahmetaa/zemberek-nlp/blob/master/examples/src/main/java/zemberek/examples/morphology/DisambiguateSentences.java
# Absolute, machine-specific location of the Zemberek .jar
ZEMBEREK_PATH = '/home/busra/System_Programming_HWS/src/0.17.1-20190726T121643Z-001/0.17.1/zemberek-full.jar'

# Launch the JVM with the Zemberek jar on its class path
_class_path_option = '-Djava.class.path=%s' % ZEMBEREK_PATH
jp.startJVM(jp.getDefaultJVMPath(), '-ea', _class_path_option)

# Look up the Java classes this script refers to
Paths = jp.JClass('java.nio.file.Paths')
TurkishMorphology = jp.JClass('zemberek.morphology.TurkishMorphology')

# Create the morphology instance via Zemberek's createWithDefaults() factory
morphology = TurkishMorphology.createWithDefaults()
# Load the input lines, stripping only the trailing newline from each.
# The context manager closes the file handle deterministically, and the
# explicit encoding keeps the text decoding independent of the locale.
with open("tests2.txt", encoding="utf-8") as input_file:
    lineList = [line.rstrip('\n') for line in input_file]
#REGEX = re.compile(r":\s*")
#REGEX2 = re.compile(r"\+\s*")
# Lazily captures the text between each '+' and the next ':' of a string.
# Written as a raw string because '\+' and '\:' are invalid escape sequences
# in an ordinary literal; the compiled pattern itself is unchanged.
regex = re.compile(r'\+(.*?)\:')
# Tag fragments removed from every captured suffix string.
# NOTE(review): presumably Zemberek's A3sg / Aor morpheme markers together
# with the '+' / '|' separator that follows them - confirm against real output.
_MARKER_TAGS = ("A3sg+", "Aor+", "A3sg|")


def _strip_marker_tags(parts):
    """Return a copy of ``parts`` with every marker tag removed from each element.

    All elements are cleaned, not only the first two, so rows with three or
    more captured suffixes no longer keep stray tag fragments.
    """
    cleaned = []
    for part in parts:
        for tag in _MARKER_TAGS:
            part = part.replace(tag, "")
        cleaned.append(part)
    return cleaned


def _extract_suffixes(text):
    """Return the CSV row (list of suffix strings) for one input line.

    The line is analyzed and disambiguated with the module-level
    ``morphology`` object. The result is ``["-"]`` when the textual form of
    the best analysis contains "Ques", or when ``regex`` captures nothing.
    If the line yields several analyses, only the captures of the last one
    are kept.
    """
    analysis = morphology.analyzeSentence(text)
    best_analyses = morphology.disambiguate(text, analysis).bestAnalysis()

    if "Ques" in str(best_analyses):
        return ["-"]

    captured = []
    for single_analysis in best_analyses:
        # str(): JPype may hand back java.lang.String objects instead of
        # Python str (depends on startJVM's convertStrings setting), and
        # `re` only accepts real str/bytes.
        captured = regex.findall(str(single_analysis.formatLong()))

    return _strip_marker_tags(captured) or ["-"]


# Analyze every input line and append exactly one row per line to the CSV.
# The output file is opened once for the whole run; newline='' is what the
# csv module expects so it can control line endings itself.
with open('vocab_file_2.csv', mode='a', newline='', encoding='utf-8') as employee_file:
    employee_writer = csv.writer(employee_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for word in lineList:
        employee_writer.writerow(_extract_suffixes(word))
# Shut down the JVM as the script's final step.
jp.shutdownJVM()
# if "Ques" in str(sonuclar):
# x = sonuc.formatLong()
# stem = ' '.join(sonuc.getStems())
# print('Analysis %d: %s' % (i + 1, sonuc.formatLong()))
# #print(str(sonuclar))
# stemmed.insert(i,str(regex.findall(x)))
# else: