-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTest Word2Vec.py
75 lines (57 loc) · 2.2 KB
/
Test Word2Vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 20 11:18:03 2017
@author: Satyarth Vaidya
"""
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt
# Brain-vs-body regression demo: load the fixed-width table, fit a straight
# line that predicts the 'Body' column from the 'Brain' column, and draw the
# fitted line on top of a scatter plot of the raw rows.
dataframe = pd.read_fwf('/home/Downloads/linear_regression_demo-master/brain_body.txt')
# Double brackets select one-column DataFrames rather than Series.
x_values, y_values = dataframe[['Brain']], dataframe[['Body']]
# fit() hands back the estimator itself, so construction and training chain.
body_reg = linear_model.LinearRegression().fit(x_values, y_values)
plt.scatter(x_values, y_values)
plt.plot(x_values, body_reg.predict(x_values))
plt.show()
import numpy as np
# Scratch expression: a random 3x2 array shifted by -5. The value is not bound
# to any name, so it is discarded when the file runs as a script.
np.random.random_sample((3,2))-5
from gensim import models
# NOTE: two earlier attempts at creating `model` used to sit here and were
# removed because neither could run: the first called an undefined `word2vec`
# on an undefined `sentences`, the second read its path from an undefined
# `args` object. Each raised NameError, and both results would have been
# overwritten by the load below in any case.
# Path of the question file (one question per line); presumably the `src`
# argument for test_mostsimilar -- confirm against the caller.
src = '/home/firsttest.text'
# Only `gensim.models` is imported (as `models`); the bare name `gensim` is
# never bound in this file, so the loader is reached through `models`.
model = models.KeyedVectors.load_word2vec_format('/home/Downloads/german (2).model', binary=True)
# Not referenced anywhere else in the visible file.
out_lis = []
def test_mostsimilar(model, src, label='most similar', topn=10, query='Hallo'):
    """Print the nearest neighbours of ``query`` once per answerable question.

    ``src`` is read as a text file with one whitespace-separated question per
    line. For every line whose words are all contained in
    ``model.index2word`` the function calls
    ``model.most_similar(positive=query, topn=topn)`` and prints the result.
    The question only gates the lookup: its words are not part of the query.
    A line without any words passes the vocabulary check vacuously.

    The analogy scoring that used to follow the lookup (best-match and top-n
    accuracy plus coverage) had been disabled inside a string literal and
    nothing consumed its counters, so it was dropped together with them.

    Args:
        model: Object exposing ``index2word`` (an iterable of the vocabulary
            words) and ``most_similar(positive=..., topn=...)`` -- presumably
            a gensim ``KeyedVectors`` instance; confirm against the caller.
        src: Path of the question file.
        label: Unused; kept so existing callers keep working.
        topn: Number of neighbours requested from ``most_similar``.
        query: Word passed as ``positive``. Defaults to ``'Hallo'``, the
            value that was previously hard-coded.

    Returns:
        None. Results are written to standard output.

    Raises:
        OSError: If ``src`` cannot be opened.
    """
    # Read the file once, inside a context manager. It used to be opened a
    # second time just to count lines, and that handle was never closed.
    with open(src) as question_file:
        questions = [line.strip() for line in question_file]

    # Build the lookup set once: testing membership directly against a
    # list-like ``index2word`` is a linear scan per word of every question.
    vocabulary = set(model.index2word)

    for question in questions:
        words = question.split()
        if all(word in vocabulary for word in words):
            print(model.most_similar(positive=query, topn=topn))


# The name matches pytest's default ``test_*`` pattern, but the function needs
# positional arguments; opt out so it is never collected as a test case.
test_mostsimilar.__test__ = False