K-means clustering function #22

Open · wants to merge 11 commits into base: master
7 changes: 6 additions & 1 deletion README.md
@@ -1,4 +1,4 @@
# AdaGram
# AdaGram

Adaptive Skip-gram (AdaGram) model is a nonparametric extension of famous Skip-gram model implemented in word2vec software which is able to learn multiple representations per word capturing different word meanings. This projects implements AdaGram in Julia language.

@@ -138,6 +138,11 @@ julia> disambiguate(vm, dict, "apple", split("fresh tasty breakfast"))
```
As one may see, model correctly estimated probabilities of each sense with quite large confidence. Vector corresponding to second prototype of word "apple" can be obtained from `vm.In[:, 2, dict.word2id["apple"]]` and then used as context-aware features of word "apple".
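
For example, a sense vector can be compared with other word vectors directly. A minimal sketch (the word "banana", its sense index, and the comparison itself are illustrative; both words are assumed to be in the dictionary):
```
julia> apple_fruit = vm.In[:, 2, dict.word2id["apple"]]
julia> banana_vec = vm.In[:, 1, dict.word2id["banana"]]
julia> dot(apple_fruit, banana_vec) / (norm(apple_fruit) * norm(banana_vec))  # cosine similarity
```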

A k-means clustering routine is provided to group words into a given number of clusters (default 100) using their embeddings. The algorithm is adapted from the clustering code included in word2vec. Because a word can have several meanings, its senses can (and in many cases should) be assigned to different clusters. For every word sense whose prior probability exceeds a given minimum (default 1e-3), the routine writes the word and the cluster it was assigned to.
```
julia> clustering(vm, dict, "clustering_output_file", 10; min_prob=1e-3)
```
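
The output file contains one line per retained word sense: the word followed by the index of its cluster (this is the format written by the `fprintf` call in `csrc/learn.c`). A minimal sketch for reading it back into Julia, assuming that two-column layout (the file name is a placeholder):
```
clusters = Dict{AbstractString, Vector{Int}}()   # word => cluster ids, one per retained sense
open("clustering_output_file") do f
    for line in eachline(f)
        word, cl = split(strip(line))
        push!(get!(clusters, word, Int[]), parse(Int, cl))
    end
end
```
Because a word appears once per retained sense, each key maps to a vector of cluster ids.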

Please refer to [API documentation](https://github.com/sbos/AdaGram.jl/wiki/API) for more detailed usage info.
## Future work
* Full API documentation
38 changes: 38 additions & 0 deletions classify.jl
@@ -0,0 +1,38 @@
push!(LOAD_PATH, "./src/")

using ArgParse

s = ArgParseSettings()

@add_arg_table s begin
"input"
help = "file where the word embeddings are saved"
arg_type = AbstractString
required = true
"output"
help = "file to save the clustering (in text format)"
arg_type = AbstractString
required = true
"--epochs"
help = "number of epochs to train"
arg_type = Int64
default = 1
"--k"
help = "number of clusters to use in k-means"
arg_type = Int64
default = 100
"--min-prob"
help = "lower threshold to include a meaning"
arg_type = Float64
default = 1e-2
end

args = parse_args(ARGS, s)

using AdaGram

print("Starting clustering...")

vm, dict = load_model(args["input"])
clustering(vm, dict, args["output"], args["k"]; min_prob=args["min-prob"])
println("Done!")
5 changes: 5 additions & 0 deletions classify.sh
@@ -0,0 +1,5 @@
#!/bin/bash

DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )

$DIR"/run.sh" $DIR"/classify.jl" "$@"
61 changes: 61 additions & 0 deletions csrc/learn.c
@@ -1,8 +1,11 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <stdint.h>
#include <float.h>

typedef float real; // Precision of float numbers

float sigmoid(float x) {
return 1 / (1 + exp(-x));
}
@@ -111,3 +114,61 @@ void update_z(float* In, float* Out,
}
}
}

// Runs K-means on the word vectors; taken from the word2vec clustering routine
void kmeans(char** words, float* syn0, int classes, int vocab_size,
int layer1_size, char* outputFile){

long a, b, c, d;
FILE *fo;
int clcn = classes, iter = 10, closeid;
int *centcn = (int *)malloc(classes * sizeof(int));
int *cl = (int *)calloc(vocab_size, sizeof(int));
real closev, x;
real *cent = (real *)calloc(classes * layer1_size, sizeof(real));

fo = fopen(outputFile, "wb");

// initialize arrays
for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;
for (a = 0; a < iter; a++) {
for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
for (b = 0; b < clcn; b++) centcn[b] = 1;
// calculate clusters' centers
for (c = 0; c < vocab_size; c++) {
for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
centcn[cl[c]]++;
}
for (b = 0; b < clcn; b++) {
closev = 0;
for (c = 0; c < layer1_size; c++) {
cent[layer1_size * b + c] /= centcn[b]; // averages
closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
}
closev = sqrt(closev);
for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev; // normalizes
}
// classify words in clusters
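// each word is reassigned to the centroid with the largest dot product; since the
// centroids are unit-normalized, this picks the nearest centroid by cosine similarity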
for (c = 0; c < vocab_size; c++) {
closev = -10;
closeid = 0;
for (d = 0; d < clcn; d++) {
x = 0;
for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
if (x > closev) {
closev = x;
closeid = d;
}
}
cl[c] = closeid;
}
}

// Save the K-means classes
for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", words[a], cl[a]);

free(centcn);
free(cent);
free(cl);
fclose(fo);
}
2 changes: 1 addition & 1 deletion src/gradient.jl
@@ -161,4 +161,4 @@ function inplace_train_vectors!(vm::VectorModel, dict::Dictionary, path::Abstrac
println("Learning complete $(words_read[1]) / $train_words")

return words_read[1]
end
end
36 changes: 36 additions & 0 deletions src/util.jl
@@ -258,7 +258,43 @@ function disambiguate{Ts <: AbstractString}(vm::VectorModel, dict::Dictionary, x
return disambiguate(vm, dict.word2id[x], Int32[dict.word2id[y] for y in context], use_prior, min_prob)
end


# Performs clustering with a K-means algorithm adapted from the word2vec
# clustering routine, but operating on the representation vector of each
# significant meaning of a word. A word can (and probably should)
# end up in different clusters, according to its different meanings.
function clustering(vm::VectorModel, dict::Dictionary, outputFile::AbstractString,
K::Integer=100; min_prob=1e-3)
wordVectors = Float32[]
words = AbstractString[]

# Builds arrays of words and their vectors
for w in 1:V(vm)
probVec = expected_pi(vm, w)
for iMeaning in 1:T(vm)
# ignores senses that do not reach min probability
if probVec[iMeaning] > min_prob
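# the word is added once per retained sense, so it may appear in several clusters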
push!(words, dict.id2word[w])
append!(wordVectors, vm.In[:, iMeaning, w])
end
end
end

# Calls the K-means routine implemented in C (csrc/learn.c)
ccall((:kmeans, "superlib"), Void,
(Ptr{Ptr{Cchar}}, Ptr{Float32},
Cint, Cint, Cint, Ptr{Cchar}),
words, wordVectors, K, length(words), M(vm), outputFile)

println("Finished clustering")
end

export nearest_neighbors
export disambiguate
export pi, write_extended
export cos_dist, preprocess, read_word2vec, write_word2vec
export load_model
export clustering, clarkClustering