#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# train-skl
import sys, os
import argparse, json, fileinput
import math, random
import numpy as np

from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn import cluster
from sklearn import mixture
from sklearn import ensemble
from sklearn import svm
from sklearn import discriminant_analysis

import pickle

from scipy import sparse


class color:
    """ANSI escape sequences used to style text written to a terminal."""

    # foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # resets every colour/attribute set before
    END = '\033[0m'


# General help text: used as the `description` of the top-level argument
# parser built in the __main__ section, so it is printed verbatim by --help.
general_help = """
Training various supervised and unsupervised algorithms from scikit-learn

SVM : http://scikit-learn.org/stable/modules/svm.html
Ensemble : http://scikit-learn.org/stable/modules/ensemble.html
Clustering : http://scikit-learn.org/stable/modules/clustering.html
Mixture : http://scikit-learn.org/stable/modules/mixture.html
Discriminant Analysis: http://scikit-learn.org/stable/modules/lda_qda.html
"""

# One-line description per supported estimator. The keys double as the
# scikit-learn class names from which the estimator objects are constructed,
# so every key must also appear in exactly one of the group lists below.
estimators_help = {
    # clustering
    'AffinityPropagation': 'Perform Affinity Propagation Clustering of data.',
    'AgglomerativeClustering': 'Agglomerative Clustering',
    'Birch': 'Implements the Birch clustering algorithm.',
    'DBSCAN': 'Perform DBSCAN clustering from vector array or distance matrix.',
    'FeatureAgglomeration': 'Agglomerate features.',
    'KMeans': 'K-Means clustering',
    'MiniBatchKMeans': 'Mini-Batch K-Means clustering',
    'MeanShift': 'Mean shift clustering using a flat kernel.',
    'SpectralClustering': 'Apply clustering to a projection to the normalized laplacian.',
    # mixture models
    'GaussianMixture': 'Gaussian Mixture Model',
    'BayesianGaussianMixture': 'Variational Bayesian estimation of a Gaussian mixture.',
    # ensemble methods
    'RandomForestClassifier': 'A random forest classifier.',
    # support vector machines
    'LinearSVC': 'Linear Support Vector Classification.',
    'NuSVC': 'Nu-Support Vector Classification.',
    'SVC': 'C-Support Vector Classification.',
    # discriminant analysis
    'LinearDiscriminantAnalysis': 'A classifier with a linear decision boundary, generated by fitting class conditional densities to the data and using Bayes’ rule.',
    'QuadraticDiscriminantAnalysis': 'A classifier with a quadratic decision boundary, generated by fitting class conditional densities to the data and using Bayes’ rule.',
}

# Estimator groups: each list names the estimators taken from one sklearn module.
cluster_estimators = [
    'AffinityPropagation',
    'AgglomerativeClustering',
    'Birch',
    'DBSCAN',
    'FeatureAgglomeration',
    'KMeans',
    'MiniBatchKMeans',
    'MeanShift',
    'SpectralClustering',
]
mixture_estimators = [
    'GaussianMixture',
    'BayesianGaussianMixture',
]
ensemble_estimators = [
    'RandomForestClassifier',
]
svm_estimators = [
    'LinearSVC',
    'NuSVC',
    'SVC',
]
xda_estimators = [
    'LinearDiscriminantAnalysis',
    'QuadraticDiscriminantAnalysis',
]


# Build the estimator overview used as the description of the estimator
# sub-commands: an intro line, then one titled section per estimator group
# with a tab-indented "NAME: description" line for every estimator in it.
estimator_help_list = "These are the available estimators. Try [train-skl ESTIMATOR -h] for optional arguments and a parameter listing per estimator.\n"

# (section title, estimator names) in the order the sections are printed
_help_sections = (
    ("CLUSTERING", cluster_estimators),
    ("MIXTURE MODELS", mixture_estimators),
    ("SUPPORT VECTOR MACHINES", svm_estimators),
    ("ENSEMBLE METHODS", ensemble_estimators),
    ("DISCRIMINANT ANALYSIS", xda_estimators),
)

for _section_title, _section_names in _help_sections:
    estimator_help_list += "\n" + _section_title + "\n"
    estimator_help_list += "".join(
        '\t' + color.BOLD + _name + color.END + ": " + estimators_help[_name] + '\n'
        for _name in _section_names
    )


# Both registries are keyed by estimator name and filled by get_estimators().
estimators_params = {}  # estimator name -> default parameters (get_params())
estimators = {}  # estimator name -> estimator instance built with defaults

# fill estimator dicts
def get_estimators():
    """Instantiate every estimator named in `estimators_help`.

    For each name the sklearn module is chosen by the group list the name
    belongs to, the class of that name is looked up in the module and
    constructed without arguments. The instance is stored in `estimators`
    and its `get_params()` result in `estimators_params`.

    Writes a message to stderr and exits with status -1 when a name is in
    none of the group lists or the module has no class of that name.
    """
    # group list -> sklearn module that provides the classes of that group
    group_modules = (
        (cluster_estimators, cluster),
        (mixture_estimators, mixture),
        (ensemble_estimators, ensemble),
        (svm_estimators, svm),
        (xda_estimators, discriminant_analysis),
    )

    for name in estimators_help:
        # first group containing the name decides the module
        owner = None
        for group, candidate in group_modules:
            if name in group:
                owner = candidate
                break

        if owner is None:
            sys.stderr.write("Estimator " + name + " not implemented!\n")
            sys.exit(-1)

        try:
            estimator_class = getattr(owner, name)
        except AttributeError:
            sys.stderr.write("train-skl: " + owner.__name__ + " does not implement estimator " + name + '\n')
            sys.exit(-1)

        instance = estimator_class()
        estimators[name] = instance
        estimators_params[name] = instance.get_params()


# load stream
def load(stream, trainset):
    """Read labelled samples and optionally split them into train and test set.

    Every line that is neither empty nor starts with '#' must contain a label
    followed by at least one whitespace-separated numeric feature, and all
    lines must have the same number of features.

    Args:
        stream: file name (or list of names) passed to `fileinput.input`;
            '-' reads standard input.
        trainset: split selector given as a string:
            - number <= 0, or exactly 1: no split
            - number in (0, 1): random stratified split; per label roughly
              this share of the samples stays in the train set, the rest
              becomes the test set
            - k.x (> 1): k-fold split into k consecutive folds (k <= 9),
              fold x (1..k) becomes the test set
            - anything that is not a number: used as the sample file instead
              of `stream`, no split

    Returns:
        (train_labels, train_features, test_labels, test_features): labels
        are lists of strings, features are numpy arrays built from lists of
        float rows. Both sets keep the samples in input order.

    Exits with status -1 (message on stderr) on an invalid fold selection,
    a line without features or a varying number of features.
    """
    ratio, fold, folds = 0, 0, 0

    # get trainset modes
    # TODO: support k greater 9, limited because the fold selector is a single decimal digit -> round(0.9*10)
    try:
        ratio = float(trainset)
        frac_part, int_part = math.modf(ratio)
        folds = int(int_part)
        fold = round(frac_part * 10)
    except ValueError:
        # not a (finite) number: trainset names the sample source, no split
        stream = trainset
        ratio = 0

    # special case for a ratio of 100%
    if ratio == 1:
        ratio = 0

    # sanity checks on the k-fold selector (only ratios > 1 select folds)
    if ratio > 1:
        if not 1 <= fold <= folds:
            sys.stderr.write("train-skl: fold number (" + str(fold) + ") must be greater than 0 and less than or equal the number of folds (" + str(folds) + ")\n")
            sys.exit(-1)

        if folds > 9:
            sys.stderr.write("train-skl: k > 9 not supported currently\n")
            sys.exit(-1)

    labels = []
    features = []
    num_features = 0

    # the former `bufsize` argument was removed from fileinput in Python 3.8;
    # the context manager closes the input even when we exit early
    with fileinput.input(stream) as lines:
        for line in lines:
            line = line.strip()

            # skip empty and comment lines
            if not line or line[0] == '#':
                continue

            fields = line.split()
            if len(fields) < 2:
                sys.stderr.write("train-skl: no features?\n")
                sys.exit(-1)
            values = fields[1:]

            # make sure the number of data fields is always the same
            if not num_features:
                num_features = len(values)
            elif len(values) != num_features:
                sys.stderr.write("train-skl: incorrect number of features:" + str(len(values)) + "!=" + str(num_features) + '\n')
                sys.exit(-1)

            labels.append(fields[0])
            features.append([float(x) for x in values])

    # select the indices of the samples that go to the test set
    if 0 < ratio < 1:
        # random stratified split: shuffle the sample indices of every label
        # separately and take the leading (1 - ratio) share of each stratum
        strata = {}
        for index, label in enumerate(labels):
            strata.setdefault(label, []).append(index)

        test_indices = []
        for members in strata.values():
            random.shuffle(members)
            limit = len(members) * (1 - ratio)
            test_indices.extend(i for pos, i in enumerate(members) if pos < limit)
    elif ratio > 1:
        # k-fold split into consecutive folds; the last fold also takes the
        # remainder left over by the integer division
        samples_per_fold = len(labels) // folds
        first = (fold - 1) * samples_per_fold
        last = len(labels) if fold == folds else first + samples_per_fold
        test_indices = range(first, last)
    else:
        test_indices = ()  # no split

    # partition in one pass, both sets keep the input order
    selected = set(test_indices)
    train_labels, train_features_l = [], []
    test_labels, test_features_l = [], []
    for index, (label, row) in enumerate(zip(labels, features)):
        if index in selected:
            test_labels.append(label)
            test_features_l.append(row)
        else:
            train_labels.append(label)
            train_features_l.append(row)

    return train_labels, np.array(train_features_l), test_labels, np.array(test_features_l)


# output graphs showing the results of the last fitting of the given estimator
# TODO: experimental at best, needs to be extended
def graph_result(estimator, labels, features):
    """Plot the first three feature columns together with the cluster centers.

    Prints a short summary (sample count, unique labels, feature count,
    estimator) and then shows one figure with the X-Y, X-Z and Y-Z
    projections plus a 3-D view. Samples are coloured by label, the
    estimator's `cluster_centers_` are drawn as red crosses.

    Args:
        estimator: fitted estimator; only estimators exposing
            `cluster_centers_` can be graphed.
        labels: one label per sample, used only for colouring.
        features: samples x features array supporting `features[:, i]`
            indexing (a numpy array in this script) with >= 3 columns.

    Exits with status -1 (message on stderr) when there is nothing to plot
    in three dimensions or the estimator has no `cluster_centers_`.
    """
    # the plots below index columns 0..2, so fail early with a clear message
    if len(features) == 0 or len(features[0]) < 3:
        sys.stderr.write("train-skl: graphing needs at least one sample with three features\n")
        sys.exit(-1)

    # map every label to the position of its first appearance (colour index)
    label_index = {}
    ilabels = [label_index.setdefault(label, len(label_index)) for label in labels]

    print("samples:", len(labels))
    print("uniques:", len(label_index))
    print("features:", len(features[0]))
    print("estimator:", estimator)

    try:
        centers = estimator.cluster_centers_
    except AttributeError:
        sys.stderr.write("train-skl: sorry, graphing for the selected estimator is not supported yet\n")
        sys.exit(-1)

    fig = plt.figure(1, figsize=(20,12))
    fig.clf()
    # FigureCanvas.set_window_title was removed in Matplotlib 3.6, the
    # figure manager carries the window title
    fig.canvas.manager.set_window_title('train-skl: ' + str(estimator).partition("(")[0])

    cmap = plt.get_cmap('gist_rainbow')
    axis_names = ('X', 'Y', 'Z')

    # 2-D projections in the first row of the 2x3 grid
    for slot, (a, b) in enumerate(((0, 1), (0, 2), (1, 2)), start=1):
        ax = fig.add_subplot(2, 3, slot)
        ax.scatter(features[:,a], features[:,b], c=ilabels, cmap=cmap)
        ax.scatter(centers[:,a], centers[:,b], marker='x', color='r', s=150, linewidths=2)
        ax.set_title(axis_names[a] + '-' + axis_names[b])
        ax.set_xlabel(axis_names[a])
        ax.set_ylabel(axis_names[b])

    # 3-D view centred in the second row
    ax = fig.add_subplot(2,3,5, projection='3d')
    ax.scatter(features[:,0], features[:,1], features[:,2], c=ilabels, cmap=cmap)
    ax.scatter(centers[:,0], centers[:,1], centers[:,2], marker='x', color='r', s=150, linewidths=2)
    ax.set_title('X-Y-Z')
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')

    plt.show()


# helper to print dictionaries somewhat readable
def print_dict(header, d, *args):
    """Print `header`, then the entries of `d` as an aligned two-column list.

    Entries are printed sorted by key; keys are left-aligned and padded to
    the length of the longest key. Every extra positional argument is
    printed on its own line afterwards.

    Args:
        header: line printed first.
        d: dictionary with string keys; may be empty.
        *args: additional objects printed after the entries.
    """
    # default=0 keeps an empty dictionary from raising in max()
    pad_text = max((len(key) for key in d), default=0)
    print(header)
    for key in sorted(d):
        print("  ", '{:<{}}'.format(key, pad_text), "   ", d[key], sep='')
    for extra in args:
        print(extra)

# helper to make a custom args compatibility string into a dictionary-able string
# compatibility string: {key1:value1:key2...}
def custom_args_compat(s):
    """Turn a `{key1:value1:key2:value2}` string into a JSON object string.

    The text between the first and the last character (normally the braces)
    is split on ':' into alternating keys and values; every key and value
    becomes a JSON string. Keys and values therefore cannot contain ':'.

    Args:
        s: compatibility string, e.g. '{linkage:ward:affinity:euclidean}'.

    Returns:
        The JSON text, e.g. '{"linkage":"ward","affinity":"euclidean"}',
        with the first and last character of `s` kept as delimiters.

    Exits with status -1 (message on stderr) when the number of
    colon-separated entries is odd.
    """
    fields = s.split(':')

    # check if even number of entries
    if len(fields) % 2 != 0:
        sys.stderr.write("Number of dictionary entries must be even!\n")
        sys.exit(-1)

    # the enclosing characters are not part of the first key / last value
    opening, closing = s[0], s[-1]
    fields[0] = fields[0][1:]
    fields[-1] = fields[-1][:-1]

    # json.dumps quotes AND escapes, so quotes or backslashes inside a key
    # or value still yield valid JSON; ensure_ascii=False keeps other
    # characters as they were given
    pairs = zip(fields[0::2], fields[1::2])
    body = ','.join(
        json.dumps(key, ensure_ascii=False) + ':' + json.dumps(value, ensure_ascii=False)
        for key, value in pairs
    )
    return opening + body + closing


def general_args(parser):
    """Add the options shared by every estimator sub-command to `parser`.

    Registers the optional positional SAMPLES argument plus the switches
    for graphing, prediction output, model file, train/test split,
    benchmarking and custom estimator arguments.
    """
    add = parser.add_argument

    # sample source, '-' is the default
    add(
        'samples',
        metavar='SAMPLES',
        type=str,
        nargs='?',
        default='-',
        help="sample stream, format: [label] [[features]]\n",
    )

    # output selection
    add(
        '-g', '--graph',
        action='store_true',
        help='graph results of estimator fitting\n',
    )
    add(
        '-p', '--prediction',
        action='store_true',
        help='output the prediction for the test set, or if no test set specified, the train set\n',
    )
    add(
        '-f', '--file',
        type=argparse.FileType('wb'),
        help='save the estimator model to the file\n',
    )

    # train/test split selection
    add(
        '-n', '--trainset',
        type=str,
        default='0',
        help=(
            'training/test set customization:\n'
            '- <= 0: no split\n'
            '- < 1: random stratified split, rest is test set\n'
            '- > 1: k-fold split, selected fold is test set (k.x, [folds].[select])\n'
            '- file: use this file as sample source, no split\n'
        ),
    )

    add(
        '-b', '--benchmark',
        type=str,
        help='benchmark the fit/predict process, and output result to specified file\n',
    )

    # custom estimator arguments
    add(
        '--custom-args',
        metavar='dict',
        type=str,
        help='custom trainer arguments\n',
    )
    add(
        '--custom-args-compat',
        action='store_true',
        help=(
            'compatibility mode for --custom-args, e.g. when used with GNU parallel\n'
            'keys and values are strings only! format: {key1:value1:key2...}\n'
        ),
    )

# helper to read a boolean estimator parameter from the command line
def parse_bool_arg(text):
    """Convert a command line string to a bool.

    `bool` itself is useless as an argparse converter because every
    non-empty string (including "False") is truthy.

    Accepts true/false, yes/no and 1/0 in any letter case; an empty string
    counts as False. Raises argparse.ArgumentTypeError for anything else.
    """
    value = text.strip().lower()
    if value in ('true', 'yes', '1'):
        return True
    if value in ('false', 'no', '0', ''):
        return False
    raise argparse.ArgumentTypeError("expected true or false, got '" + text + "'")


# helper to read an estimator parameter whose default is None
def parse_literal_arg(text):
    """Convert a command line string to a Python literal where possible.

    A None default does not reveal the expected type, so numbers, None,
    booleans, tuples etc. are parsed as Python literals and everything else
    (e.g. 'auto') is returned unchanged as a string.
    """
    import ast
    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError):
        return text


# helper to choose the argparse converter for an estimator parameter
def param_converter(default):
    """Return the argparse `type=` callable matching a parameter default."""
    if isinstance(default, bool):
        return parse_bool_arg
    if default is None:
        # type(None) cannot be called with an argument
        return parse_literal_arg
    return type(default)


if __name__=="__main__":
    get_estimators()

    class Formatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawTextHelpFormatter): pass
    cmdline = argparse.ArgumentParser(description=general_help, epilog="Default output: estimator model dump", formatter_class=Formatter)

    # create subparsers for each estimator
    est_cmdline = cmdline.add_subparsers(title='estimator', dest='estimator', description=estimator_help_list, metavar='')
    est_cmdline_subs = dict()
    for e in sorted(estimators_help.keys()):
        est_cmdline_subs[e] = est_cmdline.add_parser(e, description=estimators_help[e], formatter_class=Formatter)
        general_args(est_cmdline_subs[e])
        # fill arguments with available parameters, one option per estimator
        # parameter with the estimator's own default value
        group = est_cmdline_subs[e].add_argument_group('estimator parameters')
        for a in sorted(estimators_params[e].keys()):
            default = estimators_params[e][a]
            group.add_argument('--'+a, type=param_converter(default), default=default, metavar=type(default).__name__, help=' ')

    args = cmdline.parse_args()

    if args.estimator is None:
        print('no estimator given')
        cmdline.print_help()
        sys.exit()

    # load sample stream and fill label and feature containers
    train_labels, train_features, test_labels, test_features = load(args.samples, args.trainset)

    # get the estimator instance; supervised estimators are fitted with labels
    estimator = estimators[args.estimator]
    supervised = args.estimator in ensemble_estimators + svm_estimators + xda_estimators

    # set the estimators parameters from cli
    params = dict(vars(args))
    # filter other cli arguments, what remains are the estimator parameters
    del params['estimator'], params['samples'], \
        params['graph'], params['prediction'], \
        params['file'], params['trainset'], \
        params['benchmark'], \
        params['custom_args'], params['custom_args_compat']

    # set custom estimator args if any
    # NOTE: only the "connectivity" entry is evaluated, other entries are ignored
    if args.custom_args:
        # compatibility mode
        if args.custom_args_compat: args.custom_args = custom_args_compat(args.custom_args)
        # use json to transform dict-like string to actual dict
        custom_args = json.loads(args.custom_args)
        # entry: "connectivity":"temp_seq"
        if 'connectivity' in custom_args and custom_args['connectivity'] == 'temp_seq':
            # temporal sequence: each sample connected with its predecessor and successor
            temp_seq = sparse.diags([1,1], [-1,1], shape=(len(train_features), len(train_features)))
            params['connectivity'] = temp_seq

    # set the estimator parameters
    estimator.set_params(**params)

    if args.benchmark: bench_file = open(args.benchmark, "w")

    # fit model
    try:
        if args.benchmark: start = datetime.now()
        if supervised: estimator.fit(train_features, train_labels)
        else: estimator.fit(train_features)
        if args.benchmark: print("fit::{}".format(datetime.now() - start), file=bench_file)
    except ValueError as ex:
        sys.stderr.write("train-skl: ValueError when fitting estimator! (NAN in the data?)\n")
        sys.stderr.write(str(ex) + '\n')
        sys.exit(-1)

    # graph
    if args.graph:
        graph_result(estimator, train_labels, train_features)
        sys.exit()

    # immediately print predictions, don't dump model
    if args.prediction:
        if args.benchmark: start = datetime.now()
        # if no test samples, predict on all train samples
        if len(test_labels) == 0:
            # there are two predict functions in skl :/
            # estimators without fit_predict raise AttributeError and fall back to predict
            try:
                pred = estimator.fit_predict(train_features)
                if args.benchmark: print("fit_predict::{}".format(datetime.now() - start), file=bench_file)
            except AttributeError:
                pred = estimator.predict(train_features)
                if args.benchmark: print("predict::{}".format(datetime.now() - start), file=bench_file)
            for label, predicted in zip(train_labels, pred): print(label, predicted, sep='\t')
        # if test set, predict on test samples only
        else:
            pred = estimator.predict(test_features)
            if args.benchmark: print("predict::{}".format(datetime.now() - start), file=bench_file)
            for label, predicted in zip(test_labels, pred): print(label, predicted, sep='\t')
        if args.benchmark: bench_file.close()
        sys.exit()

    if args.benchmark: bench_file.close()

    if args.file:
        # dump into file
        pickle.dump(estimator, args.file)
    else:
        # dump to stdout
        # NOTE(review): __sizeof__() is the in-memory size of the bytes object
        # (payload plus object header), not the payload length; that is
        # presumably the inaccuracy mentioned in the TODO. Left unchanged
        # because the reading side is not visible here.
        print(pickle.dumps(estimator).__sizeof__()) # size of the object, to be read by predict TODO: 33 Bytes inaccurate (why?)
        print(pickle.dumps(estimator).decode('latin-1'), flush=True)

    # if there is a test set, print it after the estimator dump
    if len(test_labels) > 0:
        print()
        print()
        for label, row in zip(test_labels, test_features):
            print(label, end='\t')
            for feature in row:
                print(feature, end='\t')
            print()