Source code for syconn.processing.learning_rfc

# -*- coding: utf-8 -*-
# SyConn - Synaptic connectivity inference toolkit
#
# Copyright (c) 2016 - now
# Max-Planck-Institute for Medical Research, Heidelberg, Germany
# Authors: Sven Dorkenwald, Philipp Schubert, Jörgen Kornfeld

import numpy as np
from sklearn.externals import joblib
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
import pylab as pl
import pandas as pd
import zipfile
import tempfile
from collections import Counter
import shutil
import os
import matplotlib.patches as patches
from sklearn.decomposition import PCA

rf_params = {'n_estimators': 2000, 'oob_score': True, 'n_jobs': -1,
             'class_weight': 'balanced', 'max_features': 'auto'}

svm_params = {'kernel': 'rbf', 'class_weight': 'balanced', 'probability': True,
              'C': 2.0, 'gamma': 0.3, 'cache_size': 2000}

ada_params = {'n_estimators': 100, 'learning_rate': 1.}

lr_params = {'class_weight': 'balanced', 'solver': 'lbfgs', 'penalty': 'l2',
             'multi_class': 'multinomial'}


[docs]def init_clf(clf_used, params=None):
    if params is not None:
        params_used = params
    elif clf_used == 'svm':
        params_used = svm_params
    elif clf_used == 'ada_boost':
        params_used = rf_params
    elif clf_used == 'lr':
        params_used = lr_params
    else:
        params_used = rf_params
    if clf_used == 'svm':
        clf = SVC(**params_used)
    elif clf_used == 'ada_boost':
        rf = RandomForestClassifier(**rf_params)
        clf = AdaBoostClassifier(base_estimator=rf, **params_used)
    elif clf_used == 'lr':
        clf = LogisticRegressionCV(**params_used)
    else:
        clf = RandomForestClassifier(**params_used)
    return clf


[docs]def load_rfcs(rf_axoness_p, rf_spiness_p):
    """Loads pickeled Random Forest Classifier for axoness and spiness. If path is
    not valid returns None

    Parameters
    ----------
    rf_axoness_p : str
        Path to pickeled axonnes rf directory
    rf_spiness_p : str
        Path to pickeled spiness rf directory

    Returns
    -------
        RFC axoness, spiness or None
    """
    if os.path.isfile(rf_axoness_p):
        rfc_axoness = joblib.load(rf_axoness_p)
        # print "Found RFC for axoness. SkeletonNodes will contain axoness " \
        #       "probability."
    else:
        rfc_axoness = None
        print "WARNING: Could not predict axoness of SkeletonNodes. " \
              "Pretrained RFC file not found."
    if os.path.isfile(rf_spiness_p):
        rfc_spiness = joblib.load(rf_spiness_p)
        # print "Found RFC for spiness. SkeletonNodes will contain spiness " \
        #       "probability."
    else:
        rfc_spiness = None
        print "WARNING: Could not predict spiness of SkeletonNodes. " \
              "Pretrained RFC file not found."
    return rfc_axoness, rfc_spiness


[docs]def save_train_clf(X, y, clf_used, dir_path, use_pca=False, params=None):
    """Train classifier specified by clf_used to dir_path. Train with features X
    and labels y

    Parameters
    ----------
    X : np.array
        features
    y : np.array
        labels
    clf_used : str
        'rf' or 'svm' for RandomForest or SupportVectorMachine, respectively
    dir_path : str
        directory where to save pkl files of clf
    use_pca : bool
        flag if pca should be performed
    params: dict
        parameters for classifier

    """
    if use_pca:
        old_dim = X.shape[1]
        pca = PCA(n_components=0.99)
        X = pca.fit_transform(X)
    clf = init_clf(clf_used, params=params)
    try:
        clf.fit(X, y)
    except ValueError:
        X = np.nan_to_num(X)
        clf.fit(X, y)
    clf_dir = dir_path + '%s/' % clf_used
    if os.path.exists(clf_dir):
        shutil.rmtree(clf_dir)
    # if clf_used == 'rf':
        # print "Random Forest oob score:", clf.oob_score_
    os.makedirs(clf_dir)
    joblib.dump(clf, clf_dir + '/%s.pkl' % clf_used)
    # print "%s-Classifier written to %s" % (clf_used, clf_dir)


[docs]def novel_multiclass_prediction(f_scores, thresholds, probs):
    pred = -1 * np.ones((len(probs), ))
    tprobs = np.array(probs > thresholds, dtype=np.int)
    for i in range(len(probs)):
        if np.sum(tprobs[i]) != 0:
            pred[i] = np.argmax(f_scores * tprobs[i])
    return pred


[docs]def fscore(rec, prec, beta=1.):
    """Calculates f-score with beta value

    Parameters
    ----------
    rec : np.array
        recall
    prec : np.array
        precision
    beta : float
        weighting of precision

    Returns
    -------
    np.array
        f-score
    """
    prec = np.array(prec)
    rec = np.array(rec)
    f_score = (1. + beta**2) * (prec * rec) / (beta**2 * prec + rec)
    return np.nan_to_num(f_score)


[docs]def plot_corr(x, y, title='', xr=[-1, -1], yr=[-1, -1], save_path=None, nbins=5,
              xlabel='Size x', ylabel='Size y'):
    from matplotlib import pyplot as plt
    fig, ax = plt.subplots()
    fig.patch.set_facecolor('white')
    ax.tick_params(axis='x', which='major', labelsize=16, direction='out',
                    length=8, width=2,  right="off", top="off", pad=10)
    ax.tick_params(axis='y', which='major', labelsize=16, direction='out',
                    length=8, width=2,  right="off", top="off", pad=10)

    ax.tick_params(axis='x', which='minor', labelsize=12, direction='out',
                    length=8, width=2, right="off", top="off", pad=10)
    ax.tick_params(axis='y', which='minor', labelsize=12, direction='out',
                    length=8, width=2, right="off", top="off", pad=10)

    ax.spines['left'].set_linewidth(2)
    ax.spines['bottom'].set_linewidth(2)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    if not -1 in xr:
        plt.xlim(xr)
    if not -1 in yr:
        plt.ylim(yr)
    plt.gcf().subplots_adjust(bottom=0.15)
    plt.gcf().subplots_adjust(left=0.15)
    plt.xlabel(xlabel, fontsize=18)
    plt.ylabel(ylabel, fontsize=18)
    ax.set_xscale("log")
    ax.set_yscale("log")

    plt.scatter(x, y, s=1, c="0.35")
    if save_path is None:
        plt.show(block=False)
    else:
        plt.savefig(save_path, dpi=600)


[docs]def plot_pr(precision, recall, title='', r=[0.67, 1.01], legend_labels=None,
            save_path=None, nbins=5, colorVals=None,
            xlabel='Recall', ylabel='Precision', l_pos="lower left",
            legend=True, r_x=[0.67, 1.01], ls=22):
    from matplotlib import pyplot as plt
    fig, ax = plt.subplots()
    fig.patch.set_facecolor('white')
    ax.tick_params(axis='x', which='major', labelsize=ls, direction='out',
                    length=4, width=3,  right="off", top="off", pad=10)
    ax.tick_params(axis='y', which='major', labelsize=ls, direction='out',
                    length=4, width=3,  right="off", top="off", pad=10)

    ax.tick_params(axis='x', which='minor', labelsize=ls, direction='out',
                    length=4, width=3, right="off", top="off", pad=10)
    ax.tick_params(axis='y', which='minor', labelsize=ls, direction='out',
                    length=4, width=3, right="off", top="off", pad=10)

    ax.spines['left'].set_linewidth(3)
    ax.spines['bottom'].set_linewidth(3)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    plt.locator_params(axis='x', nbins=nbins)
    plt.locator_params(axis='y', nbins=nbins)
    plt.title(title)
    if not -1 in r:
        plt.xlim(r_x)
        plt.ylim(r)

    plt.xlabel(xlabel, fontsize=ls)
    plt.ylabel(ylabel, fontsize=ls)

    plt.tight_layout()
    if isinstance(recall, list):
        if colorVals is None:
            colorVals = [[0.171, 0.485, 0.731, 1.],
                         [0.175, 0.585, 0.301, 1.],
                         [0.841, 0.138, 0.133, 1.]]
        if len(colorVals) < len(recall):
            colorVals += ["0.35"] * (len(recall) - len(colorVals))
        if len(colorVals) > len(recall):
            colorVals = ["0.35", "0.7"]
        if legend_labels is None:
            legend_labels = ["Mitochondria", "Vesicle Clouds", "Synaptic Junctions"]
        handles = []
        for ii in range(len(recall)):
            handles.append(patches.Patch(color=colorVals[ii], label=legend_labels[ii]))
            plt.plot(recall[ii], precision[ii], lw=4, c=colorVals[ii])
        if legend:
            plt.legend(handles=handles, loc=l_pos, frameon=False, prop={'size': ls})
    else:
        plt.plot(recall, precision, lw=4, c="0.35")
    if save_path is None:
        plt.show(block=False)
    else:
        plt.savefig(save_path, dpi=300)


[docs]def feature_importance(rf, save_path=None):
    """Plots feature importance of sklearn RandomForest

    Parameters
    ----------
    rf : RandomForestClassifier
    save_path : str
    """
    importances = rf.feature_importances_
    nb = len(importances)
    tree_imp = [tree.feature_importances_ for tree in rf.estimators_]
    # print "Print feature importance of rf with %d trees." % len(tree_imp)
    std = np.std(tree_imp, axis=0) / np.sqrt(len(tree_imp))
    indices = np.argsort(importances)[::-1]
    # Print the feature ranking
    # print("Feature ranking:")
    # for f in range(nb):
    #     print("%d. feature %d (%f)" %
    #           (f + 1, indices[f], importances[indices[f]]))

    # Plot the feature importances of the forest
    pl.figure()
    pl.title("Feature importances")
    pl.bar(range(nb), importances[indices],
           color="r", yerr=std[indices], align="center")
    pl.xticks(range(nb), indices)
    pl.xlim([-1, nb])
    if save_path is not None:
        pl.savefig(save_path)
    pl.close()


[docs]def write_feat2csv(fpath, feat_arr, feat_names=None):
    """Writes array with column names to csv file

    Parameters
    ----------
    fpath: str
        Path to file
    feat_arr : np.array
        feature array
    feat_names : list of str
        feature names
    """
    if feat_names is None or (len(feat_names) != feat_arr.shape[1]):
        df = pd.DataFrame(feat_arr)
        # print feat_arr.shape, len(feat_names)
    else:
        df = pd.DataFrame(feat_arr, columns=feat_names)
    df.to_csv(fpath)
    return


[docs]def load_csv2feat(fpath, prop='axoness'):
    """Load csvfile from kzip and return numpy array and list of header names.
    List line is supposed to be the probability prediction.

    Parameters
    ----------
    fpath : str
        Source file path
    prop : str
        property which should be loaded
    Returns
    -------
    np.array, list of str
        features, feature names
    """
    zf = zipfile.ZipFile(fpath, 'r')
    df = pd.DataFrame()
    for filename in ['%s_feat.csv' % prop]:
        temp = tempfile.TemporaryFile()
        temp.write(zf.read(filename))
        temp.seek(0)
        df = pd.DataFrame.from_csv(temp)
    return df.as_matrix(), df.columns.values.tolist()


[docs]def loo_proba(x, y, clf_used='rf', use_pca=False, params=None):
    """Perform leave-one-out

    Parameters
    ----------
    x : np.array
        features
    y : np.array
        labels
    clf_used : str
        classifier
    use_pca : bool
        perform principal component analysis on features x in advance
    params : dict
        parameter for classifier

    Returns
    -------
    np.array, np.array
        class probability, hard classification
    """
    # print "Performing LOO with %s and %d features. Using PCA: %s" % \
    #       (clf_used, x.shape[1], str(use_pca))
    if use_pca:
        old_dim = x.shape[1]
        pca = PCA(n_components=0.999)
        x = pca.fit_transform(x)
        # print pca.explained_variance_ratio_
        # print "Reduced feature space dimension %d, instead of %d" % (x.shape[1],
        #                                                              old_dim)
    nans_in_X = np.sum(np.isnan(x))
    if nans_in_X > 0:
        # print np.where(np.isnan(x))
        # print "Found %d nans in features, converting to number." % nans_in_X
        x = np.nan_to_num(x)
    loo = cross_validation.LeaveOneOut(len(x))
    shape = (len(x), len(list(set(y))))
    prob = np.zeros(shape, dtype=np.float)
    pred = np.zeros((len(x), 1), dtype=np.int)
    cnt = 0
    # print "rf params:", rf_params
    for train_ixs, test_ixs in loo:
        x_train = x[train_ixs]
        x_test = x[test_ixs]
        y_train = y[train_ixs]
        clf = init_clf(clf_used, params)
        clf.fit(x_train, y_train)
        prob[cnt] = clf.predict_proba(x_test)
        pred[cnt] = clf.predict(x_test)
        np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
        # if pred[cnt] == y[test_ixs]:
        #     print test_ixs, "\t", prob[cnt], pred[cnt], y[test_ixs]
        # else:
        #     print test_ixs, "\t", prob[cnt], pred[cnt], y[test_ixs], "\t WRONG"
        cnt += 1
    return prob, pred


[docs]def cell_classification(node_pred):
    """Perform majority vote

    Parameters
    ----------
    node_pred : np.array of int
        arbitrary array of integer

    Returns
    -------
    int
        maximum occurring integer in array
    """
    if len(node_pred) == 0:
        return np.zeros(0)
    counter = Counter(node_pred)
    return counter.most_common()[0][0]