Source code for syconn.processing.learning_rfc

# -*- coding: utf-8 -*-
# SyConn - Synaptic connectivity inference toolkit
#
# Copyright (c) 2016 - now
# Max-Planck-Institute for Medical Research, Heidelberg, Germany
# Authors: Sven Dorkenwald, Philipp Schubert, Jörgen Kornfeld

import numpy as np
from sklearn.externals import joblib
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
import pylab as pl
import pandas as pd
import zipfile
import tempfile
from collections import Counter
import shutil
import os
import matplotlib.patches as patches
from sklearn.decomposition import PCA

rf_params = {'n_estimators': 2000, 'oob_score': True, 'n_jobs': -1,
             'class_weight': 'balanced', 'max_features': 'auto'}

svm_params = {'kernel': 'rbf', 'class_weight': 'balanced', 'probability': True,
              'C': 2.0, 'gamma': 0.3, 'cache_size': 2000}

ada_params = {'n_estimators': 100, 'learning_rate': 1.}

lr_params = {'class_weight': 'balanced', 'solver': 'lbfgs', 'penalty': 'l2',
             'multi_class': 'multinomial'}


[docs]def init_clf(clf_used, params=None): if params is not None: params_used = params elif clf_used == 'svm': params_used = svm_params elif clf_used == 'ada_boost': params_used = rf_params elif clf_used == 'lr': params_used = lr_params else: params_used = rf_params if clf_used == 'svm': clf = SVC(**params_used) elif clf_used == 'ada_boost': rf = RandomForestClassifier(**rf_params) clf = AdaBoostClassifier(base_estimator=rf, **params_used) elif clf_used == 'lr': clf = LogisticRegressionCV(**params_used) else: clf = RandomForestClassifier(**params_used) return clf
[docs]def load_rfcs(rf_axoness_p, rf_spiness_p): """Loads pickeled Random Forest Classifier for axoness and spiness. If path is not valid returns None Parameters ---------- rf_axoness_p : str Path to pickeled axonnes rf directory rf_spiness_p : str Path to pickeled spiness rf directory Returns ------- RFC axoness, spiness or None """ if os.path.isfile(rf_axoness_p): rfc_axoness = joblib.load(rf_axoness_p) # print "Found RFC for axoness. SkeletonNodes will contain axoness " \ # "probability." else: rfc_axoness = None print "WARNING: Could not predict axoness of SkeletonNodes. " \ "Pretrained RFC file not found." if os.path.isfile(rf_spiness_p): rfc_spiness = joblib.load(rf_spiness_p) # print "Found RFC for spiness. SkeletonNodes will contain spiness " \ # "probability." else: rfc_spiness = None print "WARNING: Could not predict spiness of SkeletonNodes. " \ "Pretrained RFC file not found." return rfc_axoness, rfc_spiness
[docs]def save_train_clf(X, y, clf_used, dir_path, use_pca=False, params=None): """Train classifier specified by clf_used to dir_path. Train with features X and labels y Parameters ---------- X : np.array features y : np.array labels clf_used : str 'rf' or 'svm' for RandomForest or SupportVectorMachine, respectively dir_path : str directory where to save pkl files of clf use_pca : bool flag if pca should be performed params: dict parameters for classifier """ if use_pca: old_dim = X.shape[1] pca = PCA(n_components=0.99) X = pca.fit_transform(X) clf = init_clf(clf_used, params=params) try: clf.fit(X, y) except ValueError: X = np.nan_to_num(X) clf.fit(X, y) clf_dir = dir_path + '%s/' % clf_used if os.path.exists(clf_dir): shutil.rmtree(clf_dir) # if clf_used == 'rf': # print "Random Forest oob score:", clf.oob_score_ os.makedirs(clf_dir) joblib.dump(clf, clf_dir + '/%s.pkl' % clf_used)
# print "%s-Classifier written to %s" % (clf_used, clf_dir)
[docs]def novel_multiclass_prediction(f_scores, thresholds, probs): pred = -1 * np.ones((len(probs), )) tprobs = np.array(probs > thresholds, dtype=np.int) for i in range(len(probs)): if np.sum(tprobs[i]) != 0: pred[i] = np.argmax(f_scores * tprobs[i]) return pred
[docs]def fscore(rec, prec, beta=1.): """Calculates f-score with beta value Parameters ---------- rec : np.array recall prec : np.array precision beta : float weighting of precision Returns ------- np.array f-score """ prec = np.array(prec) rec = np.array(rec) f_score = (1. + beta**2) * (prec * rec) / (beta**2 * prec + rec) return np.nan_to_num(f_score)
[docs]def plot_corr(x, y, title='', xr=[-1, -1], yr=[-1, -1], save_path=None, nbins=5, xlabel='Size x', ylabel='Size y'): from matplotlib import pyplot as plt fig, ax = plt.subplots() fig.patch.set_facecolor('white') ax.tick_params(axis='x', which='major', labelsize=16, direction='out', length=8, width=2, right="off", top="off", pad=10) ax.tick_params(axis='y', which='major', labelsize=16, direction='out', length=8, width=2, right="off", top="off", pad=10) ax.tick_params(axis='x', which='minor', labelsize=12, direction='out', length=8, width=2, right="off", top="off", pad=10) ax.tick_params(axis='y', which='minor', labelsize=12, direction='out', length=8, width=2, right="off", top="off", pad=10) ax.spines['left'].set_linewidth(2) ax.spines['bottom'].set_linewidth(2) ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) if not -1 in xr: plt.xlim(xr) if not -1 in yr: plt.ylim(yr) plt.gcf().subplots_adjust(bottom=0.15) plt.gcf().subplots_adjust(left=0.15) plt.xlabel(xlabel, fontsize=18) plt.ylabel(ylabel, fontsize=18) ax.set_xscale("log") ax.set_yscale("log") plt.scatter(x, y, s=1, c="0.35") if save_path is None: plt.show(block=False) else: plt.savefig(save_path, dpi=600)
[docs]def plot_pr(precision, recall, title='', r=[0.67, 1.01], legend_labels=None, save_path=None, nbins=5, colorVals=None, xlabel='Recall', ylabel='Precision', l_pos="lower left", legend=True, r_x=[0.67, 1.01], ls=22): from matplotlib import pyplot as plt fig, ax = plt.subplots() fig.patch.set_facecolor('white') ax.tick_params(axis='x', which='major', labelsize=ls, direction='out', length=4, width=3, right="off", top="off", pad=10) ax.tick_params(axis='y', which='major', labelsize=ls, direction='out', length=4, width=3, right="off", top="off", pad=10) ax.tick_params(axis='x', which='minor', labelsize=ls, direction='out', length=4, width=3, right="off", top="off", pad=10) ax.tick_params(axis='y', which='minor', labelsize=ls, direction='out', length=4, width=3, right="off", top="off", pad=10) ax.spines['left'].set_linewidth(3) ax.spines['bottom'].set_linewidth(3) ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) plt.locator_params(axis='x', nbins=nbins) plt.locator_params(axis='y', nbins=nbins) plt.title(title) if not -1 in r: plt.xlim(r_x) plt.ylim(r) plt.xlabel(xlabel, fontsize=ls) plt.ylabel(ylabel, fontsize=ls) plt.tight_layout() if isinstance(recall, list): if colorVals is None: colorVals = [[0.171, 0.485, 0.731, 1.], [0.175, 0.585, 0.301, 1.], [0.841, 0.138, 0.133, 1.]] if len(colorVals) < len(recall): colorVals += ["0.35"] * (len(recall) - len(colorVals)) if len(colorVals) > len(recall): colorVals = ["0.35", "0.7"] if legend_labels is None: legend_labels = ["Mitochondria", "Vesicle Clouds", "Synaptic Junctions"] handles = [] for ii in range(len(recall)): handles.append(patches.Patch(color=colorVals[ii], label=legend_labels[ii])) plt.plot(recall[ii], precision[ii], lw=4, c=colorVals[ii]) if legend: plt.legend(handles=handles, loc=l_pos, frameon=False, prop={'size': ls}) else: plt.plot(recall, precision, lw=4, c="0.35") if save_path is None: plt.show(block=False) else: plt.savefig(save_path, dpi=300)
[docs]def feature_importance(rf, save_path=None): """Plots feature importance of sklearn RandomForest Parameters ---------- rf : RandomForestClassifier save_path : str """ importances = rf.feature_importances_ nb = len(importances) tree_imp = [tree.feature_importances_ for tree in rf.estimators_] # print "Print feature importance of rf with %d trees." % len(tree_imp) std = np.std(tree_imp, axis=0) / np.sqrt(len(tree_imp)) indices = np.argsort(importances)[::-1] # Print the feature ranking # print("Feature ranking:") # for f in range(nb): # print("%d. feature %d (%f)" % # (f + 1, indices[f], importances[indices[f]])) # Plot the feature importances of the forest pl.figure() pl.title("Feature importances") pl.bar(range(nb), importances[indices], color="r", yerr=std[indices], align="center") pl.xticks(range(nb), indices) pl.xlim([-1, nb]) if save_path is not None: pl.savefig(save_path) pl.close()
[docs]def write_feat2csv(fpath, feat_arr, feat_names=None): """Writes array with column names to csv file Parameters ---------- fpath: str Path to file feat_arr : np.array feature array feat_names : list of str feature names """ if feat_names is None or (len(feat_names) != feat_arr.shape[1]): df = pd.DataFrame(feat_arr) # print feat_arr.shape, len(feat_names) else: df = pd.DataFrame(feat_arr, columns=feat_names) df.to_csv(fpath) return
[docs]def load_csv2feat(fpath, prop='axoness'): """Load csvfile from kzip and return numpy array and list of header names. List line is supposed to be the probability prediction. Parameters ---------- fpath : str Source file path prop : str property which should be loaded Returns ------- np.array, list of str features, feature names """ zf = zipfile.ZipFile(fpath, 'r') df = pd.DataFrame() for filename in ['%s_feat.csv' % prop]: temp = tempfile.TemporaryFile() temp.write(zf.read(filename)) temp.seek(0) df = pd.DataFrame.from_csv(temp) return df.as_matrix(), df.columns.values.tolist()
[docs]def loo_proba(x, y, clf_used='rf', use_pca=False, params=None): """Perform leave-one-out Parameters ---------- x : np.array features y : np.array labels clf_used : str classifier use_pca : bool perform principal component analysis on features x in advance params : dict parameter for classifier Returns ------- np.array, np.array class probability, hard classification """ # print "Performing LOO with %s and %d features. Using PCA: %s" % \ # (clf_used, x.shape[1], str(use_pca)) if use_pca: old_dim = x.shape[1] pca = PCA(n_components=0.999) x = pca.fit_transform(x) # print pca.explained_variance_ratio_ # print "Reduced feature space dimension %d, instead of %d" % (x.shape[1], # old_dim) nans_in_X = np.sum(np.isnan(x)) if nans_in_X > 0: # print np.where(np.isnan(x)) # print "Found %d nans in features, converting to number." % nans_in_X x = np.nan_to_num(x) loo = cross_validation.LeaveOneOut(len(x)) shape = (len(x), len(list(set(y)))) prob = np.zeros(shape, dtype=np.float) pred = np.zeros((len(x), 1), dtype=np.int) cnt = 0 # print "rf params:", rf_params for train_ixs, test_ixs in loo: x_train = x[train_ixs] x_test = x[test_ixs] y_train = y[train_ixs] clf = init_clf(clf_used, params) clf.fit(x_train, y_train) prob[cnt] = clf.predict_proba(x_test) pred[cnt] = clf.predict(x_test) np.set_printoptions(formatter={'float': '{: 0.3f}'.format}) # if pred[cnt] == y[test_ixs]: # print test_ixs, "\t", prob[cnt], pred[cnt], y[test_ixs] # else: # print test_ixs, "\t", prob[cnt], pred[cnt], y[test_ixs], "\t WRONG" cnt += 1 return prob, pred
[docs]def cell_classification(node_pred): """Perform majority vote Parameters ---------- node_pred : np.array of int arbitrary array of integer Returns ------- int maximum occurring integer in array """ if len(node_pred) == 0: return np.zeros(0) counter = Counter(node_pred) return counter.most_common()[0][0]