# -*- coding: utf-8 -*-
# SyConn - Synaptic connectivity inference toolkit
#
# Copyright (c) 2016 - now
# Max-Planck-Institute for Medical Research, Heidelberg, Germany
# Authors: Sven Dorkenwald, Philipp Schubert, Jörgen Kornfeld
import numpy as np
from sklearn.externals import joblib
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
import pylab as pl
import pandas as pd
import zipfile
import tempfile
from collections import Counter
import shutil
import os
import matplotlib.patches as patches
from sklearn.decomposition import PCA
rf_params = {'n_estimators': 2000, 'oob_score': True, 'n_jobs': -1,
'class_weight': 'balanced', 'max_features': 'auto'}
svm_params = {'kernel': 'rbf', 'class_weight': 'balanced', 'probability': True,
'C': 2.0, 'gamma': 0.3, 'cache_size': 2000}
ada_params = {'n_estimators': 100, 'learning_rate': 1.}
lr_params = {'class_weight': 'balanced', 'solver': 'lbfgs', 'penalty': 'l2',
'multi_class': 'multinomial'}
[docs]def init_clf(clf_used, params=None):
if params is not None:
params_used = params
elif clf_used == 'svm':
params_used = svm_params
elif clf_used == 'ada_boost':
params_used = rf_params
elif clf_used == 'lr':
params_used = lr_params
else:
params_used = rf_params
if clf_used == 'svm':
clf = SVC(**params_used)
elif clf_used == 'ada_boost':
rf = RandomForestClassifier(**rf_params)
clf = AdaBoostClassifier(base_estimator=rf, **params_used)
elif clf_used == 'lr':
clf = LogisticRegressionCV(**params_used)
else:
clf = RandomForestClassifier(**params_used)
return clf
[docs]def load_rfcs(rf_axoness_p, rf_spiness_p):
"""Loads pickeled Random Forest Classifier for axoness and spiness. If path is
not valid returns None
Parameters
----------
rf_axoness_p : str
Path to pickeled axonnes rf directory
rf_spiness_p : str
Path to pickeled spiness rf directory
Returns
-------
RFC axoness, spiness or None
"""
if os.path.isfile(rf_axoness_p):
rfc_axoness = joblib.load(rf_axoness_p)
# print "Found RFC for axoness. SkeletonNodes will contain axoness " \
# "probability."
else:
rfc_axoness = None
print "WARNING: Could not predict axoness of SkeletonNodes. " \
"Pretrained RFC file not found."
if os.path.isfile(rf_spiness_p):
rfc_spiness = joblib.load(rf_spiness_p)
# print "Found RFC for spiness. SkeletonNodes will contain spiness " \
# "probability."
else:
rfc_spiness = None
print "WARNING: Could not predict spiness of SkeletonNodes. " \
"Pretrained RFC file not found."
return rfc_axoness, rfc_spiness
[docs]def save_train_clf(X, y, clf_used, dir_path, use_pca=False, params=None):
"""Train classifier specified by clf_used to dir_path. Train with features X
and labels y
Parameters
----------
X : np.array
features
y : np.array
labels
clf_used : str
'rf' or 'svm' for RandomForest or SupportVectorMachine, respectively
dir_path : str
directory where to save pkl files of clf
use_pca : bool
flag if pca should be performed
params: dict
parameters for classifier
"""
if use_pca:
old_dim = X.shape[1]
pca = PCA(n_components=0.99)
X = pca.fit_transform(X)
clf = init_clf(clf_used, params=params)
try:
clf.fit(X, y)
except ValueError:
X = np.nan_to_num(X)
clf.fit(X, y)
clf_dir = dir_path + '%s/' % clf_used
if os.path.exists(clf_dir):
shutil.rmtree(clf_dir)
# if clf_used == 'rf':
# print "Random Forest oob score:", clf.oob_score_
os.makedirs(clf_dir)
joblib.dump(clf, clf_dir + '/%s.pkl' % clf_used)
# print "%s-Classifier written to %s" % (clf_used, clf_dir)
[docs]def novel_multiclass_prediction(f_scores, thresholds, probs):
pred = -1 * np.ones((len(probs), ))
tprobs = np.array(probs > thresholds, dtype=np.int)
for i in range(len(probs)):
if np.sum(tprobs[i]) != 0:
pred[i] = np.argmax(f_scores * tprobs[i])
return pred
[docs]def fscore(rec, prec, beta=1.):
"""Calculates f-score with beta value
Parameters
----------
rec : np.array
recall
prec : np.array
precision
beta : float
weighting of precision
Returns
-------
np.array
f-score
"""
prec = np.array(prec)
rec = np.array(rec)
f_score = (1. + beta**2) * (prec * rec) / (beta**2 * prec + rec)
return np.nan_to_num(f_score)
[docs]def plot_corr(x, y, title='', xr=[-1, -1], yr=[-1, -1], save_path=None, nbins=5,
xlabel='Size x', ylabel='Size y'):
from matplotlib import pyplot as plt
fig, ax = plt.subplots()
fig.patch.set_facecolor('white')
ax.tick_params(axis='x', which='major', labelsize=16, direction='out',
length=8, width=2, right="off", top="off", pad=10)
ax.tick_params(axis='y', which='major', labelsize=16, direction='out',
length=8, width=2, right="off", top="off", pad=10)
ax.tick_params(axis='x', which='minor', labelsize=12, direction='out',
length=8, width=2, right="off", top="off", pad=10)
ax.tick_params(axis='y', which='minor', labelsize=12, direction='out',
length=8, width=2, right="off", top="off", pad=10)
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
if not -1 in xr:
plt.xlim(xr)
if not -1 in yr:
plt.ylim(yr)
plt.gcf().subplots_adjust(bottom=0.15)
plt.gcf().subplots_adjust(left=0.15)
plt.xlabel(xlabel, fontsize=18)
plt.ylabel(ylabel, fontsize=18)
ax.set_xscale("log")
ax.set_yscale("log")
plt.scatter(x, y, s=1, c="0.35")
if save_path is None:
plt.show(block=False)
else:
plt.savefig(save_path, dpi=600)
[docs]def plot_pr(precision, recall, title='', r=[0.67, 1.01], legend_labels=None,
save_path=None, nbins=5, colorVals=None,
xlabel='Recall', ylabel='Precision', l_pos="lower left",
legend=True, r_x=[0.67, 1.01], ls=22):
from matplotlib import pyplot as plt
fig, ax = plt.subplots()
fig.patch.set_facecolor('white')
ax.tick_params(axis='x', which='major', labelsize=ls, direction='out',
length=4, width=3, right="off", top="off", pad=10)
ax.tick_params(axis='y', which='major', labelsize=ls, direction='out',
length=4, width=3, right="off", top="off", pad=10)
ax.tick_params(axis='x', which='minor', labelsize=ls, direction='out',
length=4, width=3, right="off", top="off", pad=10)
ax.tick_params(axis='y', which='minor', labelsize=ls, direction='out',
length=4, width=3, right="off", top="off", pad=10)
ax.spines['left'].set_linewidth(3)
ax.spines['bottom'].set_linewidth(3)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.locator_params(axis='x', nbins=nbins)
plt.locator_params(axis='y', nbins=nbins)
plt.title(title)
if not -1 in r:
plt.xlim(r_x)
plt.ylim(r)
plt.xlabel(xlabel, fontsize=ls)
plt.ylabel(ylabel, fontsize=ls)
plt.tight_layout()
if isinstance(recall, list):
if colorVals is None:
colorVals = [[0.171, 0.485, 0.731, 1.],
[0.175, 0.585, 0.301, 1.],
[0.841, 0.138, 0.133, 1.]]
if len(colorVals) < len(recall):
colorVals += ["0.35"] * (len(recall) - len(colorVals))
if len(colorVals) > len(recall):
colorVals = ["0.35", "0.7"]
if legend_labels is None:
legend_labels = ["Mitochondria", "Vesicle Clouds", "Synaptic Junctions"]
handles = []
for ii in range(len(recall)):
handles.append(patches.Patch(color=colorVals[ii], label=legend_labels[ii]))
plt.plot(recall[ii], precision[ii], lw=4, c=colorVals[ii])
if legend:
plt.legend(handles=handles, loc=l_pos, frameon=False, prop={'size': ls})
else:
plt.plot(recall, precision, lw=4, c="0.35")
if save_path is None:
plt.show(block=False)
else:
plt.savefig(save_path, dpi=300)
[docs]def feature_importance(rf, save_path=None):
"""Plots feature importance of sklearn RandomForest
Parameters
----------
rf : RandomForestClassifier
save_path : str
"""
importances = rf.feature_importances_
nb = len(importances)
tree_imp = [tree.feature_importances_ for tree in rf.estimators_]
# print "Print feature importance of rf with %d trees." % len(tree_imp)
std = np.std(tree_imp, axis=0) / np.sqrt(len(tree_imp))
indices = np.argsort(importances)[::-1]
# Print the feature ranking
# print("Feature ranking:")
# for f in range(nb):
# print("%d. feature %d (%f)" %
# (f + 1, indices[f], importances[indices[f]]))
# Plot the feature importances of the forest
pl.figure()
pl.title("Feature importances")
pl.bar(range(nb), importances[indices],
color="r", yerr=std[indices], align="center")
pl.xticks(range(nb), indices)
pl.xlim([-1, nb])
if save_path is not None:
pl.savefig(save_path)
pl.close()
[docs]def write_feat2csv(fpath, feat_arr, feat_names=None):
"""Writes array with column names to csv file
Parameters
----------
fpath: str
Path to file
feat_arr : np.array
feature array
feat_names : list of str
feature names
"""
if feat_names is None or (len(feat_names) != feat_arr.shape[1]):
df = pd.DataFrame(feat_arr)
# print feat_arr.shape, len(feat_names)
else:
df = pd.DataFrame(feat_arr, columns=feat_names)
df.to_csv(fpath)
return
[docs]def load_csv2feat(fpath, prop='axoness'):
"""Load csvfile from kzip and return numpy array and list of header names.
List line is supposed to be the probability prediction.
Parameters
----------
fpath : str
Source file path
prop : str
property which should be loaded
Returns
-------
np.array, list of str
features, feature names
"""
zf = zipfile.ZipFile(fpath, 'r')
df = pd.DataFrame()
for filename in ['%s_feat.csv' % prop]:
temp = tempfile.TemporaryFile()
temp.write(zf.read(filename))
temp.seek(0)
df = pd.DataFrame.from_csv(temp)
return df.as_matrix(), df.columns.values.tolist()
[docs]def loo_proba(x, y, clf_used='rf', use_pca=False, params=None):
"""Perform leave-one-out
Parameters
----------
x : np.array
features
y : np.array
labels
clf_used : str
classifier
use_pca : bool
perform principal component analysis on features x in advance
params : dict
parameter for classifier
Returns
-------
np.array, np.array
class probability, hard classification
"""
# print "Performing LOO with %s and %d features. Using PCA: %s" % \
# (clf_used, x.shape[1], str(use_pca))
if use_pca:
old_dim = x.shape[1]
pca = PCA(n_components=0.999)
x = pca.fit_transform(x)
# print pca.explained_variance_ratio_
# print "Reduced feature space dimension %d, instead of %d" % (x.shape[1],
# old_dim)
nans_in_X = np.sum(np.isnan(x))
if nans_in_X > 0:
# print np.where(np.isnan(x))
# print "Found %d nans in features, converting to number." % nans_in_X
x = np.nan_to_num(x)
loo = cross_validation.LeaveOneOut(len(x))
shape = (len(x), len(list(set(y))))
prob = np.zeros(shape, dtype=np.float)
pred = np.zeros((len(x), 1), dtype=np.int)
cnt = 0
# print "rf params:", rf_params
for train_ixs, test_ixs in loo:
x_train = x[train_ixs]
x_test = x[test_ixs]
y_train = y[train_ixs]
clf = init_clf(clf_used, params)
clf.fit(x_train, y_train)
prob[cnt] = clf.predict_proba(x_test)
pred[cnt] = clf.predict(x_test)
np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
# if pred[cnt] == y[test_ixs]:
# print test_ixs, "\t", prob[cnt], pred[cnt], y[test_ixs]
# else:
# print test_ixs, "\t", prob[cnt], pred[cnt], y[test_ixs], "\t WRONG"
cnt += 1
return prob, pred
[docs]def cell_classification(node_pred):
"""Perform majority vote
Parameters
----------
node_pred : np.array of int
arbitrary array of integer
Returns
-------
int
maximum occurring integer in array
"""
if len(node_pred) == 0:
return np.zeros(0)
counter = Counter(node_pred)
return counter.most_common()[0][0]