import os
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,
AdaBoostClassifier, GradientBoostingClassifier,
VotingClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23; import joblib directly
from xgboost import XGBClassifier, XGBRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import euclidean_distances


def line(x, x1, y1, x2, y2):
    """Evaluate at x the line passing through (x1, y1) and (x2, y2)."""
    return (x - x1) * ((y2 - y1) / (x2 - x1)) + y1
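
# The helper above is plain two-point linear interpolation/extrapolation.
# A quick sanity check with hypothetical values (not from the analysis):
#
#     >>> line(1.0, 0, 1.1, 2, 2.5)   # midway between the two anchor points
#     1.8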


class LineCutClassifier(BaseEstimator, ClassifierMixin):
    """Classifier that assigns one of 4 composition groups using straight-line
    cuts in the (log_s125, log_dEdX) plane.
    """

    def __init__(self, demo_param='demo'):
        self.demo_param = demo_param

    def fit(self, X, y):
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        assert len(self.classes_) == 4, 'Must have exactly 4 classes'
        self.X_ = X
        self.y_ = y
        # Return the classifier
        return self
    def predict(self, X):
        # Check that fit has been called
        check_is_fitted(self, ['X_', 'y_'])
        # Input validation
        X = check_array(X)
        assert X.shape[1] == 3, 'X must have exactly 3 features'
        y = np.empty(len(X), dtype=int)
        # Columns 1 and 2 hold log10(S125) and log10(dE/dX)
        for idx, (log_s125, log_dEdX) in enumerate(X[:, (1, 2)]):
            log_dEdX_iron = line(log_s125, 0, 1.1, 2, 2.5)
            log_dEdX_oxygen = line(log_s125, 0, 0.9, 2, 2.4)
            log_dEdX_proton = line(log_s125, 0, 0.75, 2, 2.3)
            # Assign the band (delimited by the cut lines) that contains
            # log_dEdX; the elif chain makes the lower bounds implicit
            if log_dEdX <= log_dEdX_proton:
                y[idx] = 0
            elif log_dEdX <= log_dEdX_oxygen:
                y[idx] = 1
            elif log_dEdX <= log_dEdX_iron:
                y[idx] = 2
            else:
                y[idx] = 3
        return y
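
# Minimal usage sketch for LineCutClassifier. X_test here is hypothetical:
# any (n_samples, 3) array whose columns 1 and 2 hold log10(S125) and
# log10(dE/dX), with labels 0-3 for the four composition groups:
#
#     clf = LineCutClassifier().fit(X_train, y_train)
#     y_pred = clf.predict(X_test)
#
# Since the cut lines are fixed, fit() only validates and stores the inputs.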


class CustomClassifier(BaseEstimator, ClassifierMixin):
    """Toy classifier that returns the true label with probability p and
    otherwise picks a wrong group at random, with neighboring groups
    (adjacent label values) weighted up by neighbor_weight.
    """

    def __init__(self, p=0.8, neighbor_weight=2.0, num_groups=4,
                 random_state=2):
        self.p = p
        self.neighbor_weight = neighbor_weight
        self.num_groups = num_groups
        self.random_state = random_state

    def fit(self, X, y):
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        if not len(self.classes_) == self.num_groups:
            raise ValueError('Must have {} classes'.format(self.num_groups))
        self.X_ = X
        self.y_ = y
        return self
    def predict(self, y):
        """Performs random composition classification.

        Note: unlike a standard scikit-learn classifier, this takes the
        *true labels* y (not a feature matrix) and randomly perturbs them,
        which is why the usual fit/input checks are skipped.
        """
        # Seed so the random classifications are reproducible
        np.random.seed(self.random_state)
        p_correct = self.p
        # Ensure array semantics for the masking below
        y = np.asarray(y)
        y_pred = np.empty_like(y)
        targets = list(range(self.num_groups))
        probs = np.empty_like(targets, dtype=float)
        for target in targets:
            comp_mask = y == target
            probs[target] = p_correct
            not_target = [i for i in targets if i != target]
            # Neighbors are the adjacent label values, clipped to valid range
            neighbors = [target - 1, target + 1]
            neighbors = [i for i in neighbors if 0 <= i < self.num_groups]
            not_neighbors = list(set(not_target).difference(neighbors))
            # Split the remaining probability mass 1 - p so each neighbor
            # gets neighbor_weight times the weight of a non-neighbor
            weight = (1 - p_correct) / (len(not_neighbors)
                                        + self.neighbor_weight * len(neighbors))
            probs[not_neighbors] = weight
            probs[neighbors] = self.neighbor_weight * weight
            # Get custom composition classification
            y_pred_target = np.random.choice(targets, size=comp_mask.sum(),
                                             p=probs)
            y_pred[comp_mask] = y_pred_target
        return y_pred
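
# For example, with the defaults (p=0.8, neighbor_weight=2.0, num_groups=4),
# a sample whose true label is 1 keeps its label with probability 0.8; the
# remaining 0.2 is split as weight = 0.2 / (1 + 2.0 * 2) = 0.04, giving the
# neighbors 0 and 2 probability 0.08 each and the non-neighbor 3 probability
# 0.04. Sketch of use (y_true is any array of labels in range(4)):
#
#     clf = CustomClassifier(p=0.8, neighbor_weight=2.0)
#     y_pred = clf.predict(y_true)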


def get_pipeline(classifier_name='BDT'):
    """Returns a scikit-learn Pipeline for the given classifier name.

    Raises ValueError if classifier_name is not recognized.
    """
    steps = []
    if classifier_name == 'RF':
        classifier = RandomForestClassifier(
            n_estimators=100, max_depth=6, n_jobs=20,
            # n_estimators=100, max_depth=7, min_samples_leaf=150, n_jobs=20,
            random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'xgboost':
        # nthread/silent/seed are deprecated in the xgboost sklearn API;
        # n_jobs/random_state are the current equivalents
        classifier = XGBClassifier(n_estimators=125, n_jobs=10,
                                   random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'Ada':
        classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5),
                                        n_estimators=100, learning_rate=0.1,
                                        random_state=2)
        # classifier = AdaBoostClassifier(n_estimators=50, learning_rate=0.1, random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name in ['GBDT', 'BDT']:
        # This branch handles the default classifier_name ('BDT')
        classifier = GradientBoostingClassifier(
            loss='exponential', max_depth=3, n_estimators=100, random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'BDT_comp_IC79.2010':
        classifier = GradientBoostingClassifier(
            loss='deviance', max_depth=4, n_estimators=100, random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'BDT_comp_IC79.2010_2-groups':
        classifier = GradientBoostingClassifier(
            loss='deviance', max_depth=4, n_estimators=100, random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'BDT_comp_IC86.2012_2-groups':
        classifier = GradientBoostingClassifier(loss='deviance',
                                                max_depth=4,
                                                n_estimators=100,
                                                random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'BDT_comp_IC86.2012_3-groups':
        classifier = GradientBoostingClassifier(loss='deviance',
                                                max_depth=3,
                                                n_estimators=100,
                                                random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'BDT_comp_IC86.2012_4-groups':
        classifier = GradientBoostingClassifier(loss='deviance',
                                                max_depth=2,
                                                n_estimators=100,
                                                random_state=2)
        steps.append(('classifier', classifier))
    elif 'CustomClassifier' in classifier_name:
        # Name format: 'CustomClassifier_<p>_<neighbor_weight>_<num_groups>',
        # e.g. 'CustomClassifier_0.8_2.0_4'
        hyperparams_str = classifier_name.split('_')[1:]
        assert len(hyperparams_str) == 3, (
            'Expected 3 CustomClassifier hyperparameters, '
            'got {}'.format(len(hyperparams_str)))
        p = float(hyperparams_str[0])
        neighbor_weight = float(hyperparams_str[1])
        num_groups = int(hyperparams_str[2])
        classifier = CustomClassifier(p=p, neighbor_weight=neighbor_weight,
                                      num_groups=num_groups, random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'RF_comp_IC86.2012_4-groups':
        classifier = RandomForestClassifier(max_depth=10, n_estimators=500,
                                            random_state=2, n_jobs=10)
        steps.append(('classifier', classifier))
    elif classifier_name == 'SVC_comp_IC86.2012_2-groups':
        classifier = SVC(C=0.5, random_state=2)
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))
    elif classifier_name == 'SVC_comp_IC86.2012_4-groups':
        classifier = SVC(C=0.5, random_state=2)
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))
    elif classifier_name == 'LinearSVC_comp_IC86.2012_2-groups':
        classifier = LinearSVC(random_state=2)
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))
    elif classifier_name == 'LinearSVC_comp_IC86.2012_4-groups':
        classifier = LinearSVC(random_state=2)
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))
    elif classifier_name == 'NuSVC_comp_IC86.2012_4-groups':
        classifier = NuSVC(random_state=2)
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))
    elif classifier_name == 'xgboost_comp_IC86.2012_2-groups':
        classifier = XGBClassifier(learning_rate=0.05,
                                   max_depth=7,
                                   n_estimators=150,
                                   # subsample=0.75,
                                   random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'xgboost_comp_IC86.2012_4-groups':
        classifier = XGBClassifier(max_depth=2,
                                   n_estimators=100,
                                   # subsample=0.75,
                                   random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'LogisticRegression_comp_IC86.2012_4-groups':
        classifier = LogisticRegression(random_state=2)
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))
    elif classifier_name == 'linecut_comp_IC86.2012_4-groups':
        classifier = LineCutClassifier()
        steps.append(('classifier', classifier))
    elif classifier_name == 'stacking_comp_IC86.2012_4-groups':
        classifiers = [SVC(random_state=2),
                       LinearSVC(random_state=2),
                       GradientBoostingClassifier(loss='deviance',
                                                  max_depth=2,
                                                  n_estimators=100,
                                                  random_state=2),
                       ]
        classifier = StackingClassifier(classifiers,
                                        meta_classifier=LogisticRegression())
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))
    elif classifier_name == 'voting_comp_IC86.2012_4-groups':
        estimators = [('SVC', SVC(random_state=2)),
                      ('LogisticRegression', LogisticRegression(random_state=2)),
                      # max_depth is a tree parameter and has no effect with
                      # the linear booster ('gblinear')
                      ('xgboost', XGBClassifier(max_depth=3,
                                                booster='gblinear',
                                                n_estimators=100,
                                                random_state=2))]
        classifier = VotingClassifier(estimators, voting='hard')
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))
    elif classifier_name == 'RF_energy_IC79.2010':
        classifier = RandomForestRegressor(n_estimators=100,
                                           max_depth=8,
                                           n_jobs=10,
                                           random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'RF_energy_IC86.2012':
        classifier = RandomForestRegressor(n_estimators=100,
                                           max_depth=7,
                                           n_jobs=10,
                                           random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'xgboost_energy_IC86.2012':
        classifier = XGBRegressor(n_estimators=75,
                                  booster='gblinear',
                                  # subsample=0.75,
                                  random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'linearregression_energy_IC86.2012':
        reg = make_pipeline(PolynomialFeatures(2),
                            # StandardScaler(),
                            LinearRegression(),
                            )
        return reg
    elif classifier_name == 'SGD_comp_IC86.2012_2-groups':
        clf = make_pipeline(StandardScaler(),
                            SGDClassifier(loss='hinge', alpha=1e-3,
                                          max_iter=50, tol=1e-3,
                                          shuffle=True, random_state=2),
                            )
        return clf
    else:
        raise ValueError(
            '{} is not a valid classifier name'.format(classifier_name))
    pipeline = Pipeline(steps)
    return pipeline
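

# A minimal, self-contained usage sketch for get_pipeline. The synthetic data
# below (via sklearn.datasets.make_classification) is purely illustrative and
# not part of the analysis; any 4-class feature matrix works the same way.
if __name__ == '__main__':
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=1000, n_features=10, n_informative=6,
                               n_classes=4, random_state=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=2)
    pipeline = get_pipeline('BDT_comp_IC86.2012_4-groups')
    pipeline.fit(X_train, y_train)
    print('Test accuracy: {:.3f}'.format(pipeline.score(X_test, y_test)))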