Source code for comptools.model_selection


from __future__ import division
from collections import defaultdict
import dask
from dask import delayed, multiprocessing, threaded
from dask.diagnostics import ProgressBar
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from sklearn.metrics import get_scorer

from .base import get_training_features
from .io import dataframe_to_X_y
from .composition_encoding import get_comp_list
from .data_functions import ratio_error
from .pipelines import get_pipeline


def _get_frac_correct(df_train, df_test, feature_columns, num_groups,
                      pipeline_str, comp_list, log_energy_bins):
    '''Calculates the fraction of correctly identified samples in each energy bin
    for each composition in comp_list. In addition, the statistical error on the
    fraction correctly identified is calculated.'''

    # Fit pipeline and get mask for correctly identified events
    target = 'comp_target_{}'.format(num_groups)
    pipeline = get_pipeline(pipeline_str)
    pipeline.fit(df_train[feature_columns], df_train[target])
    test_predictions = pipeline.predict(df_test[feature_columns])
    correctly_identified_mask = (test_predictions == df_test[target])

    # Construct MC composition masks
    MC_comp_mask = {}
    for composition in comp_list:
        MC_comp_mask[composition] = df_test['comp_group_{}'.format(num_groups)] == composition
    MC_comp_mask['total'] = np.ones(len(df_test), dtype=bool)

    data = {}
    for composition in comp_list + ['total']:
        comp_mask = MC_comp_mask[composition]
        # Get number of MC comp in each MC energy bin
        num_MC_energy = np.histogram(df_test.loc[comp_mask, 'MC_log_energy'],
                                     bins=log_energy_bins)[0]
        num_MC_energy_err = np.sqrt(num_MC_energy)

        # Get number of correctly identified comp in each MC energy bin
        num_reco_energy = np.histogram(df_test.loc[comp_mask & correctly_identified_mask, 'MC_log_energy'],
                                       bins=log_energy_bins)[0]
        num_reco_energy_err = np.sqrt(num_reco_energy)

        # Calculate correctly identified fractions as a function of MC energy
        frac_correct, frac_correct_err = ratio_error(
            num_reco_energy, num_reco_energy_err,
            num_MC_energy, num_MC_energy_err)
        data['frac_correct_{}'.format(composition)] = frac_correct
        data['frac_correct_err_{}'.format(composition)] = frac_correct_err

    return data
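

# --- Illustrative sketch (not part of the original module) ---
# _get_frac_correct above delegates the error propagation to
# comptools.data_functions.ratio_error. The helper below shows the intended
# calculation for a single composition, assuming ratio_error performs standard
# uncorrelated error propagation for a ratio of Poisson counts,
# err = (a/b) * sqrt((da/a)**2 + (db/b)**2). The function name and the exact
# form of ratio_error are assumptions, not part of comptools.
def _sketch_frac_correct(log_energy, correctly_identified_mask, log_energy_bins):
    num_total = np.histogram(log_energy, bins=log_energy_bins)[0]
    num_correct = np.histogram(log_energy[correctly_identified_mask],
                               bins=log_energy_bins)[0]
    with np.errstate(divide='ignore', invalid='ignore'):
        frac_correct = num_correct / num_total
        frac_correct_err = frac_correct * np.sqrt(1 / num_correct + 1 / num_total)
    return frac_correct, frac_correct_err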


def get_CV_frac_correct(df_train, feature_list, target, pipeline_str,
                        num_groups, log_energy_bins, n_splits=10, n_jobs=1):
    '''Calculates the fraction of correctly identified events vs. energy for
    each composition group using stratified k-fold cross-validation.

    Returns a pandas.DataFrame with one row per CV fold (see
    _get_frac_correct for the column layout).
    '''
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2)
    comp_list = get_comp_list(num_groups=num_groups)
    comp_target = target

    # Set up _get_frac_correct to run on each CV fold
    folds = []
    for train_index, test_index in skf.split(df_train, df_train[comp_target]):
        df_train_fold = df_train.iloc[train_index]
        df_test_fold = df_train.iloc[test_index]
        frac_correct = delayed(_get_frac_correct)(
            df_train_fold, df_test_fold, feature_list, num_groups,
            pipeline_str, comp_list, log_energy_bins)
        folds.append(frac_correct)
    df_cv = delayed(pd.DataFrame.from_records)(folds)

    # Run _get_frac_correct on each fold in parallel
    print('Running {}-fold CV model evaluation...'.format(n_splits))
    with ProgressBar():
        get = multiprocessing.get if n_jobs > 1 else dask.get
        df_cv = df_cv.compute(get=get, num_workers=n_jobs)

    return df_cv
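

# --- Illustrative usage sketch (not part of the original module) ---
# How get_CV_frac_correct might be called for a two-group classification.
# The load_sim call and its return signature, the pipeline name, and the
# energy binning below are assumptions for illustration only.
def _example_get_cv_frac_correct():
    import comptools as comp
    df_sim_train, df_sim_test = comp.load_sim(config='IC86.2012')
    feature_list, _ = get_training_features()
    log_energy_bins = np.arange(6.1, 8.1, 0.1)
    df_cv = get_CV_frac_correct(df_sim_train, feature_list,
                                target='comp_target_2',
                                pipeline_str='BDT_comp_IC86.2012_2-groups',
                                num_groups=2,
                                log_energy_bins=log_energy_bins,
                                n_splits=10, n_jobs=1)
    # Each row of df_cv is one CV fold; 'frac_correct_total' (and the
    # per-composition columns) hold arrays of per-energy-bin fractions.
    frac_mean = np.mean(np.vstack(df_cv['frac_correct_total'].values), axis=0)
    return frac_mean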


@delayed
def _cross_validate_comp(df_train, df_test, pipeline_str, param_name,
                         param_value, feature_list=None,
                         target='comp_target_2', scoring='r2', num_groups=2,
                         n_splits=10):
    '''Calculates k-fold CV scores for a given hyperparameter value

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training DataFrame (see comptools.load_sim()).
    df_test : pandas.DataFrame
        Testing DataFrame (see comptools.load_sim()).
    pipeline_str : str
        Name of pipeline to use (e.g. 'BDT', 'RF_energy', etc.).
    param_name : str
        Name of hyperparameter (e.g. 'max_depth', 'learning_rate', etc.).
    param_value : int, float, str
        Value to set hyperparameter to.
    feature_list : list, optional
        List of training feature columns to use (default is to use
        comptools.get_training_features()).
    target : str, optional
        Training target to use (default is 'comp_target_2').
    scoring : str, optional
        Name of the scikit-learn scoring metric to calculate for each CV fold
        (e.g. 'accuracy', 'r2'; default is 'r2').
    num_groups : int, optional
        Number of composition class groups to use (default is 2).
    n_splits : int, optional
        Number of folds to use in (KFold) cross-validation (default is 10).

    Returns
    -------
    data_dict : dict
        Dictionary with average scores as well as CV errors on those scores.
    '''
    comp_list = get_comp_list(num_groups=num_groups)

    if feature_list is None:
        feature_list, _ = get_training_features()

    pipeline = get_pipeline(pipeline_str)
    pipeline.named_steps['classifier'].set_params(**{param_name: param_value})
    # Only run the classifier on a single core
    try:
        pipeline.named_steps['classifier'].set_params(**{'n_jobs': 1})
    except ValueError:
        pass

    data_dict = {'classifier': pipeline_str, 'param_name': param_name,
                 'param_value': param_value, 'n_splits': n_splits}
    train_scores = defaultdict(list)
    test_scores = defaultdict(list)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=2)
    # scikit-learn scorers are called as scorer(estimator, X, y_true)
    scorer = get_scorer(scoring)
    comp_key = 'comp_group_{}'.format(num_groups)
    for train_index, test_index in kf.split(df_train.values):
        df_train_fold = df_train.iloc[train_index]
        df_test_fold = df_train.iloc[test_index]

        X_train, y_train = dataframe_to_X_y(df_train_fold, feature_list,
                                            target=target)
        X_test, y_test = dataframe_to_X_y(df_test_fold, feature_list,
                                          target=target)

        pipeline = pipeline.fit(X_train, y_train)

        train_scores['total'].append(scorer(pipeline, X_train, y_train))
        test_scores['total'].append(scorer(pipeline, X_test, y_test))

        # Get training/testing scores for each composition group
        for composition in comp_list:
            comp_mask_train = (df_train_fold[comp_key] == composition).values
            train_scores[composition].append(
                scorer(pipeline, X_train[comp_mask_train], y_train[comp_mask_train]))

            comp_mask_test = (df_test_fold[comp_key] == composition).values
            test_scores[composition].append(
                scorer(pipeline, X_test[comp_mask_test], y_test[comp_mask_test]))

    for label in comp_list + ['total']:
        data_dict['train_mean_{}'.format(label)] = np.mean(train_scores[label])
        data_dict['train_std_{}'.format(label)] = np.std(train_scores[label])
        data_dict['test_mean_{}'.format(label)] = np.mean(test_scores[label])
        data_dict['test_std_{}'.format(label)] = np.std(test_scores[label])

    return data_dict
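

# --- Illustrative note (not part of the original module) ---
# Because of the @delayed decorator, calling _cross_validate_comp does not run
# anything; it returns a dask Delayed object whose .compute() yields the
# data_dict. cross_validate_comp below builds one such task per hyperparameter
# value and computes them together. A minimal example for a single value (the
# pipeline name is an assumption):
def _example_single_cv_point(df_sim_train, df_sim_test):
    result = _cross_validate_comp(df_sim_train, df_sim_test,
                                  'BDT_comp_IC86.2012_2-groups',
                                  param_name='max_depth', param_value=3,
                                  scoring='accuracy', num_groups=2,
                                  n_splits=10)
    return result.compute()  # dict with e.g. 'test_mean_total', 'test_std_total'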


def cross_validate_comp(df_train, df_test, pipeline_str, param_name,
                        param_values, feature_list=None,
                        target='comp_target_2', scoring='accuracy',
                        num_groups=2, n_splits=10, n_jobs=1, verbose=False):
    '''Calculates k-fold CV scores over a range of hyperparameter values

    Similar to sklearn.model_selection.cross_validate, but returns results for
    individual composition groups as well as the combined CV result.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training DataFrame (see comptools.load_sim()).
    df_test : pandas.DataFrame
        Testing DataFrame (see comptools.load_sim()).
    pipeline_str : str
        Name of pipeline to use (e.g. 'BDT', 'RF_energy', etc.).
    param_name : str
        Name of hyperparameter (e.g. 'max_depth', 'learning_rate', etc.).
    param_values : array-like
        Values to set hyperparameter to.
    feature_list : list, optional
        List of training feature columns to use (default is to use
        comptools.get_training_features()).
    target : str, optional
        Training target to use (default is 'comp_target_2').
    scoring : str, optional
        Scoring metric to calculate for each CV fold (default is 'accuracy').
    num_groups : int, optional
        Number of composition class groups to use (default is 2).
    n_splits : int, optional
        Number of folds to use in (KFold) cross-validation (default is 10).
    n_jobs : int, optional
        Number of jobs to run in parallel (default is 1).
    verbose : bool, optional
        Option to print a progress bar (default is False).

    Returns
    -------
    df_cv : pandas.DataFrame
        Returns a DataFrame with average scores as well as CV errors on those
        scores for each composition.
    '''
    cv_dicts = []
    for param_value in param_values:
        cv_dict = _cross_validate_comp(
            df_train, df_test, pipeline_str, param_name, param_value,
            feature_list=feature_list, target=target, scoring=scoring,
            num_groups=num_groups, n_splits=n_splits)
        cv_dicts.append(cv_dict)
    df_cv = delayed(pd.DataFrame.from_records)(cv_dicts, index='param_value')

    get = dask.get if n_jobs == 1 else threaded.get
    # get = dask.get if n_jobs == 1 else multiprocessing.get
    if verbose:
        print('Performing {}-fold CV on {} hyperparameter values '
              '({} fits):'.format(n_splits, len(param_values),
                                  n_splits * len(param_values)))
        with ProgressBar():
            df_cv = df_cv.compute(get=get, num_workers=n_jobs)
    else:
        df_cv = df_cv.compute(get=get, num_workers=n_jobs)

    return df_cv
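

# --- Illustrative usage sketch (not part of the original module) ---
# A hypothetical scan over max_depth for a two-group BDT classifier. The
# load_sim call, its return signature, and the pipeline name are assumptions
# based on the docstrings above.
def _example_validation_curve():
    import comptools as comp
    df_sim_train, df_sim_test = comp.load_sim(config='IC86.2012')
    df_cv = cross_validate_comp(df_sim_train, df_sim_test,
                                'BDT_comp_IC86.2012_2-groups',
                                param_name='max_depth',
                                param_values=[1, 2, 3, 4, 5],
                                scoring='accuracy', num_groups=2,
                                n_splits=10, n_jobs=1, verbose=True)
    # df_cv is indexed by param_value, with 'train_mean_*', 'train_std_*',
    # 'test_mean_*', and 'test_std_*' columns for 'total' and each composition
    # group, ready for plotting a validation curve.
    return df_cv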


def get_param_grid(pipeline_name=None):
    """Returns dictionary with hyperparameter values to search

    Parameters
    ----------
    pipeline_name : str, optional
        Pipeline name. Should be formatted as
        <name>_comp_<config>_<num_groups>-groups. For example,
        pipeline_name=BDT_comp_IC86.2012_2-groups (default is None).

    Returns
    -------
    param_grid : dict
        Dictionary with hyperparameter names / values to be passed to
        GridSearchCV.
    """
    if pipeline_name is None:
        raise ValueError('Must enter a value for pipeline_name.')

    if 'BDT' in pipeline_name:
        param_grid = {'classifier__n_estimators': [10, 50, 100, 150, 200, 250, 300],
                      'classifier__max_depth': list(range(1, 11)),
                      'classifier__learning_rate': [0.1, 0.5, 0.8]}
    elif 'xgboost' in pipeline_name:
        # param_grid = {'classifier__n_estimators': [100, 150, 200],
        #               'classifier__max_depth': list(range(3, 7)),
        #               'classifier__learning_rate': [0.1, 0.15, 0.2]}
        param_grid = {'classifier__n_estimators': [100, 150, 200, 250, 300, 400, 500, 600],
                      'classifier__max_depth': list(range(3, 11)),
                      'classifier__learning_rate': [0.01, 0.025, 0.05, 0.1, 0.15, 0.2],
                      # 'classifier__subsample': [0.5, 0.75, 1.0],
                      }
    else:
        raise ValueError('Invalid pipeline entered: {}'.format(pipeline_name))

    return param_grid
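

# --- Illustrative usage sketch (not part of the original module) ---
# The keys returned by get_param_grid are prefixed with 'classifier__' so they
# address the final step of the pipelines built by get_pipeline. The pipeline
# name below is an assumption.
def _example_param_grid_size():
    param_grid = get_param_grid('BDT_comp_IC86.2012_2-groups')
    # With 10-fold CV, GridSearchCV fits every parameter combination 10 times.
    n_combinations = np.prod([len(values) for values in param_grid.values()])
    n_fits = 10 * n_combinations
    return param_grid, n_fits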


def gridsearch_optimize(pipeline, param_grid, X_train, y_train,
                        scoring='accuracy', n_jobs=1, return_gridsearch=False):
    """Runs a grid search to optimize hyperparameters

    Parameters
    ----------
    pipeline : sklearn.pipeline.Pipeline
        Pipeline to fit.
    param_grid : dict
        Dictionary with hyperparameter names / values to be passed to
        GridSearchCV.
    X_train : array_like
        Training features.
    y_train : array_like
        Training labels.
    scoring : str
        Scoring metric to use (default is 'accuracy').
    n_jobs : int, optional
        Number of jobs to run in parallel (default is 1).
    return_gridsearch : bool, optional
        Whether to return the fitted GridSearchCV object, or the
        best_estimator_ object (default is False, so will return the
        best_estimator_).

    Returns
    -------
    best_pipeline : sklearn.pipeline.Pipeline
        Pipeline with optimal hyperparameter values that has been trained on
        the entire training dataset (X_train, y_train).
    gridsearch : sklearn.model_selection.GridSearchCV
        Fitted GridSearchCV object.
    """
    # Make sure the pipeline itself isn't running in parallel;
    # GridSearchCV will be run in parallel instead.
    if 'classifier__n_jobs' in pipeline.get_params():
        pipeline.set_params(classifier__n_jobs=1)

    param_str = '\n\t'.join(['{}: {}'.format(key, value)
                             for key, value in param_grid.items()])
    print('Running grid search over the following hyperparameters:\n\t{}'.format(param_str))

    gridsearch = GridSearchCV(pipeline, param_grid=param_grid, cv=10,
                              scoring=scoring, n_jobs=n_jobs,
                              return_train_score=True, verbose=2)
    gridsearch.fit(X_train, y_train)
    print('best GridSearchCV params = {}'.format(gridsearch.best_params_))

    if return_gridsearch:
        return gridsearch
    else:
        best_pipeline = gridsearch.best_estimator_
        return best_pipeline
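

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal end-to-end hyperparameter search with gridsearch_optimize. Toy
# random data stands in for the real training set; in practice X_train and
# y_train come from dataframe_to_X_y() on a DataFrame from comptools.load_sim().
# The pipeline name and the reduced parameter grid are assumptions.
def _example_gridsearch_optimize():
    rng = np.random.RandomState(2)
    X_train = rng.normal(size=(200, 4))
    y_train = rng.randint(0, 2, size=200)
    pipeline = get_pipeline('BDT_comp_IC86.2012_2-groups')
    param_grid = {'classifier__max_depth': [2, 4]}
    best_pipeline = gridsearch_optimize(pipeline, param_grid,
                                        X_train, y_train,
                                        scoring='accuracy', n_jobs=1)
    return best_pipeline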