How do I change - using for loops to call multiple functions - into - using a pipeline to call a class?


Problem Description

So the basic requirement is that I get a dictionary of models from the user, along with a dictionary of their hyperparameters, and produce a report. The current goal is binary classification, but this can be extended later.

This is what I am currently doing:

import numpy as np
import pandas as pd
# import pandas_profiling as pp
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score, make_scorer
from sklearn import datasets
# import joblib
import warnings
warnings.filterwarnings('ignore')

cancer = datasets.load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['target'] = cancer.target
target = df['target']
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='target', axis=1), target, test_size=0.4, random_state=13, stratify=target)

def build_model(model_name, model_class, params=None):
    """
    return model instance
    """
    if 'Ridge' in model_name:
        model = model_class(penalty='l2')
    elif 'Lasso' in model_name:
        # solver='liblinear' is needed for the l1 penalty on recent sklearn versions
        model = model_class(penalty='l1', solver='liblinear')
    elif 'Ensemble' in model_name:
        model = model_class(estimators=[('rf', RandomForestClassifier()), ('gbm', GradientBoostingClassifier())], voting='hard')
    else:
        model = model_class()

    if params is not None:
        print('Custom Model Parameters provided. Implementing Randomized Search for {} model'.format(model_name))
        rscv = RandomizedSearchCV(estimator=model, param_distributions=params[model_name],
                                  random_state=22, n_iter=10, cv=5, verbose=1, n_jobs=-1,
                                  scoring=make_scorer(f1_score), error_score=0.0)
        return rscv

    print('No model parameters provided. Using sklearn default values for {} model'.format(model_name))
    return model

def fit_model(model_name, model_instance, xTrain, yTrain):
    """
    fit model
    """
    if model_name == 'SVM':
        # NOTE: the scaler fitted here is local and discarded, so predict_vals cannot reuse it
        scaler = StandardScaler()
        model = model_instance.fit(scaler.fit_transform(xTrain), yTrain)
    else:
        model = model_instance.fit(xTrain, yTrain)

    return model

def predict_vals(model_name, fitted_model, xTest):
    """
    predict and return vals
    """
    if model_name == 'SVM':
        # NOTE: refitting a fresh scaler on the test data leaks information;
        # ideally the scaler fitted on the training data should be reused here
        scaler = StandardScaler()
        y_prediction = fitted_model.predict(scaler.fit_transform(xTest))
    else:
        y_prediction = fitted_model.predict(xTest)

    return y_prediction

def get_metrics(yTest, y_prediction):
    """
    get metrics after getting prediction
    """
    return [recall_score(yTest, y_prediction),
            precision_score(yTest, y_prediction),
            f1_score(yTest, y_prediction),
            roc_auc_score(yTest, y_prediction)]

def model_report(list_of_metrics):
    """
    add metrics to df, return df
    """
    df = pd.DataFrame(list_of_metrics, columns=['Model', 'Recall', 'Precision', 'f1', 'roc_auc'])
    df = df.round(3)
    return df

models = {
    'Logistic Regression Ridge': LogisticRegression,
    'Logistic Regression Lasso': LogisticRegression,
    'Random Forest': RandomForestClassifier,
    'SVM': SVC,
    'GBM': GradientBoostingClassifier,
    'EnsembleRFGBM': VotingClassifier
}

model_parameters = {
    'SVM': {
        'C': np.random.uniform(50, 1, [25]),#[1, 10, 100, 1000],
        'class_weight': ['balanced'],
        'gamma': [0.0001, 0.001],
        'kernel': ['linear']
    },
    'Random Forest': {
        'n_estimators': [5, 10, 50, 100, 200],
        'max_depth': [3, 5, 10, 20, 40],
        'criterion': ['gini', 'entropy'],
        'bootstrap': [True, False],
        'min_samples_leaf': [np.random.randint(1,10)]
    },
    'Logistic Regression Ridge': {
        'C': np.random.rand(25),
        'class_weight': ['balanced']
    },
    'Logistic Regression Lasso': {
        'C': np.random.rand(25),
        'class_weight': ['balanced']
    },
    'GBM': {
        'n_estimators': [10, 50, 100, 200, 500],
        'max_depth': [3, 5, 10, None],
        'min_samples_leaf': [np.random.randint(1,10)]
    },
    'EnsembleRFGBM': {
        'rf__n_estimators': [5, 10, 50, 100, 200],
        'rf__max_depth': [3, 5, 10, 20, 40],
        'rf__min_samples_leaf': [np.random.randint(1,10)],
        'gbm__n_estimators': [10, 50, 100, 200, 500],
        'gbm__max_depth': [3, 5, 10, None],
        'gbm__min_samples_leaf': [np.random.randint(1,10)]
    }
}

Without parameters I get the following report.

# without parameters
lst = []
for model_name, model_class in models.items():
    model_instance = build_model(model_name, model_class)
    fitted_model = fit_model(model_name, model_instance, X_train, y_train)
    y_predicted = predict_vals(model_name, fitted_model, X_test)
    metrics = get_metrics(y_test, y_predicted)

    lst.append([model_name] + metrics)

model_report(lst)

With parameters as input:

# with parameters
lst = []
for model_name, model_class in models.items():
    model_instance = build_model(model_name, model_class, model_parameters)
    fitted_model = fit_model(model_name, model_instance, X_train, y_train)
    y_predicted = predict_vals(model_name, fitted_model, X_test)
    metrics = get_metrics(y_test, y_predicted)

    lst.append([model_name] + metrics)

model_report(lst)

Now the task given to me is as follows:

  1. Get a dictionary of models and their parameters from the user. If no parameters are provided, use the model defaults.
  2. Give the report as output (as shown in the picture)

I was told that I should change the functions to classes. And avoid for loops if possible.

My challenges:

  1. How do I change all the functions into classes and methods? Basically, my superior wants something like

report.getReport # gives the dataFrame of the report

But the above sounds to me like it can be done with a function as follows (I don't understand why/how a class would be beneficial):

customReport(whatever inputs I'd like to give) # gives df of report
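For instance, a minimal sketch of such a function (the name customReport and its signature are my own assumptions; it simply wraps the earlier loop):

# Hypothetical sketch: customReport just wraps the earlier loop; the name and
# signature are assumptions, not an established API.
def customReport(models, model_parameters=None, xTrain=None, yTrain=None, xTest=None, yTest=None):
    """Run every model and return the metrics report as a DataFrame."""
    rows = []
    for model_name, model_class in models.items():
        model_instance = build_model(model_name, model_class, model_parameters)
        fitted_model = fit_model(model_name, model_instance, xTrain, yTrain)
        y_predicted = predict_vals(model_name, fitted_model, xTest)
        rows.append([model_name] + get_metrics(yTest, y_predicted))
    return model_report(rows)

# report_df = customReport(models, model_parameters, X_train, y_train, X_test, y_test)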

  2. How do I avoid for loops to get through the user inputs for the various models? My thought was that I could use an sklearn Pipeline, since, as I understand it, a pipeline is a series of steps; so I could take the params and models from the user and execute them as a series of steps, which avoids the for loops.

Something like this:

customPipeline = Pipeline([('rf', RandomForestClassifier(<relevant params from params dict>)),
                           ('SVC', SVC(<relevant params from params dict>))])

A similar solution I found is here, but I would like to avoid for loops like that.

Another related solution here uses a class which can switch between different models. But here I would require that the user be able to choose whether to do GridSearch/RandomizedSearch/CV/None. My thinking is that I use this class, then inherit it into another class to which the user can give input to choose GridSearch/RandomizedSearch/CV/None, etc. I'm not sure if I'm thinking in the right direction. A rough skeleton of that idea is sketched below.
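Something in this direction, perhaps (a sketch only; the class name, the search flag, and its values are my assumptions, not a tested design):

# Rough skeleton only: ModelRunner and its 'search' flag are hypothetical.
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

class ModelRunner(BaseEstimator):
    """Fits an estimator directly, or via a user-chosen search strategy."""

    def __init__(self, model=None, search=None, param_grid=None):
        self.model = model
        self.search = search          # 'grid', 'random', or None
        self.param_grid = param_grid

    def fit(self, X, y=None):
        if self.search == 'grid':
            self.model_ = GridSearchCV(self.model, self.param_grid).fit(X, y)
        elif self.search == 'random':
            self.model_ = RandomizedSearchCV(self.model, self.param_grid).fit(X, y)
        else:
            self.model_ = self.model.fit(X, y)   # plain fit with the given parameters
        return self

    def predict(self, X):
        return self.model_.predict(X)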

NOTE: A full working solution is desirable (I would love it) but not mandatory. It is OK if your answer is a skeleton that gives me a direction on how to proceed; I am happy to explore and learn from it.

Recommended Answer

I have implemented a working solution. I should have worded my question better. I initially misunderstood how GridSearchCV and RandomizedSearchCV work internally: cv_results_ gives all the results of the grid, whereas I had thought only the best estimator was available to us.
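For example, cv_results_ can be inspected directly as a DataFrame (illustrative snippet; grid is assumed to be an already fitted GridSearchCV):

# Illustrative: every tried parameter combination is available, not just the winner
results = pd.DataFrame(grid.cv_results_)
print(results[['params', 'mean_test_score', 'rank_test_score']])
print(grid.best_estimator_)  # the single best model, which I had thought was all we get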

Using this, for each type of model, I took the best-ranked row (a rank_test_score of 1 is the best) and got the parameters making up that model. In this example, that is 4 models. I then ran each of those models, i.e. the best combination of parameters for each model, on my test data and computed the required scores. I think this solution can be extended to RandomizedSearchCV and many other options.
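In pandas terms, that per-model selection amounts to something like this (illustrative only; the column names assume the 'cst' pipeline step used below):

# derive readable model names from the fitted estimator objects
results['model_names'] = results['param_cst__model'].apply(lambda m: m.__class__.__name__)
# rank_test_score of 1 is the best, so the minimum rank within each model
# type identifies that model's best parameter combination
best_per_model = results.loc[results.groupby('model_names')['rank_test_score'].idxmin()]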

NOTE: This is just a trivial solution. Many modifications are necessary, such as scaling the data for specific models, etc. This solution only serves as a starting point and can be modified according to the user's needs.

Credit for the ClfSwitcher() class goes to this answer.

Following is the implementation of the class (suggestions for improvement are welcome).

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
import warnings
warnings.filterwarnings('ignore')

cancer = datasets.load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['target'] = cancer.target
target = df['target']
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='target', axis=1), target, test_size=0.4, random_state=13, stratify=target)

class ClfSwitcher(BaseEstimator):

    def __init__(self, model=RandomForestClassifier()):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param model: sklearn object - The classifier
        """

        self.model = model


    def fit(self, X, y=None, **kwargs):
        self.model.fit(X, y)
        return self


    def predict(self, X, y=None):
        return self.model.predict(X)


    def predict_proba(self, X):
        return self.model.predict_proba(X)

    def score(self, X, y):
        return self.model.score(X, y)

class report(ClfSwitcher):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.grid = None
        self.full_report = None
        self.concise_report = None
        self.scoring_metrics = {
            'precision': precision_score,
            'recall': recall_score,
            'f1': f1_score,
            'roc_auc': roc_auc_score
        }


    def griddy(self, pipeLine, parameters, **kwargs):
        # forward caller options (e.g. scoring='f1') to GridSearchCV
        self.grid = GridSearchCV(pipeLine, parameters, n_jobs=-1, **kwargs)


    def fit_grid(self, X_train, y_train=None, **kwargs):
        self.grid.fit(X_train, y_train)

    def make_grid_report(self):
        self.full_report = pd.DataFrame(self.grid.cv_results_)

    @staticmethod
    def get_names(col):
        return col.__class__.__name__

    @staticmethod
    def calc_score(col, metric):
        # NOTE: relies on the module-level X_train, y_train, X_test, y_test;
        # refits the estimator on the training data and scores it on the test data
        return round(metric(y_test, col.fit(X_train, y_train).predict(X_test)), 4)


    def make_concise_report(self):
        self.concise_report = pd.DataFrame(self.grid.cv_results_)
        self.concise_report['model_names'] = self.concise_report['param_cst__model'].apply(self.get_names)
        # rank_test_score of 1 is the best, so sort ascending and keep the top
        # row per model type to get its best parameter combination
        self.concise_report = self.concise_report.sort_values(['model_names', 'rank_test_score'], ascending=[True, True]) \
                                                .groupby(['model_names']).head(1)[['param_cst__model', 'model_names']] \
                                                .reset_index(drop=True)

        for metric_name, metric_func in self.scoring_metrics.items():
            self.concise_report[metric_name] = self.concise_report['param_cst__model'].apply(self.calc_score, metric=metric_func)

        self.concise_report = self.concise_report[['model_names', 'precision', 'recall', 'f1', 'roc_auc', 'param_cst__model']]

pipeline = Pipeline([
    ('cst', ClfSwitcher()),
])
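If scaling is needed for specific models (as noted above), one option could be to fold it into the same pipeline as an extra step; a sketch only, not part of the original solution:

# Sketch: a scaler in front of the switcher; inside GridSearchCV the
# scaler is then fitted on the training folds only
from sklearn.preprocessing import StandardScaler

scaled_pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('cst', ClfSwitcher()),
])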

parameters = [
    {
        'cst__model': [RandomForestClassifier()],
        'cst__model__n_estimators': [10, 20],
        'cst__model__max_depth': [5, 10],
        'cst__model__criterion': ['gini', 'entropy']
    },
    {
        'cst__model': [SVC()],
        'cst__model__C': [10, 20],
        'cst__model__kernel': ['linear'],
        'cst__model__gamma': [0.0001, 0.001]
    },
    {
        'cst__model': [LogisticRegression(solver='liblinear')],  # liblinear supports both the l1 and l2 penalties below
        'cst__model__C': [13, 17],
        'cst__model__penalty': ['l1', 'l2']
    },
    {
        'cst__model': [GradientBoostingClassifier()],
        'cst__model__n_estimators': [10, 50],
        'cst__model__max_depth': [3, 5],
        'cst__model__min_samples_leaf': [1, 2]
    }
]

my_report = report()
my_report.griddy(pipeline, parameters, scoring='f1')
my_report.fit_grid(X_train, y_train)
my_report.make_concise_report()
my_report.concise_report
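The full grid (every combination tried) is also available via make_grid_report:

# Optional: the complete cv_results_ table rather than the concise per-model view
my_report.make_grid_report()
my_report.full_report.head()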

The report is output as required.
