ML之catboost：catboost的CatBoostRegressor函數源代碼簡介、解讀之詳細攻略

catboost的CatBoostRegressor函數源代碼簡介、解讀

class CatBoostRegressor Found at: catboost.core

class CatBoostRegressor(CatBoost):

_estimator_type = 'regressor'

"""

Implementation of the scikit-learn API for CatBoost regression.

Parameters

----------

Like in CatBoostClassifier, except loss_function, classes_count, class_names and class_weights

loss_function : string, [default='RMSE']

'RMSE'

'MAE'

'Quantile:alpha=value'

'LogLinQuantile:alpha=value'

'Poisson'

'MAPE'

'Lq:q=value'

實作scikit-learn API的CatBoost回歸。

參數

----------

與CatBoostClassifier的參數相同，但loss_function、classes_count、class_names和class_weights除外。

def __init__(
    self,
    iterations=None,
    learning_rate=None,
    depth=None,
    l2_leaf_reg=None,
    model_size_reg=None,
    rsm=None,
    loss_function='RMSE',
    border_count=None,
    feature_border_type=None,
    per_float_feature_quantization=None,
    input_borders=None,
    output_borders=None,
    fold_permutation_block=None,
    od_pval=None,
    od_wait=None,
    od_type=None,
    nan_mode=None,
    counter_calc_method=None,
    leaf_estimation_iterations=None,
    leaf_estimation_method=None,
    thread_count=None,
    random_seed=None,
    use_best_model=None,
    best_model_min_trees=None,
    verbose=None,
    silent=None,
    logging_level=None,
    metric_period=None,
    ctr_leaf_count_limit=None,
    store_all_simple_ctr=None,
    max_ctr_complexity=None,
    has_time=None,
    allow_const_label=None,
    target_border=None,
    one_hot_max_size=None,
    random_strength=None,
    name=None,
    ignored_features=None,
    train_dir=None,
    custom_metric=None,
    eval_metric=None,
    bagging_temperature=None,
    save_snapshot=None,
    snapshot_file=None,
    snapshot_interval=None,
    fold_len_multiplier=None,
    used_ram_limit=None,
    gpu_ram_part=None,
    pinned_memory_size=None,
    allow_writing_files=None,
    final_ctr_computation_mode=None,
    approx_on_full_history=None,
    boosting_type=None,
    simple_ctr=None,
    combinations_ctr=None,
    per_feature_ctr=None,
    ctr_description=None,
    ctr_target_border_count=None,
    task_type=None,
    device_config=None,
    devices=None,
    bootstrap_type=None,
    subsample=None,
    mvs_reg=None,
    sampling_frequency=None,
    sampling_unit=None,
    dev_score_calc_obj_block_size=None,
    dev_efb_max_buckets=None,
    sparse_features_conflict_fraction=None,
    max_depth=None,
    n_estimators=None,
    num_boost_round=None,
    num_trees=None,
    colsample_bylevel=None,
    random_state=None,
    reg_lambda=None,
    objective=None,
    eta=None,
    max_bin=None,
    gpu_cat_features_storage=None,
    data_partition=None,
    metadata=None,
    early_stopping_rounds=None,
    cat_features=None,
    grow_policy=None,
    min_data_in_leaf=None,
    min_child_samples=None,
    max_leaves=None,
    num_leaves=None,
    score_function=None,
    leaf_estimation_backtracking=None,
    ctr_history_unit=None,
    monotone_constraints=None,
    feature_weights=None,
    penalties_coefficient=None,
    first_feature_use_penalties=None,
    per_object_feature_penalties=None,
    model_shrink_rate=None,
    model_shrink_mode=None,
    langevin=None,
    diffusion_temperature=None,
    posterior_sampling=None,
    boost_from_average=None
):
    """
    Gather the constructor arguments that were actually supplied and pass
    them, as one dict, to the parent class initializer.

    An argument is forwarded under its own name when its value is not None.
    `loss_function` defaults to 'RMSE', so it is forwarded unless the caller
    explicitly passes None.
    """
    # Snapshot the frame first: at this point it holds only `self` and the
    # keyword arguments, so no helper local can end up among the settings.
    supplied = dict(locals())
    excluded = ("not_params", "self", "params", "__class__")
    params = {
        arg_name: arg_value
        for arg_name, arg_value in supplied.items()
        if arg_name not in excluded and arg_value is not None
    }

    super(CatBoostRegressor, self).__init__(params)

def fit(self, X, y=None, cat_features=None, sample_weight=None, baseline=None,
        use_best_model=None, eval_set=None, verbose=None, logging_level=None, plot=False,
        column_description=None, verbose_eval=None, metric_period=None, silent=None,
        early_stopping_rounds=None, save_snapshot=None, snapshot_file=None,
        snapshot_interval=None, init_model=None):
    """
    Fit the CatBoost model.

    Parameters
    ----------
    X : catboost.Pool or list or numpy.ndarray or pandas.DataFrame or pandas.Series
        If not catboost.Pool, 2 dimensional Feature matrix or string - file with dataset.

    y : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
        Labels, 1 dimensional array like.
        Use only if X is not catboost.Pool.

    cat_features : list or numpy.ndarray, optional (default=None)
        If not None, giving the list of Categ columns indices.
        Use only if X is not catboost.Pool.

    sample_weight : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
        Instance weights, 1 dimensional array like.

    baseline : list or numpy.ndarray, optional (default=None)
        If not None, giving 2 dimensional array like data.
        Use only if X is not catboost.Pool.

    use_best_model : bool, optional (default=None)
        Flag to use best model

    eval_set : catboost.Pool or list, optional (default=None)
        A list of (X, y) tuple pairs to use as a validation set for early-stopping

    metric_period : int
        Frequency of evaluating metrics.

    verbose : bool or int
        If verbose is bool, then if set to True, logging_level is set to Verbose,
        if set to False, logging_level is set to Silent.
        If verbose is int, it determines the frequency of writing metrics to output and
        logging_level is set to Verbose.

    silent : bool
        If silent is True, logging_level is set to Silent.
        If silent is False, logging_level is set to Verbose.

    logging_level : string, optional (default=None)
        Possible values:
            - 'Silent'
            - 'Verbose'
            - 'Info'
            - 'Debug'

    plot : bool, optional (default=False)
        If True, draw train and eval error in Jupyter notebook

    column_description
        Not described by the original docstring; forwarded to `_fit` unchanged.

    verbose_eval : bool or int
        Synonym for verbose. Only one of these parameters should be set.

    early_stopping_rounds : int
        Activates Iter overfitting detector with od_wait set to early_stopping_rounds.

    save_snapshot : bool, [default=None]
        Enable progress snapshotting for restoring progress after crashes or interruptions

    snapshot_file : string, [default=None]
        Learn progress snapshot file path, if None will use default filename

    snapshot_interval : int, [default=600]
        Interval between saving snapshots (seconds)

    init_model : CatBoost class or string, [default=None]
        Continue training starting from the existing model.
        If this parameter is a string, load initial model from the path specified by this string.

    Returns
    -------
    model : CatBoost
    """
    # Work on a copy of the constructor settings so synonym processing does
    # not touch the stored originals.
    params = deepcopy(self._init_params)
    _process_synonyms(params)

    # Check the configured loss before training starts.
    if 'loss_function' in params:
        self._check_is_regressor_loss(params['loss_function'])

    # The literal None arguments fill positional slots of `_fit` that this
    # method does not expose.
    return self._fit(X, y, cat_features, None, None, None, sample_weight, None, None, None,
                     None, baseline, use_best_model, eval_set, verbose, logging_level, plot,
                     column_description, verbose_eval, metric_period, silent,
                     early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval,
                     init_model)

def predict(self, data, prediction_type=None, ntree_start=0, ntree_end=0, thread_count=-1,
            verbose=None):
    """
    Predict with data.

    Parameters
    ----------
    data : catboost.Pool or list of features or list of lists or numpy.ndarray or pandas.DataFrame
           or pandas.Series or catboost.FeaturesData
        Data to apply model on.
        If data is a simple list (not list of lists) or a one-dimensional numpy.ndarray it is
        interpreted as a list of features for a single object.

    prediction_type : string, optional (default='RawFormulaVal')
        Can be:
        - 'RawFormulaVal' : return raw formula value.
        - 'Exponent' : return Exponent of raw formula value.
        When left as None, the value returned by `_get_default_prediction_type()` is used.

    ntree_start: int, optional (default=0)
        Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).

    ntree_end: int, optional (default=0)
        Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).
        If value equals to 0 this parameter is ignored and ntree_end equal to tree_count_.

    thread_count : int (default=-1)
        The number of threads to use when applying the model.
        Allows you to optimize the speed of execution. This parameter doesn't affect results.
        If -1, then the number of threads is set to the number of CPU cores.

    verbose : bool
        If True, writes the evaluation metric measured set to stderr.

    Returns
    -------
    prediction :
        If data is for a single object, the return value is single float formula return value
        otherwise one-dimensional numpy.ndarray of formula return values for each object.
    """
    if prediction_type is None:
        prediction_type = self._get_default_prediction_type()

    # The trailing string tells `_predict` which public method invoked it.
    return self._predict(data, prediction_type, ntree_start, ntree_end, thread_count, verbose,
                         'predict')

---------

data : catboost.Pool或特征清單或清單的清單或numpy.ndarray或pandas.DataFrame或pandas.Series或catboost.FeaturesData。應用模型的資料。如果data是一個簡單的清單(不是清單的清單)或一維numpy.ndarray，則它被解釋為單個對象的特征清單。

prediction_type :字元串，可選(預設為'RawFormulaVal')。可以是:

- 'RawFormulaVal':傳回原始公式值。

- 'Exponent':傳回原始公式值的指數。

ntree_start: int，可選(預設為0)

模型應用于區間[ntree_start, ntree_end)(從零開始索引)。

ntree_end: int，可選(預設為0)

模型應用于區間[ntree_start, ntree_end)(從零開始索引)。如果value等于0，則忽略該參數，ntree_end等于tree_count_。

thread_count :int(預設=-1)。應用模型時要使用的線程數。允許您優化執行速度。此參數不影響結果。如果-1，則線程數設定為CPU核數。

verbose : bool。如果為True，則将評估指標(evaluation metric)的測量值寫入stderr。

傳回

-------

prediction : 如果資料是針對單個對象的，則傳回值為單個float公式值；否則傳回一維numpy.ndarray，其中包含每個對象的公式值。

def staged_predict(self, data, prediction_type='RawFormulaVal', ntree_start=0, ntree_end=0,
                   eval_period=1, thread_count=-1, verbose=None):
    """
    Predict target at each stage for data.

    Parameters
    ----------
    data : catboost.Pool or list of features or list of lists or numpy.ndarray or pandas.DataFrame
           or pandas.Series or catboost.FeaturesData
        Data to apply model on.
        If data is a simple list (not list of lists) or a one-dimensional numpy.ndarray it is
        interpreted as a list of features for a single object.

    prediction_type : string, optional (default='RawFormulaVal')
        Forwarded to `_staged_predict` unchanged.

    ntree_start: int, optional (default=0)
        Model is applied on the interval [ntree_start, ntree_end) with the step eval_period
        (zero-based indexing).

    ntree_end: int, optional (default=0)
        Model is applied on the interval [ntree_start, ntree_end) with the step eval_period
        (zero-based indexing).
        If value equals to 0 this parameter is ignored and ntree_end equal to tree_count_.

    eval_period: int, optional (default=1)
        Model is applied on the interval [ntree_start, ntree_end) with the step eval_period
        (zero-based indexing).

    thread_count : int (default=-1)
        The number of threads to use when applying the model.
        Allows you to optimize the speed of execution. This parameter doesn't affect results.
        If -1, then the number of threads is set to the number of CPU cores.

    verbose : bool
        Forwarded to `_staged_predict` unchanged.

    Returns
    -------
    prediction : generator for each iteration that generates:
        If data is for a single object, the return value is single float formula return value
        otherwise one-dimensional numpy.ndarray of formula return values for each object.
    """
    # The trailing string tells `_staged_predict` which public method invoked it.
    return self._staged_predict(data, prediction_type, ntree_start, ntree_end, eval_period,
                                thread_count, verbose, 'staged_predict')

data : catboost.Pool或特征清單或清單的清單或numpy.ndarray或pandas.DataFrame或pandas.Series或catboost.FeaturesData。應用模型的資料。如果data是一個簡單的清單(不是清單的清單)或一維numpy.ndarray，則它被解釋為單個對象的特征清單。

ntree_start: int，可選(預設為0)。模型應用于間隔[ntree_start, ntree_end)，步長為eval_period(從零開始索引)。

ntree_end:int，可選(預設為0)。模型應用于間隔[ntree_start, ntree_end)，步長為eval_period(從零開始索引)。如果value等于0，則忽略該參數，ntree_end等于tree_count_。

eval_period: int，可選(預設為1)。模型應用于間隔[ntree_start, ntree_end)，步長為eval_period(從零開始索引)。

thread_count : int(預設=-1)。應用模型時要使用的線程數。允許您優化執行速度。此參數不影響結果。如果-1，則線程數設定為CPU核數。

prediction : 每次疊代產生一個結果的生成器：如果資料是針對單個對象的，則傳回值為單個float公式值；否則傳回一維numpy.ndarray，其中包含每個對象的公式值。

def score(self, X, y=None):
    """
    Calculate R^2.

    Parameters
    ----------
    X : catboost.Pool or list or numpy.ndarray or pandas.DataFrame or pandas.Series.
        Data to apply model on.
    y : list or numpy.ndarray
        True labels.

    Returns
    -------
    R^2 : float

    Raises
    ------
    CatBoostError
        If X is a catboost.Pool and y is also given, if X is a catboost.Pool
        whose `get_label()` returns None, or if X is not a catboost.Pool and
        y is None.
    """
    if isinstance(X, Pool):
        if y is not None:
            raise CatBoostError("Wrong initializing y: X is catboost.Pool object, y must be "
                                "initialized inside catboost.Pool.")
        y = X.get_label()
        if y is None:
            raise CatBoostError("Label in X has not initialized.")
    elif y is None:
        raise CatBoostError("y should be specified.")

    y = np.array(y, dtype=np.float64)
    predictions = self._predict(X,
                                prediction_type=self._get_default_prediction_type(),
                                ntree_start=0,
                                ntree_end=0,
                                thread_count=-1,
                                verbose=None,
                                parent_method_name='score')

    loss = self._object._get_loss_function_name()
    if loss == 'RMSEWithUncertainty':
        # Keep only the first column so the residuals below are computed
        # against one value per object. The previous `[:0]` slice always
        # produced an empty array.
        # NOTE(review): assumes predictions for this loss are 2-D with the
        # point estimate in column 0 - confirm against the CatBoost docs.
        predictions = predictions[:, 0]

    total_sum_of_squares = np.sum((y - y.mean(axis=0)) ** 2)
    residual_sum_of_squares = np.sum((y - predictions) ** 2)
    return 1 - residual_sum_of_squares / total_sum_of_squares

def _check_is_regressor_loss(self, loss_function):
    """
    Reject a string loss function that is not a regression objective.

    Parameters
    ----------
    loss_function : string or object
        The configured loss. Only `str` values can be rejected; any other
        object passes this check.

    Raises
    ------
    CatBoostError
        If loss_function is a string for which both
        `_is_regression_objective` and `_is_multiregression_objective`
        return a falsy value.
    """
    is_regression = (
        self._is_regression_objective(loss_function)
        or self._is_multiregression_objective(loss_function)
    )
    if isinstance(loss_function, str) and not is_regression:
        raise CatBoostError(
            "Invalid loss_function='{}': for regressor use "
            "RMSE, MultiRMSE, MAE, Quantile, LogLinQuantile, Poisson, MAPE, Lq or custom "
            "objective object".format(loss_function)
        )

def _get_default_prediction_type(self):
    """
    Choose the prediction type to use when the caller did not specify one.

    Returns
    -------
    string
        'Exponent' when the configured loss_function is a string starting
        with 'Poisson' or 'Tweedie', 'RMSEWithUncertainty' when it equals
        that name exactly, and 'RawFormulaVal' otherwise (including when
        loss_function is unset or is not a string).
    """
    # TODO(ilyzhin) change on get_all_params after MLTOOLS-4758
    # `params` was previously read without being defined here. Build it the
    # same way `fit` does: from a copy of the constructor settings, with
    # synonyms processed.
    params = deepcopy(self._init_params)
    _process_synonyms(params)

    loss_function = params.get('loss_function')
    if loss_function and isinstance(loss_function, str):
        if loss_function.startswith(('Poisson', 'Tweedie')):
            return 'Exponent'
        if loss_function == 'RMSEWithUncertainty':
            return 'RMSEWithUncertainty'
    return 'RawFormulaVal'

ML之catboost：catboost的CatBoostRegressor函數源代碼簡介、解讀之詳細攻略

catboost的CatBoostRegressor函數源代碼簡介、解讀

繼續閱讀

來自python的【條件控制/語句循環/break/continue/else/pass】一、條件控制二、語句循環

無法解析的外部符号 wmain，該符号在函數 "void cdecl mainCRTStartupHelper(struct HINSTANCE *,unsigned short con......

TestLink導出用例轉換工具(XML2Excel)

YAML簡介和PyYAML安全操作YAML支援的類型YAML的優點：yaml的基本文法python操作

Small tricks

libsvm for python 安裝

學習軟體測試基礎測試第七天

Zeppelin 配置通路 REST APIApache Zeppelin Configuration REST API

【Torch】最簡潔logging使用指南

27. Remove Element(清單)題目代碼

Cloud Studio初體驗

使用 ctypes 進行 Python 和 C 的混合程式設計

【python】【資料處理】畫多元資料分布圖

【python】netconf協定對接管理裝置

「Python 網絡自動化」NETCONF —— Python 使用 NETCONF 管理配置 H3C 網絡裝置

在python中建立excel并寫入