diff --git a/README.md b/README.md
index 3b1f3ca..dac6f7a 100644
--- a/README.md
+++ b/README.md
@@ -313,5 +313,176 @@ plt.show()

Gradient Boosting - Classification
````python
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

# Synthetic binary classification problem (labels are +1 / -1)
X, y = make_hastie_10_2(random_state=0)
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]
print(X.shape, y.shape)
print(X[0:5, :])
print(y[0:5])

# Gradient boosting with 100 depth-1 trees (decision stumps)
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0)
clf.fit(X_train, y_train)
print('Accuracy score (training): {0:.3f}'.format(clf.score(X_train, y_train)))
print('Accuracy score (testing): {0:.3f}'.format(clf.score(X_test, y_test)))
````
Result
````plaintext
(12000, 10) (12000,)
[[ 1.76405235  0.40015721  0.97873798  2.2408932   1.86755799 -0.97727788
   0.95008842 -0.15135721 -0.10321885  0.4105985 ]
 [ 0.14404357  1.45427351  0.76103773  0.12167502  0.44386323  0.33367433
   1.49407907 -0.20515826  0.3130677  -0.85409574]
 [-2.55298982  0.6536186   0.8644362  -0.74216502  2.26975462 -1.45436567
   0.04575852 -0.18718385  1.53277921  1.46935877]
 [ 0.15494743  0.37816252 -0.88778575 -1.98079647 -0.34791215  0.15634897
   1.23029068  1.20237985 -0.38732682 -0.30230275]
 [-1.04855297 -1.42001794 -1.70627019  1.9507754  -0.50965218 -0.4380743
  -1.25279536  0.77749036 -1.61389785 -0.21274028]]
[ 1. -1.  1. -1.  1.]
Accuracy score (training): 0.879
Accuracy score (testing): 0.819
````
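Not part of the original example, but a small companion sketch: scikit-learn's `staged_predict` yields the ensemble's prediction after each boosting iteration, which makes it easy to watch test accuracy improve as stumps are added (same data split and hyperparameters as above).

````python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

# Same data split and hyperparameters as the snippet above (illustrative sketch)
X, y = make_hastie_10_2(random_state=0)
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0)
clf.fit(X_train, y_train)

# staged_predict returns predictions after each of the 100 boosting iterations,
# so this list traces test accuracy as stumps are added one by one
test_accuracy = [np.mean(stage_pred == y_test) for stage_pred in clf.staged_predict(X_test)]

plt.plot(np.arange(1, len(test_accuracy) + 1), test_accuracy)
plt.xlabel('Boosting iterations')
plt.ylabel('Test accuracy')
plt.show()
````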
XGBoost - Regression
````python
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston  # deprecated in scikit-learn 1.0 and removed in 1.2
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Load the Boston housing data into a DataFrame and append the target as 'PRICE'
boston = load_boston()
data = pd.DataFrame(boston.data)
data.columns = boston.feature_names
data['PRICE'] = boston.target
print(data.head())
X, y = data.iloc[:, :-1], data.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Small boosted ensemble: 10 depth-5 trees with column subsampling and L1 regularization (alpha)
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3,
                          learning_rate=0.1, max_depth=5, alpha=10, n_estimators=10)
xg_reg.fit(X_train, y_train)
preds = xg_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print('RMSE: %f' % (rmse))
````
Result
````plaintext
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0

   PTRATIO       B  LSTAT  PRICE
0     15.3  396.90   4.98   24.0
1     17.8  396.90   9.14   21.6
2     17.8  392.83   4.03   34.7
3     18.7  394.63   2.94   33.4
4     18.7  396.90   5.33   36.2
RMSE: 10.423243
````
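As an add-on not in the original snippet: xgboost also ships a native cross-validation helper, `xgb.cv`, driven by its `DMatrix` data container. Below is a minimal sketch reusing the same Boston data and hyperparameters; the 3 folds and 50 boosting rounds are illustrative choices.

````python
import pandas as pd
import xgboost as xgb
from sklearn.datasets import load_boston  # removed in scikit-learn 1.2, same caveat as above

boston = load_boston()
data = pd.DataFrame(boston.data, columns=boston.feature_names)
data['PRICE'] = boston.target
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# DMatrix is xgboost's internal data container used by the native (non-sklearn) API
dmatrix = xgb.DMatrix(data=X, label=y)

params = {'objective': 'reg:squarederror', 'colsample_bytree': 0.3,
          'learning_rate': 0.1, 'max_depth': 5, 'alpha': 10}

# 3-fold cross-validation over 50 boosting rounds; returns per-round train/test RMSE
cv_results = xgb.cv(dtrain=dmatrix, params=params, nfold=3, num_boost_round=50,
                    metrics='rmse', as_pandas=True, seed=123)
print(cv_results['test-rmse-mean'].tail(1))
````

`cv_results` is a DataFrame whose last row holds the cross-validated RMSE after the final boosting round.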
LightGBM
````python
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from lightgbm import plot_importance, plot_metric, plot_tree
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=123)

# Fit with early stopping on a held-out evaluation set.
# Note: early_stopping_rounds and verbose were removed from fit() in lightgbm 4.0,
# so this call needs lightgbm < 4; see the callback-based sketch after the result below.
lgbmc = LGBMClassifier(n_estimators=400)
evals = [(X_test, y_test)]
lgbmc.fit(X_train, y_train, early_stopping_rounds=100, eval_metric='logloss', eval_set=evals, verbose=True)
preds = lgbmc.predict(X_test)

# 5-fold cross-validation on the full dataset
cross_val = cross_validate(
    estimator=lgbmc,
    X=iris.data, y=iris.target,
    cv=5
)

print('avg fit time: {} (+/- {})'.format(cross_val['fit_time'].mean(), cross_val['fit_time'].std()))
print('avg score time: {} (+/- {})'.format(cross_val['score_time'].mean(), cross_val['score_time'].std()))
print('avg test score: {} (+/- {})'.format(cross_val['test_score'].mean(), cross_val['test_score'].std()))

plot_metric(lgbmc)
plot_importance(lgbmc, figsize=(10, 12))
plot_tree(lgbmc, figsize=(28, 14))
plt.show()
````
Result
````plaintext
[1]  valid_0's multi_logloss: 0.95847
[2]  valid_0's multi_logloss: 0.832184
[3]  valid_0's multi_logloss: 0.731164
[4]  valid_0's multi_logloss: 0.641056
[5]  valid_0's multi_logloss: 0.571726
[6]  valid_0's multi_logloss: 0.507286
[7]  valid_0's multi_logloss: 0.454933
[8]  valid_0's multi_logloss: 0.410205
[9]  valid_0's multi_logloss: 0.372194
[10] valid_0's multi_logloss: 0.333919
[11] valid_0's multi_logloss: 0.310212
[12] valid_0's multi_logloss: 0.282326
[13] valid_0's multi_logloss: 0.257165
[14] valid_0's multi_logloss: 0.240836
[15] valid_0's multi_logloss: 0.225383
[16] valid_0's multi_logloss: 0.211583
[17] valid_0's multi_logloss: 0.199289
[18] valid_0's multi_logloss: 0.186269
[19] valid_0's multi_logloss: 0.171556
[20] valid_0's multi_logloss: 0.168245
[21] valid_0's multi_logloss: 0.161065
[22] valid_0's multi_logloss: 0.151371
[23] valid_0's multi_logloss: 0.148081
[24] valid_0's multi_logloss: 0.143843
[25] valid_0's multi_logloss: 0.140169
...
[137] valid_0's multi_logloss: 0.376748
avg fit time: 0.5514350891113281 (+/- 0.3701610138582717)
avg score time: 0.010002517700195312 (+/- 0.009552237668971902)
avg test score: 0.9600000000000002 (+/- 0.04898979485566355)
````

![result](./images/11_4.png)
![result](./images/11_5.png)
![result](./images/11_6.png)
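The `early_stopping_rounds`/`verbose` arguments used above were removed from the scikit-learn style `fit()` in lightgbm 4.0. Below is a minimal sketch of the equivalent call with the callback API, reusing the same iris split and settings; the `period=10` logging interval is just an illustrative choice.

````python
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=123)

lgbmc = LGBMClassifier(n_estimators=400)
lgbmc.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='logloss',
    # callbacks replace early_stopping_rounds=100 / verbose=True from the older API
    callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=10)],
)
print('best iteration:', lgbmc.best_iteration_)
print('test accuracy: {:.3f}'.format(lgbmc.score(X_test, y_test)))
````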
diff --git a/images/11_4.png b/images/11_4.png
new file mode 100644
index 0000000..11665b2
Binary files /dev/null and b/images/11_4.png differ
diff --git a/images/11_5.png b/images/11_5.png
new file mode 100644
index 0000000..9009e98
Binary files /dev/null and b/images/11_5.png differ
diff --git a/images/11_6.png b/images/11_6.png
new file mode 100644
index 0000000..734ebd0
Binary files /dev/null and b/images/11_6.png differ