|
33 | 33 | "[Images Source](https://commons.wikimedia.org/wiki/File:3D_view_of_an_event_recorded_with_the_CMS_detector_in_2012_at_a_proton-proton_centre_of_mass_energy_of_8_TeV.png)"
|
34 | 34 | ]
|
35 | 35 | },
|
36 |
| - { |
37 |
| - "cell_type": "markdown", |
38 |
| - "id": "2f66db55-d1f8-407e-b7bb-b353ce78fa56", |
39 |
| - "metadata": {}, |
40 |
| - "source": [ |
41 |
| - "# Example environment creation:\n", |
42 |
| - "\n", |
43 |
| - "This environment is the latest [Intel® oneAPI AI Analytics Toolkit](https://software.intel.com/content/www/us/en/develop/tools/oneapi/ai-analytics-toolkit.html) base environment, which includes data analytics and machine learning workflows and Intel optimizations for XGboost. See [here](https://software.intel.com/content/www/us/en/develop/articles/installing-ai-kit-with-conda.html) for more installation information." |
44 |
| - ] |
45 |
| - }, |
46 | 36 | {
|
47 | 37 | "cell_type": "markdown",
|
48 | 38 | "id": "57efb70b-7073-4baa-b544-253551c7bb58",
|
|
61 | 51 | "outputs": [],
|
62 | 52 | "source": [
|
63 | 53 | "import sklearn\n",
|
64 |
| - "#from sklearnex import patch_sklearn\n", |
65 |
| - "#patch_sklearn()\n", |
| 54 | + "from sklearnex import patch_sklearn\n", |
| 55 | + "patch_sklearn()\n", |
66 | 56 | "#unpatch_sklearn()\n",
|
67 | 57 | "from sklearn.model_selection import train_test_split\n",
|
68 | 58 | "from sklearn.metrics import mean_squared_error\n",
|
| 59 | + "import warnings\n", |
| 60 | + "warnings.simplefilter(action='ignore', category=FutureWarning)\n", |
69 | 61 | "import pandas as pd\n",
|
| 62 | + "from pandas import MultiIndex, Int16Dtype # if you don't import in this order you will get a FutureWarning about pandas.Int64Index.\n", |
70 | 63 | "import xgboost as xgb\n",
|
71 | 64 | "import numpy as np\n",
|
72 |
| - "import warnings\n", |
73 | 65 | "from time import perf_counter\n",
|
74 | 66 | "print(\"XGB Version : \", xgb.__version__)\n",
|
75 |
| - "print(\"Scikit-Learn Version : \", sklearn.__version__)" |
| 67 | + "print(\"Scikit-Learn Version : \", sklearn.__version__)\n", |
| 68 | + "print(\"Pandas Version : \", pd.__version__)" |
76 | 69 | ]
|
77 | 70 | },
|
78 | 71 | {
|
|
169 | 162 | "source": [
|
170 | 163 | "filename = 'HIGGS.csv'\n",
|
171 | 164 | "names = ['class_label', 'lepton pT', 'lepton eta', 'lepton phi', 'missing energy magnitude', 'missing energy phi', 'jet 1 pt', 'jet 1 eta', 'jet 1 phi', 'jet 1 b-tag', 'jet 2 pt', 'jet 2 eta', 'jet 2 phi', 'jet 2 b-tag', 'jet 3 pt', 'jet 3 eta', 'jet 3 phi', 'jet 3 b-tag', 'jet 4 pt', 'jet 4 eta', 'jet 4 phi', 'jet 4 b-tag', 'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']\n",
|
172 |
| - "data = pd.read_csv(filename, names=names, delimiter=\",\", nrows=100000)\n", |
173 |
| - "#data = pd.read_csv(filename, names=names, delimiter=\",\", nrows=1100000)\n", |
| 165 | + "#data = pd.read_csv(filename, names=names, delimiter=\",\", nrows=100000)\n", |
| 166 | + "data = pd.read_csv(filename, names=names, delimiter=\",\", nrows=1100000)\n", |
174 | 167 | "print(data.shape)"
|
175 | 168 | ]
|
176 | 169 | },
|
| 170 | + { |
| 171 | + "cell_type": "code", |
| 172 | + "execution_count": null, |
| 173 | + "id": "505ce472-a525-42c3-b995-ddc2c3aa2b43", |
| 174 | + "metadata": {}, |
| 175 | + "outputs": [], |
| 176 | + "source": [ |
| 177 | + "%time p_df = pd.read_csv(\"HIGGS.csv\")" |
| 178 | + ] |
| 179 | + }, |
177 | 180 | {
|
178 | 181 | "cell_type": "markdown",
|
179 | 182 | "id": "0d7249bc-4e6b-4a28-8894-00b14c61d4f2",
|
|
233 | 236 | "id": "d339991a-3485-49ef-8151-c5c8020fc586",
|
234 | 237 | "metadata": {},
|
235 | 238 | "source": [
|
236 |
| - "* In this scenario loading 100000 rows the balance isn't too skewed. " |
| 239 | + "* In this scenario, loading 100000 rows, the balance isn't too skewed; the next cell is optional." |
237 | 240 | ]
|
238 | 241 | },
|
239 | 242 | {
|
|
293 | 296 | "outputs": [],
|
294 | 297 | "source": [
|
295 | 298 | "# This is the y target vector -- the ones we want to predict.\n",
|
296 |
| - "# print(data.iloc[:,0])" |
| 299 | + "print(data.iloc[:,0])" |
297 | 300 | ]
|
298 | 301 | },
|
299 | 302 | {
|
|
383 | 386 | "}\n",
|
384 | 387 | "\n",
|
385 | 388 | "# Train the model\n",
|
386 |
| - "warnings.filterwarnings(\"ignore\", category=UserWarning)\n", |
| 389 | + "warnings.simplefilter(action='ignore', category=UserWarning)\n", |
387 | 390 | "t1_start = perf_counter() # Time fit function\n",
|
388 | 391 | "model_xgb= xgb.XGBClassifier(**xgb_params)\n",
|
389 | 392 | "model_xgb.fit(X_train,y_train)\n",
|
|
515 | 518 | " 'disable_default_eval_metric': 'true',\n",
|
516 | 519 | " 'tree_method': 'hist', \n",
|
517 | 520 | "}\n",
|
| 521 | + "# Necessary for now to suppress multi-threaded Future errors with respect to pandas and XGBoost\n", |
| 522 | + "import os\n", |
| 523 | + "os.environ['PYTHONWARNINGS']='ignore::FutureWarning'\n", |
518 | 524 | "\n",
|
519 | 525 | "# Train the model\n",
|
520 |
| - "warnings.filterwarnings(\"ignore\", category=UserWarning)\n", |
521 | 526 | "model_xgb= xgb.XGBClassifier(**xgb_params2, use_label_encoder=False)\n",
|
522 | 527 | "\n",
|
523 |
| - "\n", |
524 | 528 | "# Setup grid search n_jobs=-1 uses all cores, reducing cv from 5 to 3 for speed, scoring is done using area under curve.\n",
|
525 | 529 | "grid_cv = GridSearchCV(model_xgb, param_grid, n_jobs=-1, cv=3, scoring=\"roc_auc\")\n",
|
526 | 530 | "\n",
|
527 | 531 | "# This fit function takes a while--hours, make sure you are ready.\n",
|
528 |
| - "\n", |
529 | 532 | "_ = grid_cv.fit(X_train, y_train)"
|
530 | 533 | ]
|
531 | 534 | },
|
|
556 | 559 | "source": [
|
557 | 560 | "### Results\n",
|
558 | 561 | "\n",
|
559 |
| - " grid_cv.best_score_ = 0.8002252116945674 grid cv.best_params\n", |
| 562 | + " grid_cv.best_score_ = 0.80 grid_cv.best_params_\n", |
560 | 563 | "\n",
|
561 | 564 | " {'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 8, 'reg_lambda': 10, 'scale_pos_weight': 1, 'subsample': 1}\n",
|
562 | 565 | "\n",
|
|
609 | 612 | "\n",
|
610 | 613 | " n_estimators:, default=100\n",
|
611 | 614 | "\n",
|
612 |
| - "The number of trees in the forest. A good way to see how many trees might be useful is to plot the learning curve. since this is a classification problem we will use log loss as our measurement where lower values are better. \n", |
| 615 | + "n_estimators represents the number of trees in the forest. A good way to see how many trees might be useful is to plot the learning curve. Since this is a classification problem we will use log loss as our measurement where lower values are better. \n", |
613 | 616 | "\n",
|
614 | 617 | "Our original fit function needs to be modified to include eval_metric with the type set to logloss. In addition, we need to define the evaluation data set so that the results are evaluated after each round in order to plot them.\n"
|
615 | 618 | ]
|
|
632 | 635 | "metadata": {},
|
633 | 636 | "outputs": [],
|
634 | 637 | "source": [
|
635 |
| - "# fit the model\n", |
| 638 | + "# Fit the model\n", |
636 | 639 | "model_xgb.fit(X_train, y_train, eval_metric='logloss', eval_set=evalset)"
|
637 | 640 | ]
|
638 | 641 | },
|
|
716 | 719 | " 'reg_lambda': 10,\n",
|
717 | 720 | " 'scale_pos_weight': 1,\n",
|
718 | 721 | " 'tree_method': 'hist', \n",
|
719 |
| - " 'n_estimators': 250\n", |
| 722 | + " 'n_estimators': 1000,\n", |
720 | 723 | "}\n",
|
721 | 724 | "\n",
|
722 | 725 | "# Train the model\n",
|
723 |
| - "warnings.filterwarnings(\"ignore\", category=UserWarning)\n", |
724 |
| - "t = time.process_time() # Time fit function\n", |
| 726 | + "t1_start = perf_counter() # Time fit function\n", |
725 | 727 | "model_xgb= xgb.XGBClassifier(**xgb_params)\n",
|
726 | 728 | "model_xgb.fit(X_train,y_train, eval_metric='logloss', eval_set=evalset, verbose=True)\n",
|
727 |
| - "elapsed_time = time.process_time() - t\n", |
728 |
| - "print (\"It took\",elapsed_time,\" to fit.\")" |
| 729 | + "t1_stop = perf_counter()\n", |
| 730 | + "print (\"It took\", t1_stop-t1_start,\"seconds to fit.\")" |
729 | 731 | ]
|
730 | 732 | },
|
731 | 733 | {
|
|
780 | 782 | "source": [
|
781 | 783 | "## So how many trees do we need really?\n",
|
782 | 784 | "\n",
|
783 |
| - "* It takes awhile to watch 250 trees get evaluated, a great performance improvement is to use the XGBoost early stopping capbility.\n", |
| 785 | + "* It takes a while to watch 1000 trees get evaluated; a great performance improvement is to use the XGBoost early stopping capability.\n", |
784 | 786 | "\n",
|
785 | 787 | "* Modify the fit function to stop the training after 10 to 15 rounds of no improvement. \n",
|
786 | 788 | " \n",
|
|
823 | 825 | ],
|
824 | 826 | "metadata": {
|
825 | 827 | "kernelspec": {
|
826 |
| - "display_name": "Python 3 (ipykernel)", |
| 828 | + "display_name": "Python 3 (Intel® oneAPI 2023.0)", |
827 | 829 | "language": "python",
|
828 |
| - "name": "python3" |
| 830 | + "name": "c009-intel_distribution_of_python_3_oneapi-beta05-python" |
829 | 831 | },
|
830 | 832 | "language_info": {
|
831 | 833 | "codemirror_mode": {
|
|
837 | 839 | "name": "python",
|
838 | 840 | "nbconvert_exporter": "python",
|
839 | 841 | "pygments_lexer": "ipython3",
|
840 |
| - "version": "3.9.7" |
| 842 | + "version": "3.9.15" |
841 | 843 | },
|
842 | 844 | "nbTranslate": {
|
843 | 845 | "displayLangs": [
|
|
0 commit comments