import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import (StratifiedKFold, GroupKFold, KFold,
-                                     train_test_split, GroupShuffleSplit, PredefinedSplit)
+                                     train_test_split, GroupShuffleSplit,
+                                     StratifiedGroupKFold,
+                                     PredefinedSplit)
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier, RandomForestRegressor,
                              GradientBoostingRegressor, ExtraTreesRegressor,
from sklearn.preprocessing import (LabelEncoder, MaxAbsScaler, MinMaxScaler,
                                   Normalizer, PowerTransformer, StandardScaler,
                                   QuantileTransformer)
+from sklearn.svm import SVC, SVR, NuSVC, NuSVR, LinearSVC, LinearSVR
from sklearn.feature_selection import VarianceThreshold, RFECV
from sklearn.inspection import permutation_importance
from sklearn import metrics
@@ -256,7 +259,7 @@ def objective(trial, X, y, cv, group, score=scr):
    print(f"\tBest value (rmse or r2): {study.best_value:.5f}")
    print(f"\tBest params:")

-def _group_cv(X_train, y_train, group, test_size=0.2, cv=10):
+def _group_cv(X_train, y_train, group, test_size=0.2, cv=10, strat=False):

    """
    Return the splits and vars for a group grid search
@@ -275,17 +278,27 @@ def _group_cv(X_train, y_train, group, test_size=0.2, cv=10):
    y_train = y_train[train_inds]
    group_trn = group[train_inds]

-    group_kfold = GroupKFold(n_splits=cv)
-    # Create a nested list of train and test indices for each fold
-    k_kfold = group_kfold.split(X_train, y_train, group_trn)
+    if strat is True:
+        group_kfold = StratifiedGroupKFold(n_splits=cv).split(X_train,
+                                                              y_train,
+                                                              group_trn)
+    else:
+        group_kfold = GroupKFold(n_splits=cv).split(X_train,
+                                                    y_train,
+                                                    group_trn)
+
+    # the explicit index construction below is not required - it produces the same splits; kept for reference
+    # # Create a nested list of train and test indices for each fold
+    # k_kfold = group_kfold.split(X_train, y_train, groups=group_trn)

-    train_ind2, test_ind2 = [list(traintest) for traintest in zip(*k_kfold)]
+    # train_ind2, test_ind2 = [list(traintest) for traintest in zip(*k_kfold)]

-    cv = [*zip(train_ind2, test_ind2)]
+    # cv = [*zip(train_ind2, test_ind2)]

-    return X_train, y_train, X_test, y_test, cv
+    return X_train, y_train, X_test, y_test, group_kfold

-def rec_feat_sel(X_train, featnames, preproc=('scaler', None), clf='erf', group=None,
+def rec_feat_sel(X_train, featnames, preproc=('scaler', None), clf='erf',
+                 group=None,
                 cv=5, params=None, cores=-1, strat=True,
                 test_size=0.3, regress=False, return_test=True,
                 scoring=None, class_names=None, save=True, cat_feat=None):
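For context, a minimal sketch (synthetic data, illustrative names only) of how the generator returned by _group_cv plugs straight into GridSearchCV: the split() generator from GroupKFold or StratifiedGroupKFold is accepted directly as the cv argument, which is why the manual zipping of fold indices above is no longer needed. StratifiedGroupKFold needs scikit-learn >= 1.0.

    import numpy as np
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.model_selection import GridSearchCV, StratifiedGroupKFold

    rng = np.random.default_rng(0)
    X = rng.normal(size=(100, 5))            # 100 samples, 5 features
    y = rng.integers(0, 2, size=100)         # binary labels
    groups = rng.integers(0, 10, size=100)   # 10 groups (e.g. sites / plots)

    # split() yields (train_idx, test_idx) pairs; GridSearchCV consumes the
    # generator directly, so no manual index handling is required
    cv_iter = StratifiedGroupKFold(n_splits=5).split(X, y, groups)

    grid = GridSearchCV(ExtraTreesClassifier(random_state=0),
                        param_grid={'n_estimators': [100, 200]},
                        cv=cv_iter, scoring='accuracy')
    grid.fit(X, y)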
@@ -550,7 +563,10 @@ class names in order of their numercial equivalents
               # devices='0:1'),
               'lgbm': lgb.LGBMClassifier(random_state=0),

-              'hgb': HistGradientBoostingClassifier(random_state=0)}
+              'hgb': HistGradientBoostingClassifier(random_state=0),
+              'svm': SVC(),
+              'nusvc': NuSVC(),
+              'linsvc': LinearSVC()}

    regdict = {'rf': RandomForestRegressor(random_state=0),
               'erf': ExtraTreesRegressor(random_state=0),
@@ -563,16 +579,19 @@ class names in order of their numercial equivalents
               # task_type="GPU",
               # devices='0:1'),
               'lgbm': lgb.LGBMRegressor(random_state=0),
-
-              'hgb': HistGradientBoostingRegressor(random_state=0)}
+              'hgb': HistGradientBoostingRegressor(random_state=0),
+              'svm': SVR(),
+              'nusvc': NuSVR(),
+              'linsvc': LinearSVR()}

    if regress is True:
        model = regdict[clf]
        if scoring is None:
            scoring = 'r2'
    else:
        model = clfdict[clf]
-        cv = StratifiedKFold(cv)
+        if group is None:
+            cv = StratifiedKFold(cv)
        if scoring is None:
            scoring = 'accuracy'

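Worth noting with the new SVC/SVR options: unlike the tree ensembles, SVMs are sensitive to feature scale, so in practice they would normally be paired with one of the scalers already imported above. A sketch only - the step names and grid values below are illustrative, not defaults from this module:

    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC

    # scale first, then fit the SVM; without scaling, C/gamma tuning is unreliable
    svm_pipe = Pipeline([('scaler', StandardScaler()),
                         ('model', SVC())])

    param_grid = {'model__C': [0.1, 1, 10],
                  'model__gamma': ['scale', 0.01]}

    grid = GridSearchCV(svm_pipe, param_grid=param_grid, cv=5, scoring='accuracy')
    # grid.fit(X_train, y_train)  # with the training arrays prepared earlier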
@@ -600,25 +619,18 @@ class names in order of their numercial equivalents


    # this is not a good way to do this
-    if group is not None:
+    if regress is True:
+        strat = False  # failsafe: stratification only applies to classification

+    if group is not None:  # becoming a mess
+
        X_train, y_train, X_test, y_test, cv = _group_cv(X_train, y_train,
-                                                         group, test_size,
-                                                         cv)
-
+                                                         group, test_size,
+                                                         cv, strat=strat)
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X_train, y_train, test_size=test_size, random_state=0)
-
-    #
-    # if clf[0:4] == 'catb':
-    #     # Quick and quiet but can't enter the group cv indices or the sklearn
-    #     # pipe
-    #     ds = Pool(X_train, label=y_train)
-
-    #     # fails at end saying
-    #     model.grid_search(param_grid, ds, cv=k_kfold)
-    #CatBoostError: /src/catboost/catboost/private/libs/options/cross_validation_params.cpp:21: FoldCount is 0
+    # cv = StratifiedKFold(cv)


    if pipe == 'default':
@@ -650,10 +662,8 @@ class names in order of their numercial equivalents
    grid = GridSearchCV(sk_pipe, param_grid=sclr,
                        cv=cv, n_jobs=cores,
                        scoring=scoring, verbose=1)
-

-
-
+
    grid.fit(X_train, y_train)

    joblib.dump(grid.best_estimator_, outModel)
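Since the best estimator is persisted with joblib here, downstream prediction only needs a load call. A minimal usage sketch - outModel is the path passed in above, and X_new stands in for unseen data with the same feature columns and order used in training:

    import joblib

    model = joblib.load(outModel)   # the fitted pipeline saved above
    preds = model.predict(X_new)    # X_new: placeholder for new samples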
@@ -667,12 +677,17 @@ class names in order of their numercial equivalents
    else:
        crDf = hp.plot_classif_report(y_test, testresult, target_names=class_names,
                                      save=outModel[:-3]+'._classif_report.png')
+
+        confmat = metrics.confusion_matrix(y_test, testresult, labels=class_names)
+        disp = metrics.ConfusionMatrixDisplay(confusion_matrix=confmat,
+                                              display_labels=class_names)
+        disp.plot()

-        confmat = hp.plt_confmat(X_test, y_test, grid.best_estimator_,
-                                 class_names=class_names,
-                                 cmap=plt.cm.Blues,
-                                 fmt="%d",
-                                 save=outModel[:-3]+'_confmat.png')
+        # confmat = hp.plt_confmat(X_test, y_test, grid.best_estimator_,
+        #                          class_names=class_names,
+        #                          cmap=plt.cm.Blues,
+        #                          fmt="%d",
+        #                          save=outModel[:-3]+'_confmat.png')

    results = [grid, crDf, confmat]

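As an aside, the same plot can be produced in one step with ConfusionMatrixDisplay.from_predictions (scikit-learn >= 1.0), which takes (y_true, y_pred) directly and also makes it easy to restore the file output the old hp.plt_confmat call provided. A sketch only, reusing the variables above; the save path mirrors the existing convention:

    import matplotlib.pyplot as plt
    from sklearn import metrics

    # builds the matrix and the plot in one call; argument order is (y_true, y_pred)
    disp = metrics.ConfusionMatrixDisplay.from_predictions(
        y_test, testresult, display_labels=class_names, cmap=plt.cm.Blues)
    disp.figure_.savefig(outModel[:-3] + '_confmat.png')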
@@ -776,13 +791,26 @@ class names in order of their numercial equivalents
    # we only wish to predict really - but necessary
    # for sklearn model construct
    else:
-        clfdict = {'rf': RandomForestClassifier, 'erf': ExtraTreesClassifier,
-                   'gb': GradientBoostingClassifier, 'xgb': XGBClassifier,
-                   'logit': LogisticRegression, 'hgb': HistGradientBoostingClassifier}
-
-        regdict = {'rf': RandomForestRegressor, 'erf': ExtraTreesRegressor,
-                   'gb': GradientBoostingRegressor, 'xgb': XGBRegressor,
-                   'hgb': HistGradientBoostingRegressor}
+        clfdict = {'rf': RandomForestClassifier(random_state=0),
+                   'erf': ExtraTreesClassifier(random_state=0),
+                   'gb': GradientBoostingClassifier(random_state=0),
+                   'xgb': XGBClassifier(random_state=0),
+                   'logit': LogisticRegression(),
+                   'lgbm': lgb.LGBMClassifier(random_state=0),
+                   'hgb': HistGradientBoostingClassifier(random_state=0),
+                   'svm': SVC(),
+                   'nusvc': NuSVC(),
+                   'linsvc': LinearSVC()}
+
+        regdict = {'rf': RandomForestRegressor(random_state=0),
+                   'erf': ExtraTreesRegressor(random_state=0),
+                   'gb': GradientBoostingRegressor(random_state=0),
+                   'xgb': XGBRegressor(random_state=0),
+                   'lgbm': lgb.LGBMRegressor(random_state=0),
+                   'hgb': HistGradientBoostingRegressor(random_state=0),
+                   'svm': SVR(),
+                   'nusvc': NuSVR(),
+                   'linsvc': LinearSVR()}

    if mtype == 'regress':
        # won't accept the dict even with the ** to unpack it
@@ -840,23 +868,18 @@ def regression_results(y_true, y_pred):
    print('r2: ', round(r2, 4))
    print('MAE: ', round(mean_absolute_error, 4))
    print('MSE: ', round(mse, 4))
+    print('MedianAE: ', round(median_absolute_error, 4))
    print('RMSE: ', round(np.sqrt(mse), 4))
-    #TODO add when sklearn updated
-    # display = metrics.PredictionErrorDisplay.from_predictions(
-    #     y_true=y,
-    #     y_pred=y_pred,
-    #     kind="actual_vs_predicted",
-    #     ax=ax,
-    #     scatter_kwargs={"alpha": 0.2, "color": "tab:blue"},
-    #     line_kwargs={"color": "tab:red"},
-    # )
-    # print(grid.best_params_)
-    # print(grid.best_estimator_)
-    # print(grid.oob_score_)
-
-    # plt.plot(est_range, grid_mean_scores)
-    # plt.xlabel('no of estimators')
-    # plt.ylabel('Cross validated accuracy')
+    # needs scikit-learn >= 1.2 for PredictionErrorDisplay
+    display = metrics.PredictionErrorDisplay.from_predictions(
+        y_true=y_true,
+        y_pred=y_pred,
+        kind="actual_vs_predicted",
+        # ax=ax,
+        scatter_kwargs={"alpha": 0.2, "color": "tab:blue"},
+        line_kwargs={"color": "tab:red"},
+    )
+


def RF_oob_opt(model, X_train, min_est, max_est, step, group=None,
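A self-contained sketch of the newly enabled PredictionErrorDisplay call, useful for checking it in isolation (synthetic arrays; PredictionErrorDisplay requires scikit-learn >= 1.2):

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import metrics

    rng = np.random.default_rng(0)
    y_true = rng.normal(size=200)
    y_pred = y_true + rng.normal(scale=0.3, size=200)   # noisy "predictions"

    # actual-vs-predicted scatter with a reference diagonal
    metrics.PredictionErrorDisplay.from_predictions(
        y_true=y_true,
        y_pred=y_pred,
        kind="actual_vs_predicted",
        scatter_kwargs={"alpha": 0.2, "color": "tab:blue"},
        line_kwargs={"color": "tab:red"},
    )
    plt.show()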
@@ -1103,6 +1126,11 @@ def plot_feat_importance_permutation(modelPth, featureNames, X_test, y_test,
    featureNames : list of strings
                   a list of feature names

+    Returns
+    -------
+
+    pandas DataFrame of permutation importances
+
    """

    if modelPth is not str:
0 commit comments