ML Projekt unter Anaconda in Jupiter Notebook (XGBoost, scikit-learn, pandas, numpy )

draude · Donnerstag 26. März 2020, 17:33

Versteht jemand den Ursprung des Fehlers bzw. was geändert werden müsste damit es funktioniert?

Ich nutze den multiclass Classifier XGBoost. Im y_all DataFrame stehen Werte wie 0,1,2 oder 3 (4 Klassen). Anhand der Daten von X_all soll der Classifier traniert werden und dann nutzbar sein um neue Daten in einer der 4 Klassen einzustufen.

Kriege folgenden Error:

Code: Alles auswählen

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted']

Genauer:

Code: Alles auswählen

[color=#FF0000]ValueError                                Traceback (most recent call last)
<ipython-input-8-4217ca86886e> in <module>
      4 #Boosting refers to this general problem of producing a very accurate prediction rule
      5 #by combining rough and moderately inaccurate rules-of-thumb
----> 6 train_predict(clf_xg, X_train, y_train, X_test, y_test)
      7 print('')

<ipython-input-7-45b04513073f> in train_predict(clf, X_train, y_train, X_test, y_test)
     46 
     47     # Print the results of prediction for both training and testing
---> 48     f1, acc = predict_labels(clf, X_train, y_train)
     49     print(f1, acc)
     50     print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))

<ipython-input-7-45b04513073f> in predict_labels(clf, features, target)
     33     print("Made predictions in {:.4f} seconds.".format(end - start))
     34     #
---> 35     return f1_score(target, y_pred, pos_label='H'), sum(target == y_pred) / float(len(y_pred))
     36 
     37 

~/anacon/anaconda3/envs/eddy_workspace/lib/python3.7/site-packages/sklearn/metrics/_classification.py in f1_score(y_true, y_pred, labels, pos_label, average, sample_weight, zero_division)
   1097                        pos_label=pos_label, average=average,
   1098                        sample_weight=sample_weight,
-> 1099                        zero_division=zero_division)
   1100 
   1101 

~/anacon/anaconda3/envs/eddy_workspace/lib/python3.7/site-packages/sklearn/metrics/_classification.py in fbeta_score(y_true, y_pred, beta, labels, pos_label, average, sample_weight, zero_division)
   1224                                                  warn_for=('f-score',),
   1225                                                  sample_weight=sample_weight,
-> 1226                                                  zero_division=zero_division)
   1227     return f
   1228 

~/anacon/anaconda3/envs/eddy_workspace/lib/python3.7/site-packages/sklearn/metrics/_classification.py in precision_recall_fscore_support(y_true, y_pred, beta, labels, pos_label, average, warn_for, sample_weight, zero_division)
   1482         raise ValueError("beta should be >=0 in the F-beta score")
   1483     labels = _check_set_wise_labels(y_true, y_pred, average, labels,
-> 1484                                     pos_label)
   1485 
   1486     # Calculate tp_sum, pred_sum, true_sum ###

~/anacon/anaconda3/envs/eddy_workspace/lib/python3.7/site-packages/sklearn/metrics/_classification.py in _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)
   1314             raise ValueError("Target is %s but average='binary'. Please "
   1315                              "choose another average setting, one of %r."
-> 1316                              % (y_type, average_options))
   1317     elif pos_label not in (None, 1):
   1318         warnings.warn("Note that pos_label (set to %r) is ignored when "
.[/color]

So sehen die Funktionen aus:

Code: Alles auswählen

def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    
    # Print the results
    print("Trained model in {:.4f} seconds".format(end - start))

    
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    
    end = time()
    # Print and return results
    print("Made predictions in {:.4f} seconds.".format(end - start))
    #
    return f1_score(target, y_pred, pos_label='H'), sum(target == y_pred) / float(len(y_pred))


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    print( "Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    f1, acc = predict_labels(clf, X_train, y_train)
    print(f1, acc)
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))
    
    f1, acc = predict_labels(clf, X_test, y_test)
    print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))

Damit splitte ich die Daten:

Code: Alles auswählen

from sklearn.model_selection import train_test_split

# Shuffle and split the dataset into training and testing set.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, 
                                                    test_size = 50,
                                                    random_state = 2,
                                                    stratify = y_all)

X_all und y_all sind pandas.DataFrames (744x8 und 744x1)

Wenn ich average='weighted' setze, ändert sich der Fehler wie folgt:

Code: Alles auswählen

ValueError: Unable to coerce to Series, length must be 1: given 694

694 ist die Anzahl an Zeilen von X_all_train.

Fehler genauer:

Code: Alles auswählen

ValueError                                Traceback (most recent call last)
<ipython-input-17-4217ca86886e> in <module>
      4 #Boosting refers to this general problem of producing a very accurate prediction rule
      5 #by combining rough and moderately inaccurate rules-of-thumb
----> 6 train_predict(clf_xg, X_train, y_train, X_test, y_test)
      7 print('')

<ipython-input-16-b50ecbd49bb5> in train_predict(clf, X_train, y_train, X_test, y_test)
     46 
     47     # Print the results of prediction for both training and testing
---> 48     f1, acc = predict_labels(clf, X_train, y_train)
     49     print(f1, acc)
     50     print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))

<ipython-input-16-b50ecbd49bb5> in predict_labels(clf, features, target)
     33     print("Made predictions in {:.4f} seconds.".format(end - start))
     34     #
---> 35     return f1_score(target, y_pred, pos_label='H',average='weighted'), sum(target == y_pred) / float(len(y_pred))
     36 
     37 

~/anacon/anaconda3/envs/eddy_workspace/lib/python3.7/site-packages/pandas/core/ops/__init__.py in f(self, other)
    825     def f(self, other):
    826 
--> 827         other = _align_method_FRAME(self, other, axis=None)
    828 
    829         if isinstance(other, ABCDataFrame):

~/anacon/anaconda3/envs/eddy_workspace/lib/python3.7/site-packages/pandas/core/ops/__init__.py in _align_method_FRAME(left, right, axis)
    645 
    646         if right.ndim == 1:
--> 647             right = to_series(right)
    648 
    649         elif right.ndim == 2:

~/anacon/anaconda3/envs/eddy_workspace/lib/python3.7/site-packages/pandas/core/ops/__init__.py in to_series(right)
    637             if len(left.columns) != len(right):
    638                 raise ValueError(
--> 639                     msg.format(req_len=len(left.columns), given_len=len(right))
    640                 )
    641             right = left._constructor_sliced(right, index=left.columns)

ThomasL · Donnerstag 26. März 2020, 17:45

Schon mal hier genau eingelesen?
https://scikit-learn.org/stable/modules ... score.html

draude · Donnerstag 26. März 2020, 19:34

Hallo, @ThomasL:

Ja. Wie gesagt, wenn ich z.B. average='weighted' oder 'macro'/'micro' setze und das pos_label dann weg lasse, kommt folgender Fehler:

Code: Alles auswählen

ValueError: Unable to coerce to Series, length must be 1: given 694

Genauer:

Code: Alles auswählen

ValueError                                Traceback (most recent call last)
<ipython-input-103-4217ca86886e> in <module>
      4 #Boosting refers to this general problem of producing a very accurate prediction rule
      5 #by combining rough and moderately inaccurate rules-of-thumb
----> 6 train_predict(clf_xg, X_train, y_train, X_test, y_test)
      7 print('')

<ipython-input-102-76246727c57f> in train_predict(clf, X_train, y_train, X_test, y_test)
     46 
     47     # Print the results of prediction for both training and testing
---> 48     f1, acc = predict_labels(clf, X_train, y_train)
     49     print(f1, acc)
     50     print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))

<ipython-input-102-76246727c57f> in predict_labels(clf, features, target)
     33     print("Made predictions in {:.4f} seconds.".format(end - start))
     34     #
---> 35     return f1_score(target, y_pred, average=None), sum(target == y_pred) / float(len(y_pred))
     36 
     37 

~/anacon/anaconda3/envs/eddy_workspace/lib/python3.7/site-packages/pandas/core/ops/__init__.py in f(self, other)
    825     def f(self, other):
    826 
--> 827         other = _align_method_FRAME(self, other, axis=None)
    828 
    829         if isinstance(other, ABCDataFrame):

~/anacon/anaconda3/envs/eddy_workspace/lib/python3.7/site-packages/pandas/core/ops/__init__.py in _align_method_FRAME(left, right, axis)
    645 
    646         if right.ndim == 1:
--> 647             right = to_series(right)
    648 
    649         elif right.ndim == 2:

~/anacon/anaconda3/envs/eddy_workspace/lib/python3.7/site-packages/pandas/core/ops/__init__.py in to_series(right)
    637             if len(left.columns) != len(right):
    638                 raise ValueError(
--> 639                     msg.format(req_len=len(left.columns), given_len=len(right))
    640                 )
    641             right = left._constructor_sliced(right, index=left.columns)

Sieht so aus als würde in den tieferen Schichten der lib mit den pandas scripts was nicht stimmen. Das kann doch aber garnicht sein, oder?