Getting a ValueError: Found input variables with inconsistent numbers of samples

45 views Asked by At
#function for the model building and prediction
def Model(model, X, y):
    """Train a text-classification pipeline and print/plot its evaluation.

    The data is split 75/25 with a fixed ``random_state`` of 30, ``model`` is
    fitted behind a ``CountVectorizer`` -> ``TfidfTransformer`` pipeline, and
    the ROC curves, precision-recall curves, confusion matrix, classification
    report and test/training accuracies are displayed.

    Args:
        model: Unfitted classifier used as the final pipeline step. It must
            provide ``predict_proba``.
        X: Input samples; presumably raw text documents, since they are fed
            straight into ``CountVectorizer`` (confirm against the caller).
        y: Target labels, one per sample in ``X``.

    Returns:
        None. All results are printed or plotted.

    Raises:
        ValueError: If the held-out split contains a label that the fitted
            model never saw, so no probability column exists for it.
    """
    import numpy as np  # local import: only needed for the class alignment below

    #training and testing the data
    print(X.shape)
    print(y.shape)
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=30)
    print(x_train.shape)
    print(y_train.shape)
    print(x_test.shape)
    print(y_test.shape)
    # model building using CountVectorizer and TfidfTransformer
    pipeline_model = Pipeline([('vect', CountVectorizer()),
                              ('tfidf', TfidfTransformer()),
                              ('clf', model)])
    pipeline_model.fit(x_train, y_train)

    y_pred = pipeline_model.predict(x_test)
    y_probas = pipeline_model.predict_proba(x_test)
    print(y_test.shape)
    print(y_probas.shape)

    # scikit-plot derives its class list from np.unique(y_test), whereas
    # predict_proba returns one column per class seen during fit. When some
    # classes are missing from the test split (e.g. 40 of 42), the columns no
    # longer line up and the micro-average fails with "inconsistent numbers of
    # samples". Keep only the columns whose class occurs in y_test.
    test_classes = np.unique(y_test)
    unseen = np.setdiff1d(test_classes, pipeline_model.classes_)
    if unseen.size:
        raise ValueError(
            "y_test contains labels the model was not trained on: %r" % (unseen.tolist(),)
        )
    present = np.isin(pipeline_model.classes_, test_classes)
    y_probas_present = y_probas[:, present]

    skplt.metrics.plot_roc(y_test, y_probas_present, figsize=(12, 8), title_fontsize=12, text_fontsize=16)
    plt.show()
    skplt.metrics.plot_precision_recall(y_test, y_probas_present, figsize=(12, 8), title_fontsize=12, text_fontsize=16)
    plt.show()
    print("Confusion Matrix:\n",confusion_matrix(y_test,y_pred))
    print("Classification Report is:\n",classification_report(y_test, y_pred))
    print('Accuracy:', pipeline_model.score(x_test, y_test)*100)
    print("Training Score:\n",pipeline_model.score(x_train,y_train)*100)

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Allow up to 500 solver iterations, then run the shared
# train/evaluate routine on the full data set.
model = LogisticRegression(max_iter=500)
Model(model=model, X=X, y=y)
  • I tried changing the hyperparameters such as max_iter from 100->500
  • Checked by printing x_train,x_test,y_train,y_test and all of them have the same number of samples
  • print(y_test.shape) -> gave (3143, ) , print(y_probas.shape) -> gave (3143,42)

Received the Output:

ValueError: Found input variables with inconsistent numbers of samples: [125720, 132006]

Error points to this line :

skplt.metrics.plot_roc(y_test,y_probas,figsize=(12,8),title_fontsize=12,text_fontsize=16)
1

There is 1 answer

1
PV8 On

You are passing the wrong inputs to your ROC-curve function. According to the docs:

 import matplotlib.pyplot as plt
 from sklearn import datasets, metrics, model_selection, svm
 X, y = datasets.make_classification(random_state=0)
 X_train, X_test, y_train, y_test = model_selection.train_test_split( X, y, random_state=0)
clf = svm.SVC(random_state=0)
 clf.fit(X_train, y_train)
SVC(random_state=0)
 metrics.plot_roc_curve(clf, X_test, y_test) 
 plt.show()

You have to change it to:

import numpy as np

# skplt.metrics.plot_roc takes (y_true, y_probas), not an estimator.
# It builds its class list from np.unique(y_test), so drop the probability
# columns of classes that do not occur in y_test (here 42 trained classes vs.
# 40 present in the test split) to keep both inputs the same width.
present = np.isin(pipeline_model.classes_, np.unique(y_test))
y_probas = pipeline_model.predict_proba(x_test)[:, present]
skplt.metrics.plot_roc(y_test, y_probas)
plt.show()