How can I change my code so that it passes the doctests?

31 views Asked by At

I implemented a PCA class like this:

import numpy as np
import matplotlib.pyplot as plt
class PCA:
    """Principal component analysis via eigendecomposition of the covariance matrix.

    Attributes filled in by ``fit``:
        mean            per-column mean of the data passed to ``fit``
        eigenvalues     eigenvalues of the covariance matrix, largest first
        eigenvectors    matching eigenvectors, stored column-wise
        components      the first ``n_components`` eigenvector columns
        variance_ratio  each eigenvalue divided by the sum of all eigenvalues
        cumulative_var  running sum of ``variance_ratio``
    """

    def __init__(self, n_components):
        """Store the number of components to keep; nothing is computed yet."""
        print('****** Created from-scratch PCA object *****')
        self.n_components = n_components
        # Placeholders until fit() is called.
        for fitted_attr in (
            'components',
            'mean',
            'eigenvectors',
            'eigenvalues',
            'variance_ratio',
            'cumulative_var',
        ):
            setattr(self, fitted_attr, None)

    def fit(self, X):
        """Learn the principal axes of ``X`` (rows are samples, columns are features)."""
        self.mean = np.mean(X, axis=0)

        # np.cov treats each row as one variable, hence the transpose.
        cov_matrix = np.cov((X - self.mean).T)

        # NOTE(review): np.linalg.eig is the general-purpose solver; eigh is the
        # symmetric-matrix variant, but switching could change eigenvector signs
        # and therefore the projected values - confirm before changing.
        eig_vals, eig_vecs = np.linalg.eig(cov_matrix)

        # Put the direction with the largest eigenvalue first.
        order = np.argsort(eig_vals)[::-1]
        self.eigenvalues = eig_vals[order]
        self.eigenvectors = eig_vecs[:, order]
        self.components = self.eigenvectors[:, :self.n_components]

        total_variance = np.sum(self.eigenvalues)
        self.variance_ratio = self.eigenvalues / total_variance
        self.cumulative_var = np.cumsum(self.variance_ratio)

    def transform(self, X):
        """Subtract the fitted mean from ``X`` and project onto the kept components."""
        return np.dot(X - self.mean, self.components)

And this is my code utilizing the class, but it won't pass the doctests:

#Step 1.  import libs and dataset
from sklearn.datasets import load_breast_cancer
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Load the breast-cancer dataset that ships with scikit-learn.
breast_cancer = load_breast_cancer()

# Quick summary: feature names, feature count, raw labels, label names,
# and how many samples fall into each class.
for summary in (
    breast_cancer.feature_names,
    len(breast_cancer.feature_names),
    breast_cancer.target,
    breast_cancer.target_names,
    np.array(np.unique(breast_cancer.target, return_counts=True)),
):
    print(summary)

# One table holding every feature column plus the label column 'diagnosis'.
df = pd.DataFrame(
    breast_cancer.data,
    columns=breast_cancer.feature_names,
).assign(diagnosis=breast_cancer.target)
df.head()

# Feature matrix and label vector used by every experiment below.
X = df.drop(columns='diagnosis')
y = df['diagnosis']


# Single seed shared by every train/test split in this script.
random_state = 12

# Baseline 1: logistic regression on the raw, unscaled features.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=random_state,
    shuffle=True,
)
log_reg = LogisticRegression(max_iter=5000).fit(X_train, y_train)
unscaled_score = log_reg.score(X_test, y_test)
print('Logistic regression model score on original UNSCALED dataset (all features):', unscaled_score)


# Baseline 2: same model after standardising every feature column.
sc = StandardScaler()
X_scaled = sc.fit_transform(X)
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.3,
    random_state=random_state,
    shuffle=True,
)
log_reg_scaled = LogisticRegression(max_iter=5000).fit(X_train_scaled, y_train)
scaled_score = log_reg_scaled.score(X_test_scaled, y_test)
print('Logistic regression model score on original SCALED dataset (all features):', scaled_score)


# Fit the from-scratch PCA keeping as many components as there are feature
# columns, then project the scaled data onto them.
components = X.shape[1]
pca_all = PCA(n_components = components)
pca_all.fit(X_scaled)
X_pca_all = pca_all.transform(X_scaled)

# cumulative_var holds fractions; multiplying by 100 prints percentages.
print('Principal components:', pca_all.components)
print('Variance ratios:', pca_all.variance_ratio)
print('Cumulative Variances:', pca_all.cumulative_var*100)

# Scree plot: one bar per component (its variance ratio) with the cumulative
# variance drawn as a line on the same axes.
print('Number of components in scree plot:', pca_all.n_components)
# 1-based component numbers used as the bar labels.
PC_components = np.arange(1, pca_all.components.shape[1]+1)
sns.set(style='whitegrid', font_scale=1.2)
plt.subplots(figsize=(20, 7))
sns.barplot(x=PC_components, y=pca_all.variance_ratio, color='b')
# NOTE(review): the line uses 0-based x positions, presumably so the markers
# line up with seaborn's categorical bar positions - confirm.
sns.lineplot(x=PC_components-1, y=pca_all.cumulative_var, color='black', linestyle='-', linewidth=2, marker='o', markersize=8)
plt.title('Scree Plot')
plt.xlabel('N-th Principal Component')
plt.ylabel('Variance Explained')
plt.ylim(0, 1)
plt.show()

# Train and score a logistic regression on the full projection, using the
# same split parameters as the earlier experiments.
X_train_pca_all, X_test_pca_all, y_train_pca_all, y_test_pca_all = train_test_split(X_pca_all, y, test_size=0.3, random_state=random_state, shuffle=True)
log_reg_pca_all = LogisticRegression(max_iter=5000)
log_reg_pca_all.fit(X_train_pca_all, y_train_pca_all)
print('Logistic regression model score on transformed dataset (all PCs):', log_reg_pca_all.score(X_test_pca_all, y_test_pca_all))

# Repeat the experiment keeping only the first 4 principal components.
pca_4 = PCA(n_components=4)
pca_4.fit(X_scaled)
X_pca_4 = pca_4.transform(X_scaled)

# Split the 4-component projection itself (not X_pca_all); otherwise this
# model is trained and scored on every component and simply reproduces the
# "all PCs" score. Reusing random_state keeps the row split identical to the
# earlier experiments.
X_train_pca_4, X_test_pca_4, y_train_pca_4, y_test_pca_4 = train_test_split(
    X_pca_4, y, test_size=0.3, random_state=random_state, shuffle=True
)
log_reg_pca_4 = LogisticRegression(max_iter=5000)
log_reg_pca_4.fit(X_train_pca_4, y_train_pca_4)

print('Logistic regression model score on transformed dataset (keep 4 PCs):', log_reg_pca_4.score(X_test_pca_4, y_test_pca_4))




#Run the doctest module.  DO NOT modify any code below this line!
# The examples in the string below pin what the script above must produce:
# the column count of each projection, one spot-checked entry of each, and
# the test-set accuracy of each logistic-regression model rounded to 3 places.
import doctest
"""
  >>> print(X_pca_all.shape[1])
  30
  >>> print(np.round(X_pca_all[0][17], 3))
  0.55
  >>> print(np.round(log_reg_pca_all.score(X_test_pca_all,y_test_pca_all), 3))
  0.971
  >>> print(X_pca_4.shape[1])
  4
  >>> print(np.round(X_pca_4[29][3], 3))
  1.911
  >>> print(np.round(log_reg_pca_4.score(X_test_pca_4,y_test_pca_4), 3))
  0.965
"""

# Runs the collected examples against the current module's globals and
# reports every example whose printed output differs from the expected text.
doctest.testmod()

When I run it, 4 of the 6 doctests pass. The only ones that fail are the two that check the score of the logistic regression model. What should I change in order to get the expected doctest results?

**********************************************************************
File "__main__", line 7, in __main__
Failed example:
    print(np.round(log_reg_pca_all.score(X_test_pca_all,y_test_pca_all), 3))
Expected:
    0.971
Got:
    0.982
**********************************************************************
File "__main__", line 13, in __main__
Failed example:
    print(np.round(log_reg_pca_4.score(X_test_pca_4,y_test_pca_4), 3))
Expected:
    0.965
Got:
    0.982
**********************************************************************
1 items had failures:
   2 of   6 in __main__
***Test Failed*** 2 failures.
TestResults(failed=2, attempted=6)
0

There are 0 answers