Classification
Regression
Fine-tuning your model
Preprocessing and pipelines
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
Naming Conventions
In practice
In Python
The Iris dataset
# Load the Iris dataset and look at what the returned object holds
iris = datasets.load_iris()
type(iris)
iris.keys()
iris.data[:6, :]
iris.target
iris.feature_names
iris.target_names
iris.DESCR

# Feature matrix and target vector
X, y = iris.data, iris.target

# Wrap the features in a DataFrame labelled with the feature names
df = pd.DataFrame(X, columns=iris.feature_names)
df.head()

# Pairwise scatter plots of the features, coloured by target value
_ = pd.plotting.scatter_matrix(df, c=y, figsize=[14, 14], s=100, marker='D')
plt.show()
%run data.py
# Build the votes DataFrame. vote_clean_list and vote_clean_columns are not
# defined in this section; presumably the `%run data.py` line provides them
# (TODO confirm).
vote_clean = pd.DataFrame(columns=vote_clean_columns, data=vote_clean_list)

# First rows, then column dtypes and summary statistics
vote_clean.head()
vote_clean.head()
vote_clean.info()
vote_clean.describe()
Except for party, all of the columns are of type int64.
The target column is .party.
Previously we used .scatter_matrix(), but here we have all binary data, so a countplot() (from seaborn) is more appropriate.
countplot() for a few variables:
plt.figure(figsize=(18,10))
# 'education' votes: bar counts per value, split by party; ticks 0/1 are
# relabelled as No/Yes
sns.countplot(data=vote_clean, x='education', hue='party', palette='RdBu')
plt.xticks([0, 1], ['No', 'Yes'])
plt.show()

# Same plot for 'satellite', on a fresh figure
plt.figure(figsize=(18, 10))
sns.countplot(data=vote_clean, x='satellite', hue='party', palette='RdBu')
plt.xticks([0, 1], ['No', 'Yes'])
plt.show()

# Same plot for 'missile', on a fresh figure
plt.figure(figsize=(18, 10))
sns.countplot(data=vote_clean, x='missile', hue='party', palette='RdBu')
plt.xticks([0, 1], ['No', 'Yes'])
plt.show()
.fit() method
.predict() method
from sklearn.neighbors import KNeighborsClassifier
# k-NN classifier with 6 neighbours, fitted on the complete Iris data
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(iris.data, iris.target)
iris.data.shape
iris.target.shape

# Four existing samples, each shifted by random noise, serve as "new" points
X_new = iris.data[[0, 10, 100, 149], :] + np.random.random((4, 4))
X_new
prediction = knn.predict(X_new)
X_new.shape
print('Prediction {}'.format(prediction))
from sklearn.neighbors import KNeighborsClassifier

# Target is the party column; features are every remaining column
y = vote_clean.party.values
X = vote_clean.drop('party', axis=1).values

# 6-neighbour k-NN classifier trained on all rows
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X, y)
Having fit a k-NN classifier, you can now use it to predict the label of a new data point.
In the next video we will cover splitting our data to fix this issue.
from sklearn.neighbors import KNeighborsClassifier

# Target is the party column; features are every remaining column
y = vote_clean.party.values
X = vote_clean.drop('party', axis=1).values

# knn: 6-neighbour classifier fitted on the whole data set
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X, y)

# Labels predicted for the very rows the model was trained on
y_pred = knn.predict(X)
print(y_pred[0:10])

# A single unseen sample with 16 feature values
X_new = np.array([[
    0.72251225, 0.61952487, 0.25539929, 0.34978056,
    0.04279606, 0.64407096, 0.74224677, 0.26632645,
    0.90179262, 0.73532079, 0.96163873, 0.98119655,
    0.79379732, 0.29022046, 0.44359749, 0.95018481,
]])
new_prediction = knn.predict(X_new)
print("Prediction: {}".format(new_prediction))
test_size is the proportion of data that will be in the test set.
random_state sets a seed for the random number generator so you can reproduce your results later.
stratify ensures that our data is split so it has similar ratios of target labels in both test and train as in the original data.
from sklearn.model_selection import train_test_split
# Features and labels of the Iris data
X = iris.data
y = iris.target

# Hold out 30% of the samples, stratified on the labels, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21, stratify=y)

# Fit on the training part only, then predict and score on the held-out part
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("Test set predictions:\n{}".format(y_pred))
knn.score(X_test, y_test)
from sklearn import datasets
import matplotlib.pyplot as plt

# digits: the handwritten-digits dataset
digits = datasets.load_digits()

# What the dataset object contains, plus its description
print(digits.keys())
print(digits.DESCR)

# Shapes of the image array and of the data array
print(digits.images.shape)
print(digits.data.shape)

# Show the image stored at index 1010
plt.imshow(digits.images[1010], cmap=plt.cm.gray_r, interpolation='nearest')
plt.show()

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Feature and target arrays
X, y = digits.data, digits.target

# 80/20 stratified split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# knn: 7-neighbour classifier fitted on the training split
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

# Accuracy on the held-out split
print(knn.score(X_test, y_test))
# Candidate values of k and one accuracy slot per candidate
neighbors = np.arange(1, 15)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

# Fit one k-NN model per k and record its accuracy on both splits.
# The loop body is indented here; in the flattened original every line sat
# at column 0, which left the `for` statement without a body.
for i, k in enumerate(neighbors):
    # Setup a k-NN Classifier with k neighbors: knn
    knn = KNeighborsClassifier(n_neighbors=k)

    # Fit the classifier to the training data
    knn.fit(X_train, y_train)

    # Accuracy on the data the model was trained on
    train_accuracy[i] = knn.score(X_train, y_train)

    # Accuracy on the held-out data
    test_accuracy[i] = knn.score(X_test, y_test)

# Plot both accuracy curves against k
plt.figure(figsize=(16, 10))
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(neighbors, test_accuracy, label='Testing Accuracy')
plt.plot(neighbors, train_accuracy, label='Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()
# Read the Boston housing CSV
file = 'https://assets.datacamp.com/production/course_1939/datasets/boston.csv'
boston = pd.read_csv(file)
boston.head()

# MEDV is the target; all other columns are features
X = boston.drop('MEDV', axis=1).values
y = boston['MEDV'].values

# Feature column at index 5 (plotted below as "Number of rooms")
X_rooms = X[:, 5]
type(X_rooms), type(y)

# Turn both 1-D arrays into single-column 2-D arrays
y = y.reshape(-1, 1)
X_rooms = X_rooms.reshape(-1, 1)

# Scatter of target against the rooms feature
plt.figure(figsize=(16, 10))
plt.plot(
    X_rooms, y,
    marker='o', markersize=6,
    markeredgewidth=1, markeredgecolor='k',
    linestyle='none')
plt.ylabel('Value of house / 1000 ($)')
plt.xlabel('Number of rooms')
plt.show()

# Fit a linear regression on the single feature
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit(X_rooms, y)

# Evenly spaced inputs from the smallest to the largest observed value
prediction_space = np.linspace(min(X_rooms), max(X_rooms)).reshape(-1, 1)

# Same scatter again, now with the fitted line drawn on top
plt.figure(figsize=(16, 10))
plt.plot(
    X_rooms, y,
    marker='o', markersize=6,
    markeredgewidth=1, markeredgecolor='k',
    linestyle='none')
plt.plot(
    prediction_space, reg.predict(prediction_space),
    color='black', linewidth=3)
plt.ylabel('Value of house / 1000 ($)')
plt.xlabel('Number of rooms')
plt.show()
# Read the Gapminder CSV
file = 'https://assets.datacamp.com/production/course_1939/datasets/gm_2008_region.csv'
gapminder = pd.read_csv(file)
gapminder.head()

# Create arrays for features and target variable.
# `.values` yields NumPy arrays: the reshape calls below need ndarray.reshape,
# and calling .reshape directly on a pandas Series is not supported by
# current pandas.
y = gapminder.life.values
X = gapminder.fertility.values

# Print the dimensions of X and y before reshaping
print("Dimensions of y before reshaping: {}".format(y.shape))
print("Dimensions of X before reshaping: {}".format(X.shape))

# Reshape X and y into single-column 2-D arrays
y = y.reshape(-1, 1)
X = X.reshape(-1, 1)

# Print the dimensions of X and y after reshaping
print("Dimensions of y after reshaping: {}".format(y.shape))
print("Dimensions of X after reshaping: {}".format(X.shape))
# Heatmap of the pairwise column correlations of the Gapminder data
plt.figure(figsize=(14, 10))
sns.heatmap(gapminder.corr(), square=True, cmap="RdYlGn", linewidths=1)
plt.show()

# Column dtypes / non-null counts and summary statistics
gapminder.info()
gapminder.describe()
When you call .fit() on a linear regression model in scikit-learn, it performs this OLS under the hood.
from sklearn.model_selection import train_test_split
# 70/30 split of the current X and y with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Fit on the training part, predict and score on the held-out part
reg_all = linear_model.LinearRegression()
reg_all.fit(X_train, y_train)
y_pred = reg_all.predict(X_test)
reg_all.score(X_test, y_test)
gapminder.head()

# Single feature (fertility) as a column array; target is life
X_fertility = gapminder.fertility.values.reshape(-1, 1)
y = gapminder.life.values

from sklearn.linear_model import LinearRegression

# reg: the regressor, fitted on the single feature
reg = LinearRegression()
reg.fit(X_fertility, y)

# Evenly spaced inputs spanning the observed fertility range
prediction_space = np.linspace(
    min(X_fertility), max(X_fertility)).reshape(-1, 1)

# y_pred: model output over the prediction space
y_pred = reg.predict(prediction_space)

# Score of the fit on the data it was trained on
print(reg.score(X_fertility, y))

# Scatter of the observations with the fitted line on top
plt.figure(figsize=(16, 10))
plt.plot(
    X_fertility, y,
    marker='o', markersize=6, color='lightblue',
    markeredgewidth=1, markeredgecolor='k',
    linestyle='none')
plt.plot(prediction_space, y_pred, color='black', linewidth=3)
plt.ylabel('Life Expectancy')
plt.xlabel('Fertility')
plt.show()
gapminder.head()

# Every column except life and Region is a feature; life is the target
X = gapminder.drop(['life', 'Region'], axis=1).values
y = gapminder.life.values

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# reg_all: regressor fitted on the training split
reg_all = LinearRegression()
reg_all.fit(X_train, y_train)

# y_pred: predictions for the held-out split
y_pred = reg_all.predict(X_test)

# Report the score and the root of the mean squared error on the test split
print("R^2: {}".format(reg_all.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))
# Boston data again: MEDV is the target, the rest are features
X = boston.drop('MEDV', axis=1).values
y = boston['MEDV'].values

from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a plain linear regression
reg = linear_model.LinearRegression()
cv_results = cross_val_score(reg, X, y, cv=5)
print(cv_results)
np.mean(cv_results)
Cross-validation is a vital step in evaluating a model.
By default, scikit-learn's cross_val_score() function uses $R^2$ as the metric of choice for regression.
# Import the necessary modules
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
X = gapminder.drop(['life','Region'], axis = 1).values
y = gapminder.life.values
# Create a linear regression object: reg
reg = LinearRegression()
# Compute 5-fold cross-validation scores: cv_scores
cv_scores = cross_val_score(reg, X, y, cv = 5)
# Print the 5-fold cross-validation scores
print(cv_scores)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores)))
# Perform 3-fold CV
cvscores_3 = cross_val_score(reg, X, y, cv = 3)
print(np.mean(cvscores_3))
# Perform 10-fold CV
cvscores_10 = cross_val_score(reg, X, y, cv = 10)
print(np.mean(cvscores_10))
%timeit cross_val_score(reg, X, y, cv = 3)
%timeit cross_val_score(reg, X, y, cv = 10)
Alpha (also called lambda in the wild) controls model complexity
My words
# --- Ridge regression on the Boston data ---
from sklearn.linear_model import Ridge

X = boston.drop('MEDV', axis=1).values
y = boston['MEDV'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

ridge = Ridge(alpha=0.1, normalize=True)
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
ridge.score(X_test, y_test)

# --- Lasso regression on the same data and the same split settings ---
from sklearn.linear_model import Lasso

X = boston.drop('MEDV', axis=1).values
y = boston['MEDV'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

lasso = Lasso(alpha=0.1, normalize=True)
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
lasso.score(X_test, y_test)

# --- Lasso coefficients per feature, fitted on the full data ---
names = boston.drop('MEDV', axis=1).columns
lasso = Lasso(alpha=0.1)
lasso.fit(X, y)
lasso_coef = lasso.coef_

# One x position per feature, labelled with the feature name
_ = plt.plot(range(len(names)), lasso_coef)
_ = plt.xticks(range(len(names)), names, rotation=60)
_ = plt.ylabel('Coefficients')
plt.show()
# Import Lasso
from sklearn.linear_model import Lasso

# Feature names (for the x tick labels), feature matrix and target
columns = gapminder.drop(['life', 'Region'], axis=1).columns
X = gapminder.drop(['life', 'Region'], axis=1).values
y = gapminder.life.values

plt.figure(figsize=(18, 10))

# Fit one lasso model per alpha in 0.0, 0.1, ..., 0.4 and draw its
# coefficients as one line. The loop body is indented here; in the flattened
# original every line sat at column 0, which left the `for` without a body.
# NOTE(review): the first iteration uses alpha = 0 — confirm that is intended.
for i in range(0, 5):
    # Instantiate a lasso regressor: lasso
    lasso = Lasso(alpha=i * 0.1, normalize=True)

    # Fit the regressor to the data
    lasso.fit(X, y)

    # Coefficients of the fitted model
    lasso_coef = lasso.coef_

    # One line per alpha, labelled with the rounded alpha value
    plt.plot(range(len(columns)), lasso_coef, label=str(round(i * 0.1, 1)))

# Axis decoration and legend are applied once, after all lines are drawn
plt.xticks(range(len(columns)), columns.values, rotation=60)
plt.margins(0.02)
plt.legend()
plt.show()
def display_plot(cv_scores, cv_scores_std, n_folds=10):
    """Plot cross-validation scores against alpha with a standard-error band.

    The x values come from the module-level ``alpha_space``, which must be
    defined before this function is called.

    Parameters
    ----------
    cv_scores : sequence of float
        One score per entry of ``alpha_space``.
    cv_scores_std : sequence of float
        One standard deviation per entry of ``alpha_space``.
    n_folds : int, optional
        Divisor under the square root when turning the standard deviation
        into a standard error. Defaults to 10, the value that was previously
        hard-coded.
    """
    # Convert up front so plain Python lists work in the arithmetic below
    cv_scores = np.asarray(cv_scores)
    std_error = np.asarray(cv_scores_std) / np.sqrt(n_folds)

    fig = plt.figure(figsize=(18, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alpha_space, cv_scores, marker='o')

    # Shaded band of one standard error above and below the scores
    ax.fill_between(alpha_space, cv_scores + std_error,
                    cv_scores - std_error, alpha=0.2)
    ax.set_ylabel('CV Score +/- Std Error')
    ax.set_xlabel('Alpha')

    # Dashed horizontal line at the highest score
    ax.axhline(np.max(cv_scores), linestyle='--', color='.5')
    ax.set_xlim([alpha_space[0], alpha_space[-1]])
    ax.set_xscale('log')
    plt.show()
# Every column except life and Region is a feature; life is the target
X = gapminder.drop(['life', 'Region'], axis=1).values
y = gapminder.life.values

# Import necessary modules
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# 50 alphas, log-spaced from 1e-4 to 1, and lists to collect the results
alpha_space = np.logspace(-4, 0, 50)
ridge_scores = []
ridge_scores_std = []

# Create a ridge regressor: ridge
ridge = Ridge(normalize=True)

# Cross-validate the same estimator once per alpha. The loop body is indented
# here; in the flattened original every line sat at column 0, which left the
# `for` statement without a body.
for alpha in alpha_space:
    # Specify the alpha value to use: ridge.alpha
    ridge.alpha = alpha

    # Perform 10-fold CV: ridge_cv_scores
    ridge_cv_scores = cross_val_score(ridge, X, y, cv=10)

    # Keep the mean and the standard deviation of the fold scores
    ridge_scores.append(np.mean(ridge_cv_scores))
    ridge_scores_std.append(np.std(ridge_cv_scores))

# Display the plot once all alphas have been evaluated
display_plot(ridge_scores, ridge_scores_std)
Precision:
$$ \frac{TP}{TP + FP} $$
Recall:
$$ \frac{TP}{TP + FN} $$
F1 score:
$$ 2 * \frac{precision * recall}{precision + recall} $$
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Iris features and labels
X, y = iris.data, iris.target

# 60/40 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42)

# 8-neighbour k-NN trained on the training split
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Confusion matrix and per-class report for the held-out split
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Read the diabetes CSV
file = 'https://assets.datacamp.com/production/course_1939/datasets/diabetes.csv'
diabetes = pd.read_csv(file)
diabetes.head()

from sklearn.metrics import classification_report, confusion_matrix

# The diabetes column is the target; all other columns are features
X = diabetes.drop(['diabetes'], axis=1).values
y = diabetes.diabetes.values

# 60/40 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42)

# knn: 6-neighbour classifier fitted on the training split
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train, y_train)

# y_pred: predicted labels for the held-out split
y_pred = knn.predict(X_test)

# Confusion matrix and per-class report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Target is the party column; features are every remaining column
y = vote_clean.party.values
X = vote_clean.drop('party', axis=1).values

# 60/40 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42)

# Logistic regression trained on the training split
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
y_pred[:6]
Notes:
from sklearn.metrics import roc_curve

y_test[0:6]

# Predicted probability from column 1 of predict_proba for every test sample.
# This is computed BEFORE it is displayed; the original order referenced
# y_pred_prob two statements ahead of its definition.
# NOTE(review): column 1 is assumed to be the 'republican' class — confirm
# against logreg.classes_.
y_pred_prob = logreg.predict_proba(X_test)[:, 1]
y_pred_prob[:6]

# Boolean view of the true labels: True where the label is 'republican'
y_test_bool = y_test == 'republican'
y_test_bool[:6]

# False/true positive rates over all probability thresholds
fpr, tpr, thresholds = roc_curve(y_test_bool, y_pred_prob)

plt.figure(figsize=(18, 10))
plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal as a visual reference
plt.plot(fpr, tpr, label='logistic Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Logistic Regression ROC Curve')  # spelling of the title corrected
plt.legend()  # the curve's label is only shown when a legend is drawn
plt.show()
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# The diabetes column is the target; all other columns are features
X = diabetes.drop(['diabetes'], axis=1).values
y = diabetes.diabetes.values

# 60/40 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42)

# logreg: classifier fitted on the training split
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# y_pred: predicted labels for the held-out split
y_pred = logreg.predict(X_test)

# Confusion matrix and per-class report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

from sklearn.metrics import roc_curve

# y_pred_prob: column 1 of the predicted class probabilities
y_pred_prob = logreg.predict_proba(X_test)[:, 1]

# fpr, tpr, thresholds: the points of the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# ROC curve with a dashed diagonal for reference
plt.figure(figsize=(18, 10))
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
from sklearn.metrics import roc_auc_score

# Boolean target: True where the party label equals 'republican'
y = vote_clean.party.values == 'republican'
X = vote_clean.drop('party', axis=1).values

# 60/40 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42)

# Fit, then score the predicted probabilities (column 1) with AUC
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_prob = logreg.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_pred_prob)

# The same metric, cross-validated over 5 folds of the full data
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(logreg, X, y, scoring='roc_auc', cv=5)
print(cv_scores)
# Import necessary modules
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

# The diabetes column is the target; all other columns are features
X = diabetes.drop(['diabetes'], axis=1).values
y = diabetes.diabetes.values

# Re-split and re-fit on the diabetes data. Without this step the statements
# below would reuse logreg, X_test and y_test from whichever data set was
# processed last, so the printed AUC would not describe the diabetes data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Compute predicted probabilities: y_pred_prob (column 1 of predict_proba)
y_pred_prob = logreg.predict_proba(X_test)[:, 1]

# Compute and print AUC score on the held-out split
print("AUC: {}".format(roc_auc_score(y_test, y_pred_prob)))

# Compute cross-validated AUC scores on the full data: cv_auc
cv_auc = cross_val_score(logreg, X, y, cv=5, scoring='roc_auc')

# Print list of AUC scores
print("AUC scores computed using 5-fold cross-validation: {}".format(cv_auc))
from sklearn.model_selection import GridSearchCV

# Target is the party column; features are every remaining column
y = vote_clean.party.values
X = vote_clean.drop('party', axis=1).values

# Try every n_neighbors from 1 to 49 with 5-fold cross-validation
param_grid = {'n_neighbors': np.arange(1, 50)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid, cv=5)
knn_cv.fit(X, y)

# Best parameter setting and its cross-validated score
print(knn_cv.best_params_)
print(knn_cv.best_score_)
Logistic regression has a regularization parameter: C. C controls the inverse of the regularization strength, and this is what you will tune in this exercise. A large C can lead to an overfit model, while a small C can lead to an underfit model.
X = diabetes.drop(['diabetes'], axis = 1).values
# Target: the diabetes column
y = diabetes.diabetes.values

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# 15 candidate values of C, log-spaced from 1e-5 to 1e8
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space}

# logreg: the estimator to tune
logreg = LogisticRegression()

# logreg_cv: 5-fold grid search over C, fitted on the full data
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)
logreg_cv.fit(X, y)

# Best parameter setting and its cross-validated score
print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_))
print("Best score is {}".format(logreg_cv.best_score_))
A solution to this is to use RandomizedSearchCV, in which not all hyperparameter values are tried out.
Here, we are using the Decision Tree, because it's suited to the random search.
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

# param_dist: fixed lists for max_depth and criterion, randint
# distributions for max_features and min_samples_leaf
param_dist = {
    "max_depth": [3, None],
    "max_features": randint(1, 9),
    "min_samples_leaf": randint(1, 9),
    "criterion": ["gini", "entropy"],
}

# tree: the estimator to tune
tree = DecisionTreeClassifier()

# tree_cv: randomized search with 5-fold cross-validation, fitted on X, y
tree_cv = RandomizedSearchCV(tree, param_dist, cv=5)
tree_cv.fit(X, y)

# Best parameter setting and its cross-validated score
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))
RandomizedSearchCV will never outperform GridSearchCV.
In addition to C, logistic regression has a 'penalty' hyperparameter which specifies whether to use 'l1' or 'l2' regularization.
# Set features and target data
# The diabetes column is the target; all other columns are features
X = diabetes.drop(['diabetes'], axis=1).values
y = diabetes.diabetes.values

# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Create the hyperparameter grid: 15 log-spaced C values x both penalties
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space, 'penalty': ['l1', 'l2']}

# Instantiate the logistic regression classifier: logreg.
# The solver is pinned to 'liblinear' because the grid contains penalty='l1'
# and not every solver accepts that penalty; 'liblinear' handles both
# 'l1' and 'l2'.
logreg = LogisticRegression(solver='liblinear')

# Create train and test sets: 60/40 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42)

# Instantiate the GridSearchCV object: logreg_cv
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

# Fit it to the training data only
logreg_cv.fit(X_train, y_train)

# Print the optimal parameters and best cross-validated score
print("Tuned Logistic Regression Parameter: {}".format(logreg_cv.best_params_))
print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv.best_score_))
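Lasso used the L1 penalty to regularize, while ridge used the L2 penalty. Elastic net uses a linear combination of the L1 and L2 penalties:
$$ a \cdot L_1 + b \cdot L_2 $$
An l1_ratio of 1 corresponds to an L1 penalty, and anything lower is a combination of L1 and L2.
X = gapminder.drop(['life','Region'], axis = 1).values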
# Target: the life column
y = gapminder.life.values

from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split

# 60/40 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42)

# 30 evenly spaced l1_ratio candidates between 0 and 1
l1_space = np.linspace(0, 1, 30)
param_grid = {'l1_ratio': l1_space}

# elastic_net: the estimator to tune
elastic_net = ElasticNet()

# gm_cv: 5-fold grid search, fitted on the training split
gm_cv = GridSearchCV(elastic_net, param_grid, cv=5)
gm_cv.fit(X_train, y_train)

# Evaluate the tuned model on the held-out split
y_pred = gm_cv.predict(X_test)
r2 = gm_cv.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)
print("Tuned ElasticNet l1 ratio: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))
print("Tuned ElasticNet MSE: {}".format(mse))
# Read the auto CSV
file = 'https://assets.datacamp.com/production/course_1939/datasets/auto.csv'
auto = pd.read_csv(file)
auto.head()

# Dummy-encode the frame
auto_origin = pd.get_dummies(auto)
auto_origin.head()

# Variant 1: let pandas drop the first level of each encoded column
auto_origin = pd.get_dummies(auto, drop_first=True)
auto_origin.head()

# Variant 2: encode everything, then drop a chosen dummy column explicitly
auto_origin = pd.get_dummies(auto)
auto_origin = auto_origin.drop('origin_Asia', axis=1)
auto_origin.head()

from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

# mpg is the target; every other (encoded) column is a feature
X = auto_origin.drop(['mpg'], axis=1).values
y = auto_origin.mpg.values

# 70/30 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Ridge regression fitted on the training split, scored on the rest
ridge = Ridge(alpha=0.5, normalize=True)
ridge.fit(X_train, y_train)
ridge.score(X_test, y_test)
gapminder.head()

# Box plot of life, one box per Region, with rotated tick labels
gapminder.boxplot('life', 'Region', rot=60)
plt.show()

# gapminder_region: dummy-encoded frame, all levels kept
gapminder_region = pd.get_dummies(gapminder)
print(gapminder_region.columns)

# gapminder_region: dummy-encoded again, this time dropping the first level
gapminder_region = pd.get_dummies(gapminder, drop_first=True)
print(gapminder_region.columns)

# life is the target; every other (encoded) column is a feature
X = gapminder_region.drop(['life'], axis=1).values
y = gapminder_region.life.values

from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# ridge: the estimator; ridge_cv: its 5-fold cross-validation scores
ridge = Ridge(alpha=0.5, normalize=True)
ridge_cv = cross_val_score(ridge, X, y, cv=5)
print(ridge_cv)
No missing values according to info, but...
diabetes.info()
diabetes.head(15)

# Number of zeros per column
np.sum(diabetes == 0)

# Replace the zeros in these five columns with NaN.
# The replacement is assigned back to the frame rather than done through
# `diabetes.<col>.replace(..., inplace=True)`: an in-place change on a column
# obtained via attribute access is not guaranteed to reach the parent frame.
zero_as_missing = ['glucose', 'diastolic', 'triceps', 'insulin', 'bmi']
diabetes[zero_as_missing] = diabetes[zero_as_missing].replace(0, np.nan)
diabetes.info()

# df: only the rows without any NaN
df = diabetes.dropna()
df.shape
transform() method to transform data
# X = diabetes.drop(['diabetes'], axis = 1).values
# y = diabetes.diabetes.values
diabetes.info()

# The diabetes column is the target; all other columns are features
X = diabetes.drop(['diabetes'], axis=1).values
y = diabetes.diabetes.values

# Stand-alone imputation: fill NaN with the column mean (axis=0).
# NOTE(review): Imputer is only available from sklearn.preprocessing in older
# scikit-learn releases; newer ones provide sklearn.impute.SimpleImputer
# instead — confirm the target version.
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(X)
X = imp.transform(X)

# The same imputation as the first step of a pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer

X = diabetes.drop(['diabetes'], axis=1).values
y = diabetes.diabetes.values

imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
logreg = LogisticRegression()
steps = [
    ('imputation', imp),
    ('logistic_regression', logreg),
]
pipeline = Pipeline(steps)

# 70/30 split with a fixed seed; fit on train, predict and score on test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
pipeline.score(X_test, y_test)
# Read the raw votes CSV, supplying the column names explicitly
file = 'https://assets.datacamp.com/production/course_1939/datasets/house-votes-84.csv'
vote = pd.read_csv(file, names=vote_clean_columns)
vote.head()

# Every '?' cell becomes NaN
vote[vote == '?'] = np.nan

# NaN count per column
print(vote.isnull().sum())

# Shape before dropping anything
print("Shape of Original DataFrame: {}".format(vote.shape))

# Keep only the rows without any NaN, then report the new shape
vote = vote.dropna()
print("Shape of DataFrame After Dropping All Rows with Missing Values: {}".format(vote.shape))
Scikit-learn provides a pipeline constructor that allows you to piece together these steps into one process and thereby simplify your workflow.
You'll now practice setting up a pipeline with two steps: the imputation step, followed by the instantiation of a classifier.
from sklearn.preprocessing import Imputer
from sklearn.svm import SVC

# imp: fills NaN with the most frequent value of each column (axis=0)
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)

# clf: support vector classifier with default settings
clf = SVC()

# steps: imputation first, classifier second
steps = [
    ('imputation', imp),
    ('SVM', clf),
]
# Re-read the raw votes file and recode its markers:
# '?' -> NaN, 'y' -> True, 'n' -> False
vote = pd.read_csv(file, names=vote_clean_columns)
vote.head()
vote[vote == '?'] = np.nan
vote[vote == 'y'] = True
vote[vote == 'n'] = False
vote.head()

# NOTE(review): features and target are taken from vote_clean, not from the
# vote frame prepared just above — confirm this is intended.
y = vote_clean.party.values
X = vote_clean.drop('party', axis=1).values

from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# steps: most-frequent imputation followed by a support vector classifier
steps = [
    ('imputation',
     Imputer(missing_values='NaN', strategy='most_frequent', axis=0)),
    ('SVM', SVC()),
]

# pipeline: both steps chained into one estimator
pipeline = Pipeline(steps)

# 70/30 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Fit on the training split, predict the held-out split
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Per-class precision / recall / F1 report
print(classification_report(y_test, y_pred))
# Read the red-wine CSV (semicolon-separated)
file = 'https://assets.datacamp.com/production/course_1939/datasets/winequality-red.csv'
redwine = pd.read_csv(file, sep=';')
redwine.head()

# Features: everything but quality; target: True where quality is below 5
X = redwine.drop('quality', axis=1).values
y = redwine.quality.values < 5

# Overall mean / standard deviation before and after scaling
from sklearn.preprocessing import scale
X_scaled = scale(X)
print(np.mean(X), np.std(X))
print(np.mean(X_scaled), np.std(X_scaled))

# --- k-NN with scaling inside a pipeline vs. k-NN on the raw features ---
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

steps = [
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
]
pipeline = Pipeline(steps)

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21)

knn_scaled = pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

knn_unscaled = KNeighborsClassifier().fit(X_train, y_train)
knn_unscaled.score(X_test, y_test)

# --- Grid search over n_neighbors of the pipeline's 'knn' step ---
steps = [
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
]
pipeline = Pipeline(steps)
parameters = {'knn__n_neighbors': np.arange(1, 50)}

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21)

cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

# Best setting, test-set score and per-class report
print(cv.best_params_)
print(cv.score(X_test, y_test))
print(classification_report(y_test, y_pred))
# Read the white-wine CSV
file = 'https://assets.datacamp.com/production/course_1939/datasets/white-wine.csv'
whitewine = pd.read_csv(file)
whitewine.head()

# Features: everything but quality; target: True where quality is above 5
X = whitewine.drop('quality', axis=1).values
y = whitewine.quality.values > 5

from sklearn.preprocessing import scale

# X_scaled: the scaled feature matrix
X_scaled = scale(X)

# Overall mean and standard deviation of the raw features
print("Mean of Unscaled Features: {}".format(np.mean(X)))
print("Standard Deviation of Unscaled Features: {}".format(np.std(X)))
print('--------------------------------')

# The same two numbers for the scaled features
print("Mean of Scaled Features: {}".format(np.mean(X_scaled)))
print("Standard Deviation of Scaled Features: {}".format(np.std(X_scaled)))

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# steps: scaler first, k-NN classifier second
steps = [
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
]

# pipeline: both steps chained into one estimator
pipeline = Pipeline(steps)

# 70/30 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# knn_scaled: the pipeline fitted on the training split
knn_scaled = pipeline.fit(X_train, y_train)

# knn_unscaled: plain k-NN fitted on the same, unscaled training split
knn_unscaled = KNeighborsClassifier().fit(X_train, y_train)

# Test-set accuracy of both variants
print('Accuracy with Scaling: {}'.format(knn_scaled.score(X_test, y_test)))
print('Accuracy without Scaling: {}'.format(knn_unscaled.score(X_test, y_test)))
# Pipeline: scaler followed by a support vector classifier
steps = [
    ('scaler', StandardScaler()),
    ('SVM', SVC()),
]
pipeline = Pipeline(steps)

# Candidate values for C and gamma of the pipeline's 'SVM' step
parameters = {
    'SVM__C': [1, 10, 100],
    'SVM__gamma': [0.1, 0.01],
}

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21)

# cv: 3-fold grid search over the pipeline, fitted on the training split
cv = GridSearchCV(pipeline, param_grid=parameters, cv=3)
cv.fit(X_train, y_train)

# y_pred: predicted labels for the held-out split
y_pred = cv.predict(X_test)

# Test-set score, per-class report and the winning parameter setting
print("Accuracy: {}".format(cv.score(X_test, y_test)))
print('--------------------------------')
print(classification_report(y_test, y_pred))
print('--------------------------------')
print("Tuned Model Parameters: {}".format(cv.best_params_))
# Dummy-encode the Gapminder frame, dropping the first level
gapminder_region = pd.get_dummies(gapminder, drop_first=True)

# life is the target; every other (encoded) column is a feature
X = gapminder_region.drop(['life'], axis=1).values
y = gapminder_region.life.values

# Setup the pipeline steps: mean imputation -> scaling -> elastic net
steps = [('imputation', Imputer(missing_values='NaN', strategy='mean', axis=0)),
         ('scaler', StandardScaler()),
         ('elasticnet', ElasticNet())]

# Create the pipeline: pipeline
pipeline = Pipeline(steps)

# Specify the hyperparameter space: 30 l1_ratio candidates between 0 and 1
parameters = {'elasticnet__l1_ratio': np.linspace(0, 1, 30)}

# Create train and test sets: 60/40 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42)

# Create the GridSearchCV object: gm_cv (3-fold)
gm_cv = GridSearchCV(pipeline, param_grid=parameters, cv=3)

# Fit to the training set
gm_cv.fit(X_train, y_train)

# Compute and print the metrics.
# The tuned hyperparameter is l1_ratio, so the label names it as such instead
# of "Alpha" (the grid above never varies alpha).
r2 = gm_cv.score(X_test, y_test)
print("Tuned ElasticNet l1 ratio: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))