Clustering for dataset exploration
Visualization with hierarchical clustering and t-SNE
Decorrelating your data and dimension reduction
Discovering interpretable features
import pandas as pd
import matplotlib.pyplot as plt
%run data/data.py
We'll cluster these 2D points with k-means from scikit-learn (sklearn); first, plot them.
xs = points[:,0]
ys = points[:,1]
plt.scatter(xs, ys)
plt.show()
# Import KMeans
from sklearn.cluster import KMeans
# Create a KMeans instance with 3 clusters: model
model = KMeans(n_clusters = 3)
# Fit model to points
model.fit(points)
# Determine the cluster labels of new_points: labels
labels = model.predict(new_points)
# Print cluster labels of new_points
print(labels)
# Import pyplot
import matplotlib.pyplot as plt
# Assign the columns of new_points: xs and ys
xs = new_points[:,0]
ys = new_points[:,1]
# Make a scatter plot of xs and ys, using labels to define the colors
plt.scatter(xs,ys, c=labels, alpha = 0.5)
# Assign the cluster centers: centroids
centroids = model.cluster_centers_
# Assign the columns of centroids: centroids_x, centroids_y
centroids_x = centroids[:,0]
centroids_y = centroids[:,1]
# Make a scatter plot of centroids_x and centroids_y
plt.scatter(centroids_x, centroids_y, marker = 'D', s=50, c = 'k')
plt.show()
How would we evaluate a clustering if there were no species information? One common measure is inertia: how spread out the clusters are (lower is better). k-means aims to minimize it, and after fit() it is available as the attribute inertia_.
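As a quick illustration (a minimal sketch, assuming the points array from the scatter plot above is still in memory), the inertia can be read straight off a fitted model:
from sklearn.cluster import KMeans
# Fit k-means and print the inertia: the sum of squared distances of samples
# to their nearest cluster centre (lower means tighter clusters)
model = KMeans(n_clusters = 3)
model.fit(points)
print(model.inertia_)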
from urllib.request import urlretrieve
import numpy as np
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt'
urlretrieve(url, 'data/uci_rice')
rice_features = np.loadtxt('data/uci_rice')[:, :7]  # keep the 7 measurements; drop the class-label column
print(rice_features.shape)
ks = range(1, 6)
inertias = []
for k in ks:
    # Create a KMeans instance with k clusters: model
    model = KMeans(n_clusters = k)
    # Fit model to samples
    model.fit(rice_features)
    # Append the inertia to the list of inertias
    inertias.append(model.inertia_)
# Plot ks vs inertias
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()
rice_names = np.concatenate([np.repeat(name, 70) for name in ['Kama', 'Rosa', 'Canadian']])
rice_names
# Create a KMeans model with 3 clusters: model
model = KMeans(n_clusters = 3)
# Use fit_predict to fit model and obtain cluster labels: labels
labels = model.fit_predict(rice_features)
# Create a DataFrame with labels and varieties as columns: df
df = pd.DataFrame({'labels': labels, 'varieties': rice_names})
# Create crosstab: ct
ct = pd.crosstab(df['labels'], df['varieties'])
# Display ct
print(ct)
file = 'https://assets.datacamp.com/production/course_2072/datasets/wine.csv'
wines = pd.read_csv(file)
wines.head()
wine_features = wines.drop(['class_label', 'class_name'], axis = 1)
wine_names = wines.class_name
from sklearn.cluster import KMeans
model = KMeans(n_clusters = 3)
labels = model.fit_predict(wine_features)
df = pd.DataFrame({'labels':labels, 'names':wine_names})
ct = pd.crosstab(df['labels'], df['names'])
ct
The wine features have very different variances, which throws off k-means; proline, for example, has a standard deviation of about 314.
wine_features.describe()
StandardScaler transforms each feature to have mean 0 and variance 1.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(wine_features)
wine_scaled = scaler.transform(wine_features)
wine_scaled
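A quick sanity check (sketch): after scaling, every column should have mean roughly 0 and standard deviation roughly 1.
print(wine_scaled.mean(axis = 0).round(2))  # ~0 for every feature
print(wine_scaled.std(axis = 0).round(2))   # ~1 for every feature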
StandardScaler and KMeans have similar APIs, but the methods differ: fit() / transform() with StandardScaler, fit() / predict() with KMeans. A pipeline chains the scaling and clustering steps together.
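To make that pattern concrete, here is a minimal sketch of the same two steps done by hand, without a pipeline, on the wine data already in memory:
# Scale the features, then cluster the scaled data (manual version of the pipeline below)
scaled = StandardScaler().fit_transform(wine_features)
manual_labels = KMeans(n_clusters = 3).fit_predict(scaled)
print(manual_labels[:10])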
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
scaler = StandardScaler()
kmeans = KMeans(n_clusters = 3)
pipeline = make_pipeline(scaler, kmeans)
pipeline.fit(wine_features)
labels = pipeline.predict(wine_features)
labels
df = pd.DataFrame({'labels':labels, 'names':wine_names})
ct = pd.crosstab(df['labels'], df['names'])
ct
# Perform the necessary imports
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# Create scaler: scaler
scaler = StandardScaler()
# Create KMeans instance: kmeans
kmeans = KMeans(n_clusters = 4)
# Create pipeline: pipeline
pipeline = make_pipeline(scaler, kmeans)
file = 'https://assets.datacamp.com/production/course_2072/datasets/fish.csv'
fish = pd.read_csv(file, header = None)
fish.head()
fish_features = fish.drop([0], axis = 1)
fish_features.head()
fish_names = fish[0]
fish_names[0:6]
# Import pandas
import pandas as pd
# Fit the pipeline to samples
pipeline.fit(fish_features)
# Calculate the cluster labels: labels
labels = pipeline.predict(fish_features)
# Create a DataFrame with labels and species as columns: df
df = pd.DataFrame({'labels': labels,'species': fish_names})
# Create crosstab: ct
ct = pd.crosstab(df['labels'], df['species'])
# Display ct
print(ct)
## pipeline with no scaler
pipeline = make_pipeline(kmeans)
pipeline.fit(fish_features)
labels = pipeline.predict(fish_features)
df = pd.DataFrame({'labels': labels,'species': fish_names})
ct = pd.crosstab(df['labels'], df['species'])
print(ct)
file = 'https://assets.datacamp.com/production/course_2072/datasets/company-stock-movements-2010-2015-incl.csv'
movements = pd.read_csv(file)
movements.head()
movements_features = movements.drop(['Unnamed: 0'], axis = 1)
movements_names = movements['Unnamed: 0']
# Import Normalizer
from sklearn.preprocessing import Normalizer
# Create a normalizer: normalizer
normalizer = Normalizer()
# Create a KMeans model with 10 clusters: kmeans
kmeans = KMeans(n_clusters = 10)
# Make a pipeline chaining normalizer and kmeans: pipeline
pipeline = make_pipeline(normalizer, kmeans)
# Fit pipeline to the daily price movements
pipeline.fit(movements_features)
# Import pandas
import pandas as pd
# Predict the cluster labels: labels
labels = pipeline.predict(movements_features)
# Create a DataFrame aligning labels and companies: df
df = pd.DataFrame({'labels': labels, 'companies': movements_names})
# Display df sorted by cluster label
print(df.sort_values('labels'))
# Perform the necessary imports
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt
# Calculate the linkage: mergings
mergings = linkage(rice_features, method = 'complete')
# Plot the dendrogram, using varieties as labels
plt.figure(figsize=(16,10))
dendrogram(
mergings,
labels=rice_names.tolist(),
leaf_rotation=90,
leaf_font_size=6)
plt.show()
# Import normalize
from sklearn.preprocessing import normalize
# Normalize the movements: normalized_movements
normalized_movements = normalize(movements_features)
# Calculate the linkage: mergings
mergings = linkage(normalized_movements, method = 'complete')
# Plot the dendrogram
plt.figure(figsize=(16,10))
dendrogram(
mergings,
labels = movements_names.tolist(),
leaf_rotation = 90,
leaf_font_size = 10)
plt.show()
Cluster labels can be extracted from a hierarchical clustering at any chosen height: compute mergings = linkage(samples, method = "complete"), then use the fcluster() function to cut the dendrogram at that height (a quick sketch follows; the full example on the seeds data appears further below).
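A minimal sketch, reusing the mergings array from the stock-movement linkage above and an arbitrary cut height of 1.0:
from scipy.cluster.hierarchy import fcluster
flat_labels = fcluster(mergings, 1.0, criterion = 'distance')  # one label per company
print(flat_labels[:10])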
file = 'https://assets.datacamp.com/production/course_2072/datasets/eurovision-2016.csv'
eurovision = pd.read_csv(file)
eurovision.head()
eurovision['To country'].nunique()
eurovision['From country'].nunique()
eurovision.describe()
euro_pivot = eurovision.pivot(index = 'From country', columns = 'To country', values = 'Jury Rank')
print(euro_pivot.shape)
euro_pivot.head()
eurovision_features = euro_pivot.fillna(0)
eurovision_features.head()
eurovision_names = euro_pivot.index.tolist()
eurovision_names[:6]
# Perform the necessary imports
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram
# Calculate the linkage: mergings
mergings = linkage(eurovision_features, method = "single")
# Plot the dendrogram
plt.figure(figsize=(16,10))
dendrogram(
mergings,
labels = eurovision_names,
leaf_rotation = 90,
leaf_font_size = 12)
plt.show()
This is what it would look like with complete linkage:
# Calculate the linkage: mergings
mergings = linkage(eurovision_features, method = "complete")
# Plot the dendrogram
plt.figure(figsize=(16,10))
dendrogram(
mergings,
labels = eurovision_names,
leaf_rotation = 90,
leaf_font_size = 12)
plt.show()
# Calculate the linkage: mergings
mergings = linkage(rice_features, method = "complete")
# Plot the dendrogram
plt.figure(figsize=(16,10))
dendrogram(
mergings,
labels = rice_names,
leaf_rotation = 90,
leaf_font_size = 12)
plt.show()
# Perform the necessary imports
import pandas as pd
from scipy.cluster.hierarchy import fcluster
# Use fcluster to extract labels: labels
labels = fcluster(
mergings,
8,
criterion = 'distance')
# Create a DataFrame with labels and varieties as columns: df
df = pd.DataFrame({'labels': labels, 'varieties': rice_names})
# Create crosstab: ct
ct = pd.crosstab(df['labels'],df['varieties'])
# Display ct
print(ct)
t-SNE has only a fit_transform() method; there are no separate fit() and transform() methods, so it cannot be applied to new samples without refitting.
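A quick check of that API difference (a minimal sketch):
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
print(hasattr(PCA(), 'transform'))   # True: a fitted PCA can embed new samples
print(hasattr(TSNE(), 'transform'))  # False: t-SNE must be refit to include new samples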
classnames, indices = np.unique(rice_names, return_inverse=True)  # integer codes for coloring by variety
# Import TSNE
from sklearn.manifold import TSNE
# Create a TSNE instance: model
model = TSNE(learning_rate = 200)
# Apply fit_transform to samples: tsne_features
tsne_features = model.fit_transform(rice_features)
# Select the 0th feature: xs
xs = tsne_features[:,0]
# Select the 1st feature: ys
ys = tsne_features[:,1]
# Scatter plot, coloring by variety_numbers
plt.figure(figsize=(16,10))
plt.scatter(xs, ys, c = indices)
plt.show()
# Import TSNE
from sklearn.manifold import TSNE
# Create a TSNE instance: model
model = TSNE(learning_rate = 50)
# Apply fit_transform to normalized_movements: tsne_features
tsne_features = model.fit_transform(normalized_movements)
# Select the 0th feature: xs
xs = tsne_features[:,0]
# Select the 1st feature: ys
ys = tsne_features[:,1]
# Scatter plot
plt.figure(figsize=(16,10))
plt.scatter(xs, ys, alpha = .5)
# Annotate the points
for x, y, company in zip(xs, ys, movements_names):
    plt.annotate(company, (x, y), fontsize=10, alpha=0.75)
plt.show()
PCA follows the usual scikit-learn pattern: fit() learns the transformation from the given data and transform() applies it; transform() can also be applied to new data. The principal components are available as the components_ attribute of the PCA object.
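A minimal sketch of that pattern, assuming the seeds measurements rice_features from earlier are still in memory: fit on one portion of the data, then transform unseen rows.
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(rice_features[:150])                   # learn the transformation
new_proj = pca.transform(rice_features[150:])  # apply it to samples not seen during fit
print(new_proj.shape)
print(pca.components_.shape)                   # one row per principal component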
grains = pd.read_csv('data/seeds-width-vs-length.csv', header = None)
grains.head()
# Perform the necessary imports
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
# Assign the 0th column of grains: width
width = grains.loc[:,0]
# Assign the 1st column of grains: length
length = grains.loc[:,1]
# Scatter plot width vs length
plt.scatter(width, length)
plt.axis('equal')
plt.show()
# Calculate the Pearson correlation
correlation, pvalue = pearsonr(width, length)
# Display the correlation
print(correlation)
# Import PCA
from sklearn.decomposition import PCA
# Create PCA instance: model
model = PCA()
# Apply the fit_transform method of model to grains: pca_features
pca_features = model.fit_transform(grains)
# Assign 0th column of pca_features: xs
xs = pca_features[:,0]
# Assign 1st column of pca_features: ys
ys = pca_features[:,1]
# Scatter plot xs vs ys
plt.scatter(xs, ys)
plt.axis('equal')
plt.show()
# Calculate the Pearson correlation of xs and ys
correlation, pvalue = pearsonr(xs, ys)
# Display the correlation
print(correlation)
# Make a scatter plot of the untransformed points
plt.scatter(grains.loc[:,0], grains.loc[:,1])
# Create a PCA instance: model
model = PCA()
# Fit model to points
model.fit(grains)
# Get the mean of the grain samples: mean
mean = model.mean_
# Get the first principal component: first_pc
first_pc = model.components_[0,:]
# Plot first_pc as an arrow, starting at mean
plt.arrow(mean[0], mean[1], first_pc[0], first_pc[1], color='red', width=0.01)
# Keep axes on same scale
plt.axis('equal')
plt.show()
# Perform the necessary imports
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
# Create scaler: scaler
scaler = StandardScaler()
# Create a PCA instance: pca
pca = PCA()
# Create pipeline: pipeline
pipeline = make_pipeline(scaler, pca)
# Fit the pipeline to 'samples'
pipeline.fit(fish_features)
# Plot the explained variances
features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_)
plt.xlabel('PCA feature')
plt.ylabel('variance')
plt.xticks(features)
plt.show()
Large sparse datasets, like the word-frequency arrays used later, are stored as a scipy.sparse.csr_matrix instead of a NumPy array; a csr_matrix remembers only the non-zero entries (saves space!).
# Import scale and PCA
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
fish_scaled = scale(fish_features)
# Create a PCA model with 2 components: pca
pca = PCA(n_components = 2)
# Fit the PCA instance to the scaled samples
pca.fit(fish_scaled)
# Transform the scaled samples: pca_features
pca_features = pca.transform(fish_scaled)
# Print the shape of pca_features
print(pca_features.shape)
example_documents = ['cats say meow', 'dogs say woof', 'dogs chase cats']
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Create a TfidfVectorizer: tfidf
tfidf = TfidfVectorizer()
# Apply fit_transform to document: csr_mat
csr_mat = tfidf.fit_transform(example_documents)
# Print result of toarray() method
print(csr_mat.toarray())
# Get the words: words
words = tfidf.get_feature_names_out()  # use tfidf.get_feature_names() on scikit-learn versions before 1.0
# Print words
print(words)
import pandas as pd
from scipy.sparse import csr_matrix
df = pd.read_csv('data/wikipedia-vectors.csv', index_col=0)
df.head()
articles = csr_matrix(df.transpose())
titles = list(df.columns)
# Perform the necessary imports
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
# Create a TruncatedSVD instance: svd
svd = TruncatedSVD(n_components = 50)
# Create a KMeans instance: kmeans
kmeans = KMeans(n_clusters = 6)
# Create a pipeline: pipeline
pipeline = make_pipeline(svd, kmeans)
# Import pandas
import pandas as pd
# Fit the pipeline to articles
pipeline.fit(articles)
# Calculate the cluster labels: labels
labels = pipeline.predict(articles)
# Create a DataFrame aligning labels and titles: df
df = pd.DataFrame({'label': labels, 'article': titles})
# Display df sorted by cluster label
print(df.sort_values('label'))
NMF follows the same fit() / transform() pattern as other scikit-learn models, but the number of components must always be specified, e.g. NMF(n_components = 2). It works both with NumPy arrays and with csr_matrix.
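As a toy illustration of the decomposition (a minimal sketch with a small made-up non-negative array): fit_transform returns the feature matrix W, components_ holds H, and their product approximately reconstructs the input.
import numpy as np
from sklearn.decomposition import NMF
V = np.array([[1.0, 0.5, 0.0],
              [0.0, 1.0, 1.0],
              [1.0, 0.0, 2.0]])
toy = NMF(n_components = 2, init = 'random', random_state = 0, max_iter = 1000)
W = toy.fit_transform(V)    # sample-by-component features
H = toy.components_         # component-by-feature matrix
print(np.round(W @ H, 2))   # approximately reconstructs V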
# Import NMF
from sklearn.decomposition import NMF
# Create an NMF instance: model
model = NMF(n_components = 6)
# Fit the model to articles
model.fit(articles)
# Transform the articles: nmf_features
nmf_features = model.transform(articles)
# Print the NMF features
print(nmf_features[:6])
# Import pandas
import pandas as pd
# Create a pandas DataFrame: df
df = pd.DataFrame(nmf_features, index = titles)
df.head()
# Print the row for 'Anne Hathaway'
print(df.loc['Anne Hathaway'])
# Print the row for 'Denzel Washington'
print(df.loc['Denzel Washington'])
Applied to the articles (tf-idf word frequencies), the NMF components correspond to topics and the NMF features combine those topics into each article. Load the vocabulary so the components can be interpreted:
words = pd.read_csv('data/wikipedia-vocabulary-utf8.txt', header = None)[0]
words[:6]
# Import pandas
import pandas as pd
# Create a DataFrame: components_df
components_df = pd.DataFrame(model.components_, columns = words)
components_df.head()
# Print the shape of the DataFrame
print(components_df.shape)
# Select row 3: component
component = components_df.iloc[3,:]
# Print result of nlargest
print(component.nlargest(20))
file = 'https://assets.datacamp.com/production/course_2072/datasets/lcd-digits.csv'
digits = pd.read_csv(file, header = None).to_numpy()  # .as_matrix() was removed from pandas; use .to_numpy()
digits
# Import pyplot
from matplotlib import pyplot as plt
# Select the 0th row: digit
digit = digits[0,:]
# Print digit
print(digit)
# Reshape digit to a 13x8 array: bitmap
bitmap = digit.reshape(13,8)
# Print bitmap
print(bitmap)
# Use plt.imshow to display bitmap
plt.imshow(bitmap, cmap='gray', interpolation='nearest')
plt.colorbar()
plt.show()
# Import NMF
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF
# Create an NMF model: model
model = NMF(n_components = 7)
# Apply fit_transform to samples: features
features = model.fit_transform(digits)
# Call show_as_image on each component
plt.figure(figsize=(18,10))
x = 1
for component in model.components_:
    bitmap = component.reshape(13,8)
    plt.subplot(2,4,x)
    plt.imshow(bitmap, cmap='gray', interpolation='nearest')
    x += 1
plt.show()
# Assign the 0th row of features: digit_features
digit_features = features[0,:]
# Print digit_features
print(digit_features)
def show_as_image(vector, x):
    """
    Given a 1d vector representing an image, display that image in
    black and white. If there are negative values, then use red for
    that pixel.
    """
    bitmap = vector.reshape((13, 8))  # reshape to the 13x8 LCD grid
    bitmap /= np.abs(vector).max()  # normalise
    bitmap = bitmap[:,:,np.newaxis]
    rgb_layers = [np.abs(bitmap)] + [bitmap.clip(0)] * 2
    rgb_bitmap = np.concatenate(rgb_layers, axis=-1)
    plt.subplot(2,4,x)
    plt.imshow(rgb_bitmap, interpolation='nearest')
    plt.xticks([])
    plt.yticks([])
# Import PCA
from sklearn.decomposition import PCA
# Create a PCA instance: model
model = PCA(n_components = 7)
# Apply fit_transform to samples: features
features = model.fit_transform(digits)
# Call show_as_image on each component
plt.figure(figsize=(18,10))
x = 1
for component in model.components_:
    show_as_image(component, x)
    x += 1
plt.show()
# Perform the necessary imports
import pandas as pd
from sklearn.preprocessing import normalize
# Normalize the NMF features: norm_features
norm_features = normalize(nmf_features)
# Create a DataFrame: df
df = pd.DataFrame(norm_features, index = titles)
# Select the row corresponding to 'Cristiano Ronaldo': article
article = df.loc['Cristiano Ronaldo']
# Compute the dot products: similarities
similarities = df.dot(article)
# Display those with the largest cosine similarity
print(similarities.nlargest(10))
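Because the rows were L2-normalized, each dot product above is exactly a cosine similarity; a quick cross-check against scikit-learn (sketch):
from sklearn.metrics.pairwise import cosine_similarity
pos = titles.index('Cristiano Ronaldo')
sims_check = cosine_similarity(nmf_features, nmf_features[pos].reshape(1, -1)).ravel()
print(sims_check[:5])           # cosine similarities from scikit-learn
print(similarities.values[:5])  # dot products of the normalized rows: the same numbers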
file = 'data/scrobbler-small-sample.csv'
artists = pd.read_csv(file)
artists.head()
artists.shape
artists_spread = artists.pivot(
index = 'artist_offset',
columns = 'user_offset',
values = 'playcount'
).fillna(0)
artists_spread.head()
artists_spread.shape
## load the corresponding artist names
file = 'data/artists.csv'
artists_names = pd.read_csv(file, header = None)[0].tolist()
artists_names[:6]
# Perform the necessary imports
from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.pipeline import make_pipeline
# Create a MaxAbsScaler: scaler
scaler = MaxAbsScaler()
# Create an NMF model: nmf
nmf = NMF(n_components = 20)
# Create a Normalizer: normalizer
normalizer = Normalizer()
# Create a pipeline: pipeline
pipeline = make_pipeline(scaler, nmf, normalizer)
# Apply fit_transform to artists: norm_features
norm_features = pipeline.fit_transform(artists_spread)
norm_features.shape
# Import pandas
import pandas as pd
# Create a DataFrame: df
df = pd.DataFrame(norm_features, index = artists_names)
df.head()
# Select row of 'Bruce Springsteen': artist
artist = df.loc['Bruce Springsteen']
# Compute cosine similarities: similarities
similarities = df.dot(artist)
# Display those with highest cosine similarity
print(similarities.nlargest(10))