# import necessary libraries
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler
from scipy import linalg
from scipy.linalg import norm
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib as mpl
import time
%matplotlib inline
# read training and test data from excel file
xl = pd.ExcelFile("data_handwritten.xlsx")
# convert data to numpy arrays and transpose them
# in order to handle them in a more convenient
# way
x_train = np.asarray(xl.parse("azip", header=None)).T
y_train = np.asarray(xl.parse("dzip", header=None)).T
x_test = np.asarray(xl.parse("testzip", header=None)).T
y_test = np.asarray(xl.parse("dtest", header=None)).T
# although the data already seem normalized, scale them
# to the range [-1, 1] before applying the SVD in order
# to avoid any scaling issues
normalizer = MinMaxScaler(feature_range=(-1.0, 1.0))
X_train = normalizer.fit_transform(x_train)
X_test = normalizer.transform(x_test)
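As a quick sanity check (a minimal sketch using the variables defined above), the scaled training features should now lie in [-1, 1]; the test features may fall slightly outside that range, since the scaler is fitted on the training data only.
# sanity check: training features lie in [-1, 1] after scaling;
# test features may fall slightly outside, since the scaler was
# fitted on the training data only
print("train range:", X_train.min(), X_train.max())
print("test range: ", X_test.min(), X_test.max())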
# create a dictionary having as keys the
# digits from 0 to 9 and as values all
# the training data corresponding to that specific
# key. This data separation is a necessary step for
# the implementation of the classification algorithm
# and its variations.
def create_dict(X, y):
    dict_digits = {}
    for i, digit in enumerate(list(y)):
        digit = int(digit)
        if digit in dict_digits.keys():
            dict_digits[digit].append(X[i])
        else:
            dict_digits[digit] = [X[i]]
    return dict_digits
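A brief usage sketch (the actual call used for the classification task appears further below): the returned dictionary maps each digit label to the list of its 256-dimensional training images, so the class sizes can be inspected directly. The variable name example_dict is only used here for illustration.
# illustrative usage: print the number of training images per digit class
example_dict = create_dict(X_train, y_train)
for d in sorted(example_dict.keys()):
    print(d, len(example_dict[d]))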
# method to compute the first k left singular
# vectors and singular values of the training data
def svd_tranform(components, X_train):
    U_train, s_train, Vh_train = linalg.svd(np.asarray(X_train).T,
                                            check_finite=False)
    return U_train[:, 0:components], s_train[0:components]
# method to compute the truncated SVD per digit.
# Returns a dictionary containing different numbers
# of basis vectors (1 to 20) for each digit.
def trunc_svd(dict_train):
    svd_dict = {}
    for component in range(1, 21):
        temp = {}
        for digit in sorted(dict_train.keys()):
            temp[digit] = svd_tranform(component, dict_train[digit])[0]
        svd_dict[component] = temp
    return svd_dict
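For reference, a small usage sketch of the dictionary returned above: svd_dict[k][d] holds the first k left singular vectors of digit class d, stored column-wise as a 256 x k matrix. The variable name example_svd is used only for illustration, and the call simply recomputes what is done for the classification task further below.
# illustrative usage: for k = 15 components and digit 3 the stored
# basis is a 256 x 15 matrix of left singular vectors
example_svd = trunc_svd(create_dict(X_train, y_train))
print(example_svd[15][3].shape)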
# method to convert arrays to images and plot
# the first 21 training digits from a given
# class, three per row
def to_image(digit, dict_train):
    counter = 0
    while counter < 21:
        fig, (ax1, ax2, ax3) = plt.subplots(figsize=(6, 6), ncols=3)
        array = dict_train[digit][counter].reshape(16, 16)
        array = (array - min(array.ravel())) * np.ones(array.shape)
        array = (20 / max(array.ravel())) * array
        ax1.imshow(np.uint8(array.reshape(16, 16)), cmap="gray")
        counter += 1
        array = dict_train[digit][counter].reshape(16, 16)
        array = (array - min(array.ravel())) * np.ones(array.shape)
        array = (20 / max(array.ravel())) * array
        ax2.imshow(np.uint8(array.reshape(16, 16)), cmap="gray")
        counter += 1
        array = dict_train[digit][counter].reshape(16, 16)
        array = (array - min(array.ravel())) * np.ones(array.shape)
        array = (20 / max(array.ravel())) * array
        ax3.imshow(np.uint8(array.reshape(16, 16)), cmap="gray")
        # advance past the third image so consecutive rows
        # do not repeat the same digit
        counter += 1
def svd_classifier(x_test, components, svd_dict):
    # the 256 x 256 identity matrix (ones on the
    # diagonal and zeros elsewhere)
    ones = np.eye(256)
    # initialize a list for the predictions
    y_pred = []
    for sample in x_test:
        # initialize a list for the residual norms
        norms = []
        for digit in svd_dict[components].keys():
            svd = svd_dict[components][digit]
            # compute the 10 relative least squares residuals
            norms.append(norm(np.dot(ones - np.dot(svd, svd.T),
                                     sample)) / norm(sample))
        # classify as the digit whose basis yields the
        # smallest residual
        y_pred.append(norms.index(min(norms)))
    # return the predictions
    return y_pred
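As a side note on the residual computation above: forming the 256 x 256 identity matrix is not strictly necessary. Because the columns of the truncated U are orthonormal, the relative residual ||(I - U U^T) x|| / ||x|| equals ||x - U (U^T x)|| / ||x||, so it can be computed directly from the projection of the sample onto the basis. A minimal sketch with a hypothetical helper (the name relative_residual is not part of the original code):
# hypothetical helper, equivalent to the expression used inside
# svd_classifier: ||(I - U U^T) x|| / ||x|| == ||x - U (U^T x)|| / ||x||
def relative_residual(svd, sample):
    # project the sample onto the span of the basis vectors
    projection = np.dot(svd, np.dot(svd.T, sample))
    # measure how much of the sample is left unexplained
    return norm(sample - projection) / norm(sample)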
def svd_classifier_adj(x_test, components, svd_dict, dig_component):
    ones = np.eye(256)
    y_pred = []
    for sample in x_test:
        norms = []
        for digit in svd_dict[components].keys():
            # since for digits 0, 1 and 6 high accuracy and
            # F1-score can be achieved relatively easily, use a
            # smaller number of basis vectors for digit 1 and
            # apply the same procedure as before otherwise
            if digit == 1:
                svd = svd_dict[dig_component][digit]
                norms.append(norm(np.dot(ones - np.dot(svd, svd.T),
                                         sample)) / norm(sample))
            else:
                svd = svd_dict[components][digit]
                norms.append(norm(np.dot(ones - np.dot(svd, svd.T),
                                         sample)) / norm(sample))
        y_pred.append(norms.index(min(norms)))
    return y_pred
def svd_classifier_adj2(x_test, components, svd_dict):
    ones = np.eye(256)
    y_pred = []
    counter = 0
    digits_pass = {el: 0 for el in list(svd_dict[components].keys())}
    for sample in x_test:
        norms = []
        for digit in svd_dict[components].keys():
            svd = svd_dict[components][digit]
            # first stage: compare the unknown digit only to the
            # first singular vector of each class
            norms.append(norm(np.dot(ones - np.dot(svd[:, 0:1], svd[:, 0:1].T),
                                     sample)) / norm(sample))
        # if for one class the residual is significantly smaller
        # than for the others, classify as that class and skip
        # the second stage
        if sorted(norms)[1] - sorted(norms)[0] > 0.1:
            pred = norms.index(min(norms))
            y_pred.append(pred)
            counter += 1
            digits_pass[pred] += 1
            continue
        # second stage: repeat the comparison using the full
        # set of basis vectors for every class
        norms = []
        for digit in svd_dict[components].keys():
            svd = svd_dict[components][digit]
            norms.append(norm(np.dot(ones - np.dot(svd, svd.T),
                                     sample)) / norm(sample))
        y_pred.append(norms.index(min(norms)))
    print("The second stage is unnecessary", counter, "times.")
    print(digits_pass)
    return y_pred
# initialize variables for the classification task
dict_train = create_dict(X_train, y_train)
svd_dict = trunc_svd(dict_train)
The following block of code runs the SVD classification algorithm for numbers of basis vectors ranging from 5 to 20. Different metrics are used to evaluate the different approaches, i.e. accuracy, recall, precision and F1-score.
Accuracy is the number of correct predictions divided by the total number of predictions, multiplied by one hundred to turn it into a percentage. In order to calculate the total cost ratio mentioned below, we also calculate the error rate, which is one minus the accuracy.
Precision is the number of True Positives divided by the number of True Positives plus False Positives. Precision can be considered a measure of a classifier's exactness. A low precision can also indicate a large number of False Positives.
Recall is the number of True Positives divided by the number of True Positives plus the number of False Negatives. Put another way, it is the number of correct positive predictions divided by the number of positive class values in the test data. It is also known as Sensitivity or the True Positive Rate. Recall is a measure of a classifier's completeness. A low recall indicates many False Negatives.
A system with high recall but low precision returns many results, but most of its predicted labels are incorrect when compared to the training labels. A system with high precision but low recall is just the opposite, returning very few results, but most of its predicted labels are correct when compared to the training labels. An ideal system with high precision and high recall will return many results, with all results labeled correctly.
The F1-score is 2 * (precision * recall) / (precision + recall). In other words, the F1-score conveys the balance between the precision and the recall.
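As a small worked example of these definitions (the counts below are made up for illustration and are not computed from the handwritten-digit data): with 90 True Positives, 10 False Positives and 30 False Negatives, the precision is 0.9, the recall is 0.75 and the F1-score is about 0.82.
# illustrative counts only, not taken from the data
tp, fp, fn = 90, 10, 30
precision_ex = tp / (tp + fp)   # 90 / 100 = 0.9
recall_ex = tp / (tp + fn)      # 90 / 120 = 0.75
f1_ex = 2 * (precision_ex * recall_ex) / (precision_ex + recall_ex)
print(precision_ex, recall_ex, f1_ex)   # approximately 0.9 0.75 0.818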
accuracy_test = []
accuracy_train = []
vectors = np.arange(5, 21, 1)
# Use the first few (5-20) singular vectors as basis and
# classify unknown test digits according to how well
# they can be represented in terms of the respective bases.
# Calculate accuracy, precision, recall and f1-score for
# the different number of basis vectors.
for i in range(5, 21):
    y_pred_train = svd_classifier(X_train, i, svd_dict)
    y_pred_test = svd_classifier(X_test, i, svd_dict)
    print("Number of SVD basis vectors:", i)
    # scikit-learn metrics expect the ground truth first
    # and the predictions second
    acc_train = accuracy_score(y_train.ravel(), y_pred_train)
    acc_test = accuracy_score(y_test.ravel(), y_pred_test)
    print("Accuracy: %0.6f" % acc_test)
    precision = precision_score(y_test.ravel(), y_pred_test, average="macro")
    print('Precision:', precision)
    recall = recall_score(y_test.ravel(), y_pred_test, average="macro")
    print('Recall:', recall)
    f1score = f1_score(y_test.ravel(), y_pred_test, average="macro")
    print('F1-score:', f1score)
    accuracy_test.append(acc_test)
    accuracy_train.append(acc_train)
    print()
# plot the percentage of correctly classified digits
# as a function of the number of basis vectors for the
# training set.
fig = plt.figure()
ax = fig.add_subplot(111)
plt.xlabel('Number of Basis Vectors')
plt.ylabel('Accuracy')
plt.title("Accuracy vs. Number of Basis Vectors on Train Set")
ymax = max(accuracy_train)
xpos = accuracy_train.index(ymax)
xmax = vectors[xpos]
print("---Maximum Accuracy---")
print("Number of Basis Vectors: " + str(xmax) + ", Accuracy: " + str(ymax))
ax.annotate('maximum accuracy', xy=(xmax, ymax), xytext=(xmax-4, ymax-0.004),
arrowprops=dict(color='red', arrowstyle="->"),
)
plt.plot(vectors, accuracy_train, marker='*')
plt.show()
# plot the percentage of correctly classified digits
# as a function of the number of basis vectors for the
# test set. We observe maximum accuracy for 18 basis vectors.
# However, for 15 basis vectors we achieve almost the same
# accuracy, so we could use 15 basis vectors in favor of a
# more parsimonious model.
fig = plt.figure()
ax = fig.add_subplot(111)
plt.xlabel('Number of Basis Vectors')
plt.ylabel('Accuracy')
plt.title("Accuracy vs. Number of Basis Vectors on Test Set")
ymax = max(accuracy_test)
xpos = accuracy_test.index(ymax)
xmax = vectors[xpos]
print("---Maximum Accuracy---")
print("Number of Basis Vectors: " + str(xmax) + ", Accuracy: " + str(ymax))
ax.annotate('maximum accuracy', xy=(xmax, ymax), xytext=(xmax-3, ymax-0.01),
arrowprops=dict(color='red', arrowstyle="->"))
plt.plot(vectors, accuracy_test, marker='*')
plt.show()
# print the main classification metrics, i.e.
# precision, recall and f1-score for each class.
# We observe that it is easier to classify digits
# 0, 1 and 6 and more difficult to classify digits 5 and 3.
y_pred = svd_classifier(X_test, 15, svd_dict)
print("Classification report for %s:\n%s\n"
% ("SVD basis classification algorithm",
classification_report(y_test, y_pred)))
# print the confusion matrix.
# We observe again that it is easier to classify digits
# 0, 1 and 6 and more difficult to classify digits 5 and 3.
digits = sorted(set(list(y_test.ravel())))
confusion = confusion_matrix(y_test, y_pred, labels=digits)
print(confusion)
# print the digits 5 and 3, as they seem to be
# the most difficult digits to predict. We observe
# that in many cases they are very badly written.
to_image(5, dict_train)
to_image(3, dict_train)
# Check the singular values of the different classes.
# In class "1", we observe that the first singular value
# is significantly larger than the remaining singular values.
# Consequently, we could use fewer basis vectors for that class.
# It should also be noted that this is the class which
# presents the highest F1-score.
for i in range(0, 10):
    print(i, svd_tranform(10, dict_train[i])[1])
    print()
y_pred = svd_classifier_adj(X_test, 15, svd_dict, 1)
print("Classification report for %s:\n%s\n"
% ("SVD basis classification algorithm",
classification_report(y_test, y_pred)))
digits = sorted(set(list(y_test.ravel())))
confusion = confusion_matrix(y_test, y_pred, labels=digits)
print(confusion)
y_pred = svd_classifier_adj(X_test, 15, svd_dict, 2)
print("Classification report for %s:\n%s\n"
% ("SVD basis classification algorithm",
classification_report(y_test, y_pred)))
digits = sorted(set(list(y_test.ravel())))
confusion = confusion_matrix(y_test, y_pred, labels=digits)
print(confusion)
y_pred = svd_classifier_adj(X_test, 15, svd_dict, 3)
print("Classification report for %s:\n%s\n"
% ("SVD basis classification algorithm",
classification_report(y_test, y_pred)))
digits = sorted(set(list(y_test.ravel())))
confusion = confusion_matrix(y_test, y_pred, labels=digits)
print(confusion)
y_pred = svd_classifier_adj(X_test, 15, svd_dict, 4)
print("Classification report for %s:\n%s\n"
% ("SVD basis classification algorithm",
classification_report(y_test, y_pred)))
digits = sorted(set(list(y_test.ravel())))
confusion = confusion_matrix(y_test, y_pred, labels=digits)
print(confusion)
Consequently, according to the above experiments, it seems that it pays off to use fewer basis vectors for some of the classes. Indeed, more parsimonious models also help us avoid problems such as overfitting.
# The second stage is unnecessary 1028 times.
# We observe that the digits classified in the first stage
# of the algorithm usually belong to classes 0, 1 and 6. Indeed,
# those are the classes with the highest F1-score and accuracy,
# as we also observed above.
y_pred = svd_classifier_adj2(X_test, 15, svd_dict)
According to the following classification report and confusion matrix, this variant of the algorithm achieves results that are just as good. Additionally, this version can be slightly faster, since for many samples we only compute the relative residual with respect to the first singular vector of each class (a rough timing sketch is given after the confusion matrix).
print("Classification report for %s:\n%s\n"
% ("SVD basis classification algorithm",
classification_report(y_test, y_pred)))
digits = sorted(set(list(y_test.ravel())))
confusion = confusion_matrix(y_test, y_pred, labels=digits)
print(confusion)
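The speed advantage of the two-stage variant was only argued qualitatively above; a rough wall-clock comparison can be made with the time module imported at the top. This is a minimal sketch and the measured numbers depend on the machine.
# rough timing comparison of the plain and two-stage classifiers
# with 15 basis vectors; results are only indicative
start = time.time()
svd_classifier(X_test, 15, svd_dict)
print("plain classifier:     %.2f s" % (time.time() - start))
start = time.time()
svd_classifier_adj2(X_test, 15, svd_dict)
print("two-stage classifier: %.2f s" % (time.time() - start))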