```python
# Engineered By: Noelle Milton Vega (PRISMALYTICS, LLC).
# Import scikit-learn, numpy, matplotlib and other python3 modules necessary for this Linear Regression project.
import sklearn.datasets
import sklearn.linear_model
import sklearn.utils
import matplotlib.pyplot
import numpy as np
```

```python
digit_a = 3
digit_b = 7
# BUGFIX (x2):
#   1. fetch_mldata() was deprecated in scikit-learn 0.20 and removed in 0.22
#      (mldata.org shut down); fetch_openml() is the supported replacement.
#      as_frame=False yields plain numpy arrays:
#        mnist.data   : dataset samples, shape (70000, 784)
#        mnist.target : corresponding labels, shape (70000,) -- OpenML returns
#                       these as *strings*, hence the explicit cast below.
#   2. The original cast used `numpy.uint8`, a NameError: only the `np` alias
#      is imported at the top of this file.
mnist = sklearn.datasets.fetch_openml("mnist_784", version=1, as_frame=False)
targets = mnist.target.astype(np.uint8)  # uint8 is sufficient and helpful later (e.g. np.bincount)
samples = np.vstack([mnist.data[targets == digit_a],   # Select all samples labeled digit_a (3).
                     mnist.data[targets == digit_b]])  # Select all samples labeled digit_b (7).
# Extract labels that 'positionally' correspond to the samples stacked above.
labels = np.hstack([targets[targets == digit_a],
                    targets[targets == digit_b]])
# The stacking above places every 'digit_a' entry before every 'digit_b' entry, so
# shuffle the deck now: any later Training/Testing partition will then contain a
# roughly 50/50 mix of the two binary classes. Note: resample() is the same as shuffle().
(samples, labels) = sklearn.utils.shuffle(samples, labels)
```

```python
# Sanity report: shapes and distinct classes of the extracted two-digit subset.
n_samples, n_features = samples.shape
print("Number of 'samples' in the extracted dataset (samples.shape[0]):", n_samples)
print("Number of 'labels' in extracted dataset (labels.shape[0]):", labels.shape[0])
print("Number of 'features' in each sample (samples.shape[1]):", n_features)
print("Distinct classes in the 'labels' vector:", repr(np.unique(labels)))
```

```python
# Visualize one randomly chosen sample as the 28x28 grayscale image it encodes.
sample_idx = np.random.randint(0, samples.shape[0])
print("This is an image-plot of a randomly selected vector/sample representing a handwritten digit: %d" % (labels[sample_idx]))
matplotlib.pyplot.rc("image", cmap="binary")
digit_image = samples[sample_idx].reshape(28, 28)
matplotlib.pyplot.matshow(digit_image)
```

```python
# No cross-validation in this walkthrough, so at minimum re-shuffle on each run,
# then partition 85%/15% into Training/Testing sets.
(samples, labels) = sklearn.utils.shuffle(samples, labels)
split_at = int(samples.shape[0] * 0.85)
#
training_samples, testing_samples = samples[:split_at, :], samples[split_at:, :]
training_labels, testing_labels = labels[:split_at], labels[split_at:]
#
# Per-class counts verify each partition holds a roughly even mix of both digits.
# (See doc for 'scipy.stats.itemfreq()' as an alternative to np.bincount().)
train_counts = np.bincount(training_labels)
test_counts = np.bincount(testing_labels)
print('Quantity of Class-A/Class-B items in training set: %d / %d' % (train_counts[digit_a], train_counts[digit_b]))
print('Quantity of Class-A/Class-B items in testing set: %d / %d' % (test_counts[digit_a], test_counts[digit_b]))
```

```python
# Train a binary logistic-regression classifier on the training partition.
# max_iter is raised from the default (100): on 784 unscaled raw-pixel features
# the default solver routinely hits the iteration cap before converging and
# emits a ConvergenceWarning, leaving a partially-optimized model.
clf = sklearn.linear_model.LogisticRegression(max_iter=1000)
clf.fit(training_samples, training_labels)
```

```python
# Training accuracy gauges fit; testing accuracy gauges generalization.
# NOTE(review): training accuracy is typically high for this easy 3-vs-7 task,
# but (contrary to the original comment) it is NOT guaranteed to be 100%.
train_accuracy = clf.score(training_samples, training_labels) * 100
test_accuracy = clf.score(testing_samples, testing_labels) * 100
print('Accuracy for training set: %.2f%%' % train_accuracy)
print('Accuracy for testing set: %.2f%%' % test_accuracy)
```

```python
# Compare predictions against ground truth on the held-out test set and collect
# the positions of every wrong prediction. np.flatnonzero(mask) is equivalent
# to mask.nonzero()[0] without the tuple-unpacking step.
predictions = clf.predict(testing_samples)
misclassified_indices = np.flatnonzero(predictions != testing_labels)
print('The number of incorrect/mis-classified Test sample predictions is:', misclassified_indices.size)
print('The index location of these incorrect/mis-classified Test samples are:\n\t', misclassified_indices)
```

```python
# Pick ONE misclassified test sample at random and inspect it.
# BUGFIX (x2):
#   1. The original drew the index with np.random.randint(0, n, 1), which returns
#      a size-1 *array*; %i-formatting of arrays is deprecated/removed in modern
#      numpy. Omitting the size argument yields a plain scalar.
#   2. The original plotted via `plt.matshow`, but `plt` is never defined in this
#      file -- matplotlib is imported as `matplotlib.pyplot` (NameError).
a_misclassified_index = misclassified_indices[np.random.randint(0, misclassified_indices.size)]
true_label = testing_labels[a_misclassified_index]
# predict() expects a 2-D (n_samples, n_features) array, so reshape the single
# 784-element row to (1, 784) and take the first (only) prediction.
pred_label = clf.predict(testing_samples[a_misclassified_index].reshape(1, -1))[0]
print('True -vs- Predicted classification for mis-classified test sample at index number %i: [True: %i / Pred: %i]' %
      (a_misclassified_index, true_label, pred_label))
print('We plot the image here to see if there are visual clues as to why mis-classification occurred:\n')
matplotlib.pyplot.matshow(testing_samples[a_misclassified_index].reshape(28, 28))
```