vertopal.com_mnist_logreg

pdf

School

Stony Brook University *

*We aren’t endorsed by this school

Course

512

Subject

Computer Science

Date

Dec 6, 2023

Type

pdf

Pages

14

Uploaded by BailiffHeat15921

Report
#common packages we basically always need import numpy as np import matplotlib.pyplot as plt import scipy.io as sio from time import time #load the MNIST dataset with binary pixel values data = sio.loadmat( 'mnist.mat' ) print (data) Xtrain, Xtest = data[ 'trainX' ].astype( float ), data[ 'testX' ].astype( float ) ytrain, ytest = data[ 'trainY' ][ 0 ], data[ 'testY' ][ 0 ] #pull and plot some samples for k in range ( 9 ): plot_data = Xtrain[k,:] plot_data = np.reshape(plot_data,( 28 , 28 )) plot_label = ytrain[k] plt.subplot( 3 , 3 ,k + 1 ) plt.imshow(plot_data) plt.title(plot_label) plt.tight_layout() {'__header__': b'MATLAB 5.0 MAT-file Platform: posix, Created on: Wed Oct 18 19:00:09 2017', '__version__': '1.0', '__globals__': [], 'testX': array([[0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], ..., [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0]], dtype=uint8), 'testY': array([[7, 2, 1, ..., 4, 5, 6]], dtype=uint8), 'trainY': array([[5, 0, 4, ..., 5, 6, 8]], dtype=uint8), 'trainX': array([[0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], ..., [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}
#load the MNIST dataset with binary pixel values data = sio.loadmat( 'mnist.mat' ) print (data) select_train = np.logical_or(np.equal(ytrain, 4 ),np.equal(ytrain, 9 )) select_test = np.logical_or(np.equal(ytest, 4 ),np.equal(ytest, 9 )) Xtrain = Xtrain[select_train,:] Xtest = Xtest[select_test,:] ytrain = np.sign(np.equal(ytrain[select_train], 4. ) - .5 ) ytest = np.sign(np.equal(ytest[select_test], 4. ) - .5 ) m,n = Xtrain.shape mt = Xtest.shape[ 0 ] {'__header__': b'MATLAB 5.0 MAT-file Platform: posix, Created on: Wed Oct 18 19:00:09 2017', '__version__': '1.0', '__globals__': [], 'testX': array([[0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], ...,
[0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0]], dtype=uint8), 'testY': array([[7, 2, 1, ..., 4, 5, 6]], dtype=uint8), 'trainY': array([[5, 0, 4, ..., 5, 6, 8]], dtype=uint8), 'trainX': array([[0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], ..., [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)} (b) Coding gradient descent. Do not use scikit-learn or other built in tools for this exercise. Please only use the packages that are already imported in the notebook. 1 Open mnist logreg.ipynb. We will use logistic regression to direntiate 4's from 9's, a notoriously tricky problem. Run the rst box to see what the data looks like. In the second box I have set up the problem for you by pulling out the train and test set, selecting out only the data samples related to 4's and 9's. I have not altered the data in any other way. While other normalization tricks will help you improve the accuracy, for the purposes of this exercise we will forgo them, so that it's easy to compare everyone's solutions. Fill in the next box by providing code that will return the loss function value and the gradient. Make sure that everything is normalized, e.g. don't forget the 1=m term in the front of our loss function. Run the script. If done correctly, you should see 45.192, 12343.177 Write a script that returns the classication accuracy given . Use gradient descent to minimize the logistic loss for this classication problem. Use a step size of 10􀀀6. (1.5 pts) Run for 1500 iterations. In your report, give the plot of the train loss and train/test misclassi- cation rate, plotted as a function of iterations. Report the nal train and test accuracy values. def sigmoid(val): return 1 / ( 1 + np.exp( - val)) def getLossFunction(theta): val = ytrain * np.dot(Xtrain, theta) return - np.mean(np.log(sigmoid(val))) def getGradient(theta): val = ytrain * np.dot(Xtrain, theta) return - np.mean(( 1 - sigmoid(val))[:, np.newaxis] * ytrain[:, np.newaxis] * Xtrain, axis = 0 ) def predict(theta,x): return np.where(np.dot(x, theta) >= 0 , 1 , - 1 ) def classification_accuracy(theta, x, y): y_pred = np.array(predict(theta, x)).reshape( - 1 ) y_true = np.array(y).reshape( - 1 ) tp = sum ( 1 for pred, true in zip (y_pred, y_true) if pred == true) return tp / len (y)
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help
def gradient_descent(theta, alpha, num_iters): theta_values = [] loss_values = [] for i in range (num_iters): theta_values.append(theta) theta = theta - (alpha * getGradient(theta)) loss_values.append(getLossFunction(theta)) return theta_values, loss_values # TEST SCRIPT. DO NOT MODIFY! theta = np.linspace( - .1 , .1 ,n) print ( 'Check number: ' , round (getLossFunction(theta), 3 ), round (np. sum (getGradient(theta)), 3 )) Check number: 45.192 12343.177 alpha = 1e-6 n_iterations = 1500 theta_0 = np.zeros(Xtrain.shape[ 1 ]) theta_values, loss_values = gradient_descent(theta_0, alpha, n_iterations) plt.figure(figsize = ( 12 , 5 )) plt.subplot( 1 , 2 , 1 ) plt.plot(loss_values) plt.title( 'Training Loss vs. Iterations' ) plt.xlabel( 'Iterations' ) plt.ylabel( 'Loss' ) Text(0, 0.5, 'Loss')
train_miss = [ 1 - classification_accuracy(theta, Xtrain, ytrain) for theta in theta_values] test_miss = [ 1 - classification_accuracy(theta, Xtest, ytest) for theta in theta_values] plt.plot(train_miss, label = 'Train' ) plt.plot(test_miss, label = 'Test' ) plt.title( 'Misclassification Rate vs. Iterations' ) plt.xlabel( 'Iterations' ) plt.ylabel( 'Misclassification Rate' ) plt.legend() plt.show()
#Computing Train and Test Accuracy train_acc = classification_accuracy(theta_values[ - 1 ], Xtrain, ytrain) test_acc = classification_accuracy(theta_values[ - 1 ], Xtest, ytest) print ( f"Final Training Accuracy: { train_acc * 100 :.2f} %" ) print ( f"Final Test Accuracy: { test_acc * 100 :.2f} %" ) Final Training Accuracy: 96.45% Final Test Accuracy: 96.63% 1(c) Coding stochastic gradient descent. Do not use scikit-learn or other built in tools for this exercise. Please only use the packages that are already imported in the notebook. Now, ll in the next box a function that takes in and a minibatch B as either a list of numbers or as an np.array, and returns the minibatch gradient rBf() = 1 jBj X i2B rfi() where fi() is the contribution to the gradient from datapoint i: fi = 􀀀log((yixTi )): Run the script. If done correctly, you should see the number 5803.5 printed out. def getStochGradient(theta, minibatch): grad = np.zeros_like(theta) for i in minibatch: y_i = ytrain[i] x_i = Xtrain[i]
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help
phi = - ( 1 - sigmoid(y_i * np.dot(x_i, theta))) * y_i * x_i grad = grad + phi grad = grad / len (minibatch) return grad # TEST SCRIPT. DO NOT MODIFY! theta = np.linspace( - .1 , .1 ,n) print ( 'Check number: ' ,np. sum (getStochGradient(theta,[ 1 , 4 , 6 , 2 ]))) Check number: 5803.5 def stochastic_gradient_descent(theta, alpha, n_iterations, minibatch_size): theta_values = [] loss_values = [] iteration = 0 l = len (ytrain) while iteration < n_iterations: indexes = np.arange(l) np.random.shuffle(indexes) for i in range ( 0 , l, minibatch_size): minibatch_indexes = indexes[i:i + minibatch_size] theta_values.append(theta) theta = theta - alpha * getStochGradient(theta, minibatch_indexes) if iteration % 100 == 0 : loss_values.append(getLossFunction(theta)) iteration += 1 if iteration == n_iterations: break return theta_values, loss_values def new_gradient_descent(theta, alpha, num_iters): theta_values = [] loss_values = [] for i in range (num_iters): theta_values.append(theta) theta = theta - (alpha * getGradient(theta))
if i % 10 == 0 : loss_values.append(getLossFunction(theta)) return theta_values, loss_values import time n_iterations = 50000 theta_0 = np.linspace( - .1 , .1 , n) #For Gradient Descent start_time = time.time() gd_theta_values, gd_loss_values = new_gradient_descent(theta_0, alpha, n_iterations) print ( "Time taken to run Gradient Descent = " , round (time.time() - start_time, 2 )) Time taken to run Gradient Descent = 3866.55 #For Stochastic Gradient Descent start_time = time.time() sgd_theta_values, sgd_loss_values = stochastic_gradient_descent(theta_0, alpha, n_iterations, 50 ) print ( "Time taken to run Stochastic Gradient Descent = " , round (time.time() - start_time, 2 )) Time taken to run Stochastic Gradient Descent = 27.73 (d) (1.5 pts) Write a script to run stochastic gradient descent over logistic regression. When coding up the minibatching, make sure you cycle through an entire training set once before moving on to the next epoch. Additionally, use time() to record the runtime, and compare the performance of gradient descent and stochastic gradient descent, using a minibatch size of 50 data samples and running for 50000 iterations. Return a plot that compares the objective loss, train accuracy, and test accuracy between the two optimization methods, as a function of runtime. Comment on the pros and cons of the two methods. gd_iterations = np.arange( 0 , n_iterations, 10 )[: len (gd_loss_values)] sgd_iterations = np.arange( 0 , n_iterations, 100 )[: len (sgd_loss_values)] plt.figure(figsize = ( 12 , 8 )) #Plotting loss values plt.subplot( 3 , 1 , 1 ) plt.plot(gd_iterations, gd_loss_values, label = 'Gradient Descent' ) plt.plot(sgd_iterations, sgd_loss_values, label = 'Stochastic Gradient Descent' ) plt.title( "Loss vs. Iterations" ) plt.legend()
<matplotlib.legend.Legend at 0x1db816a7550> #Plotting Train Accuracy #gd_train_acc = [classification_accuracy(theta, Xtrain, ytrain) for theta in gd_theta_values][:len(gd_loss_values)] sgd_train_acc = [classification_accuracy(theta, Xtrain, ytrain) for theta in sgd_theta_values][: len (sgd_loss_values)] plt.plot(gd_iterations, gd_train_acc, label = "Gradient Descent" ) plt.plot(sgd_iterations, sgd_train_acc, label = "Stochastic Gradient Descent" ) plt.title( "Train Accuracy vs. Iterations" ) plt.legend() <matplotlib.legend.Legend at 0x1db9eab0d50> #Plotting Test Accuracy gd_test_acc = [classification_accuracy(theta, Xtest, ytest) for theta in gd_theta_values][: len (gd_loss_values)] sgd_test_acc = [classification_accuracy(theta, Xtest, ytest) for theta in sgd_theta_values][: len (sgd_loss_values)] plt.plot(gd_iterations, gd_test_acc, label = "Gradient Descent" ) plt.plot(sgd_iterations, sgd_train_acc, label = "Stochastic Gradient Descent" ) plt.title( "Test Accuracy vs. Iterations" ) plt.legend() <matplotlib.legend.Legend at 0x1db94a07d50>
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help
Here are the pros and cons of both methods: Gradient Descent: Pros: 1. Guaranteed Convergence: Given certain conditions (convexity of the loss function and a proper choice of learning rate), gradient descent is guaranteed to converge to the global minimum. 2. Stable Convergence: Since it considers the entire dataset for each update, it moves in a direction that is a true reflection of the overall landscape of the loss function, which can lead to stable convergence. 3. Deterministic Updates: The updates to the model parameters are deterministic and predictable, making it easier to debug and analyze the optimization process. Cons: 1. Computational Intensity: Computing gradients for the entire dataset can be computationally intensive, especially when dealing with large datasets. This makes gradient descent slow and sometimes impractical for big data problems.
Stochastic Gradient Descent (SGD): Pros: 1. Faster Convergence: Since SGD updates the model parameters using only one training example at a time, it converges faster per iteration, especially for large datasets. Each iteration is much quicker. 2. Better Generalization: The stochastic nature of updates introduces noise, which can help the algorithm escape local minima and find better generalizable solutions, especially when the data is noisy or has redundant patterns. 3. Real time Learning: SGD can be used for online learning, where the model is updated in real-time as new data arrives, making it suitable for dynamic or streaming data scenarios. Cons: 1. Noisy Updates: The noisy updates in SGD can cause the optimization process to oscillate around the minimum or even overshoot it, making it harder to converge to the optimal solution. 2. No Convergence Guarantee: Due to its stochastic nature, SGD does not guarantee convergence to the global minimum, especially when the learning rate is not properly tuned. It might get stuck in local minima or saddle points. 3. Learning Rate Tuning: Choosing an appropriate learning rate is crucial for the convergence of SGD. A too high or too low learning rate can significantly affect the optimization process. 3(d) Coding. (1 pt) Run multiclass logistic regression on MNIST dataset, against each of the 10 classes. While usually we pick a stepsize of 2/L, I have tried this and found a larger stepsize of 10^-5 will work well. Use this stepsize and run for 500 iterations, or however many you need to see reasonable \working" behavior. Show the train/test loss plot and the train/test classification plot. X_train = data[ 'trainX' ].astype( float ) X_test = data[ 'testX' ].astype( float ) y_train = data[ 'trainY' ][ 0 ] y_test = data[ 'testY' ][ 0 ] train_acc = [] train_loss = [] test_acc = [] test_loss = [] def softmax(x): val = np.exp(x - np. max (x, axis = 1 , keepdims = True )) return val / val. sum (axis = 1 , keepdims = True ) def compute_loss(theta, X, y):
val = softmax(np.dot(X, theta)) s = X.shape[ 0 ] loss = - np. sum (np.log(val[ range (s), y.astype( int )])) / s return loss def gradient(theta, X, y): val = softmax(np.dot(X, theta)) grad = np.dot(X.T, val - (y[:, np.newaxis] == np.arange(theta.shape[ 1 ]))) return grad / X.shape[ 0 ] def compute_accuracy(theta, X, y): y_pred = predict(theta, X) tp = sum ( 1 for pred, true in zip (y_pred, y) if pred == true) return tp / len (y) def train(X, y, lr, epochs): theta = np.zeros((X.shape[ 1 ], len (np.unique(y)))) for epoch in range (epochs): grad = gradient(theta, X, y) theta = theta - (lr * grad) train_acc.append(compute_accuracy(theta, X_train, y_train)) test_acc.append(compute_accuracy(theta, X_test, y_test)) train_loss.append(compute_loss(theta, X_train, y_train)) test_loss.append(compute_loss(theta, X_test, y_test)) if (epoch + 1 ) % 50 == 0 : print ( f'Epoch { epoch + 1 } / { epochs } , Loss: { compute_loss(theta, X, y) } ' ) return theta def predict(theta, X): return np.argmax(softmax(np.dot(X,theta)), axis = 1 ) final_theta = train(X_train, y_train, lr = pow ( 10 , - 5 ), epochs = 500 ) Epoch 50/500, Loss: 0.43303781750474707 Epoch 100/500, Loss: 0.377047373840449 Epoch 150/500, Loss: 0.35266453445686136 Epoch 200/500, Loss: 0.3381561296393 Epoch 250/500, Loss: 0.32823523531012044 Epoch 300/500, Loss: 0.3208876171754041 Epoch 350/500, Loss: 0.31515416111898353 Epoch 400/500, Loss: 0.31051141521398773 Epoch 450/500, Loss: 0.30664603309909005 Epoch 500/500, Loss: 0.30335748284346775
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help
plt.figure(figsize = ( 12 , 4 )) plt.plot( range ( 500 ), train_loss, label = 'Train Loss' ) plt.plot( range ( 500 ), test_loss, label = 'Test Loss' ) plt.xlabel( 'Epochs' ) plt.ylabel( 'Loss' ) plt.legend() <matplotlib.legend.Legend at 0x1dbbff95810> plt.figure() plt.plot(train_acc, label = 'Train Accuracy' ) plt.plot(test_acc, label = "Test Accuracy" ) plt.xlabel( 'Iterations' ) plt.ylabel( 'Accuracy' ) plt.title( 'Training and Test Classification Accuracy' ) plt.legend() plt.show()
# #you can use time() to measure runtime of things. # #sample runtime code: # def do_stuff_takes_nseconds(n): # wait(n) # start = time() # do_stuff_takes_nseconds(n) # print(time()-start, ' seconds to run code')