Digit_recog_Q2 copy

py

School

Pennsylvania State University *

*We aren’t endorsed by this school

Course

556

Subject

Economics

Date

Jan 9, 2024

Type

py

Pages

1

Uploaded by MateNightingalePerson923

Report
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer


# Function to load data from file
def load_data(file_path):
    """Parse a whitespace-separated numeric text file into a float array.

    Args:
        file_path: Path of the text file. Each non-blank line is one row of
            whitespace-separated numbers.

    Returns:
        numpy.ndarray with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a token cannot be converted to ``float``.
    """
    with open(file_path, 'r') as file:
        # Skip blank lines (e.g. a trailing empty line); otherwise they become
        # empty rows and the resulting array is ragged.
        rows = [
            [float(x) for x in line.split()]
            for line in file
            if line.strip()
        ]
    return np.array(rows)


# Load data from each feature file (one array per feature type)
feature_files = [
    'mfeat-fou',
    'mfeat-fac',
    'mfeat-kar',
    'mfeat-pix',
    'mfeat-zer',
    'mfeat-mor',
]
feature_data = [load_data(file) for file in feature_files]

# Concatenating the features from all 6 types column-wise
all_features = np.concatenate(feature_data, axis=1)

# Labels: digits 0-9, 200 consecutive rows each. This assumes the files hold
# 2000 rows ordered by class -- train_test_split raises if the lengths differ.
labels = np.repeat(np.arange(10), 200)

# Splitting the data into 1000 training examples and 1000 test examples,
# stratified so both halves keep the same class balance
train_features, test_features, train_labels, test_labels = train_test_split(
    all_features,
    labels,
    test_size=1000,
    train_size=1000,
    stratify=labels,
    random_state=42,
)

# Column count of the first 3 feature types, derived from the loaded files
# rather than hard-coded (the original script used the literal 356 here)
n_observed = sum(block.shape[1] for block in feature_data[:3])

# Splitting the test set into the first 3 feature types (kept) and the
# last 3 feature types (treated as missing)
test_features_part, test_features_missing = np.split(
    test_features, [n_observed], axis=1
)
print(np.shape(test_features_part))
print(np.shape(test_features_missing))

# Fit the imputer on ALL training columns: transform() below is given the
# full-width test matrix, so per-column means are needed for every column
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(train_features)

# Blank out the last 3 feature types in the test set with NaN, then let the
# imputer fill them with the training-set column means
test_features_missing = np.full(test_features_missing.shape, np.nan)
test_features_combined = np.concatenate(
    (test_features_part, test_features_missing), axis=1
)
test_features_imputer = imputer.transform(test_features_combined)

# Train a classifier on the training set with all available features
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(train_features, train_labels)

# Evaluate accuracy on the test set
predictions = clf.predict(test_features_imputer)
accuracy = accuracy_score(test_labels, predictions)

# Display the accuracy
print(f"Imputation accuracy using mean strategy: {accuracy:.4f}")
Discover more documents: Sign up today!
Unlock a world of knowledge! Explore tailored content for a richer learning experience. Here's what you'll get:
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help