import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer, KNNImputer
# Function to load data from file
def load_data(file_path):
    """Load a whitespace-delimited numeric text file into a NumPy array.

    Each non-blank line of the file becomes one row; every
    whitespace-separated token on that line is parsed with ``float``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``numpy.ndarray`` built from the parsed rows (2-D when every
        row has the same number of values).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a token cannot be parsed as a float.
    """
    with open(file_path, 'r') as file:
        # Blank lines (e.g. a trailing newline at end of file) would turn
        # into empty rows and make the array ragged, so they are skipped.
        rows = [
            [float(token) for token in line.split()]
            for line in file
            if line.strip()
        ]
    return np.array(rows)
# Read every feature file into its own array, in the order listed here
feature_files = [
    'mfeat-fou',
    'mfeat-fac',
    'mfeat-kar',
    'mfeat-pix',
    'mfeat-zer',
    'mfeat-mor',
]
feature_data = list(map(load_data, feature_files))
# Join the six per-file arrays along axis 1 (column-wise).
# Despite its name, train_features holds the complete data set at this
# point; it is only narrowed to the training rows by the split below.
train_features = np.concatenate(feature_data, axis=1)
# Randomly partition the rows into 1000 training and 1000 test examples.
# Labels are generated rather than read from disk: the digits 0-9, each
# repeated 200 times in order.
# NOTE(review): this assumes the rows are grouped by class with 200
# samples per digit - confirm against the data files.
train_features, test_features, train_labels, test_labels = train_test_split(
    train_features,
    np.repeat(np.arange(10), 200),
    train_size=1000,
    test_size=1000,
    random_state=10,
)
# Cut the test matrix at column 356: the columns before it are kept,
# the columns from it onward are treated as unavailable.
# NOTE(review): 356 is presumably the combined width of the first three
# feature files (fou, fac, kar) - confirm against the data.
test_features_part = test_features[:, :356]
# Simulate the missing features: a NaN block with the same shape as the
# dropped columns, to be filled in later by an imputer.
test_features_missing = np.full(test_features[:, 356:].shape, np.nan)
# Re-attach the NaN block so the test matrix keeps its original width
test_features_combined = np.hstack((test_features_part, test_features_missing))
# Imputation strategies to compare, keyed by a short label.
# Each value is a fresh, unfitted scikit-learn imputer.
imputation_methods = {
    'mean': SimpleImputer(strategy='mean'),
    'median': SimpleImputer(strategy='median'),
    'mode': SimpleImputer(strategy='most_frequent'),
    # Replace NaNs with 0
    'constant': SimpleImputer(strategy='constant', fill_value=0),
    'KNN': KNNImputer(n_neighbors=3),
}
results = {}
for method, imputer in imputation_methods.items():
# Fit the imputer on the training set
imputer.fit(train_features)
# Impute missing values in the test set
imputed_test_features = imputer.transform(test_features_combined)
# Train a classifier on the training set with all available features
clf = RandomForestClassifier(n_estimators=30, random_state=10)
clf.fit(train_features, train_labels)