import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
# Function to load data from file
def load_data(file_path):
    """Load a whitespace-delimited numeric table from a text file.

    Every non-blank line becomes one row; its whitespace-separated tokens are
    parsed with ``float``. Blank or whitespace-only lines (for example a
    trailing empty line at the end of the file) are skipped so they cannot
    contribute an empty row and make the result ragged.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A float ``numpy.ndarray`` with one row per non-blank line. A file
        with no data lines yields an empty array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a token cannot be parsed as a float, or if the rows
            do not all contain the same number of values.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        rows = [
            [float(token) for token in line.split()]
            for line in file
            if line.strip()  # ignore blank / whitespace-only lines
        ]
    # An explicit dtype makes ragged input fail loudly instead of silently
    # producing an object array.
    return np.array(rows, dtype=float)
# Names of the six feature files, read in this order. The order matters
# because the resulting matrices are later joined column-wise.
feature_files = [
    'mfeat-fou',
    'mfeat-fac',
    'mfeat-kar',
    'mfeat-pix',
    'mfeat-zer',
    'mfeat-mor',
]
# One 2-D array per feature file, in the same order as feature_files.
feature_data = list(map(load_data, feature_files))
# Join the six per-file feature matrices column-wise (one row per sample).
train_features = np.concatenate(feature_data, axis=1)
# Split the data into 1000 training examples and 1000 test examples.
# The labels 0..9 are assigned to consecutive runs of 200 rows each; this
# assumes the files list the samples grouped by class in that order --
# TODO confirm against the data set description.
# A fixed random_state makes the shuffled split (and every result derived
# from it) reproducible from run to run.
train_features, test_features, train_labels, test_labels = train_test_split(
    train_features,
    np.repeat(np.arange(10), 200),
    test_size=1000,
    train_size=1000,
    random_state=42,
)
# Cut the training rows into two halves of 500 without reshuffling, so the
# first half lines up with train_labels[:500] and the second with
# train_labels[500:].
train_features_known1, train_features_known2 = train_test_split(
    train_features, test_size=500, train_size=500, shuffle=False
)
# Hide one group of columns in each half of the training rows.
# Column 356 is the boundary between the two groups; given the order in which
# the feature files are concatenated, this is presumably the combined width
# of the first three files -- TODO confirm against the data.
feature_split_index = 356
# First half: keep the columns before the boundary, mark the rest as missing.
train_missing_known1 = np.full(
    train_features_known1[:, feature_split_index:].shape, np.nan
)
train_features_known1 = train_features_known1[:, :feature_split_index]
train_combined_known1 = np.hstack((train_features_known1, train_missing_known1))
# Second half: mark the columns before the boundary as missing, keep the rest.
train_missing_known2 = np.full(
    train_features_known2[:, :feature_split_index].shape, np.nan
)
train_features_known2 = train_features_known2[:, feature_split_index:]
train_combined_known2 = np.hstack((train_missing_known2, train_features_known2))
# Stack both halves again so the matrix has the full column layout, with NaN
# wherever a value was hidden.
train_combined = np.concatenate(
    (train_combined_known1, train_combined_known2), axis=0
)
# Fit a mean imputer on the combined training matrix: NaN entries are treated
# as missing and replaced by the mean of the values present in that column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(train_combined)
# Fill the hidden columns of each training half with the fitted column means.
imputed_train_features1, imputed_train_features2 = (
    imputer.transform(half)
    for half in (train_combined_known1, train_combined_known2)
)
# Train the first model on the first 3 known features