message (2)

txt

School

University Of Arizona *

*We aren’t endorsed by this school

Course

BIOCHEMIST

Subject

Computer Science

Date

Jan 9, 2024

Type

txt

Pages

3

Uploaded by MasterEnergyDragonfly27

Report
"""Train and compare four text classifiers on the EDOS labelled dataset.

Models: Logistic Regression, Multinomial Naive Bayes, Ridge (L2-penalized
logistic regression) and Lasso (L1-penalized logistic regression), each
evaluated on TF-IDF and Bag-of-Words features using the dataset's own
train/test 'split' column.
"""

import re

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import (
    ENGLISH_STOP_WORDS,
    CountVectorizer,
    TfidfVectorizer,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB

# Common Unicode emoji / pictograph ranges, compiled once.
_EMOJI_RE = re.compile(
    "["
    "\U0001F300-\U0001FAFF"  # symbols, pictographs, emoticons, transport
    "\U00002600-\U000027BF"  # misc symbols and dingbats
    "\U0001F1E6-\U0001F1FF"  # regional indicator (flag) pairs
    "\u2B00-\u2BFF"          # misc symbols and arrows
    "]+"
)

# Longest suffixes first so e.g. "ies" wins over "es"/"s".
_SUFFIXES = ("ies", "ing", "ed", "es", "ly", "s")


def remove_emojis(text):
    """Return *text* with emoji characters stripped.

    NOTE(review): the original script called this helper but never defined
    it; this implementation covers the common emoji code-point ranges.
    """
    return _EMOJI_RE.sub("", text)


def remove_stopwords(text):
    """Drop common English stopwords (sklearn's built-in list) from *text*."""
    return " ".join(w for w in text.split() if w not in ENGLISH_STOP_WORDS)


def lemmatize_and_stem(text):
    """Crude suffix-stripping normalizer for each whitespace token.

    NOTE(review): stand-in for the undefined helper the original called.
    It avoids adding an NLP dependency; swap in NLTK's WordNetLemmatizer +
    PorterStemmer if nltk is available to the project.
    """
    def stem(word):
        for suffix in _SUFFIXES:
            # Only strip when a meaningful stem (>2 chars) remains.
            if word.endswith(suffix) and len(word) - len(suffix) > 2:
                return word[: -len(suffix)]
        return word

    return " ".join(stem(w) for w in text.split())


def train_evaluate_model(model, X_train, Y_train, X_test, Y_test):
    """Fit *model* on the training data and score it on the test data.

    Returns a 4-tuple (accuracy, precision, recall, f1); precision,
    recall and F1 are weighted averages across classes.
    """
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    return (
        accuracy_score(Y_test, y_pred),
        precision_score(Y_test, y_pred, average="weighted"),
        recall_score(Y_test, y_pred, average="weighted"),
        f1_score(Y_test, y_pred, average="weighted"),
    )


def main():
    # Step 1: load the data.
    data = pd.read_csv("edos_labelled_data.csv")

    # Step 2: clean the text (lowercase -> de-emoji -> de-stopword -> stem).
    processed = data["text"].str.lower()
    processed = processed.apply(remove_emojis)
    processed = processed.apply(remove_stopwords)
    data["processed_text"] = processed.apply(lemmatize_and_stem)

    # Step 3: split FIRST, then fit the vectorizers on the training split
    # only.  The original fit on the full corpus, leaking test-set
    # vocabulary and IDF statistics into the features; it also built the
    # Bag-of-Words matrix twice and materialized dense DataFrames it
    # never used.
    train_data = data[data["split"] == "train"]
    test_data = data[data["split"] == "test"]

    tfidf = TfidfVectorizer()
    X_train_tf_idf = tfidf.fit_transform(train_data["processed_text"])
    X_test_tf_idf = tfidf.transform(test_data["processed_text"])

    bow = CountVectorizer()
    X_train_bow = bow.fit_transform(train_data["processed_text"])
    X_test_bow = bow.transform(test_data["processed_text"])

    y_train = train_data["label"]
    y_test = test_data["label"]

    # One fresh estimator per (model, feature-set) run so fits are independent.
    models = (
        ("Logistic Regression", lambda: LogisticRegression()),
        ("Naive Bayes", lambda: MultinomialNB()),
        ("Ridge", lambda: LogisticRegression(penalty="l2")),
        ("Lasso", lambda: LogisticRegression(penalty="l1", solver="liblinear")),
    )
    feature_sets = (
        ("TF-IDF", X_train_tf_idf, X_test_tf_idf),
        ("BoW", X_train_bow, X_test_bow),
    )
    for model_name, make_model in models:
        print("")
        for feat_name, X_tr, X_te in feature_sets:
            acc, prec, rec, f1 = train_evaluate_model(
                make_model(), X_tr, y_train, X_te, y_test
            )
            print(
                f"{model_name} with {feat_name} - Accuracy:", acc,
                "Precision:", prec, "Recall:", rec, "F1 Score:", f1,
            )


if __name__ == "__main__":
    main()
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help