D212_JBT_Task2

.pdf

School

Western Governors University *

*We aren’t endorsed by this school

Course

D212

Subject

Computer Science

Date

Jun 21, 2024

Type

pdf

Pages

14

Uploaded by MajorMouseMaster1147

# In [1]:
# Standard library
from warnings import simplefilter

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.api.types import CategoricalDtype
from sklearn.decomposition import PCA
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Ignore all future warnings for the remainder of the session.
simplefilter(action="ignore", category=FutureWarning)

# The CSV's first column is already an index; index_col=0 makes pandas use
# that column as the DataFrame index when loading the file.
df = pd.read_csv("./medical_clean.csv", index_col=0)

# In [2]:
# Check data types and count of values
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 10000 entries, 1 to 10000 Data columns (total 49 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer_id 10000 non-null object 1 Interaction 10000 non-null object 2 UID 10000 non-null object 3 City 10000 non-null object 4 State 10000 non-null object 5 County 10000 non-null object 6 Zip 10000 non-null int64 7 Lat 10000 non-null float64 8 Lng 10000 non-null float64 9 Population 10000 non-null int64 10 Area 10000 non-null object 11 TimeZone 10000 non-null object 12 Job 10000 non-null object 13 Children 10000 non-null int64 14 Age 10000 non-null int64 15 Income 10000 non-null float64 16 Marital 10000 non-null object 17 Gender 10000 non-null object 18 ReAdmis 10000 non-null object 19 VitD_levels 10000 non-null float64 20 Doc_visits 10000 non-null int64 21 Full_meals_eaten 10000 non-null int64 22 vitD_supp 10000 non-null int64 23 Soft_drink 10000 non-null object 24 Initial_admin 10000 non-null object 25 HighBlood 10000 non-null object 26 Stroke 10000 non-null object 27 Complication_risk 10000 non-null object 28 Overweight 10000 non-null object 29 Arthritis 10000 non-null object 30 Diabetes 10000 non-null object 31 Hyperlipidemia 10000 non-null object 32 BackPain 10000 non-null object 33 Anxiety 10000 non-null object 34 Allergic_rhinitis 10000 non-null object 35 Reflux_esophagitis 10000 non-null object 36 Asthma 10000 non-null object 37 Services 10000 non-null object 38 Initial_days 10000 non-null float64 39 TotalCharge 10000 non-null float64 40 Additional_charges 10000 non-null float64 41 Item1 10000 non-null int64 42 Item2 10000 non-null int64 43 Item3 10000 non-null int64 44 Item4 10000 non-null int64 45 Item5 10000 non-null int64 46 Item6 10000 non-null int64 47 Item7 10000 non-null int64 48 Item8 10000 non-null int64 dtypes: float64(7), int64(15), object(27) memory usage: 3.8+ MB In [3]: # Data Cleaning Code from D206 # Replace the city-specific values with the standard time zones
# Map each city-specific tz database name to a standard time-zone label.
_TIMEZONE_LABELS = {
    "America/Chicago": "America - Central",
    "America/New_York": "America - Eastern",
    "America/Los_Angeles": "America - Pacific",
    "America/Denver": "America - Mountain",
    "America/Detroit": "America - Eastern",
    "America/Indiana/Indianapolis": "America - Eastern",
    "America/Phoenix": "America - Mountain",
    "America/Boise": "America - Mountain",
    "America/Anchorage": "America - Alaskan",
    "America/Puerto_Rico": "America - Atlantic",
    "Pacific/Honolulu": "America - Hawaii-Aleutian",
    "America/Menominee": "America - Central",
    "America/Nome": "America - Alaskan",
    "America/Indiana/Vincennes": "America - Eastern",
    "America/Sitka": "America - Alaskan",
    "America/Kentucky/Louisville": "America - Eastern",
    "America/Toronto": "America - Eastern",
    "America/Indiana/Tell_City": "America - Central",
    "America/Indiana/Marengo": "America - Eastern",
    "America/North_Dakota/Beulah": "America - Central",
    "America/Indiana/Winamac": "America - Eastern",
    "America/Indiana/Vevay": "America - Eastern",
    "America/North_Dakota/New_Salem": "America - Central",
    "America/Indiana/Knox": "America - Eastern",
    "America/Yakutat": "America - Alaskan",
    "America/Adak": "America - Hawaii-Aleutian",
}

# Assign the result back rather than calling replace(..., inplace=True) on
# `df.TimeZone`: an in-place method on a column pulled out of the frame is
# chained assignment, which pandas warns about (a FutureWarning) and which
# does not modify `df` once Copy-on-Write is enabled.
df["TimeZone"] = df["TimeZone"].replace(_TIMEZONE_LABELS)

# Convert the column type of zip from int to string, then front-fill with
# zeroes so every code is five characters wide.
df["Zip"] = df["Zip"].astype("str").str.zfill(5)

# Convert the string (object) columns listed here to category dtype.
_category_columns = [
    "TimeZone",
    "Area",
    "Marital",
    "Gender",
    "Initial_admin",
    "Complication_risk",
    "Services",
]
for _col in _category_columns:
    df[_col] = df[_col].astype("category")

# Convert Initial_days from float to integer.
# NOTE: casting to int64 drops the fractional part of each value.
df["Initial_days"] = df["Initial_days"].astype("int64")

# Recode Yes/No answers as 1/0. The result is an integer column, not a true
# bool dtype; any value other than "Yes"/"No" would become NaN under map().
bool_mapping = {"Yes": 1, "No": 0}

_yes_no_columns = [
    "ReAdmis",
    "Soft_drink",
    "HighBlood",
    "Stroke",
    "Overweight",
    "Arthritis",
    "Diabetes",
    "Hyperlipidemia",
    "BackPain",
    "Anxiety",
]
for _col in _yes_no_columns:
    df[_col] = df[_col].map(bool_mapping)
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help