vertopal

.txt

School

Kenyatta University *

*We aren’t endorsed by this school

Course

503

Subject

Statistics

Date

Nov 24, 2024

Type

txt

Pages

7

Uploaded by CoachDolphinMaster782

# Heart Disease Data Analysis using Pandas
# Exploratory Data Analysis of Environmental Sensor Measurements
#
# This listing holds two small pandas analyses that run one after the other
# when the file is executed as a script:
#   1. four reports on 'heart.csv' (age, cholesterol, blood pressure, diagnosis)
#   2. an exploratory pass over 'sensor_data.csv' (power and CO2 readings)
import pandas as pd

# Half-open age bands [lower, upper) used by the cholesterol report.
AGE_INTERVALS = (20, 30, 40, 50, 60, 100)


def clean_numeric_column(column):
    """Return *column* as numbers, with unparseable entries turned into NaN.

    A column that is already numeric is returned unchanged.  Otherwise the
    first decimal number found in each entry is kept.  Stripping every
    non-digit character (the previous approach) also removed the decimal
    point, so a value such as '233.0' was read as 2330.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    number_text = column.astype(str).str.extract(r"(\d+(?:\.\d+)?)", expand=False)
    return pd.to_numeric(number_text, errors="coerce")


def prepare_heart_data(data):
    """Return a copy of the heart data with tidy headers and a numeric 'chol'.

    Header whitespace is stripped so the diagnosis column can be addressed as
    'num' whether or not the CSV header carries trailing blanks.  Cleaning is
    done once, up front, so every report sees the same cholesterol values.
    """
    prepared = data.copy()
    prepared.columns = prepared.columns.str.strip()
    prepared["chol"] = clean_numeric_column(prepared["chol"])
    return prepared


def average_cholesterol_by_age(data, age_intervals=AGE_INTERVALS):
    """Return [(label, mean cholesterol)] for each half-open age band.

    Bands are formed from consecutive entries of *age_intervals*; a band with
    no usable cholesterol values yields NaN.
    """
    results = []
    for lower_age, upper_age in zip(age_intervals, age_intervals[1:]):
        in_band = (data["age"] >= lower_age) & (data["age"] < upper_age)
        results.append((f"[{lower_age}-{upper_age})", data.loc[in_band, "chol"].mean()))
    return results


def blood_pressure_by_extreme_cholesterol(data):
    """Return mean 'trestbps' for the lowest and highest 30% of cholesterol.

    Returns a ``(low_mean, high_mean)`` tuple.  Rows without a cholesterol
    value are ignored here only; *data* itself is left untouched so that
    later reports still see every patient.
    """
    with_chol = data.dropna(subset=["chol"])
    low_cutoff = with_chol["chol"].quantile(0.3)
    high_cutoff = with_chol["chol"].quantile(0.7)
    low_mean = with_chol.loc[with_chol["chol"] < low_cutoff, "trestbps"].mean()
    high_mean = with_chol.loc[with_chol["chol"] > high_cutoff, "trestbps"].mean()
    return low_mean, high_mean


def heart_disease_percentage(data, sex_code):
    """Return the percentage of patients with ``sex == sex_code`` and ``num == 1``.

    Returns NaN instead of dividing by zero when no patient has *sex_code*.
    """
    diagnoses = data.loc[data["sex"] == sex_code, "num"]
    if diagnoses.empty:
        return float("nan")
    return (diagnoses == 1).mean() * 100


def report_heart_disease(data):
    """Print the four heart-disease reports for an already prepared frame."""
    # Task 1: Average Age of Patients
    average_age = data["age"].mean()
    print("Report 1: Average Age of Patients")
    print(f"Average age of patients: {average_age:.2f} years")

    # Task 2: Average Cholesterol Levels in Age Intervals
    print("Report 2: Average Cholesterol Levels in Age Intervals")
    for age_interval, avg_chol in average_cholesterol_by_age(data):
        print(f"Average cholesterol for age interval {age_interval}: {avg_chol:.2f} mg/dl")

    # Task 3: Average Resting Blood Pressure in Extreme Cholesterol Levels
    average_trestbps_low_chol, average_trestbps_high_chol = (
        blood_pressure_by_extreme_cholesterol(data)
    )
    print("Report 3: Average Resting Blood Pressure in Extreme Cholesterol Levels")
    print(f"Average resting blood pressure for patients with low cholesterol: {average_trestbps_low_chol:.2f} mm Hg")
    print(f"Average resting blood pressure for patients with high cholesterol: {average_trestbps_high_chol:.2f} mm Hg")

    # Task 4: Percentage of Men and Women with Positive Heart Disease Diagnosis
    # (sex 0 is reported as women, sex 1 as men; num == 1 is a positive diagnosis)
    percentage_women_with_heart_disease = heart_disease_percentage(data, 0)
    percentage_men_with_heart_disease = heart_disease_percentage(data, 1)
    print("Report 4: Percentage of Men and Women with Positive Heart Disease Diagnosis")
    print(f"Percentage of women with heart disease: {percentage_women_with_heart_disease:.2f}%")
    print(f"Percentage of men with heart disease: {percentage_men_with_heart_disease:.2f}%")


# Output recorded in the original listing.  It predates this revision: Report 4
# was then computed only on rows that had a cholesterol value, so its figures
# may differ from what the code above prints.
#
#   Report 1: Average Age of Patients
#   Average age of patients: 47.83 years
#   Report 2: Average Cholesterol Levels in Age Intervals
#   Average cholesterol for age interval [20-30): 187.50 mg/dl
#   Average cholesterol for age interval [30-40): 239.63 mg/dl
#   Average cholesterol for age interval [40-50): 250.13 mg/dl
#   Average cholesterol for age interval [50-60): 256.21 mg/dl
#   Average cholesterol for age interval [60-100): 261.10 mg/dl
#   Report 3: Average Resting Blood Pressure in Extreme Cholesterol Levels
#   Average resting blood pressure for patients with low cholesterol: 133.01 mm Hg
#   Average resting blood pressure for patients with high cholesterol: 136.04 mm Hg
#   Report 4: Percentage of Men and Women with Positive Heart Disease Diagnosis
#   Percentage of women with heart disease: 16.22%
#   Percentage of men with heart disease: 45.18%


def categorize_CO2(co2_value):
    """Label a CO2 reading as 'Low' (<400), 'Medium' (<1000) or 'High'.

    A missing reading returns None rather than falling through to 'High'.
    """
    if pd.isna(co2_value):
        return None
    if co2_value < 400:
        return 'Low'
    if co2_value < 1000:
        return 'Medium'
    return 'High'


def prepare_sensor_data(data):
    """Return a de-duplicated copy with parsed times and derived columns.

    Adds 'day_of_week' (from ``Series.dt.dayofweek``) and 'CO2_category'.
    Times are parsed before duplicates are dropped, matching the original
    statement order.
    """
    prepared = data.copy()
    prepared['time'] = pd.to_datetime(prepared['time'])
    prepared.drop_duplicates(inplace=True)
    prepared['day_of_week'] = prepared['time'].dt.dayofweek
    prepared['CO2_category'] = prepared['CO2'].apply(categorize_CO2)
    return prepared


def rolling_power_sum(data, hours=3):
    """Return the sum of 'power' over the trailing *hours* for each row.

    The window is measured on the 'time' column, not in rows, so it spans
    three hours regardless of the sampling rate.  The result carries the
    index of *data*, so it can be assigned straight back as a column; rows
    without a timestamp are left out and become NaN on assignment.
    """
    timed = data.dropna(subset=['time']).sort_values('time')
    rolled = timed.rolling(pd.Timedelta(hours=hours), on='time')['power'].sum()
    # Re-wrap on the source index so alignment never depends on row order.
    return pd.Series(rolled.to_numpy(), index=timed.index, name='power')


def report_sensor_data(data):
    """Print the sensor reports for a raw frame and return the enriched frame."""
    # 1. Data Loading and Initial Inspection
    print("Data Types of Each Column:")
    print(data.dtypes)
    unique_time_values = data['time'].nunique()
    print(f"Number of Unique Values in the 'time' Column: {unique_time_values}")

    # 2. Data Preprocessing
    data = prepare_sensor_data(data)

    # 3. Data Analysis
    # dayofweek values below 5 are treated as weekdays, 5 and above as weekend.
    is_weekday = data['day_of_week'] < 5
    average_power_weekdays = data.loc[is_weekday, 'power'].mean()
    average_power_weekends = data.loc[~is_weekday, 'power'].mean()
    print(f"Average Power Consumption on Weekdays: {average_power_weekdays:.2f} watts")
    print(f"Average Power Consumption on Weekends: {average_power_weekends:.2f} watts")

    print("First Few Rows of the Updated Dataset:")
    print(data.head())

    mean_power_by_CO2 = data.groupby('CO2_category')['power'].mean()
    print("Mean Power Consumption for Each CO2 Category:")
    print(mean_power_by_CO2)

    # 4. Data Manipulation
    # Sum of power consumption over a rolling window of 3 hours.
    data['power_cumulative_sum'] = rolling_power_sum(data, hours=3)
    return data


if __name__ == "__main__":
    report_heart_disease(prepare_heart_data(pd.read_csv('heart.csv')))
    # `data` stays bound at module level, as in the original script.
    data = report_sensor_data(pd.read_csv("sensor_data.csv"))
    # NOTE(review): the source listing is cut off at this point, right after
    # repeating the "4. Data Manipulation" heading.
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help