ENGG680 Assignment 03

# Heart Disease Data Analysis using Pandas
import pandas as pd

# Load the dataset
data = pd.read_csv('heart.csv')

# Task 1: Average Age of Patients
average_age = data['age'].mean()
print("Report 1: Average Age of Patients")
print(f"Average age of patients: {average_age:.2f} years")

# Task 2: Average Cholesterol Levels in Age Intervals
age_intervals = [20, 30, 40, 50, 60, 100]
avg_chol_by_age = []
for i in range(len(age_intervals) - 1):
    lower_age = age_intervals[i]
    upper_age = age_intervals[i + 1]
    in_interval = (data['age'] >= lower_age) & (data['age'] < upper_age)
    avg_chol = data[in_interval]['chol'].mean()
    avg_chol_by_age.append((f'[{lower_age}-{upper_age})', avg_chol))

print("Report 2: Average Cholesterol Levels in Age Intervals")
for age_interval, avg_chol in avg_chol_by_age:
    print(f"Average cholesterol for age interval {age_interval}: {avg_chol:.2f} mg/dl")

# Task 3: Average Resting Blood Pressure in Extreme Cholesterol Levels
# Clean the 'chol' column: cast to string, strip non-numeric characters,
# then convert back to numeric (invalid entries become NaN)
data['chol'] = data['chol'].astype(str).str.replace('[^0-9]', '', regex=True)
data['chol'] = pd.to_numeric(data['chol'], errors='coerce')

# Remove rows with NaN values in the 'chol' column
data = data.dropna(subset=['chol'])

# Calculate the 30th and 70th percentiles for cholesterol levels
chol_30th_percentile = data['chol'].quantile(0.3)
chol_70th_percentile = data['chol'].quantile(0.7)

# Identify patients in the lowest 30% and highest 30% of cholesterol levels
low_chol_patients = data[data['chol'] < chol_30th_percentile]
high_chol_patients = data[data['chol'] > chol_70th_percentile]

# Calculate the average resting blood pressure for each group
average_trestbps_low_chol = low_chol_patients['trestbps'].mean()
average_trestbps_high_chol = high_chol_patients['trestbps'].mean()

print("Report 3: Average Resting Blood Pressure in Extreme Cholesterol Levels")
print(f"Average resting blood pressure for patients with low cholesterol: {average_trestbps_low_chol:.2f} mm Hg")
print(f"Average resting blood pressure for patients with high cholesterol: {average_trestbps_high_chol:.2f} mm Hg")
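The interval loop in Task 2 above can also be written with pd.cut, which bins the ages once and lets a single groupby compute every interval mean. A minimal sketch, assuming the same 'age' and 'chol' columns from heart.csv:

# Sketch: vectorized alternative to the Task 2 loop using pd.cut
# (right=False makes the bins half-open, i.e. [20, 30), [30, 40), ...)
age_bins = pd.cut(data['age'], bins=[20, 30, 40, 50, 60, 100], right=False)
avg_chol_binned = data.groupby(age_bins, observed=False)['chol'].mean()
print(avg_chol_binned)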
# Task 4: Percentage of Men and Women with Positive Heart Disease Diagnosis
# Count the women and men with a positive diagnosis of heart disease (num == 1)
women_with_heart_disease = len(data[(data['sex'] == 0) & (data['num'] == 1)])
men_with_heart_disease = len(data[(data['sex'] == 1) & (data['num'] == 1)])

# Calculate the percentage of women and men with a positive diagnosis
total_women = len(data[data['sex'] == 0])
total_men = len(data[data['sex'] == 1])
percentage_women_with_heart_disease = (women_with_heart_disease / total_women) * 100
percentage_men_with_heart_disease = (men_with_heart_disease / total_men) * 100

print("Report 4: Percentage of Men and Women with Positive Heart Disease Diagnosis")
print(f"Percentage of women with heart disease: {percentage_women_with_heart_disease:.2f}%")
print(f"Percentage of men with heart disease: {percentage_men_with_heart_disease:.2f}%")

Report 1: Average Age of Patients
Average age of patients: 47.83 years

Report 2: Average Cholesterol Levels in Age Intervals
Average cholesterol for age interval [20-30): 187.50 mg/dl
Average cholesterol for age interval [30-40): 239.63 mg/dl
Average cholesterol for age interval [40-50): 250.13 mg/dl
Average cholesterol for age interval [50-60): 256.21 mg/dl
Average cholesterol for age interval [60-100): 261.10 mg/dl

Report 3: Average Resting Blood Pressure in Extreme Cholesterol Levels
Average resting blood pressure for patients with low cholesterol: 133.01 mm Hg
Average resting blood pressure for patients with high cholesterol: 136.04 mm Hg

Report 4: Percentage of Men and Women with Positive Heart Disease Diagnosis
Percentage of women with heart disease: 16.22%
Percentage of men with heart disease: 45.18%
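Task 4's per-sex rates can also come from one groupby, since the mean of a boolean series is the fraction of True values. A sketch under the same column assumptions (sex: 0 = female, 1 = male; num == 1 marks a positive diagnosis):

# Sketch: positive-diagnosis rate by sex in a single pass
rate_by_sex = (data['num'] == 1).groupby(data['sex']).mean() * 100
print(rate_by_sex.round(2))  # index 0 = women, 1 = men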
# Exploratory Data Analysis of Environmental Sensor Measurements
import pandas as pd

# 1. Data Loading and Initial Inspection
# Load the dataset
data = pd.read_csv("sensor_data.csv")

# Display the data types of each column
data_types = data.dtypes
print("Data Types of Each Column:")
print(data_types)

# Display the number of unique values in the 'time' column
unique_time_values = data['time'].nunique()
print(f"Number of Unique Values in the 'time' Column: {unique_time_values}")

# 2. Data Preprocessing
# Convert the 'time' column to datetime format
data['time'] = pd.to_datetime(data['time'])

# Drop duplicate rows if any
data.drop_duplicates(inplace=True)

# 3. Data Analysis
# Calculate and display the average power consumption on weekdays and weekends separately
data['day_of_week'] = data['time'].dt.dayofweek  # Monday = 0, Sunday = 6
average_power_weekdays = data[data['day_of_week'] < 5]['power'].mean()
average_power_weekends = data[data['day_of_week'] >= 5]['power'].mean()
print(f"Average Power Consumption on Weekdays: {average_power_weekdays:.2f} watts")
print(f"Average Power Consumption on Weekends: {average_power_weekends:.2f} watts")

# Create a new column 'CO2_category' with categories 'Low', 'Medium', and 'High'
# based on CO2 concentration
def categorize_CO2(co2_value):
    if co2_value < 400:
        return 'Low'
    elif 400 <= co2_value < 1000:
        return 'Medium'
    else:
        return 'High'

data['CO2_category'] = data['CO2'].apply(categorize_CO2)

# Display the first few rows of the updated dataset
print("First Few Rows of the Updated Dataset:")
print(data.head())

# Calculate the mean power consumption for each CO2 category
mean_power_by_CO2 = data.groupby('CO2_category')['power'].mean()
print("Mean Power Consumption for Each CO2 Category:")
print(mean_power_by_CO2)
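The row-wise apply works, but pd.cut expresses the same thresholds declaratively and runs vectorized. A sketch using the identical 400/1000 ppm cut-offs (the result is a pandas Categorical rather than plain strings):

import numpy as np

# Sketch: same Low/Medium/High banding as categorize_CO2, vectorized
data['CO2_category'] = pd.cut(data['CO2'],
                              bins=[-np.inf, 400, 1000, np.inf],
                              labels=['Low', 'Medium', 'High'],
                              right=False)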
# 4. Data Manipulation
# Rolling sum of power consumption over a window of 3 samples
# (note: window=3 counts rows, not hours; at this dataset's one-minute
# resolution it spans roughly 3 minutes -- a time-based alternative is
# sketched after the output below)
window_size = 3
data['power_cumulative_sum'] = data['power'].rolling(window=window_size).sum()

# Create a new column 'temp_change' representing the change in temperature
# from the previous row
data['temp_change'] = data['temp'].diff()

# Display the first few rows of the updated dataset
print("First Few Rows of the Updated Dataset:")
print(data.head())

Data Types of Each Column:
time         object
power       float64
temp          int64
humidity      int64
light         int64
CO2           int64
dust        float64
dtype: object

Number of Unique Values in the 'time' Column: 44512
Average Power Consumption on Weekdays: -0.50 watts
Average Power Consumption on Weekends: -0.21 watts

First Few Rows of the Updated Dataset:
                 time  power  temp  humidity  light  CO2   dust  day_of_week CO2_category
0 2015-01-08 00:00:00    0.0    32        40      0  973  27.80            3       Medium
1 2015-01-08 00:00:00    0.0    32        40      0  973  27.09            3       Medium
2 2015-01-08 00:01:00    0.0    32        40      0  973  34.50            3       Medium
3 2015-01-08 00:01:00    0.0    32        40      0  973  28.43            3       Medium
4 2015-01-08 00:02:00    0.0    32        40      0  973  27.58            3       Medium

Mean Power Consumption for Each CO2 Category:
CO2_category
High     -0.451787
Low      -0.484364
Medium    0.116040
Name: power, dtype: float64

First Few Rows of the Updated Dataset:
                 time  power  temp  humidity  light  CO2   dust  day_of_week CO2_category  power_cumulative_sum  temp_change
0 2015-01-08 00:00:00    0.0    32        40      0  973  27.80            3       Medium                   NaN          NaN
1 2015-01-08 00:00:00    0.0    32        40      0  973  27.09            3       Medium                   NaN          0.0
2 2015-01-08 00:01:00    0.0    32        40      0  973  34.50            3       Medium                   0.0          0.0
3 2015-01-08 00:01:00    0.0    32        40      0  973  28.43            3       Medium                   0.0          0.0
4 2015-01-08 00:02:00    0.0    32        40      0  973  27.58            3       Medium                   0.0          0.0
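If the assignment's 3-hour window is meant literally, pandas can roll over a time offset instead of a row count, provided the frame is sorted by time. A sketch, assuming the 'time' column was converted to datetime as above:

# Sketch: time-based rolling sum; '3h' includes every sample in the
# trailing 3 hours, which is robust to gaps and duplicate timestamps
rolling_3h = (data.sort_values('time')
                  .set_index('time')['power']
                  .rolling('3h')
                  .sum())
print(rolling_3h.head())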
# Exploratory Data Analysis of Engineering Services Revenue

# Import the necessary libraries
import pandas as pd

# Task 1: Exploratory Data Analysis
# Read the "21100163.csv" file into a DataFrame named df
df = pd.read_csv("21100163.csv")

# Display the first 5 rows of the DataFrame
print("First 5 rows of the DataFrame:")
print(df.head())

# Task 2: Data Analysis for Specific Regions

# For the region "Canada":
# Filter to rows with the NAICS code "Engineering services [54133]" and the region "Canada"
canada_df = df[(df['GEO'] == 'Canada') &
               (df['North American Industry Classification System (NAICS)'] == 'Engineering services [54133]')]

# Group the data by 'REF_DATE' and 'Summary statistics', and sum the 'VALUE' column
canada_grouped = canada_df.groupby(['REF_DATE', 'Summary statistics'])['VALUE'].sum().reset_index()

# Pivot into a summary table with 'Summary statistics' as the index and years as columns
canada_summary_table = canada_grouped.pivot(index='Summary statistics',
                                            columns='REF_DATE', values='VALUE')

# Move the 'Operating profit margin' row to the top of the index
canada_summary_table = canada_summary_table.reindex(
    ['Operating profit margin']
    + list(canada_summary_table.index.difference(['Operating profit margin'])))

# Display the summary table for Canada
print("\nSummary Table for Canada:")
print(canada_summary_table)

# For the region "Alberta":
# Filter to rows with the NAICS code "Engineering services [54133]" and the region "Alberta"
alberta_df = df[(df['GEO'] == 'Alberta') &
                (df['North American Industry Classification System (NAICS)'] == 'Engineering services [54133]')]

# Group the data by 'REF_DATE' and 'Summary statistics', and sum the 'VALUE' column
alberta_grouped = alberta_df.groupby(['REF_DATE', 'Summary statistics'])['VALUE'].sum().reset_index()

# Pivot into a summary table with 'Summary statistics' as the index and years as columns
alberta_summary_table = alberta_grouped.pivot(index='Summary statistics',
                                              columns='REF_DATE', values='VALUE')

# Move the 'Operating profit margin' row to the top of the index
alberta_summary_table = alberta_summary_table.reindex(
    ['Operating profit margin']
    + list(alberta_summary_table.index.difference(['Operating profit margin'])))

# Display the summary table for Alberta
print("\nSummary Table for Alberta:")
print(alberta_summary_table)
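The Canada and Alberta branches differ only in the region name, so the filter-group-pivot chain can be factored into one helper. A sketch reusing the column names above:

# Sketch: one helper for any region, to avoid the duplicated pipeline
NAICS_COL = 'North American Industry Classification System (NAICS)'

def region_summary(df, region, naics='Engineering services [54133]'):
    """Filter to one region and NAICS code, then pivot years into columns."""
    sub = df[(df['GEO'] == region) & (df[NAICS_COL] == naics)]
    grouped = sub.groupby(['REF_DATE', 'Summary statistics'])['VALUE'].sum().reset_index()
    table = grouped.pivot(index='Summary statistics', columns='REF_DATE', values='VALUE')
    order = (['Operating profit margin']
             + list(table.index.difference(['Operating profit margin'])))
    return table.reindex(order)

for region in ('Canada', 'Alberta'):
    print(f"\nSummary Table for {region}:")
    print(region_summary(df, region))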
First 5 rows of the DataFrame:
   REF_DATE                        GEO           DGUID  \
0      2012                     Canada  2016A000011124
1      2012                     Canada  2016A000011124
2      2012                     Canada  2016A000011124
3      2012                     Canada  2016A000011124
4      2012  Newfoundland and Labrador     2016A000210

  North American Industry Classification System (NAICS)  \
0                          Engineering services [54133]
1                          Engineering services [54133]
2                          Engineering services [54133]
3                          Engineering services [54133]
4                          Engineering services [54133]

                          Summary statistics      UOM  UOM_ID SCALAR_FACTOR  \
0                          Operating revenue  Dollars      81      millions
1                         Operating expenses  Dollars      81      millions
2  Salaries, wages, commissions and benefits  Dollars      81      millions
3                    Operating profit margin  Percent     239         units
4                          Operating revenue  Dollars      81      millions

   SCALAR_ID     VECTOR COORDINATE    VALUE STATUS SYMBOL TERMINATED  DECIMALS
0          6  v99920202      1.1.1  28960.0    NaN    NaN        NaN         1
1          6  v99920203      1.1.2  25701.5    NaN    NaN        NaN         1
2          6  v99920204      1.1.3  11488.3    NaN    NaN        NaN         1
3          0  v99920205      1.1.4     11.3    NaN    NaN        NaN         1
4          6  v99920206      2.1.1    470.7    NaN    NaN        NaN         1

Summary Table for Canada:
REF_DATE                                      2012     2013     2014     2015     2016     2017     2018     2019     2020     2021
Operating profit margin                       11.3     11.0     12.7      9.0      7.7     10.1      9.6     10.1      8.6      8.2
Operating expenses                         25701.5  28731.7  28906.7  28041.1  27032.4  25294.1  26205.3  29813.0  30738.6  32279.3
Operating revenue                          28960.0  32271.3  33100.3  30806.8  29295.7  28141.0  28997.3  33153.1  33637.0  35145.7
Salaries, wages, commissions and benefits  11488.3  11646.4  12063.7  12232.7  12249.3  11484.3  12425.8  14198.7  14016.1  14733.2
Summary Table for Alberta:
REF_DATE                                      2012     2013     2014     2015    2016    2017    2018    2019    2020    2021
Operating profit margin                       11.5     12.4     10.6      8.1     4.7    10.2     9.6    14.6    10.3    10.2
Operating expenses                          8888.2  11698.9  12789.6  10584.0  9521.3  7252.2  5738.3  6732.6  7154.3  6959.2
Operating revenue                          10038.8  13361.9  14307.1  11511.1  9987.6  8071.5  6349.7  7886.0  7973.5  7752.0
Salaries, wages, commissions and benefits   3565.0   3586.7   3940.7   3502.5  3494.4  2620.3  2521.3  2792.0  2931.4  2903.9
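As a sanity check, the published operating profit margin should equal (revenue - expenses) / revenue x 100. For Canada in 2012: (28960.0 - 25701.5) / 28960.0 x 100 = 11.25, which rounds to the table's 11.3. A short sketch that recomputes every year from canada_summary_table above:

# Sketch: derive the margin from the pivoted rows and compare
revenue = canada_summary_table.loc['Operating revenue']
expenses = canada_summary_table.loc['Operating expenses']
derived_margin = ((revenue - expenses) / revenue * 100).round(1)
print(derived_margin)                                       # recomputed
print(canada_summary_table.loc['Operating profit margin'])  # published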