miniproject (1)

py

School

University of Massachusetts, Amherst *

*We aren’t endorsed by this school

Course

410

Subject

Electrical Engineering

Date

Jan 9, 2024

Type

py

Pages

5

Uploaded by ChefHeat7492

Report
# -*- coding: utf-8 -*-
"""MiniProject.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1bj_Sy7v0VLzOD9X-ual0SHzhy5ehBbL8

<h1 align="center"><strong>DS/CMPSC 410 - Mini Project</strong></h1>
<h2 align="center"><strong>Crime Analysis in Chicago City.</strong></h2>

## Instructor: Professor Romit Maulik
## Team Members:
### - Sai Sanwariya Narayan
### - Nikhil Melligeri
### - Shafwat Mustafa
### - Rohan Singh
### - Shengdi You
### - Daniel Gao
### - Nathan Quint
"""

##pip install pyspark
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyspark
from pyspark import SparkContext
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, column
from pyspark.sql.functions import expr
from pyspark.sql.functions import split
from pyspark.sql.types import StructField, StructType, StringType, LongType, IntegerType, FloatType

# One local Spark session, bound to the name `crime`, is used by every
# later step of the script.
crime = (
    SparkSession.builder
    .master("local")
    .appName("CrimeDataAnalysis")
    .getOrCreate()
)

# The checkpoint path is not shell-expanded, so a literal "~" would become a
# directory named "~" under the working directory. Resolve it to the user's
# home directory explicitly.
crime.sparkContext.setCheckpointDir(os.path.expanduser("~/scratch"))

"""## Uploading 2022 Crime Data"""

# header=True takes column names from the first CSV row; inferSchema=True asks
# Spark to infer column types from the data.
Data22 = crime.read.csv(
    "/storage/home/njq5013/Project/Crimes_-_2022_20231016.csv",
    header=True,
    inferSchema=True,
)
Data22.printSchema()

DF_2 = Data22.select("Primary Type", "Location Description")
# Bare expression kept from the notebook export: the value is displayed in a
# notebook cell but discarded when this file runs as a script.
DF_2.take(10)

"""## Cleaning 2022 Crime Data"""

# Keep only rows that have a crime category.
cleandata22 = DF_2.filter(DF_2["Primary Type"].isNotNull())
cleandata22.show(10)

"""## Uploading 2023 Crime Data"""

Data23 = crime.read.csv(
    "/storage/home/njq5013/Project/Crimes_-_2023_20231016.csv",
    inferSchema=True,
    header=True,
)
# Show the schema of the frame that was just loaded (this previously printed
# the 2022 schema a second time).
Data23.printSchema()

DF_3 = Data23.select("Primary Type", "Location Description")
# Bare expression kept from the notebook export; the value is discarded when
# this file runs as a script.
DF_3.take(10)

"""## Cleaning 2023 Crime Data"""

# Keep only rows that have a crime category.
cleandata23 = DF_3.filter(DF_3["Primary Type"].isNotNull())
cleandata23.show(10)

"""## Data Merging with 2022 - 2023"""

# union() lines columns up by position, not by name.
# NOTE(review): assumes both yearly CSV exports share the same column order;
# confirm against the two printed schemas.
Data22_23 = Data22.union(Data23)

# Count each source once and reuse the numbers; every count() is a separate
# Spark job over the CSV. The four printed values are unchanged.
rows_2022 = Data22.count()
rows_2023 = Data23.count()
print(rows_2022)
print(rows_2023)
print(rows_2022 + rows_2023)
print(Data22_23.count())  # sanity check: should equal the sum printed above
Data22_23.printSchema()

"""## Selecting useful columns"""

df = Data22_23.select(
    "Date",
    "Block",
    "Primary Type",
    "Description",
    "Location Description",
    "Arrest",
    "Domestic",
    "Beat",
    "District",
    "Ward",
    "Community Area",
    "Year",
)

"""## Removing rows with null values"""

# Drop a row if any of the selected columns is null.
df_clean = df.dropna(how="any")
print(df.count())        # rows before cleaning
print(df_clean.count())  # rows after cleaning

"""## INITIAL EDA AND MAPREDUCE"""

# Map: one (primary type, 1) pair per row. Reduce: sum the ones per type.
mapped_primary_type = df_clean.rdd.map(lambda row: (row["Primary Type"], 1))
reduced_primary_type = mapped_primary_type.reduceByKey(lambda a, b: a + b)
# Most frequent crime type first.
sorted_primary_type = reduced_primary_type.sortBy(lambda x: x[1], ascending=False)
primary_type_counts_sorted = sorted_primary_type.collect()

# Parallel lists (type, count) consumed by the plots that follow.
primary = []
counts = []
for primary_type, count in primary_type_counts_sorted:
    print(f"Primary Type: {primary_type} -> Count: {count}")
    primary.append(primary_type)
    counts.append(int(count))

"""## Visualization - Primary type and count"""
def _label_primary_type_axes():
    """Apply the shared labels, title and tick rotation, then show the figure."""
    plt.xlabel('Primary Type')
    plt.ylabel('Count')
    plt.title('Primary Type Counts')
    # Rotated tick labels: the category names overlap when drawn horizontally.
    plt.xticks(rotation = 90)
    plt.show()


# Bar chart of crime counts per primary type (lists built earlier in the file).
plt.bar(primary, counts)
_label_primary_type_axes()

# Same data drawn as a line plot.
plt.plot(primary, counts, marker = "o")
_label_primary_type_axes()


# MapReduce over several attributes at once.
def map_attributes(row):
    """Turn one record into a ``(key, 1)`` pair for counting.

    The key is the tuple (primary type, location description, month, year,
    block, ward, district). The month is parsed from the "Date" string using
    the "%m/%d/%Y %I:%M:%S %p" format; the year is taken from the "Year"
    column.
    """
    occurred_at = datetime.strptime(row["Date"], "%m/%d/%Y %I:%M:%S %p")
    composite_key = (
        row["Primary Type"],
        row["Location Description"],
        occurred_at.month,
        row["Year"],
        row["Block"],
        row["Ward"],
        row["District"],
    )
    return (composite_key, 1)


# Map phase: one (key, 1) pair per cleaned row.
mapped_data = df_clean.rdd.map(map_attributes)

# Reduce phase: add up the ones for each distinct key.
reduced_data = mapped_data.reduceByKey(lambda left, right: left + right)

# Largest counts first.
sorted_data = reduced_data.sortBy(lambda pair: pair[1], ascending=False)

# Bring the sorted result back to the driver.
aggregated_counts_sorted = sorted_data.collect()

# Print the ten most frequent combinations as a quick check.
for key, count in aggregated_counts_sorted[:10]:
    primary_type, location_description, month, year, block, ward, district = key
    print(f"Primary Type: {primary_type}, Location: {location_description}, Month: {month}, Year: {year}, Block: {block}, Ward: {ward}, District: {district} -> Count: {count}")

# MapReduce: number of crimes of each type within each ward.
mapped_rdd = df_clean.rdd.map(lambda row: ((row["Ward"], row["Primary Type"]), 1))
reduced_rdd = mapped_rdd.reduceByKey(lambda left, right: left + right)
ward_crime_counts = reduced_rdd.collect()

# Selecting a specific ward for visualization (e.g., Ward 10)
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help
# Keep only the (crime type, count) pairs that belong to ward 10.
# NOTE(review): the comparison assumes the "Ward" column was inferred as an
# integer; confirm against the printed schema.
ward_10_data = [
    (crime_type, count)
    for ((ward, crime_type), count) in ward_crime_counts
    if ward == 10
]

# Unzip into parallel sequences for plotting.
crime_types, counts = zip(*ward_10_data)

# Bar chart of ward 10's crime distribution (visualization example).
plt.figure(figsize=(12, 6))
plt.bar(crime_types, counts)
plt.xticks(rotation=90)
plt.ylabel('Number of Crimes')
plt.title('Crime Distribution in Ward 10')
plt.show()

# MapReduce: number of arrests per crime type.
# NOTE(review): the truthiness test assumes "Arrest" is a real boolean column;
# a string such as "false" would be counted as an arrest. Confirm against the
# printed schema.
mapped_arrests = df_clean.rdd.map(lambda row: (row["Primary Type"], 1 if row["Arrest"] else 0))
reduced_arrests = mapped_arrests.reduceByKey(lambda a, b: a + b)
arrest_counts = reduced_arrests.collect()

# Visualization
import matplotlib.pyplot as plt

# Unzip into parallel sequences for plotting.
crime_types, counts = zip(*arrest_counts)

# Bar chart of arrests by crime type.
plt.figure(figsize=(12, 8))
plt.bar(crime_types, counts, color = 'r')
plt.xticks(rotation=90)
plt.ylabel('Number of Arrests')
plt.xlabel('Crime Type')
plt.title('Number of Arrests by Crime Type')
plt.tight_layout()
plt.show()

from datetime import datetime


# MapReduce: occurrences of each crime type per calendar month.
def extract_month_year(row):
    """Map a record to ``((year, month, primary type), 1)``.

    Year and month are parsed from the "Date" string using the
    "%m/%d/%Y %I:%M:%S %p" format.
    """
    date_obj = datetime.strptime(row["Date"], "%m/%d/%Y %I:%M:%S %p")
    return ((date_obj.year, date_obj.month, row["Primary Type"]), 1)


# The aggregation does not depend on which crime type is plotted, so it is
# computed once and shared by both trend plots below.
mapped_monthly = df_clean.rdd.map(extract_month_year)
reduced_monthly = mapped_monthly.reduceByKey(lambda a, b: a + b)
monthly_counts = reduced_monthly.collect()


def plot_monthly_trend(crime_to_plot, monthly_counts, first_year=2022, n_years=2):
    """Plot the month-by-month count of one crime type.

    Args:
        crime_to_plot: Primary type to plot, e.g. "THEFT".
        monthly_counts: Iterable of ``((year, month, crime_type), count)``.
        first_year: First calendar year on the x axis.
        n_years: Number of consecutive years to cover.

    Months with no matching record stay at zero. Records dated outside the
    covered years are ignored: a negative index would otherwise overwrite a
    slot at the end of the list, and a too-large one would raise IndexError.
    """
    last_year = first_year + n_years - 1
    months = [
        f"{month}-{year}"
        for year in range(first_year, last_year + 1)
        for month in range(1, 13)
    ]
    counts = [0] * len(months)  # zero for every month until a count is seen

    for (year, month, crime_type), count in monthly_counts:
        if crime_type != crime_to_plot:
            continue
        idx = (year - first_year) * 12 + month - 1
        if 0 <= idx < len(counts):
            counts[idx] = count

    plt.figure(figsize=(14, 7))
    plt.plot(months, counts, marker='o', linestyle='-')
    plt.xticks(rotation=45)
    plt.ylabel('Number of Crimes')
    plt.title(f'Monthly Trend for {crime_to_plot} ({first_year}-{last_year})')
    plt.grid(True, which="both", ls="--")
    plt.tight_layout()
    plt.show()


# Monthly trends for two specific crime types.
plot_monthly_trend("THEFT", monthly_counts)
plot_monthly_trend("BATTERY", monthly_counts)

"""## Verify"""

"""# Spark stop"""

# The session created at the top of this script is bound to `crime`; no
# `spark` name is defined in it, so the previous `spark.stop()` raised
# NameError and left the session running.
crime.stop()