info-6105-hw2

pdf

School

Northeastern University *

*We aren’t endorsed by this school

Course

6105

Subject

Industrial Engineering

Date

Dec 6, 2023

Type

pdf

Pages

14

Uploaded by MateBoarMaster1159

Report
info-6105-hw2 November 20, 2023 [1]: # 1. Convert the list of 26 letters of the alphabet to the index of the Dataframe, and assign 26 random integer numbers and print out the first 5 rows. import pandas as pd import numpy as np # Create a DataFrame with the alphabet as the index alphabet = list ( "abcdefghijklmnopqrstuvwxyz" ) df = pd . DataFrame(index = alphabet) # Assign 26 random integer numbers and add them as a new column df[ 'Random Numbers' ] = np . random . randint( 1 , 100 , 26 ) #print(df) # Print the first 5 rows of the DataFrame print (df . head( 5 )) Random Numbers a 76 b 30 c 68 d 97 e 75 [2]: # 2. Construct the following DataFrame and print it out. Use "iloc" to convert the first column of this DataFrame as a Series. import pandas as pd data = { 'class1' : [ 1 , 2 , 3 , 4 , 7 , 11 ], 'class2' : [ 4 , 5 , 6 , 9 , 5 , 0 ], 'class3' : [ 7 , 5 , 8 , 12 , 1 , 11 ] } df = pd . DataFrame(data) # Convert the first column 'class1' to a Series using iloc series_class1 = df . iloc[:, 0 ] 1
print (df) print ( " \n 1st column of this DataFrame as a Series:" ) print (series_class1) class1 class2 class3 0 1 4 7 1 2 5 5 2 3 6 8 3 4 9 12 4 7 5 1 5 11 0 11 1st column of this DataFrame as a Series: 0 1 1 2 2 3 3 4 4 7 5 11 Name: class1, dtype: int64 [3]: # 3. Work on the above data (question 2), implement the following tasks using indexers loc and/or iloc: #1) Select all columns except 'class3' and print the result: # Using loc to select all columns except 'class3' # Select all columns except 'class3' result_task1 = df . loc[:, df . columns != 'class3' ] print (result_task1) class1 class2 0 1 4 1 2 5 2 3 6 3 4 9 4 7 5 5 11 0 [4]: # 3. #2)Remove the first 3 rows of the DataFrame and print it out: # Using iloc to remove the first 3 rows # Remove the first 3 rows using iloc result_task2 = df . iloc[ 3 :] print (result_task2) class1 class2 class3 2
3 4 9 12 4 7 5 1 5 11 0 11 [5]: # 3. #3)Remove the last 3 rows of the DataFrame and print it out: # Using iloc to remove the last 3 rows # Remove the last 3 rows using iloc result_task3 = df . iloc[: -3 ] print (result_task3) class1 class2 class3 0 1 4 7 1 2 5 5 2 3 6 8 [6]: #4 Compute the Euclidean distance between two series (points) p1 and p2: #p1 = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) #p2 = pd.Series([8, 9, 10, 11, 12, 7, 6, 5, 4, 3]) import pandas as pd import numpy as np p1 = pd . Series([ 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 ]) p2 = pd . Series([ 8 , 9 , 10 , 11 , 12 , 7 , 6 , 5 , 4 , 3 ]) # Calculate the squared difference between the two series squared_diff = (p1 - p2) ** 2 # Sum up the squared differences sum_squared_diff = squared_diff . sum() # Calculate the Euclidean distance by taking the square root of the sum euclidean_distance = np . sqrt(sum_squared_diff) print ( "Euclidean Distance between p1 and p2:" , euclidean_distance) Euclidean Distance between p1 and p2: 18.16590212458495 [7]: # 5) #1) Create the DataFrame df with random values in the specified range: import pandas as pd import numpy as np # Set a random seed for reproducibility np . random . seed( 42 ) 3
Your preview ends here
Eager to read the complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help
# Create the DataFrame with 10 indexes (a to j) and 4 columns (p, q, r, s) data = np . random . randint( 1 , 100 , size = ( 10 , 4 )) df = pd . DataFrame(data, index = [ 'a' , 'b' , 'c' , 'd' , 'e' , 'f' , 'g' , 'h' , 'i' , 'j' ], columns = [ 'p' , 'q' , 'r' , 's' ]) # Print the DataFrame print (df) p q r s a 52 93 15 72 b 61 21 83 87 c 75 75 88 24 d 3 22 53 2 e 88 30 38 2 f 64 60 21 33 g 76 58 22 89 h 49 91 59 42 i 92 60 80 15 j 62 62 47 62 [8]: # 5) #2) Create a new column that contains the row number of the nearest row-record by Euclidean distance import pandas as pd import numpy as np from scipy.spatial import distance # Set a random seed for reproducibility np . random . seed( 42 ) # Create the DataFrame with random values data = np . random . randint( 1 , 100 , size = ( 10 , 4 )) df = pd . DataFrame(data, index = [ 'a' , 'b' , 'c' , 'd' , 'e' , 'f' , 'g' , 'h' , 'i' , 'j' ], columns = [ 'p' , 'q' , 'r' , 's' ]) # Calculate the nearest rows by Euclidean distance def find_nearest_row (row): distances = df . apply( lambda x: distance . euclidean(row, x), axis =1 ) nearest_index = distances . idxmin() return nearest_index df[ 'nearest' ] = df . apply(find_nearest_row, axis =1 ) # Print the updated DataFrame print (df) 4
p q r s nearest a 52 93 15 72 a b 61 21 83 87 b c 75 75 88 24 c d 3 22 53 2 d e 88 30 38 2 e f 64 60 21 33 f g 76 58 22 89 g h 49 91 59 42 h i 92 60 80 15 i j 62 62 47 62 j [9]: #6) Create a DataFrame with rows as strides from a given series: L = pd. Series(range(15)) import pandas as pd import numpy as np L = pd . Series( range ( 15 )) stride_length = 4 strides = [L[i:i + stride_length] for i in range ( 0 , len (L) - stride_length + 1 , 2 )] output = np . array(strides) print (output) [[ 0 1 2 3] [ 2 3 4 5] [ 4 5 6 7] [ 6 7 8 9] [ 8 9 10 11] [10 11 12 13]] [10]: #7) #1) Create a DataFrame whose column names are a, b, c, d, e respectively; the values range within [0, 20) import pandas as pd import numpy as np # Create a DataFrame with columns 'abcde' and increasing integer values (note: column 'e' actually reaches 22, exceeding the stated [0, 20) range). data = { 'a' : range ( 0 , 19 ), 'b' : range ( 1 , 20 ), 'c' : range ( 2 , 21 ), 'd' : range ( 3 , 22 ), 'e' : range ( 4 , 23 )} 5
df = pd . DataFrame(data) print (df) a b c d e 0 0 1 2 3 4 1 1 2 3 4 5 2 2 3 4 5 6 3 3 4 5 6 7 4 4 5 6 7 8 5 5 6 7 8 9 6 6 7 8 9 10 7 7 8 9 10 11 8 8 9 10 11 12 9 9 10 11 12 13 10 10 11 12 13 14 11 11 12 13 14 15 12 12 13 14 15 16 13 13 14 15 16 17 14 14 15 16 17 18 15 15 16 17 18 19 16 16 17 18 19 20 17 17 18 19 20 21 18 18 19 20 21 22 [11]: #7) #2)Interchange columns 'a' and 'c': # Interchange columns 'a' and 'c' # Interchange columns 'a' and 'c' df[ 'a' ], df[ 'c' ] = df[ 'c' ], df[ 'a' ] # Print the DataFrame print (df) a b c d e 0 2 1 0 3 4 1 3 2 1 4 5 2 4 3 2 5 6 3 5 4 3 6 7 4 6 5 4 7 8 5 7 6 5 8 9 6 8 7 6 9 10 7 9 8 7 10 11 8 10 9 8 11 12 9 11 10 9 12 13 10 12 11 10 13 14 11 13 12 11 14 15 6
Your preview ends here
Eager to read the complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help
12 14 13 12 15 16 13 15 14 13 16 17 14 16 15 14 17 18 15 17 16 15 18 19 16 18 17 16 19 20 17 19 18 17 20 21 18 20 19 18 21 22 [12]: #7) #3)Create a generic function to interchange arbitrary two columns: # Define a generic function to interchange arbitrary two columns def interchange_columns_generic (df, col1, col2): df[col1], df[col2] = df[col2], df[col1] return df # Usage: interchange columns 'b' and 'd' df = interchange_columns_generic(df, 'b' , 'd' ) # Print the DataFrame after interchange print (df) a b c d e 0 2 3 0 1 4 1 3 4 1 2 5 2 4 5 2 3 6 3 5 6 3 4 7 4 6 7 4 5 8 5 7 8 5 6 9 6 8 9 6 7 10 7 9 10 7 8 11 8 10 11 8 9 12 9 11 12 9 10 13 10 12 13 10 11 14 11 13 14 11 12 15 12 14 15 12 13 16 13 15 16 13 14 17 14 16 17 14 15 18 15 17 18 15 16 19 16 18 19 16 17 20 17 19 20 17 18 21 18 20 21 18 19 22 [13]: #7) #4)Sort the columns in reverse alphabetical order: # Sort columns in reverse alphabetical order # Sort the columns in reverse alphabetical order df = df[ sorted (df . columns, reverse = True )] 7
# Print the DataFrame after sorting print (df) e d c b a 0 4 1 0 3 2 1 5 2 1 4 3 2 6 3 2 5 4 3 7 4 3 6 5 4 8 5 4 7 6 5 9 6 5 8 7 6 10 7 6 9 8 7 11 8 7 10 9 8 12 9 8 11 10 9 13 10 9 12 11 10 14 11 10 13 12 11 15 12 11 14 13 12 16 13 12 15 14 13 17 14 13 16 15 14 18 15 14 17 16 15 19 16 15 18 17 16 20 17 16 19 18 17 21 18 17 20 19 18 22 19 18 21 20 [14]: #8)Create a TimeSeries starting ‘2021-01-01’ and 10 weekends (Saturdays) after that having random numbers as values. import pandas as pd import random # Define the start date start_date = '2021-01-01' # Create a list of dates for 10 Saturdays date_range = pd . date_range(start = start_date, periods =10 , freq = 'W-SAT' ) # Generate random values for each Saturday random_values = [random . randint( 1 , 100 ) for _ in range ( 10 )] # Create a DataFrame with the dates and random values time_series = pd . DataFrame({ 'Date' : date_range, 'Value' : random_values}) # Display the time series print (time_series) Date Value 8
0 2021-01-02 56 1 2021-01-09 54 2 2021-01-16 46 3 2021-01-23 79 4 2021-01-30 52 5 2021-02-06 29 6 2021-02-13 25 7 2021-02-20 60 8 2021-02-27 15 9 2021-03-06 46 [15]: #9) #1)Create a Pandas time series with missing dates (from 2021-01-01 to 2021-01-08) and values, shown as below import numpy as np # Define the dates and values as given date_values = { '2021-01-01' : 1.0 , '2021-01-03' : 10.0 , '2021-01-06' : 3.0 , '2021-01-08' : np . nan } # Create a Pandas Series with the specified dates and values time_series = pd . Series(date_values, dtype = float ) # Convert the index to datetime format time_series . index = pd . to_datetime(time_series . index) # Print the time series print (time_series) 2021-01-01 1.0 2021-01-03 10.0 2021-01-06 3.0 2021-01-08 NaN dtype: float64 [16]: #9) #2)Make all missing dates appear and fill up with value from previous date, shown as below import pandas as pd # Create a dictionary with the given data data = { 'Date' : [ '2021-01-01' , '2021-01-03' , '2021-01-06' , '2021-01-08' ], 9
Your preview ends here
Eager to read the complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help
'Value' : [ 1.0 , 10.0 , 3.0 , None ] } # Create a DataFrame df = pd . DataFrame(data) # Convert the 'Date' column to a datetime type df[ 'Date' ] = pd . to_datetime(df[ 'Date' ]) # Set the 'Date' column as the index df = df . set_index( 'Date' ) # Generate a date range from '2021-01-01' to '2021-01-08' date_range = pd . date_range(start = '2021-01-01' , end = '2021-01-08' ) # Reindex the DataFrame with the full date range and forward fill missing values df = df . reindex(date_range) . ffill() # Reset the index to have 'Date' as a column again (optional) df = df . reset_index() # Display the final DataFrame print (df) index Value 0 2021-01-01 1.0 1 2021-01-02 1.0 2 2021-01-03 10.0 3 2021-01-04 10.0 4 2021-01-05 10.0 5 2021-01-06 3.0 6 2021-01-07 3.0 7 2021-01-08 3.0 [17]: #10) #1)Create a DataFrame ‘df’: the indexes are integer from 0 to 8 (9 rows); the columns are ‘fruit’, ‘taste’, ‘price’; the items of ‘fruit’ are: apple, banana, orange, respectively and repeating 3 times. The ‘taste’ ranges [0,1). The price is integer and ranges [1, 15). The seed of random values is set as 100. import pandas as pd import numpy as np # Task 1: Create the DataFrame np . random . seed( 100 ) data = { 10
'fruit' : [ 'apple' , 'banana' , 'orange' ] * 3 , 'taste' : np . random . rand( 9 ), 'price' : np . random . randint( 1 , 15 , 9 ) } df = pd . DataFrame(data) # Print the DataFrame print (df) fruit taste price 0 apple 0.543405 9 1 banana 0.278369 5 2 orange 0.424518 12 3 apple 0.844776 13 4 banana 0.004719 11 5 orange 0.121569 1 6 apple 0.670749 12 7 banana 0.825853 10 8 orange 0.136707 14 [18]: #10) #2)Find the second largest value of 'taste' for 'banana': # Group by 'fruit' and filter for 'banana' banana_group = df[df[ 'fruit' ] == 'banana' ] # Sort the 'taste' column in descending order and get the second value second_largest_taste = banana_group[ 'taste' ] . sort_values(ascending = False ) . iloc[ 1 ] print ( "Second largest taste value for 'banana':" , second_largest_taste) Second largest taste value for 'banana': 0.27836938509379616 [19]: #10) #3)Compute the mean price of every fruit, keeping 'fruit' as another column # Group by 'fruit' and compute the mean price for each fruit mean_prices = df . groupby( 'fruit' )[ 'price' ] . mean() . reset_index() # Rename the columns for clarity mean_prices . columns = [ 'fruit' , 'mean_price' ] # Print the result print ( "Mean prices for each fruit:" ) print (mean_prices) Mean prices for each fruit: fruit mean_price 11
0 apple 11.333333 1 banana 8.666667 2 orange 9.000000 [20]: #11) #1)Create a df, for the values [0,100) with a seed 1, its shape is (8,10) import numpy as np import pandas as pd # Set the random seed np . random . seed( 1 ) # Create a DataFrame with random values in the range [0, 100) df = pd . DataFrame(np . random . randint( 0 , 100 , size = ( 8 , 10 ))) # Print the DataFrame print (df) 0 1 2 3 4 5 6 7 8 9 0 37 12 72 9 75 5 79 64 16 1 1 76 71 6 25 50 20 18 84 11 28 2 29 14 50 68 87 87 94 96 86 13 3 9 7 63 61 22 57 1 0 60 81 4 8 88 13 47 72 30 71 3 70 21 5 49 57 3 68 24 43 76 26 52 80 6 41 82 15 64 68 25 98 87 7 26 7 25 22 9 67 23 27 37 57 83 38 [21]: #11) #2) Create a new column 'largest2nd' with the second largest value of each row of df # Function to calculate the second largest value in a row def second_largest (row): sorted_row = sorted (row, reverse = True ) return sorted_row[ 1 ] # Create the 'largest2nd' column by applying the function to each row df[ 'largest2nd' ] = df . apply(second_largest, axis =1 ) # Print the DataFrame with the 'largest2nd' column print (df) 0 1 2 3 4 5 6 7 8 9 largest2nd 0 37 12 72 9 75 5 79 64 16 1 75 1 76 71 6 25 50 20 18 84 11 28 76 2 29 14 50 68 87 87 94 96 86 13 94 3 9 7 63 61 22 57 1 0 60 81 63 4 8 88 13 47 72 30 71 3 70 21 72 12
Your preview ends here
Eager to read the complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help
5 49 57 3 68 24 43 76 26 52 80 76 6 41 82 15 64 68 25 98 87 7 26 87 7 25 22 9 67 23 27 37 57 83 38 67 [22]: #12)For the same df created in Question 11-(1), use Pandas ufuncs to normalize all columns of df by subtracting the column mean and divide by standard deviation and keep the result within two decimal places. import pandas as pd import numpy as np # Create a DataFrame with random values np . random . seed( 1 ) df = pd . DataFrame(np . random . randint( 0 , 100 , size = ( 8 , 10 ))) # Normalize the DataFrame by subtracting the column mean and dividing by the standard deviation normalized_df = (df - df . mean()) / df . std() # Round the result to two decimal places normalized_df = normalized_df . round( 2 ) # Print the normalized DataFrame print (normalized_df) 0 1 2 3 4 5 6 7 8 9 0 0.12 -0.95 1.54 -1.87 0.84 -1.25 0.55 0.31 -0.99 -1.18 1 1.88 0.79 -0.82 -1.16 -0.10 -0.66 -1.14 0.84 -1.14 -0.27 2 -0.24 -0.89 0.75 0.75 1.29 1.97 0.96 1.15 1.17 -0.78 3 -1.14 -1.09 1.22 0.44 -1.15 0.80 -1.61 -1.37 0.37 1.52 4 -1.18 1.29 -0.57 -0.18 0.73 -0.27 0.33 -1.29 0.67 -0.51 5 0.66 0.38 -0.92 0.75 -1.08 0.25 0.46 -0.69 0.12 1.49 6 0.30 1.12 -0.49 0.57 0.58 -0.46 1.07 0.92 -1.27 -0.34 7 -0.42 -0.65 -0.71 0.70 -1.12 -0.38 -0.62 0.13 1.07 0.07 [23]: #13)Use the Planets dataset (taught in Pandas 4 lecture regarding groupby), count discovered planets by method and by decade: import seaborn as sns import pandas as pd # Load the Planets dataset planets = sns . load_dataset( 'planets' ) # Extract the year of discovery and create a 'decade' column planets[ 'decade' ] = (planets[ 'year' ] // 10 ) * 10 # Create a pivot table to count the discovered planets by method and decade 13
pivot_table = planets . pivot_table(index = 'method' , columns = 'decade' , values = 'number' , aggfunc = 'count' , fill_value =0 ) # Print the pivot table print (pivot_table) decade 1980 1990 2000 2010 method Astrometry 0 0 0 2 Eclipse Timing Variations 0 0 3 6 Imaging 0 0 20 18 Microlensing 0 0 10 13 Orbital Brightness Modulation 0 0 0 3 Pulsar Timing 0 3 1 1 Pulsation Timing Variations 0 0 1 0 Radial Velocity 1 28 309 215 Transit 0 0 62 335 Transit Timing Variations 0 0 0 4 [ ]: [ ]: 14