HW02 NLP

docx

School

Ramapo College of New Jersey *

*We aren’t endorsed by this school

Course

01

Subject

Electrical Engineering

Date

Apr 3, 2024

Type

docx

Pages

13

Uploaded by neel08852

Report
# Neel Patel
# Problem 1 — tokenize the first 1,000 characters of an uploaded text file,
# count the words, and display each unique word with its frequency.
from textblob import TextBlob
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from google.colab import files

# Upload the data file.
uploaded_file = files.upload()

# Read only the first uploaded file, even if several were uploaded.
file_content = ""
for file_name in uploaded_file.keys():
    with open(file_name, 'r') as file:
        file_content = file.read()
    break

# Work with only the first 1,000 characters of the text.
sample_text = file_content[:1000]

# Create a TextBlob object and tokenize the sample into words.
text_blob = TextBlob(sample_text)
words = text_blob.words

# Count total words and the frequency of each unique word.
# FIX: the original loop compared the running *word* count against 1000 to
# "stop after the first 1,000 characters" — a category error, and dead code
# anyway since sample_text is already truncated to 1,000 characters. It also
# printed a blank line every 10 words without ever printing the words.
words_count = 0
word_dict = {}
for word in words:
    words_count += 1
    word_dict[word] = word_dict.get(word, 0) + 1

# Print the total number of words found.
print('Count of words =', words_count)
print()

# Display word number, word, and its count — three entries per line.
for i, word in enumerate(word_dict.keys(), start=1):
    count = word_dict[word]
    print(f"{i}. {word} (Count: {count})", end=", ")
    if i % 3 == 0:
        print()

# Output:
# 2)
# Problem 2 — count every word in RomeoJuliet.txt.
# Import the required modules.
from textblob import TextBlob
import nltk
nltk.download('punkt')
from google.colab import files

# Assumes "RomeoJuliet.txt" has been uploaded to the current directory.
with open('RomeoJuliet.txt', 'r') as file:
    sample_text = file.read()

# Tokenize with TextBlob and report the total word count.
text_blob = TextBlob(sample_text)
words = text_blob.words
total_words = len(words)
print(f"Total number of words in the text: {total_words}")

# Output:

# 3)
# Problem 3 — total and unique word counts (continues after the page break).
# Import the required modules.
from textblob import TextBlob
import nltk
nltk.download('punkt')
from google.colab import files

# Assumes "RomeoJuliet.txt" has been uploaded to the current directory.
with open('RomeoJuliet.txt', 'r') as file:
    sample_text = file.read()

# Create a TextBlob object (continued below).
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help
# Problem 3 (continued) — total words and unique words in the text.
text_blob = TextBlob(sample_text)

# Tokenize the text into words using TextBlob.
words = text_blob.words

# Count the total number of words in the entire text.
total_words = len(words)
print(f"Total number of words in the text: {total_words}")

# Count the total number of unique words.
# FIX: the original used len(list(text_blob.word_counts.items())) — the
# mapping already knows its own size; no throwaway list is needed.
total_unique_words = len(text_blob.word_counts)
print(f"Total unique words in the text: {total_unique_words}")

# Output:

# 4)
# Problem 4 — word counts with and without stop words.
from textblob import TextBlob
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from google.colab import files

# Upload the data file (e.g. "RomeoJuliet.txt").
uploaded_file = files.upload()

# Read the text of the first uploaded file.
# FIX: the original looped over every uploaded file, decoding each and
# silently keeping only the last; take the first one explicitly, matching
# Problem 1's behavior.
sample_text = None
for file_name in uploaded_file.keys():
    sample_text = uploaded_file[file_name].decode("utf-8")
    break

# Create a TextBlob object.
text_blob = TextBlob(sample_text)
# Problem 4 (continued) — totals with and without stop words.

# Tokenize the text into words using TextBlob.
words = text_blob.words

# Count the total number of words in the entire text.
total_words = len(words)
print(f"Total number of words in the text: {total_words}")

# Count the total number of unique words in the text.
total_unique_words = len(set(words))
print(f"Total unique words in the text: {total_unique_words}")

# Unique words remaining after removing stop words (case-insensitive).
stop_words = set(stopwords.words('english'))
words_no_stop = [word for word in words if word.lower() not in stop_words]
total_unique_words_no_stop = len(set(words_no_stop))
print('Total number of unique words in the text after removing stop words =',
      total_unique_words_no_stop)
print()

# Number of distinct stop words that occur in the text.
# FIX: the original intersected the raw (possibly capitalized) tokens with
# the all-lowercase stop list, silently missing tokens like "The"; lowercase
# the tokens first, consistent with the removal step above.
total_stop_words = len({word.lower() for word in words} & stop_words)
print('Total number of stop words in the text =', total_stop_words)

# Output:

# 5) — imports for Problem 5.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
# Problem 5 — top-10 most frequent words, displayed with pandas.
# FIX: the original referenced word_freq, numberTopWords, pd,
# read_text_from_file and word_counts without defining them (the needed
# definitions were commented out), and sorted frequencies before the text
# was even read. Reordered into a working script and restored the
# commented-out definitions.
from operator import itemgetter
import pandas as pd

# Replace 'RomeoJuliet.txt' with the path to your actual .txt file.
file_path = 'RomeoJuliet.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Tokenize the lower-cased text into words.
words = nltk.word_tokenize(text.lower())

# Frequency of each token.
word_freq = Counter(words)

# Sort the dictionary by frequency in descending order.
sorted_items = sorted(word_freq.items(), key=itemgetter(1), reverse=True)

# Pull out the top 10 entries from the sorted list.
numberTopWords = 10
top10Words = sorted_items[:numberTopWords]

# Display the top 10 words using pandas.
df = pd.DataFrame(top10Words, columns=['word', 'frequency'])
print(df)

# Output:
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help
# 6)
# Problem 6 — top-10 non-stop-words by frequency.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter


# Function to read the contents of a text file.
def read_text_from_file(file_path):
    """Return the entire contents of the UTF-8 text file at file_path."""
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text


# Replace 'RomeoJuliet.txt' with the path to your actual .txt file.
file_path = 'RomeoJuliet.txt'
text = read_text_from_file(file_path)

# Lower-case the text and tokenize it into words.
words = word_tokenize(text.lower())

# Keep only alphabetic tokens that are not stop words.
stop_words = set(stopwords.words("english"))
filtered_words = [w for w in words if w.isalpha() and w not in stop_words]

# Tally the occurrences of each unique word and take the ten most common.
word_counts = Counter(filtered_words)
top_10_words = word_counts.most_common(10)
# Problem 6 (continued) — print the top-10 words in two columns.
print("Top-10 words and their frequencies:")
print("Word | Frequency")
print("----------------------")
for word, frequency in top_10_words:
    # FIX: the original spec "{word :<10 }" is not a valid format
    # specifier (trailing space in the spec raises ValueError);
    # left-align the word in a 10-character field.
    print(f"{word:<10} | {frequency}")

# Output:

# 7)
# Problem 7 — "noun phrases" in the first 1,000 characters.
import re
from textblob import TextBlob

# Read the first 1,000 characters of the file.
with open("RomeoJuliet.txt", "r") as file:
    text = file.read()[:1000]

# NOTE(review): despite the comment, this pattern only matches single
# capitalized words — it is not a real noun-phrase detector (TextBlob's
# blob.noun_phrases would be); the assignment's regex is kept as-is.
noun_phrase_pattern = re.compile(r'\b[A-Z][a-z]*\b')

# Find all matches in the text.
noun_phrases = noun_phrase_pattern.findall(text)

# FIX: the original also assigned blob = TextBlob(text) and never used it;
# the dead assignment was removed.
# Problem 7 (continued) — report and list every match.
# FIX: the original built a {phrase: count} dictionary "to store the noun
# phrases and their frequencies" but never used it in any output — dead
# code, removed.

# Print the total number of matches found.
print('Total Noun Phrases =', len(noun_phrases))
print()

# Display all matches, three per line, with numbering.
for i, noun_phrase in enumerate(noun_phrases, start=1):
    print(f"{i}. {noun_phrase}", end=", ")
    if i % 3 == 0:
        print()

# Output:

# 8)
# Problem 8 — count capitalized-word matches over the whole file.
import re

# Read the content of the file.
with open("RomeoJuliet.txt", "r") as file:
    text = file.read()
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help
# Problem 8 (continued) — count the matches of the capitalized-word pattern.
noun_phrase_pattern = re.compile(r'\b[A-Z][a-z]*\b')

# Find every match in the text and report how many there are.
noun_phrases = noun_phrase_pattern.findall(text)
total_noun_phrases = len(noun_phrases)
print('Total Noun Phrases =', total_noun_phrases)

# Output:

# 9)
# Problem 9 — frequency of each match, plus the overall total.
import re
from collections import Counter

# Read the content of the file.
with open("RomeoJuliet.txt", "r") as file:
    text = file.read()

# Same capitalized-word pattern as Problems 7 and 8.
noun_phrase_pattern = re.compile(r'\b[A-Z][a-z]*\b')
noun_phrases = noun_phrase_pattern.findall(text)

# Tally how often each match occurs.
noun_phrase_counter = Counter(noun_phrases)

# Report the overall number of matches.
total_noun_phrases = len(noun_phrases)
print('Total Noun Phrases =', total_noun_phrases)
print()
# Problem 9 (continued) — show the ten most frequent matches.
print("Top-10 Noun Phrases with Highest Frequency:")
for noun_phrase, frequency in noun_phrase_counter.most_common(10):
    print(f"{noun_phrase} - Frequency: {frequency}")

# Output:

# 10)
# Problem 10 — list every word containing the substring "WA".
import re

# Read the content of the file.
with open("RomeoJuliet.txt", "r") as file:
    text = file.read()

# Pattern for words containing "WA" (case-sensitive, so "away" will not
# match — only uppercase "WA" does).
word_pattern = re.compile(r'\b\w*WA\w*\b')

# Find all matching words in the text.
words_with_wa = word_pattern.findall(text)

# Print each matching word on its own line.
print("Words containing 'WA':")
for word in words_with_wa:
    print(word)

# Output:
# 11)
# Problem 11 — visualizing word frequencies with WordClouds.
from google.colab import files

# First upload the mask file.
# FIX: the original called files.upload() four times in a row; a single
# call is enough — it blocks until the upload completes.
uploaded_mask = files.upload()
file_name = list(uploaded_mask.keys())[0]
print()
print("The Mask File Name = : " + str(file_name))

import imageio
mask_image = imageio.imread(str(file_name))

###################################################
from wordcloud import WordCloud

# Upload the text file (e.g. 'RomeoJuliet.txt') and decode its content.
uploaded_text = files.upload()
text_data = uploaded_text[list(uploaded_text.keys())[0]].decode('utf-8')

# FIX: the original called wordcloud.generate(...) on an undefined name —
# no WordCloud instance was ever constructed — and the mask image loaded
# above was never used. Build the cloud with the uploaded mask first.
wordcloud = WordCloud(mask=mask_image)
wordcloudImage = wordcloud.generate(text_data)
print(type(wordcloudImage))

# Save the rendered cloud to a PNG file.
wordcloud_file = wordcloudImage.to_file('RomeoJuliet.png')
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help
# Problem 11 (continued) — display the saved word-cloud image.
wordcloudImage.to_image()

from PIL import Image
import matplotlib.pyplot as plt

# Re-open the PNG written above and show it on a matplotlib axis.
im = Image.open('RomeoJuliet.png')
fig, aux = plt.subplots()
aux.imshow(im)

# Output: