introduction_to_nlp

py

School

University of Central Florida *

*We aren’t endorsed by this school

Course

5610

Subject

Computer Science

Date

Apr 3, 2024

Type

py

Pages

3

Uploaded by madhu_iitm

Report
# -*- coding: utf-8 -*- """Introduction to NLP.ipynb Automatically generated by Colaboratory. Original file is located at ## Imports """ import numpy as np from sklearn.feature_extraction.text import CountVectorizer from scipy.spatial import distance import pprint from sklearn.feature_extraction.text import TfidfVectorizer """## Generate a Bag of Words (BoW) representation (model) or the document""" hd_text = \ '''Humpty Dumpty sat on a wall. Humpty Dumpty had a great fall. All the king's horses and all the king's men Couldn't put Humpty together again.''' print(hd_text) # The "corpus": tokenized_hd = hd_text.split('\n') print (tokenized_hd) """## Create an instance of CountVectorizer with the English stop_words filter""" # An instance of CountVectorizer will transform the corpus into a BoW vectorizer = CountVectorizer(stop_words='english') pprint.pprint (vectorizer.get_params()) # The stop words used by the engligh stop words filter: print(vectorizer.get_stop_words(), end=' ') print() print (len(vectorizer.get_stop_words())) BoW = vectorizer.fit_transform(tokenized_hd) words = vectorizer.get_feature_names_out() print (f'The words (features): \n{words}') vectorizer.vocabulary_ """### *Q1: ```With the stop words removed (the stop_words='english' parameter), we have a much smaller vocabulary. Also, some text normalization has been applied as well: king's --> king; couldn't --> couldn.``` <br>How, in your opinion, this "normalization" was performed?<br>Hint: look at the output of the get_params() method.*""" """## The BoW matrix""" # The Bag of Words matrix
# The visual presentation: convert the sparse BoW to a dense array for display.
bow_matrix = BoW.toarray()
print(bow_matrix)

# The type: scipy.sparse._csr.csr_matrix, used to minimize the memory footprint.
print(type(BoW))

"""## How to read a BoW matrix"""
# That's the BoW index
print(words)
# In the 3rd sentence we have 2 occurrences of a word in the 6th position in our BoW.
print(words[6])  # All the (#1)king's horses and all the (#2)king's men

"""## Define Cosine Similarity (Distance)"""
# FIX: the original used np.math.pi / np.math.cos / np.math.sin. `np.math`
# was an accidental re-export of the stdlib math module, deprecated in
# NumPy 1.25 and removed in NumPy 2.0 — use `math` directly instead.
import math

# 2-D vectors with the base at origin: (0,0).
# Coordinates [x, y] below are those of the vector's end.
A = np.array([1, 0])   # x is 1, y is 0 -- the vector points in the direction of the X axis
B = np.array([-1, 0])
C = np.array([0, 1])
pi = math.pi
# from school geometry ...
D = np.array([math.cos(pi / 3), math.sin(pi / 3)])  # pi/3 radians == 60 degrees
print(D)
print(np.linalg.norm(D))

def cosine_similarity(A, B):
    """Return the cosine of the angle between vectors A and B.

    Computes dot(A, B) / (||A|| * ||B||) using L2 norms.
    """
    A_L2 = np.linalg.norm(A)
    B_L2 = np.linalg.norm(B)
    AB = np.sum(A * B)
    cos_sim = AB / (A_L2 * B_L2)
    return cos_sim

# Pointing in the same direction as cos(0) == 1.0
print(cosine_similarity(A, 2 * A))
print(cosine_similarity(4 * B, B))
# Diametrically opposed
print(cosine_similarity(A, B))
print(cosine_similarity(B, A))
# Orthogonal
cosine_similarity(A, C), cosine_similarity(C, B)
# Approximation of cos(pi/3) == cosine for 60 degrees
cosine_similarity(A, D)
# Repeat the above calculation
# Note that cosine_distance == (1 - cosine_similarity)
distance.cosine(A, 2 * A), distance.cosine(A, B), distance.cosine(D, A)
"""### Find cosine distance between sentences""" # Finding the cosine similarities with the first (0th) sentence. first = 0 next = 1 A = bow_matrix[0] for w in bow_matrix[1:]: #B = w.toarray()[0] #print (f'{j}:\t{B}') cs = np.round(cosine_similarity(A,w),3) print(f"The cosine distance between\n'{tokenized_hd[first]}'\ n'{tokenized_hd[next]}'\n==>{cs}") print(f'') next += 1 """### *Q2: How would you explain a cosine distance of 0.0 in the middle of the above output?*""" """## TF-IDF""" print(tokenized_hd) tfidf_vectorizer = TfidfVectorizer() tfidf_vectorizer.get_params() """### *Q3: What does the above output tell you?*""" # Tf-idf-weighted document-term matrix. tfidf_sm = tfidf_vectorizer.fit_transform(tokenized_hd) print(tfidf_sm.toarray()) print (tokenized_hd[2]) print(tfidf_sm.toarray()[2]) features = tfidf_vectorizer.get_feature_names_out() print (f"Size of BoW: {len(features)}") print(tfidf_vectorizer.vocabulary_, end=' ') print(tfidf_vectorizer.idf_) print(len(tfidf_vectorizer.idf_)) for i,f in enumerate(features): print (f'{i}:{f}', end=' ') print ("\n" + "-" * 140) print (tfidf_sm) """*End of the lab notebook*"""
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help