How do I use web-scraping techniques to retrieve https://www.gutenberg.org/files/11/old/alice30.txt? Then update the following code (shown below) to read and print out the unique words in each chapter with the aid of a
dictionary and a set.
from re import split
def main():
    """Print, in sorted order, each document word missing from the word list."""
    # Load the reference vocabulary and the document body (prefix skipped).
    known_words = read_words("words", "")
    document_words = read_words("alice30.txt", "*END*")
    # Words present in the document but absent from the vocabulary are
    # flagged as possible misspellings.
    for word in sorted(document_words - known_words):
        print(word)
def read_words(filename, skip_until):
    """Return the set of lowercased words found in a file.

    Lines are ignored until one containing the substring ``skip_until`` is
    seen; that marker line itself is not processed.  If ``skip_until`` is
    the empty string, no lines are skipped.  (The original version dropped
    the file's first line in that case, because ``line.find("") >= 0`` is
    true for every line, so the marker branch consumed line one.)

    filename   -- path of the text file to read
    skip_until -- substring marking the end of the prefix to skip;
                  "" means process the whole file
    Returns a set of lowercase words (maximal runs of ASCII letters).
    """
    word_set = set()
    # An empty marker means "skip nothing": start processing immediately.
    skip = bool(skip_until)
    # Context manager guarantees the file is closed even if an error occurs.
    with open(filename, "r") as input_file:
        for line in input_file:
            line = line.strip()
            if not skip:
                # Use any character other than a-z or A-Z as a word delimiter.
                parts = split("[^a-zA-Z]+", line)
                for word in parts:
                    if len(word) > 0:
                        word_set.add(word.lower())
            elif line.find(skip_until) >= 0:
                skip = False
    return word_set
# Run the driver only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
file: https://cse2050.drfitz.fit/data/very/stop_words.txt. Also, copy and paste the stop-words into the
code. I can use regular expressions to split the book into chapters (e.g. `CHAPTER [A-Z]+`). Print a count of unique words found in each chapter.
Example dictionary:
{
"Chapter I: Down the Rabbit-Hole": {word1, word2, word3},
"Chapter II: The Pool of Tears": {word1, word2, word3},
"Chapter III: A Caucus-Race and a Long Tale": {word2, word3, word5},
...
}
CHAPTER CHAPTER TITLE UNIQUE WORD COUNT
Chapter I Down the Rabbit-Hole x
Chapter II The Pool of Tears y
Chapter III A Caucus-Race and a Long Tale z
...

Trending now
This is a popular solution!
Step by step
Solved in 1 step with 1 image









