How can I add the WebscrapingV3.py functions into my class?

Here is my code:

from bs4 import BeautifulSoup
from collections import defaultdict
import requests

class WebScraper:
    def __init__(self, url):
        self.url = url
        self.data_dict = defaultdict(list)
        self.soup = None

    def __repr__(self):
        return f"WebScraper(url={self.url})"

    def __str__(self):
        return f"WebScraper object for {self.url}"

    def fetch_data(self):
        html_text = requests.get(self.url)
        self.soup = BeautifulSoup(html_text.content, "html.parser")

    def scrape_table(self, table_class):
        # Turn a multi-class value such as "wikitable sortable" into the
        # compound selector ".wikitable.sortable tr". A bare space in the
        # selector would be read as a descendant combinator and match nothing.
        selector = "." + table_class.strip().replace(" ", ".") + " tr"
        for row in self.soup.select(selector)[1:]:
            cells = row.select("td")
            self.data_dict["Country Name"].append(cells[0].text.strip())
            self.data_dict["1980"].append(cells[1].text.strip())
            self.data_dict["2018"].append(cells[2].text.strip())

    def scrape_data(self, table_class):
        self.fetch_data()
        self.scrape_table(table_class)

    def get_data(self):
        return self.data_dict

if __name__ == "__main__":
    scraper = WebScraper('https://en.wikipedia.org/wiki/List_of_countries_by_carbon_dioxide_emissions_per_capita')
    scraper.scrape_data('wikitable sortable')
    data = scraper.get_data()
    print(data)

I have attached the WebscrapingV3.py file.

WebScrapingV3

Install Beautiful Soup (BS4).
From the command line type:
python -m pip install bs4

Install Matplotlib
From the command line type:
python -m pip install matplotlib

shutil ships with the Python standard library, so it does not need to be installed.

from urllib.request import urlopen
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import matplotlib.image as mpim
import re  # needed by findAll below
import requests
import shutil
import sys

def openHtmlFile(htmlfile):
    f = open(htmlfile, "r")
    contents = f.read()
    return BeautifulSoup(contents, 'html.parser')

def openHtmlSite(htmlsite):
    html = urlopen(htmlsite)
    return BeautifulSoup(html, 'html.parser')

soupf = openHtmlFile("index.html")  # local file
print(soupf.prettify())
print(soupf.h2)
print(soupf.head)
print(soupf.li)
print("HTML: {0}, name: {1}, text: {2}".format(soupf.h2, soupf.h2.name, soupf.h2.text))
print(25*'-')

souph = openHtmlSite('https://climate.nasa.gov/evidence/')
print(souph.prettify())
print(souph.h2)
print(souph.head)
print(souph.li)
print("HTML: {0}, name: {1}, text: {2}".format(souph.h2, souph.h2.name, souph.h2.text))
print(25*'=')

# headers and lists
print(soupf.h2)
print(soupf.head)
print(soupf.li)
print("HTML: {0}, name: {1}, text: {2}".format(soupf.h2, soupf.h2.name, soupf.h2.text))
print(souph.h2)
print(souph.head)
print(souph.li)
print("HTML: {0}, name: {1}, text: {2}".format(souph.h2, souph.h2.name, souph.h2.text))
print(25*'-')

# children
def children(html):
    [print(child.name) for child in html.recursiveChildGenerator() if child.name is not None]

children(soupf)
children(souph)
print(25*'==')

# find all
def findAll(html, tags):
    dict = {}
    for tag in html.find_all(tags):
        print("{0}: {1}".format(tag.name, tag.text))
        r = re.compile(r'\s')
        s = r.split(tag.text)
        dict[s[0]] = s[1]
    for k, v in dict.items():
        print('key= ', k, '\tvalue= ', v)

findAll(soupf, 'td')
findAll(souph, 'td')
print(25*'-')

# append tag
def appendTag(html, tag, nustr):
    # use the html argument, not the global soupf, so the function
    # works for both the local file and the live page
    newtag = html.new_tag(tag)
    newtag.string = nustr
    ultag = html.ul
    ultag.append(newtag)
    print(ultag.prettify())

appendTag(soupf, 'li', 'First')
appendTag(souph, 'li', 'First')
print(25*'==')

# inserting a new tag at position
def insertAt(html, tag, nustr, index):
    newtag = html.new_tag(tag)
    newtag.string = nustr
    ultag = html.ul
    ultag.insert(index, newtag)
    print(ultag.prettify())

insertAt(soupf, 'li', 'Second', 2)
insertAt(souph, 'li', 'Second', 2)
print(25*'=')

# select list element
def selectIndex(html, index):
    sel = "li:nth-of-type(" + str(index) + ")"
    print(html.select(sel))

selectIndex(soupf, 3)
selectIndex(souph, 3)
print(25*'=')

# select paragraph
def selectParagraph(html):
    spanElem = html.select('p')
    print(spanElem)
    for i in range(0, len(spanElem)):
        print(str(spanElem[i]))

selectParagraph(soupf)
selectParagraph(souph)
print(25*'=')

# select span
def selectSpan(html):
    spanElem = html.select('span')
    print(spanElem)
    for i in range(0, len(spanElem)):
        print(str(spanElem[i]))

selectSpan(soupf)
selectSpan(souph)
print(25*'-')

def scrapefile():
    alink = "https://automatetheboringstuff.com/files/"
    page = requests.get(alink)
    soup = BeautifulSoup(page.content, 'html.parser')
    links = soup.select('a')
    htpfile = alink + links[20].text
    res = requests.get(htpfile)
    res.raise_for_status()
    playFile = open('Romeo And Juliet.txt', 'wb')
    for chunk in res.iter_content(100000):
        print(chunk)
        playFile.write(chunk)
    playFile.close()

scrapefile()
print(25*'=')

def scrapeimage():
    wpage = 'https://www.livescience.com/37821-greenhouse-gases.html'
    htmldata = urlopen(wpage)
    soup = BeautifulSoup(htmldata, 'html.parser')
    images = soup.find_all('img')
    image1 = images[1]
    src = image1.attrs['src']
    req = requests.get(src, stream=True)
    # end custom code
    if req.status_code == 200:  # 200 status code = OK
        with open("image1.jpg", 'wb') as f:
            req.raw.decode_content = True
            shutil.copyfileobj(req.raw, f)
    img = mpim.imread('image1.jpg')
    imgplot = plt.imshow(img)
    plt.show()

scrapeimage()
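One way to merge the two (a minimal sketch, not the only design): every function in WebscrapingV3.py takes an already-parsed BeautifulSoup object as its first parameter, so each can become a method of WebScraper that reads self.soup (filled in by fetch_data) instead. The sketch below folds in a few representative functions; the snake_case method names are renamings of the originals, and a small length guard is added in find_all_tags so single-word tag text does not raise an IndexError.

from bs4 import BeautifulSoup
from collections import defaultdict
import re
import requests

class WebScraper:
    def __init__(self, url):
        self.url = url
        self.data_dict = defaultdict(list)
        self.soup = None

    def fetch_data(self):
        response = requests.get(self.url)
        self.soup = BeautifulSoup(response.content, "html.parser")

    # --- folded in from WebscrapingV3.py: the html parameter becomes self.soup ---

    def children(self):
        # print the tag name of every named descendant of the page
        for child in self.soup.recursiveChildGenerator():
            if child.name is not None:
                print(child.name)

    def find_all_tags(self, tags):
        # print "name: text" for each match, then split the text on
        # whitespace into key/value pairs, as findAll did
        pairs = {}
        for tag in self.soup.find_all(tags):
            print("{0}: {1}".format(tag.name, tag.text))
            parts = re.split(r'\s', tag.text)
            if len(parts) > 1:  # guard: the original assumed at least two words
                pairs[parts[0]] = parts[1]
        for k, v in pairs.items():
            print('key= ', k, '\tvalue= ', v)

    def select_index(self, index):
        # print the index-th <li> element (nth-of-type counts from 1)
        print(self.soup.select("li:nth-of-type(" + str(index) + ")"))

    def select_paragraph(self):
        # print every <p> element, one per line
        for elem in self.soup.select('p'):
            print(str(elem))

if __name__ == "__main__":
    scraper = WebScraper('https://climate.nasa.gov/evidence/')
    scraper.fetch_data()  # must run first so self.soup is populated
    scraper.children()
    scraper.select_paragraph()

The remaining functions follow the same pattern: appendTag and insertAt become methods that modify self.soup, while scrapefile and scrapeimage fetch their own URLs, so they could either stay module-level or take a URL parameter instead of using self.url.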