How can I add the functions from WebscrapingV3.py to my class as methods?
Here is my code:
from bs4 import BeautifulSoup
from collections import defaultdict
import requests

class WebScraper:
    def __init__(self, url):
        self.url = url
        self.data_dict = defaultdict(list)
        self.soup = None

    def __repr__(self):
        return f"WebScraper(url={self.url})"

    def __str__(self):
        return f"WebScraper object for {self.url}"

    def fetch_data(self):
        html_text = requests.get(self.url)
        self.soup = BeautifulSoup(html_text.content, "html.parser")

    def scrape_table(self, table_class):
        # "wikitable sortable" is two CSS classes, so join them with
        # dots to get a valid selector: ".wikitable.sortable tr"
        selector = "." + table_class.replace(" ", ".") + " tr"
        for row in self.soup.select(selector)[1:]:
            cells = row.select("td")
            if len(cells) < 3:  # skip header or short rows
                continue
            self.data_dict["Country Name"].append(cells[0].text.strip())
            self.data_dict["1980"].append(cells[1].text.strip())
            self.data_dict["2018"].append(cells[2].text.strip())

    def scrape_data(self, table_class):
        self.fetch_data()
        self.scrape_table(table_class)

    def get_data(self):
        return self.data_dict

if __name__ == "__main__":
    scraper = WebScraper('https://en.wikipedia.org/wiki/List_of_countries_by_carbon_dioxide_emissions_per_capita')
    scraper.scrape_data('wikitable sortable')
    data = scraper.get_data()
    print(data)
I have attached the WebscrapingV3.py file; it is transcribed below, and after the transcription I have sketched what I think the merged class might look like.
Transcribed Image Text: WebScrapingV3.py

# Install Beautiful Soup (BS4).
# From command line type:
#     python -m pip install bs4
#
# Install Matplotlib
# From command line type:
#     python -m pip install matplotlib
#
# Install shutil
# From command line type:
#     python -m pip install shutil
# (shutil ships with the Python standard library, so this last install is normally unnecessary.)

from urllib.request import urlopen
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import matplotlib.image as mpim
import re  # used by findAll() below
import requests
import shutil
import sys

def openHtmlFile(htmlfile):
    f = open(htmlfile, "r")
    contents = f.read()
    return BeautifulSoup(contents, 'html.parser')

def openHtmlSite(htmlsite):
    html = urlopen(htmlsite)
    return BeautifulSoup(html, 'html.parser')

soupf = openHtmlFile("index.html")  # local file
print(soupf.prettify())
print(soupf.h2)
print(soupf.head)
print(soupf.li)
print("HTML: {0}, name: {1}, text: {2}".format(soupf.h2, soupf.h2.name, soupf.h2.text))
print(25*'-')

souph = openHtmlSite('https://climate.nasa.gov/evidence/')
print(souph.prettify())
print(souph.h2)
print(souph.head)
print(souph.li)
print("HTML: {0}, name: {1}, text: {2}".format(souph.h2, souph.h2.name, souph.h2.text))
print(25*'=')

# headers and lists
print(soupf.h2)
print(soupf.head)
print(soupf.li)
print("HTML: {0}, name: {1}, text: {2}".format(soupf.h2, soupf.h2.name, soupf.h2.text))
print(souph.h2)
print(souph.head)
print(souph.li)
print("HTML: {0}, name: {1}, text: {2}".format(souph.h2, souph.h2.name, souph.h2.text))
print(25*'-')

# children: walk every tag in the parse tree and print its name
def children(html):
    [print(child.name) for child in html.recursiveChildGenerator() if child.name is not None]

children(soupf)
children(souph)
print(25*'=')

# find all: print each matching tag, then split its text on whitespace
# and collect the first two pieces as a key/value pair
def findAll(html, tags):
    pairs = {}
    for tag in html.find_all(tags):
        print("{0}: {1}".format(tag.name, tag.text))
        r = re.compile(r'\s')
        s = r.split(tag.text)
        pairs[s[0]] = s[1]
    for k, v in pairs.items():
        print('key= ', k, '\tvalue= ', v)

findAll(soupf, 'td')
findAll(souph, 'td')
print(25*'-')

# append tag
def appendTag(html, tag, nustr):
    newtag = html.new_tag(tag)
    newtag.string = nustr
    ultag = html.ul
    ultag.append(newtag)
    print(ultag.prettify())

appendTag(soupf, 'li', 'First')
appendTag(souph, 'li', 'First')
print(25*'=')

# inserting a new tag at position
def insertAt(html, tag, nustr, index):
    newtag = html.new_tag(tag)
    newtag.string = nustr
    ultag = html.ul
    ultag.insert(index, newtag)
    print(ultag.prettify())

insertAt(soupf, 'li', 'Second', 2)
insertAt(souph, 'li', 'Second', 2)
print(25*'=')

# select list element
def selectIndex(html, index):
    sel = "li:nth-of-type(" + str(index) + ")"
    print(html.select(sel))

selectIndex(soupf, 3)
selectIndex(souph, 3)
print(25*'=')

# select paragraph
def selectParagraph(html):
    spanElem = html.select('p')
    print(spanElem)
    for i in range(0, len(spanElem)):
        print(str(spanElem[i]))

selectParagraph(soupf)
selectParagraph(souph)
print(25*'=')

# select span
def selectSpan(html):
    spanElem = html.select('span')
    print(spanElem)
    for i in range(0, len(spanElem)):
        print(str(spanElem[i]))

selectSpan(soupf)
selectSpan(souph)
print(25*'-')

# download a text file linked from the Automate the Boring Stuff files page
def scrapefile():
    alink = "https://automatetheboringstuff.com/files/"
    page = requests.get(alink)
    soup = BeautifulSoup(page.content, 'html.parser')
    links = soup.select('a')
    htpfile = alink + links[20].text
    res = requests.get(htpfile)
    res.raise_for_status()
    playFile = open('Romeo And Juliet.txt', 'wb')
    for chunk in res.iter_content(100000):
        print(chunk)
        playFile.write(chunk)
    playFile.close()

scrapefile()
print(25*'=')

# download the second <img> on the page and display it with matplotlib
def scrapeimage():
    wpage = 'https://www.livescience.com/37821-greenhouse-gases.html'
    htmldata = urlopen(wpage)
    soup = BeautifulSoup(htmldata, 'html.parser')
    images = soup.find_all('img')
    image1 = images[1]
    src = image1.attrs['src']
    req = requests.get(src, stream=True)
    # end custom code
    if req.status_code == 200:  # 200 status code = OK
        with open("image1.jpg", 'wb') as f:
            req.raw.decode_content = True
            shutil.copyfileobj(req.raw, f)
    img = mpim.imread('image1.jpg')
    imgplot = plt.imshow(img)
    plt.show()

scrapeimage()
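From reading around, I think the idea is to drop each function's html parameter and have the method read self.soup instead, something like the sketch below. Is this the right pattern? This is just my untested guess: find_tags is my rename of findAll (so it doesn't clash with bs4's own find_all), and the len(parts) check is a guard I added, not something from the file.

from bs4 import BeautifulSoup
from collections import defaultdict
import re
import requests

class WebScraper:
    def __init__(self, url):
        self.url = url
        self.data_dict = defaultdict(list)
        self.soup = None

    def fetch_data(self):
        response = requests.get(self.url)
        self.soup = BeautifulSoup(response.content, "html.parser")

    # children() from WebscrapingV3.py folded in: the html parameter
    # is gone and the method walks self.soup instead
    def children(self):
        for child in self.soup.recursiveChildGenerator():
            if child.name is not None:
                print(child.name)

    # findAll() folded in, renamed so it doesn't shadow bs4's find_all;
    # same print-and-split logic, but it returns the collected pairs
    def find_tags(self, tags):
        found = {}
        for tag in self.soup.find_all(tags):
            print("{0}: {1}".format(tag.name, tag.text))
            parts = re.split(r"\s", tag.text)
            if len(parts) > 1:  # my guard: skip text with no second field
                found[parts[0]] = parts[1]
        return found

if __name__ == "__main__":
    scraper = WebScraper("https://climate.nasa.gov/evidence/")
    scraper.fetch_data()
    scraper.children()
    print(scraper.find_tags("td"))

If that is the right pattern, I assume appendTag, insertAt, selectIndex, and the rest would move over the same way, each taking self instead of html?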