HG_Scraping_Urls

py

School

Stony Brook University *

*We aren’t endorsed by this school

Course

214

Subject

Computer Science

Date

Jan 9, 2024

Type

py

Pages

3

Uploaded by MinisterScience5603

Report
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Text, DateTime
from bs4 import BeautifulSoup
from selenium import webdriver
import time

# Database configuration: local SQLite file with previously-scraped records.
DATABASE_URL = "sqlite:///sites_crawled (1).db3"
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
Base = declarative_base()


class DoctorScrapedInfo(Base):
    """ORM mapping for the existing ``scraped_doctors_info`` table.

    Only the ``name`` column is queried by this script; the remaining
    columns mirror the pre-existing database schema.
    """
    __tablename__ = 'scraped_doctors_info'

    id = Column(Integer, primary_key=True)
    site_crawled_id = Column(Integer, nullable=False)
    page = Column(String(1024), nullable=False)
    site = Column(String(1024), nullable=False)
    website = Column(String(1024))
    name = Column(String(1024))
    title = Column(String(1024))
    first_name = Column(String(1024))
    middle_name = Column(String(1024))
    last_name = Column(String(1024))
    personal_suffix = Column(String(1024))
    clinical_suffix = Column(String(1024))
    specialty = Column(String(1024))
    university = Column(String(1024))
    street = Column(String(1024))
    street_internal = Column(String(1024))
    city = Column(String(1024))
    state = Column(String(1024))
    zip_code = Column(String(1024))
    phone_number = Column(String(1024))


# Shared Selenium Chrome driver used by every scraping helper in this script.
driver = webdriver.Chrome()
driver.maximize_window()


def extract_urls(url):
    """Return provider-profile URLs from one Healthgrades results page.

    Loads *url* in the shared Chrome driver, parses the rendered HTML, and
    keeps only listings whose address text contains ZIP code 11772
    (Patchogue, NY).

    Returns an empty list when the results container is missing, so the
    caller's empty-page break condition also handles pages past the end.
    """
    driver.get(url)
    time.sleep(5)  # crude wait for the JS-rendered results to appear
    soup = BeautifulSoup(driver.page_source, "html.parser")
    ul_text = soup.find("ul", class_="results__results-cards")
    # FIX: guard against a missing results <ul> (e.g. past the last page)
    # instead of crashing with AttributeError on find_all.
    if ul_text is None:
        return []
    urls = []
    for list_item in ul_text.find_all("li"):
        # FIX: the class name previously contained a stray space
        # ("location-info-address__city- state"), which can never match.
        address = list_item.find("span", class_="location-info-address__city-state")
        if address is not None and "11772" in address.text:
            urls.append("https://healthgrades.com" + list_item.find("a").get("href"))
    return urls
def get_doctor_name(url):
    """Load a provider-profile page and return the displayed doctor name.

    Returns the string "None" (matching the original behavior) when the
    name heading is not found on the page.
    """
    driver.get(url)
    time.sleep(5)  # wait for client-side rendering
    soup = BeautifulSoup(driver.page_source, "html.parser")
    doctor_name = soup.find('h1', {'data-qa-target': 'ProviderDisplayName'})
    return doctor_name.text if doctor_name else "None"


def check_overlap(doctor_names):
    """Return the set of names in *doctor_names* already present in the DB.

    Opens a short-lived session, pulls every stored doctor name, and
    intersects it with the scraped names.
    """
    session = SessionLocal()
    try:
        # Query all the doctor names from the database.
        db_names = session.query(DoctorScrapedInfo.name).all()
        # .all() yields 1-tuples; unpack before intersecting.
        overlapping_names = set(doctor_names) & {name[0] for name in db_names}
    finally:
        # FIX: close the session even if the query raises (was leaked before).
        session.close()
    return overlapping_names


def main():
    """Crawl Healthgrades search results, collect profile URLs and names,
    and report which scraped names already exist in the local database."""
    # FIX: the URL previously contained stray spaces ("Family %20Medicine",
    # "%2C- 73.011177") that corrupted the query string.
    base_url = (
        'https://www.healthgrades.com/usearch?what=Family%20Medicine'
        '&where=Patchogue%2C%20NY&pt=40.762199%2C-73.011177'
        '&pageNum={page_num}&sort.provider=bestmatch&state=NY'
    )
    urls = []

    # Determine the last page number from page 1's pagination widget.
    driver.get(base_url.format(page_num=1))
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    pagination_list = soup.find("div", class_="results__pagination")
    list_items = pagination_list.find_all('li')
    if len(list_items) >= 2:
        # The next-to-last <li> carries the highest page number.
        page_limit = int(list_items[-2].text)
    else:
        page_limit = 1

    page_num = 1
    while page_num <= page_limit:
        current_page_urls = extract_urls(base_url.format(page_num=page_num))
        if len(current_page_urls) == 0:
            break  # no matching listings left — stop paging early
        urls += current_page_urls
        page_num += 1

    print("Total count of URLs:", len(urls))

    # Scrape each profile page for the doctor's displayed name.
    scraped_names = [get_doctor_name(url) for url in urls]
    # Compare against names already stored in the old database.
    overlapping_names = check_overlap(scraped_names)

    # Print the total number of URLs which are listed.
    print("Total count of URLs:", len(urls))
    # Print each URL with the associated doctor's name.
    for url, name in zip(urls, scraped_names):
        print(url)
        print("Doctor Name:", name)

    # Print the overlapping names.
    print("Overlapping names:")
    for name in overlapping_names:
        print(name)


# FIX: guard the entry point so importing this module does not start a crawl.
if __name__ == "__main__":
    main()
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help