HG_Scraping_Urls

py

School

Stony Brook University *

*We aren’t endorsed by this school

Course

214

Subject

Computer Science

Date

Jan 9, 2024

Type

py

Pages

3

Uploaded by MinisterScience5603

Report
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Text, DateTime
from bs4 import BeautifulSoup
from selenium import webdriver
import time

# Database configuration: local SQLite file with previously-scraped records.
DATABASE_URL = "sqlite:///sites_crawled (1).db3"
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
Base = declarative_base()


class DoctorScrapedInfo(Base):
    """ORM mapping for the existing ``scraped_doctors_info`` table.

    Only the ``name`` column is queried by this script; the remaining
    columns mirror the pre-existing database schema.
    """
    __tablename__ = 'scraped_doctors_info'

    id = Column(Integer, primary_key=True)
    site_crawled_id = Column(Integer, nullable=False)
    page = Column(String(1024), nullable=False)
    site = Column(String(1024), nullable=False)
    website = Column(String(1024))
    name = Column(String(1024))
    title = Column(String(1024))
    first_name = Column(String(1024))
    middle_name = Column(String(1024))
    last_name = Column(String(1024))
    personal_suffix = Column(String(1024))
    clinical_suffix = Column(String(1024))
    specialty = Column(String(1024))
    university = Column(String(1024))
    street = Column(String(1024))
    street_internal = Column(String(1024))
    city = Column(String(1024))
    state = Column(String(1024))
    zip_code = Column(String(1024))
    phone_number = Column(String(1024))


# Shared Selenium Chrome driver used by every scraping helper in this script.
driver = webdriver.Chrome()
driver.maximize_window()


def extract_urls(url):
    """Return provider-profile URLs from one Healthgrades results page.

    Loads *url* in the shared Chrome driver, parses the rendered HTML, and
    keeps only listings whose address text contains ZIP code 11772
    (Patchogue, NY).

    Returns an empty list when the results container is missing, so the
    caller's empty-page break condition also handles pages past the end.
    """
    driver.get(url)
    time.sleep(5)  # crude wait for the JS-rendered results to appear
    soup = BeautifulSoup(driver.page_source, "html.parser")
    ul_text = soup.find("ul", class_="results__results-cards")
    # FIX: guard against a missing results <ul> (e.g. past the last page)
    # instead of crashing with AttributeError on find_all.
    if ul_text is None:
        return []
    urls = []
    for list_item in ul_text.find_all("li"):
        # FIX: the class name previously contained a stray space
        # ("location-info-address__city- state"), which can never match.
        address = list_item.find("span", class_="location-info-address__city-state")
        if address is not None and "11772" in address.text:
            urls.append("https://healthgrades.com" + list_item.find("a").get("href"))
    return urls
def get_doctor_name(url):
    """Load a provider-profile page and return the displayed doctor name.

    Returns the string "None" (matching the original behavior) when the
    name heading is not found on the page.
    """
    driver.get(url)
    time.sleep(5)  # wait for client-side rendering
    soup = BeautifulSoup(driver.page_source, "html.parser")
    doctor_name = soup.find('h1', {'data-qa-target': 'ProviderDisplayName'})
    return doctor_name.text if doctor_name else "None"


def check_overlap(doctor_names):
    """Return the set of names in *doctor_names* already present in the DB.

    Opens a short-lived session, pulls every stored doctor name, and
    intersects it with the scraped names.
    """
    session = SessionLocal()
    try:
        # Query all the doctor names from the database.
        db_names = session.query(DoctorScrapedInfo.name).all()
        # .all() yields 1-tuples; unpack before intersecting.
        overlapping_names = set(doctor_names) & {name[0] for name in db_names}
    finally:
        # FIX: close the session even if the query raises (was leaked before).
        session.close()
    return overlapping_names


def main():
    """Crawl Healthgrades search results, collect profile URLs and names,
    and report which scraped names already exist in the local database."""
    # FIX: the URL previously contained stray spaces ("Family %20Medicine",
    # "%2C- 73.011177") that corrupted the query string.
    base_url = (
        'https://www.healthgrades.com/usearch?what=Family%20Medicine'
        '&where=Patchogue%2C%20NY&pt=40.762199%2C-73.011177'
        '&pageNum={page_num}&sort.provider=bestmatch&state=NY'
    )
    urls = []

    # Determine the last page number from page 1's pagination widget.
    driver.get(base_url.format(page_num=1))
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    pagination_list = soup.find("div", class_="results__pagination")
    list_items = pagination_list.find_all('li')
    if len(list_items) >= 2:
        # The next-to-last <li> carries the highest page number.
        page_limit = int(list_items[-2].text)
    else:
        page_limit = 1

    page_num = 1
    while page_num <= page_limit:
        current_page_urls = extract_urls(base_url.format(page_num=page_num))
        if len(current_page_urls) == 0:
            break  # no matching listings left — stop paging early
        urls += current_page_urls
        page_num += 1

    print("Total count of URLs:", len(urls))

    # Scrape each profile page for the doctor's displayed name.
    scraped_names = [get_doctor_name(url) for url in urls]
    # Compare against names already stored in the old database.
    overlapping_names = check_overlap(scraped_names)

    # Print the total number of URLs which are listed.
    print("Total count of URLs:", len(urls))
    # Print each URL with the associated doctor's name.
    for url, name in zip(urls, scraped_names):
        print(url)
        print("Doctor Name:", name)

    # Print the overlapping names.
    print("Overlapping names:")
    for name in overlapping_names:
        print(name)


# FIX: guard the entry point so importing this module does not start a crawl.
if __name__ == "__main__":
    main()
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help