Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 6 additions & 185 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
"""
DoctolibDataScraper - Automated Doctolib.fr profile data extraction tool.

Scrapes doctor profiles from Doctolib search results, extracting names,
Expand All @@ -8,14 +7,14 @@
Contact: contact@soclose.co
License: MIT
Repository: https://github.com/SoCloseSociety/DoctolibDataScraper
"""
"

import logging
import platform
import socket
import subprocess
import sys
import time
time

import pandas as pd
from bs4 import BeautifulSoup
Expand Down Expand Up @@ -56,7 +55,6 @@
# Network utilities
# ---------------------------------------------------------------------------


def is_connected(host: str = "one.one.one.one", port: int = 80, timeout: int = 3) -> bool:
"""Check internet connectivity by resolving and connecting to a known host."""
try:
Expand All @@ -66,7 +64,6 @@ def is_connected(host: str = "one.one.one.one", port: int = 80, timeout: int = 3
except OSError:
return False


def vpn_connect() -> None:
"""Attempt to connect via NordVPN CLI (cross-platform)."""
cmd = ["nordvpn", "-c"] if platform.system() == "Windows" else ["nordvpn", "connect"]
Expand All @@ -92,12 +89,10 @@ def ensure_connectivity() -> None:
logger.error("Failed to establish connectivity after %d attempts.", max_retries)
sys.exit(1)


# ---------------------------------------------------------------------------
# Browser utilities
# ---------------------------------------------------------------------------


def create_driver() -> webdriver.Chrome:
"""Create and return a configured Chrome WebDriver instance."""
chrome_options = Options()
Expand All @@ -111,13 +106,10 @@ def create_driver() -> webdriver.Chrome:
driver.maximize_window()
return driver


def safe_get(driver: webdriver.Chrome, url: str, wait_class: str) -> webdriver.Chrome:
"""
Navigate to *url* and wait for an element with *wait_class* to appear.
"""Navigate to *url* and wait for an element with *wait_class* to appear.
If the page is blocked (e.g. by Doctolib), reconnect VPN and retry.
Returns the (possibly new) driver instance.
"""
Returns the (possibly new) driver instance."""
max_retries = 3
for attempt in range(max_retries):
try:
Expand All @@ -138,26 +130,22 @@ def safe_get(driver: webdriver.Chrome, url: str, wait_class: str) -> webdriver.C
logger.error("Could not load %s after %d attempts.", url, max_retries)
return driver


def scroll_page(driver: webdriver.Chrome) -> None:
    """Jump to the end of the document so lazy-loaded content is requested.

    Executes a ``window.scrollTo`` script in the browser controlled by
    *driver*, then sleeps for ``SCROLL_PAUSE`` seconds before returning.
    """
    jump_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    driver.execute_script(jump_to_bottom)
    # Pause after scrolling; the delay length is the module-level SCROLL_PAUSE.
    time.sleep(SCROLL_PAUSE)


# ---------------------------------------------------------------------------
# Scraping: search results (Phase 1)
# ---------------------------------------------------------------------------


def scrape_search_page(soup: BeautifulSoup) -> list[str]:
    """Return the ``href`` of every result-name anchor on one results page.

    Only ``<a>`` tags that carry an ``href`` attribute and the class string
    ``dl-search-result-name js-search-result-path`` are considered. The
    hrefs are returned in the order ``find_all`` yields the tags.
    """
    result_anchors = soup.find_all(
        "a",
        class_="dl-search-result-name js-search-result-path",
        href=True,
    )
    return [anchor["href"] for anchor in result_anchors]


def scrape_all_search_results(search_url: str) -> list[str]:
"""Iterate through all paginated search results and collect profile links."""
logger.info("Phase 1: Collecting profile links from search results...")
Expand Down Expand Up @@ -229,19 +217,16 @@ def scrape_all_search_results(search_url: str) -> list[str]:
logger.info("Phase 1 complete: %d unique profile links collected.", len(unique_links))
return unique_links


def save_links_csv(links: list[str], filepath: str) -> None:
    """Write *links* to *filepath* as a one-column CSV and log the path.

    The column is named ``profile_link``; the DataFrame index is not
    written to the file.
    """
    pd.DataFrame({"profile_link": links}).to_csv(filepath, index=False)
    logger.info("Links saved to %s.", filepath)


# ---------------------------------------------------------------------------
# Scraping: individual profiles (Phase 2)
# ---------------------------------------------------------------------------


def extract_address(soup: BeautifulSoup) -> str:
"""Extract practice name and address from a profile page."""
try:
Expand All @@ -251,13 +236,12 @@ def extract_address(soup: BeautifulSoup) -> str:
divs = section.find_all("div")
practice_name = divs[1].text.strip() if len(divs) > 1 else ""
full_address = divs[0].text.strip() if divs else ""
address = full_address.replace(practice_name, "").strip()
address = full_address.replace(practice_name, \'\').strip()
return f"{practice_name} -> {address}" if practice_name else address
except (IndexError, AttributeError) as exc:
logger.debug("Address extraction issue: %s", exc)
return ""


def extract_skills(soup: BeautifulSoup) -> list[str]:
"""Extract skills list from a profile page."""
skills = []
Expand All @@ -270,7 +254,6 @@ def extract_skills(soup: BeautifulSoup) -> list[str]:
logger.debug("Skills extraction issue: %s", exc)
return skills


def extract_degrees(soup: BeautifulSoup) -> list[str]:
"""Extract degrees and achievements from a profile page."""
degrees = []
Expand All @@ -294,170 +277,8 @@ def extract_degrees(soup: BeautifulSoup) -> list[str]:
logger.debug("Degrees extraction issue: %s", exc)
return degrees


def extract_contact(soup: BeautifulSoup) -> list[str]:
    """Collect ``"<subtitle>: <content>"`` strings from a profile's contact boxes.

    Looks inside the ``div`` with id ``openings_and_contact`` and walks its
    ``dl-profile-box`` children. Boxes without an ``h4`` subtitle and the
    opening-hours box are skipped. If an ``AttributeError`` is raised while
    parsing, it is logged at debug level and whatever was gathered so far
    is returned.
    """
    entries = []
    try:
        section = soup.find("div", id="openings_and_contact")
        # A missing section simply yields no boxes to iterate over.
        boxes = section.find_all("div", class_="dl-profile-box") if section else []
        for box in boxes:
            heading = box.find("h4", class_="dl-profile-card-subtitle")
            if not heading:
                continue
            label = heading.text.strip()
            if "Horaires d'ouverture" in label:
                # Opening hours are deliberately excluded from the contacts.
                continue
            body = box.find("div")
            body_text = body.text.strip() if body else ""
            entries.append(f"{label}: {body_text}")
    except AttributeError as exc:
        logger.debug("Contact extraction issue: %s", exc)
    return entries


def scrape_profile(driver: webdriver.Chrome, profile_path: str) -> dict:
    """Scrape one doctor profile and merge data from its alternate locations.

    Loads ``BASE_URL + profile_path``, extracts the name, address, skills,
    degrees and contacts, then visits every other link on the page that
    points at the same profile (additional practice-location tabs) and
    merges what those pages provide, skipping values already collected.

    Args:
        driver: WebDriver used to load the pages.
        profile_path: Profile path, optionally with a query string, that is
            appended to ``BASE_URL``.

    Returns:
        A dict of strings with the keys ``name``, ``addresses``, ``skills``,
        ``degrees`` and ``contacts``. Addresses, degrees and contacts are
        newline-joined; skills are comma-joined.
    """
    # NOTE(review): safe_get is documented as returning a possibly new driver.
    # That replacement is only rebound locally here and never reaches the
    # caller, because this function returns just the data dict - confirm
    # whether the caller can end up holding a stale driver.
    driver = safe_get(driver, f"{BASE_URL}{profile_path}", "dl-profile-header-name")
    scroll_page(driver)

    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Name
    name_el = soup.find("h1", class_="dl-profile-header-name")
    name = name_el.text.strip() if name_el else "Unknown"

    # Primary location data. An empty address is dropped, exactly as is done
    # for alternate locations below, so the joined output never starts with
    # a blank line.
    primary_address = extract_address(soup)
    addresses = [primary_address] if primary_address else []
    all_skills = extract_skills(soup)
    all_degrees = extract_degrees(soup)
    contacts = extract_contact(soup)

    # Additional practice locations (tabs): links that contain this profile's
    # path (without query string) but differ from the page just scraped.
    # dict.fromkeys removes repeated hrefs while keeping first-seen order, so
    # each alternate page is loaded only once.
    base_path = profile_path.split("?")[0]
    alt_links = list(
        dict.fromkeys(
            tag["href"]
            for tag in soup.find_all("a", class_="dl-text", href=True)
            if base_path in tag["href"] and tag["href"] != profile_path
        )
    )

    for alt_link in alt_links:
        driver = safe_get(driver, f"{BASE_URL}{alt_link}", "dl-profile-header-name")
        scroll_page(driver)
        alt_soup = BeautifulSoup(driver.page_source, "html.parser")

        addr = extract_address(alt_soup)
        if addr:
            addresses.append(addr)

        # Merge without duplicates, preserving the order of first appearance.
        for skill in extract_skills(alt_soup):
            if skill not in all_skills:
                all_skills.append(skill)

        for degree in extract_degrees(alt_soup):
            if degree not in all_degrees:
                all_degrees.append(degree)

        for contact in extract_contact(alt_soup):
            if contact not in contacts:
                contacts.append(contact)

    return {
        "name": name,
        "addresses": "\n".join(addresses),
        "skills": ", ".join(all_skills),
        "degrees": "\n".join(all_degrees),
        "contacts": "\n".join(contacts),
    }


def _quit_driver(driver: webdriver.Chrome) -> None:
    """Close *driver* on a best-effort basis, logging instead of raising."""
    try:
        driver.quit()
    except Exception as exc:  # cleanup must never abort the scraping run
        logger.debug("Ignoring error while closing the browser: %s", exc)


def _write_details_csv(results: list[dict]) -> None:
    """Write every collected profile row to ``OUTPUT_DETAILS_CSV``."""
    pd.DataFrame(results).to_csv(OUTPUT_DETAILS_CSV, index=False)


def scrape_all_profiles(links: list[str]) -> None:
    """Scrape every profile in *links*, saving progress to CSV as it goes.

    Each link is passed to ``scrape_profile``. A failure is recorded as an
    ``ERROR`` row (link in ``addresses``, exception text in ``contacts``),
    after which the browser is recreated once connectivity is confirmed.
    The full result list is rewritten to ``OUTPUT_DETAILS_CSV`` after every
    fifth profile and after the last one.

    If the run is cut short by an exception that is not caught in the loop
    (for example ``KeyboardInterrupt`` or ``SystemExit``), rows collected
    since the last checkpoint are still written and the browser is still
    closed before the exception propagates.

    Args:
        links: Profile paths to scrape, in order.
    """
    logger.info("Phase 2: Scraping %d profiles...", len(links))

    total = len(links)
    results = []
    saved_rows = 0  # number of rows already persisted to the CSV
    driver = create_driver()

    try:
        for idx, link in enumerate(links, start=1):
            logger.info("[%d/%d] Scraping: %s", idx, total, link)
            try:
                data = scrape_profile(driver, link)
                results.append(data)
                logger.info(" -> %s", data["name"])
            except Exception as exc:
                logger.error(" -> Failed to scrape %s: %s", link, exc)
                results.append({
                    "name": "ERROR",
                    "addresses": link,
                    "skills": "",
                    "degrees": "",
                    "contacts": str(exc),
                })
                # Recreate driver on failure
                _quit_driver(driver)
                ensure_connectivity()
                driver = create_driver()

            # Progressive save every 5 profiles
            if idx % 5 == 0 or idx == total:
                _write_details_csv(results)
                saved_rows = len(results)
                logger.info(" -> Progress saved (%d/%d).", idx, total)
    finally:
        try:
            # Flush rows gathered after the last checkpoint so an early exit
            # does not discard them.
            if len(results) > saved_rows:
                _write_details_csv(results)
        finally:
            _quit_driver(driver)

    logger.info("Phase 2 complete: %d profiles scraped.", len(results))


# ---------------------------------------------------------------------------
# Main entry point
# ---------------------------------------------------------------------------


def main() -> None:
    """Run the interactive two-phase scrape.

    Prints a banner, prompts for a Doctolib search URL, collects and saves
    the profile links (phase 1), scrapes each profile (phase 2), and prints
    a summary naming both output files. Exits with status 1 when no URL is
    entered and with status 0 when the search yields no links.
    """
    rule = "=" * 60

    for banner_line in (
        "",
        rule,
        " DoctolibDataScraper",
        " by SoClose - https://soclose.co",
        rule,
        "",
    ):
        print(banner_line)

    search_url = input("Enter Doctolib search URL: ").strip()
    if not search_url:
        logger.error("No URL provided. Exiting.")
        sys.exit(1)

    # Anything that does not start with "http" is treated as a path
    # relative to the site root.
    is_absolute = search_url.startswith("http")
    if not is_absolute:
        search_url = f"{BASE_URL}{search_url}"

    # Phase 1 - Collect links (the links file is written even when empty).
    links = scrape_all_search_results(search_url)
    save_links_csv(links, OUTPUT_LINKS_CSV)

    if not links:
        logger.warning("No profile links found. Exiting.")
        sys.exit(0)

    # Phase 2 - Scrape profiles
    scrape_all_profiles(links)

    for summary_line in (
        "",
        rule,
        f" Done! {len(links)} profiles scraped.",
        f" Links: {OUTPUT_LINKS_CSV}",
        f" Details: {OUTPUT_DETAILS_CSV}",
        rule,
    ):
        print(summary_line)


# Start the scraper only when this file is executed as a script; importing
# the module must not trigger the interactive prompt.
if __name__ == "__main__":
    main()
... (truncated, 164 more lines)