1- """
2- chromiumloader module
3- """
41import asyncio
52from typing import Any , AsyncIterator , Iterator , List , Optional
63from langchain_community .document_loaders .base import BaseLoader
129logger = get_logger ("web-loader" )
1310
1411class ChromiumLoader (BaseLoader ):
15- """scrapes HTML pages from URLs using a (headless) instance of the
16- Chromium web driver with proxy protection
12+ """Scrapes HTML pages from URLs using a (headless) instance of the
13+ Chromium web driver with proxy protection.
1714
1815 Attributes:
1916 backend: The web driver backend library; defaults to 'playwright'.
2017 browser_config: A dictionary containing additional browser kwargs.
21- headless: whether to run browser in headless mode.
18+ headless: Whether to run browser in headless mode.
2219 proxy: A dictionary containing proxy settings; None disables protection.
2320 urls: A list of URLs to scrape content from.
21+ requires_js_support: Flag to determine if JS rendering is required.
2422 """
2523
2624 RETRY_LIMIT = 3
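
The class docstring now documents the new `requires_js_support` attribute. A minimal usage sketch of the updated constructor follows; the import path and the `urls`/`backend` parameter order are assumptions based on the docstring, not shown in this diff:

```python
# Hypothetical sketch: import path and exact positional/keyword layout are assumed.
from chromiumloader import ChromiumLoader  # adjust to the real module path

loader = ChromiumLoader(
    ["https://example.com"],      # urls: list of pages to scrape
    backend="playwright",         # web driver backend library
    headless=True,                # run the browser without a visible window
    requires_js_support=True,     # new flag: force JS rendering for these pages
)
```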
@@ -34,15 +32,17 @@ def __init__(
         headless: bool = True,
         proxy: Optional[Proxy] = None,
         load_state: str = "domcontentloaded",
+        requires_js_support: bool = False,
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.

         Args:
             backend: The web driver backend library; defaults to 'playwright'.
-            headless: whether to run browser in headless mode.
+            headless: Whether to run browser in headless mode.
             proxy: A dictionary containing proxy information; None disables protection.
             urls: A list of URLs to scrape content from.
+            requires_js_support: Whether to use JS rendering for scraping.
             kwargs: A dictionary containing additional browser kwargs.

         Raises:
@@ -61,6 +61,7 @@ def __init__(
         self.proxy = parse_or_search_proxy(proxy) if proxy else None
         self.urls = urls
         self.load_state = load_state
+        self.requires_js_support = requires_js_support

     async def ascrape_undetected_chromedriver(self, url: str) -> str:
         """
@@ -186,7 +187,9 @@ def lazy_load(self) -> Iterator[Document]:
         Yields:
             Document: The scraped content encapsulated within a Document object.
         """
-        scraping_fn = getattr(self, f"ascrape_{self.backend}")
+        scraping_fn = (
+            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+        )

         for url in self.urls:
             html_content = asyncio.run(scraping_fn(url))
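
With this change, `lazy_load` picks the scraping coroutine once: `ascrape_with_js_support` whenever `requires_js_support` is set, otherwise the backend-specific `ascrape_<backend>` method. A rough synchronous usage sketch, assuming the loader instance from the earlier example:

```python
# Sketch only: lazy_load yields one Document per URL, scraping each via asyncio.run.
for doc in loader.lazy_load():
    # The 'source' metadata key is an assumption based on the alazy_load docstring.
    print(doc.metadata.get("source"), len(doc.page_content))
```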
@@ -206,7 +209,9 @@ async def alazy_load(self) -> AsyncIterator[Document]:
             Document: A Document object containing the scraped content, along with its
                 source URL as metadata.
         """
-        scraping_fn = getattr(self, f"ascrape_{self.backend}")
+        scraping_fn = (
+            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+        )

         tasks = [scraping_fn(url) for url in self.urls]
         results = await asyncio.gather(*tasks)
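
`alazy_load` applies the same dispatch but schedules all URLs concurrently via `asyncio.gather`. A rough async usage sketch, with the same assumed loader instance as above:

```python
import asyncio

async def main() -> None:
    # async for works because alazy_load is declared as an AsyncIterator[Document].
    async for doc in loader.alazy_load():
        print(doc.metadata.get("source"), len(doc.page_content))

asyncio.run(main())
```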