# python-webScrappers/quotesJS.py at main · sunil-dhaka/python-webScrappers · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver# rather use helium to make it easy going
from time import sleep
# Scrape quotes from the JavaScript-rendered demo site.
# Selenium drives a real Firefox instance so that the page's scripts run and
# populate the DOM; BeautifulSoup then parses each rendered page.
url='http://quotes.toscrape.com/js/'

quotePage=webdriver.Firefox()
quoteList=[]
try:
    quotePage.get(url)
    # Visit at most 10 pages; stop earlier if there is no further page.
    for page in range(10):
        html=quotePage.page_source
        soup=bs(html,'html.parser')
        for quote in soup.find_all('div',class_='quote'):
            quoteData={
                'text':quote.find('span',class_='text').string,
                'author':quote.find('small',class_='author').string,
                'tags':[tag.string for tag in quote.find_all('a',class_='tag')]
            }
            quoteList.append(quoteData)
        print('quotes collected after page no',str(page+1),'are',len(quoteList))
        try:
            # find_element(by, value) is available in both Selenium 3 and 4;
            # the find_element_by_* helpers were removed in Selenium 4.
            nextPage=quotePage.find_element('css selector','html body div.container nav ul.pager li.next a')
            sleep(5)
            nextPage.click()
        except Exception as e:
            # No "next" link (last page) or the click failed: report it and
            # stop, otherwise the same page would be scraped again and its
            # quotes appended a second time.
            print(e)
            break
finally:
    # Always close the browser, even if loading or parsing raised.
    quotePage.quit()

# Show the first few collected quotes; slicing copes with fewer than five.
for i,collected in enumerate(quoteList[:5]):
    print(str(i+1), '). quote text -- ',collected['text'],'\n')
'''
Parsing the response from requests with bs4 yields no quotes because this site is rendered by JavaScript.
The browser inspector shows div.quote elements, but they are not in the raw HTML that requests receives.
To get the markup seen in the inspector, parse the HTML rendered by Selenium instead.
'''
'''r=requests.get(url)
print(r.status_code)

soup=bs(r.content,'html.parser')

quotes=soup.find_all('div',class_='quote')
print(quotes) #<-- [] empty'''