-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
128 lines (113 loc) · 5.78 KB
/
main.py
File metadata and controls
128 lines (113 loc) · 5.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import openpyxl
from datetime import datetime
# Search phrases to analyze; each must match a sheet name in report.xlsx.
QUERIES = ["яблоко", "абрикос", "малина"]
# How many results to collect per query per search engine.
TOP_NUM = 10
# NOTE: opening the workbook at import time is a side effect — report.xlsx
# must already exist (with one sheet per query) or this module fails to import.
WB = openpyxl.load_workbook("report.xlsx")
# Date stamp used in screenshot file names (US-style month-day-year).
DT = datetime.now().strftime('%m-%d-%Y')
# Substrings that mark a result as negative (checked first by analyze_tone).
CODE_RED = ["избегайте", "опасен", "опасны", "издевательство"]
# Substrings that mark a result as positive.
CODE_GREEN = ["отличный", "легко", "полезен", "полезные", "полезный",
"профессиональные", "профессиональное", "профессиональная",
"натуральная", "уважение", "современной", "удобной", "лечебный", "инновации",
"развлечения", "мощное", "современной", "благодарны", "интересные", "круто"]
def main():
    """Collect top search results for every query from Google and Yandex.

    Launches a headless Chrome, runs each query on both engines (writing
    rows and screenshots via the collect_data_* helpers), then saves the
    shared workbook. The browser is always closed, even on failure.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # Tall window so an entire results page fits in one screenshot.
    chrome_options.add_argument("--window-size=1920,2800")
    chrome_options.add_argument("--hide-scrollbars")
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get("https://www.google.com")
        for query in QUERIES:
            collect_data_google(query, driver)
        driver.get("https://yandex.ru")
        for query in QUERIES:
            collect_data_yandex(query, driver)
    finally:
        # Release the browser process even if a collector raises.
        driver.quit()
    WB.save('report.xlsx')
def collect_data_google(query, driver):
    """Scrape the top TOP_NUM Google results for *query* into its sheet.

    Writes rank, title, snippet, URL and tone rating into columns B–F
    (rows 2..TOP_NUM+1) of the sheet named *query*, saving a screenshot
    of every visited results page, then saves the workbook.

    driver must already be on a Google page with the "q" search box.
    """
    sheet = WB[query]
    # Selenium 4 removed the find_element_by_* helpers; use By locators.
    search = driver.find_element(By.NAME, "q")
    search.clear()
    search.send_keys(query + "\n")
    WebDriverWait(driver, 10).until(
        # The pager table at the bottom appears last — page fully loaded.
        EC.presence_of_element_located((By.TAG_NAME, "tbody"))
    )
    results_collected = 0
    page_num = 1
    while results_collected < TOP_NUM:
        driver.save_screenshot(
            "screenshots/" + query + "-google-" + str(page_num) + "_" + DT + ".png")
        # Organic results only; skip knowledge-panel ('xpdopen') blocks.
        page_results = driver.find_elements(
            By.XPATH, "//div[@class='g']/div[not(ancestor::div[@class='xpdopen'])]")
        for result in page_results:
            results_collected += 1
            row = str(1 + results_collected)  # row 1 is the header
            sheet['B' + row] = results_collected
            header = result.find_element(By.TAG_NAME, "h3").text
            sheet['C' + row] = header
            summary = result.find_element(By.CLASS_NAME, "st").text
            sheet['D' + row] = summary
            link = result.find_element(By.TAG_NAME, "a")
            sheet['E' + row] = link.get_attribute("href")
            sheet['F' + row] = analyze_tone(header + " " + summary)
            if results_collected == TOP_NUM:
                break
        if results_collected < TOP_NUM:
            # Advance via the numbered pager link and wait for the new page.
            page_num += 1
            driver.find_element(By.LINK_TEXT, str(page_num)).click()
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "tbody"))
            )
    WB.save('report.xlsx')
def collect_data_yandex(query, driver):
    """Scrape the top TOP_NUM Yandex results for *query* into its sheet.

    Same layout as collect_data_google but offset by TOP_NUM rows, so
    Yandex rows sit below the Google rows in columns B–F. Saves a
    screenshot of every visited results page, then saves the workbook.

    driver must already be on a Yandex page with the "text" search box.
    """
    sheet = WB[query]
    # Selenium 4 removed the find_element_by_* helpers; use By locators.
    search = driver.find_element(By.NAME, "text")
    search.clear()
    search.send_keys(query + "\n")
    WebDriverWait(driver, 10).until(
        # The pager appears last — page fully loaded.
        EC.presence_of_element_located((By.CLASS_NAME, "pager__items"))
    )
    results_collected = 0
    page_num = 1
    while results_collected < TOP_NUM:
        driver.save_screenshot(
            "screenshots/" + query + "-yandex-" + str(page_num) + "_" + DT + ".png")
        # Organic results only; exclude widget blocks (images, video, market…).
        page_results = driver.find_elements(
            By.XPATH,
            "//li[contains(@class, 'serp-item')"
            "and not(contains(@data-fast-wzrd, 'images'))"
            "and not(contains(@data-fast-wzrd, 'collections'))"
            "and not(contains(@data-fast-wzrd, 'market_constr'))"
            "and not(contains(@data-fast-wzrd, 'videowiz'))"
            "and not(contains(@data-fast-wzrd, 'mushroom'))]")
        for result in page_results:
            results_collected += 1
            row = str(1 + TOP_NUM + results_collected)  # below the Google rows
            sheet['B' + row] = results_collected
            header = result.find_element(By.CLASS_NAME, "organic__url-text").text
            sheet['C' + row] = header
            # BUG FIX: the original passed this CSS selector to
            # find_element_by_tag_name, which can never match — it must be
            # located as a CSS selector.
            summary = result.find_element(
                By.CSS_SELECTOR,
                "div.text-container.typo.typo_text_m.typo_line_m.organic__text").text
            sheet['D' + row] = summary
            link = result.find_element(By.TAG_NAME, "a")
            sheet['E' + row] = link.get_attribute("href")
            sheet['F' + row] = analyze_tone(header + " " + summary)
            if results_collected == TOP_NUM:
                break
        if results_collected < TOP_NUM:
            # Advance via the numbered pager link and wait for the new page.
            page_num += 1
            driver.find_element(By.LINK_TEXT, str(page_num)).click()
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "pager__items"))
            )
    WB.save('report.xlsx')
def analyze_tone(text):
    """Rate the tone of *text* by keyword lookup.

    Returns -1 if any CODE_RED substring occurs (negative words take
    priority), 1 if any CODE_GREEN substring occurs, and 0 otherwise.
    """
    if any(word in text for word in CODE_RED):
        return -1
    if any(word in text for word in CODE_GREEN):
        return 1
    return 0
# Run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()