-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathnewcode_crawler.py
More file actions
94 lines (79 loc) · 2.74 KB
/
newcode_crawler.py
File metadata and controls
94 lines (79 loc) · 2.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# -*- coding: utf-8 -*-
# @Time : 2022/4/6 22:53
# @Author : beyoung
# @Email : linbeyoung@stu.pku.edu.cn
# @File : newcode_crawler.py
import pymysql
import requests
from lxml import html
import datetime
import time
import re
# import multiprocessing
class SpiderNKW(object):
def spider(self, sn):
'''nkw'''
url = 'https://www.nowcoder.com/ta/review-c/review?page={0}'.format(sn)
# url = 'https://www.nowcoder.com/ta/review-network/review?page={0}'.format(sn)
resp = requests.get(url)
# html文档
resp = requests.get(url, headers={
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6776.400 QQBrowser/10.3.2601.400',
})
resp.encoding = 'utf-8'
rest = resp.text
print('问题{0}:'.format(sn))
# 问题
title = re.findall('<div class="final-question">(.*?)</div>', rest, re.S)
title = title[0]
title = re.sub('(<p>)||(</p>)||<div>', '', title)
title=title.strip().replace("'","\'")
print(title)
# 答案
answer = re.findall('<div class="design-answer-box">(.*?)<div class="final-action clearfix">', rest, re.S)
# print(answer[0])
# answer=str(answer)
answer = answer[0]
answer = re.sub('(<div>)||(</div>)||(<br>)||(<p>)||(<br/>)||(</p>)||(<span>)||(</span>)', '', answer)
content = answer.strip().replace("\'",'')
print(content)
# answer_after=re.sub('<div>','',answer)
# answer_after=re.sub('\\\\n','',answer_after)
# print(answer_after.strip())
# ##把数据插入到mysql数据库中
# conn = self.get_conn()
# cursor = conn.cursor()
# sql = "INSERT INTO `interview_question`(`title`, `content`, `created_at`,`url`) VALUES('{title}', '{content}', '{created_at}','{url}')".format(
# title=title, content=content, created_at=datetime.datetime.now(),url=url)
#
# print(sql)
# # cursor.execute(sql)
# # conn.commit()
#
# cursor.close()
# conn.close()
def get_conn(self):
'''获取mysql数据库连接'''
# try:
conn = pymysql.connect(
db='news',
host='localhost',
user='root',
password='123456',
charset='utf8'
)
# except:
# print('wrong')
# pass
return conn
if __name__ == '__main__':
spider = SpiderNKW()
#引入多进程
# pool = multiprocessing.Pool(2)
##通过循环将页数传到url
for page in range(1, 20):
s = ''
try:
spider.spider(page)
except:
continue