-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathniuke_7.py
More file actions
101 lines (89 loc) · 3.68 KB
/
niuke_7.py
File metadata and controls
101 lines (89 loc) · 3.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# -*- coding: utf-8 -*-
# @Time : 2022/6/16 20:58
# @Author : beyoung
# @Email : linbeyoung@stu.pku.edu.cn
# @File : niuke_7.py
# -*- coding: utf-8 -*-
# @Time : 2022/3/12 11:57 PM
# @Author : 李宇博
# @File : test.py
# @SoftWare : PyCharm
import re
import os
import requests
import json
from bs4 import BeautifulSoup
def getNum(pages=10, tagID=1194):
    """Collect post IDs from nowcoder's interview-experience listing.

    Requests listing pages ``1..pages`` for the given tag and appends every
    post ID found to ``num2.txt``, one ID per line.

    Args:
        pages: Number of listing pages to fetch. Defaults to 10, the value
            that used to be hard-coded.
        tagID: Topic tag sent as the ``tagId`` query parameter. Defaults to
            1194, the value that used to be hard-coded.

    Returns:
        The ``tagID`` that was queried.

    Raises:
        requests.RequestException: On network failure, timeout or a non-2xx
            response.
        KeyError: If the JSON payload lacks ``data`` / ``discussPosts`` /
            ``postId``.
    """
    for page in range(1, pages + 1):
        response = requests.get(
            "https://www.nowcoder.com/discuss/experience/json?token=&tagId=" + str(tagID)
            + "&companyId=0&phaseId=0&order=3&query=&page=" + str(page),
            timeout=30,  # never hang forever on a stalled connection
        )
        response.raise_for_status()
        # Iterate over whatever the page actually holds instead of assuming
        # exactly 30 entries: a short (e.g. last) page used to raise IndexError.
        posts = response.json()['data']['discussPosts']
        with open("num2.txt", "a", encoding="utf-8") as f:
            f.writelines(str(post['postId']) + "\n" for post in posts)
    print("获取代号完毕")
    return tagID
def getHtml():
    """Download every post listed in ``num2.txt`` and append it to ``final.txt``.

    For each post ID the discussion page is fetched, and its title, post time
    and body text are extracted and appended to ``final.txt`` as one record.
    Each record ends with the sentinel ``liyubo``, which ``clearBlankLine``
    later turns into a run of newlines separating the records.

    Pages that lack the expected title, time or content elements (for example
    a removed post) are reported and skipped instead of aborting the crawl.

    Raises:
        FileNotFoundError: If ``num2.txt`` does not exist.
        requests.RequestException: On network failure or timeout.
    """
    with open("num2.txt", "r", encoding="utf-8") as f:
        # Ignore blank lines so they do not turn into requests for a bogus URL.
        post_ids = [line.strip() for line in f if line.strip()]

    cnt = 1
    for post_id in post_ids:
        url = "https://www.nowcoder.com/discuss/" + post_id + "?source_id=discuss_experience_nctrack&channel=-1"
        html = requests.get(url, timeout=30).text

        titles = re.findall('<title>(.*?)</title>', html)
        post_times = re.findall('<span class="post-time">(.*?)</span>', html, re.S)
        body = BeautifulSoup(html, "lxml").find("div", class_="post-topic-des nc-post-content")
        if not titles or not post_times or body is None:
            print("页面结构异常,已跳过:" + url)
            continue

        title = titles[0].replace("_笔经面经_牛客网", '')
        post_time = post_times[0].replace("\n", "").replace(" ", "")
        result = "".join(child.get_text() for child in body.children)

        record = (
            "第" + str(cnt) + "篇面经" + "\n"
            + "链接:" + url + "\n"
            + "标题:" + title + "\n"
            + "时间:" + post_time + "\n"
            + "内容:" + result
            + "liyubo"
        )
        # Write UTF-8 explicitly: clearBlankLine reads this file as UTF-8, and
        # the platform default encoding is not UTF-8 everywhere.
        with open("final.txt", "a", encoding="utf-8") as f:
            f.write(record)
        print("链接:" + url, "标题:" + title, "时间:" + post_time, sep="\t")
        cnt += 1
    print("内容爬取完毕,已写入final.txt文件")
def clearBlankLine(id):
    """Copy ``final.txt`` to ``<id>.txt`` without empty lines.

    Lines consisting solely of a newline are dropped. Every occurrence of the
    sentinel ``liyubo`` is replaced by four newlines. All other text is copied
    unchanged. Both files are handled as UTF-8.

    Args:
        id: Value used (via ``str``) as the stem of the output file name. The
            name shadows the builtin but is kept because it is the
            caller-visible parameter name.

    Raises:
        FileNotFoundError: If ``final.txt`` does not exist.
    """
    # A single `with` closes both handles on every exit path, including the
    # case where opening the output file fails after the input was opened.
    with open('final.txt', 'r', encoding='utf-8') as src, \
            open(str(id) + ".txt", 'w', encoding='utf-8') as dst:
        for line in src:
            if line == '\n':
                continue
            dst.write(line.replace("liyubo", "\n\n\n\n"))
    print("空行去除完毕,完成!")
def formatting():
    """Delete the intermediate files ``final.txt`` and ``num2.txt``.

    Each file is removed independently, so a missing or locked ``final.txt``
    no longer prevents ``num2.txt`` from being deleted. If any removal fails,
    the message ``出现异常`` is printed once; no exception is propagated.
    """
    failed = False
    for name in ("final.txt", "num2.txt"):
        try:
            os.remove(name)
        except OSError:
            # Only filesystem errors are expected here; a bare `except` would
            # also swallow KeyboardInterrupt / SystemExit.
            failed = True
    if failed:
        print("出现异常")
if __name__ == '__main__':
    # Pipeline: collect post IDs -> download each post -> write the cleaned
    # result to "<tag id>.txt".
    tag_id = getNum()  # named tag_id so the builtin `id` is not shadowed
    getHtml()
    clearBlankLine(tag_id)
    # formatting() is deliberately not called, so num2.txt and final.txt are
    # kept. Both are opened in append mode, so leftovers from an earlier run
    # are carried into the next one; call formatting() here to start clean.