# files_processor/niuke_7.py at master · linbeyoung/files_processor · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# -*- coding: utf-8 -*-
# @Time   : 2022/6/16 20:58
# @Author : beyoung
# @Email  : linbeyoung@stu.pku.edu.cn
# @File   : niuke_7.py
# -*- coding = UTF-8 #-*-
# @Time : 2022/3/12 11:57 下午
# @Author : 李宇博
# @File : test.py
# @SoftWare : PyCharm
import re
import os

import requests
import json
from bs4 import BeautifulSoup


def getNum(group_count=10, tag_id=1194):
    """Collect post ids from the nowcoder experience listing into ``num2.txt``.

    Requests listing pages ``1..group_count`` for the given tag and appends the
    ``postId`` of each returned post to ``num2.txt``, one id per line.

    Args:
        group_count: Number of listing pages to request. At most 30 posts are
            taken from each page.
        tag_id: Value sent as the ``tagId`` query parameter.

    Returns:
        The ``tag_id`` that was queried, so the caller can name its output
        file after it.

    Raises:
        requests.RequestException: If a request fails or exceeds the timeout.
        ValueError: If a response body is not valid JSON.
    """
    posts_per_page = 30
    request_timeout = 10  # seconds; without it a stalled connection hangs forever
    url_template = (
        "https://www.nowcoder.com/discuss/experience/json?token=&tagId={tag}"
        "&companyId=0&phaseId=0&order=3&query=&page={page}"
    )

    with open("num2.txt", "a", encoding="utf-8") as id_file:
        for page in range(1, group_count + 1):
            response = requests.get(
                url_template.format(tag=tag_id, page=page),
                timeout=request_timeout,
            )
            payload = response.json()
            posts = (payload.get("data") or {}).get("discussPosts") or []
            # Slice instead of indexing 0..29: a short (e.g. last) page must
            # not raise IndexError.
            for post in posts[:posts_per_page]:
                id_file.write(str(post["postId"]) + "\n")
    print("获取代号完毕")
    return tag_id


def getHtml():
    """Download every post listed in ``num2.txt`` and append it to ``final.txt``.

    For each post id the discussion page is fetched, and its title, post time
    and body text are extracted. One record is appended to ``final.txt``,
    terminated by the marker ``liyubo``. Pages that lack the expected title,
    time or content element are reported and skipped.

    ``final.txt`` is written as UTF-8 because ``clearBlankLine`` reads it back
    with ``encoding='utf-8'``; relying on the platform default breaks on
    systems whose default encoding is not UTF-8.

    Raises:
        requests.RequestException: If a request fails or exceeds the timeout.
    """
    request_timeout = 10  # seconds; without it a stalled connection hangs forever
    title_pattern = re.compile(r'<title>(.*?)</title>')
    time_pattern = re.compile(r'<span class="post-time">(.*?)</span>', re.S)

    with open("num2.txt", "r", encoding="utf-8") as id_file:
        # Ignore blank lines so they do not turn into requests for a bogus URL.
        post_ids = [line.strip() for line in id_file if line.strip()]

    written = 0
    with open("final.txt", "a", encoding="utf-8") as out_file:
        for post_id in post_ids:
            url = ("https://www.nowcoder.com/discuss/" + post_id
                   + "?source_id=discuss_experience_nctrack&channel=-1")
            page = requests.get(url, timeout=request_timeout).text

            title_match = title_pattern.search(page)
            time_match = time_pattern.search(page)
            body = BeautifulSoup(page, "lxml").find(
                "div", class_="post-topic-des nc-post-content")
            if title_match is None or time_match is None or body is None:
                # The page does not have the expected structure; skip it
                # rather than abort the whole run.
                print("页面结构无法解析，已跳过:" + url)
                continue

            title = title_match.group(1).replace("_笔经面经_牛客网", '')
            post_time = time_match.group(1).replace("\n", "").replace("  ", "")
            text = "".join(child.get_text() for child in body.children)

            written += 1
            out_file.write(
                "第" + str(written) + "篇面经" + "\n"
                + "链接:" + url + "\n"
                + "标题:" + title + "\n"
                + "时间:" + post_time + "\n"
                + "内容:" + text + "liyubo"
            )
            print("链接:" + url, "标题:" + title, "时间:" + post_time, sep="\t")
    print("内容爬取完毕，已写入final.txt文件")


def clearBlankLine(id):
    """Copy ``final.txt`` to ``<id>.txt`` without empty lines.

    Lines consisting only of a newline are dropped. In every other line the
    record marker ``liyubo`` is replaced by four newlines. All remaining text
    is copied unchanged.

    Args:
        id: Value whose ``str()`` form names the output file (``<id>.txt``).

    Raises:
        OSError: If ``final.txt`` cannot be read or the output file cannot be
            written.
    """
    # A single ``with`` closes both files on every path, including the case
    # where opening the output file fails after the input is already open.
    with open('final.txt', 'r', encoding='utf-8') as source, \
            open(str(id) + ".txt", 'w', encoding='utf-8') as target:
        for line in source:
            if line == '\n':
                continue
            if "liyubo" in line:
                line = line.replace("liyubo", "\n\n\n\n")
            target.write(line)
    # Reported only after the copy actually succeeded.
    print("空行去除完毕，完成！")


def formatting():
    """Delete the intermediate files ``final.txt`` and ``num2.txt``.

    Each file is removed independently, so a missing or undeletable
    ``final.txt`` no longer prevents ``num2.txt`` from being removed. If any
    removal fails, a single notice is printed and no exception is raised.
    """
    failed = False
    for name in ("final.txt", "num2.txt"):
        try:
            os.remove(name)
        except OSError:
            # Covers missing files and permission problems only; unrelated
            # exceptions such as KeyboardInterrupt still propagate.
            failed = True
    if failed:
        print("出现异常")


if __name__ == '__main__':
    # Pipeline: collect post ids, download each post, then write the tidied
    # result to "<tag id>.txt". The name tag_id avoids shadowing builtin id().
    tag_id = getNum()
    getHtml()
    clearBlankLine(tag_id)
    # formatting()  # optional removal of the intermediate files; left disabled

# print(r.json())
# print(r2.json())