From b8aad4b7aac6521cc529b3371c542e181eb3c3ac Mon Sep 17 00:00:00 2001
From: 16480000 <37862161+16480000@users.noreply.github.com>
Date: Tue, 21 Aug 2018 13:25:24 +0800
Subject: [PATCH] Add get_url.py: scrape classical poems (gushi) from
 gushiwen.org
---
get_url.py | 149 +++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 149 insertions(+)
create mode 100644 get_url.py
diff --git a/get_url.py b/get_url.py
new file mode 100644
index 0000000..d98b7bf
--- /dev/null
+++ b/get_url.py
@@ -0,0 +1,149 @@
+#coding=utf-8
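+"""Scrape classical Chinese poems from gushiwen.org.
+
+Collects the category links from the index page, then every poem page in
+each category, and writes title, author, and body to one text file per
+category."""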
+import re
+import requests
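+# NOTE: the HTML patterns below assume gushiwen.org's markup as of 2018
+# (an <h1> title, a "source" line for the author, a "contson" div for the
+# body); they are best-effort patterns and may need adjusting if the site
+# changes.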
+url = 'https://www.gushiwen.org/shiwen/'
+def get_all_url(url):
+    """Return (category-slug, category-name) pairs linked from the index page."""
+    html_text = requests.get(url).text
+    # Assumed link shape: href="https://so.gushiwen.org/gushi/<slug>.aspx">name</a>
+    url_text = re.findall(r'href="https://so\.gushiwen\.org/gushi/(.*?)\.aspx"[^>]*>(.*?)</a>', html_text)
+    return url_text
+
+title_list = []  # full URL of each category page
+name = []  # display name of each category
+def get_title(url_list):
+    """Build the full category-page URLs and record each category's name."""
+    u = 'https://so.gushiwen.org/gushi/'
+    l = '.aspx'
+    for line in url_list:
+        title_list.append(u + line[0] + l)
+        name.append(line[1])
+    return title_list
+def get_all_url_title():
+    """For every category page, collect the URL of each poem it links to."""
+    temp = []
+    u = 'https://so.gushiwen.org/shiwenv_'
+    l = '.aspx'
+    for line in title_list:
+        sb = []
+        html_text = requests.get(line).text
+        # Assumed link shape: href="https://so.gushiwen.org/shiwenv_<id>.aspx"
+        re_text = re.findall(r'href="https://so\.gushiwen\.org/shiwenv_(.*?)\.aspx"', html_text)
+        for ll in re_text:
+            sb.append(u + ll + l)
+        temp.append(sb)  # one list of poem URLs per category
+    return temp
+def getall_name_list(li):
+    """Download every poem in every category; write one text file per category."""
+    count = 0
+    for i, line in enumerate(li):
+        path = './' + name[i] + '.txt'
+        f = open(path, 'w+', encoding='utf-8')
+        print(path)
+        for ie in line:
+            count += 1
+            # Skip two pages known to break the parser.
+            if ie == 'https://so.gushiwen.org/shiwenv_9464a0f0b635.aspx':
+                continue
+            if ie == 'https://so.gushiwen.org/shiwenv_d512fc308251.aspx':
+                continue
+            html_text = requests.get(ie).text
+            # Assumed poem-page markup: <h1> title, "source" author, "contson" body.
+            name_title = re.findall(r'<h1[^>]*>(.*?)</h1>', html_text)
+            author = re.findall(r'<p class="source">.*?<a[^>]*>(.*?)</a>', html_text, re.S)
+            content = re.findall(r'<div class="contson"[^>]*>(.*?)</div>', html_text, re.S)
+            try:
+                f.writelines(''.join(name_title) + '\t' + ''.join(author) + '\t' + content[0] + '\n')
+            except Exception:
+                print('failed to write one poem')
+            print(count)
+            print(ie)
+        print(i)
+        f.close()
+def get_name_url(url):
+    """Alternate path: collect poem-page URLs from a list of category pages."""
+    url_list_data = []
+    u = 'https://so.gushiwen.org/shiwenv_'
+    l = '.aspx'
+    text = []
+    for line in url:
+        print(line)
+        html_text = requests.get(line).text
+        # Same assumed link shape as in get_all_url_title.
+        text.append(re.findall(r'href="https://so\.gushiwen\.org/shiwenv_(.*?)\.aspx"', html_text))
+    for line in text:
+        for ll in line:
+            url_list_data.append(u + ll + l)
+    return url_list_data
+def get_con(u):
+    """Alternate path: download each poem and append it to a single shiwen.txt."""
+    names = []
+    authors = []
+    contents = []
+    for line in u:
+        html_text = requests.get(line).text
+        # Same assumed poem-page markup as in getall_name_list.
+        names.append(re.findall(r'<h1[^>]*>(.*?)</h1>', html_text))
+        authors.append(re.findall(r'<p class="source">.*?<a[^>]*>(.*?)</a>', html_text, re.S))
+        contents.append(re.findall(r'<div class="contson"[^>]*>(.*?)</div>', html_text, re.S))
+    f = open('./shiwen.txt', 'w+', encoding='utf-8')
+    try:
+        for i in range(len(names)):
+            f.writelines(names[i])
+            f.writelines(" ")
+            f.writelines(authors[i])
+            f.writelines(" ")
+            f.writelines(contents[i])
+            f.writelines("\n")
+    except Exception:
+        print("write failed")
+    f.close()
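+# Usage: python get_url.py
+# Writes one "<category name>.txt" per category to the working directory
+# (assuming the regex patterns above still match the live site).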
+if __name__ == '__main__':
+    allurl = get_all_url(url)
+    url_list = get_title(allurl)
+    _all_url_title = get_all_url_title()
+    getall_name_list(_all_url_title)
+    # Alternate single-file path:
+    #all_url = get_name_url(url_list)
+    #get_con(all_url)
+