-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
70 lines (60 loc) · 2.25 KB
/
scraper.py
File metadata and controls
70 lines (60 loc) · 2.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import requests
import PyPDF2
import io
import re
import csv
from datetime import datetime
# --- Collect transcript links from the Tim Ferriss Show index page ---
# Each transcript is linked by an <a> whose visible text contains a '#NNN'
# episode marker and which carries a data-wpel-link attribute.
url = "https://tim.blog/2018/09/20/all-transcripts-from-the-tim-ferriss-show/"
# Spoof a browser UA: WordPress hosts commonly 403 urllib's default agent.
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html_page = urlopen(req)
bs = BeautifulSoup(html_page, "lxml")
title_link = {}   # episode title (anchor text) -> transcript URL
urls = []         # flat list of the same URLs
transcript = []
pattern = re.compile('#')  # episode anchors contain a '#' marker
# NOTE: bs4 renamed the `text=` keyword to `string=` (deprecated since 4.4).
for link in bs.find_all('a', string=pattern, attrs={'data-wpel-link': True}):
    title_link[link.string] = link.get('href')
    urls.append(link.get('href'))
# --- Scrape each linked transcript and write title/url/content rows to CSV ---
# Use '-' instead of ':' in the timestamp: ':' is an illegal character in
# Windows filenames and open() would raise OSError.
now = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
# newline='' is required by the csv module (avoids blank rows on Windows);
# pin the encoding so transcript text round-trips.
with open(f'content-{now}.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['title', 'url', 'content']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for author, url in title_link.items():
        if url.endswith('.pdf'):
            # PDF transcript: download into memory and extract text page by page.
            print("page is pdf file")
            output = io.BytesIO()
            output.write(requests.get(url, timeout=30).content)
            output.seek(0)
            pdf_reader = PyPDF2.PdfReader(output)
            pdf_text = ""
            for page in pdf_reader.pages:
                pdf_text += page.extract_text()
            print("author, url", author, url)
            writer.writerow({'title': author, 'url': url, 'content': pdf_text})
        elif url.endswith(('.mp3', 'manifesto', 'freedom/')):
            # Audio files and a couple of known non-transcript pages.
            print("skipping these")
        else:
            # HTML transcript: the content is the run of <p> siblings after the
            # first <blockquote>, terminated by the jp-relatedposts <div>.
            response = requests.get(url, timeout=30)
            soup = BeautifulSoup(response.text, 'html.parser')
            content = []
            blockquote = soup.find('blockquote')
            if blockquote:
                next_element = blockquote.find_next_sibling()
                while next_element:
                    if next_element.name == "p":
                        content.append(next_element.get_text())
                    # .get(...) guards divs without a class attribute
                    # (next_element["class"] raised KeyError on those).
                    elif next_element.name == "div" and "jp-relatedposts" in next_element.get("class", []):
                        break
                    next_element = next_element.find_next_sibling()
            if content:
                title = soup.find('title').text
                print("url---->", title, url)
                writer.writerow({'title': title, 'url': url, 'content': ''.join(content)})