-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
70 lines (60 loc) · 2.25 KB
/
scraper.py
File metadata and controls
70 lines (60 loc) · 2.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import requests
import PyPDF2
import io
import re
import csv
from datetime import datetime
# --- Collect transcript links from the Tim Ferriss Show index page ---
# Each transcript is linked by an <a> whose visible text contains a '#NNN'
# episode marker and which carries a data-wpel-link attribute.
url = "https://tim.blog/2018/09/20/all-transcripts-from-the-tim-ferriss-show/"
# Spoof a browser UA: WordPress hosts commonly 403 urllib's default agent.
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html_page = urlopen(req)
bs = BeautifulSoup(html_page, "lxml")
title_link = {}   # episode title (anchor text) -> transcript URL
urls = []         # flat list of the same URLs
transcript = []
pattern = re.compile('#')  # episode anchors contain a '#' marker
# NOTE: bs4 renamed the `text=` keyword to `string=` (deprecated since 4.4).
for link in bs.find_all('a', string=pattern, attrs={'data-wpel-link': True}):
    title_link[link.string] = link.get('href')
    urls.append(link.get('href'))
# --- Scrape each linked transcript and write title/url/content rows to CSV ---
# Use '-' instead of ':' in the timestamp: ':' is an illegal character in
# Windows filenames and open() would raise OSError.
now = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
# newline='' is required by the csv module (avoids blank rows on Windows);
# pin the encoding so transcript text round-trips.
with open(f'content-{now}.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['title', 'url', 'content']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for author, url in title_link.items():
        if url.endswith('.pdf'):
            # PDF transcript: download into memory and extract text page by page.
            print("page is pdf file")
            output = io.BytesIO()
            output.write(requests.get(url, timeout=30).content)
            output.seek(0)
            pdf_reader = PyPDF2.PdfReader(output)
            pdf_text = ""
            for page in pdf_reader.pages:
                pdf_text += page.extract_text()
            print("author, url", author, url)
            writer.writerow({'title': author, 'url': url, 'content': pdf_text})
        elif url.endswith(('.mp3', 'manifesto', 'freedom/')):
            # Audio files and a couple of known non-transcript pages.
            print("skipping these")
        else:
            # HTML transcript: the content is the run of <p> siblings after the
            # first <blockquote>, terminated by the jp-relatedposts <div>.
            response = requests.get(url, timeout=30)
            soup = BeautifulSoup(response.text, 'html.parser')
            content = []
            blockquote = soup.find('blockquote')
            if blockquote:
                next_element = blockquote.find_next_sibling()
                while next_element:
                    if next_element.name == "p":
                        content.append(next_element.get_text())
                    # .get(...) guards divs without a class attribute
                    # (next_element["class"] raised KeyError on those).
                    elif next_element.name == "div" and "jp-relatedposts" in next_element.get("class", []):
                        break
                    next_element = next_element.find_next_sibling()
            if content:
                title = soup.find('title').text
                print("url---->", title, url)
                writer.writerow({'title': title, 'url': url, 'content': ''.join(content)})