# web_to_obsidian/code.py at main · Flybell/web_to_obsidian · GitHub
"""Functions for main_script.py"""


import re #regular expression
from bs4 import BeautifulSoup #web parsing library
import io #manipulate files
import os #manipulate paths
import string
from datetime import date, datetime #get dates

#----make requests as a browser-----#
import requests
# start from the library's default request headers and override only the
# User-Agent so the target site sees a regular desktop browser
headers = requests.utils.default_headers()
headers['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
#----------------------------------#

## BEAUTIFUL SOUP
## useful: https://stackoverflow.com/questions/60045381/how-to-get-the-text-and-url-from-a-link-using-beautifulsoup

def make_request(url): #create a soup
    """Download ``url`` and parse the response into a BeautifulSoup object.

    The request is sent with the module-level ``headers`` (browser-like
    User-Agent).

    url: address of the page to fetch.
    returns: BeautifulSoup tree of the response body, parsed with "html5lib".
    """
    # headers must be passed by keyword: the second positional argument of
    # requests.get() is ``params`` (the query string), not the HTTP headers
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.content, "html5lib")
    return soup

## Scraping data

def get_next_page(soup):
    """Locate the "next page" button and return the url it points to.

    soup: parsed listing page.
    returns: absolute url of the next page, or 0 when there is no next page
    (the "next" list item is missing, or it contains no link).
    """
    page = soup.find("li", {"class": "next"}) # customize this search term
    # find() returns None when nothing matches, so guard before searching inside it
    url = page.find("a", href=True) if page is not None else None
    if url is not None:
        output_url = "https://dummy.site.com" + url["href"]
        print("starting next page")
    else:
        output_url = 0 # 0 is the "no more pages" marker returned to the caller
        print("\n\nfinal page")
    return output_url

def get_members(soup):
    """Collect the profile urls linked from a listing page.

    soup: parsed listing page.
    returns: list of absolute profile urls, one per link with an href found
    inside the first "row" div; an empty list when that div is missing.
    """
    page = soup.find("div", {"class": "row"}) # customize this search term
    if page is None:
        # nothing to scrape on this page
        return []
    return ["https://dummy.site.com" + a["href"] for a in page.find_all("a", href=True)]

def get_name(soup):
    """Return the person's full name from a profile page.

    Looks inside the first "list_title" div for the spans marked with
    itemprop "givenName" and "familyName" and joins their stripped text
    with a single space."""
    title_block = soup.find("div", {"class": "list_title"}) # customize this search term
    # look up both spans first, then read their text (given name before family name)
    spans = [
        title_block.find("span", {"itemprop": prop})
        for prop in ("givenName", "familyName")
    ]
    return " ".join(span.text.strip() for span in spans)

def get_member_string(soup):
    """ read the "list_category" line of the project details section
    and collapse its tabs and line breaks into single spaces
    the result is expected to look like this
    First Last | Role 2022-10-15 - 2023-04-14 | Research area: Animals"""
    details = get_project_details_soup(soup)
    raw_text = details.find("div", {"class": "list_category"}).text # customize this search term
    # split() without arguments drops every run of whitespace
    return " ".join(raw_text.split())

def get_project(soup):
    """ build the profile of one project from its page
    return list of project type, start date, end date, research area, title, abstract"""

    # customize here: there is a line in target site that contains all the info I need, yours might not.
    info_line = get_member_string(soup)

    # positions in the list are relied on by the md writers
    return [
        get_project_type(info_line),       #0
        get_project_startdate(info_line),  #1
        get_project_enddate(info_line),    #2
        get_research_area(info_line),      #3
        get_project_title(soup),           #4
        get_project_abstract(soup),        #5
    ]


def get_project_url(soup):
    """ look up the "related_content" div of a member profile page
    return list of absolute project urls, one per link that has both an href and text"""
    related = soup.find("div", {"class": "related_content"}) # customize this search term
    anchors = related.find_all("a", href=True, text=True)
    return ["https://dummy.site.com" + anchor["href"] for anchor in anchors]

def get_project_details_soup(soup):
    """ return the first "view_wrapper" div of the page (the Project Details section) """
    wrapper = soup.find("div", {"class": "view_wrapper"}) # customize this search term
    return wrapper

def get_project_startdate(string):
    """ takes second item of string (member role and dates)
    Role 2022-10-15 - 2023-04-14
    and returns the first string of dates
    2022-10-15

    string: " | "-separated info line; raises IndexError when it has no second item.
    returns: the second-to-last YYYY-MM-DD date of that item; when fewer than
    two dates are present, falls back to the fixed character slice [-23:-13]."""
    type_date = string.split(" | ")[1]
    # search for the dates instead of counting characters, so that text
    # trailing the date range does not shift the result
    dates = re.findall(r"\d{4}-\d{2}-\d{2}", type_date)
    if len(dates) >= 2:
        return dates[-2]
    return type_date[-23:-13]

def get_project_enddate(string):
    """ takes second item of string (member role and dates)
    Role 2022-10-15 - 2023-04-14
    and retrieves the second string of dates
    2023-04-14

    string: " | "-separated info line; raises IndexError when it has no second item.
    returns: the last YYYY-MM-DD date of that item; when fewer than two dates
    are present, falls back to the last 10 characters of the item."""
    type_date = string.split(" | ")[1]
    # search for the dates instead of counting characters, so that text
    # trailing the date range does not shift the result
    dates = re.findall(r"\d{4}-\d{2}-\d{2}", type_date)
    if len(dates) >= 2:
        return dates[-1]
    return type_date[-10:]

def get_project_type(string):
    """ takes second item of string (project role and dates)
    Visiting Fellow 2022-10-15 - 2023-04-14
    and retrieves the role text
    Visiting Fellow

    string: " | "-separated info line; raises IndexError when it has no second item.
    returns: everything in front of the first YYYY-MM-DD date, stripped, so
    roles of any number of words are kept whole; when the item has no date,
    its first two words (or fewer if it is shorter)."""
    type_date = string.split(" | ")[1]
    first_date = re.search(r"\d{4}-\d{2}-\d{2}", type_date)
    if first_date:
        # the role is whatever precedes the date range
        return type_date[:first_date.start()].strip()
    return " ".join(type_date.split(" ")[:2])

def get_research_area(string):
    """ takes third item of string (research area)
    Research area: Animals
    and retrieves the text after the two-word label
    Animals

    string: " | "-separated info line; raises IndexError when it has no third
    item or the item has fewer than three words.
    returns: everything after the first two words of the item, so areas made
    of several words (e.g. "Marine Biology") are returned whole."""
    area = string.split(" | ")[2]
    # maxsplit=2 keeps the remainder of the item together as the third piece
    return area.split(" ", 2)[2]

def get_project_title(soup):
    """get project title from soup

    Reads the text of the "list_title" div inside the project details
    section and collapses tabs and line breaks into single spaces, the same
    way the abstract is cleaned, so the title fits on one line of the note.
    """
    section = get_project_details_soup(soup)
    title = section.find("div", {"class": "list_title"}).text # customize this search term
    title = " ".join(title.split())
    return title

def get_project_abstract(soup):
    """get project abstract from soup

    Reads the text of the "list_text" div inside the project details section
    and collapses tabs and line breaks into single spaces.
    """
    section = get_project_details_soup(soup)
    abstract = section.find("div", {"class": "list_text"}).text # customize this search term
    abstract = " ".join(abstract.split())
    return abstract


#for the event webpage, I created a modified "main_script.py" to get events.
#leaving this here for your reference.
def get_event(soup):
    """on event page, find the event details

    soup: parsed event page.
    returns: list of dictionaries, one per "list_subtitle" div, each with
    the keys "name", "title" and "date". The name is title-cased and cut at
    the first " (". Raises IndexError when the page has fewer titles or
    dates than names.
    """
    titles = soup.find_all("h4", {"class": "list_title"}) # customize this search term
    names = soup.find_all("div", {"class": "list_subtitle"}) # customize this search term
    dates = soup.find_all("div", {"class": "list_date"}) # customize this search term
    events = []
    # the three result lists are matched up by position
    for i, subtitle in enumerate(names):
        events.append({
            "name": subtitle.text.title().split(" (")[0].strip(),
            "title": titles[i].text,
            "date": dates[i].text,
        })
    return events

## Creating md files


def create_md_file_project(name, person_url, project):
    """output: project md file, named after the person and project[2] (the end date slot),
    filled in by write_YAML_project"""
    vault_dir = "C:\\Users\\XXX\\XXX\\" #designate where to put the folder, in this case it's the Obsidian vault
    note_name = "{}-{}.md".format(name, project[2]) # this will be the note file name
    with io.open(vault_dir + note_name, "w+", encoding="UTF8") as note:
        write_YAML_project(note, name, person_url, project)

def create_md_file_person(name, person_url, tag):
    """output: person md file; created with a YAML header when missing,
    otherwise the new role link and tag are appended to it"""
    # the note file is named after the person and lives in the Obsidian vault
    note_path = "C:\\Users\\XXX\\XXX\\" + ("%s.md" % name)

    # no note yet: create it and write the header
    if not os.path.exists(note_path):
        with io.open(note_path, "w+", encoding="UTF8") as note:
            write_YAML_person(note, name, person_url, tag)
        return

    # note already there: only append
    # important when a person got a promotion and thus changed roles; the new role
    # (a wikilink to the role note, plus a tag) is added to the pre-existing file
    with io.open(note_path, "a", encoding="UTF8") as note:
        note.write("\n\n[[" + tag + "]]" + "\n\n#" + tag)

def write_YAML_person(f, name, person_url, tag):
    """ write the YAML header of a person note into the open file f:
    type, empty aliases, today's date, the profile url and the tag
    (name is accepted but not written) """
    today = datetime.today().strftime('%Y-%m-%d')
    front_matter = [
        "---",
        "\ntype: person",
        "\naliases: []",
        "\ncreate_date: " + today,
        "\nurl: " + person_url,
        "\ntags: " + tag,
        "\n", "---", "\n",
    ]
    f.writelines(front_matter)

def write_YAML_project(f, name, person_url, project):
    """ write a project note into the open .md file f for Obsidian to read:
    a YAML header followed by one "key:: value" line per project field
    (project is the list of type, start date, end date, research area, title, abstract)"""
    today = datetime.today().strftime('%Y-%m-%d')

    #YAML
    front_matter = [
        "---",
        "\ntype: project",
        "\naliases: []",
        "\ncreate_date: " + today,
        "\nurl: " + person_url,
        "\n", "---", "\n",
    ]

    # under YAML
    fields = [
        "\nname:: " + "[[" + name + "]]",
        "\n\nmember_role:: " + project[0],
        "\n\nproject_start_date:: " + project[1],
        "\n\nproject_end_date:: " + project[2],
        "\n\nresearch_area:: " + "[[" + project[3] + "]]",
        "\n\ntitle:: " + project[4],
        "\n\nabstract:: " + project[5],
        "\n\n",
    ]
    f.writelines(front_matter + fields)

#for the event webpage, I created a modified "main_script.py" to get events.
#leaving this here for your reference.
def create_md_file_event(title, name, date):
    """output: event md file

    title: event title, written double-quoted into the YAML header.
    name: speaker name, written as a wikilink under the header.
    date: event date text; only its first 10 characters are used, both in
    the file name and in the header (presumably YYYY-MM-DD, confirm against
    the target site).
    """
    day = date[0:10]
    filename_project = "event_%s.md" % day
    project_path = "C:\\Users\\XXX\\XXX\\XXX\\" + filename_project
    # escape backslashes and double quotes so the title stays a valid
    # double-quoted YAML scalar
    safe_title = title.replace("\\", "\\\\").replace("\"", "\\\"")

    with io.open(project_path, "w+", encoding="UTF8") as f:
        #YAML
        f.writelines(["---", "\n"])
        f.write("\ntags: tag1, tag2")
        f.write("\ntitle: " + "\"" + safe_title + "\"")
        f.write("\ndate: " + day)
        f.write("\nmode: ") #hybrid, online, etc.
        f.write("\nattendance: ")
        f.writelines(["\n", "---", "\n"])

        #under YAML
        f.write("\nname:: " + "[[" + name + "]]")