-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpython-link-grabber.py
More file actions
47 lines (24 loc) · 878 Bytes
/
python-link-grabber.py
File metadata and controls
47 lines (24 loc) · 878 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# coding: utf-8
# ### Import urllib and read in data from webpage
# In[1]:
import urllib.request, urllib.parse, urllib.error
# Download the page and keep every line as decoded, whitespace-stripped text.
# The response is opened as a context manager so the underlying connection is
# closed as soon as the body has been read.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Computer_graphics') as fhand:
    # Iterating the response yields raw bytes lines; decode() uses UTF-8.
    lines = [line.decode().strip() for line in fhand]
# ### Import the regex library and compile the regular expression that captures the href of <a> tags
# In[2]:
import re
# Pattern for an anchor element written on a single line: "<a", whitespace,
# anything up to href="...", then anything up to the closing "</a>".
# The only capturing group is the href value, so findall() returns a list of
# href strings. A raw string is used so that "\s" and "\/" reach the regex
# engine unchanged instead of being treated as (invalid) string escapes.
match_string = re.compile(r'(?:<a)(?:\s+.*?href="(.*?)")(?:.*?<\/a>)')
# Notebook leftover: echoes the compiled pattern in a cell, no effect in a script.
match_string
# ### Loop through the text line by line and match against the regular expression built earlier
# In[3]:
# Collect every href captured by the compiled pattern, scanning line by line.
tag_texts = []
for line in lines:
    # findall() returns an empty list when a line has no match, so extending
    # with it is a no-op and no explicit length check is needed.
    tag_texts.extend(match_string.findall(line))
# ### Display the number of tags extracted and the tags themselves
# In[7]:
# Report how many links were captured, then list them one per line.
# A bare `tag_texts` expression is only rendered inside a notebook cell; run
# as a script it prints nothing, so the links are printed explicitly here.
print("Number of links grabbed: " + str(len(tag_texts)) + "\nList of Links:")
for link in tag_texts:
    print(link)