forked from samadii/WebDownloaderBot
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathweb_dl.py
More file actions
69 lines (64 loc) · 3.44 KB
/
web_dl.py
File metadata and controls
69 lines (64 loc) · 3.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import os, sys
import re
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
class urlDownloader(object):
    """Download a web page together with the components it references.

    The page HTML is fetched and parsed; the files referenced by its
    ``<img>``, ``<link>`` and ``<script>`` tags are downloaded into one
    sub-folder per tag name, and the references inside the saved HTML are
    rewritten to point at those local copies.
    """

    # One or more characters that are not letters, digits or underscore.
    # Each run is collapsed to a single '.' when building a local file name.
    # Raw string: '\W' in a normal string literal is an invalid escape.
    _UNSAFE_CHARS = re.compile(r'\W+')

    def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, timeout=30):
        """ Configure which component types are downloaded.
            Args:
                imgFlg (bool, optional): download the <img> 'src' targets.
                linkFlg (bool, optional): download the <link> 'href' targets.
                scriptFlg (bool, optional): download the <script> 'src' targets.
                timeout (float|tuple|None, optional): timeout in seconds passed
                    to every HTTP request so a stalled server cannot block
                    forever. None disables the timeout.
        """
        self.soup = None            # parsed document of the last fetched page
        self.imgFlg = imgFlg
        self.linkFlg = linkFlg
        self.scriptFlg = scriptFlg
        self.timeout = timeout
        # <link> targets whose name contains none of these get '.html' appended.
        self.linkType = ('css', 'png', 'ico', 'jpg', 'jpeg', 'mov', 'ogg', 'gif', 'xml', 'js')
        self.session = requests.Session()

    #-----------------------------------------------------------------------------
    def savePage(self, url, pagefolder='page'):
        """ Save the web page and its components based on the input url and dir name.
            Args:
                url (str): web url string.
                pagefolder (str, optional): path to save the web components;
                    missing parent folders are created.
            Returns:
                [bool]: True if the page was saved, False if any step raised
                    (the error is printed to stdout).
        """
        try:
            response = self.session.get(url, timeout=self.timeout)
            self.soup = BeautifulSoup(response.text, features="lxml")
            # makedirs handles nested paths and an already existing folder.
            os.makedirs(pagefolder, exist_ok=True)
            if self.imgFlg:
                self._soupfindnSave(url, pagefolder, tag2find='img', inner='src')
            if self.linkFlg:
                self._soupfindnSave(url, pagefolder, tag2find='link', inner='href')
            if self.scriptFlg:
                self._soupfindnSave(url, pagefolder, tag2find='script', inner='src')
            # Written last so the saved HTML contains the rewritten references.
            with open(os.path.join(pagefolder, 'page.html'), 'wb') as file:
                file.write(self.soup.prettify('utf-8'))
            return True
        except Exception as e:
            print("> savePage(): Create files failed: %s." % str(e))
            return False

    #-----------------------------------------------------------------------------
    def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src'):
        """ Download every <tag2find> target into pagefolder/<tag2find>.
            Args:
                url (str): page url, used to resolve relative references.
                pagefolder (str): folder of the saved page.
                tag2find (str, optional): tag name to look for in self.soup.
                inner (str, optional): attribute holding the file reference.
            Each matching tag's attribute is rewritten to the relative local
            path '<tag2find>/<filename>'. A failure on one tag is printed to
            stderr and does not stop the remaining tags.
        """
        pagefolder = os.path.join(pagefolder, tag2find)
        os.makedirs(pagefolder, exist_ok=True)
        for res in self.soup.find_all(tag2find):  # images, css, etc..
            try:
                if not res.has_attr(inner):
                    continue  # tag carries no file reference
                # clean special chars such as '@, # ? <>'
                filename = self._UNSAFE_CHARS.sub('.', os.path.basename(res[inner]))
                # Added the '.html' for the html file in the href
                if tag2find == 'link' and not any(ext in filename for ext in self.linkType):
                    filename += '.html'
                if not filename.strip('.'):
                    # Empty reference (e.g. src=""): there is no file name to
                    # save under, so leave the attribute untouched.
                    continue
                fileurl = urljoin(url, res.get(inner))
                filepath = os.path.join(pagefolder, filename)
                # rename html ref so can move html and folder of files anywhere
                res[inner] = os.path.join(os.path.basename(pagefolder), filename)
                if os.path.isfile(filepath):
                    continue  # already downloaded
                # Download before opening the target: a failed or empty
                # download must not leave a zero-byte file behind, because
                # the isfile() check above would then block every retry.
                filebin = self.session.get(fileurl, timeout=self.timeout)
                if filebin.content:  # filter the empty file (image not found)
                    with open(filepath, 'wb') as file:
                        file.write(filebin.content)
            except Exception as exc:
                print(exc, file=sys.stderr)