Skip to content

Commit 05900fd

Browse files
committed
Automatically split large files when copying
1 parent a74e193 commit 05900fd

File tree

3 files changed

+158
-22
lines changed

3 files changed

+158
-22
lines changed

ricecooker/classes/files.py

Lines changed: 79 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from ..exceptions import UnknownFileTypeError
3030
from ricecooker.utils.encodings import get_base64_encoding
3131
from ricecooker.utils.encodings import write_base64_to_file
32+
from ricecooker.utils.file_slice import FileSlice
3233
from ricecooker.utils.images import create_image_from_epub
3334
from ricecooker.utils.images import create_image_from_pdf_page
3435
from ricecooker.utils.images import create_image_from_zip
@@ -161,12 +162,12 @@ def download(path, default_ext=None):
161162
# Get extension of file or use `default_ext` if none found
162163
if not ext:
163164
ext = extract_path_ext(path, default_ext=default_ext)
164-
filename = copy_file_to_storage(tempf.name, ext=ext)
165-
FILECACHE.set(key, bytes(filename, "utf-8"))
166-
config.LOGGER.info("\t--- Downloaded {}".format(filename))
165+
filenames = copy_file_to_storage(tempf.name, ext=ext)
166+
FILECACHE.set(key, bytes(",".join(filenames), "utf-8"))
167+
config.LOGGER.info("\t--- Downloaded {}".format(filenames))
167168
os.unlink(tempf.name)
168169

169-
return filename, ext
170+
return filenames, ext
170171

171172

172173
def download_and_convert_video(path, ffmpeg_settings=None):
@@ -242,29 +243,48 @@ def write_path_to_filename(path, write_to_file):
242243

243244

244245
def get_hash(filepath):
245-
file_hash = hashlib.md5()
246246
with open(filepath, "rb") as fobj:
247-
for chunk in iter(lambda: fobj.read(2097152), b""):
248-
file_hash.update(chunk)
247+
return get_hash_from_fd(fobj)
248+
249+
250+
def get_hash_from_fd(fobj):
251+
file_hash = hashlib.md5()
252+
for chunk in iter(lambda: fobj.read(2097152), b""):
253+
file_hash.update(chunk)
249254
return file_hash.hexdigest()
250255

251256

252-
def copy_file_to_storage(srcfilename, ext=None):
257+
# 10 MB in bytes
258+
FILE_SIZE_MAX_BYTES = 10000000
259+
260+
261+
def copy_file_to_storage(src_file_name, ext=None, chunk_size=FILE_SIZE_MAX_BYTES):
253262
"""
254-
Copy `srcfilename` (filepath) to destination.
263+
Copy `src_file_name` (filepath) to destination.
264+
The file will be broken into parts if its size exceeds `chunk_size`.
255265
:rtype: None
256266
"""
257267
if ext is None:
258-
ext = extract_path_ext(srcfilename)
268+
ext = extract_path_ext(src_file_name)
259269

260-
hash = get_hash(srcfilename)
261-
filename = "{}.{}".format(hash, ext)
262-
try:
263-
shutil.copy(srcfilename, config.get_storage_path(filename))
264-
except shutil.SameFileError:
265-
pass
270+
filenames = []
266271

267-
return filename
272+
with open(src_file_name, "rb") as src_fd:
273+
slices = list(FileSlice.from_file(src_fd, chunk_size))
274+
275+
for slice in slices:
276+
slice_hash = get_hash_from_fd(slice)
277+
slice.seek(0)
278+
279+
file_name = "{}.{}".format(slice_hash, ext)
280+
storage_path = config.get_storage_path(file_name)
281+
282+
with open(storage_path, "wb") as out_fd:
283+
shutil.copyfileobj(slice, out_fd)
284+
285+
filenames.append(file_name)
286+
287+
return filenames
268288

269289

270290
def compress_video_file(filename, ffmpeg_settings):
@@ -386,6 +406,9 @@ class File(object):
386406
language = None
387407
assessment_item = None
388408
is_primary = False
409+
# Supplementary files are additional File objects which have been
410+
# discovered that must be tracked in addition to this one.
411+
supplementary_files = []
389412

390413
def __init__(self, preset=None, language=None, default_ext=None, source_url=None):
391414
self.preset = preset
@@ -490,22 +513,59 @@ def validate(self):
490513

491514
def process_file(self):
492515
try:
493-
self.filename, self.ext = download(self.path, default_ext=self.default_ext)
516+
filenames, self.ext = download(self.path, default_ext=self.default_ext)
494517
# don't validate for single-digit extension, or no extension
495518
if not self.ext:
496519
self.ext = extract_path_ext(self.path)
497-
return self.filename
498520
# Catch errors related to reading file path and handle silently
499521
except HTTP_CAUGHT_EXCEPTIONS as err:
500522
self.error = str(err)
501523
config.LOGGER.debug("Failed to download, error is: {}".format(err))
502524
config.FAILED_FILES.append(self)
503525
return None
504526

527+
supplementary_files = []
528+
529+
if isinstance(filenames, list):
530+
self.filename = filenames[0]
531+
for extra_filename in filenames[1:]:
532+
extra_file = SplitFile(
533+
self,
534+
extra_filename,
535+
self.ext,
536+
preset=self.preset,
537+
language=self.language,
538+
default_ext=self.default_ext,
539+
source_url=self.source_url,
540+
)
541+
supplementary_files.append(extra_file)
542+
else:
543+
self.filename = filenames
544+
545+
self.supplementary_files = supplementary_files
546+
547+
return self.filename
548+
505549
def __str__(self):
506550
return self.path
507551

508552

553+
class SplitFile(File):
    """
    An additional part of a file that was broken into several pieces.

    Each instance records the `File` it was split from (`base_file`) along
    with the storage filename and extension of this particular part. Any
    remaining keyword arguments are forwarded unchanged to `File.__init__`.
    """

    # FIXME: Move this to the ZimNode / ZimFile, and adjust DownloadFile so it
    # only creates split files if it is supported.

    def __init__(self, base_file, filename, ext, **kwargs):
        super(SplitFile, self).__init__(**kwargs)
        self.base_file, self.filename, self.ext = base_file, filename, ext

    def __str__(self):
        # Describe the part in terms of the file it was split from.
        description = "{} split {}"
        return description.format(self.base_file, self.filename)

    def get_preset(self):
        # A part always shares the preset of the file it belongs to.
        origin = self.base_file
        return origin.get_preset()
567+
568+
509569
IMAGE_EXTENSIONS = {
510570
file_formats.PNG,
511571
file_formats.JPG,

ricecooker/classes/nodes.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -181,9 +181,14 @@ def process_files(self):
181181
- (optionally) generate thumbnail file from the node's content
182182
Returns: content-hash based filenames of all the files for this node
183183
"""
184-
filenames = []
184+
extra_files = []
185+
185186
for file in self.files:
186-
filenames.append(file.process_file())
187+
file.process_file()
188+
extra_files.extend(file.supplementary_files)
189+
self.files.extend(extra_files)
190+
191+
filenames = [file.filename for file in self.files]
187192

188193
# Auto-generation of thumbnails happens here if derive_thumbnail or config.THUMBNAILS is set
189194
if not self.has_thumbnail() and (config.THUMBNAILS or self.derive_thumbnail):
@@ -198,7 +203,7 @@ def process_files(self):
198203
else:
199204
pass # method generate_thumbnail is not implemented or no suitable source file found
200205

201-
return filenames
206+
return tuple(filenames)
202207

203208
def count(self):
204209
"""count: get number of nodes in tree

ricecooker/utils/file_slice.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
class FileSlice(object):
    """
    Read-only, file-like view of a contiguous byte range of another file.

    The slice begins at the underlying file's offset at construction time
    and spans at most `count` bytes (fewer if the file ends first). Offsets
    accepted and returned by `seek`, `tell` and `read` are relative to the
    start of the slice, and reads never go past the end of the slice.

    Several slices may share one underlying file object: every slice
    remembers its own position and restores it before each operation.
    """

    def __init__(self, file, count):
        """
        :param file: seekable binary file object, positioned where the
            slice should start.
        :param count: maximum number of bytes the slice may cover.
        """
        self.file = file
        self.start = file.tell()

        file.seek(0, 2)
        self.file_size = file.tell()

        # Never extend past the end of the file, and never produce a
        # negative size (non-positive `count` or a start offset past EOF).
        count = max(0, min(self.file_size - self.start, count))
        self.end = self.start + count

        # Leave the underlying file at the end of this slice so the next
        # FileSlice object will be created from that point.
        file.seek(self.end)

        self.__last_offset = self.start

    @classmethod
    def from_file(cls, file, chunk_size):
        """
        Yield consecutive slices of at most `chunk_size` bytes, starting at
        the file's current offset and continuing until the end of the file.
        At least one slice is always produced, even for an empty file.

        :raises ValueError: if `chunk_size` is not positive (the slices
            would never advance through the file).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive number of bytes")

        file_slice = cls(file, chunk_size)
        yield file_slice

        while file_slice.end < file_slice.file_size:
            # The consumer may have moved the shared file object by reading
            # from an earlier slice, so reposition it before carrying on.
            file.seek(file_slice.end)
            file_slice = cls(file, chunk_size)
            yield file_slice

    @property
    def size(self):
        """Number of bytes covered by this slice."""
        return self.end - self.start

    def seek(self, offset, whence=0):
        """
        Move the slice's position, following the standard file API:
        `whence` 0 is relative to the slice start, 1 to the current
        position and 2 to the slice end (where `offset` is usually
        negative or zero).

        :returns: the new position, relative to the slice start.
        :raises ValueError: for an unknown `whence`, or when the resulting
            position would lie before the start of the slice.
        """
        if whence == 0:
            target = self.start + offset
        elif whence == 1:
            # tell() is relative to the slice, so add the start back in to
            # get an absolute offset in the underlying file.
            target = self.start + self.tell() + offset
        elif whence == 2:
            target = self.end + offset
        else:
            raise ValueError(
                "invalid whence ({}, should be 0, 1 or 2)".format(whence)
            )

        if target < self.start:
            raise ValueError("negative seek position in slice")

        self.file.seek(target)
        self.__store_offset()
        return target - self.start

    def __reset_offset(self):
        # Another slice (or the caller) may have moved the shared file
        # object since this slice last used it.
        if self.file.tell() != self.__last_offset:
            self.file.seek(self.__last_offset)

    def __store_offset(self):
        self.__last_offset = self.file.tell()

    def tell(self):
        """Return the current position relative to the slice start."""
        self.__reset_offset()
        return self.file.tell() - self.start

    def read(self, count=None):
        """
        Read up to `count` bytes, stopping at the end of the slice. With
        `count` omitted, None or negative, read everything that remains.
        """
        self.__reset_offset()

        remaining = max(0, self.size - self.tell())
        if count is None or count < 0:
            count = remaining

        buffer = self.file.read(min(count, remaining))
        self.__store_offset()
        return buffer

    def write(self, string):
        """Slices are read-only; writing is not supported."""
        raise NotImplementedError()

0 commit comments

Comments
 (0)