def get_cache_filename(key):
    """
    Look up the storage filename(s) recorded in FILECACHE under `key`.

    The cached value is a UTF-8 encoded, comma-separated list of filenames.
    The entry is ignored if any of the listed files is missing.
    :param key: cache key to look up
    :return: the falsy cache value unchanged when there is no entry, None
        when any cached file is missing, a single filename when the entry
        names one file, otherwise the list of filenames
    """
    cached_value = FILECACHE.get(key)
    if not cached_value:
        return cached_value

    names = cached_value.decode("utf-8").split(",")

    # if any file was somehow deleted, make sure we don't return the entry.
    for name in names:
        if not cache_file_exists(name):
            return None

    if len(names) == 1:
        return names[0]
    return names


def cache_file_exists(cache_file):
    """
    Tell whether `cache_file` exists at the path that
    `config.get_storage_path` resolves it to.
    """
    storage_path = config.get_storage_path(cache_file)
    return os.path.exists(storage_path)
def get_hash(filepath):
    """
    Return the hexadecimal MD5 digest of the file at `filepath`.
    :param filepath: path of the file to hash (opened in binary mode)
    :rtype: str
    """
    with open(filepath, "rb") as fobj:
        return get_hash_from_fd(fobj)


def get_hash_from_fd(fobj):
    """
    Return the hexadecimal MD5 digest of everything `fobj.read` yields from
    its current position until it returns `b""`.
    :param fobj: binary file-like object exposing `read(count)`
    :rtype: str
    """
    file_hash = hashlib.md5()
    # Read in 2 MiB chunks so large files are never loaded whole.
    for chunk in iter(lambda: fobj.read(2097152), b""):
        file_hash.update(chunk)
    return file_hash.hexdigest()


def copy_file_to_storage(src_file_name, ext=None, slice_size=None):
    """
    Copy `src_file_name` (filepath) to storage under a content-hash name.
    If `slice_size` is set, the file will be broken into slices if it exceeds
    that size in bytes, and every slice is stored as its own file.
    :param src_file_name: path of the file to copy
    :param ext: extension for the stored file(s); derived from
        `src_file_name` when None
    :param slice_size: maximum size of one stored file in bytes, or None to
        store the file in one piece
    :raises ValueError: if `slice_size` is given but is not positive
    :return: `{md5hash}.{ext}` when `slice_size` is None, otherwise the list
        of such names, one per slice, in file order
    :rtype: str or list of str
    """
    # A non-positive slice size would make FileSlice.from_file yield empty
    # slices forever, so reject it before touching the file.
    if slice_size is not None and slice_size <= 0:
        raise ValueError("slice_size must be a positive number of bytes")

    if ext is None:
        ext = extract_path_ext(src_file_name)

    filenames = []

    with open(src_file_name, "rb") as src_fd:
        file_slices = list(FileSlice.from_file(src_fd, slice_size))

        for file_slice in file_slices:
            slice_hash = get_hash_from_fd(file_slice)
            # Hashing consumed the slice; rewind it before copying.
            file_slice.seek(0)

            out_file_name = "{}.{}".format(slice_hash, ext)
            storage_path = config.get_storage_path(out_file_name)

            # Opening the source itself for writing would truncate it, so
            # skip the copy when the destination already is the source.
            try:
                is_same_file = os.path.samefile(storage_path, src_file_name)
            except FileNotFoundError:
                is_same_file = False

            if not is_same_file:
                with open(storage_path, "wb") as out_fd:
                    shutil.copyfileobj(file_slice, out_fd)

            filenames.append(out_file_name)

    if slice_size is None:
        return filenames[0]
    return filenames


class SplittableDownloadFile(DownloadFile):
    """
    A type of DownloadFile that will be split into pieces if the source file
    exceeds `slice_size`. This is separate from DownloadFile because not all
    content types support file splitting.
    """

    # 2 GB in bytes
    slice_size = 2000000000

    def process_file(self):
        """
        Download the file and register one extra file on the node for every
        slice after the first.
        :return: whatever the parent `process_file` returned when that is not
            a list, otherwise the filename of the first slice
        """
        filenames = super(SplittableDownloadFile, self).process_file()

        if not isinstance(filenames, list):
            return filenames

        # TODO: When we call node.add_file, we are assuming files will be
        #       added in sequence and that order will be maintained. Should we
        #       add a mechanism where it adds split file order to extra_fields,
        #       similar to SlideshowNode?

        self.filename = filenames[0]
        for extra_filename in filenames[1:]:
            extra_file = self.create_split(extra_filename)
            self.node.add_file(extra_file)

        return self.filename

    def create_split(self, filename):
        """
        Build a SplittableDownloadFile for one additional slice, copying this
        file's path, preset, language, default extension and source URL.
        :param filename: storage filename of the slice
        :return: the new file object with `filename` and `ext` already set
        """
        download_file = SplittableDownloadFile(
            self.path,
            preset=self.get_preset(),
            language=self.language,
            default_ext=self.default_ext,
            source_url=self.source_url,
        )
        download_file.filename = filename
        download_file.ext = self.ext
        return download_file

    def _download(self):
        """Download `self.path`, slicing the result at `self.slice_size`."""
        return download(
            self.path, default_ext=self.default_ext, slice_size=self.slice_size
        )
class FileSlice(object):
    """
    Read-only, file-like view of a contiguous byte range of another file.

    The slice starts at the wrapped file's offset at construction time and
    covers at most `count` bytes, clamped to the end of the file. Positions
    taken and returned by `seek`, `tell` and `read` are always relative to
    the start of the slice, and reads never go past the slice's end.

    Several slices may share one file object: each slice remembers its own
    position and re-seeks the file before reading.
    """

    def __init__(self, file, count=None):
        """
        :param file: seekable binary file object, positioned at the first
            byte of the slice
        :param count: maximum slice length in bytes, or None for everything
            up to the end of the file
        :raises ValueError: if `count` is negative
        """
        if count is not None and count < 0:
            raise ValueError("count must not be negative")

        self.file = file
        self.start = file.tell()

        file.seek(0, 2)
        self.file_size = file.tell()

        # Never extend past the end of the file (or before `start`, should
        # the file have been positioned beyond its end).
        available = max(0, self.file_size - self.start)
        if count is None:
            count = available
        self.end = self.start + min(available, count)

        # Seek to the end of the slice so the next FileSlice object will be
        # created from that point.
        file.seek(self.end)

        # Current read position, relative to `self.start`.
        self._position = 0

    @classmethod
    def from_file(cls, file, chunk_size):
        """
        Yield consecutive slices of at most `chunk_size` bytes, starting at
        the file's current offset and ending at the end of the file. At least
        one slice is always yielded, even for an empty file.

        :param chunk_size: slice length in bytes, or None for a single slice
        :raises ValueError: if `chunk_size` is given but is not positive
        """
        if chunk_size is not None and chunk_size <= 0:
            raise ValueError("chunk_size must be a positive number of bytes")

        file_slice = cls(file, chunk_size)
        yield file_slice

        while file_slice.end < file_slice.file_size:
            # The consumer may have read from the previous slice and moved
            # the shared file; start the next slice exactly where it ended.
            file.seek(file_slice.end)
            file_slice = cls(file, chunk_size)
            yield file_slice

    @property
    def size(self):
        """Length of the slice in bytes."""
        return self.end - self.start

    def seek(self, offset, whence=0):
        """
        Move the read position and return it, relative to the slice start.

        :param whence: 0 = from slice start, 1 = from the current position,
            2 = from slice end
        :raises ValueError: for an unknown `whence` or a position before the
            start of the slice
        """
        if whence == 0:
            position = offset
        elif whence == 1:
            position = self._position + offset
        elif whence == 2:
            position = self.size + offset
        else:
            raise ValueError(
                "invalid whence ({!r}, should be 0, 1 or 2)".format(whence)
            )

        if position < 0:
            raise ValueError("negative seek position {!r}".format(position))

        self.file.seek(self.start + position)
        self._position = position
        return position

    def tell(self):
        """Return the read position relative to the slice start."""
        return self._position

    def read(self, count=None):
        """
        Read up to `count` bytes without crossing the end of the slice.
        None or a negative `count` reads everything that is left.
        """
        remaining = max(0, self.size - self._position)
        if count is None or count < 0:
            count = remaining

        # Another slice may have moved the shared file since our last call.
        self.file.seek(self.start + self._position)
        buffer = self.file.read(min(count, remaining))
        self._position = self.file.tell() - self.start
        return buffer

    def write(self, string):
        """Slices are read-only; always raises NotImplementedError."""
        raise NotImplementedError()