|
29 | 29 | from ..exceptions import UnknownFileTypeError |
30 | 30 | from ricecooker.utils.encodings import get_base64_encoding |
31 | 31 | from ricecooker.utils.encodings import write_base64_to_file |
| 32 | +from ricecooker.utils.file_slice import FileSlice |
32 | 33 | from ricecooker.utils.images import create_image_from_epub |
33 | 34 | from ricecooker.utils.images import create_image_from_pdf_page |
34 | 35 | from ricecooker.utils.images import create_image_from_zip |
@@ -161,12 +162,12 @@ def download(path, default_ext=None): |
161 | 162 | # Get extension of file or use `default_ext` if none found |
162 | 163 | if not ext: |
163 | 164 | ext = extract_path_ext(path, default_ext=default_ext) |
164 | | - filename = copy_file_to_storage(tempf.name, ext=ext) |
165 | | - FILECACHE.set(key, bytes(filename, "utf-8")) |
166 | | - config.LOGGER.info("\t--- Downloaded {}".format(filename)) |
| 165 | + filenames = copy_file_to_storage(tempf.name, ext=ext) |
| 166 | + FILECACHE.set(key, bytes(",".join(filenames), "utf-8")) |
| 167 | + config.LOGGER.info("\t--- Downloaded {}".format(filenames)) |
167 | 168 | os.unlink(tempf.name) |
168 | 169 |
|
169 | | - return filename, ext |
| 170 | + return filenames, ext |
170 | 171 |
|
171 | 172 |
|
172 | 173 | def download_and_convert_video(path, ffmpeg_settings=None): |
@@ -242,29 +243,48 @@ def write_path_to_filename(path, write_to_file): |
242 | 243 |
|
243 | 244 |
|
244 | 245 | def get_hash(filepath): |
245 | | - file_hash = hashlib.md5() |
246 | 246 | with open(filepath, "rb") as fobj: |
247 | | - for chunk in iter(lambda: fobj.read(2097152), b""): |
248 | | - file_hash.update(chunk) |
| 247 | + return get_hash_from_fd(fobj) |
| 248 | + |
| 249 | + |
| 250 | +def get_hash_from_fd(fobj): |
| 251 | + file_hash = hashlib.md5() |
| 252 | + for chunk in iter(lambda: fobj.read(2097152), b""): |
| 253 | + file_hash.update(chunk) |
249 | 254 | return file_hash.hexdigest() |
250 | 255 |
|
251 | 256 |
|
252 | | -def copy_file_to_storage(srcfilename, ext=None): |
| 257 | +# 10 MB in bytes |
| 258 | +FILE_SIZE_MAX_BYTES = 10000000 |
| 259 | + |
| 260 | + |
| 261 | +def copy_file_to_storage(src_file_name, ext=None, chunk_size=FILE_SIZE_MAX_BYTES): |
253 | 262 | """ |
254 | | - Copy `srcfilename` (filepath) to destination. |
| 263 | + Copy `src_file_name` (filepath) to destination. |
| 264 | + The file will be broken into parts if its size exceeds `chunk_size`. |
255 | 265 | :rtype: list |
256 | 266 | """ |
257 | 267 | if ext is None: |
258 | | - ext = extract_path_ext(srcfilename) |
| 268 | + ext = extract_path_ext(src_file_name) |
259 | 269 |
|
260 | | - hash = get_hash(srcfilename) |
261 | | - filename = "{}.{}".format(hash, ext) |
262 | | - try: |
263 | | - shutil.copy(srcfilename, config.get_storage_path(filename)) |
264 | | - except shutil.SameFileError: |
265 | | - pass |
| 270 | + filenames = [] |
266 | 271 |
|
267 | | - return filename |
| 272 | + with open(src_file_name, "rb") as src_fd: |
| 273 | + slices = list(FileSlice.from_file(src_fd, chunk_size)) |
| 274 | + |
| 275 | + for slice in slices: |
| 276 | + slice_hash = get_hash_from_fd(slice) |
| 277 | + slice.seek(0) |
| 278 | + |
| 279 | + file_name = "{}.{}".format(slice_hash, ext) |
| 280 | + storage_path = config.get_storage_path(file_name) |
| 281 | + |
| 282 | + with open(storage_path, "wb") as out_fd: |
| 283 | + shutil.copyfileobj(slice, out_fd) |
| 284 | + |
| 285 | + filenames.append(file_name) |
| 286 | + |
| 287 | + return filenames |
268 | 288 |
|
269 | 289 |
|
270 | 290 | def compress_video_file(filename, ffmpeg_settings): |
@@ -386,6 +406,9 @@ class File(object): |
386 | 406 | language = None |
387 | 407 | assessment_item = None |
388 | 408 | is_primary = False |
| 409 | + # Supplementary files are additional File objects, discovered while |
| 410 | + # processing this file, that must be tracked alongside it. |
| 411 | + supplementary_files = [] |
389 | 412 |
|
390 | 413 | def __init__(self, preset=None, language=None, default_ext=None, source_url=None): |
391 | 414 | self.preset = preset |
@@ -490,22 +513,59 @@ def validate(self): |
490 | 513 |
|
491 | 514 | def process_file(self): |
492 | 515 | try: |
493 | | - self.filename, self.ext = download(self.path, default_ext=self.default_ext) |
| 516 | + filenames, self.ext = download(self.path, default_ext=self.default_ext) |
494 | 517 | # don't validate for single-digit extension, or no extension |
495 | 518 | if not self.ext: |
496 | 519 | self.ext = extract_path_ext(self.path) |
497 | | - return self.filename |
498 | 520 | # Catch errors related to reading file path and handle silently |
499 | 521 | except HTTP_CAUGHT_EXCEPTIONS as err: |
500 | 522 | self.error = str(err) |
501 | 523 | config.LOGGER.debug("Failed to download, error is: {}".format(err)) |
502 | 524 | config.FAILED_FILES.append(self) |
503 | 525 | return None |
504 | 526 |
|
| 527 | + supplementary_files = [] |
| 528 | + |
| 529 | + if isinstance(filenames, list): |
| 530 | + self.filename = filenames[0] |
| 531 | + for extra_filename in filenames[1:]: |
| 532 | + extra_file = SplitFile( |
| 533 | + self, |
| 534 | + extra_filename, |
| 535 | + self.ext, |
| 536 | + preset=self.preset, |
| 537 | + language=self.language, |
| 538 | + default_ext=self.default_ext, |
| 539 | + source_url=self.source_url, |
| 540 | + ) |
| 541 | + supplementary_files.append(extra_file) |
| 542 | + else: |
| 543 | + self.filename = filenames |
| 544 | + |
| 545 | + self.supplementary_files = supplementary_files |
| 546 | + |
| 547 | + return self.filename |
| 548 | + |
505 | 549 | def __str__(self): |
506 | 550 | return self.path |
507 | 551 |
|
508 | 552 |
|
| 553 | +class SplitFile(File): |
| 554 | + # FIXME: Move this to the ZimNode / ZimFile, and adjust DownloadFile so it |
| 555 | + # only creates split files if it is supported. |
| 556 | + def __init__(self, base_file, filename, ext, **kwargs): |
| 557 | + super(SplitFile, self).__init__(**kwargs) |
| 558 | + self.base_file = base_file |
| 559 | + self.filename = filename |
| 560 | + self.ext = ext |
| 561 | + |
| 562 | + def __str__(self): |
| 563 | + return "{} split {}".format(self.base_file, self.filename) |
| 564 | + |
| 565 | + def get_preset(self): |
| 566 | + return self.base_file.get_preset() |
| 567 | + |
| 568 | + |
509 | 569 | IMAGE_EXTENSIONS = { |
510 | 570 | file_formats.PNG, |
511 | 571 | file_formats.JPG, |
|
0 commit comments