Segment large files for upload without needing tons of RAM #551

drewbrew · 2019-01-28T18:23:57Z

Instead of hard-coding the temporary file names, why not use TemporaryFile?

drewbrew · 2019-01-28T18:28:47Z

equal with --> equal to

-Original file line number
+Diff line change
@@ Expand Up @@
                 sequence = str(segment + 1).zfill(digits)
                 seg_name = "%s.%s" % (obj_name, sequence)
                 with utils.SelfDeletingTempfile() as tmpname:
+                    # Write the temporary file in small pieces, to be memory efficient.
                     with open(tmpname, "wb") as tmp:
-                        tmp.write(content.read(MAX_FILE_SIZE))
+                        for chunk in utils.read_in_chunks(content,
+                                max_size=MAX_FILE_SIZE):
+                            tmp.write(chunk)
                     with open(tmpname, "rb") as tmp:
                         # We have to calculate the etag for each segment
                         etag = utils.get_checksum(tmp)
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -784,3 +784,17 @@ def to_slug(value, incoming=None, errors="strict"): @@
     # For backwards compatibility, alias slugify to point to_slug
     slugify = to_slug
def read_in_chunks(file_object, max_size, chunk_size=8192):
    """Yield data from ``file_object`` in pieces, up to ``max_size`` bytes total.

    This lets a caller copy a bounded slice of a large stream without
    holding the whole slice in memory at once.

    :param file_object: any object with a ``read(size)`` method.
    :param max_size: maximum total number of bytes to yield; coerced with
        ``int()``. Zero or a negative value yields nothing.
    :param chunk_size: maximum size of each yielded piece; coerced with
        ``int()``. Zero or a negative value yields nothing.
    :returns: a generator of the non-empty objects returned by ``read()``.
        Iteration stops when ``max_size`` bytes have been yielded or when
        ``read()`` returns an empty result (end of stream), whichever
        comes first.
    """
    bytes_left_to_read = int(max_size)
    chunk_size = int(chunk_size)
    if chunk_size <= 0:
        # Nothing sensible can be read with a non-positive chunk size.
        return
    while bytes_left_to_read > 0:
        # Never ask for more than the remaining budget, so the total can
        # not exceed max_size even when it is not a multiple of chunk_size.
        data = file_object.read(min(bytes_left_to_read, chunk_size))
        if not data:
            break
        # Account for what was actually returned: read() may legitimately
        # hand back fewer bytes than requested (pipes, sockets, wrappers).
        bytes_left_to_read -= len(data)
        yield data

-Original file line number
+Diff line change
@@ Expand Up / @@ -422,6 +422,54 @@ def test_update_exc(self): @@
             ret = utils.update_exc(err, msg2, before=False, separator=sep)
             self.assertEqual(ret.message, exp)
def test_read_in_chunks(self):
    """Check that utils.read_in_chunks yields exactly the expected bytes.

    A real (anonymous) temporary file is used as the source so that the
    helper is exercised against an ordinary file object, without leaving
    fixed-name files behind in the working directory.
    """
    # Imported locally so this test does not depend on the module's
    # top-level import list.
    import tempfile

    payload = os.urandom(1024)
    # Make sure the payload is the size we're expecting.
    self.assertEqual(1024, len(payload))

    # (max_size, chunk_size) pairs to exercise.
    cases = [
        (1, 1024),   # max_size smaller than chunk size
        (512, 64),   # max_size larger than chunk size
        (512, 512),  # max_size equal to chunk size
    ]

    # TemporaryFile is removed automatically when the block exits, even
    # if an assertion fails.
    with tempfile.TemporaryFile() as source:
        source.write(payload)
        source.flush()
        for max_size, chunk_size in cases:
            msg = "max_size=%d, chunk_size=%d" % (max_size, chunk_size)
            source.seek(0)
            chunks = list(utils.read_in_chunks(
                source, max_size=max_size, chunk_size=chunk_size))
            # The output must be exactly the first max_size bytes of the
            # source -- not merely a prefix of it.
            self.assertEqual(payload[:max_size], b"".join(chunks), msg)
            # No single piece may exceed the requested chunk size.
            for chunk in chunks:
                self.assertLessEqual(len(chunk), chunk_size, msg)
     # Run the tests through unittest when this module is executed directly.
     if __name__ == "__main__":
         unittest.main()

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Segment large files for upload without needing tons of RAM #551

Uh oh!

Diff view

Diff view

There are no files selected for viewing

drewbrew Jan 28, 2019

Uh oh!

drewbrew Jan 28, 2019

Uh oh!

Uh oh!

Segment large files for upload without needing tons of RAM #551

Are you sure you want to change the base?

Uh oh!

Segment large files for upload without needing tons of RAM #551

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

drewbrew Jan 28, 2019

Choose a reason for hiding this comment

Uh oh!

drewbrew Jan 28, 2019

Choose a reason for hiding this comment

Uh oh!