-
Notifications
You must be signed in to change notification settings - Fork 13
Add Google Cloud Storage Adapter #84
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
MaxGroot
wants to merge
14
commits into
fox-it:main
Choose a base branch
from
MaxGroot:feature/gcs-adapter
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
5b6360e
Support file-like objects in record writer
MaxGroot f078d4f
Add Google Cloud Storage Adapter
MaxGroot 36c8f34
Merge branch 'main' into feature/gcs-adapter
MaxGroot 79acd5e
Implement review suggestions
MaxGroot 3394b6a
Use `re.split` to determine prefix and pattern
MaxGroot 901c2ec
Further simplify prefix and pattern logic
MaxGroot a7d7860
Implement code review suggestions
MaxGroot 4f03249
Implement code review suggestions
MaxGroot c3747b6
Merge branch 'fox-it:main' into feature/gcs-adapter
MaxGroot 67b43ba
Merge branch 'main' into feature/gcs-adapter
MaxGroot 6624d73
Support transparent compression when writing to fileobj
MaxGroot 92390bf
Fix flushing & closing, make write test use gzip
MaxGroot 028b4e4
Implement review suggestion
MaxGroot 873ba91
Merge branch 'main' into feature/gcs-adapter
yunzheng File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,88 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import logging | ||
| import re | ||
| from fnmatch import fnmatch | ||
| from typing import Iterator | ||
|
|
||
| from google.cloud.storage.client import Client | ||
| from google.cloud.storage.fileio import BlobReader, BlobWriter | ||
MaxGroot marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| from flow.record.adapter import AbstractReader, AbstractWriter | ||
| from flow.record.base import Record, RecordAdapter | ||
| from flow.record.selector import Selector | ||
|
|
||
__usage__ = """
Google Cloud Storage adapter
---
Read usage: rdump gcs://[BUCKET_ID]/path?project=[PROJECT]
Write usage: rdump -w gcs://[BUCKET_ID]/path?project=[PROJECT]

[BUCKET_ID]: Bucket ID
[path]: Path to read from or write to, supports glob-pattern matching when reading

Optional arguments:
[PROJECT]: Google Cloud Project ID, If not passed, falls back to the default inferred from the environment.
"""

log = logging.getLogger(__name__)

# Character class matching the glob metacharacters ([, ], * and ?).
# Used by GcsReader to split a path into a literal prefix (everything before the
# first glob character) and the remainder that requires fnmatch-style matching.
GLOB_CHARACTERS_RE = r"[\[\]\*\?]"
|
|
||
|
|
||
class GcsReader(AbstractReader):
    """Read records from one or more blobs in a Google Cloud Storage bucket.

    The path portion of the URI may contain glob characters; blobs are listed by
    their literal prefix and then filtered client-side with ``fnmatch``.
    """

    def __init__(self, uri: str, *, project: str | None = None, selector: Selector | None = None, **kwargs):
        """Initialize the reader.

        Args:
            uri: Bucket and blob path in the form ``<bucket>/<path>`` (the
                ``gcs://`` scheme has already been stripped by the caller).
            project: Optional Google Cloud project ID; defaults to the project
                inferred from the environment by the client library.
            selector: Optional record selector applied while reading.
        """
        self.selector = selector
        bucket_name, _, path = uri.partition("/")
        self.gcs = Client(project=project)
        self.bucket = self.gcs.bucket(bucket_name)

        # GCS Doesn't support iterating blobs using a glob pattern, so we have to do that ourselves. To extract the
        # path prefix from the glob-pattern we have to find the first place where the glob starts.
        self.prefix, *glob_pattern = re.split(GLOB_CHARACTERS_RE, path)
        # Only keep a pattern when the path actually contained glob characters.
        self.pattern = path if glob_pattern else None

    def __iter__(self) -> Iterator[Record]:
        """Yield records from every non-empty blob matching the prefix/pattern."""
        blobs = self.gcs.list_blobs(bucket_or_name=self.bucket, prefix=self.prefix)
        for blob in blobs:
            if blob.size == 0:  # Skip empty files
                continue
            if self.pattern and not fnmatch(blob.name, self.pattern):
                continue
            blobreader = BlobReader(blob)

            # Give the file-like object to RecordAdapter so it will select the right adapter by peeking into the
            # stream.
            # NOTE(review): the per-blob reader/adapter is not explicitly closed here;
            # it is released when it goes out of scope — confirm this is intended.
            reader = RecordAdapter(fileobj=blobreader, out=False, selector=self.selector)
            yield from reader

    def close(self) -> None:
        """Close the underlying Google Cloud Storage client."""
        self.gcs.close()
|
|
||
|
|
||
class GcsWriter(AbstractWriter):
    """Write records to a single blob in a Google Cloud Storage bucket."""

    def __init__(self, uri: str, *, project: str | None = None, **kwargs):
        """Open a blob for writing and wrap it in a record adapter.

        Args:
            uri: Bucket and blob path in the form ``<bucket>/<path>``.
            project: Optional Google Cloud project ID; defaults to the project
                inferred from the environment by the client library.
            **kwargs: Forwarded to the underlying ``RecordAdapter``.
        """
        bucket_name, _, blob_path = uri.partition("/")
        # Initialized up-front so close() stays safe even if construction fails.
        self.writer = None

        self.gcs = Client(project=project)
        self.bucket = self.gcs.bucket(bucket_name)

        target_blob = self.bucket.blob(blob_path)
        self.writer = BlobWriter(target_blob, ignore_flush=True)
        self.adapter = RecordAdapter(url=blob_path, fileobj=self.writer, out=True, **kwargs)

    def write(self, record: Record) -> None:
        """Serialize a single record through the wrapped adapter."""
        self.adapter.write(record)

    def flush(self) -> None:
        # The underlying adapter may require flushing
        self.adapter.flush()

    def close(self) -> None:
        """Flush and close the adapter, then close the blob writer once."""
        self.flush()
        self.adapter.close()

        if self.writer:
            self.writer.close()
            # Drop the reference so a second close() call is a no-op.
            self.writer = None
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,176 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import sys | ||
| from io import BytesIO | ||
| from typing import Any, Generator, Iterator | ||
| from unittest.mock import MagicMock, patch | ||
|
|
||
| import pytest | ||
|
|
||
| from flow.record import Record, RecordAdapter, RecordDescriptor, RecordStreamWriter | ||
| from flow.record.base import GZIP_MAGIC | ||
|
|
||
|
|
||
def generate_records(amount) -> Generator[Record, Any, None]:
    """Yield ``amount`` test/record instances with sequential ``idx`` values."""
    descriptor = RecordDescriptor(
        "test/record",
        [
            ("string", "name"),
            ("string", "foo"),
            ("varint", "idx"),
        ],
    )
    for idx in range(amount):
        yield descriptor(name=f"record{idx}", foo="bar", idx=idx)
|
|
||
|
|
||
def clean_up_adapter_import(test_function):
    """Decorator that evicts the gcs adapter module from ``sys.modules`` after a test.

    The adapter binds the (mocked) google SDK at import time, so each test must
    trigger a fresh import to pick up its own fixture's mocks.
    """
    from functools import wraps

    @wraps(test_function)
    def wrapper(*args, **kwargs):
        try:
            return test_function(*args, **kwargs)
        finally:
            # pop() with a default avoids the separate membership check and is
            # a no-op when the test never imported the adapter.
            sys.modules.pop("flow.record.adapter.gcs", None)

    return wrapper
|
|
||
|
|
||
@pytest.fixture
def mock_google_sdk(monkeypatch: pytest.MonkeyPatch) -> Iterator[MagicMock]:
    """Install a single MagicMock as the whole google SDK module hierarchy.

    Every submodule path resolves to an attribute of the same mock, so tests
    can both configure return values and assert calls through one object.
    """
    with monkeypatch.context() as ctx:
        sdk_mock = MagicMock()
        module_map = {
            "google": sdk_mock,
            "google.cloud": sdk_mock.cloud,
            "google.cloud.storage": sdk_mock.cloud.storage,
            "google.cloud.storage.client": sdk_mock.cloud.storage.client,
            "google.cloud.storage.fileio": sdk_mock.cloud.storage.fileio,
        }
        for module_name, replacement in module_map.items():
            ctx.setitem(sys.modules, module_name, replacement)

        yield sdk_mock
|
|
||
|
|
||
@clean_up_adapter_import
def test_gcs_uri_and_path(mock_google_sdk: MagicMock) -> None:
    """Verify URI parsing: bucket/project extraction and prefix/pattern splitting."""
    # Imported inside the test so the fixture's mocked google SDK is in place first.
    from flow.record.adapter.gcs import GcsReader

    mock_client = MagicMock()
    mock_google_sdk.cloud.storage.client.Client.return_value = mock_client
    adapter_with_glob = RecordAdapter("gcs://test-bucket/path/to/records/*/*.avro", project="test-project")

    assert isinstance(adapter_with_glob, GcsReader)

    mock_google_sdk.cloud.storage.client.Client.assert_called_with(project="test-project")
    mock_client.bucket.assert_called_with("test-bucket")

    # With a glob: prefix is everything before the first glob character,
    # pattern is the entire path.
    assert adapter_with_glob.prefix == "path/to/records/"
    assert adapter_with_glob.pattern == "path/to/records/*/*.avro"

    adapter_without_glob = RecordAdapter("gcs://test-bucket/path/to/records/test-records.rec", project="test-project")
    assert isinstance(adapter_without_glob, GcsReader)

    # Without glob characters: the whole path is the prefix and no pattern
    # filtering is applied.
    assert adapter_without_glob.prefix == "path/to/records/test-records.rec"
    assert adapter_without_glob.pattern is None
|
|
||
|
|
||
@clean_up_adapter_import
def test_gcs_reader_glob(mock_google_sdk) -> None:
    """End-to-end read test: glob filtering, empty-blob skipping and selector use."""
    # Create a mocked record stream
    test_records = list(generate_records(10))
    mock_blob = BytesIO()
    writer = RecordStreamWriter(fp=mock_blob)
    for record in test_records:
        writer.write(record)
    writer.flush()
    # Capture the serialized stream before close() can invalidate the buffer.
    mock_recordstream = mock_blob.getvalue()
    writer.close()

    # Create a mocked client that will return the test-bucket
    mock_client = MagicMock()
    mock_client.bucket.return_value = "test-bucket-returned-from-client"
    mock_google_sdk.cloud.storage.client.Client.return_value = mock_client

    # Create a mocked instance of the 'Blob' class of google.cloud.storage.fileio
    recordsfile_blob_mock = MagicMock()
    recordsfile_blob_mock.name = "path/to/records/subfolder/results/tests.records"
    recordsfile_blob_mock.data = mock_recordstream
    recordsfile_blob_mock.size = len(mock_recordstream)

    # As this blob is located in the '🍩 select' folder, it should not match with the glob that will be used later
    # (which requires /results/ to be present in the path string)
    wrong_location_blob = MagicMock()
    wrong_location_blob.name = "path/to/records/subfolder/donutselect/tests.records"
    wrong_location_blob.size = 0x69
    wrong_location_blob.data = b""

    # Return one empty file, one file that should match the glob, and one file that shouldn't match the glob
    mock_client.list_blobs.return_value = [MagicMock(size=0), recordsfile_blob_mock, wrong_location_blob]

    test_read_buf = BytesIO(mock_recordstream)
    mock_reader = MagicMock(wraps=test_read_buf, spec=BytesIO)
    mock_reader.closed = False
    mock_google_sdk.cloud.storage.fileio.BlobReader.return_value = mock_reader
    # Patch io.open so any peek into the "blob" resolves to our in-memory reader.
    with patch("io.open", MagicMock(return_value=mock_reader)):
        adapter = RecordAdapter(
            url="gcs://test-bucket/path/to/records/*/results/*.records",
            project="test-project",
            selector="r.idx >= 5",
        )

        found_records = list(adapter)
        mock_client.bucket.assert_called_with("test-bucket")
        # The listing must use the literal prefix, not the glob pattern.
        mock_client.list_blobs.assert_called_with(
            bucket_or_name="test-bucket-returned-from-client",
            prefix="path/to/records/",
        )

    # We expect the GCS Reader to skip over blobs of size 0, as those will inherently not contain records.
    # Thus, a BlobReader should only have been initialized once, for the mocked records blob.
    mock_google_sdk.cloud.storage.fileio.BlobReader.assert_called_once()

    # We expect 5 records rather than 10 because of the selector that we used
    assert len(found_records) == 5
    for record in found_records:
        assert record.foo == "bar"
        assert record == test_records[record.idx]

    # Closing the adapter should close the GCS client as well.
    adapter.close()
    mock_client.close.assert_called()
|
|
||
|
|
||
@clean_up_adapter_import
def test_gcs_writer(mock_google_sdk) -> None:
    """Write records through GcsWriter and verify the produced stream round-trips."""
    from flow.record.adapter.gcs import GcsWriter

    test_buf = BytesIO()
    mock_writer = MagicMock(wraps=test_buf, spec=BytesIO)
    mock_google_sdk.cloud.storage.fileio.BlobWriter.return_value = mock_writer

    # The .gz suffix should make the adapter compress transparently.
    adapter = RecordAdapter("gcs://test-bucket/test/test.records.gz", project="test-project", out=True)

    assert isinstance(adapter, GcsWriter)

    # Add mock records
    test_records = list(generate_records(10))
    for record in test_records:
        adapter.write(record)

    adapter.flush()
    mock_writer.flush.assert_called()

    # Grab the bytes before it's too late
    written_bytes = test_buf.getvalue()
    assert written_bytes.startswith(GZIP_MAGIC)

    read_buf = BytesIO(test_buf.getvalue())

    # Close the writer and assure the object has been closed
    adapter.close()
    mock_writer.close.assert_called()
    assert test_buf.closed

    # Verify if the written record stream is something we can read
    reader = RecordAdapter(fileobj=read_buf)
    read_records = list(reader)
    assert len(read_records) == 10
    for idx, record in enumerate(read_records):
        assert record == test_records[idx]
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.