diff --git a/sagemaker-train/src/sagemaker/train/tuner.py b/sagemaker-train/src/sagemaker/train/tuner.py index d1af08e2f1..bf227912ae 100644 --- a/sagemaker-train/src/sagemaker/train/tuner.py +++ b/sagemaker-train/src/sagemaker/train/tuner.py @@ -457,18 +457,22 @@ def _prepare_model_trainer_for_tuning(cls, model_trainer, inputs=None, job_name= @classmethod def _upload_source_code_and_configure_hyperparameters(cls, model_trainer): """Upload source code to S3 and add script mode hyperparameters. - + Framework containers (PyTorch, TensorFlow) expect sagemaker_program and sagemaker_submit_directory hyperparameters for script mode execution. This method: 1. Checks if source_dir is a local path or S3 URI - 2. Creates a tar.gz archive and uploads to S3 + 2. Creates a tar.gz archive and uploads to S3 (respecting ignore_patterns) 3. Adds required script mode hyperparameters to model_trainer.hyperparameters - + + Files and directories matching patterns in source_code.ignore_patterns + will be excluded from the tarball. + This follows V2's pattern of creating sourcedir.tar.gz files. - + Args: model_trainer: ModelTrainer instance with source_code configured """ + import fnmatch import os import tarfile import tempfile @@ -500,9 +504,21 @@ def _upload_source_code_and_configure_hyperparameters(cls, model_trainer): try: # Create tar.gz archive with tarfile.open(tar_path, 'w:gz') as tar: - # Add all files from source_dir + # Get ignore patterns from source_code (default to empty list if None) + ignore_pats = source_code.ignore_patterns or [] + + # Helper to check if a name matches any ignore pattern + def should_ignore(name): + return any(fnmatch.fnmatch(name, pat) for pat in ignore_pats) + + # Add files from source_dir, respecting ignore_patterns for root, dirs, files in os.walk(source_dir): + # Filter directories in-place to skip ignored dirs + dirs[:] = [d for d in dirs if not should_ignore(d)] + for file in files: + if should_ignore(file): + continue file_path = os.path.join(root, file) # Calculate arcname to preserve directory structure arcname = os.path.relpath(file_path, source_dir) diff --git a/sagemaker-train/tests/unit/train/test_tuner.py b/sagemaker-train/tests/unit/train/test_tuner.py index bb09b86204..72d7275f2e 100644 --- a/sagemaker-train/tests/unit/train/test_tuner.py +++ b/sagemaker-train/tests/unit/train/test_tuner.py @@ -507,3 +507,201 @@ def test_prepare_parameter_ranges_for_tuning(self): assert len(processed_ranges["ContinuousParameterRanges"]) == 1 assert len(processed_ranges["IntegerParameterRanges"]) == 1 assert len(processed_ranges["CategoricalParameterRanges"]) == 1 + + +class TestUploadSourceCodeIgnorePatterns: + """Test _upload_source_code_and_configure_hyperparameters respects ignore_patterns.""" + + @pytest.fixture + def mock_model_trainer_with_source(self, tmp_path): + """Create a mock ModelTrainer with source_code configured.""" + # Create test source directory with files that should be included/ignored + source_dir = tmp_path / "source" + source_dir.mkdir() + + # Create files that should be included + (source_dir / "train.py").write_text("# training script") + (source_dir / "utils.py").write_text("# utils") + + # Create files that should be ignored + (source_dir / ".env").write_text("SECRET=123") + (source_dir / "model.pt").write_text("fake model weights") + + # Create directories that should be ignored + pycache_dir = source_dir / "__pycache__" + pycache_dir.mkdir() + (pycache_dir / "train.cpython-310.pyc").write_text("bytecode") + + git_dir = source_dir / ".git" + git_dir.mkdir() + (git_dir / "config").write_text("git config") + + # Create subdirectory with mixed files + sub_dir = source_dir / "subdir" + sub_dir.mkdir() + (sub_dir / "helper.py").write_text("# helper") + (sub_dir / "cache.pyc").write_text("bytecode") + + # Create mock model trainer + trainer = MagicMock() + trainer.sagemaker_session = MagicMock() + trainer.sagemaker_session.boto_session = MagicMock() + trainer.sagemaker_session.boto_session.client.return_value = MagicMock() + trainer.sagemaker_session.boto_region_name = "us-west-2" + trainer.sagemaker_session.default_bucket.return_value = "test-bucket" + trainer.base_job_name = "test-job" + trainer.hyperparameters = {} + + # Create source_code mock + source_code = MagicMock() + source_code.source_dir = str(source_dir) + source_code.entry_script = "train.py" + source_code.ignore_patterns = [".env", ".git", "__pycache__", "*.pt", "*.pyc"] + trainer.source_code = source_code + + return trainer, source_dir + + def test_upload_source_code_respects_ignore_patterns(self, mock_model_trainer_with_source): + """Test that files matching ignore_patterns are excluded from tarball.""" + import tarfile + import tempfile + + trainer, source_dir = mock_model_trainer_with_source + + # Call the method + HyperparameterTuner._upload_source_code_and_configure_hyperparameters(trainer) + + # Get the uploaded tarball path from the mock call + s3_client = trainer.sagemaker_session.boto_session.client.return_value + upload_call = s3_client.upload_file.call_args + assert upload_call is not None + + # The tarball was already deleted, so we need to test by recreating the scenario + # Let's verify the logic by checking what would have been uploaded + # We do this by examining the method behavior directly + + # For a more thorough test, let's capture what files were added + # by patching tarfile.open + with patch("tarfile.open") as mock_tarfile: + mock_tar = MagicMock() + mock_tarfile.return_value.__enter__.return_value = mock_tar + + # Reset the mock to test again + trainer.hyperparameters = {} + + HyperparameterTuner._upload_source_code_and_configure_hyperparameters(trainer) + + # Check which files were added to the tarball + added_files = [call[0][0] for call in mock_tar.add.call_args_list] + added_arcnames = [call[1]["arcname"] for call in mock_tar.add.call_args_list] + + # Should include these files + assert any("train.py" in f for f in added_files) + assert any("utils.py" in f for f in added_files) + assert any("helper.py" in f for f in added_files) + + # Should NOT include these files (ignored patterns) + assert not any(".env" in f for f in added_files) + assert not any("model.pt" in f for f in added_files) + assert not any("__pycache__" in f for f in added_files) + assert not any(".git" in f for f in added_files) + assert not any(".pyc" in f for f in added_files) + + def test_upload_source_code_ignores_directories(self, mock_model_trainer_with_source): + """Test that directories matching ignore_patterns are completely skipped.""" + trainer, source_dir = mock_model_trainer_with_source + + with patch("tarfile.open") as mock_tarfile: + mock_tar = MagicMock() + mock_tarfile.return_value.__enter__.return_value = mock_tar + + HyperparameterTuner._upload_source_code_and_configure_hyperparameters(trainer) + + added_files = [call[0][0] for call in mock_tar.add.call_args_list] + + # No files from __pycache__ directory should be added + assert not any("__pycache__" in f for f in added_files) + # No files from .git directory should be added + assert not any(".git" in f for f in added_files) + + def test_upload_source_code_with_empty_ignore_patterns(self, tmp_path): + """Test backward compatibility with None/empty ignore_patterns.""" + # Create simple source directory + source_dir = tmp_path / "source" + source_dir.mkdir() + (source_dir / "train.py").write_text("# training script") + (source_dir / ".env").write_text("SECRET=123") + + # Create mock trainer with no ignore_patterns + trainer = MagicMock() + trainer.sagemaker_session = MagicMock() + trainer.sagemaker_session.boto_session = MagicMock() + trainer.sagemaker_session.boto_session.client.return_value = MagicMock() + trainer.sagemaker_session.boto_region_name = "us-west-2" + trainer.sagemaker_session.default_bucket.return_value = "test-bucket" + trainer.base_job_name = "test-job" + trainer.hyperparameters = {} + + source_code = MagicMock() + source_code.source_dir = str(source_dir) + source_code.entry_script = "train.py" + source_code.ignore_patterns = None # No ignore patterns + trainer.source_code = source_code + + with patch("tarfile.open") as mock_tarfile: + mock_tar = MagicMock() + mock_tarfile.return_value.__enter__.return_value = mock_tar + + HyperparameterTuner._upload_source_code_and_configure_hyperparameters(trainer) + + added_files = [call[0][0] for call in mock_tar.add.call_args_list] + + # Both files should be added when no ignore_patterns + assert any("train.py" in f for f in added_files) + assert any(".env" in f for f in added_files) + + def test_upload_source_code_with_default_ignore_patterns(self, tmp_path): + """Test that default ignore patterns from SourceCode class work.""" + # Create source directory with default-ignored files + source_dir = tmp_path / "source" + source_dir.mkdir() + (source_dir / "train.py").write_text("# training script") + (source_dir / ".DS_Store").write_text("mac file") + + cache_dir = source_dir / ".cache" + cache_dir.mkdir() + (cache_dir / "data.json").write_text("{}") + + # Create mock trainer with default ignore_patterns + trainer = MagicMock() + trainer.sagemaker_session = MagicMock() + trainer.sagemaker_session.boto_session = MagicMock() + trainer.sagemaker_session.boto_session.client.return_value = MagicMock() + trainer.sagemaker_session.boto_region_name = "us-west-2" + trainer.sagemaker_session.default_bucket.return_value = "test-bucket" + trainer.base_job_name = "test-job" + trainer.hyperparameters = {} + + source_code = MagicMock() + source_code.source_dir = str(source_dir) + source_code.entry_script = "train.py" + # Default patterns as defined in SourceCode class + source_code.ignore_patterns = [ + ".env", ".git", "__pycache__", ".DS_Store", ".cache", ".ipynb_checkpoints" + ] + trainer.source_code = source_code + + with patch("tarfile.open") as mock_tarfile: + mock_tar = MagicMock() + mock_tarfile.return_value.__enter__.return_value = mock_tar + + HyperparameterTuner._upload_source_code_and_configure_hyperparameters(trainer) + + added_files = [call[0][0] for call in mock_tar.add.call_args_list] + + # train.py should be included + assert any("train.py" in f for f in added_files) + # .DS_Store should be ignored + assert not any(".DS_Store" in f for f in added_files) + # .cache directory should be ignored + assert not any(".cache" in f for f in added_files)