diff --git a/components/google-cloud/RELEASE.md b/components/google-cloud/RELEASE.md index d19f674fda8..a36ee980b74 100644 --- a/components/google-cloud/RELEASE.md +++ b/components/google-cloud/RELEASE.md @@ -1,5 +1,7 @@ ## Upcoming release +* Remove deprecated Wide and Deep Tabular Workflow pipeline. + ## Release 2.22.0 * Fix for dataproc batch components pipeline failure. diff --git a/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/__init__.py b/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/__init__.py index bbdea81c12c..1c50c878b4a 100644 --- a/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/__init__.py +++ b/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/__init__.py @@ -24,10 +24,6 @@ from google_cloud_pipeline_components.preview.automl.tabular.tabnet_trainer import tabnet_trainer as TabNetTrainerOp from google_cloud_pipeline_components.preview.automl.tabular.utils import get_tabnet_hyperparameter_tuning_job_pipeline_and_parameters from google_cloud_pipeline_components.preview.automl.tabular.utils import get_tabnet_trainer_pipeline_and_parameters -from google_cloud_pipeline_components.preview.automl.tabular.utils import get_wide_and_deep_hyperparameter_tuning_job_pipeline_and_parameters -from google_cloud_pipeline_components.preview.automl.tabular.utils import get_wide_and_deep_trainer_pipeline_and_parameters -from google_cloud_pipeline_components.preview.automl.tabular.wide_and_deep_hyperparameter_tuning_job import wide_and_deep_hyperparameter_tuning_job as WideAndDeepHyperparameterTuningJobOp -from google_cloud_pipeline_components.preview.automl.tabular.wide_and_deep_trainer import wide_and_deep_trainer as WideAndDeepTrainerOp from kfp import components __all__ = [ @@ -37,12 +33,8 @@ 'FeatureTransformEngineOp', 'TabNetHyperparameterTuningJobOp', 'TabNetTrainerOp', - 'WideAndDeepHyperparameterTuningJobOp', - 'WideAndDeepTrainerOp', 'get_tabnet_hyperparameter_tuning_job_pipeline_and_parameters', 'get_tabnet_trainer_pipeline_and_parameters', - 'get_wide_and_deep_hyperparameter_tuning_job_pipeline_and_parameters', - 'get_wide_and_deep_trainer_pipeline_and_parameters', ] tabnet_trainer_pipeline = components.load_component_from_file( @@ -50,11 +42,3 @@ # the generated file. os.path.join(os.path.dirname(__file__), 'tabnet_trainer_pipeline.yaml') ) - -wide_and_deep_trainer_pipeline = components.load_component_from_file( - # Note, please don't name it as `component.yaml` which will conflict with - # the generated file. 
- os.path.join( - os.path.dirname(__file__), 'wide_and_deep_trainer_pipeline.yaml' - ) -) diff --git a/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/utils.py b/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/utils.py index bd07294b8b8..6b86f10417a 100644 --- a/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/utils.py +++ b/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/utils.py @@ -1112,301 +1112,6 @@ def get_skip_architecture_search_pipeline_and_parameters( ) -def get_wide_and_deep_trainer_pipeline_and_parameters( - project: str, - location: str, - root_dir: str, - target_column: str, - prediction_type: str, - learning_rate: float, - dnn_learning_rate: float, - transform_config: Optional[str] = None, - dataset_level_custom_transformation_definitions: Optional[ - List[Dict[str, Any]] - ] = None, - dataset_level_transformations: Optional[List[Dict[str, Any]]] = None, - run_feature_selection: bool = False, - feature_selection_algorithm: Optional[str] = None, - materialized_examples_format: Optional[str] = None, - max_selected_features: Optional[int] = None, - predefined_split_key: Optional[str] = None, - stratified_split_key: Optional[str] = None, - training_fraction: Optional[float] = None, - validation_fraction: Optional[float] = None, - test_fraction: Optional[float] = None, - tf_transform_execution_engine: Optional[str] = None, - tf_auto_transform_features: Optional[ - Union[List[str], Dict[str, List[str]]] - ] = None, - tf_custom_transformation_definitions: Optional[List[Dict[str, Any]]] = None, - tf_transformations_path: Optional[str] = None, - optimizer_type: str = 'adam', - max_steps: int = -1, - max_train_secs: int = -1, - l1_regularization_strength: float = 0, - l2_regularization_strength: float = 0, - l2_shrinkage_regularization_strength: float = 0, - beta_1: float = 0.9, - beta_2: float = 0.999, - hidden_units: str = '30,30,30', - use_wide: bool = True, - embed_categories: bool = True, - dnn_dropout: float = 0, - dnn_optimizer_type: str = 'adam', - dnn_l1_regularization_strength: float = 0, - dnn_l2_regularization_strength: float = 0, - dnn_l2_shrinkage_regularization_strength: float = 0, - dnn_beta_1: float = 0.9, - dnn_beta_2: float = 0.999, - enable_profiler: bool = False, - cache_data: str = 'auto', - seed: int = 1, - eval_steps: int = 0, - batch_size: int = 100, - measurement_selection_type: Optional[str] = None, - optimization_metric: Optional[str] = None, - eval_frequency_secs: int = 600, - data_source_csv_filenames: Optional[str] = None, - data_source_bigquery_table_path: Optional[str] = None, - bigquery_staging_full_dataset_id: Optional[str] = None, - weight_column: str = '', - transform_dataflow_machine_type: str = 'n1-standard-16', - transform_dataflow_max_num_workers: int = 25, - transform_dataflow_disk_size_gb: int = 40, - worker_pool_specs_override: Optional[Dict[str, Any]] = None, - run_evaluation: bool = True, - evaluation_batch_predict_machine_type: str = _EVALUATION_BATCH_PREDICT_MACHINE_TYPE, - evaluation_batch_predict_starting_replica_count: int = _EVALUATION_BATCH_PREDICT_STARTING_REPLICA_COUNT, - evaluation_batch_predict_max_replica_count: int = _EVALUATION_BATCH_PREDICT_MAX_REPLICA_COUNT, - evaluation_dataflow_machine_type: str = _EVALUATION_DATAFLOW_MACHINE_TYPE, - evaluation_dataflow_starting_num_workers: int = _EVALUATION_DATAFLOW_STARTING_NUM_WORKERS, - evaluation_dataflow_max_num_workers: int = 
_EVALUATION_DATAFLOW_MAX_NUM_WORKERS, - evaluation_dataflow_disk_size_gb: int = _EVALUATION_DATAFLOW_DISK_SIZE_GB, - dataflow_service_account: str = '', - dataflow_subnetwork: str = '', - dataflow_use_public_ips: bool = True, - encryption_spec_key_name: str = '', -) -> Tuple[str, Dict[str, Any]]: - # fmt: off - """Get the Wide & Deep training pipeline. - - Args: - project: The GCP project that runs the pipeline components. - location: The GCP region that runs the pipeline components. - root_dir: The root GCS directory for the pipeline components. - target_column: The target column name. - prediction_type: The type of prediction the model is to produce. 'classification' or 'regression'. - learning_rate: The learning rate used by the linear optimizer. - dnn_learning_rate: The learning rate for training the deep part of the model. - transform_config: Path to v1 TF transformation configuration. - dataset_level_custom_transformation_definitions: Dataset-level custom transformation definitions in string format. - dataset_level_transformations: Dataset-level transformation configuration in string format. - run_feature_selection: Whether to enable feature selection. - feature_selection_algorithm: Feature selection algorithm. - materialized_examples_format: The format for the materialized examples. - max_selected_features: Maximum number of features to select. - predefined_split_key: Predefined split key. - stratified_split_key: Stratified split key. - training_fraction: Training fraction. - validation_fraction: Validation fraction. - test_fraction: Test fraction. - tf_transform_execution_engine: The execution engine used to execute TF-based transformations. - tf_auto_transform_features: List of auto transform features in the comma-separated string format. - tf_custom_transformation_definitions: TF custom transformation definitions in string format. - tf_transformations_path: Path to TF transformation configuration. - optimizer_type: The type of optimizer to use. Choices are "adam", "ftrl" and "sgd" for the Adam, FTRL, and Gradient Descent Optimizers, respectively. - max_steps: Number of steps to run the trainer for. - max_train_secs: Amount of time in seconds to run the trainer for. - l1_regularization_strength: L1 regularization strength for optimizer_type="ftrl". - l2_regularization_strength: L2 regularization strength for optimizer_type="ftrl". - l2_shrinkage_regularization_strength: L2 shrinkage regularization strength for optimizer_type="ftrl". - beta_1: Beta 1 value for optimizer_type="adam". - beta_2: Beta 2 value for optimizer_type="adam". - hidden_units: Hidden layer sizes to use for DNN feature columns, provided in comma-separated layers. - use_wide: If set to true, the categorical columns will be used in the wide part of the DNN model. - embed_categories: If set to true, the categorical columns will be used embedded and used in the deep part of the model. Embedding size is the square root of the column cardinality. - dnn_dropout: The probability we will drop out a given coordinate. - dnn_optimizer_type: The type of optimizer to use for the deep part of the model. Choices are "adam", "ftrl" and "sgd". for the Adam, FTRL, and Gradient Descent Optimizers, respectively. - dnn_l1_regularization_strength: L1 regularization strength for dnn_optimizer_type="ftrl". - dnn_l2_regularization_strength: L2 regularization strength for dnn_optimizer_type="ftrl". - dnn_l2_shrinkage_regularization_strength: L2 shrinkage regularization strength for dnn_optimizer_type="ftrl". 
- dnn_beta_1: Beta 1 value for dnn_optimizer_type="adam". - dnn_beta_2: Beta 2 value for dnn_optimizer_type="adam". - enable_profiler: Enables profiling and saves a trace during evaluation. - cache_data: Whether to cache data or not. If set to 'auto', caching is determined based on the dataset size. - seed: Seed to be used for this run. - eval_steps: Number of steps to run evaluation for. If not specified or negative, it means run evaluation on the whole validation dataset. If set to 0, it means run evaluation for a fixed number of samples. - batch_size: Batch size for training. - measurement_selection_type: Which measurement to use if/when the service automatically selects the final measurement from previously reported intermediate measurements. One of "BEST_MEASUREMENT" or "LAST_MEASUREMENT". - optimization_metric: Optimization metric used for `measurement_selection_type`. Default is "rmse" for regression and "auc" for classification. - eval_frequency_secs: Frequency at which evaluation and checkpointing will take place. - data_source_csv_filenames: The CSV data source. - data_source_bigquery_table_path: The BigQuery data source. - bigquery_staging_full_dataset_id: The BigQuery staging full dataset id for storing intermediate tables. - weight_column: The weight column name. - transform_dataflow_machine_type: The dataflow machine type for transform component. - transform_dataflow_max_num_workers: The max number of Dataflow workers for transform component. - transform_dataflow_disk_size_gb: Dataflow worker's disk size in GB for transform component. - worker_pool_specs_override: The dictionary for overriding training and evaluation worker pool specs. The dictionary should be of format https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172. - run_evaluation: Whether to run evaluation steps during training. - evaluation_batch_predict_machine_type: The prediction server machine type for batch predict components during evaluation. - evaluation_batch_predict_starting_replica_count: The initial number of prediction server for batch predict components during evaluation. - evaluation_batch_predict_max_replica_count: The max number of prediction server for batch predict components during evaluation. - evaluation_dataflow_machine_type: The dataflow machine type for evaluation components. - evaluation_dataflow_starting_num_workers: The initial number of Dataflow workers for evaluation components. - evaluation_dataflow_max_num_workers: The max number of Dataflow workers for evaluation components. - evaluation_dataflow_disk_size_gb: Dataflow worker's disk size in GB for evaluation components. - dataflow_service_account: Custom service account to run dataflow jobs. - dataflow_subnetwork: Dataflow's fully qualified subnetwork name, when empty the default subnetwork will be used. Example: https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications - dataflow_use_public_ips: Specifies whether Dataflow workers use public IP addresses. - encryption_spec_key_name: The KMS key name. - - Returns: - Tuple of pipeline_definition_path and parameter_values. - """ - # fmt: on - if isinstance(tf_auto_transform_features, list): - tf_auto_transform_features = {'auto': tf_auto_transform_features} - - if transform_config and tf_transformations_path: - raise ValueError( - 'Only one of transform_config and tf_transformations_path can ' - 'be specified.' 
- ) - - elif transform_config: - warnings.warn( - 'transform_config parameter is deprecated. ' - 'Please use the flattened transform config arguments instead.' - ) - tf_transformations_path = transform_config - - if not worker_pool_specs_override: - worker_pool_specs_override = [] - - parameter_values = {} - training_and_eval_parameters = { - 'project': project, - 'location': location, - 'root_dir': root_dir, - 'target_column': target_column, - 'prediction_type': prediction_type, - 'learning_rate': learning_rate, - 'dnn_learning_rate': dnn_learning_rate, - 'optimizer_type': optimizer_type, - 'max_steps': max_steps, - 'max_train_secs': max_train_secs, - 'l1_regularization_strength': l1_regularization_strength, - 'l2_regularization_strength': l2_regularization_strength, - 'l2_shrinkage_regularization_strength': ( - l2_shrinkage_regularization_strength - ), - 'beta_1': beta_1, - 'beta_2': beta_2, - 'hidden_units': hidden_units, - 'use_wide': use_wide, - 'embed_categories': embed_categories, - 'dnn_dropout': dnn_dropout, - 'dnn_optimizer_type': dnn_optimizer_type, - 'dnn_l1_regularization_strength': dnn_l1_regularization_strength, - 'dnn_l2_regularization_strength': dnn_l2_regularization_strength, - 'dnn_l2_shrinkage_regularization_strength': ( - dnn_l2_shrinkage_regularization_strength - ), - 'dnn_beta_1': dnn_beta_1, - 'dnn_beta_2': dnn_beta_2, - 'enable_profiler': enable_profiler, - 'cache_data': cache_data, - 'seed': seed, - 'eval_steps': eval_steps, - 'batch_size': batch_size, - 'measurement_selection_type': measurement_selection_type, - 'optimization_metric': optimization_metric, - 'eval_frequency_secs': eval_frequency_secs, - 'weight_column': weight_column, - 'transform_dataflow_machine_type': transform_dataflow_machine_type, - 'transform_dataflow_max_num_workers': transform_dataflow_max_num_workers, - 'transform_dataflow_disk_size_gb': transform_dataflow_disk_size_gb, - 'worker_pool_specs_override': worker_pool_specs_override, - 'run_evaluation': run_evaluation, - 'evaluation_batch_predict_machine_type': ( - evaluation_batch_predict_machine_type - ), - 'evaluation_batch_predict_starting_replica_count': ( - evaluation_batch_predict_starting_replica_count - ), - 'evaluation_batch_predict_max_replica_count': ( - evaluation_batch_predict_max_replica_count - ), - 'evaluation_dataflow_machine_type': evaluation_dataflow_machine_type, - 'evaluation_dataflow_starting_num_workers': ( - evaluation_dataflow_starting_num_workers - ), - 'evaluation_dataflow_max_num_workers': ( - evaluation_dataflow_max_num_workers - ), - 'evaluation_dataflow_disk_size_gb': evaluation_dataflow_disk_size_gb, - 'dataflow_service_account': dataflow_service_account, - 'dataflow_subnetwork': dataflow_subnetwork, - 'dataflow_use_public_ips': dataflow_use_public_ips, - 'encryption_spec_key_name': encryption_spec_key_name, - } - _update_parameters(parameter_values, training_and_eval_parameters) - - fte_params = { - 'dataset_level_custom_transformation_definitions': ( - dataset_level_custom_transformation_definitions - if dataset_level_custom_transformation_definitions - else [] - ), - 'dataset_level_transformations': ( - dataset_level_transformations if dataset_level_transformations else [] - ), - 'run_feature_selection': run_feature_selection, - 'feature_selection_algorithm': feature_selection_algorithm, - 'max_selected_features': max_selected_features, - 'predefined_split_key': predefined_split_key, - 'stratified_split_key': stratified_split_key, - 'training_fraction': training_fraction, - 'validation_fraction': 
validation_fraction, - 'test_fraction': test_fraction, - 'tf_auto_transform_features': ( - tf_auto_transform_features if tf_auto_transform_features else {} - ), - 'tf_custom_transformation_definitions': ( - tf_custom_transformation_definitions - if tf_custom_transformation_definitions - else [] - ), - 'tf_transformations_path': tf_transformations_path, - 'materialized_examples_format': ( - materialized_examples_format - if materialized_examples_format - else 'tfrecords_gzip' - ), - 'tf_transform_execution_engine': ( - tf_transform_execution_engine - if tf_transform_execution_engine - else 'dataflow' - ), - } - _update_parameters(parameter_values, fte_params) - - data_source_and_split_parameters = { - 'data_source_csv_filenames': data_source_csv_filenames, - 'data_source_bigquery_table_path': data_source_bigquery_table_path, - 'bigquery_staging_full_dataset_id': bigquery_staging_full_dataset_id, - } - _update_parameters(parameter_values, data_source_and_split_parameters) - - pipeline_definition_path = os.path.join( - pathlib.Path(__file__).parent.resolve(), - 'wide_and_deep_trainer_pipeline.yaml', - ) - - return pipeline_definition_path, parameter_values - - def get_builtin_algorithm_hyperparameter_tuning_job_pipeline_and_parameters( project: str, location: str, @@ -1510,418 +1215,119 @@ def get_builtin_algorithm_hyperparameter_tuning_job_pipeline_and_parameters( tf_transformations_path: Path to TF transformation configuration. data_source_csv_filenames: The CSV data source. data_source_bigquery_table_path: The BigQuery data source. - bigquery_staging_full_dataset_id: The BigQuery staging full dataset id for - storing intermediate tables. - weight_column: The weight column name. - max_failed_trial_count: The number of failed trials that need to be seen - before failing the HyperparameterTuningJob. If set to 0, Vertex AI decides - how many trials must fail before the whole job fails. - study_spec_algorithm: The search algorithm specified for the study. One of - "ALGORITHM_UNSPECIFIED", "GRID_SEARCH", or "RANDOM_SEARCH". - study_spec_measurement_selection_type: Which measurement to use if/when the - service automatically selects the final measurement from previously - reported intermediate measurements. One of "BEST_MEASUREMENT" or - "LAST_MEASUREMENT". - transform_dataflow_machine_type: The dataflow machine type for transform - component. - transform_dataflow_max_num_workers: The max number of Dataflow workers for - transform component. - transform_dataflow_disk_size_gb: Dataflow worker's disk size in GB for - transform component. - worker_pool_specs_override: The dictionary for overriding training and - evaluation worker pool specs. The dictionary should be of format - https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172. - run_evaluation: Whether to run evaluation steps during training. - evaluation_batch_predict_machine_type: The prediction server machine type - for batch predict components during evaluation. - evaluation_batch_predict_starting_replica_count: The initial number of - prediction server for batch predict components during evaluation. - evaluation_batch_predict_max_replica_count: The max number of prediction - server for batch predict components during evaluation. - evaluation_dataflow_machine_type: The dataflow machine type for evaluation - components. - evaluation_dataflow_starting_num_workers: The initial number of Dataflow - workers for evaluation components. 
- evaluation_dataflow_max_num_workers: The max number of Dataflow workers for - evaluation components. - evaluation_dataflow_disk_size_gb: Dataflow worker's disk size in GB for - evaluation components. - dataflow_service_account: Custom service account to run dataflow jobs. - dataflow_subnetwork: Dataflow's fully qualified subnetwork name, when empty - the default subnetwork will be used. Example: - https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications - dataflow_use_public_ips: Specifies whether Dataflow workers use public IP - addresses. - encryption_spec_key_name: The KMS key name. - - Returns: - Tuple of pipeline_definition_path and parameter_values. - """ - warnings.warn( - 'This method is deprecated. Please use' - ' get_tabnet_hyperparameter_tuning_job_pipeline_and_parameters or' - ' get_wide_and_deep_hyperparameter_tuning_job_pipeline_and_parameters' - ' instead.' - ) - - if algorithm == 'tabnet': - return get_tabnet_hyperparameter_tuning_job_pipeline_and_parameters( - project=project, - location=location, - root_dir=root_dir, - target_column=target_column, - prediction_type=prediction_type, - study_spec_metric_id=study_spec_metric_id, - study_spec_metric_goal=study_spec_metric_goal, - study_spec_parameters_override=study_spec_parameters_override, - max_trial_count=max_trial_count, - parallel_trial_count=parallel_trial_count, - transform_config=transform_config, - dataset_level_custom_transformation_definitions=dataset_level_custom_transformation_definitions, - dataset_level_transformations=dataset_level_transformations, - predefined_split_key=predefined_split_key, - stratified_split_key=stratified_split_key, - training_fraction=training_fraction, - validation_fraction=validation_fraction, - test_fraction=test_fraction, - tf_transform_execution_engine=tf_transform_execution_engine, - tf_auto_transform_features=tf_auto_transform_features, - tf_custom_transformation_definitions=tf_custom_transformation_definitions, - tf_transformations_path=tf_transformations_path, - enable_profiler=enable_profiler, - seed=seed, - eval_steps=eval_steps, - eval_frequency_secs=eval_frequency_secs, - data_source_csv_filenames=data_source_csv_filenames, - data_source_bigquery_table_path=data_source_bigquery_table_path, - bigquery_staging_full_dataset_id=bigquery_staging_full_dataset_id, - weight_column=weight_column, - max_failed_trial_count=max_failed_trial_count, - study_spec_algorithm=study_spec_algorithm, - study_spec_measurement_selection_type=study_spec_measurement_selection_type, - transform_dataflow_machine_type=transform_dataflow_machine_type, - transform_dataflow_max_num_workers=transform_dataflow_max_num_workers, - transform_dataflow_disk_size_gb=transform_dataflow_disk_size_gb, - worker_pool_specs_override=worker_pool_specs_override, - run_evaluation=run_evaluation, - evaluation_batch_predict_machine_type=evaluation_batch_predict_machine_type, - evaluation_batch_predict_starting_replica_count=evaluation_batch_predict_starting_replica_count, - evaluation_batch_predict_max_replica_count=evaluation_batch_predict_max_replica_count, - evaluation_dataflow_machine_type=evaluation_dataflow_machine_type, - evaluation_dataflow_disk_size_gb=evaluation_dataflow_disk_size_gb, - evaluation_dataflow_starting_num_workers=evaluation_dataflow_starting_num_workers, - evaluation_dataflow_max_num_workers=evaluation_dataflow_max_num_workers, - dataflow_service_account=dataflow_service_account, - dataflow_subnetwork=dataflow_subnetwork, - 
dataflow_use_public_ips=dataflow_use_public_ips, - encryption_spec_key_name=encryption_spec_key_name, - ) - elif algorithm == 'wide_and_deep': - return get_wide_and_deep_hyperparameter_tuning_job_pipeline_and_parameters( - project=project, - location=location, - root_dir=root_dir, - target_column=target_column, - prediction_type=prediction_type, - study_spec_metric_id=study_spec_metric_id, - study_spec_metric_goal=study_spec_metric_goal, - study_spec_parameters_override=study_spec_parameters_override, - max_trial_count=max_trial_count, - parallel_trial_count=parallel_trial_count, - transform_config=transform_config, - dataset_level_custom_transformation_definitions=dataset_level_custom_transformation_definitions, - dataset_level_transformations=dataset_level_transformations, - predefined_split_key=predefined_split_key, - stratified_split_key=stratified_split_key, - training_fraction=training_fraction, - validation_fraction=validation_fraction, - test_fraction=test_fraction, - tf_transform_execution_engine=tf_transform_execution_engine, - tf_auto_transform_features=tf_auto_transform_features, - tf_custom_transformation_definitions=tf_custom_transformation_definitions, - tf_transformations_path=tf_transformations_path, - enable_profiler=enable_profiler, - seed=seed, - eval_steps=eval_steps, - eval_frequency_secs=eval_frequency_secs, - data_source_csv_filenames=data_source_csv_filenames, - data_source_bigquery_table_path=data_source_bigquery_table_path, - bigquery_staging_full_dataset_id=bigquery_staging_full_dataset_id, - weight_column=weight_column, - max_failed_trial_count=max_failed_trial_count, - study_spec_algorithm=study_spec_algorithm, - study_spec_measurement_selection_type=study_spec_measurement_selection_type, - transform_dataflow_machine_type=transform_dataflow_machine_type, - transform_dataflow_max_num_workers=transform_dataflow_max_num_workers, - transform_dataflow_disk_size_gb=transform_dataflow_disk_size_gb, - worker_pool_specs_override=worker_pool_specs_override, - run_evaluation=run_evaluation, - evaluation_batch_predict_machine_type=evaluation_batch_predict_machine_type, - evaluation_batch_predict_starting_replica_count=evaluation_batch_predict_starting_replica_count, - evaluation_batch_predict_max_replica_count=evaluation_batch_predict_max_replica_count, - evaluation_dataflow_machine_type=evaluation_dataflow_machine_type, - evaluation_dataflow_disk_size_gb=evaluation_dataflow_disk_size_gb, - evaluation_dataflow_starting_num_workers=evaluation_dataflow_starting_num_workers, - evaluation_dataflow_max_num_workers=evaluation_dataflow_max_num_workers, - dataflow_service_account=dataflow_service_account, - dataflow_subnetwork=dataflow_subnetwork, - dataflow_use_public_ips=dataflow_use_public_ips, - encryption_spec_key_name=encryption_spec_key_name, - ) - else: - raise ValueError( - 'Invalid algorithm provided. Supported values are "tabnet" and' - ' "wide_and_deep".' 
- ) - - -def get_tabnet_hyperparameter_tuning_job_pipeline_and_parameters( - project: str, - location: str, - root_dir: str, - target_column: str, - prediction_type: str, - study_spec_metric_id: str, - study_spec_metric_goal: str, - study_spec_parameters_override: List[Dict[str, Any]], - max_trial_count: int, - parallel_trial_count: int, - transform_config: Optional[str] = None, - dataset_level_custom_transformation_definitions: Optional[ - List[Dict[str, Any]] - ] = None, - dataset_level_transformations: Optional[List[Dict[str, Any]]] = None, - run_feature_selection: bool = False, - feature_selection_algorithm: Optional[str] = None, - materialized_examples_format: Optional[str] = None, - max_selected_features: Optional[int] = None, - predefined_split_key: Optional[str] = None, - stratified_split_key: Optional[str] = None, - training_fraction: Optional[float] = None, - validation_fraction: Optional[float] = None, - test_fraction: Optional[float] = None, - tf_transform_execution_engine: Optional[str] = None, - tf_auto_transform_features: Optional[ - Union[List[str], Dict[str, List[str]]] - ] = None, - tf_custom_transformation_definitions: Optional[List[Dict[str, Any]]] = None, - tf_transformations_path: Optional[str] = None, - enable_profiler: bool = False, - cache_data: str = 'auto', - seed: int = 1, - eval_steps: int = 0, - eval_frequency_secs: int = 600, - data_source_csv_filenames: Optional[str] = None, - data_source_bigquery_table_path: Optional[str] = None, - bigquery_staging_full_dataset_id: Optional[str] = None, - weight_column: str = '', - max_failed_trial_count: int = 0, - study_spec_algorithm: str = 'ALGORITHM_UNSPECIFIED', - study_spec_measurement_selection_type: str = 'BEST_MEASUREMENT', - transform_dataflow_machine_type: str = 'n1-standard-16', - transform_dataflow_max_num_workers: int = 25, - transform_dataflow_disk_size_gb: int = 40, - worker_pool_specs_override: Optional[Dict[str, Any]] = None, - run_evaluation: bool = True, - evaluation_batch_predict_machine_type: str = _EVALUATION_BATCH_PREDICT_MACHINE_TYPE, - evaluation_batch_predict_starting_replica_count: int = _EVALUATION_BATCH_PREDICT_STARTING_REPLICA_COUNT, - evaluation_batch_predict_max_replica_count: int = _EVALUATION_BATCH_PREDICT_MAX_REPLICA_COUNT, - evaluation_dataflow_machine_type: str = _EVALUATION_DATAFLOW_MACHINE_TYPE, - evaluation_dataflow_starting_num_workers: int = _EVALUATION_DATAFLOW_STARTING_NUM_WORKERS, - evaluation_dataflow_max_num_workers: int = _EVALUATION_DATAFLOW_MAX_NUM_WORKERS, - evaluation_dataflow_disk_size_gb: int = _EVALUATION_DATAFLOW_DISK_SIZE_GB, - dataflow_service_account: str = '', - dataflow_subnetwork: str = '', - dataflow_use_public_ips: bool = True, - encryption_spec_key_name: str = '', -) -> Tuple[str, Dict[str, Any]]: - # fmt: off - """Get the TabNet HyperparameterTuningJob pipeline. - - Args: - project: The GCP project that runs the pipeline components. - location: The GCP region that runs the pipeline components. - root_dir: The root GCS directory for the pipeline components. - target_column: The target column name. - prediction_type: The type of prediction the model is to produce. "classification" or "regression". - study_spec_metric_id: Metric to optimize, possible values: [ 'loss', 'average_loss', 'rmse', 'mae', 'mql', 'accuracy', 'auc', 'precision', 'recall']. - study_spec_metric_goal: Optimization goal of the metric, possible values: "MAXIMIZE", "MINIMIZE". - study_spec_parameters_override: List of dictionaries representing parameters to optimize. 
The dictionary key is the parameter_id, which is passed to training job as a command line argument, and the dictionary value is the parameter specification of the metric. - max_trial_count: The desired total number of trials. - parallel_trial_count: The desired number of trials to run in parallel. - transform_config: Path to v1 TF transformation configuration. - dataset_level_custom_transformation_definitions: Dataset-level custom transformation definitions in string format. - dataset_level_transformations: Dataset-level transformation configuration in string format. - run_feature_selection: Whether to enable feature selection. - feature_selection_algorithm: Feature selection algorithm. - materialized_examples_format: The format for the materialized examples. - max_selected_features: Maximum number of features to select. - predefined_split_key: Predefined split key. - stratified_split_key: Stratified split key. - training_fraction: Training fraction. - validation_fraction: Validation fraction. - test_fraction: Test fraction. - tf_transform_execution_engine: The execution engine used to execute TF-based transformations. - tf_auto_transform_features: List of auto transform features in the comma-separated string format. - tf_custom_transformation_definitions: TF custom transformation definitions in string format. - tf_transformations_path: Path to TF transformation configuration. - enable_profiler: Enables profiling and saves a trace during evaluation. - cache_data: Whether to cache data or not. If set to 'auto', caching is determined based on the dataset size. - seed: Seed to be used for this run. - eval_steps: Number of steps to run evaluation for. If not specified or negative, it means run evaluation on the whole validation dataset. If set to 0, it means run evaluation for a fixed number of samples. - eval_frequency_secs: Frequency at which evaluation and checkpointing will take place. - data_source_csv_filenames: The CSV data source. - data_source_bigquery_table_path: The BigQuery data source. - bigquery_staging_full_dataset_id: The BigQuery staging full dataset id for storing intermediate tables. + bigquery_staging_full_dataset_id: The BigQuery staging full dataset id for + storing intermediate tables. weight_column: The weight column name. - max_failed_trial_count: The number of failed trials that need to be seen before failing the HyperparameterTuningJob. If set to 0, Vertex AI decides how many trials must fail before the whole job fails. - study_spec_algorithm: The search algorithm specified for the study. One of "ALGORITHM_UNSPECIFIED", "GRID_SEARCH", or "RANDOM_SEARCH". - study_spec_measurement_selection_type: Which measurement to use if/when the service automatically selects the final measurement from previously reported intermediate measurements. One of "BEST_MEASUREMENT" or "LAST_MEASUREMENT". - transform_dataflow_machine_type: The dataflow machine type for transform component. - transform_dataflow_max_num_workers: The max number of Dataflow workers for transform component. - transform_dataflow_disk_size_gb: Dataflow worker's disk size in GB for transform component. - worker_pool_specs_override: The dictionary for overriding training and evaluation worker pool specs. The dictionary should be of format https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172. + max_failed_trial_count: The number of failed trials that need to be seen + before failing the HyperparameterTuningJob. 
If set to 0, Vertex AI decides + how many trials must fail before the whole job fails. + study_spec_algorithm: The search algorithm specified for the study. One of + "ALGORITHM_UNSPECIFIED", "GRID_SEARCH", or "RANDOM_SEARCH". + study_spec_measurement_selection_type: Which measurement to use if/when the + service automatically selects the final measurement from previously + reported intermediate measurements. One of "BEST_MEASUREMENT" or + "LAST_MEASUREMENT". + transform_dataflow_machine_type: The dataflow machine type for transform + component. + transform_dataflow_max_num_workers: The max number of Dataflow workers for + transform component. + transform_dataflow_disk_size_gb: Dataflow worker's disk size in GB for + transform component. + worker_pool_specs_override: The dictionary for overriding training and + evaluation worker pool specs. The dictionary should be of format + https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172. run_evaluation: Whether to run evaluation steps during training. - evaluation_batch_predict_machine_type: The prediction server machine type for batch predict components during evaluation. - evaluation_batch_predict_starting_replica_count: The initial number of prediction server for batch predict components during evaluation. - evaluation_batch_predict_max_replica_count: The max number of prediction server for batch predict components during evaluation. - evaluation_dataflow_machine_type: The dataflow machine type for evaluation components. - evaluation_dataflow_starting_num_workers: The initial number of Dataflow workers for evaluation components. - evaluation_dataflow_max_num_workers: The max number of Dataflow workers for evaluation components. - evaluation_dataflow_disk_size_gb: Dataflow worker's disk size in GB for evaluation components. + evaluation_batch_predict_machine_type: The prediction server machine type + for batch predict components during evaluation. + evaluation_batch_predict_starting_replica_count: The initial number of + prediction server for batch predict components during evaluation. + evaluation_batch_predict_max_replica_count: The max number of prediction + server for batch predict components during evaluation. + evaluation_dataflow_machine_type: The dataflow machine type for evaluation + components. + evaluation_dataflow_starting_num_workers: The initial number of Dataflow + workers for evaluation components. + evaluation_dataflow_max_num_workers: The max number of Dataflow workers for + evaluation components. + evaluation_dataflow_disk_size_gb: Dataflow worker's disk size in GB for + evaluation components. dataflow_service_account: Custom service account to run dataflow jobs. - dataflow_subnetwork: Dataflow's fully qualified subnetwork name, when empty the default subnetwork will be used. Example: https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications - dataflow_use_public_ips: Specifies whether Dataflow workers use public IP addresses. + dataflow_subnetwork: Dataflow's fully qualified subnetwork name, when empty + the default subnetwork will be used. Example: + https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications + dataflow_use_public_ips: Specifies whether Dataflow workers use public IP + addresses. encryption_spec_key_name: The KMS key name. Returns: Tuple of pipeline_definition_path and parameter_values. 
""" - # fmt: on - if isinstance(tf_auto_transform_features, list): - tf_auto_transform_features = {'auto': tf_auto_transform_features} + warnings.warn( + 'This method is deprecated. Please use' + ' get_tabnet_hyperparameter_tuning_job_pipeline_and_parameters or' + ' get_wide_and_deep_hyperparameter_tuning_job_pipeline_and_parameters' + ' instead.' + ) - if transform_config and tf_transformations_path: - raise ValueError( - 'Only one of transform_config and tf_transformations_path can ' - 'be specified.' + if algorithm == 'tabnet': + return get_tabnet_hyperparameter_tuning_job_pipeline_and_parameters( + project=project, + location=location, + root_dir=root_dir, + target_column=target_column, + prediction_type=prediction_type, + study_spec_metric_id=study_spec_metric_id, + study_spec_metric_goal=study_spec_metric_goal, + study_spec_parameters_override=study_spec_parameters_override, + max_trial_count=max_trial_count, + parallel_trial_count=parallel_trial_count, + transform_config=transform_config, + dataset_level_custom_transformation_definitions=dataset_level_custom_transformation_definitions, + dataset_level_transformations=dataset_level_transformations, + predefined_split_key=predefined_split_key, + stratified_split_key=stratified_split_key, + training_fraction=training_fraction, + validation_fraction=validation_fraction, + test_fraction=test_fraction, + tf_transform_execution_engine=tf_transform_execution_engine, + tf_auto_transform_features=tf_auto_transform_features, + tf_custom_transformation_definitions=tf_custom_transformation_definitions, + tf_transformations_path=tf_transformations_path, + enable_profiler=enable_profiler, + seed=seed, + eval_steps=eval_steps, + eval_frequency_secs=eval_frequency_secs, + data_source_csv_filenames=data_source_csv_filenames, + data_source_bigquery_table_path=data_source_bigquery_table_path, + bigquery_staging_full_dataset_id=bigquery_staging_full_dataset_id, + weight_column=weight_column, + max_failed_trial_count=max_failed_trial_count, + study_spec_algorithm=study_spec_algorithm, + study_spec_measurement_selection_type=study_spec_measurement_selection_type, + transform_dataflow_machine_type=transform_dataflow_machine_type, + transform_dataflow_max_num_workers=transform_dataflow_max_num_workers, + transform_dataflow_disk_size_gb=transform_dataflow_disk_size_gb, + worker_pool_specs_override=worker_pool_specs_override, + run_evaluation=run_evaluation, + evaluation_batch_predict_machine_type=evaluation_batch_predict_machine_type, + evaluation_batch_predict_starting_replica_count=evaluation_batch_predict_starting_replica_count, + evaluation_batch_predict_max_replica_count=evaluation_batch_predict_max_replica_count, + evaluation_dataflow_machine_type=evaluation_dataflow_machine_type, + evaluation_dataflow_disk_size_gb=evaluation_dataflow_disk_size_gb, + evaluation_dataflow_starting_num_workers=evaluation_dataflow_starting_num_workers, + evaluation_dataflow_max_num_workers=evaluation_dataflow_max_num_workers, + dataflow_service_account=dataflow_service_account, + dataflow_subnetwork=dataflow_subnetwork, + dataflow_use_public_ips=dataflow_use_public_ips, + encryption_spec_key_name=encryption_spec_key_name, ) - - elif transform_config: - warnings.warn( - 'transform_config parameter is deprecated. ' - 'Please use the flattened transform config arguments instead.' + else: + raise ValueError( + 'Invalid algorithm provided. Supported values is "tabnet".' 
) - tf_transformations_path = transform_config - - if not worker_pool_specs_override: - worker_pool_specs_override = [] - - parameter_values = { - 'project': project, - 'location': location, - 'root_dir': root_dir, - 'target_column': target_column, - 'prediction_type': prediction_type, - 'study_spec_metric_id': study_spec_metric_id, - 'study_spec_metric_goal': study_spec_metric_goal, - 'study_spec_parameters_override': study_spec_parameters_override, - 'max_trial_count': max_trial_count, - 'parallel_trial_count': parallel_trial_count, - 'enable_profiler': enable_profiler, - 'cache_data': cache_data, - 'seed': seed, - 'eval_steps': eval_steps, - 'eval_frequency_secs': eval_frequency_secs, - 'weight_column': weight_column, - 'max_failed_trial_count': max_failed_trial_count, - 'study_spec_algorithm': study_spec_algorithm, - 'study_spec_measurement_selection_type': ( - study_spec_measurement_selection_type - ), - 'transform_dataflow_machine_type': transform_dataflow_machine_type, - 'transform_dataflow_max_num_workers': transform_dataflow_max_num_workers, - 'transform_dataflow_disk_size_gb': transform_dataflow_disk_size_gb, - 'worker_pool_specs_override': worker_pool_specs_override, - 'run_evaluation': run_evaluation, - 'evaluation_batch_predict_machine_type': ( - evaluation_batch_predict_machine_type - ), - 'evaluation_batch_predict_starting_replica_count': ( - evaluation_batch_predict_starting_replica_count - ), - 'evaluation_batch_predict_max_replica_count': ( - evaluation_batch_predict_max_replica_count - ), - 'evaluation_dataflow_machine_type': evaluation_dataflow_machine_type, - 'evaluation_dataflow_starting_num_workers': ( - evaluation_dataflow_starting_num_workers - ), - 'evaluation_dataflow_max_num_workers': ( - evaluation_dataflow_max_num_workers - ), - 'evaluation_dataflow_disk_size_gb': evaluation_dataflow_disk_size_gb, - 'dataflow_service_account': dataflow_service_account, - 'dataflow_subnetwork': dataflow_subnetwork, - 'dataflow_use_public_ips': dataflow_use_public_ips, - 'encryption_spec_key_name': encryption_spec_key_name, - } - - fte_params = { - 'dataset_level_custom_transformation_definitions': ( - dataset_level_custom_transformation_definitions - if dataset_level_custom_transformation_definitions - else [] - ), - 'dataset_level_transformations': ( - dataset_level_transformations if dataset_level_transformations else [] - ), - 'run_feature_selection': run_feature_selection, - 'feature_selection_algorithm': feature_selection_algorithm, - 'max_selected_features': max_selected_features, - 'predefined_split_key': predefined_split_key, - 'stratified_split_key': stratified_split_key, - 'training_fraction': training_fraction, - 'validation_fraction': validation_fraction, - 'test_fraction': test_fraction, - 'tf_auto_transform_features': ( - tf_auto_transform_features if tf_auto_transform_features else {} - ), - 'tf_custom_transformation_definitions': ( - tf_custom_transformation_definitions - if tf_custom_transformation_definitions - else [] - ), - 'tf_transformations_path': tf_transformations_path, - 'materialized_examples_format': ( - materialized_examples_format - if materialized_examples_format - else 'tfrecords_gzip' - ), - 'tf_transform_execution_engine': ( - tf_transform_execution_engine - if tf_transform_execution_engine - else 'dataflow' - ), - } - _update_parameters(parameter_values, fte_params) - - data_source_and_split_parameters = { - 'data_source_csv_filenames': data_source_csv_filenames, - 'data_source_bigquery_table_path': data_source_bigquery_table_path, - 
'bigquery_staging_full_dataset_id': bigquery_staging_full_dataset_id, - } - _update_parameters(parameter_values, data_source_and_split_parameters) - pipeline_definition_path = os.path.join( - pathlib.Path(__file__).parent.resolve(), - 'tabnet_hyperparameter_tuning_job_pipeline.yaml', - ) - return pipeline_definition_path, parameter_values - - -def get_wide_and_deep_hyperparameter_tuning_job_pipeline_and_parameters( +def get_tabnet_hyperparameter_tuning_job_pipeline_and_parameters( project: str, location: str, root_dir: str, @@ -1982,7 +1388,7 @@ def get_wide_and_deep_hyperparameter_tuning_job_pipeline_and_parameters( encryption_spec_key_name: str = '', ) -> Tuple[str, Dict[str, Any]]: # fmt: off - """Get the Wide & Deep algorithm HyperparameterTuningJob pipeline. + """Get the TabNet HyperparameterTuningJob pipeline. Args: project: The GCP project that runs the pipeline components. @@ -2161,7 +1567,7 @@ def get_wide_and_deep_hyperparameter_tuning_job_pipeline_and_parameters( pipeline_definition_path = os.path.join( pathlib.Path(__file__).parent.resolve(), - 'wide_and_deep_hyperparameter_tuning_job_pipeline.yaml', + 'tabnet_hyperparameter_tuning_job_pipeline.yaml', ) return pipeline_definition_path, parameter_values @@ -2553,23 +1959,6 @@ def _format_tabnet_regression_study_spec_parameters_override( return formatted_params -def get_wide_and_deep_study_spec_parameters_override() -> List[Dict[str, Any]]: - """Get study_spec_parameters_override for a Wide & Deep hyperparameter tuning job. - - Returns: - List of study_spec_parameters_override. - """ - param_path = os.path.join( - pathlib.Path(__file__).parent.resolve(), - 'configs/wide_and_deep_params.json', - ) - with open(param_path, 'r') as f: - param_content = f.read() - params = json.loads(param_content) - - return params - - def get_feature_selection_pipeline_and_parameters( root_dir: str, project: str, diff --git a/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_hyperparameter_tuning_job.py b/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_hyperparameter_tuning_job.py deleted file mode 100644 index ceeae83d750..00000000000 --- a/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_hyperparameter_tuning_job.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright 2023 The Kubeflow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""AutoML Wide and Deep Hyperparameter Tuning component spec.""" - -from typing import Optional - -from kfp import dsl -from kfp.dsl import Artifact -from kfp.dsl import Input - - -@dsl.container_component -def wide_and_deep_hyperparameter_tuning_job( - project: str, - location: str, - root_dir: str, - target_column: str, - prediction_type: str, - study_spec_metric_id: str, - study_spec_metric_goal: str, - study_spec_parameters_override: list, - max_trial_count: int, - parallel_trial_count: int, - instance_baseline: Input[Artifact], - metadata: Input[Artifact], - materialized_train_split: Input[Artifact], - materialized_eval_split: Input[Artifact], - transform_output: Input[Artifact], - training_schema_uri: Input[Artifact], - gcp_resources: dsl.OutputPath(str), - instance_schema_uri: dsl.OutputPath(str), - prediction_schema_uri: dsl.OutputPath(str), - trials: dsl.OutputPath(str), - prediction_docker_uri_output: dsl.OutputPath(str), - execution_metrics: dsl.OutputPath(dict), - weight_column: Optional[str] = '', - enable_profiler: Optional[bool] = False, - cache_data: Optional[str] = 'auto', - seed: Optional[int] = 1, - eval_steps: Optional[int] = 0, - eval_frequency_secs: Optional[int] = 600, - max_failed_trial_count: Optional[int] = 0, - study_spec_algorithm: Optional[str] = 'ALGORITHM_UNSPECIFIED', - study_spec_measurement_selection_type: Optional[str] = 'BEST_MEASUREMENT', - training_machine_spec: Optional[dict] = {'machine_type': 'c2-standard-16'}, - training_disk_spec: Optional[dict] = { - 'boot_disk_type': 'pd-ssd', - 'boot_disk_size_gb': 100, - }, - encryption_spec_key_name: Optional[str] = '', -): - # fmt: off - """Tunes Wide & Deep hyperparameters using Vertex HyperparameterTuningJob API. - - Args: - project: The GCP project that runs the pipeline components. - location: The GCP region that runs the pipeline components. - root_dir: The root GCS directory for the pipeline components. - target_column: The target column name. - prediction_type: The type of prediction the model is to produce. "classification" or "regression". - weight_column: The weight column name. - enable_profiler: Enables profiling and saves a trace during evaluation. - cache_data: Whether to cache data or not. If set to 'auto', caching is determined based on the dataset size. - seed: Seed to be used for this run. - eval_steps: Number of steps to run evaluation for. If not specified or negative, it means run evaluation on the whole validation dataset. If set to 0, it means run evaluation for a fixed number of samples. - eval_frequency_secs: Frequency at which evaluation and checkpointing will take place. - study_spec_metric_id: Metric to optimize, possible values: [ 'loss', 'average_loss', 'rmse', 'mae', 'mql', 'accuracy', 'auc', 'precision', 'recall']. - study_spec_metric_goal: Optimization goal of the metric, possible values: "MAXIMIZE", "MINIMIZE". - study_spec_parameters_override: List of dictionaries representing parameters to optimize. The dictionary key is the parameter_id, which is passed to training job as a command line argument, and the dictionary value is the parameter specification of the metric. - max_trial_count: The desired total number of trials. - parallel_trial_count: The desired number of trials to run in parallel. - max_failed_trial_count: The number of failed trials that need to be seen before failing the HyperparameterTuningJob. If set to 0, Vertex AI decides how many trials must fail before the whole job fails. - study_spec_algorithm: The search algorithm specified for the study. 
One of 'ALGORITHM_UNSPECIFIED', 'GRID_SEARCH', or 'RANDOM_SEARCH'. - study_spec_measurement_selection_type: Which measurement to use if/when the service automatically selects the final measurement from previously reported intermediate measurements. One of "BEST_MEASUREMENT" or "LAST_MEASUREMENT". - training_machine_spec: The training machine spec. See https://cloud.google.com/compute/docs/machine-types for options. - training_disk_spec: The training disk spec. - instance_baseline: The path to a JSON file for baseline values. - metadata: Amount of time in seconds to run the trainer for. - materialized_train_split: The path to the materialized train split. - materialized_eval_split: The path to the materialized validation split. - transform_output: The path to transform output. - training_schema_uri: The path to the training schema. - encryption_spec_key_name: The KMS key name. - - Returns: - gcp_resources: Serialized gcp_resources proto tracking the custom training job. - instance_schema_uri: The path to the instance schema. - prediction_schema_uri: The path to the prediction schema. - trials: The path to the hyperparameter tuning trials - prediction_docker_uri_output: The URI of the prediction container. - execution_metrics: Core metrics in dictionary of hyperparameter tuning job execution. - """ - # fmt: on - - return dsl.ContainerSpec( - image='gcr.io/ml-pipeline/google-cloud-pipeline-components:1.0.44', - command=[ - 'python3', - '-u', - '-m', - 'google_cloud_pipeline_components.container.v1.hyperparameter_tuning_job.launcher', - ], - args=[ - '--type', - 'HyperparameterTuningJobWithMetrics', - '--project', - project, - '--location', - location, - '--gcp_resources', - gcp_resources, - '--execution_metrics', - execution_metrics, - '--payload', - dsl.ConcatPlaceholder( - items=[ - ( - '{"display_name":' - f' "wide-and-deep-hyperparameter-tuning-job-{dsl.PIPELINE_JOB_ID_PLACEHOLDER}-{dsl.PIPELINE_TASK_ID_PLACEHOLDER}",' - ' "encryption_spec": {"kms_key_name":"' - ), - encryption_spec_key_name, - '"}, "study_spec": {"metrics": [{"metric_id": "', - study_spec_metric_id, - '", "goal": "', - study_spec_metric_goal, - '"}], "parameters": ', - study_spec_parameters_override, - ', "algorithm": "', - study_spec_algorithm, - '", "measurement_selection_type": "', - study_spec_measurement_selection_type, - '"}, "max_trial_count": ', - max_trial_count, - ', "parallel_trial_count": ', - parallel_trial_count, - ', "max_failed_trial_count": ', - max_failed_trial_count, - ( - ', "trial_job_spec": {"worker_pool_specs":' - ' [{"replica_count":"' - ), - '1', - '", "machine_spec": ', - training_machine_spec, - ', "disk_spec": ', - training_disk_spec, - ', "container_spec": {"image_uri":"', - 'us-docker.pkg.dev/vertex-ai-restricted/automl-tabular/wide-and-deep-training:20251102_1045', - '", "args": ["--target_column=', - target_column, - '", "--weight_column=', - weight_column, - '", "--model_type=', - prediction_type, - '", "--prediction_docker_uri=', - 'us-docker.pkg.dev/vertex-ai/automl-tabular/prediction-server:20251102_1045', - '", "--prediction_docker_uri_artifact_path=', - prediction_docker_uri_output, - '", "--baseline_path=', - instance_baseline.uri, - '", "--metadata_path=', - metadata.uri, - '", "--transform_output_path=', - transform_output.uri, - '", "--training_schema_path=', - training_schema_uri.uri, - '", "--instance_schema_path=', - instance_schema_uri, - '", "--prediction_schema_path=', - prediction_schema_uri, - '", "--trials_path=', - trials, - '", "--job_dir=', - root_dir, - ( - 
f'/{dsl.PIPELINE_JOB_ID_PLACEHOLDER}/{dsl.PIPELINE_TASK_ID_PLACEHOLDER}/train",' - ' "--training_data_path=' - ), - materialized_train_split.uri, - '", "--validation_data_path=', - materialized_eval_split.uri, - '", "--enable_profiler=', - enable_profiler, - '", "--cache_data=', - cache_data, - '", "--measurement_selection_type=', - study_spec_measurement_selection_type, - '", "--metric_goal=', - study_spec_metric_goal, - '", "--seed=', - seed, - '", "--eval_steps=', - eval_steps, - '", "--eval_frequency_secs=', - eval_frequency_secs, - '"]}}]}}', - ] - ), - ], - ) diff --git a/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_hyperparameter_tuning_job_pipeline.yaml b/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_hyperparameter_tuning_job_pipeline.yaml deleted file mode 100644 index bcd3e5cf8e4..00000000000 --- a/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_hyperparameter_tuning_job_pipeline.yaml +++ /dev/null @@ -1,3884 +0,0 @@ -# PIPELINE DEFINITION -# Name: automl-tabular-wide-and-deep-hyperparameter-tuning-job -# Description: The Wide & Deep built-in algorithm HyperparameterTuningJob pipeline. -# Inputs: -# bigquery_staging_full_dataset_id: str [Default: ''] -# cache_data: str [Default: 'auto'] -# data_source_bigquery_table_path: str [Default: ''] -# data_source_csv_filenames: str [Default: ''] -# dataflow_service_account: str [Default: ''] -# dataflow_subnetwork: str [Default: ''] -# dataflow_use_public_ips: bool [Default: True] -# dataset_level_custom_transformation_definitions: list -# dataset_level_transformations: list -# enable_profiler: bool [Default: False] -# encryption_spec_key_name: str [Default: ''] -# eval_frequency_secs: int [Default: 600.0] -# eval_steps: int [Default: 0.0] -# evaluation_batch_predict_machine_type: str [Default: 'n1-highmem-8'] -# evaluation_batch_predict_max_replica_count: int [Default: 20.0] -# evaluation_batch_predict_starting_replica_count: int [Default: 20.0] -# evaluation_dataflow_disk_size_gb: int [Default: 50.0] -# evaluation_dataflow_machine_type: str [Default: 'n1-standard-4'] -# evaluation_dataflow_max_num_workers: int [Default: 100.0] -# evaluation_dataflow_starting_num_workers: int [Default: 10.0] -# feature_selection_algorithm: str [Default: 'AMI'] -# location: str -# materialized_examples_format: str [Default: 'tfrecords_gzip'] -# max_failed_trial_count: int [Default: 0.0] -# max_selected_features: int [Default: -1.0] -# max_trial_count: int -# model_description: str [Default: ''] -# model_display_name: str [Default: ''] -# parallel_trial_count: int -# parent_model: system.Artifact -# predefined_split_key: str [Default: ''] -# prediction_type: str -# project: str -# root_dir: str -# run_evaluation: bool [Default: False] -# run_feature_selection: bool [Default: False] -# seed: int [Default: 1.0] -# stratified_split_key: str [Default: ''] -# study_spec_algorithm: str [Default: 'ALGORITHM_UNSPECIFIED'] -# study_spec_measurement_selection_type: str [Default: 'BEST_MEASUREMENT'] -# study_spec_metric_goal: str -# study_spec_metric_id: str -# study_spec_parameters_override: list -# target_column: str -# test_fraction: float [Default: -1.0] -# tf_auto_transform_features: dict -# tf_custom_transformation_definitions: list -# tf_transform_execution_engine: str [Default: 'bigquery'] -# tf_transformations_path: str [Default: ''] -# training_fraction: float [Default: -1.0] -# transform_dataflow_disk_size_gb: int 
[Default: 40.0] -# transform_dataflow_machine_type: str [Default: 'n1-standard-16'] -# transform_dataflow_max_num_workers: int [Default: 25.0] -# validation_fraction: float [Default: -1.0] -# vertex_dataset: system.Artifact -# weight_column: str [Default: ''] -# worker_pool_specs_override: list -# Outputs: -# model-evaluation-evaluation_metrics: system.Metrics -components: - comp-automl-tabular-finalizer: - executorLabel: exec-automl-tabular-finalizer - inputDefinitions: - parameters: - encryption_spec_key_name: - defaultValue: '' - description: Customer-managed encryption key. - isOptional: true - parameterType: STRING - location: - description: Location for running the Cross-validation trainer. - parameterType: STRING - project: - description: Project to run Cross-validation trainer. - parameterType: STRING - root_dir: - description: The Cloud Storage location to store the output. - parameterType: STRING - outputDefinitions: - parameters: - gcp_resources: - description: GCP resources created by this component. For more details, - see https://github.com/kubeflow/pipelines/blob/master/components/google-cloud/google_cloud_pipeline_components/proto/README.md. - parameterType: STRING - comp-automl-tabular-infra-validator: - executorLabel: exec-automl-tabular-infra-validator - inputDefinitions: - artifacts: - unmanaged_container_model: - artifactType: - schemaTitle: google.UnmanagedContainerModel - schemaVersion: 0.0.1 - description: google.UnmanagedContainerModel for model to be validated. - comp-bool-identity: - executorLabel: exec-bool-identity - inputDefinitions: - parameters: - value: - description: Boolean value to return - parameterType: BOOLEAN - outputDefinitions: - parameters: - Output: - parameterType: STRING - comp-condition-2: - dag: - outputs: - artifacts: - model-evaluation-evaluation_metrics: - artifactSelectors: - - outputArtifactKey: evaluation_metrics - producerSubtask: model-evaluation - tasks: - model-batch-predict: - cachingOptions: - enableCache: true - componentRef: - name: comp-model-batch-predict - inputs: - artifacts: - unmanaged_container_model: - componentInputArtifact: pipelinechannel--get-best-hyperparameter-tuning-job-trial-unmanaged_container_model - parameters: - bigquery_source_input_uri: - componentInputParameter: pipelinechannel--feature-transform-engine-bigquery_test_split_uri - encryption_spec_key_name: - componentInputParameter: pipelinechannel--encryption_spec_key_name - gcs_destination_output_uri_prefix: - componentInputParameter: pipelinechannel--root_dir - instances_format: - runtimeValue: - constant: bigquery - job_display_name: - runtimeValue: - constant: batch-predict-evaluation-{{$.pipeline_job_uuid}}-{{$.pipeline_task_uuid}} - location: - componentInputParameter: pipelinechannel--location - machine_type: - componentInputParameter: pipelinechannel--evaluation_batch_predict_machine_type - max_replica_count: - componentInputParameter: pipelinechannel--evaluation_batch_predict_max_replica_count - predictions_format: - runtimeValue: - constant: jsonl - project: - componentInputParameter: pipelinechannel--project - starting_replica_count: - componentInputParameter: pipelinechannel--evaluation_batch_predict_starting_replica_count - taskInfo: - name: model-batch-predict - model-evaluation: - cachingOptions: - enableCache: true - componentRef: - name: comp-model-evaluation - dependentTasks: - - model-batch-predict - inputs: - artifacts: - batch_prediction_job: - taskOutputArtifact: - outputArtifactKey: batchpredictionjob - producerTask: 
model-batch-predict - parameters: - dataflow_disk_size: - componentInputParameter: pipelinechannel--evaluation_dataflow_disk_size_gb - dataflow_machine_type: - componentInputParameter: pipelinechannel--evaluation_dataflow_machine_type - dataflow_max_workers_num: - componentInputParameter: pipelinechannel--evaluation_dataflow_max_num_workers - dataflow_service_account: - componentInputParameter: pipelinechannel--dataflow_service_account - dataflow_subnetwork: - componentInputParameter: pipelinechannel--dataflow_subnetwork - dataflow_use_public_ips: - componentInputParameter: pipelinechannel--dataflow_use_public_ips - dataflow_workers_num: - componentInputParameter: pipelinechannel--evaluation_dataflow_starting_num_workers - encryption_spec_key_name: - componentInputParameter: pipelinechannel--encryption_spec_key_name - ground_truth_column: - componentInputParameter: pipelinechannel--target_column - ground_truth_format: - runtimeValue: - constant: jsonl - location: - componentInputParameter: pipelinechannel--location - prediction_label_column: - runtimeValue: - constant: '' - prediction_score_column: - runtimeValue: - constant: '' - predictions_format: - runtimeValue: - constant: jsonl - problem_type: - componentInputParameter: pipelinechannel--prediction_type - project: - componentInputParameter: pipelinechannel--project - root_dir: - componentInputParameter: pipelinechannel--root_dir - taskInfo: - name: model-evaluation - inputDefinitions: - artifacts: - pipelinechannel--get-best-hyperparameter-tuning-job-trial-unmanaged_container_model: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - parameters: - pipelinechannel--bool-identity-Output: - parameterType: STRING - pipelinechannel--dataflow_service_account: - parameterType: STRING - pipelinechannel--dataflow_subnetwork: - parameterType: STRING - pipelinechannel--dataflow_use_public_ips: - parameterType: BOOLEAN - pipelinechannel--encryption_spec_key_name: - parameterType: STRING - pipelinechannel--evaluation_batch_predict_machine_type: - parameterType: STRING - pipelinechannel--evaluation_batch_predict_max_replica_count: - parameterType: NUMBER_INTEGER - pipelinechannel--evaluation_batch_predict_starting_replica_count: - parameterType: NUMBER_INTEGER - pipelinechannel--evaluation_dataflow_disk_size_gb: - parameterType: NUMBER_INTEGER - pipelinechannel--evaluation_dataflow_machine_type: - parameterType: STRING - pipelinechannel--evaluation_dataflow_max_num_workers: - parameterType: NUMBER_INTEGER - pipelinechannel--evaluation_dataflow_starting_num_workers: - parameterType: NUMBER_INTEGER - pipelinechannel--feature-transform-engine-bigquery_test_split_uri: - parameterType: STRING - pipelinechannel--location: - parameterType: STRING - pipelinechannel--prediction_type: - parameterType: STRING - pipelinechannel--project: - parameterType: STRING - pipelinechannel--root_dir: - parameterType: STRING - pipelinechannel--target_column: - parameterType: STRING - outputDefinitions: - artifacts: - model-evaluation-evaluation_metrics: - artifactType: - schemaTitle: system.Metrics - schemaVersion: 0.0.1 - comp-exit-handler-1: - dag: - outputs: - artifacts: - model-evaluation-evaluation_metrics: - artifactSelectors: - - outputArtifactKey: model-evaluation-evaluation_metrics - producerSubtask: condition-2 - tasks: - automl-tabular-infra-validator: - cachingOptions: - enableCache: true - componentRef: - name: comp-automl-tabular-infra-validator - dependentTasks: - - get-best-hyperparameter-tuning-job-trial - inputs: - artifacts: - 
unmanaged_container_model: - taskOutputArtifact: - outputArtifactKey: unmanaged_container_model - producerTask: get-best-hyperparameter-tuning-job-trial - taskInfo: - name: automl-tabular-infra-validator - bool-identity: - cachingOptions: - enableCache: true - componentRef: - name: comp-bool-identity - inputs: - parameters: - value: - componentInputParameter: pipelinechannel--run_evaluation - taskInfo: - name: bool-identity - condition-2: - componentRef: - name: comp-condition-2 - dependentTasks: - - bool-identity - - feature-transform-engine - - get-best-hyperparameter-tuning-job-trial - inputs: - artifacts: - pipelinechannel--get-best-hyperparameter-tuning-job-trial-unmanaged_container_model: - taskOutputArtifact: - outputArtifactKey: unmanaged_container_model - producerTask: get-best-hyperparameter-tuning-job-trial - parameters: - pipelinechannel--bool-identity-Output: - taskOutputParameter: - outputParameterKey: Output - producerTask: bool-identity - pipelinechannel--dataflow_service_account: - componentInputParameter: pipelinechannel--dataflow_service_account - pipelinechannel--dataflow_subnetwork: - componentInputParameter: pipelinechannel--dataflow_subnetwork - pipelinechannel--dataflow_use_public_ips: - componentInputParameter: pipelinechannel--dataflow_use_public_ips - pipelinechannel--encryption_spec_key_name: - componentInputParameter: pipelinechannel--encryption_spec_key_name - pipelinechannel--evaluation_batch_predict_machine_type: - componentInputParameter: pipelinechannel--evaluation_batch_predict_machine_type - pipelinechannel--evaluation_batch_predict_max_replica_count: - componentInputParameter: pipelinechannel--evaluation_batch_predict_max_replica_count - pipelinechannel--evaluation_batch_predict_starting_replica_count: - componentInputParameter: pipelinechannel--evaluation_batch_predict_starting_replica_count - pipelinechannel--evaluation_dataflow_disk_size_gb: - componentInputParameter: pipelinechannel--evaluation_dataflow_disk_size_gb - pipelinechannel--evaluation_dataflow_machine_type: - componentInputParameter: pipelinechannel--evaluation_dataflow_machine_type - pipelinechannel--evaluation_dataflow_max_num_workers: - componentInputParameter: pipelinechannel--evaluation_dataflow_max_num_workers - pipelinechannel--evaluation_dataflow_starting_num_workers: - componentInputParameter: pipelinechannel--evaluation_dataflow_starting_num_workers - pipelinechannel--feature-transform-engine-bigquery_test_split_uri: - taskOutputParameter: - outputParameterKey: bigquery_test_split_uri - producerTask: feature-transform-engine - pipelinechannel--location: - componentInputParameter: pipelinechannel--location - pipelinechannel--prediction_type: - componentInputParameter: pipelinechannel--prediction_type - pipelinechannel--project: - componentInputParameter: pipelinechannel--project - pipelinechannel--root_dir: - componentInputParameter: pipelinechannel--root_dir - pipelinechannel--target_column: - componentInputParameter: pipelinechannel--target_column - taskInfo: - name: run-evaluation - triggerPolicy: - condition: inputs.parameter_values['pipelinechannel--bool-identity-Output'] - == 'true' - feature-transform-engine: - cachingOptions: - enableCache: true - componentRef: - name: comp-feature-transform-engine - inputs: - parameters: - bigquery_staging_full_dataset_id: - componentInputParameter: pipelinechannel--bigquery_staging_full_dataset_id - data_source_bigquery_table_path: - componentInputParameter: pipelinechannel--set-optional-inputs-data_source_bigquery_table_path - 
data_source_csv_filenames: - componentInputParameter: pipelinechannel--set-optional-inputs-data_source_csv_filenames - dataflow_disk_size_gb: - componentInputParameter: pipelinechannel--transform_dataflow_disk_size_gb - dataflow_machine_type: - componentInputParameter: pipelinechannel--transform_dataflow_machine_type - dataflow_max_num_workers: - componentInputParameter: pipelinechannel--transform_dataflow_max_num_workers - dataflow_service_account: - componentInputParameter: pipelinechannel--dataflow_service_account - dataflow_subnetwork: - componentInputParameter: pipelinechannel--dataflow_subnetwork - dataflow_use_public_ips: - componentInputParameter: pipelinechannel--dataflow_use_public_ips - dataset_level_custom_transformation_definitions: - componentInputParameter: pipelinechannel--dataset_level_custom_transformation_definitions - dataset_level_transformations: - componentInputParameter: pipelinechannel--dataset_level_transformations - encryption_spec_key_name: - componentInputParameter: pipelinechannel--encryption_spec_key_name - feature_selection_algorithm: - componentInputParameter: pipelinechannel--feature_selection_algorithm - location: - componentInputParameter: pipelinechannel--location - materialized_examples_format: - componentInputParameter: pipelinechannel--materialized_examples_format - max_selected_features: - componentInputParameter: pipelinechannel--max_selected_features - model_type: - runtimeValue: - constant: neural_network - predefined_split_key: - componentInputParameter: pipelinechannel--predefined_split_key - prediction_type: - componentInputParameter: pipelinechannel--prediction_type - project: - componentInputParameter: pipelinechannel--project - root_dir: - componentInputParameter: pipelinechannel--root_dir - run_feature_selection: - componentInputParameter: pipelinechannel--run_feature_selection - stratified_split_key: - componentInputParameter: pipelinechannel--stratified_split_key - target_column: - componentInputParameter: pipelinechannel--target_column - test_fraction: - componentInputParameter: pipelinechannel--test_fraction - tf_auto_transform_features: - componentInputParameter: pipelinechannel--tf_auto_transform_features - tf_custom_transformation_definitions: - componentInputParameter: pipelinechannel--tf_custom_transformation_definitions - tf_transform_execution_engine: - componentInputParameter: pipelinechannel--tf_transform_execution_engine - tf_transformations_path: - componentInputParameter: pipelinechannel--tf_transformations_path - training_fraction: - componentInputParameter: pipelinechannel--training_fraction - validation_fraction: - componentInputParameter: pipelinechannel--validation_fraction - weight_column: - componentInputParameter: pipelinechannel--weight_column - taskInfo: - name: feature-transform-engine - get-best-hyperparameter-tuning-job-trial: - cachingOptions: - enableCache: true - componentRef: - name: comp-get-best-hyperparameter-tuning-job-trial - dependentTasks: - - wide-and-deep-hyperparameter-tuning-job - inputs: - parameters: - gcp_resources: - taskOutputParameter: - outputParameterKey: gcp_resources - producerTask: wide-and-deep-hyperparameter-tuning-job - instance_schema_uri: - taskOutputParameter: - outputParameterKey: instance_schema_uri - producerTask: wide-and-deep-hyperparameter-tuning-job - prediction_docker_uri: - taskOutputParameter: - outputParameterKey: prediction_docker_uri_output - producerTask: wide-and-deep-hyperparameter-tuning-job - prediction_schema_uri: - taskOutputParameter: - outputParameterKey: 
prediction_schema_uri - producerTask: wide-and-deep-hyperparameter-tuning-job - study_spec_metric_goal: - componentInputParameter: pipelinechannel--study_spec_metric_goal - trials_dir: - taskOutputParameter: - outputParameterKey: trials - producerTask: wide-and-deep-hyperparameter-tuning-job - taskInfo: - name: get-best-hyperparameter-tuning-job-trial - get-wide-and-deep-study-spec-parameters: - cachingOptions: - enableCache: true - componentRef: - name: comp-get-wide-and-deep-study-spec-parameters - inputs: - parameters: - study_spec_parameters_override: - componentInputParameter: pipelinechannel--study_spec_parameters_override - taskInfo: - name: get-wide-and-deep-study-spec-parameters - model-upload: - cachingOptions: - enableCache: true - componentRef: - name: comp-model-upload - dependentTasks: - - automl-tabular-infra-validator - - get-best-hyperparameter-tuning-job-trial - inputs: - artifacts: - parent_model: - componentInputArtifact: pipelinechannel--parent_model - unmanaged_container_model: - taskOutputArtifact: - outputArtifactKey: unmanaged_container_model - producerTask: get-best-hyperparameter-tuning-job-trial - parameters: - description: - componentInputParameter: pipelinechannel--model_description - display_name: - componentInputParameter: pipelinechannel--get-model-display-name-model_display_name - encryption_spec_key_name: - componentInputParameter: pipelinechannel--encryption_spec_key_name - location: - componentInputParameter: pipelinechannel--location - project: - componentInputParameter: pipelinechannel--project - taskInfo: - name: model-upload - parse-worker-pool-specs-override: - cachingOptions: - enableCache: true - componentRef: - name: comp-parse-worker-pool-specs-override - inputs: - parameters: - worker_pool_specs_override: - componentInputParameter: pipelinechannel--worker_pool_specs_override - taskInfo: - name: parse-worker-pool-specs-override - split-materialized-data: - cachingOptions: - enableCache: true - componentRef: - name: comp-split-materialized-data - dependentTasks: - - feature-transform-engine - inputs: - artifacts: - materialized_data: - taskOutputArtifact: - outputArtifactKey: materialized_data - producerTask: feature-transform-engine - taskInfo: - name: split-materialized-data - training-configurator-and-validator: - cachingOptions: - enableCache: true - componentRef: - name: comp-training-configurator-and-validator - dependentTasks: - - feature-transform-engine - inputs: - artifacts: - dataset_stats: - taskOutputArtifact: - outputArtifactKey: dataset_stats - producerTask: feature-transform-engine - instance_schema: - taskOutputArtifact: - outputArtifactKey: instance_schema - producerTask: feature-transform-engine - training_schema: - taskOutputArtifact: - outputArtifactKey: training_schema - producerTask: feature-transform-engine - parameters: - prediction_type: - componentInputParameter: pipelinechannel--prediction_type - run_evaluation: - componentInputParameter: pipelinechannel--run_evaluation - split_example_counts: - taskOutputParameter: - outputParameterKey: split_example_counts - producerTask: feature-transform-engine - target_column: - componentInputParameter: pipelinechannel--target_column - weight_column: - componentInputParameter: pipelinechannel--weight_column - taskInfo: - name: training-configurator-and-validator - wide-and-deep-hyperparameter-tuning-job: - cachingOptions: - enableCache: true - componentRef: - name: comp-wide-and-deep-hyperparameter-tuning-job - dependentTasks: - - feature-transform-engine - - 
get-wide-and-deep-study-spec-parameters - - parse-worker-pool-specs-override - - split-materialized-data - - training-configurator-and-validator - inputs: - artifacts: - instance_baseline: - taskOutputArtifact: - outputArtifactKey: instance_baseline - producerTask: training-configurator-and-validator - materialized_eval_split: - taskOutputArtifact: - outputArtifactKey: materialized_eval_split - producerTask: split-materialized-data - materialized_train_split: - taskOutputArtifact: - outputArtifactKey: materialized_train_split - producerTask: split-materialized-data - metadata: - taskOutputArtifact: - outputArtifactKey: metadata - producerTask: training-configurator-and-validator - training_schema_uri: - taskOutputArtifact: - outputArtifactKey: training_schema - producerTask: feature-transform-engine - transform_output: - taskOutputArtifact: - outputArtifactKey: transform_output - producerTask: feature-transform-engine - parameters: - cache_data: - componentInputParameter: pipelinechannel--cache_data - enable_profiler: - componentInputParameter: pipelinechannel--enable_profiler - encryption_spec_key_name: - componentInputParameter: pipelinechannel--encryption_spec_key_name - eval_frequency_secs: - componentInputParameter: pipelinechannel--eval_frequency_secs - eval_steps: - componentInputParameter: pipelinechannel--eval_steps - location: - componentInputParameter: pipelinechannel--location - max_failed_trial_count: - componentInputParameter: pipelinechannel--max_failed_trial_count - max_trial_count: - componentInputParameter: pipelinechannel--max_trial_count - parallel_trial_count: - componentInputParameter: pipelinechannel--parallel_trial_count - prediction_type: - componentInputParameter: pipelinechannel--prediction_type - project: - componentInputParameter: pipelinechannel--project - root_dir: - componentInputParameter: pipelinechannel--root_dir - seed: - componentInputParameter: pipelinechannel--seed - study_spec_algorithm: - componentInputParameter: pipelinechannel--study_spec_algorithm - study_spec_measurement_selection_type: - componentInputParameter: pipelinechannel--study_spec_measurement_selection_type - study_spec_metric_goal: - componentInputParameter: pipelinechannel--study_spec_metric_goal - study_spec_metric_id: - componentInputParameter: pipelinechannel--study_spec_metric_id - study_spec_parameters_override: - taskOutputParameter: - outputParameterKey: Output - producerTask: get-wide-and-deep-study-spec-parameters - target_column: - componentInputParameter: pipelinechannel--target_column - training_disk_spec: - taskOutputParameter: - outputParameterKey: training_disk_spec - producerTask: parse-worker-pool-specs-override - training_machine_spec: - taskOutputParameter: - outputParameterKey: training_machine_spec - producerTask: parse-worker-pool-specs-override - weight_column: - componentInputParameter: pipelinechannel--weight_column - taskInfo: - name: wide-and-deep-hyperparameter-tuning-job - inputDefinitions: - artifacts: - pipelinechannel--parent_model: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - parameters: - pipelinechannel--bigquery_staging_full_dataset_id: - parameterType: STRING - pipelinechannel--cache_data: - parameterType: STRING - pipelinechannel--dataflow_service_account: - parameterType: STRING - pipelinechannel--dataflow_subnetwork: - parameterType: STRING - pipelinechannel--dataflow_use_public_ips: - parameterType: BOOLEAN - pipelinechannel--dataset_level_custom_transformation_definitions: - parameterType: LIST - 
pipelinechannel--dataset_level_transformations: - parameterType: LIST - pipelinechannel--enable_profiler: - parameterType: BOOLEAN - pipelinechannel--encryption_spec_key_name: - parameterType: STRING - pipelinechannel--eval_frequency_secs: - parameterType: NUMBER_INTEGER - pipelinechannel--eval_steps: - parameterType: NUMBER_INTEGER - pipelinechannel--evaluation_batch_predict_machine_type: - parameterType: STRING - pipelinechannel--evaluation_batch_predict_max_replica_count: - parameterType: NUMBER_INTEGER - pipelinechannel--evaluation_batch_predict_starting_replica_count: - parameterType: NUMBER_INTEGER - pipelinechannel--evaluation_dataflow_disk_size_gb: - parameterType: NUMBER_INTEGER - pipelinechannel--evaluation_dataflow_machine_type: - parameterType: STRING - pipelinechannel--evaluation_dataflow_max_num_workers: - parameterType: NUMBER_INTEGER - pipelinechannel--evaluation_dataflow_starting_num_workers: - parameterType: NUMBER_INTEGER - pipelinechannel--feature_selection_algorithm: - parameterType: STRING - pipelinechannel--get-model-display-name-model_display_name: - parameterType: STRING - pipelinechannel--location: - parameterType: STRING - pipelinechannel--materialized_examples_format: - parameterType: STRING - pipelinechannel--max_failed_trial_count: - parameterType: NUMBER_INTEGER - pipelinechannel--max_selected_features: - parameterType: NUMBER_INTEGER - pipelinechannel--max_trial_count: - parameterType: NUMBER_INTEGER - pipelinechannel--model_description: - parameterType: STRING - pipelinechannel--parallel_trial_count: - parameterType: NUMBER_INTEGER - pipelinechannel--predefined_split_key: - parameterType: STRING - pipelinechannel--prediction_type: - parameterType: STRING - pipelinechannel--project: - parameterType: STRING - pipelinechannel--root_dir: - parameterType: STRING - pipelinechannel--run_evaluation: - parameterType: BOOLEAN - pipelinechannel--run_feature_selection: - parameterType: BOOLEAN - pipelinechannel--seed: - parameterType: NUMBER_INTEGER - pipelinechannel--set-optional-inputs-data_source_bigquery_table_path: - parameterType: STRING - pipelinechannel--set-optional-inputs-data_source_csv_filenames: - parameterType: STRING - pipelinechannel--stratified_split_key: - parameterType: STRING - pipelinechannel--study_spec_algorithm: - parameterType: STRING - pipelinechannel--study_spec_measurement_selection_type: - parameterType: STRING - pipelinechannel--study_spec_metric_goal: - parameterType: STRING - pipelinechannel--study_spec_metric_id: - parameterType: STRING - pipelinechannel--study_spec_parameters_override: - parameterType: LIST - pipelinechannel--target_column: - parameterType: STRING - pipelinechannel--test_fraction: - parameterType: NUMBER_DOUBLE - pipelinechannel--tf_auto_transform_features: - parameterType: STRUCT - pipelinechannel--tf_custom_transformation_definitions: - parameterType: LIST - pipelinechannel--tf_transform_execution_engine: - parameterType: STRING - pipelinechannel--tf_transformations_path: - parameterType: STRING - pipelinechannel--training_fraction: - parameterType: NUMBER_DOUBLE - pipelinechannel--transform_dataflow_disk_size_gb: - parameterType: NUMBER_INTEGER - pipelinechannel--transform_dataflow_machine_type: - parameterType: STRING - pipelinechannel--transform_dataflow_max_num_workers: - parameterType: NUMBER_INTEGER - pipelinechannel--validation_fraction: - parameterType: NUMBER_DOUBLE - pipelinechannel--weight_column: - parameterType: STRING - pipelinechannel--worker_pool_specs_override: - parameterType: LIST - 
outputDefinitions: - artifacts: - model-evaluation-evaluation_metrics: - artifactType: - schemaTitle: system.Metrics - schemaVersion: 0.0.1 - comp-feature-transform-engine: - executorLabel: exec-feature-transform-engine - inputDefinitions: - parameters: - autodetect_csv_schema: - defaultValue: false - description: 'If True, infers the column types - - when importing CSVs into BigQuery.' - isOptional: true - parameterType: BOOLEAN - bigquery_staging_full_dataset_id: - defaultValue: '' - description: Dataset in "projectId.datasetId" format for storing intermediate-FTE - BigQuery tables. If the specified dataset does not exist in BigQuery, - FTE will create the dataset. If no bigquery_staging_full_dataset_id is - specified, all intermediate tables will be stored in a dataset created - under the provided project in the input data source's location during - FTE execution called "vertex_feature_transform_engine_staging_{location.replace('-', - '_')}". All tables generated by FTE will have a 30 day TTL. - isOptional: true - parameterType: STRING - data_source_bigquery_table_path: - defaultValue: '' - description: BigQuery input data source to run feature transform on. - isOptional: true - parameterType: STRING - data_source_csv_filenames: - defaultValue: '' - description: CSV input data source to run feature transform on. - isOptional: true - parameterType: STRING - dataflow_disk_size_gb: - defaultValue: 40.0 - description: The disk size, in gigabytes, to use on each Dataflow worker - instance. If not set, default to 40. - isOptional: true - parameterType: NUMBER_INTEGER - dataflow_machine_type: - defaultValue: n1-standard-16 - description: The machine type used for dataflow jobs. If not set, default - to n1-standard-16. - isOptional: true - parameterType: STRING - dataflow_max_num_workers: - defaultValue: 25.0 - description: The number of workers to run the dataflow job. If not set, - default to 25. - isOptional: true - parameterType: NUMBER_INTEGER - dataflow_service_account: - defaultValue: '' - description: Custom service account to run Dataflow jobs. - isOptional: true - parameterType: STRING - dataflow_subnetwork: - defaultValue: '' - description: 'Dataflow''s fully qualified subnetwork name, when empty the - default subnetwork will be used. More details: https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications' - isOptional: true - parameterType: STRING - dataflow_use_public_ips: - defaultValue: true - description: Specifies whether Dataflow workers use public IP addresses. - isOptional: true - parameterType: BOOLEAN - dataset_level_custom_transformation_definitions: - defaultValue: [] - description: 'List of dataset-level custom transformation definitions. Custom, - bring-your-own dataset-level transform functions, where users can define - and import their own transform function and use it with FTE''s built-in - transformations. Using custom transformations is an experimental feature - and it is currently not supported during batch prediction. - - [ { "transformation": "ConcatCols", "module_path": "/path/to/custom_transform_fn_dlt.py", - "function_name": "concat_cols" } ] Using custom transform function together - with FTE''s built-in transformations: .. 
code-block:: python [ { "transformation": - "Join", "right_table_uri": "bq://test-project.dataset_test.table", "join_keys": - [["join_key_col", "join_key_col"]] },{ "transformation": "ConcatCols", - "cols": ["feature_1", "feature_2"], "output_col": "feature_1_2" } ]' - isOptional: true - parameterType: LIST - dataset_level_transformations: - defaultValue: [] - description: "List of dataset-level transformations.\n[ { \"transformation\"\ - : \"Join\", \"right_table_uri\": \"bq://test-project.dataset_test.table\"\ - , \"join_keys\": [[\"join_key_col\", \"join_key_col\"]] }, ... ] Additional\ - \ information about FTE's currently supported built-in\n transformations:\n\ - \ Join: Joins features from right_table_uri. For each join key, the\ - \ left table keys will be included and the right table keys will be dropped.\n\ - \ Example: .. code-block:: python { \"transformation\": \"Join\"\ - , \"right_table_uri\": \"bq://test-project.dataset_test.table\", \"join_keys\"\ - : [[\"join_key_col\", \"join_key_col\"]] }\n Arguments:\n \ - \ right_table_uri: Right table BigQuery uri to join with input_full_table_id.\n\ - \ join_keys: Features to join on. For each nested list, the\ - \ first element is a left table column and the second is its corresponding\ - \ right table column.\n TimeAggregate: Creates a new feature composed\ - \ of values of an existing feature from a fixed time period ago or in\ - \ the future.\n Ex: A feature for sales by store 1 year ago.\n \ - \ Example: .. code-block:: python { \"transformation\": \"TimeAggregate\"\ - , \"time_difference\": 40, \"time_difference_units\": \"DAY\", \"time_series_identifier_columns\"\ - : [\"store_id\"], \"time_column\": \"time_col\", \"time_difference_target_column\"\ - : \"target_col\", \"output_column\": \"output_col\" }\n Arguments:\n\ - \ time_difference: Number of time_difference_units to look\ - \ back or into the future on our time_difference_target_column.\n \ - \ time_difference_units: Units of time_difference to look back\ - \ or into the future on our time_difference_target_column. Must be one\ - \ of * 'DAY' * 'WEEK' (Equivalent to 7 DAYs) * 'MONTH' * 'QUARTER' * 'YEAR'\n\ - \ time_series_identifier_columns: Names of the time series\ - \ identifier columns.\n time_column: Name of the time column.\n\ - \ time_difference_target_column: Column we wish to get the\ - \ value of time_difference time_difference_units in the past or future.\n\ - \ output_column: Name of our new time aggregate feature.\n\ - \ is_future: Whether we wish to look forward in time. Defaults\ - \ to False. PartitionByMax/PartitionByMin/PartitionByAvg/PartitionBySum:\ - \ Performs a partition by reduce operation (one of max, min, avg, or sum)\ - \ with a fixed historic time period. Ex: Getting avg sales (the reduce\ - \ column) for each store (partition_by_column) over the previous 5 days\ - \ (time_column, time_ago_units, and time_ago).\n Example: .. code-block::\ - \ python { \"transformation\": \"PartitionByMax\", \"reduce_column\"\ - : \"sell_price\", \"partition_by_columns\": [\"store_id\", \"state_id\"\ - ], \"time_column\": \"date\", \"time_ago\": 1, \"time_ago_units\": \"\ - WEEK\", \"output_column\": \"partition_by_reduce_max_output\" }\n \ - \ Arguments:\n reduce_column: Column to apply the reduce\ - \ operation on. 
Reduce operations include the\n following:\ - \ Max, Min, Avg, Sum.\n partition_by_columns: List of columns\ - \ to partition by.\n time_column: Time column for the partition\ - \ by operation's window function.\n time_ago: Number of time_ago_units\ - \ to look back on our target_column, starting from time_column (inclusive).\n\ - \ time_ago_units: Units of time_ago to look back on our target_column.\ - \ Must be one of * 'DAY' * 'WEEK'\n output_column: Name of\ - \ our output feature." - isOptional: true - parameterType: LIST - encryption_spec_key_name: - defaultValue: '' - description: Customer-managed encryption key. - isOptional: true - parameterType: STRING - feature_selection_algorithm: - defaultValue: AMI - description: "The algorithm of feature selection. One of \"AMI\", \"CMIM\"\ - , \"JMIM\", \"MRMR\", default to be \"AMI\". The algorithms available\ - \ are: AMI(Adjusted Mutual Information):\nReference: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_mutual_info_score.html\ - \ Arrays are not yet supported in this algorithm. CMIM(Conditional Mutual\ - \ Information Maximization): Reference paper: Mohamed Bennasar, Yulia\ - \ Hicks, Rossitza Setchi, \u201CFeature selection using Joint Mutual Information\ - \ Maximisation,\u201D Expert Systems with Applications, vol. 42, issue\ - \ 22, 1 December 2015, Pages 8520-8532. JMIM(Joint Mutual Information\ - \ Maximization\nReference:\n paper: Mohamed Bennasar, Yulia Hicks, Rossitza\ - \ Setchi, \u201CFeature selection using Joint Mutual Information Maximisation,\u201D\ - \ Expert Systems with Applications, vol. 42, issue 22, 1 December 2015,\ - \ Pages 8520-8532. MRMR(MIQ Minimum-redundancy Maximum-relevance): Reference\ - \ paper: Hanchuan Peng, Fuhui Long, and Chris Ding. \"Feature selection\ - \ based on mutual information criteria of max-dependency, max-relevance,\ - \ and min-redundancy.\" IEEE Transactions on pattern analysis and machine\ - \ intelligence 27, no.\n 8: 1226-1238." - isOptional: true - parameterType: STRING - feature_selection_execution_engine: - defaultValue: dataflow - description: Execution engine to run feature selection, value can be dataflow, - bigquery. - isOptional: true - parameterType: STRING - forecasting_apply_windowing: - defaultValue: true - description: Whether to apply window strategy. - isOptional: true - parameterType: BOOLEAN - forecasting_available_at_forecast_columns: - defaultValue: [] - description: Forecasting available at forecast columns. - isOptional: true - parameterType: LIST - forecasting_context_window: - defaultValue: -1.0 - description: Forecasting context window. - isOptional: true - parameterType: NUMBER_INTEGER - forecasting_forecast_horizon: - defaultValue: -1.0 - description: Forecasting horizon. - isOptional: true - parameterType: NUMBER_INTEGER - forecasting_holiday_regions: - defaultValue: [] - description: 'The geographical region based on which the holiday effect - is applied in modeling by adding holiday categorical array feature that - include all holidays matching the date. This option only allowed when - data granularity is day. By default, holiday effect modeling is disabled. - To turn it on, specify the holiday region using this option. - - Top level: * ''GLOBAL'' - - Second level: continental regions: * ''NA'': North America - - * ''JAPAC'': Japan and Asia Pacific - - * ''EMEA'': Europe, the Middle East and Africa - - * ''LAC'': Latin America and the Caribbean - - Third level: countries from ISO 3166-1 Country codes. 
- - Valid regions: * ''GLOBAL'' * ''NA'' * ''JAPAC'' * ''EMEA'' * ''LAC'' - * ''AE'' - - * ''AR'' * ''AT'' * ''AU'' * ''BE'' * ''BR'' * ''CA'' * ''CH'' * ''CL'' - * ''CN'' * ''CO'' - - * ''CZ'' * ''DE'' * ''DK'' * ''DZ'' * ''EC'' * ''EE'' * ''EG'' * ''ES'' - * ''FI'' * ''FR'' - - * ''GB'' * ''GR'' * ''HK'' * ''HU'' * ''ID'' * ''IE'' * ''IL'' * ''IN'' - * ''IR'' * ''IT'' - - * ''JP'' * ''KR'' * ''LV'' * ''MA'' * ''MX'' * ''MY'' * ''NG'' * ''NL'' - * ''NO'' * ''NZ'' - - * ''PE'' * ''PH'' * ''PK'' * ''PL'' * ''PT'' * ''RO'' * ''RS'' * ''RU'' - * ''SA'' * ''SE'' - - * ''SG'' * ''SI'' * ''SK'' * ''TH'' * ''TR'' * ''TW'' * ''UA'' * ''US'' - * ''VE'' * ''VN'' - - * ''ZA''' - isOptional: true - parameterType: LIST - forecasting_predefined_window_column: - defaultValue: '' - description: Forecasting predefined window column. - isOptional: true - parameterType: STRING - forecasting_time_column: - defaultValue: '' - description: Forecasting time column. - isOptional: true - parameterType: STRING - forecasting_time_series_attribute_columns: - defaultValue: [] - description: Forecasting time series attribute columns. - isOptional: true - parameterType: LIST - forecasting_time_series_identifier_column: - description: '[Deprecated] A forecasting time series identifier column. - Raises an exception if used - use the "time_series_identifier_column" - field instead.' - isOptional: true - parameterType: STRING - forecasting_time_series_identifier_columns: - defaultValue: [] - description: The list of forecasting time series identifier columns. - isOptional: true - parameterType: LIST - forecasting_unavailable_at_forecast_columns: - defaultValue: [] - description: Forecasting unavailable at forecast columns. - isOptional: true - parameterType: LIST - forecasting_window_max_count: - defaultValue: -1.0 - description: Forecasting window max count. - isOptional: true - parameterType: NUMBER_INTEGER - forecasting_window_stride_length: - defaultValue: -1.0 - description: Forecasting window stride length. - isOptional: true - parameterType: NUMBER_INTEGER - group_columns: - isOptional: true - parameterType: LIST - group_temporal_total_weight: - defaultValue: 0.0 - isOptional: true - parameterType: NUMBER_DOUBLE - group_total_weight: - defaultValue: 0.0 - isOptional: true - parameterType: NUMBER_DOUBLE - legacy_transformations_path: - defaultValue: '' - isOptional: true - parameterType: STRING - location: - description: Location for the created GCP services. - parameterType: STRING - materialized_examples_format: - defaultValue: tfrecords_gzip - description: The format to use for the materialized examples. Should be - either 'tfrecords_gzip' (default) or 'parquet'. - isOptional: true - parameterType: STRING - max_selected_features: - defaultValue: 1000.0 - description: Maximum number of features to select. If specified, the transform - config will be purged by only using the selected features that ranked - top in the feature ranking, which has the ranking value for all supported - features. If the number of input features is smaller than max_selected_features - specified, we will still run the feature selection process and generate - the feature ranking, no features will be excluded. The value will be - set to 1000 by default if run_feature_selection is enabled. - isOptional: true - parameterType: NUMBER_INTEGER - model_type: - description: 'Model type, which we wish to engineer features for. Can be - one of: neural_network, boosted_trees, l2l, seq2seq, tft, or tide. Defaults - to the empty value, `None`.' 
- isOptional: true - parameterType: STRING - multimodal_image_columns: - defaultValue: [] - description: List of multimodal image columns. Defaults to an empty list. - isOptional: true - parameterType: LIST - multimodal_tabular_columns: - defaultValue: [] - description: List of multimodal tabular columns. Defaults to an empty list - isOptional: true - parameterType: LIST - multimodal_text_columns: - defaultValue: [] - description: List of multimodal text columns. Defaults to an empty list - isOptional: true - parameterType: LIST - multimodal_timeseries_columns: - defaultValue: [] - description: List of multimodal timeseries columns. Defaults to an empty - list - isOptional: true - parameterType: LIST - predefined_split_key: - defaultValue: '' - description: Predefined split key. - isOptional: true - parameterType: STRING - prediction_type: - defaultValue: '' - description: Model prediction type. One of "classification", "regression", - "time_series". - isOptional: true - parameterType: STRING - project: - description: Project to run feature transform engine. - parameterType: STRING - root_dir: - description: The Cloud Storage location to store the output. - parameterType: STRING - run_distill: - defaultValue: false - description: (deprecated) Whether the distillation should be applied to - the training. - isOptional: true - parameterType: BOOLEAN - run_feature_selection: - defaultValue: false - description: Whether the feature selection should be applied to the dataset. - isOptional: true - parameterType: BOOLEAN - stats_gen_execution_engine: - defaultValue: dataflow - description: 'Execution engine to perform statistics generation. Can be - one of: "dataflow" (by default) or "bigquery". Using "bigquery" as the - execution engine is experimental.' - isOptional: true - parameterType: STRING - stratified_split_key: - defaultValue: '' - description: Stratified split key. - isOptional: true - parameterType: STRING - target_column: - defaultValue: '' - description: Target column of input data. - isOptional: true - parameterType: STRING - temporal_total_weight: - defaultValue: 0.0 - isOptional: true - parameterType: NUMBER_DOUBLE - test_fraction: - defaultValue: -1.0 - description: Fraction of input data for testing. - isOptional: true - parameterType: NUMBER_DOUBLE - tf_auto_transform_features: - defaultValue: {} - description: 'Dict mapping auto and/or type-resolutions to TF transform - features. FTE will automatically configure a set of built-in transformations - for each feature based on its data statistics. If users do not want auto - type resolution, but want the set of transformations for a given type - to be automatically generated, they may specify pre-resolved transformations - types. The following type hint dict keys are supported: * ''auto'' * ''categorical'' - * ''numeric'' * ''text'' * ''timestamp'' Example: `{ "auto": ["feature1"], - "categorical": ["feature2", "feature3"], }`. Note that the target and - weight column may not be included as an auto transformation unless users - are running forecasting.' - isOptional: true - parameterType: STRUCT - tf_custom_transformation_definitions: - defaultValue: [] - description: 'List of TensorFlow-based custom transformation definitions. Custom, - bring-your-own transform functions, where users can define and import - their own transform function and use it with FTE''s built-in transformations. 
- `[ { "transformation": "PlusOne", "module_path": "gs://bucket/custom_transform_fn.py", - "function_name": "plus_one_transform" }, { "transformation": "MultiplyTwo", - "module_path": "gs://bucket/custom_transform_fn.py", "function_name": - "multiply_two_transform" } ] Using custom transform function together - with FTE''s built-in transformations: .. code-block:: python [ { "transformation": - "CastToFloat", "input_columns": ["feature_1"], "output_columns": ["feature_1"] - },{ "transformation": "PlusOne", "input_columns": ["feature_1"] "output_columns": - ["feature_1_plused_one"] },{ "transformation": "MultiplyTwo", "input_columns": - ["feature_1"] "output_columns": ["feature_1_multiplied_two"] } ]' - isOptional: true - parameterType: LIST - tf_transform_execution_engine: - defaultValue: dataflow - description: 'Execution engine to perform row-level TF transformations. - Can be one of: "dataflow" (by default) or "bigquery". Using "bigquery" - as the execution engine is experimental and is for allowlisted customers - only. In addition, executing on "bigquery" only supports auto transformations - (i.e., specified by tf_auto_transform_features) and will raise an error - when tf_custom_transformation_definitions or tf_transformations_path is - set.' - isOptional: true - parameterType: STRING - tf_transformations_path: - defaultValue: '' - description: "Path to TensorFlow-based transformation configuration. Path\ - \ to a JSON file used to specified FTE's TF transformation configurations.\ - \ In the following, we provide some sample transform configurations to\ - \ demonstrate FTE's capabilities. All transformations on input columns\ - \ are explicitly specified with FTE's built-in transformations. Chaining\ - \ of multiple transformations on a single column is also supported. For\ - \ example: .. code-block:: python [ { \"transformation\": \"ZScale\"\ - , \"input_columns\": [\"feature_1\"] }, { \"transformation\": \"ZScale\"\ - , \"input_columns\": [\"feature_2\"] } ]`. Additional information about\ - \ FTE's currently supported built-in\ntransformations:\nDatetime: Extracts\ - \ datetime featues from a column containing timestamp strings.\n Example:\ - \ .. code-block:: python { \"transformation\": \"Datetime\", \"input_columns\"\ - : [\"feature_1\"], \"time_format\": \"%Y-%m-%d\" }\n Arguments:\n \ - \ input_columns: A list with a single column to perform the datetime\ - \ transformation on.\n output_columns: Names of output columns,\ - \ one for each datetime_features element.\n time_format: Datetime\ - \ format string. Time format is a combination of Date + Time Delimiter\ - \ (optional) + Time (optional) directives. Valid date directives are as\ - \ follows * '%Y-%m-%d' # 2018-11-30 * '%Y/%m/%d' # 2018/11/30 * '%y-%m-%d'\ - \ # 18-11-30 * '%y/%m/%d' # 18/11/30 * '%m-%d-%Y' # 11-30-2018 * '%m/%d/%Y'\ - \ # 11/30/2018 * '%m-%d-%y' # 11-30-18 * '%m/%d/%y' # 11/30/18 * '%d-%m-%Y'\ - \ # 30-11-2018 * '%d/%m/%Y' # 30/11/2018 * '%d-%B-%Y' # 30-November-2018\ - \ * '%d-%m-%y' # 30-11-18 * '%d/%m/%y' # 30/11/18 * '%d-%B-%y' # 30-November-18\ - \ * '%d%m%Y' # 30112018 * '%m%d%Y' # 11302018 * '%Y%m%d' # 20181130\ - \ Valid time delimiters are as follows * 'T' * ' ' Valid time directives\ - \ are as follows * '%H:%M' # 23:59 * '%H:%M:%S' #\n \ - \ 23:59:58 * '%H:%M:%S.%f' # 23:59:58[.123456] * '%H:%M:%S.%f%z'\ - \ # 23:59:58[.123456]+0000 * '%H:%M:%S%z', # 23:59:58+0000\n \ - \ datetime_features: List of datetime features to be extract. 
Each entry\ - \ must be one of * 'YEAR' * 'MONTH' * 'DAY' * 'DAY_OF_WEEK' * 'DAY_OF_YEAR'\ - \ * 'WEEK_OF_YEAR' * 'QUARTER' * 'HOUR' * 'MINUTE' * 'SECOND' Defaults\ - \ to ['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'DAY_OF_YEAR', 'WEEK_OF_YEAR']\n\ - Log: Performs the natural log on a numeric column.\n Example: .. code-block::\ - \ python { \"transformation\": \"Log\", \"input_columns\": [\"feature_1\"\ - ] }\n Arguments:\n input_columns: A list with a single column\ - \ to perform the log transformation on.\n output_columns: A list\ - \ with a single output column name, corresponding to the output of our\ - \ transformation.\nZScale: Performs Z-scale normalization on a numeric\ - \ column.\n Example: .. code-block:: python { \"transformation\"\ - : \"ZScale\", \"input_columns\": [\"feature_1\"] }\n Arguments:\n \ - \ input_columns: A list with a single column to perform the z-scale\ - \ transformation on.\n output_columns: A list with a single output\ - \ column name, corresponding to the output of our transformation.\nVocabulary:\ - \ Converts strings to integers, where each unique string gets a unique\ - \ integer representation.\n Example: .. code-block:: python { \"\ - transformation\": \"Vocabulary\", \"input_columns\": [\"feature_1\"] }\n\ - \ Arguments:\n input_columns: A list with a single column to\ - \ perform the vocabulary transformation on.\n output_columns: A\ - \ list with a single output column name, corresponding to the output of\ - \ our transformation.\n top_k: Number of the most frequent words\ - \ in the vocabulary to use for generating dictionary lookup indices. If\ - \ not specified, all words in the vocabulary will be used. Defaults to\ - \ None.\n frequency_threshold: Limit the vocabulary only to words\ - \ whose number of occurrences in the input exceeds frequency_threshold.\ - \ If not specified, all words in the vocabulary will be included. If both\ - \ top_k and frequency_threshold are specified, a word must satisfy both\ - \ conditions to be included. Defaults to None.\nCategorical: Transforms\ - \ categorical columns to integer columns.\n Example: .. code-block::\ - \ python { \"transformation\": \"Categorical\", \"input_columns\": [\"\ - feature_1\"], \"top_k\": 10 }\n Arguments:\n input_columns:\ - \ A list with a single column to perform the categorical transformation\ - \ on.\n output_columns: A list with a single output column name,\ - \ corresponding to the output of our transformation.\n top_k: Number\ - \ of the most frequent words in the vocabulary to use for generating dictionary\ - \ lookup indices. If not specified, all words in the vocabulary will be\ - \ used.\n frequency_threshold: Limit the vocabulary only to words\ - \ whose number of occurrences in the input exceeds frequency_threshold.\ - \ If not specified, all words in the vocabulary will be included. If both\ - \ top_k and frequency_threshold are specified, a word must satisfy both\ - \ conditions to be included.\nReduce: Given a column where each entry\ - \ is a numeric array, reduces arrays according to our reduce_mode.\n \ - \ Example: .. 
code-block:: python { \"transformation\": \"Reduce\"\ - , \"input_columns\": [\"feature_1\"], \"reduce_mode\": \"MEAN\", \"output_columns\"\ - : [\"feature_1_mean\"] }\n Arguments:\n input_columns: A list\ - \ with a single column to perform the reduce transformation on.\n \ - \ output_columns: A list with a single output column name, corresponding\ - \ to the output of our transformation.\n reduce_mode: One of *\ - \ 'MAX' * 'MIN' * 'MEAN' * 'LAST_K' Defaults to 'MEAN'.\n last_k:\ - \ The number of last k elements when 'LAST_K' reduce mode is used. Defaults\ - \ to 1.\nSplitString: Given a column of strings, splits strings into token\ - \ arrays.\n Example: .. code-block:: python { \"transformation\"\ - : \"SplitString\", \"input_columns\": [\"feature_1\"], \"separator\":\ - \ \"$\" }\n Arguments:\n input_columns: A list with a single\ - \ column to perform the split string transformation on.\n output_columns:\ - \ A list with a single output column name, corresponding to the output\ - \ of our transformation.\n separator: Separator to split input\ - \ string into tokens. Defaults to ' '.\n missing_token: Missing\ - \ token to use when no string is included. Defaults to ' _MISSING_ '.\n\ - NGram: Given a column of strings, splits strings into token arrays where\ - \ each token is an integer.\n Example: .. code-block:: python { \"\ - transformation\": \"NGram\", \"input_columns\": [\"feature_1\"], \"min_ngram_size\"\ - : 1, \"max_ngram_size\": 2, \"separator\": \" \" }\n Arguments:\n \ - \ input_columns: A list with a single column to perform the n-gram\ - \ transformation on.\n output_columns: A list with a single output\ - \ column name, corresponding to the output of our transformation.\n \ - \ min_ngram_size: Minimum n-gram size. Must be a positive number\ - \ and <= max_ngram_size. Defaults to 1.\n max_ngram_size: Maximum\ - \ n-gram size. Must be a positive number and >= min_ngram_size. Defaults\ - \ to 2.\n top_k: Number of the most frequent words in the vocabulary\ - \ to use for generating dictionary lookup indices. If not specified, all\ - \ words in the vocabulary will be used. Defaults to None.\n frequency_threshold:\ - \ Limit the dictionary's vocabulary only to words whose number of occurrences\ - \ in the input exceeds frequency_threshold. If not specified, all words\ - \ in the vocabulary will be included. If both top_k and frequency_threshold\ - \ are specified, a word must satisfy both conditions to be included. Defaults\ - \ to None.\n separator: Separator to split input string into tokens.\ - \ Defaults to ' '.\n missing_token: Missing token to use when no\ - \ string is included. Defaults to ' _MISSING_ '.\nClip: Given a numeric\ - \ column, clips elements such that elements < min_value are assigned min_value,\ - \ and elements > max_value are assigned max_value.\n Example: .. code-block::\ - \ python { \"transformation\": \"Clip\", \"input_columns\": [\"col1\"\ - ], \"output_columns\": [\"col1_clipped\"], \"min_value\": 1., \"max_value\"\ - : 10., }\n Arguments:\n input_columns: A list with a single\ - \ column to perform the n-gram transformation on.\n output_columns:\ - \ A list with a single output column name, corresponding to the output\ - \ of our transformation.\n min_value: Number where all values below\ - \ min_value are set to min_value. If no min_value is provided, min clipping\ - \ will not occur. Defaults to None.\n max_value: Number where all\ - \ values above max_value are set to max_value If no max_value is provided,\ - \ max clipping will not occur. 
Defaults to None.\nMultiHotEncoding: Performs\ - \ multi-hot encoding on a categorical array column.\n Example: ..\ - \ code-block:: python { \"transformation\": \"MultiHotEncoding\", \"\ - input_columns\": [\"col1\"], } The number of classes is determened by\ - \ the largest number included in the input if it is numeric or the total\ - \ number of unique values of the input if it is type str. If the input\ - \ is has type str and an element contians separator tokens, the input\ - \ will be split at separator indices, and the each element of the split\ - \ list will be considered a seperate class. For example,\n Input: \ - \ .. code-block:: python [ [\"foo bar\"], # Example 0 [\"foo\",\ - \ \"bar\"], # Example 1 [\"foo\"], # Example 2 [\"bar\"], \ - \ # Example 3 ] Output (with default separator=\" \"): .. code-block::\ - \ python [ [1, 1], # Example 0 [1, 1], # Example 1 [1,\ - \ 0], # Example 2 [0, 1], # Example 3 ]\n Arguments:\n\ - \ input_columns: A list with a single column to perform the multi-hot-encoding\ - \ on.\n output_columns: A list with a single output column name,\ - \ corresponding to the output of our transformation.\n top_k: Number\ - \ of the most frequent words in the vocabulary to use for generating dictionary\ - \ lookup indices. If not specified, all words in the vocabulary will be\ - \ used. Defaults to None.\n frequency_threshold: Limit the dictionary's\ - \ vocabulary only to words whose number of occurrences in the input exceeds\ - \ frequency_threshold. If not specified, all words in the vocabulary will\ - \ be included. If both top_k and frequency_threshold are specified, a\ - \ word must satisfy both conditions to be included. Defaults to None.\n\ - \ separator: Separator to split input string into tokens. Defaults\ - \ to ' '.\nMaxAbsScale: Performs maximum absolute scaling on a numeric\ - \ column.\n Example: .. code-block:: python { \"transformation\"\ - : \"MaxAbsScale\", \"input_columns\": [\"col1\"], \"output_columns\":\ - \ [\"col1_max_abs_scaled\"] }\n Arguments:\n input_columns:\ - \ A list with a single column to perform max-abs-scale on.\n output_columns:\ - \ A list with a single output column name, corresponding to the output\ - \ of our transformation.\nCustom: Transformations defined in tf_custom_transformation_definitions\ - \ are included here in the TensorFlow-based transformation configuration.\ - \ For example, given the following tf_custom_transformation_definitions:\ - \ .. code-block:: python [ { \"transformation\": \"PlusX\", \"module_path\"\ - : \"gs://bucket/custom_transform_fn.py\", \"function_name\": \"plus_one_transform\"\ - \ } ] We can include the following transformation: .. code-block:: python\ - \ { \"transformation\": \"PlusX\", \"input_columns\": [\"col1\"], \"\ - output_columns\": [\"col1_max_abs_scaled\"] \"x\": 5 } Note that input_columns\ - \ must still be included in our arguments and output_columns is optional.\ - \ All other arguments are those defined in custom_transform_fn.py, which\ - \ includes `\"x\"` in this case. See tf_custom_transformation_definitions\ - \ above. legacy_transformations_path (Optional[str]) Deprecated. Prefer\ - \ tf_auto_transform_features. Path to a GCS file containing JSON string\ - \ for legacy style transformations. Note that legacy_transformations_path\ - \ and tf_auto_transform_features cannot both be specified." - isOptional: true - parameterType: STRING - timestamp_split_key: - defaultValue: '' - description: Timestamp split key. 
- isOptional: true - parameterType: STRING - training_fraction: - defaultValue: -1.0 - description: Fraction of input data for training. - isOptional: true - parameterType: NUMBER_DOUBLE - validation_fraction: - defaultValue: -1.0 - description: Fraction of input data for validation. - isOptional: true - parameterType: NUMBER_DOUBLE - weight_column: - defaultValue: '' - description: Weight column of input data. - isOptional: true - parameterType: STRING - outputDefinitions: - artifacts: - dataset_stats: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The stats of the dataset. - feature_ranking: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The ranking of features, all features supported in the dataset - will be included. For "AMI" algorithm, array features won't be available - in the ranking as arrays are not supported yet. - instance_schema: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - materialized_data: - artifactType: - schemaTitle: system.Dataset - schemaVersion: 0.0.1 - description: The materialized dataset. - training_schema: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - transform_output: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The transform output artifact. - parameters: - bigquery_downsampled_test_split_uri: - description: BigQuery URI for the downsampled test split to pass to the - batch prediction component during batch explain. - parameterType: STRING - bigquery_test_split_uri: - description: BigQuery URI for the test split to pass to the batch prediction - component during evaluation. - parameterType: STRING - bigquery_train_split_uri: - description: BigQuery URI for the train split to pass to the batch prediction - component during distillation. - parameterType: STRING - bigquery_validation_split_uri: - description: BigQuery URI for the validation split to pass to the batch - prediction component during distillation. - parameterType: STRING - gcp_resources: - description: GCP resources created by this component. For more details, - see https://github.com/kubeflow/pipelines/blob/master/components/google-cloud/google_cloud_pipeline_components/proto/README.md. - parameterType: STRING - split_example_counts: - description: JSON string of data split example counts for train, validate, - and test splits. - parameterType: STRING - comp-get-best-hyperparameter-tuning-job-trial: - executorLabel: exec-get-best-hyperparameter-tuning-job-trial - inputDefinitions: - parameters: - gcp_resources: - description: Proto tracking the hyperparameter tuning job. - parameterType: STRING - instance_schema_uri: - defaultValue: '' - description: The instance schema uri. - isOptional: true - parameterType: STRING - prediction_docker_uri: - defaultValue: '' - description: The prediction docker container uri. - isOptional: true - parameterType: STRING - prediction_schema_uri: - defaultValue: '' - description: The prediction schema_uri. - isOptional: true - parameterType: STRING - read_value_from_file: - defaultValue: false - description: If true, read file to get the relevant value. - isOptional: true - parameterType: BOOLEAN - study_spec_metric_goal: - description: 'Optimization goal of the metric, possible values: - - "MAXIMIZE", "MINIMIZE".' - parameterType: STRING - trials_dir: - defaultValue: '' - description: The path to the hyperparameter tuning trials. 
- isOptional: true - parameterType: STRING - outputDefinitions: - artifacts: - unmanaged_container_model: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - comp-get-model-display-name: - executorLabel: exec-get-model-display-name - inputDefinitions: - parameters: - model_display_name: - parameterType: STRING - outputDefinitions: - parameters: - model_display_name: - parameterType: STRING - comp-get-wide-and-deep-study-spec-parameters: - executorLabel: exec-get-wide-and-deep-study-spec-parameters - inputDefinitions: - parameters: - study_spec_parameters_override: - description: 'List of dictionaries representing parameters - - to optimize. The dictionary key is the parameter_id, which is passed to - - training job as a command line argument, and the dictionary value is the - - parameter specification of the metric.' - parameterType: LIST - outputDefinitions: - parameters: - Output: - parameterType: LIST - comp-model-batch-predict: - executorLabel: exec-model-batch-predict - inputDefinitions: - artifacts: - model: - artifactType: - schemaTitle: google.VertexModel - schemaVersion: 0.0.1 - description: 'The Model used to get predictions via this job. Must share - the same - - ancestor Location. Starting this job has no impact on any existing - - deployments of the Model and their resources. Either this or - - `unmanaged_container_model` must be specified.' - isOptional: true - unmanaged_container_model: - artifactType: - schemaTitle: google.UnmanagedContainerModel - schemaVersion: 0.0.1 - description: 'The unmanaged container model used to get predictions via - this job. - - This should be used for models that are not uploaded to Vertex. Either - - this or model must be specified.' - isOptional: true - parameters: - accelerator_count: - defaultValue: 0.0 - description: 'The number of accelerators to attach - - to the `machine_type`. Only used if `machine_type` is set. For more - - details about the machine spec, see - - https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec' - isOptional: true - parameterType: NUMBER_INTEGER - accelerator_type: - defaultValue: '' - description: 'The type of accelerator(s) that may be - - attached to the machine as per `accelerator_count`. Only used if - - `machine_type` is set. For more details about the machine spec, see - - https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec' - isOptional: true - parameterType: STRING - bigquery_destination_output_uri: - defaultValue: '' - description: 'The BigQuery project location where the output is to be written - to. In - - the given project a new dataset is created with name - - `prediction__` where is made - - BigQuery-dataset-name compatible (for example, most special characters - - become underscores), and timestamp is in YYYY_MM_DDThh_mm_ss_sssZ - - "based on ISO-8601" format. In the dataset two tables will be created, - - `predictions`, and `errors`. If the Model has both `instance` - - and `prediction` schemata defined then the tables have columns as - - follows: The `predictions` table contains instances for which the - - prediction succeeded, it has columns as per a concatenation of the - - Model''s instance and prediction schemata. The `errors` table - - contains rows for which the prediction has failed, it has instance - - columns, as per the instance schema, followed by a single "errors" - - column, which as values has [google.rpc.Status](Status) - - represented as a STRUCT, and containing only `code` and - - `message`. 
For more details about this output config, see - - https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#OutputConfig.' - isOptional: true - parameterType: STRING - bigquery_source_input_uri: - defaultValue: '' - description: 'BigQuery URI to a table, up to 2000 characters long. For example: - - `projectId.bqDatasetId.bqTableId` For more details about this input - - config, see - - https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#InputConfig.' - isOptional: true - parameterType: STRING - encryption_spec_key_name: - defaultValue: '' - description: 'Customer-managed encryption - - key options for a BatchPredictionJob. If this is set, then all - - resources created by the BatchPredictionJob will be encrypted with the - - provided encryption key. Has the form: - - `projects/my-project/locations/my-location/keyRings/my-kr/cryptoKeys/my-key`. - - The key needs to be in the same region as where the compute resource - - is created.' - isOptional: true - parameterType: STRING - excluded_fields: - defaultValue: [] - description: 'Fields that will be excluded in the prediction instance that - is - - sent to the Model. - - Excluded will be attached to the batch prediction output if - - key_field is not specified. - - When `excluded_fields` is populated, `included_fields` must be empty. - - The input must be JSONL with objects at each line, CSV, BigQuery - - or TfRecord. - - may be specified via the Model''s `parameters_schema_uri`.' - isOptional: true - parameterType: LIST - explanation_metadata: - defaultValue: {} - description: 'Explanation metadata - - configuration for this BatchPredictionJob. Can be specified only if - - `generate_explanation` is set to `True`. This value overrides the - - value of `Model.explanation_metadata`. All fields of - - `explanation_metadata` are optional in the request. If a field of the - - `explanation_metadata` object is not populated, the corresponding - - field of the `Model.explanation_metadata` object is inherited. For - - more details, see - - https://cloud.google.com/vertex-ai/docs/reference/rest/v1/ExplanationSpec#explanationmetadata.' - isOptional: true - parameterType: STRUCT - explanation_parameters: - defaultValue: {} - description: 'Parameters to configure - - explaining for Model''s predictions. Can be specified only if - - `generate_explanation` is set to `True`. This value overrides the - - value of `Model.explanation_parameters`. All fields of - - `explanation_parameters` are optional in the request. If a field of - - the `explanation_parameters` object is not populated, the - - corresponding field of the `Model.explanation_parameters` object is - - inherited. For more details, see - - https://cloud.google.com/vertex-ai/docs/reference/rest/v1/ExplanationSpec#ExplanationParameters.' - isOptional: true - parameterType: STRUCT - gcs_destination_output_uri_prefix: - defaultValue: '' - description: 'The Google Cloud - - Storage location of the directory where the output is to be written - - to. In the given directory a new directory is created. Its name is - - `prediction--`, where timestamp - - is in YYYY-MM-DDThh:mm:ss.sssZ ISO-8601 format. Inside of it files - - `predictions_0001.`, `predictions_0002.`, - - ..., `predictions_N.` are created where `` - - depends on chosen `predictions_format`, and N may equal 0001 and - - depends on the total number of successfully predicted instances. 
If - - the Model has both `instance` and `prediction` schemata defined - - then each such file contains predictions as per the - - `predictions_format`. If prediction for any instance failed - - (partially or completely), then an additional - - `errors_0001.`, `errors_0002.`,..., - - `errors_N.` files are created (N depends on total number - - of failed predictions). These files contain the failed instances, as - - per their schema, followed by an additional `error` field which as - - value has `google.rpc.Status` containing only `code` and - - `message` fields. For more details about this output config, see - - https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#OutputConfig.' - isOptional: true - parameterType: STRING - gcs_source_uris: - defaultValue: [] - description: 'Google Cloud Storage URI(-s) to your instances to run batch - prediction - - on. They must match `instances_format`. May contain wildcards. For more - - information on wildcards, see [WildcardNames](https://cloud.google.com/storage/docs/gsutil/addlhelp/WildcardNames). - - For more details about this input config, see [InputConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#InputConfig).' - isOptional: true - parameterType: LIST - generate_explanation: - defaultValue: false - description: 'Generate explanation along with - - the batch prediction results. This will cause the batch prediction - - output to include explanations based on the `prediction_format`: - - - `bigquery`: output includes a column named `explanation`. The value is - - a struct that conforms to the [aiplatform.gapic.Explanation] object. - - - `jsonl`: The JSON objects on each line include an additional entry - - keyed `explanation`. The value of the entry is a JSON object that - - conforms to the [aiplatform.gapic.Explanation] object. - `csv`: - - Generating explanations for CSV format is not supported. If this - - field is set to true, either the Model.explanation_spec or - - explanation_metadata and explanation_parameters must be populated.' - isOptional: true - parameterType: BOOLEAN - included_fields: - defaultValue: [] - description: 'Fields that will be included in the prediction instance that - is - - sent to the Model. - - If `instance_type` is `array`, the order of field names in - - `included_fields` also determines the order of the values in the array. - - When `included_fields` is populated, `excluded_fields` must be empty. - - The input must be JSONL with objects at each line, CSV, BigQuery - - or TfRecord.' - isOptional: true - parameterType: LIST - instance_type: - defaultValue: '' - description: "The format of the instance that the Model\naccepts. Vertex\ - \ AI will convert compatible\n[InstancesFormat](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#InputConfig)\n\ - to the specified format. Supported values are:\n`object`: Each input is\ - \ converted to JSON object format.\n * For `bigquery`, each row is converted\ - \ to an object.\n * For `jsonl`, each line of the JSONL input must be\ - \ an object.\n * Does not apply to `csv`, `file-list`, `tf-record`, or\ - \ `tf-record-gzip`.\n`array`: Each input is converted to JSON array format.\n\ - \ * For `bigquery`, each row is converted to an array. 
The order\n \ - \ of columns is determined by the BigQuery column order, unless\n \ - \ [included_fields](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#InputConfig)\ - \ is populated.\n `included_fields` must be populated for specifying\ - \ field orders.\n * For `jsonl`, if each line of the JSONL input is an\ - \ object,\n `included_fields` must be populated for specifying field\ - \ orders.\n * Does not apply to `csv`, `file-list`, `tf-record`, or\n\ - \ `tf-record-gzip`.\nIf not specified, Vertex AI converts the batch\ - \ prediction input as\nfollows:\n * For `bigquery` and `csv`, the behavior\ - \ is the same as `array`. The\n order of columns is the same as defined\ - \ in the file or table, unless\n included_fields is populated.\n * For\ - \ `jsonl`, the prediction instance format is determined by\n each line\ - \ of the input.\n * For `tf-record`/`tf-record-gzip`, each record will\ - \ be converted to\n an object in the format of `{\"b64\": }`,\ - \ where `` is\n the Base64-encoded string of the content of the\ - \ record.\n * For `file-list`, each file in the list will be converted\ - \ to an\n object in the format of `{\"b64\": }`, where ``\ - \ is\n the Base64-encoded string of the content of the file." - isOptional: true - parameterType: STRING - instances_format: - defaultValue: jsonl - description: 'The format in which instances are - - given, must be one of the [Model](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.models)''s - supportedInputStorageFormats. - - For more details about this input config, see - - [InputConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#InputConfig.)' - isOptional: true - parameterType: STRING - job_display_name: - description: The user-defined name of this BatchPredictionJob. - parameterType: STRING - key_field: - defaultValue: '' - description: "The name of the field that is considered as a key.\nThe values\ - \ identified by the key field is not included in the\ntransformed instances\ - \ that is sent to the Model. This is similar to\nspecifying this name\ - \ of the field in [excluded_fields](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#InputConfig).\ - \ In addition,\nthe batch prediction output will not include the instances.\ - \ Instead the\noutput will only include the value of the key field, in\ - \ a field named\n`key` in the output:\n * For `jsonl` output format, the\ - \ output will have a `key` field\n instead of the `instance` field.\n\ - \ * For `csv`/`bigquery` output format, the output will have have a `key`\n\ - \ column instead of the instance feature columns.\nThe input must be\ - \ JSONL with objects at each line, CSV, BigQuery\nor TfRecord." - isOptional: true - parameterType: STRING - labels: - defaultValue: {} - description: 'The labels with user-defined metadata to - - organize your BatchPredictionJobs. Label keys and values can be no - - longer than 64 characters (Unicode codepoints), can only contain - - lowercase letters, numeric characters, underscores and dashes. - - International characters are allowed. See https://goo.gl/xmQnxf for - - more information and examples of labels.' - isOptional: true - parameterType: STRUCT - location: - defaultValue: us-central1 - description: Location for creating the BatchPredictionJob. 
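As an illustrative sketch of how the batch-prediction inputs documented above fit together: parameter names come from this component spec, while the project, bucket, and machine sizing values below are made-up placeholders.

.. code-block:: python

    # Illustrative only. Exactly one input source is used: GCS URIs matching
    # instances_format, or a BigQuery table. Output here goes to GCS.
    batch_predict_params = {
        "job_display_name": "wide-and-deep-batch-predict",
        "location": "us-central1",
        "instances_format": "csv",
        "gcs_source_uris": ["gs://my-bucket/prediction-input/*.csv"],
        # "bigquery_source_input_uri": "bq://my-project.my_dataset.my_table",
        "predictions_format": "jsonl",
        "gcs_destination_output_uri_prefix": "gs://my-bucket/prediction-output",
        # Dedicated resources; only honored because machine_type is set.
        "machine_type": "n1-standard-16",
        "starting_replica_count": 1,
        "max_replica_count": 5,
        "generate_explanation": False,
    }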
- isOptional: true - parameterType: STRING - machine_type: - defaultValue: '' - description: 'The type of machine for running batch - - prediction on dedicated resources. If the Model supports - - DEDICATED_RESOURCES this config may be provided (and the job will use - - these resources). If the Model doesn''t support AUTOMATIC_RESOURCES, - - this config must be provided. For more details about the - - BatchDedicatedResources, see - - https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#BatchDedicatedResources. - - For more details about the machine spec, see - - https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec' - isOptional: true - parameterType: STRING - manual_batch_tuning_parameters_batch_size: - defaultValue: 0.0 - description: 'The number of - - the records (e.g. instances) of the operation given in each batch to a - - machine replica. Machine type, and size of a single record should be - - considered when setting this parameter, higher value speeds up the - - batch operation''s execution, but too high value will result in a whole - - batch not fitting in a machine''s memory, and the whole operation will - - fail.' - isOptional: true - parameterType: NUMBER_INTEGER - max_replica_count: - defaultValue: 0.0 - description: 'The maximum number of machine replicas the batch operation - may be scaled - - to. Only used if `machine_type` is set.' - isOptional: true - parameterType: NUMBER_INTEGER - model_parameters: - defaultValue: {} - description: The parameters that govern the predictions. The schema of the - parameters - isOptional: true - parameterType: STRUCT - predictions_format: - defaultValue: jsonl - description: 'The format in which Vertex AI gives the predictions. Must - be one of the - - Model''s supportedOutputStorageFormats. - - For more details about this output config, see [OutputConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#OutputConfig).' - isOptional: true - parameterType: STRING - project: - defaultValue: '{{$.pipeline_google_cloud_project_id}}' - description: Project to create the BatchPredictionJob. Defaults to the project - in which the PipelineJob is run. - isOptional: true - parameterType: STRING - starting_replica_count: - defaultValue: 0.0 - description: 'The number of machine replicas - - used at the start of the batch operation. If not set, Vertex AI - - decides starting number, not greater than `max_replica_count`. Only - - used if `machine_type` is set.' - isOptional: true - parameterType: NUMBER_INTEGER - outputDefinitions: - artifacts: - batchpredictionjob: - artifactType: - schemaTitle: google.VertexBatchPredictionJob - schemaVersion: 0.0.1 - description: '[**Deprecated. Use gcs_output_directory and bigquery_output_table - - instead.**] Artifact - - representation of the created batch prediction job.' - bigquery_output_table: - artifactType: - schemaTitle: google.BQTable - schemaVersion: 0.0.1 - description: 'Artifact tracking the batch prediction job output. This is - only - - available if - - bigquery_output_table is specified.' - gcs_output_directory: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: 'Artifact tracking the batch prediction job output. This is - only - - available if - - gcs_destination_output_uri_prefix is specified.' - parameters: - gcp_resources: - description: 'Serialized gcp_resources proto tracking the batch prediction - job. 
- - For more details, see - - https://github.com/kubeflow/pipelines/blob/master/components/google-cloud/google_cloud_pipeline_components/proto/README.md.' - parameterType: STRING - comp-model-evaluation: - executorLabel: exec-model-evaluation - inputDefinitions: - artifacts: - batch_prediction_job: - artifactType: - schemaTitle: google.VertexBatchPredictionJob - schemaVersion: 0.0.1 - parameters: - dataflow_disk_size: - defaultValue: 50.0 - isOptional: true - parameterType: NUMBER_INTEGER - dataflow_machine_type: - defaultValue: n1-standard-4 - isOptional: true - parameterType: STRING - dataflow_max_workers_num: - defaultValue: 100.0 - isOptional: true - parameterType: NUMBER_INTEGER - dataflow_service_account: - defaultValue: '' - isOptional: true - parameterType: STRING - dataflow_subnetwork: - defaultValue: '' - isOptional: true - parameterType: STRING - dataflow_use_public_ips: - defaultValue: true - isOptional: true - parameterType: BOOLEAN - dataflow_workers_num: - defaultValue: 10.0 - isOptional: true - parameterType: NUMBER_INTEGER - encryption_spec_key_name: - defaultValue: '' - isOptional: true - parameterType: STRING - example_weight_column: - defaultValue: '' - isOptional: true - parameterType: STRING - ground_truth_column: - parameterType: STRING - ground_truth_format: - defaultValue: jsonl - isOptional: true - parameterType: STRING - location: - defaultValue: us-central1 - isOptional: true - parameterType: STRING - prediction_id_column: - defaultValue: '' - isOptional: true - parameterType: STRING - prediction_label_column: - defaultValue: '' - isOptional: true - parameterType: STRING - prediction_score_column: - defaultValue: '' - isOptional: true - parameterType: STRING - predictions_format: - defaultValue: jsonl - isOptional: true - parameterType: STRING - problem_type: - parameterType: STRING - project: - parameterType: STRING - root_dir: - parameterType: STRING - outputDefinitions: - artifacts: - evaluation_metrics: - artifactType: - schemaTitle: system.Metrics - schemaVersion: 0.0.1 - parameters: - gcp_resources: - parameterType: STRING - comp-model-upload: - executorLabel: exec-model-upload - inputDefinitions: - artifacts: - parent_model: - artifactType: - schemaTitle: google.VertexModel - schemaVersion: 0.0.1 - description: An artifact of a model which to upload a new version to. Only - specify this field when uploading a new version. [More information.](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.models/upload#request-body) - isOptional: true - unmanaged_container_model: - artifactType: - schemaTitle: google.UnmanagedContainerModel - schemaVersion: 0.0.1 - description: "The unmanaged container model to be uploaded. The Model can\ - \ be passed from an upstream step or imported via a KFP `dsl.importer`.\n\ - :Examples:\n ::\n\n from kfp import dsl\n from google_cloud_pipeline_components.google_cloud_pipeline_components.types\ - \ import artifact_types\n\n importer_spec = dsl.importer(\n artifact_uri='gs://managed-pipeline-gcpc-e2e-test/automl-tabular/model',\n\ - \ artifact_class=artifact_types.UnmanagedContainerModel,\n metadata={\n\ - \ 'containerSpec': { 'imageUri':\n 'us-docker.pkg.dev/vertex-ai/automl-tabular/prediction-server:prod'\n\ - \ }\n })" - isOptional: true - parameters: - description: - defaultValue: '' - description: The description of the Model. 
[More information.](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.models#Model) - isOptional: true - parameterType: STRING - display_name: - description: 'The display name of the Model. The name - - can be up to 128 characters long and can be consist of any UTF-8 - - characters. [More information.](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.models#Model)' - parameterType: STRING - encryption_spec_key_name: - defaultValue: '' - description: 'Customer-managed encryption - - key spec for a Model. If set, this Model and all sub-resources of this - - Model will be secured by this key. Has the form: - - `projects/my-project/locations/my-location/keyRings/my-kr/cryptoKeys/my-key`. - - The key needs to be in the same region as where the compute resource - - is created.' - isOptional: true - parameterType: STRING - explanation_metadata: - defaultValue: {} - description: 'Metadata describing the Model''s - - input and output for explanation. Both `explanation_metadata` and `explanation_parameters` - must be passed together when used. [More information.](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/ExplanationSpec#explanationmetadata)' - isOptional: true - parameterType: STRUCT - explanation_parameters: - defaultValue: {} - description: 'Parameters to configure - - explaining for Model''s predictions. [More information.](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/ExplanationSpec#ExplanationParameters)' - isOptional: true - parameterType: STRUCT - labels: - defaultValue: {} - description: 'The labels with user-defined metadata to - - organize your model. Label keys and values can be no longer than 64 - - characters (Unicode codepoints), can only contain lowercase letters, - - numeric characters, underscores and dashes. International characters - - are allowed. See https://goo.gl/xmQnxf for more information and - - examples of labels.' - isOptional: true - parameterType: STRUCT - location: - defaultValue: us-central1 - description: 'Optional location to upload this Model to. If - - not set, defaults to `us-central1`.' - isOptional: true - parameterType: STRING - project: - defaultValue: '{{$.pipeline_google_cloud_project_id}}' - description: Project to upload this Model to. Defaults to the project in - which the PipelineJob is run. - isOptional: true - parameterType: STRING - outputDefinitions: - artifacts: - model: - artifactType: - schemaTitle: google.VertexModel - schemaVersion: 0.0.1 - description: Artifact tracking the created Model. - parameters: - gcp_resources: - description: Serialized JSON of `gcp_resources` [proto](https://github.com/kubeflow/pipelines/tree/master/components/google-cloud/google_cloud_pipeline_components/proto) - which tracks the upload Model's long-running operation. - parameterType: STRING - comp-parse-worker-pool-specs-override: - executorLabel: exec-parse-worker-pool-specs-override - inputDefinitions: - parameters: - worker_pool_specs_override: - description: 'The list of dictionaries for overriding training - - and evaluation worker pool specs.' - parameterType: LIST - outputDefinitions: - parameters: - eval_machine_spec: - description: The eval machine spec. - parameterType: STRUCT - eval_replica_count: - description: The replica count for eval. - parameterType: NUMBER_INTEGER - training_disk_spec: - description: The training disk spec. - parameterType: STRUCT - training_machine_spec: - description: The training machine spec. 
- parameterType: STRUCT - comp-set-optional-inputs: - executorLabel: exec-set-optional-inputs - inputDefinitions: - artifacts: - vertex_dataset: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The Vertex dataset when data source is Vertex dataset. - parameters: - data_source_bigquery_table_path: - description: The BigQuery table when data source is BQ. - parameterType: STRING - data_source_csv_filenames: - description: The CSV GCS path when data source is CSV. - parameterType: STRING - location: - description: The GCP region that runs the pipeline components. - parameterType: STRING - project: - description: The GCP project that runs the pipeline components. - parameterType: STRING - outputDefinitions: - parameters: - data_source_bigquery_table_path: - parameterType: STRING - data_source_csv_filenames: - parameterType: STRING - comp-split-materialized-data: - executorLabel: exec-split-materialized-data - inputDefinitions: - artifacts: - materialized_data: - artifactType: - schemaTitle: system.Dataset - schemaVersion: 0.0.1 - description: 'Materialized dataset output by the Feature - - Transform Engine.' - outputDefinitions: - artifacts: - materialized_eval_split: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path patern to materialized eval split. - materialized_test_split: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path patern to materialized test split. - materialized_train_split: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path patern to materialized train split. - comp-training-configurator-and-validator: - executorLabel: exec-training-configurator-and-validator - inputDefinitions: - artifacts: - dataset_stats: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Dataset stats generated by feature transform engine. - instance_schema: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Schema of input data to the tf_model at serving time. - training_schema: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - parameters: - available_at_forecast_columns: - defaultValue: [] - description: The names of the columns that are available at forecast time. - isOptional: true - parameterType: LIST - context_window: - defaultValue: -1.0 - description: The length of the context window. - isOptional: true - parameterType: NUMBER_INTEGER - enable_probabilistic_inference: - defaultValue: false - description: If probabilistic inference is enabled, the model will fit a - distribution that captures the uncertainty of a prediction. At inference - time, the predictive distribution is used to make a point prediction that - minimizes the optimization objective. For example, the mean of a predictive - distribution is the point prediction that minimizes RMSE loss. If quantiles - are specified, then the quantiles of the distribution are also returned. - isOptional: true - parameterType: BOOLEAN - forecast_horizon: - defaultValue: -1.0 - description: The length of the forecast horizon. - isOptional: true - parameterType: NUMBER_INTEGER - forecasting_model_type: - defaultValue: '' - description: The model types, e.g. l2l, seq2seq, tft. - isOptional: true - parameterType: STRING - forecasting_transformations: - defaultValue: {} - description: Dict mapping auto and/or type-resolutions to feature columns. 
- The supported types are auto, categorical, numeric, text, and timestamp. - isOptional: true - parameterType: STRUCT - group_columns: - description: A list of time series attribute column names that define the - time series hierarchy. - isOptional: true - parameterType: LIST - group_temporal_total_weight: - defaultValue: 0.0 - description: The weight of the loss for predictions aggregated over both - the horizon and time series in the same hierarchy group. - isOptional: true - parameterType: NUMBER_DOUBLE - group_total_weight: - defaultValue: 0.0 - description: The weight of the loss for predictions aggregated over time - series in the same group. - isOptional: true - parameterType: NUMBER_DOUBLE - optimization_objective: - defaultValue: '' - description: 'Objective function the model is optimizing towards. The training - process creates a model that maximizes/minimizes the value of the objective - function over the validation set. The supported optimization objectives - depend on the prediction type. If the field is not set, a default objective - function is used. classification: "maximize-au-roc" (default) - Maximize - the area under the receiver operating characteristic (ROC) curve. "minimize-log-loss" - - Minimize log loss. "maximize-au-prc" - Maximize the area under the precision-recall - curve. "maximize-precision-at-recall" - Maximize precision for a specified - recall value. "maximize-recall-at-precision" - Maximize recall for a specified - precision value. classification (multi-class): "minimize-log-loss" (default) - - Minimize log loss. regression: "minimize-rmse" (default) - Minimize - root-mean-squared error (RMSE). "minimize-mae" - Minimize mean-absolute - error (MAE). "minimize-rmsle" - Minimize root-mean-squared log error - (RMSLE).' - isOptional: true - parameterType: STRING - optimization_objective_precision_value: - defaultValue: -1.0 - description: Required when optimization_objective is "maximize-recall-at-precision". - Must be between 0 and 1, inclusive. - isOptional: true - parameterType: NUMBER_DOUBLE - optimization_objective_recall_value: - defaultValue: -1.0 - description: Required when optimization_objective is "maximize-precision-at-recall". - Must be between 0 and 1, inclusive. - isOptional: true - parameterType: NUMBER_DOUBLE - prediction_type: - defaultValue: '' - description: Model prediction type. One of "classification", "regression", - "time_series". - isOptional: true - parameterType: STRING - quantiles: - defaultValue: [] - description: All quantiles that the model need to predict. - isOptional: true - parameterType: LIST - run_distill: - defaultValue: false - description: Whether the distillation should be applied to the training. - isOptional: true - parameterType: BOOLEAN - run_evaluation: - defaultValue: false - description: Whether we are running evaluation in the training pipeline. - isOptional: true - parameterType: BOOLEAN - split_example_counts: - description: JSON string of data split example counts for train, validate, - and test splits. - parameterType: STRING - stage_1_deadline_hours: - description: Stage 1 training budget in hours. - isOptional: true - parameterType: NUMBER_DOUBLE - stage_2_deadline_hours: - description: Stage 2 training budget in hours. - isOptional: true - parameterType: NUMBER_DOUBLE - target_column: - defaultValue: '' - description: Target column of input data. 
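A small illustrative example of the pairing rule described above; the values are placeholders.

.. code-block:: python

    # Illustrative only: maximize-precision-at-recall requires a companion
    # recall value in [0, 1], per the parameter descriptions above.
    prediction_type = "classification"
    optimization_objective = "maximize-precision-at-recall"
    optimization_objective_recall_value = 0.8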
- isOptional: true - parameterType: STRING - temporal_total_weight: - defaultValue: 0.0 - description: The weight of the loss for predictions aggregated over the - horizon for a single time series. - isOptional: true - parameterType: NUMBER_DOUBLE - time_column: - defaultValue: '' - description: The column that indicates the time. Used by forecasting only. - isOptional: true - parameterType: STRING - time_series_attribute_columns: - defaultValue: [] - description: The column names of the time series attributes. - isOptional: true - parameterType: LIST - time_series_identifier_column: - description: '[Deprecated] The time series identifier column. Used by forecasting - only. Raises exception if used - use the "time_series_identifier_column" - field instead.' - isOptional: true - parameterType: STRING - time_series_identifier_columns: - defaultValue: [] - description: The list of time series identifier columns. Used by forecasting - only. - isOptional: true - parameterType: LIST - unavailable_at_forecast_columns: - defaultValue: [] - description: The names of the columns that are not available at forecast - time. - isOptional: true - parameterType: LIST - weight_column: - defaultValue: '' - description: Weight column of input data. - isOptional: true - parameterType: STRING - outputDefinitions: - artifacts: - instance_baseline: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - metadata: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The tabular example gen metadata. - comp-wide-and-deep-hyperparameter-tuning-job: - executorLabel: exec-wide-and-deep-hyperparameter-tuning-job - inputDefinitions: - artifacts: - instance_baseline: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The path to a JSON file for baseline values. - materialized_eval_split: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The path to the materialized validation split. - materialized_train_split: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The path to the materialized train split. - metadata: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Amount of time in seconds to run the trainer for. - training_schema_uri: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The path to the training schema. - transform_output: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The path to transform output. - parameters: - cache_data: - defaultValue: auto - description: Whether to cache data or not. If set to 'auto', caching is - determined based on the dataset size. - isOptional: true - parameterType: STRING - enable_profiler: - defaultValue: false - description: Enables profiling and saves a trace during evaluation. - isOptional: true - parameterType: BOOLEAN - encryption_spec_key_name: - defaultValue: '' - description: The KMS key name. - isOptional: true - parameterType: STRING - eval_frequency_secs: - defaultValue: 600.0 - description: Frequency at which evaluation and checkpointing will take place. - isOptional: true - parameterType: NUMBER_INTEGER - eval_steps: - defaultValue: 0.0 - description: Number of steps to run evaluation for. If not specified or - negative, it means run evaluation on the whole validation dataset. If - set to 0, it means run evaluation for a fixed number of samples. 
- isOptional: true - parameterType: NUMBER_INTEGER - location: - description: The GCP region that runs the pipeline components. - parameterType: STRING - max_failed_trial_count: - defaultValue: 0.0 - description: The number of failed trials that need to be seen before failing - the HyperparameterTuningJob. If set to 0, Vertex AI decides how many trials - must fail before the whole job fails. - isOptional: true - parameterType: NUMBER_INTEGER - max_trial_count: - description: The desired total number of trials. - parameterType: NUMBER_INTEGER - parallel_trial_count: - description: The desired number of trials to run in parallel. - parameterType: NUMBER_INTEGER - prediction_type: - description: The type of prediction the model is to produce. "classification" - or "regression". - parameterType: STRING - project: - description: The GCP project that runs the pipeline components. - parameterType: STRING - root_dir: - description: The root GCS directory for the pipeline components. - parameterType: STRING - seed: - defaultValue: 1.0 - description: Seed to be used for this run. - isOptional: true - parameterType: NUMBER_INTEGER - study_spec_algorithm: - defaultValue: ALGORITHM_UNSPECIFIED - description: The search algorithm specified for the study. One of 'ALGORITHM_UNSPECIFIED', - 'GRID_SEARCH', or 'RANDOM_SEARCH'. - isOptional: true - parameterType: STRING - study_spec_measurement_selection_type: - defaultValue: BEST_MEASUREMENT - description: Which measurement to use if/when the service automatically - selects the final measurement from previously reported intermediate measurements. - One of "BEST_MEASUREMENT" or "LAST_MEASUREMENT". - isOptional: true - parameterType: STRING - study_spec_metric_goal: - description: 'Optimization goal of the metric, possible values: "MAXIMIZE", - "MINIMIZE".' - parameterType: STRING - study_spec_metric_id: - description: 'Metric to optimize, possible values: [ ''loss'', ''average_loss'', - ''rmse'', ''mae'', ''mql'', ''accuracy'', ''auc'', ''precision'', ''recall''].' - parameterType: STRING - study_spec_parameters_override: - description: List of dictionaries representing parameters to optimize. The - dictionary key is the parameter_id, which is passed to training job as - a command line argument, and the dictionary value is the parameter specification - of the metric. - parameterType: LIST - target_column: - description: The target column name. - parameterType: STRING - training_disk_spec: - defaultValue: - boot_disk_size_gb: 100.0 - boot_disk_type: pd-ssd - description: The training disk spec. - isOptional: true - parameterType: STRUCT - training_machine_spec: - defaultValue: - machine_type: c2-standard-16 - description: The training machine spec. See https://cloud.google.com/compute/docs/machine-types - for options. - isOptional: true - parameterType: STRUCT - weight_column: - defaultValue: '' - description: The weight column name. - isOptional: true - parameterType: STRING - outputDefinitions: - parameters: - execution_metrics: - description: Core metrics in dictionary of hyperparameter tuning job execution. - parameterType: STRUCT - gcp_resources: - description: Serialized gcp_resources proto tracking the custom training - job. - parameterType: STRING - instance_schema_uri: - description: The path to the instance schema. - parameterType: STRING - prediction_docker_uri_output: - description: The URI of the prediction container. - parameterType: STRING - prediction_schema_uri: - description: The path to the prediction schema. 
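An illustrative sketch of the hyperparameter-tuning inputs documented above, using the defaults shown in this spec; the trial counts and the learning-rate range are placeholder values, not recommendations.

.. code-block:: python

    # Illustrative only: names and defaults mirror the component spec above.
    hpt_params = {
        "study_spec_metric_id": "loss",
        "study_spec_metric_goal": "MINIMIZE",
        "study_spec_algorithm": "ALGORITHM_UNSPECIFIED",
        "max_trial_count": 20,
        "parallel_trial_count": 5,
        "max_failed_trial_count": 0,  # 0 lets Vertex AI decide when to fail the job
        "training_machine_spec": {"machine_type": "c2-standard-16"},
        "training_disk_spec": {"boot_disk_type": "pd-ssd", "boot_disk_size_gb": 100},
        # Override one of the default search dimensions; unknown parameter_ids
        # are ignored with a warning by get_wide_and_deep_study_spec_parameters.
        "study_spec_parameters_override": [
            {
                "parameter_id": "learning_rate",
                "double_value_spec": {"min_value": 0.0001, "max_value": 0.001},
                "scale_type": "UNIT_LINEAR_SCALE",
            }
        ],
    }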
- parameterType: STRING - trials: - description: The path to the hyperparameter tuning trials - parameterType: STRING -deploymentSpec: - executors: - exec-automl-tabular-finalizer: - container: - args: - - --type - - CustomJob - - --project - - '{{$.inputs.parameters[''project'']}}' - - --location - - '{{$.inputs.parameters[''location'']}}' - - --gcp_resources - - '{{$.outputs.parameters[''gcp_resources''].output_file}}' - - --payload - - '{"Concat": ["{\"display_name\": \"automl-tabular-finalizer-{{$.pipeline_job_uuid}}-{{$.pipeline_task_uuid}}\", - \"encryption_spec\": {\"kms_key_name\":\"", "{{$.inputs.parameters[''encryption_spec_key_name'']}}", - "\"}, \"job_spec\": {\"worker_pool_specs\": [{\"replica_count\": 1, \"machine_spec\": - {\"machine_type\": \"n1-standard-8\"}, \"container_spec\": {\"image_uri\":\"", - "us-docker.pkg.dev/vertex-ai-restricted/automl-tabular/training:20251102_1045", "\", - \"args\": [\"cancel_l2l_tuner\", \"--error_file_path=", "{{$.inputs.parameters[''root_dir'']}}", - "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/error.pb\", \"--cleanup_lro_job_infos=", - "{{$.inputs.parameters[''root_dir'']}}", "/{{$.pipeline_job_uuid}}/lro\"]}}]}}"]}' - command: - - python3 - - -u - - -m - - google_cloud_pipeline_components.container.v1.custom_job.launcher - image: gcr.io/ml-pipeline/google-cloud-pipeline-components:1.0.44 - exec-automl-tabular-infra-validator: - container: - args: - - --executor_input - - '{{$}}' - image: us-docker.pkg.dev/vertex-ai/automl-tabular/prediction-server:20251102_1045 - resources: - cpuLimit: 8.0 - memoryLimit: 52.0 - exec-bool-identity: - container: - args: - - --executor_input - - '{{$}}' - - --function_to_execute - - _bool_identity - command: - - sh - - -ec - - 'program_path=$(mktemp -d) - - printf "%s" "$0" > "$program_path/ephemeral_component.py" - - python3 -m kfp.components.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" - - ' - - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef _bool_identity(value: bool) -> str:\n \"\"\"Returns boolean\ - \ value.\n\n Args:\n value: Boolean value to return\n\n Returns:\n\ - \ Boolean value.\n \"\"\"\n return 'true' if value else 'false'\n\n" - image: us-docker.pkg.dev/vertex-ai/automl-tabular/kfp-v2-base:20251102_1045 - exec-feature-transform-engine: - container: - args: - - feature_transform_engine - - '{"Concat": ["--project=", "{{$.inputs.parameters[''project'']}}"]}' - - '{"Concat": ["--location=", "{{$.inputs.parameters[''location'']}}"]}' - - '{"Concat": ["--dataset_level_custom_transformation_definitions=", "{{$.inputs.parameters[''dataset_level_custom_transformation_definitions'']}}"]}' - - '{"Concat": ["--dataset_level_transformations=", "{{$.inputs.parameters[''dataset_level_transformations'']}}"]}' - - '{"Concat": ["--forecasting_time_column=", "{{$.inputs.parameters[''forecasting_time_column'']}}"]}' - - '{"IfPresent": {"InputName": "forecasting_time_series_identifier_column", - "Then": {"Concat": ["--forecasting_time_series_identifier_column=", "{{$.inputs.parameters[''forecasting_time_series_identifier_column'']}}"]}}}' - - '{"Concat": ["--forecasting_time_series_identifier_columns=", "{{$.inputs.parameters[''forecasting_time_series_identifier_columns'']}}"]}' - - '{"Concat": ["--forecasting_time_series_attribute_columns=", "{{$.inputs.parameters[''forecasting_time_series_attribute_columns'']}}"]}' - - '{"Concat": ["--forecasting_unavailable_at_forecast_columns=", 
"{{$.inputs.parameters[''forecasting_unavailable_at_forecast_columns'']}}"]}' - - '{"Concat": ["--forecasting_available_at_forecast_columns=", "{{$.inputs.parameters[''forecasting_available_at_forecast_columns'']}}"]}' - - '{"Concat": ["--forecasting_forecast_horizon=", "{{$.inputs.parameters[''forecasting_forecast_horizon'']}}"]}' - - '{"Concat": ["--forecasting_context_window=", "{{$.inputs.parameters[''forecasting_context_window'']}}"]}' - - '{"Concat": ["--forecasting_predefined_window_column=", "{{$.inputs.parameters[''forecasting_predefined_window_column'']}}"]}' - - '{"Concat": ["--forecasting_window_stride_length=", "{{$.inputs.parameters[''forecasting_window_stride_length'']}}"]}' - - '{"Concat": ["--forecasting_window_max_count=", "{{$.inputs.parameters[''forecasting_window_max_count'']}}"]}' - - '{"Concat": ["--forecasting_holiday_regions=", "{{$.inputs.parameters[''forecasting_holiday_regions'']}}"]}' - - '{"Concat": ["--forecasting_apply_windowing=", "{{$.inputs.parameters[''forecasting_apply_windowing'']}}"]}' - - '{"Concat": ["--predefined_split_key=", "{{$.inputs.parameters[''predefined_split_key'']}}"]}' - - '{"Concat": ["--stratified_split_key=", "{{$.inputs.parameters[''stratified_split_key'']}}"]}' - - '{"Concat": ["--timestamp_split_key=", "{{$.inputs.parameters[''timestamp_split_key'']}}"]}' - - '{"Concat": ["--training_fraction=", "{{$.inputs.parameters[''training_fraction'']}}"]}' - - '{"Concat": ["--validation_fraction=", "{{$.inputs.parameters[''validation_fraction'']}}"]}' - - '{"Concat": ["--test_fraction=", "{{$.inputs.parameters[''test_fraction'']}}"]}' - - '{"Concat": ["--stats_gen_execution_engine=", "{{$.inputs.parameters[''stats_gen_execution_engine'']}}"]}' - - '{"Concat": ["--tf_transform_execution_engine=", "{{$.inputs.parameters[''tf_transform_execution_engine'']}}"]}' - - '{"IfPresent": {"InputName": "tf_auto_transform_features", "Then": {"Concat": - ["--tf_auto_transform_features=", "{{$.inputs.parameters[''tf_auto_transform_features'']}}"]}}}' - - '{"Concat": ["--tf_custom_transformation_definitions=", "{{$.inputs.parameters[''tf_custom_transformation_definitions'']}}"]}' - - '{"Concat": ["--tf_transformations_path=", "{{$.inputs.parameters[''tf_transformations_path'']}}"]}' - - '{"Concat": ["--legacy_transformations_path=", "{{$.inputs.parameters[''legacy_transformations_path'']}}"]}' - - '{"Concat": ["--data_source_csv_filenames=", "{{$.inputs.parameters[''data_source_csv_filenames'']}}"]}' - - '{"Concat": ["--data_source_bigquery_table_path=", "{{$.inputs.parameters[''data_source_bigquery_table_path'']}}"]}' - - '{"Concat": ["--bigquery_staging_full_dataset_id=", "{{$.inputs.parameters[''bigquery_staging_full_dataset_id'']}}"]}' - - '{"Concat": ["--target_column=", "{{$.inputs.parameters[''target_column'']}}"]}' - - '{"Concat": ["--weight_column=", "{{$.inputs.parameters[''weight_column'']}}"]}' - - '{"Concat": ["--prediction_type=", "{{$.inputs.parameters[''prediction_type'']}}"]}' - - '{"IfPresent": {"InputName": "model_type", "Then": {"Concat": ["--model_type=", - "{{$.inputs.parameters[''model_type'']}}"]}}}' - - '{"Concat": ["--multimodal_tabular_columns=", "{{$.inputs.parameters[''multimodal_tabular_columns'']}}"]}' - - '{"Concat": ["--multimodal_timeseries_columns=", "{{$.inputs.parameters[''multimodal_timeseries_columns'']}}"]}' - - '{"Concat": ["--multimodal_text_columns=", "{{$.inputs.parameters[''multimodal_text_columns'']}}"]}' - - '{"Concat": ["--multimodal_image_columns=", "{{$.inputs.parameters[''multimodal_image_columns'']}}"]}' - 
- '{"Concat": ["--run_distill=", "{{$.inputs.parameters[''run_distill'']}}"]}' - - '{"Concat": ["--run_feature_selection=", "{{$.inputs.parameters[''run_feature_selection'']}}"]}' - - '{"Concat": ["--materialized_examples_format=", "{{$.inputs.parameters[''materialized_examples_format'']}}"]}' - - '{"Concat": ["--max_selected_features=", "{{$.inputs.parameters[''max_selected_features'']}}"]}' - - '{"Concat": ["--feature_selection_staging_dir=", "{{$.inputs.parameters[''root_dir'']}}", - "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/feature_selection_staging_dir"]}' - - '{"Concat": ["--feature_selection_algorithm=", "{{$.inputs.parameters[''feature_selection_algorithm'']}}"]}' - - '{"Concat": ["--feature_selection_execution_engine=", "{{$.inputs.parameters[''feature_selection_execution_engine'']}}"]}' - - '{"Concat": ["--feature_ranking_path=", "{{$.outputs.artifacts[''feature_ranking''].uri}}"]}' - - '{"Concat": ["--error_file_path=", "{{$.inputs.parameters[''root_dir'']}}", - "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/error.txt"]}' - - '{"Concat": ["--stats_result_path=", "{{$.outputs.artifacts[''dataset_stats''].uri}}"]}' - - '{"Concat": ["--transform_output_artifact_path=", "{{$.outputs.artifacts[''transform_output''].uri}}"]}' - - '{"Concat": ["--transform_output_path=", "{{$.inputs.parameters[''root_dir'']}}", - "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/transform"]}' - - '{"Concat": ["--materialized_examples_path=", "{{$.inputs.parameters[''root_dir'']}}", - "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/materialized"]}' - - '{"Concat": ["--export_data_path=", "{{$.inputs.parameters[''root_dir'']}}", - "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/export"]}' - - '{"Concat": ["--materialized_data_path=", "{{$.inputs.parameters[''root_dir'']}}", - "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/materialized_data"]}' - - '{"Concat": ["--materialized_data_artifact_path=", "{{$.outputs.artifacts[''materialized_data''].uri}}"]}' - - '{"Concat": ["--bigquery_train_split_uri_path=", "{{$.outputs.parameters[''bigquery_train_split_uri''].output_file}}"]}' - - '{"Concat": ["--bigquery_validation_split_uri_path=", "{{$.outputs.parameters[''bigquery_validation_split_uri''].output_file}}"]}' - - '{"Concat": ["--bigquery_test_split_uri_path=", "{{$.outputs.parameters[''bigquery_test_split_uri''].output_file}}"]}' - - '{"Concat": ["--bigquery_downsampled_test_split_uri_path=", "{{$.outputs.parameters[''bigquery_downsampled_test_split_uri''].output_file}}"]}' - - '{"Concat": ["--split_example_counts_path=", "{{$.outputs.parameters[''split_example_counts''].output_file}}"]}' - - '{"Concat": ["--instance_schema_path=", "{{$.outputs.artifacts[''instance_schema''].path}}"]}' - - '{"Concat": ["--training_schema_path=", "{{$.outputs.artifacts[''training_schema''].path}}"]}' - - --job_name=feature-transform-engine-{{$.pipeline_job_uuid}}-{{$.pipeline_task_uuid}} - - '{"Concat": ["--dataflow_project=", "{{$.inputs.parameters[''project'']}}"]}' - - '{"Concat": ["--dataflow_staging_dir=", "{{$.inputs.parameters[''root_dir'']}}", - "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/dataflow_staging"]}' - - '{"Concat": ["--dataflow_tmp_dir=", "{{$.inputs.parameters[''root_dir'']}}", - "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/dataflow_tmp"]}' - - '{"Concat": ["--dataflow_max_num_workers=", "{{$.inputs.parameters[''dataflow_max_num_workers'']}}"]}' - - '{"Concat": ["--dataflow_machine_type=", "{{$.inputs.parameters[''dataflow_machine_type'']}}"]}' - - 
--dataflow_worker_container_image=us-docker.pkg.dev/vertex-ai/automl-tabular/dataflow-worker:20251102_1045 - - --feature_transform_engine_docker_uri=us-docker.pkg.dev/vertex-ai/automl-tabular/feature-transform-engine:20251102_1045 - - '{"Concat": ["--dataflow_disk_size_gb=", "{{$.inputs.parameters[''dataflow_disk_size_gb'']}}"]}' - - '{"Concat": ["--dataflow_subnetwork_fully_qualified=", "{{$.inputs.parameters[''dataflow_subnetwork'']}}"]}' - - '{"Concat": ["--dataflow_use_public_ips=", "{{$.inputs.parameters[''dataflow_use_public_ips'']}}"]}' - - '{"Concat": ["--dataflow_service_account=", "{{$.inputs.parameters[''dataflow_service_account'']}}"]}' - - '{"Concat": ["--dataflow_kms_key=", "{{$.inputs.parameters[''encryption_spec_key_name'']}}"]}' - - '{"Concat": ["--autodetect_csv_schema=", "{{$.inputs.parameters[''autodetect_csv_schema'']}}"]}' - - '{"Concat": ["--gcp_resources_path=", "{{$.outputs.parameters[''gcp_resources''].output_file}}"]}' - - '{"IfPresent": {"InputName": "group_columns", "Then": {"Concat": ["--group_columns=", - "{{$.inputs.parameters[''group_columns'']}}"]}}}' - - '{"IfPresent": {"InputName": "group_total_weight", "Then": {"Concat": ["--group_total_weight=", - "{{$.inputs.parameters[''group_total_weight'']}}"]}}}' - - '{"IfPresent": {"InputName": "temporal_total_weight", "Then": {"Concat": - ["--temporal_total_weight=", "{{$.inputs.parameters[''temporal_total_weight'']}}"]}}}' - - '{"IfPresent": {"InputName": "group_temporal_total_weight", "Then": {"Concat": - ["--group_temporal_total_weight=", "{{$.inputs.parameters[''group_temporal_total_weight'']}}"]}}}' - - '{"Concat": ["--encryption_spec_key_name=", "{{$.inputs.parameters[''encryption_spec_key_name'']}}"]}' - image: us-docker.pkg.dev/vertex-ai/automl-tabular/feature-transform-engine:20251102_1045 - resources: - cpuLimit: 8.0 - memoryLimit: 30.0 - exec-get-best-hyperparameter-tuning-job-trial: - container: - args: - - --executor_input - - '{{$}}' - - --function_to_execute - - _get_best_hyperparameter_tuning_job_trial - command: - - sh - - -ec - - 'program_path=$(mktemp -d) - - printf "%s" "$0" > "$program_path/ephemeral_component.py" - - python3 -m kfp.components.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" - - ' - - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef _get_best_hyperparameter_tuning_job_trial(\n gcp_resources:\ - \ str,\n study_spec_metric_goal: str,\n unmanaged_container_model:\ - \ dsl.Output[dsl.Artifact],\n trials_dir: str = '',\n instance_schema_uri:\ - \ str = '',\n prediction_schema_uri: str = '',\n prediction_docker_uri:\ - \ str = '',\n read_value_from_file: bool = False,\n):\n \"\"\"Gets best\ - \ HyperparameterTuningJob trial.\n\n Args:\n gcp_resources: Proto tracking\ - \ the hyperparameter tuning job.\n study_spec_metric_goal: Optimization\ - \ goal of the metric, possible values:\n \"MAXIMIZE\", \"MINIMIZE\"\ - .\n unmanaged_container_model: The unmanaged model.\n trials_dir:\ - \ The path to the hyperparameter tuning trials.\n instance_schema_uri:\ - \ The instance schema uri.\n prediction_schema_uri: The prediction schema_uri.\n\ - \ prediction_docker_uri: The prediction docker container uri.\n read_value_from_file:\ - \ If true, read file to get the relevant value.\n\n Raises:\n RuntimeError:\ - \ If there are multiple metrics.\n \"\"\"\n\n import os\n import json\n\ - \ from google.api_core.retry import Retry\n from google.cloud import aiplatform_v1beta1\ - \ as aip\n import tensorflow as tf\n\n # If 
path to file with value is\ - \ provided, read the file before continuing.\n if read_value_from_file:\n\ - \ with tf.io.gfile.GFile(trials_dir, 'r') as f:\n trials_dir = f.read()\n\ - \ with tf.io.gfile.GFile(instance_schema_uri, 'r') as f:\n instance_schema_uri\ - \ = f.read()\n with tf.io.gfile.GFile(prediction_schema_uri, 'r') as\ - \ f:\n prediction_schema_uri = f.read()\n with tf.io.gfile.GFile(prediction_docker_uri,\ - \ 'r') as f:\n prediction_docker_uri = f.read()\n\n api_endpoint_suffix\ - \ = '-aiplatform.googleapis.com'\n gcp_resources_json = json.loads(gcp_resources)\n\ - \ resource = gcp_resources_json['resources'][0]\n\n uri_key = 'resource_uri'\n\ - \ if uri_key not in resource:\n uri_key = 'resourceUri'\n\n gcp_resources_split\ - \ = resource[uri_key].partition('projects')\n resource_name = gcp_resources_split[1]\ - \ + gcp_resources_split[2]\n prefix_str = gcp_resources_split[0]\n prefix_str\ - \ = prefix_str[: prefix_str.find(api_endpoint_suffix)]\n api_endpoint =\ - \ (\n prefix_str[(prefix_str.rfind('//') + 2) :] + api_endpoint_suffix\n\ - \ )\n\n job_client = aip.JobServiceClient(\n client_options={'api_endpoint':\ - \ api_endpoint}\n )\n response = job_client.get_hyperparameter_tuning_job(\n\ - \ name=resource_name,\n retry=Retry(initial=10.0, maximum=60.0,\ - \ deadline=10.0 * 60.0),\n )\n\n # Get best trial\n trials_list = []\n\ - \ for trial in response.trials:\n if trial.final_measurement:\n \ - \ trials_list.append({\n 'id': trial.id,\n 'objective_value':\ - \ trial.final_measurement.metrics[0].value,\n })\n\n if study_spec_metric_goal\ - \ == 'MAXIMIZE':\n best_fn = max\n elif study_spec_metric_goal == 'MINIMIZE':\n\ - \ best_fn = min\n else:\n raise ValueError(\n f'Unexpected\ - \ study spec metric goal: {study_spec_metric_goal}'\n )\n\n best_trial\ - \ = best_fn(trials_list, key=lambda trial: trial['objective_value'])\n\n\ - \ # Build unmanaged_container_model\n unmanaged_container_model.metadata['containerSpec']\ - \ = {\n 'imageUri': prediction_docker_uri,\n 'healthRoute': '/health',\n\ - \ 'predictRoute': '/predict',\n }\n unmanaged_container_model.metadata['predictSchemata']\ - \ = {\n 'instanceSchemaUri': instance_schema_uri,\n 'predictionSchemaUri':\ - \ prediction_schema_uri,\n }\n unmanaged_container_model.uri = os.path.join(\n\ - \ trials_dir, 'trial_{}'.format(best_trial['id']), 'model'\n )\n\n" - image: us-docker.pkg.dev/vertex-ai/automl-tabular/kfp-v2-base:20251102_1045 - exec-get-model-display-name: - container: - args: - - --executor_input - - '{{$}}' - - --function_to_execute - - _get_model_display_name - command: - - sh - - -ec - - 'program_path=$(mktemp -d) - - printf "%s" "$0" > "$program_path/ephemeral_component.py" - - python3 -m kfp.components.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" - - ' - - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef _get_model_display_name(\n model_display_name: str,\n) ->\ - \ NamedTuple('Outputs', [('model_display_name', str),]):\n \"\"\"Returns\ - \ the model display name.\"\"\"\n # pylint: disable=g-import-not-at-top,import-outside-toplevel,redefined-outer-name\n\ - \ import collections\n import uuid\n # pylint: enable=g-import-not-at-top,import-outside-toplevel,redefined-outer-name\n\ - \n if not model_display_name:\n model_display_name = f'tabular-workflow-model-{uuid.uuid4()}'\n\ - \n return collections.namedtuple(\n 'Outputs',\n [\n \ - \ 'model_display_name',\n ],\n )(\n model_display_name,\n )\n\ - \n" - image: 
us-docker.pkg.dev/vertex-ai/automl-tabular/kfp-v2-base:20251102_1045 - exec-get-wide-and-deep-study-spec-parameters: - container: - args: - - --executor_input - - '{{$}}' - - --function_to_execute - - get_wide_and_deep_study_spec_parameters - command: - - sh - - -c - - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ - \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ - \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.0.0-rc.2'\ - \ && \"$0\" \"$@\"\n" - - sh - - -ec - - 'program_path=$(mktemp -d) - - printf "%s" "$0" > "$program_path/ephemeral_component.py" - - python3 -m kfp.components.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" - - ' - - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef get_wide_and_deep_study_spec_parameters(\n study_spec_parameters_override:\ - \ list # Required for KFP validation; pylint:disable=g-bare-generic\n)\ - \ -> list: # Required by KFP; pylint:disable=g-bare-generic\n \"\"\"Get\ - \ study_spec_parameters for a Wide & Deep hyperparameter tuning job.\n\n\ - \ Args:\n study_spec_parameters_override: List of dictionaries representing\ - \ parameters\n to optimize. The dictionary key is the parameter_id,\ - \ which is passed to\n training job as a command line argument, and\ - \ the dictionary value is the\n parameter specification of the metric.\n\ - \n Returns:\n List of final Vizier study_spec_parameters of type ParameterSpec.\n\ - \ \"\"\"\n default_params = [\n {\n 'parameter_id': 'max_steps',\n\ - \ 'discrete_value_spec': {\n 'values': [5000, 10000,\ - \ 20000, 30000, 40000, 50000]\n },\n },\n {\n \ - \ 'parameter_id': 'max_train_secs',\n 'discrete_value_spec':\ - \ {'values': [-1]},\n },\n {\n 'parameter_id': 'learning_rate',\n\ - \ 'double_value_spec': {'min_value': 0.0001, 'max_value': 0.0005},\n\ - \ 'scale_type': 'UNIT_LINEAR_SCALE',\n },\n {\n \ - \ 'parameter_id': 'optimizer_type',\n 'categorical_value_spec':\ - \ {'values': ['adam', 'ftrl', 'sgd']},\n },\n {\n 'parameter_id':\ - \ 'l1_regularization_strength',\n 'discrete_value_spec': {'values':\ - \ [0, 0.01, 0.02]},\n },\n {\n 'parameter_id': 'l2_regularization_strength',\n\ - \ 'discrete_value_spec': {'values': [0, 0.01, 0.02]},\n },\n\ - \ {\n 'parameter_id': 'l2_shrinkage_regularization_strength',\n\ - \ 'discrete_value_spec': {'values': [0, 0.01, 0.02]},\n },\n\ - \ {\n 'parameter_id': 'beta_1',\n 'discrete_value_spec':\ - \ {'values': [0.7, 0.8, 0.9]},\n },\n {\n 'parameter_id':\ - \ 'beta_2',\n 'discrete_value_spec': {'values': [0.8, 0.9, 0.999]},\n\ - \ },\n {\n 'parameter_id': 'hidden_units',\n \ - \ 'categorical_value_spec': {'values': ['30,30,30']},\n },\n \ - \ {\n 'parameter_id': 'use_wide',\n 'categorical_value_spec':\ - \ {'values': ['true', 'false']},\n },\n {\n 'parameter_id':\ - \ 'embed_categories',\n 'categorical_value_spec': {'values': ['true',\ - \ 'false']},\n },\n {\n 'parameter_id': 'dnn_dropout',\n\ - \ 'discrete_value_spec': {'values': [0, 0.1, 0.2]},\n },\n\ - \ {\n 'parameter_id': 'dnn_learning_rate',\n 'double_value_spec':\ - \ {'min_value': 0.0001, 'max_value': 0.0005},\n 'scale_type': 'UNIT_LINEAR_SCALE',\n\ - \ },\n {\n 'parameter_id': 'dnn_optimizer_type',\n \ - \ 'categorical_value_spec': {'values': ['adam', 'ftrl', 'sgd']},\n\ - \ },\n {\n 'parameter_id': 'dnn_l1_regularization_strength',\n\ - \ 'discrete_value_spec': {'values': [0, 0.01, 0.02]},\n },\n\ - \ {\n 'parameter_id': 
'dnn_l2_regularization_strength',\n\ - \ 'discrete_value_spec': {'values': [0, 0.01, 0.02]},\n },\n\ - \ {\n 'parameter_id': 'dnn_l2_shrinkage_regularization_strength',\n\ - \ 'discrete_value_spec': {'values': [0, 0.01, 0.02]},\n },\n\ - \ {\n 'parameter_id': 'dnn_beta_1',\n 'discrete_value_spec':\ - \ {'values': [0.7, 0.8, 0.9]},\n },\n {\n 'parameter_id':\ - \ 'dnn_beta_2',\n 'discrete_value_spec': {'values': [0.8, 0.9,\ - \ 0.999]},\n },\n {\n 'parameter_id': 'batch_size',\n\ - \ 'discrete_value_spec': {'values': [1024, 2048, 4096, 8192, 16384]},\n\ - \ },\n ]\n # pylint:disable=g-import-not-at-top,redefined-outer-name\n\ - \ import warnings\n # pylint:enable=g-import-not-at-top,redefined-outer-name\n\ - \n override_params = {}\n for param in study_spec_parameters_override:\n\ - \ override_params[param['parameter_id']] = param\n\n study_spec_parameters\ - \ = []\n for param in default_params:\n study_spec_parameters.append(\n\ - \ override_params.get(param['parameter_id'], param)\n )\n\n extra_overrides\ - \ = set(override_params) - set(\n p['parameter_id'] for p in default_params\n\ - \ )\n if extra_overrides:\n extra_override_str = ', '.join(extra_overrides)\n\ - \ warnings.warn(\n f'The overrides {extra_override_str} were not\ - \ found in the params and '\n 'will be ignored.'\n )\n\n return\ - \ study_spec_parameters\n\n" - image: python:3.7 - exec-model-batch-predict: - container: - args: - - --type - - BatchPredictionJob - - --payload - - '{"Concat": ["{", "\"display_name\": \"", "{{$.inputs.parameters[''job_display_name'']}}", - "\", ", {"IfPresent": {"InputName": "model", "Then": {"Concat": ["\"model\": - \"", "{{$.inputs.artifacts[''model''].metadata[''resourceName'']}}", "\","]}}}, - " \"input_config\": {", "\"instances_format\": \"", "{{$.inputs.parameters[''instances_format'']}}", - "\"", ", \"gcs_source\": {", "\"uris\":", "{{$.inputs.parameters[''gcs_source_uris'']}}", - "}", ", \"bigquery_source\": {", "\"input_uri\": \"", "{{$.inputs.parameters[''bigquery_source_input_uri'']}}", - "\"", "}", "}", ", \"instance_config\": {", "\"instance_type\": \"", "{{$.inputs.parameters[''instance_type'']}}", - "\"", ", \"key_field\": \"", "{{$.inputs.parameters[''key_field'']}}", "\" - ", {"IfPresent": {"InputName": "included_fields", "Then": {"Concat": [", - \"included_fields\": ", "{{$.inputs.parameters[''included_fields'']}}"]}}}, - {"IfPresent": {"InputName": "excluded_fields", "Then": {"Concat": [", \"excluded_fields\": - ", "{{$.inputs.parameters[''excluded_fields'']}}"]}}}, "}", ", \"model_parameters\": - ", "{{$.inputs.parameters[''model_parameters'']}}", ", \"output_config\": - {", "\"predictions_format\": \"", "{{$.inputs.parameters[''predictions_format'']}}", - "\"", ", \"gcs_destination\": {", "\"output_uri_prefix\": \"", "{{$.inputs.parameters[''gcs_destination_output_uri_prefix'']}}", - "\"", "}", ", \"bigquery_destination\": {", "\"output_uri\": \"", "{{$.inputs.parameters[''bigquery_destination_output_uri'']}}", - "\"", "}", "}", ", \"dedicated_resources\": {", "\"machine_spec\": {", "\"machine_type\": - \"", "{{$.inputs.parameters[''machine_type'']}}", "\"", ", \"accelerator_type\": - \"", "{{$.inputs.parameters[''accelerator_type'']}}", "\"", ", \"accelerator_count\": - ", "{{$.inputs.parameters[''accelerator_count'']}}", "}", ", \"starting_replica_count\": - ", "{{$.inputs.parameters[''starting_replica_count'']}}", ", \"max_replica_count\": - ", "{{$.inputs.parameters[''max_replica_count'']}}", "}", ", \"manual_batch_tuning_parameters\": - {", "\"batch_size\": ", 
"{{$.inputs.parameters[''manual_batch_tuning_parameters_batch_size'']}}", - "}", ", \"generate_explanation\": ", "{{$.inputs.parameters[''generate_explanation'']}}", - ", \"explanation_spec\": {", "\"parameters\": ", "{{$.inputs.parameters[''explanation_parameters'']}}", - ", \"metadata\": ", "{{$.inputs.parameters[''explanation_metadata'']}}", - "}", ", \"labels\": ", "{{$.inputs.parameters[''labels'']}}", ", \"encryption_spec\": - {\"kms_key_name\":\"", "{{$.inputs.parameters[''encryption_spec_key_name'']}}", - "\"}", "}"]}' - - --project - - '{{$.inputs.parameters[''project'']}}' - - --location - - '{{$.inputs.parameters[''location'']}}' - - --gcp_resources - - '{{$.outputs.parameters[''gcp_resources''].output_file}}' - - --executor_input - - '{{$}}' - command: - - python3 - - -u - - -m - - google_cloud_pipeline_components.container.v1.batch_prediction_job.launcher - image: gcr.io/ml-pipeline/google-cloud-pipeline-components:2.3.1 - exec-model-evaluation: - container: - args: - - --setup_file - - /setup.py - - --json_mode - - 'true' - - --project_id - - '{{$.inputs.parameters[''project'']}}' - - --location - - '{{$.inputs.parameters[''location'']}}' - - --problem_type - - '{{$.inputs.parameters[''problem_type'']}}' - - --batch_prediction_format - - '{{$.inputs.parameters[''predictions_format'']}}' - - --batch_prediction_gcs_source - - '{{$.inputs.artifacts[''batch_prediction_job''].metadata[''gcsOutputDirectory'']}}' - - --ground_truth_format - - '{{$.inputs.parameters[''ground_truth_format'']}}' - - --key_prefix_in_prediction_dataset - - instance - - --root_dir - - '{{$.inputs.parameters[''root_dir'']}}/{{$.pipeline_job_uuid}}-{{$.pipeline_task_uuid}}' - - --classification_type - - multiclass - - --ground_truth_column - - instance.{{$.inputs.parameters['ground_truth_column']}} - - --prediction_score_column - - '{{$.inputs.parameters[''prediction_score_column'']}}' - - --prediction_label_column - - '{{$.inputs.parameters[''prediction_label_column'']}}' - - --prediction_id_column - - '' - - --example_weight_column - - '' - - --generate_feature_attribution - - 'false' - - --dataflow_job_prefix - - evaluation-{{$.pipeline_job_uuid}}-{{$.pipeline_task_uuid}} - - --dataflow_service_account - - '{{$.inputs.parameters[''dataflow_service_account'']}}' - - --dataflow_disk_size - - '{{$.inputs.parameters[''dataflow_disk_size'']}}' - - --dataflow_machine_type - - '{{$.inputs.parameters[''dataflow_machine_type'']}}' - - --dataflow_workers_num - - '{{$.inputs.parameters[''dataflow_workers_num'']}}' - - --dataflow_max_workers_num - - '{{$.inputs.parameters[''dataflow_max_workers_num'']}}' - - --dataflow_subnetwork - - '{{$.inputs.parameters[''dataflow_subnetwork'']}}' - - --dataflow_use_public_ips - - '{{$.inputs.parameters[''dataflow_use_public_ips'']}}' - - --kms_key_name - - '{{$.inputs.parameters[''encryption_spec_key_name'']}}' - - --output_metrics_gcs_path - - '{{$.outputs.artifacts[''evaluation_metrics''].uri}}' - - --gcp_resources - - '{{$.outputs.parameters[''gcp_resources''].output_file}}' - - --executor_input - - '{{$}}' - command: - - python - - /main.py - image: gcr.io/ml-pipeline/model-evaluation:v0.4 - exec-model-upload: - container: - args: - - --type - - UploadModel - - --payload - - '{"Concat": ["{", "\"display_name\": \"", "{{$.inputs.parameters[''display_name'']}}", - "\"", ", \"description\": \"", "{{$.inputs.parameters[''description'']}}", - "\"", ", \"explanation_spec\": {", "\"parameters\": ", "{{$.inputs.parameters[''explanation_parameters'']}}", - ", \"metadata\": ", 
"{{$.inputs.parameters[''explanation_metadata'']}}", - "}", ", \"encryption_spec\": {\"kms_key_name\":\"", "{{$.inputs.parameters[''encryption_spec_key_name'']}}", - "\"}", ", \"labels\": ", "{{$.inputs.parameters[''labels'']}}", ", \"pipeline_job\": - \"", "projects/{{$.inputs.parameters[''project'']}}/locations/{{$.inputs.parameters[''location'']}}/pipelineJobs/{{$.pipeline_job_uuid}}", - "\"", "}"]}' - - --project - - '{{$.inputs.parameters[''project'']}}' - - --location - - '{{$.inputs.parameters[''location'']}}' - - --gcp_resources - - '{{$.outputs.parameters[''gcp_resources''].output_file}}' - - --executor_input - - '{{$}}' - - '{"IfPresent": {"InputName": "parent_model", "Then": ["--parent_model_name", - "{{$.inputs.artifacts[''parent_model''].metadata[''resourceName'']}}"]}}' - command: - - python3 - - -u - - -m - - google_cloud_pipeline_components.container.v1.model.upload_model.launcher - image: gcr.io/ml-pipeline/google-cloud-pipeline-components:2.3.1 - exec-parse-worker-pool-specs-override: - container: - args: - - --executor_input - - '{{$}}' - - --function_to_execute - - _parse_worker_pool_specs_override - command: - - sh - - -ec - - 'program_path=$(mktemp -d) - - printf "%s" "$0" > "$program_path/ephemeral_component.py" - - python3 -m kfp.components.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" - - ' - - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef _parse_worker_pool_specs_override(\n worker_pool_specs_override:\ - \ list, # pylint:disable=g-bare-generic\n) -> NamedTuple(\n 'Outputs',\n\ - \ [\n ('training_machine_spec', dict), # pylint:disable=g-bare-generic\n\ - \ ('training_disk_spec', dict),\n ('eval_machine_spec', dict),\ - \ # pylint:disable=g-bare-generic\n ('eval_replica_count', int),\n\ - \ ],\n):\n \"\"\"Parses worker_pool_specs_override and returns training\ - \ and evaluation machine specifications.\n\n Args:\n worker_pool_specs_override:\ - \ The list of dictionaries for overriding training\n and evaluation\ - \ worker pool specs.\n\n Returns:\n training_machine_spec: The training\ - \ machine spec.\n training_disk_spec: The training disk spec.\n \ - \ eval_machine_spec: The eval machine spec.\n eval_replica_count:\ - \ The replica count for eval.\n \"\"\"\n # pylint: disable=g-import-not-at-top,import-outside-toplevel,redefined-outer-name\n\ - \ import collections\n # pylint: enable=g-import-not-at-top,import-outside-toplevel,redefined-outer-name\n\ - \n training_machine_spec = {'machine_type': 'c2-standard-16'}\n training_disk_spec\ - \ = {'boot_disk_type': 'pd-ssd', 'boot_disk_size_gb': 100}\n eval_machine_spec\ - \ = {'machine_type': 'c2-standard-8'}\n eval_replica_count = 1\n\n if\ - \ worker_pool_specs_override:\n if len(worker_pool_specs_override) >=\ - \ 1 and isinstance(\n worker_pool_specs_override[0], dict\n ):\n\ - \ training_machine_spec = worker_pool_specs_override[0].get(\n \ - \ 'machine_spec', training_machine_spec\n )\n training_disk_spec\ - \ = worker_pool_specs_override[0].get(\n 'disk_spec', training_disk_spec\n\ - \ )\n if len(worker_pool_specs_override) == 4 and isinstance(\n\ - \ worker_pool_specs_override[3], dict\n ):\n eval_machine_spec\ - \ = worker_pool_specs_override[3].get(\n 'machine_spec', eval_machine_spec\n\ - \ )\n eval_replica_count = worker_pool_specs_override[3].get(\n\ - \ 'replica_count', eval_replica_count\n )\n\n return collections.namedtuple(\n\ - \ 'Outputs',\n [\n 'training_machine_spec',\n \ - \ 'training_disk_spec',\n 
'eval_machine_spec',\n 'eval_replica_count',\n\ - \ ],\n )(\n training_machine_spec,\n training_disk_spec,\n\ - \ eval_machine_spec,\n eval_replica_count,\n )\n\n" - image: us-docker.pkg.dev/vertex-ai/automl-tabular/kfp-v2-base:20251102_1045 - exec-set-optional-inputs: - container: - args: - - --executor_input - - '{{$}}' - - --function_to_execute - - _set_optional_inputs - command: - - sh - - -ec - - 'program_path=$(mktemp -d) - - printf "%s" "$0" > "$program_path/ephemeral_component.py" - - python3 -m kfp.components.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" - - ' - - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef _set_optional_inputs(\n project: str,\n location: str,\n\ - \ data_source_csv_filenames: str,\n data_source_bigquery_table_path:\ - \ str,\n vertex_dataset: dsl.Input[dsl.Artifact],\n) -> NamedTuple(\n\ - \ 'Outputs',\n [\n ('data_source_csv_filenames', str),\n \ - \ ('data_source_bigquery_table_path', str),\n ],\n):\n \"\"\"Get\ - \ the data source URI.\n\n Args:\n project: The GCP project that runs\ - \ the pipeline components.\n location: The GCP region that runs the pipeline\ - \ components.\n data_source_csv_filenames: The CSV GCS path when data\ - \ source is CSV.\n data_source_bigquery_table_path: The BigQuery table\ - \ when data source is BQ.\n vertex_dataset: The Vertex dataset when data\ - \ source is Vertex dataset.\n\n Returns:\n A named tuple of CSV or BQ\ - \ URI.\n \"\"\"\n # pylint: disable=g-import-not-at-top,import-outside-toplevel,redefined-outer-name\n\ - \ import collections\n from google.cloud import aiplatform\n from google.cloud\ - \ import aiplatform_v1beta1 as aip\n # pylint: enable=g-import-not-at-top,import-outside-toplevel,redefined-outer-name\n\ - \n if vertex_dataset is not None:\n # of format\n # projects/294348452381/locations/us-central1/datasets/7104764862735056896\n\ - \ dataset_name = vertex_dataset.metadata['resourceName']\n\n aiplatform.init(project=project,\ - \ location=location)\n client = aip.DatasetServiceClient(\n client_options={'api_endpoint':\ - \ f'{location}-aiplatform.googleapis.com'}\n )\n dataset = client.get_dataset(name=dataset_name)\n\ - \ input_config = dataset.metadata['inputConfig']\n if 'gcsSource'\ - \ in input_config:\n data_source_csv_filenames = ','.join(input_config['gcsSource']['uri'])\n\ - \ elif 'bigquerySource' in input_config:\n data_source_bigquery_table_path\ - \ = input_config['bigquerySource']['uri']\n elif data_source_csv_filenames:\n\ - \ pass\n elif data_source_bigquery_table_path:\n pass\n else:\n\ - \ raise ValueError(\n 'One of vertex_dataset, data_source_csv_filenames,'\n\ - \ ' data_source_bigquery_table_path must be specified'\n )\n\n\ - \ return collections.namedtuple(\n 'Outputs',\n [\n \ - \ 'data_source_csv_filenames',\n 'data_source_bigquery_table_path',\n\ - \ ],\n )(\n data_source_csv_filenames,\n data_source_bigquery_table_path,\n\ - \ )\n\n" - image: us-docker.pkg.dev/vertex-ai/automl-tabular/kfp-v2-base:20251102_1045 - exec-split-materialized-data: - container: - args: - - --executor_input - - '{{$}}' - - --function_to_execute - - _split_materialized_data - command: - - sh - - -ec - - 'program_path=$(mktemp -d) - - printf "%s" "$0" > "$program_path/ephemeral_component.py" - - python3 -m kfp.components.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" - - ' - - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef 
_split_materialized_data(\n materialized_data: Input[Dataset],\n\ - \ materialized_train_split: OutputPath('MaterializedSplit'),\n materialized_eval_split:\ - \ OutputPath('MaterializedSplit'),\n materialized_test_split: OutputPath('MaterializedSplit')):\n\ - \ \"\"\"Splits materialized_data into materialized_data test, train, and\ - \ eval splits.\n\n Necessary adapter between FTE pipeline and trainer.\n\ - \n Args:\n materialized_data: materialized_data dataset output by FTE.\n\ - \ materialized_train_split: Path patern to materialized_train_split.\n\ - \ materialized_eval_split: Path patern to materialized_eval_split.\n\ - \ materialized_test_split: Path patern to materialized_test_split.\n\ - \ \"\"\"\n # pylint: disable=g-import-not-at-top,import-outside-toplevel,redefined-outer-name,reimported\n\ - \ import json\n import tensorflow as tf\n # pylint: enable=g-import-not-at-top,import-outside-toplevel,redefined-outer-name,reimported\n\ - \n with tf.io.gfile.GFile(materialized_data.path, 'r') as f:\n artifact_path\ - \ = f.read()\n\n # needed to import tf because this is a path in gs://\n\ - \ with tf.io.gfile.GFile(artifact_path, 'r') as f:\n materialized_data_json\ - \ = json.load(f)\n\n if 'tf_record_data_source' in materialized_data_json:\n\ - \ file_patterns = materialized_data_json['tf_record_data_source'][\n\ - \ 'file_patterns']\n elif 'avro_data_source' in materialized_data_json:\n\ - \ file_patterns = materialized_data_json['avro_data_source'][\n \ - \ 'file_patterns']\n elif 'parquet_data_source' in materialized_data_json:\n\ - \ file_patterns = materialized_data_json['parquet_data_source'][\n \ - \ 'file_patterns']\n else:\n raise ValueError(f'Unsupported training\ - \ data source: {materialized_data_json}')\n\n # we map indices to file\ - \ patterns based on the ordering of insertion order\n # in our transform_data\ - \ (see above in _generate_analyze_and_transform_data)\n with tf.io.gfile.GFile(materialized_train_split,\ - \ 'w') as f:\n f.write(file_patterns[0])\n\n with tf.io.gfile.GFile(materialized_eval_split,\ - \ 'w') as f:\n f.write(file_patterns[1])\n\n with tf.io.gfile.GFile(materialized_test_split,\ - \ 'w') as f:\n f.write(file_patterns[2])\n\n" - image: us-docker.pkg.dev/vertex-ai/automl-tabular/dataflow-worker:20251102_1045 - exec-training-configurator-and-validator: - container: - args: - - training_configurator_and_validator - - '{"Concat": ["--instance_schema_path=", "{{$.inputs.artifacts[''instance_schema''].uri}}"]}' - - '{"Concat": ["--training_schema_path=", "{{$.inputs.artifacts[''training_schema''].uri}}"]}' - - '{"Concat": ["--dataset_stats_path=", "{{$.inputs.artifacts[''dataset_stats''].uri}}"]}' - - '{"Concat": ["--split_example_counts=", "{{$.inputs.parameters[''split_example_counts'']}}"]}' - - '{"Concat": ["--target_column=", "{{$.inputs.parameters[''target_column'']}}"]}' - - '{"Concat": ["--weight_column=", "{{$.inputs.parameters[''weight_column'']}}"]}' - - '{"Concat": ["--prediction_type=", "{{$.inputs.parameters[''prediction_type'']}}"]}' - - '{"Concat": ["--optimization_objective=", "{{$.inputs.parameters[''optimization_objective'']}}"]}' - - '{"Concat": ["--optimization_objective_recall_value=", "{{$.inputs.parameters[''optimization_objective_recall_value'']}}"]}' - - '{"Concat": ["--optimization_objective_precision_value=", "{{$.inputs.parameters[''optimization_objective_precision_value'']}}"]}' - - '{"Concat": ["--metadata_path=", "{{$.outputs.artifacts[''metadata''].uri}}"]}' - - '{"Concat": ["--instance_baseline_path=", 
"{{$.outputs.artifacts[''instance_baseline''].uri}}"]}' - - '{"Concat": ["--run_evaluation=", "{{$.inputs.parameters[''run_evaluation'']}}"]}' - - '{"Concat": ["--run_distill=", "{{$.inputs.parameters[''run_distill'']}}"]}' - - '{"Concat": ["--enable_probabilistic_inference=", "{{$.inputs.parameters[''enable_probabilistic_inference'']}}"]}' - - '{"IfPresent": {"InputName": "time_series_identifier_column", "Then": {"Concat": - ["--time_series_identifier_column=", "{{$.inputs.parameters[''time_series_identifier_column'']}}"]}}}' - - '{"Concat": ["--time_series_identifier_columns=", "{{$.inputs.parameters[''time_series_identifier_columns'']}}"]}' - - '{"Concat": ["--time_column=", "{{$.inputs.parameters[''time_column'']}}"]}' - - '{"Concat": ["--time_series_attribute_columns=", "{{$.inputs.parameters[''time_series_attribute_columns'']}}"]}' - - '{"Concat": ["--available_at_forecast_columns=", "{{$.inputs.parameters[''available_at_forecast_columns'']}}"]}' - - '{"Concat": ["--unavailable_at_forecast_columns=", "{{$.inputs.parameters[''unavailable_at_forecast_columns'']}}"]}' - - '{"IfPresent": {"InputName": "quantiles", "Then": {"Concat": ["--quantiles=", - "{{$.inputs.parameters[''quantiles'']}}"]}}}' - - '{"Concat": ["--context_window=", "{{$.inputs.parameters[''context_window'']}}"]}' - - '{"Concat": ["--forecast_horizon=", "{{$.inputs.parameters[''forecast_horizon'']}}"]}' - - '{"Concat": ["--forecasting_model_type=", "{{$.inputs.parameters[''forecasting_model_type'']}}"]}' - - '{"Concat": ["--forecasting_transformations=", "{{$.inputs.parameters[''forecasting_transformations'']}}"]}' - - '{"IfPresent": {"InputName": "stage_1_deadline_hours", "Then": {"Concat": - ["--stage_1_deadline_hours=", "{{$.inputs.parameters[''stage_1_deadline_hours'']}}"]}}}' - - '{"IfPresent": {"InputName": "stage_2_deadline_hours", "Then": {"Concat": - ["--stage_2_deadline_hours=", "{{$.inputs.parameters[''stage_2_deadline_hours'']}}"]}}}' - - '{"IfPresent": {"InputName": "group_columns", "Then": {"Concat": ["--group_columns=", - "{{$.inputs.parameters[''group_columns'']}}"]}}}' - - '{"IfPresent": {"InputName": "group_total_weight", "Then": {"Concat": ["--group_total_weight=", - "{{$.inputs.parameters[''group_total_weight'']}}"]}}}' - - '{"IfPresent": {"InputName": "temporal_total_weight", "Then": {"Concat": - ["--temporal_total_weight=", "{{$.inputs.parameters[''temporal_total_weight'']}}"]}}}' - - '{"IfPresent": {"InputName": "group_temporal_total_weight", "Then": {"Concat": - ["--group_temporal_total_weight=", "{{$.inputs.parameters[''group_temporal_total_weight'']}}"]}}}' - image: us-docker.pkg.dev/vertex-ai/automl-tabular/feature-transform-engine:20251102_1045 - exec-wide-and-deep-hyperparameter-tuning-job: - container: - args: - - --type - - HyperparameterTuningJobWithMetrics - - --project - - '{{$.inputs.parameters[''project'']}}' - - --location - - '{{$.inputs.parameters[''location'']}}' - - --gcp_resources - - '{{$.outputs.parameters[''gcp_resources''].output_file}}' - - --execution_metrics - - '{{$.outputs.parameters[''execution_metrics''].output_file}}' - - --payload - - '{"Concat": ["{\"display_name\": \"wide-and-deep-hyperparameter-tuning-job-{{$.pipeline_job_uuid}}-{{$.pipeline_task_uuid}}\", - \"encryption_spec\": {\"kms_key_name\":\"", "{{$.inputs.parameters[''encryption_spec_key_name'']}}", - "\"}, \"study_spec\": {\"metrics\": [{\"metric_id\": \"", "{{$.inputs.parameters[''study_spec_metric_id'']}}", - "\", \"goal\": \"", "{{$.inputs.parameters[''study_spec_metric_goal'']}}", - "\"}], 
\"parameters\": ", "{{$.inputs.parameters[''study_spec_parameters_override'']}}", - ", \"algorithm\": \"", "{{$.inputs.parameters[''study_spec_algorithm'']}}", - "\", \"measurement_selection_type\": \"", "{{$.inputs.parameters[''study_spec_measurement_selection_type'']}}", - "\"}, \"max_trial_count\": ", "{{$.inputs.parameters[''max_trial_count'']}}", - ", \"parallel_trial_count\": ", "{{$.inputs.parameters[''parallel_trial_count'']}}", - ", \"max_failed_trial_count\": ", "{{$.inputs.parameters[''max_failed_trial_count'']}}", - ", \"trial_job_spec\": {\"worker_pool_specs\": [{\"replica_count\":\"", - "1", "\", \"machine_spec\": ", "{{$.inputs.parameters[''training_machine_spec'']}}", - ", \"disk_spec\": ", "{{$.inputs.parameters[''training_disk_spec'']}}", - ", \"container_spec\": {\"image_uri\":\"", "us-docker.pkg.dev/vertex-ai-restricted/automl-tabular/wide-and-deep-training:20251102_1045", - "\", \"args\": [\"--target_column=", "{{$.inputs.parameters[''target_column'']}}", - "\", \"--weight_column=", "{{$.inputs.parameters[''weight_column'']}}", - "\", \"--model_type=", "{{$.inputs.parameters[''prediction_type'']}}", "\", - \"--prediction_docker_uri=", "us-docker.pkg.dev/vertex-ai/automl-tabular/prediction-server:20251102_1045", - "\", \"--prediction_docker_uri_artifact_path=", "{{$.outputs.parameters[''prediction_docker_uri_output''].output_file}}", - "\", \"--baseline_path=", "{{$.inputs.artifacts[''instance_baseline''].uri}}", - "\", \"--metadata_path=", "{{$.inputs.artifacts[''metadata''].uri}}", "\", - \"--transform_output_path=", "{{$.inputs.artifacts[''transform_output''].uri}}", - "\", \"--training_schema_path=", "{{$.inputs.artifacts[''training_schema_uri''].uri}}", - "\", \"--instance_schema_path=", "{{$.outputs.parameters[''instance_schema_uri''].output_file}}", - "\", \"--prediction_schema_path=", "{{$.outputs.parameters[''prediction_schema_uri''].output_file}}", - "\", \"--trials_path=", "{{$.outputs.parameters[''trials''].output_file}}", - "\", \"--job_dir=", "{{$.inputs.parameters[''root_dir'']}}", "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/train\", - \"--training_data_path=", "{{$.inputs.artifacts[''materialized_train_split''].uri}}", - "\", \"--validation_data_path=", "{{$.inputs.artifacts[''materialized_eval_split''].uri}}", - "\", \"--enable_profiler=", "{{$.inputs.parameters[''enable_profiler'']}}", - "\", \"--cache_data=", "{{$.inputs.parameters[''cache_data'']}}", "\", \"--measurement_selection_type=", - "{{$.inputs.parameters[''study_spec_measurement_selection_type'']}}", "\", - \"--metric_goal=", "{{$.inputs.parameters[''study_spec_metric_goal'']}}", - "\", \"--seed=", "{{$.inputs.parameters[''seed'']}}", "\", \"--eval_steps=", - "{{$.inputs.parameters[''eval_steps'']}}", "\", \"--eval_frequency_secs=", - "{{$.inputs.parameters[''eval_frequency_secs'']}}", "\"]}}]}}"]}' - command: - - python3 - - -u - - -m - - google_cloud_pipeline_components.container.v1.hyperparameter_tuning_job.launcher - image: gcr.io/ml-pipeline/google-cloud-pipeline-components:1.0.44 -pipelineInfo: - description: The Wide & Deep built-in algorithm HyperparameterTuningJob pipeline. 
- name: automl-tabular-wide-and-deep-hyperparameter-tuning-job -root: - dag: - outputs: - artifacts: - model-evaluation-evaluation_metrics: - artifactSelectors: - - outputArtifactKey: model-evaluation-evaluation_metrics - producerSubtask: exit-handler-1 - tasks: - automl-tabular-finalizer: - cachingOptions: - enableCache: true - componentRef: - name: comp-automl-tabular-finalizer - dependentTasks: - - exit-handler-1 - inputs: - parameters: - location: - componentInputParameter: location - project: - componentInputParameter: project - root_dir: - componentInputParameter: root_dir - taskInfo: - name: automl-tabular-finalizer - triggerPolicy: - strategy: ALL_UPSTREAM_TASKS_COMPLETED - exit-handler-1: - componentRef: - name: comp-exit-handler-1 - dependentTasks: - - get-model-display-name - - set-optional-inputs - inputs: - artifacts: - pipelinechannel--parent_model: - componentInputArtifact: parent_model - parameters: - pipelinechannel--bigquery_staging_full_dataset_id: - componentInputParameter: bigquery_staging_full_dataset_id - pipelinechannel--cache_data: - componentInputParameter: cache_data - pipelinechannel--dataflow_service_account: - componentInputParameter: dataflow_service_account - pipelinechannel--dataflow_subnetwork: - componentInputParameter: dataflow_subnetwork - pipelinechannel--dataflow_use_public_ips: - componentInputParameter: dataflow_use_public_ips - pipelinechannel--dataset_level_custom_transformation_definitions: - componentInputParameter: dataset_level_custom_transformation_definitions - pipelinechannel--dataset_level_transformations: - componentInputParameter: dataset_level_transformations - pipelinechannel--enable_profiler: - componentInputParameter: enable_profiler - pipelinechannel--encryption_spec_key_name: - componentInputParameter: encryption_spec_key_name - pipelinechannel--eval_frequency_secs: - componentInputParameter: eval_frequency_secs - pipelinechannel--eval_steps: - componentInputParameter: eval_steps - pipelinechannel--evaluation_batch_predict_machine_type: - componentInputParameter: evaluation_batch_predict_machine_type - pipelinechannel--evaluation_batch_predict_max_replica_count: - componentInputParameter: evaluation_batch_predict_max_replica_count - pipelinechannel--evaluation_batch_predict_starting_replica_count: - componentInputParameter: evaluation_batch_predict_starting_replica_count - pipelinechannel--evaluation_dataflow_disk_size_gb: - componentInputParameter: evaluation_dataflow_disk_size_gb - pipelinechannel--evaluation_dataflow_machine_type: - componentInputParameter: evaluation_dataflow_machine_type - pipelinechannel--evaluation_dataflow_max_num_workers: - componentInputParameter: evaluation_dataflow_max_num_workers - pipelinechannel--evaluation_dataflow_starting_num_workers: - componentInputParameter: evaluation_dataflow_starting_num_workers - pipelinechannel--feature_selection_algorithm: - componentInputParameter: feature_selection_algorithm - pipelinechannel--get-model-display-name-model_display_name: - taskOutputParameter: - outputParameterKey: model_display_name - producerTask: get-model-display-name - pipelinechannel--location: - componentInputParameter: location - pipelinechannel--materialized_examples_format: - componentInputParameter: materialized_examples_format - pipelinechannel--max_failed_trial_count: - componentInputParameter: max_failed_trial_count - pipelinechannel--max_selected_features: - componentInputParameter: max_selected_features - pipelinechannel--max_trial_count: - componentInputParameter: max_trial_count - 
pipelinechannel--model_description: - componentInputParameter: model_description - pipelinechannel--parallel_trial_count: - componentInputParameter: parallel_trial_count - pipelinechannel--predefined_split_key: - componentInputParameter: predefined_split_key - pipelinechannel--prediction_type: - componentInputParameter: prediction_type - pipelinechannel--project: - componentInputParameter: project - pipelinechannel--root_dir: - componentInputParameter: root_dir - pipelinechannel--run_evaluation: - componentInputParameter: run_evaluation - pipelinechannel--run_feature_selection: - componentInputParameter: run_feature_selection - pipelinechannel--seed: - componentInputParameter: seed - pipelinechannel--set-optional-inputs-data_source_bigquery_table_path: - taskOutputParameter: - outputParameterKey: data_source_bigquery_table_path - producerTask: set-optional-inputs - pipelinechannel--set-optional-inputs-data_source_csv_filenames: - taskOutputParameter: - outputParameterKey: data_source_csv_filenames - producerTask: set-optional-inputs - pipelinechannel--stratified_split_key: - componentInputParameter: stratified_split_key - pipelinechannel--study_spec_algorithm: - componentInputParameter: study_spec_algorithm - pipelinechannel--study_spec_measurement_selection_type: - componentInputParameter: study_spec_measurement_selection_type - pipelinechannel--study_spec_metric_goal: - componentInputParameter: study_spec_metric_goal - pipelinechannel--study_spec_metric_id: - componentInputParameter: study_spec_metric_id - pipelinechannel--study_spec_parameters_override: - componentInputParameter: study_spec_parameters_override - pipelinechannel--target_column: - componentInputParameter: target_column - pipelinechannel--test_fraction: - componentInputParameter: test_fraction - pipelinechannel--tf_auto_transform_features: - componentInputParameter: tf_auto_transform_features - pipelinechannel--tf_custom_transformation_definitions: - componentInputParameter: tf_custom_transformation_definitions - pipelinechannel--tf_transform_execution_engine: - componentInputParameter: tf_transform_execution_engine - pipelinechannel--tf_transformations_path: - componentInputParameter: tf_transformations_path - pipelinechannel--training_fraction: - componentInputParameter: training_fraction - pipelinechannel--transform_dataflow_disk_size_gb: - componentInputParameter: transform_dataflow_disk_size_gb - pipelinechannel--transform_dataflow_machine_type: - componentInputParameter: transform_dataflow_machine_type - pipelinechannel--transform_dataflow_max_num_workers: - componentInputParameter: transform_dataflow_max_num_workers - pipelinechannel--validation_fraction: - componentInputParameter: validation_fraction - pipelinechannel--weight_column: - componentInputParameter: weight_column - pipelinechannel--worker_pool_specs_override: - componentInputParameter: worker_pool_specs_override - taskInfo: - name: exit-handler-1 - get-model-display-name: - cachingOptions: - enableCache: true - componentRef: - name: comp-get-model-display-name - inputs: - parameters: - model_display_name: - componentInputParameter: model_display_name - taskInfo: - name: get-model-display-name - set-optional-inputs: - cachingOptions: - enableCache: true - componentRef: - name: comp-set-optional-inputs - inputs: - artifacts: - vertex_dataset: - componentInputArtifact: vertex_dataset - parameters: - data_source_bigquery_table_path: - componentInputParameter: data_source_bigquery_table_path - data_source_csv_filenames: - componentInputParameter: 
data_source_csv_filenames - location: - componentInputParameter: location - project: - componentInputParameter: project - taskInfo: - name: set-optional-inputs - inputDefinitions: - artifacts: - parent_model: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Parent model if this model is uploaded as a version. - isOptional: true - vertex_dataset: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The Vertex dataset artifact. - parameters: - bigquery_staging_full_dataset_id: - defaultValue: '' - description: Staging directory for BigQuery tables. - isOptional: true - parameterType: STRING - cache_data: - defaultValue: auto - description: 'Whether to cache data or not. If set to ''auto'', caching is - - determined based on the dataset size.' - isOptional: true - parameterType: STRING - data_source_bigquery_table_path: - defaultValue: '' - description: 'The BigQuery table path of format - - bq://bq_project.bq_dataset.bq_table' - isOptional: true - parameterType: STRING - data_source_csv_filenames: - defaultValue: '' - description: 'A string that represents a list of comma - - separated CSV filenames.' - isOptional: true - parameterType: STRING - dataflow_service_account: - defaultValue: '' - description: Custom service account to run dataflow jobs. - isOptional: true - parameterType: STRING - dataflow_subnetwork: - defaultValue: '' - description: 'Dataflow''s fully qualified subnetwork name, when empty - - the default subnetwork will be used. Example: - - https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications' - isOptional: true - parameterType: STRING - dataflow_use_public_ips: - defaultValue: true - description: 'Specifies whether Dataflow workers use public IP - - addresses.' - isOptional: true - parameterType: BOOLEAN - dataset_level_custom_transformation_definitions: - description: 'Dataset-level custom - - transformation definitions in string format.' - isOptional: true - parameterType: LIST - dataset_level_transformations: - description: 'Dataset-level transformation configuration in - - string format.' - isOptional: true - parameterType: LIST - enable_profiler: - defaultValue: false - description: Enables profiling and saves a trace during evaluation. - isOptional: true - parameterType: BOOLEAN - encryption_spec_key_name: - defaultValue: '' - description: The KMS key name. - isOptional: true - parameterType: STRING - eval_frequency_secs: - defaultValue: 600.0 - description: 'Frequency at which evaluation and checkpointing will - - take place.' - isOptional: true - parameterType: NUMBER_INTEGER - eval_steps: - defaultValue: 0.0 - description: 'Number of steps to run evaluation for. If not specified or - - negative, it means run evaluation on the whole validation dataset. If set - - to 0, it means run evaluation for a fixed number of samples.' - isOptional: true - parameterType: NUMBER_INTEGER - evaluation_batch_predict_machine_type: - defaultValue: n1-highmem-8 - description: 'The prediction server machine type - - for batch predict components during evaluation.' - isOptional: true - parameterType: STRING - evaluation_batch_predict_max_replica_count: - defaultValue: 20.0 - description: 'The max number of prediction - - server for batch predict components during evaluation.' 
- isOptional: true - parameterType: NUMBER_INTEGER - evaluation_batch_predict_starting_replica_count: - defaultValue: 20.0 - description: 'The initial number of - - prediction server for batch predict components during evaluation.' - isOptional: true - parameterType: NUMBER_INTEGER - evaluation_dataflow_disk_size_gb: - defaultValue: 50.0 - description: 'Dataflow worker''s disk size in GB for - - evaluation components.' - isOptional: true - parameterType: NUMBER_INTEGER - evaluation_dataflow_machine_type: - defaultValue: n1-standard-4 - description: 'The dataflow machine type for evaluation - - components.' - isOptional: true - parameterType: STRING - evaluation_dataflow_max_num_workers: - defaultValue: 100.0 - description: 'The max number of Dataflow workers for - - evaluation components.' - isOptional: true - parameterType: NUMBER_INTEGER - evaluation_dataflow_starting_num_workers: - defaultValue: 10.0 - description: 'The initial number of Dataflow - - workers for evaluation components.' - isOptional: true - parameterType: NUMBER_INTEGER - feature_selection_algorithm: - defaultValue: AMI - description: Feature selection algorithm. - isOptional: true - parameterType: STRING - location: - description: The GCP region that runs the pipeline components. - parameterType: STRING - materialized_examples_format: - defaultValue: tfrecords_gzip - description: The format for the materialized examples. - isOptional: true - parameterType: STRING - max_failed_trial_count: - defaultValue: 0.0 - description: 'The number of failed trials that need to be seen - - before failing the HyperparameterTuningJob. If set to 0, Vertex AI decides - - how many trials must fail before the whole job fails.' - isOptional: true - parameterType: NUMBER_INTEGER - max_selected_features: - defaultValue: -1.0 - description: Maximum number of features to select. - isOptional: true - parameterType: NUMBER_INTEGER - max_trial_count: - description: The desired total number of trials. - parameterType: NUMBER_INTEGER - model_description: - defaultValue: '' - description: The description name of the uploaded Vertex model. - isOptional: true - parameterType: STRING - model_display_name: - defaultValue: '' - description: The display name of the uploaded Vertex model. - isOptional: true - parameterType: STRING - parallel_trial_count: - description: The desired number of trials to run in parallel. - parameterType: NUMBER_INTEGER - predefined_split_key: - defaultValue: '' - description: Predefined split key. - isOptional: true - parameterType: STRING - prediction_type: - description: 'The type of prediction the model is to produce. - - "classification" or "regression".' - parameterType: STRING - project: - description: The GCP project that runs the pipeline components. - parameterType: STRING - root_dir: - description: The root GCS directory for the pipeline components. - parameterType: STRING - run_evaluation: - defaultValue: false - description: Whether to run evaluation steps during training. - isOptional: true - parameterType: BOOLEAN - run_feature_selection: - defaultValue: false - description: Whether to enable feature selection. - isOptional: true - parameterType: BOOLEAN - seed: - defaultValue: 1.0 - description: Seed to be used for this run. - isOptional: true - parameterType: NUMBER_INTEGER - stratified_split_key: - defaultValue: '' - description: Stratified split key. 
- isOptional: true - parameterType: STRING - study_spec_algorithm: - defaultValue: ALGORITHM_UNSPECIFIED - description: 'The search algorithm specified for the study. One of - - ''ALGORITHM_UNSPECIFIED'', ''GRID_SEARCH'', or ''RANDOM_SEARCH''.' - isOptional: true - parameterType: STRING - study_spec_measurement_selection_type: - defaultValue: BEST_MEASUREMENT - description: ' Which measurement to use if/when the - - service automatically selects the final measurement from previously - - reported intermediate measurements. One of "BEST_MEASUREMENT" or - - "LAST_MEASUREMENT".' - isOptional: true - parameterType: STRING - study_spec_metric_goal: - description: 'Optimization goal of the metric, possible values: - - "MAXIMIZE", "MINIMIZE".' - parameterType: STRING - study_spec_metric_id: - description: 'Metric to optimize, possible values: [ ''loss'', - - ''average_loss'', ''rmse'', ''mae'', ''mql'', ''accuracy'', ''auc'', ''precision'', - - ''recall''].' - parameterType: STRING - study_spec_parameters_override: - description: 'List of dictionaries representing parameters - - to optimize. The dictionary key is the parameter_id, which is passed to - - training job as a command line argument, and the dictionary value is the - - parameter specification of the metric.' - parameterType: LIST - target_column: - description: The target column name. - parameterType: STRING - test_fraction: - defaultValue: -1.0 - description: Test fraction. - isOptional: true - parameterType: NUMBER_DOUBLE - tf_auto_transform_features: - description: List of auto transform features. - isOptional: true - parameterType: STRUCT - tf_custom_transformation_definitions: - description: 'TF custom transformation definitions - - in string format.' - isOptional: true - parameterType: LIST - tf_transform_execution_engine: - defaultValue: bigquery - description: 'Execution engine to run TF-based - - transformations. Currently supports "dataflow" or "bigquery"' - isOptional: true - parameterType: STRING - tf_transformations_path: - defaultValue: '' - description: Path to TF transformation configuration. - isOptional: true - parameterType: STRING - training_fraction: - defaultValue: -1.0 - description: Training fraction. - isOptional: true - parameterType: NUMBER_DOUBLE - transform_dataflow_disk_size_gb: - defaultValue: 40.0 - description: 'Dataflow worker''s disk size in GB for - - transform component.' - isOptional: true - parameterType: NUMBER_INTEGER - transform_dataflow_machine_type: - defaultValue: n1-standard-16 - description: 'The dataflow machine type for transform - - component.' - isOptional: true - parameterType: STRING - transform_dataflow_max_num_workers: - defaultValue: 25.0 - description: 'The max number of Dataflow workers for - - transform component.' - isOptional: true - parameterType: NUMBER_INTEGER - validation_fraction: - defaultValue: -1.0 - description: Validation fraction. - isOptional: true - parameterType: NUMBER_DOUBLE - weight_column: - defaultValue: '' - description: The weight column name. - isOptional: true - parameterType: STRING - worker_pool_specs_override: - description: 'The dictionary for overriding training and - - evaluation worker pool specs. The dictionary should be of format - - https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.' 
- isOptional: true - parameterType: LIST - outputDefinitions: - artifacts: - model-evaluation-evaluation_metrics: - artifactType: - schemaTitle: system.Metrics - schemaVersion: 0.0.1 -schemaVersion: 2.1.0 -sdkVersion: kfp-2.0.0-rc.2 diff --git a/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_trainer.py b/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_trainer.py deleted file mode 100644 index 31c71b62ab0..00000000000 --- a/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_trainer.py +++ /dev/null @@ -1,250 +0,0 @@ -# Copyright 2023 The Kubeflow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""AutoML Wide and Deep Trainer component spec.""" - -from typing import Optional - -from google_cloud_pipeline_components.types.artifact_types import UnmanagedContainerModel -from kfp import dsl -from kfp.dsl import Artifact -from kfp.dsl import Input -from kfp.dsl import Output - - -@dsl.container_component -def wide_and_deep_trainer( - project: str, - location: str, - root_dir: str, - target_column: str, - prediction_type: str, - learning_rate: float, - dnn_learning_rate: float, - instance_baseline: Input[Artifact], - metadata: Input[Artifact], - materialized_train_split: Input[Artifact], - materialized_eval_split: Input[Artifact], - transform_output: Input[Artifact], - training_schema_uri: Input[Artifact], - gcp_resources: dsl.OutputPath(str), - unmanaged_container_model: Output[UnmanagedContainerModel], # pylint: disable=unused-argument - weight_column: Optional[str] = '', - max_steps: Optional[int] = -1, - max_train_secs: Optional[int] = -1, - optimizer_type: Optional[str] = 'adam', - l1_regularization_strength: Optional[float] = 0, - l2_regularization_strength: Optional[float] = 0, - l2_shrinkage_regularization_strength: Optional[float] = 0, - beta_1: Optional[float] = 0.9, - beta_2: Optional[float] = 0.999, - hidden_units: Optional[str] = '30,30,30', - use_wide: Optional[bool] = True, - embed_categories: Optional[bool] = True, - dnn_dropout: Optional[float] = 0, - dnn_optimizer_type: Optional[str] = 'ftrl', - dnn_l1_regularization_strength: Optional[float] = 0, - dnn_l2_regularization_strength: Optional[float] = 0, - dnn_l2_shrinkage_regularization_strength: Optional[float] = 0, - dnn_beta_1: Optional[float] = 0.9, - dnn_beta_2: Optional[float] = 0.999, - enable_profiler: Optional[bool] = False, - cache_data: Optional[str] = 'auto', - seed: Optional[int] = 1, - eval_steps: Optional[int] = 0, - batch_size: Optional[int] = 100, - measurement_selection_type: Optional[str] = 'BEST_MEASUREMENT', - optimization_metric: Optional[str] = '', - eval_frequency_secs: Optional[int] = 600, - training_machine_spec: Optional[dict] = {'machine_type': 'c2-standard-16'}, - training_disk_spec: Optional[dict] = { - 'boot_disk_type': 'pd-ssd', - 'boot_disk_size_gb': 100, - }, - encryption_spec_key_name: Optional[str] = '', -): - # fmt: off - """Trains a 
Wide & Deep model using Vertex CustomJob API.
-
-  Args:
-    project: The GCP project that runs the pipeline components.
-    location: The GCP region that runs the pipeline components.
-    root_dir: The root GCS directory for the pipeline components.
-    target_column: The target column name.
-    prediction_type: The type of prediction the model is to produce. "classification" or "regression".
-    weight_column: The weight column name.
-    max_steps: Number of steps to run the trainer for.
-    max_train_secs: Amount of time in seconds to run the trainer for.
-    learning_rate: The learning rate used by the linear optimizer.
-    optimizer_type: The type of optimizer to use. Choices are "adam", "ftrl" and "sgd" for the Adam, FTRL, and Gradient Descent Optimizers, respectively.
-    l1_regularization_strength: L1 regularization strength for optimizer_type="ftrl".
-    l2_regularization_strength: L2 regularization strength for optimizer_type="ftrl".
-    l2_shrinkage_regularization_strength: L2 shrinkage regularization strength for optimizer_type="ftrl".
-    beta_1: Beta 1 value for optimizer_type="adam".
-    beta_2: Beta 2 value for optimizer_type="adam".
-    hidden_units: Hidden layer sizes to use for DNN feature columns, provided in comma-separated layers.
-    use_wide: If set to true, the categorical columns will be used in the wide part of the DNN model.
-    embed_categories: If set to true, the categorical columns will be embedded and used in the deep part of the model. Embedding size is the square root of the column cardinality.
-    dnn_dropout: The probability we will drop out a given coordinate.
-    dnn_learning_rate: The learning rate for training the deep part of the model.
-    dnn_optimizer_type: The type of optimizer to use for the deep part of the model. Choices are "adam", "ftrl" and "sgd" for the Adam, FTRL, and Gradient Descent Optimizers, respectively.
-    dnn_l1_regularization_strength: L1 regularization strength for dnn_optimizer_type="ftrl".
-    dnn_l2_regularization_strength: L2 regularization strength for dnn_optimizer_type="ftrl".
-    dnn_l2_shrinkage_regularization_strength: L2 shrinkage regularization strength for dnn_optimizer_type="ftrl".
-    dnn_beta_1: Beta 1 value for dnn_optimizer_type="adam".
-    dnn_beta_2: Beta 2 value for dnn_optimizer_type="adam".
-    enable_profiler: Enables profiling and saves a trace during evaluation.
-    cache_data: Whether to cache data or not. If set to 'auto', caching is determined based on the dataset size.
-    seed: Seed to be used for this run.
-    eval_steps: Number of steps to run evaluation for. If not specified or negative, it means run evaluation on the whole validation dataset. If set to 0, it means run evaluation for a fixed number of samples.
-    batch_size: Batch size for training.
-    measurement_selection_type: Which measurement to use if/when the service automatically selects the final measurement from previously reported intermediate measurements. One of "BEST_MEASUREMENT" or "LAST_MEASUREMENT".
-    optimization_metric: Optimization metric used for `measurement_selection_type`. Default is "rmse" for regression and "auc" for classification.
-    eval_frequency_secs: Frequency at which evaluation and checkpointing will take place.
-    training_machine_spec: The training machine spec. See https://cloud.google.com/compute/docs/machine-types for options.
-    training_disk_spec: The training disk spec.
-    instance_baseline: The path to a JSON file for baseline values.
-    metadata: The tabular example gen metadata.
- materialized_train_split: The path to the materialized train split. - materialized_eval_split: The path to the materialized validation split. - transform_output: The path to transform output. - training_schema_uri: The path to the training schema. - encryption_spec_key_name: The KMS key name. - - Returns: - gcp_resources: Serialized gcp_resources proto tracking the custom training job. - unmanaged_container_model: The UnmanagedContainerModel artifact. - """ - # fmt: on - - return dsl.ContainerSpec( - image='gcr.io/ml-pipeline/google-cloud-pipeline-components:1.0.44', - command=[ - 'python3', - '-u', - '-m', - 'google_cloud_pipeline_components.container.v1.custom_job.launcher', - ], - args=[ - '--type', - 'CustomJob', - '--project', - project, - '--location', - location, - '--gcp_resources', - gcp_resources, - '--payload', - dsl.ConcatPlaceholder( - items=[ - ( - '{"display_name":' - f' "wide-and-deep-trainer-{dsl.PIPELINE_JOB_ID_PLACEHOLDER}-{dsl.PIPELINE_TASK_ID_PLACEHOLDER}",' - ' "encryption_spec": {"kms_key_name":"' - ), - encryption_spec_key_name, - '"}, "job_spec": {"worker_pool_specs": [{"replica_count":"', - '1', - '", "machine_spec": ', - training_machine_spec, - ', "disk_spec": ', - training_disk_spec, - ', "container_spec": {"image_uri":"', - 'us-docker.pkg.dev/vertex-ai-restricted/automl-tabular/wide-and-deep-training:20251102_1045', - '", "args": ["--target_column=', - target_column, - '", "--weight_column=', - weight_column, - '", "--model_type=', - prediction_type, - '", "--prediction_docker_uri=', - 'us-docker.pkg.dev/vertex-ai/automl-tabular/prediction-server:20251102_1045', - '", "--baseline_path=', - instance_baseline.uri, - '", "--metadata_path=', - metadata.uri, - '", "--transform_output_path=', - transform_output.uri, - '", "--training_schema_path=', - training_schema_uri.uri, - '", "--job_dir=', - root_dir, - ( - f'/{dsl.PIPELINE_JOB_ID_PLACEHOLDER}/{dsl.PIPELINE_TASK_ID_PLACEHOLDER}/train",' - ' "--training_data_path=' - ), - materialized_train_split.uri, - '", "--validation_data_path=', - materialized_eval_split.uri, - '", "--max_steps=', - max_steps, - '", "--max_train_secs=', - max_train_secs, - '", "--learning_rate=', - learning_rate, - '", "--optimizer_type=', - optimizer_type, - '", "--l1_regularization_strength=', - l1_regularization_strength, - '", "--l2_regularization_strength=', - l2_regularization_strength, - '", "--l2_shrinkage_regularization_strength=', - l2_shrinkage_regularization_strength, - '", "--beta_1=', - beta_1, - '", "--beta_2=', - beta_2, - '", "--hidden_units=', - hidden_units, - '", "--use_wide=', - use_wide, - '", "--embed_categories=', - embed_categories, - '", "--dnn_dropout=', - dnn_dropout, - '", "--dnn_learning_rate=', - dnn_learning_rate, - '", "--dnn_optimizer_type=', - dnn_optimizer_type, - '", "--dnn_l1_regularization_strength=', - dnn_l1_regularization_strength, - '", "--dnn_l2_regularization_strength=', - dnn_l2_regularization_strength, - '", "--dnn_l2_shrinkage_regularization_strength=', - dnn_l2_shrinkage_regularization_strength, - '", "--dnn_beta_1=', - dnn_beta_1, - '", "--dnn_beta_2=', - dnn_beta_2, - '", "--enable_profiler=', - enable_profiler, - '", "--cache_data=', - cache_data, - '", "--seed=', - seed, - '", "--eval_steps=', - eval_steps, - '", "--batch_size=', - batch_size, - '", "--measurement_selection_type=', - measurement_selection_type, - '", "--optimization_metric=', - optimization_metric, - '", "--eval_frequency_secs=', - eval_frequency_secs, - '", "--executor_input={{$.json_escape[1]}}"]}}]}}', - ] - ), - ], 
- ) diff --git a/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_trainer_pipeline.yaml b/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_trainer_pipeline.yaml deleted file mode 100644 index 23e6333f8e0..00000000000 --- a/components/google-cloud/google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_trainer_pipeline.yaml +++ /dev/null @@ -1,3912 +0,0 @@ -# PIPELINE DEFINITION -# Name: automl-tabular-wide-and-deep-trainer -# Description: Train a model using the Tabular Workflow for Wide & Deep pipelines. -# Wide & Deep jointly trains wide linear models and deep neural networks. It -# combines the benefits of memorization and generalization. -# Inputs: -# batch_size: int [Default: 100.0] -# beta_1: float [Default: 0.9] -# beta_2: float [Default: 0.999] -# bigquery_staging_full_dataset_id: str [Default: ''] -# cache_data: str [Default: 'auto'] -# data_source_bigquery_table_path: str [Default: ''] -# data_source_csv_filenames: str [Default: ''] -# dataflow_service_account: str [Default: ''] -# dataflow_subnetwork: str [Default: ''] -# dataflow_use_public_ips: bool [Default: True] -# dataset_level_custom_transformation_definitions: list -# dataset_level_transformations: list -# dnn_beta_1: float [Default: 0.9] -# dnn_beta_2: float [Default: 0.999] -# dnn_dropout: float [Default: 0.0] -# dnn_l1_regularization_strength: float [Default: 0.0] -# dnn_l2_regularization_strength: float [Default: 0.0] -# dnn_l2_shrinkage_regularization_strength: float [Default: 0.0] -# dnn_learning_rate: float -# dnn_optimizer_type: str [Default: 'adam'] -# embed_categories: bool [Default: True] -# enable_profiler: bool [Default: False] -# encryption_spec_key_name: str [Default: ''] -# eval_frequency_secs: int [Default: 600.0] -# eval_steps: int [Default: 0.0] -# evaluation_batch_predict_machine_type: str [Default: 'n1-highmem-8'] -# evaluation_batch_predict_max_replica_count: int [Default: 20.0] -# evaluation_batch_predict_starting_replica_count: int [Default: 20.0] -# evaluation_dataflow_disk_size_gb: int [Default: 50.0] -# evaluation_dataflow_machine_type: str [Default: 'n1-standard-4'] -# evaluation_dataflow_max_num_workers: int [Default: 100.0] -# evaluation_dataflow_starting_num_workers: int [Default: 10.0] -# feature_selection_algorithm: str [Default: 'AMI'] -# hidden_units: str [Default: '30,30,30'] -# l1_regularization_strength: float [Default: 0.0] -# l2_regularization_strength: float [Default: 0.0] -# l2_shrinkage_regularization_strength: float [Default: 0.0] -# learning_rate: float -# location: str -# materialized_examples_format: str [Default: 'tfrecords_gzip'] -# max_selected_features: int [Default: -1.0] -# max_steps: int [Default: -1.0] -# max_train_secs: int [Default: -1.0] -# measurement_selection_type: str [Default: 'BEST_MEASUREMENT'] -# model_description: str [Default: ''] -# model_display_name: str [Default: ''] -# optimization_metric: str [Default: ''] -# optimizer_type: str [Default: 'adam'] -# parent_model: system.Artifact -# predefined_split_key: str [Default: ''] -# prediction_type: str -# project: str -# root_dir: str -# run_evaluation: bool [Default: False] -# run_feature_selection: bool [Default: False] -# seed: int [Default: 1.0] -# stratified_split_key: str [Default: ''] -# target_column: str -# test_fraction: float [Default: -1.0] -# tf_auto_transform_features: dict -# tf_custom_transformation_definitions: list -# tf_transform_execution_engine: str [Default: 'bigquery'] -# 
tf_transformations_path: str [Default: ''] -# training_fraction: float [Default: -1.0] -# transform_dataflow_disk_size_gb: int [Default: 40.0] -# transform_dataflow_machine_type: str [Default: 'n1-standard-16'] -# transform_dataflow_max_num_workers: int [Default: 25.0] -# use_wide: bool [Default: True] -# validation_fraction: float [Default: -1.0] -# vertex_dataset: system.Artifact -# weight_column: str [Default: ''] -# worker_pool_specs_override: list -# Outputs: -# model-evaluation-evaluation_metrics: system.Metrics -components: - comp-automl-tabular-finalizer: - executorLabel: exec-automl-tabular-finalizer - inputDefinitions: - parameters: - encryption_spec_key_name: - defaultValue: '' - description: Customer-managed encryption key. - isOptional: true - parameterType: STRING - location: - description: Location for running the Cross-validation trainer. - parameterType: STRING - project: - description: Project to run Cross-validation trainer. - parameterType: STRING - root_dir: - description: The Cloud Storage location to store the output. - parameterType: STRING - outputDefinitions: - parameters: - gcp_resources: - description: GCP resources created by this component. For more details, - see https://github.com/kubeflow/pipelines/blob/master/components/google-cloud/google_cloud_pipeline_components/proto/README.md. - parameterType: STRING - comp-automl-tabular-infra-validator: - executorLabel: exec-automl-tabular-infra-validator - inputDefinitions: - artifacts: - unmanaged_container_model: - artifactType: - schemaTitle: google.UnmanagedContainerModel - schemaVersion: 0.0.1 - description: google.UnmanagedContainerModel for model to be validated. - comp-bool-identity: - executorLabel: exec-bool-identity - inputDefinitions: - parameters: - value: - description: Boolean value to return - parameterType: BOOLEAN - outputDefinitions: - parameters: - Output: - parameterType: STRING - comp-condition-2: - dag: - outputs: - artifacts: - model-evaluation-evaluation_metrics: - artifactSelectors: - - outputArtifactKey: evaluation_metrics - producerSubtask: model-evaluation - tasks: - model-batch-predict: - cachingOptions: - enableCache: true - componentRef: - name: comp-model-batch-predict - inputs: - artifacts: - unmanaged_container_model: - componentInputArtifact: pipelinechannel--wide-and-deep-trainer-unmanaged_container_model - parameters: - bigquery_source_input_uri: - componentInputParameter: pipelinechannel--feature-transform-engine-bigquery_test_split_uri - encryption_spec_key_name: - componentInputParameter: pipelinechannel--encryption_spec_key_name - gcs_destination_output_uri_prefix: - componentInputParameter: pipelinechannel--root_dir - instances_format: - runtimeValue: - constant: bigquery - job_display_name: - runtimeValue: - constant: batch-predict-evaluation-{{$.pipeline_job_uuid}}-{{$.pipeline_task_uuid}} - location: - componentInputParameter: pipelinechannel--location - machine_type: - componentInputParameter: pipelinechannel--evaluation_batch_predict_machine_type - max_replica_count: - componentInputParameter: pipelinechannel--evaluation_batch_predict_max_replica_count - predictions_format: - runtimeValue: - constant: jsonl - project: - componentInputParameter: pipelinechannel--project - starting_replica_count: - componentInputParameter: pipelinechannel--evaluation_batch_predict_starting_replica_count - taskInfo: - name: model-batch-predict - model-evaluation: - cachingOptions: - enableCache: true - componentRef: - name: comp-model-evaluation - dependentTasks: - - model-batch-predict 
- inputs: - artifacts: - batch_prediction_job: - taskOutputArtifact: - outputArtifactKey: batchpredictionjob - producerTask: model-batch-predict - parameters: - dataflow_disk_size: - componentInputParameter: pipelinechannel--evaluation_dataflow_disk_size_gb - dataflow_machine_type: - componentInputParameter: pipelinechannel--evaluation_dataflow_machine_type - dataflow_max_workers_num: - componentInputParameter: pipelinechannel--evaluation_dataflow_max_num_workers - dataflow_service_account: - componentInputParameter: pipelinechannel--dataflow_service_account - dataflow_subnetwork: - componentInputParameter: pipelinechannel--dataflow_subnetwork - dataflow_use_public_ips: - componentInputParameter: pipelinechannel--dataflow_use_public_ips - dataflow_workers_num: - componentInputParameter: pipelinechannel--evaluation_dataflow_starting_num_workers - encryption_spec_key_name: - componentInputParameter: pipelinechannel--encryption_spec_key_name - ground_truth_column: - componentInputParameter: pipelinechannel--target_column - ground_truth_format: - runtimeValue: - constant: jsonl - location: - componentInputParameter: pipelinechannel--location - prediction_label_column: - runtimeValue: - constant: '' - prediction_score_column: - runtimeValue: - constant: '' - predictions_format: - runtimeValue: - constant: jsonl - problem_type: - componentInputParameter: pipelinechannel--prediction_type - project: - componentInputParameter: pipelinechannel--project - root_dir: - componentInputParameter: pipelinechannel--root_dir - taskInfo: - name: model-evaluation - inputDefinitions: - artifacts: - pipelinechannel--wide-and-deep-trainer-unmanaged_container_model: - artifactType: - schemaTitle: google.UnmanagedContainerModel - schemaVersion: 0.0.1 - parameters: - pipelinechannel--bool-identity-Output: - parameterType: STRING - pipelinechannel--dataflow_service_account: - parameterType: STRING - pipelinechannel--dataflow_subnetwork: - parameterType: STRING - pipelinechannel--dataflow_use_public_ips: - parameterType: BOOLEAN - pipelinechannel--encryption_spec_key_name: - parameterType: STRING - pipelinechannel--evaluation_batch_predict_machine_type: - parameterType: STRING - pipelinechannel--evaluation_batch_predict_max_replica_count: - parameterType: NUMBER_INTEGER - pipelinechannel--evaluation_batch_predict_starting_replica_count: - parameterType: NUMBER_INTEGER - pipelinechannel--evaluation_dataflow_disk_size_gb: - parameterType: NUMBER_INTEGER - pipelinechannel--evaluation_dataflow_machine_type: - parameterType: STRING - pipelinechannel--evaluation_dataflow_max_num_workers: - parameterType: NUMBER_INTEGER - pipelinechannel--evaluation_dataflow_starting_num_workers: - parameterType: NUMBER_INTEGER - pipelinechannel--feature-transform-engine-bigquery_test_split_uri: - parameterType: STRING - pipelinechannel--location: - parameterType: STRING - pipelinechannel--prediction_type: - parameterType: STRING - pipelinechannel--project: - parameterType: STRING - pipelinechannel--root_dir: - parameterType: STRING - pipelinechannel--target_column: - parameterType: STRING - outputDefinitions: - artifacts: - model-evaluation-evaluation_metrics: - artifactType: - schemaTitle: system.Metrics - schemaVersion: 0.0.1 - comp-exit-handler-1: - dag: - outputs: - artifacts: - model-evaluation-evaluation_metrics: - artifactSelectors: - - outputArtifactKey: model-evaluation-evaluation_metrics - producerSubtask: condition-2 - tasks: - automl-tabular-infra-validator: - cachingOptions: - enableCache: true - componentRef: - name: 
comp-automl-tabular-infra-validator - dependentTasks: - - wide-and-deep-trainer - inputs: - artifacts: - unmanaged_container_model: - taskOutputArtifact: - outputArtifactKey: unmanaged_container_model - producerTask: wide-and-deep-trainer - taskInfo: - name: automl-tabular-infra-validator - bool-identity: - cachingOptions: - enableCache: true - componentRef: - name: comp-bool-identity - inputs: - parameters: - value: - componentInputParameter: pipelinechannel--run_evaluation - taskInfo: - name: bool-identity - condition-2: - componentRef: - name: comp-condition-2 - dependentTasks: - - bool-identity - - feature-transform-engine - - wide-and-deep-trainer - inputs: - artifacts: - pipelinechannel--wide-and-deep-trainer-unmanaged_container_model: - taskOutputArtifact: - outputArtifactKey: unmanaged_container_model - producerTask: wide-and-deep-trainer - parameters: - pipelinechannel--bool-identity-Output: - taskOutputParameter: - outputParameterKey: Output - producerTask: bool-identity - pipelinechannel--dataflow_service_account: - componentInputParameter: pipelinechannel--dataflow_service_account - pipelinechannel--dataflow_subnetwork: - componentInputParameter: pipelinechannel--dataflow_subnetwork - pipelinechannel--dataflow_use_public_ips: - componentInputParameter: pipelinechannel--dataflow_use_public_ips - pipelinechannel--encryption_spec_key_name: - componentInputParameter: pipelinechannel--encryption_spec_key_name - pipelinechannel--evaluation_batch_predict_machine_type: - componentInputParameter: pipelinechannel--evaluation_batch_predict_machine_type - pipelinechannel--evaluation_batch_predict_max_replica_count: - componentInputParameter: pipelinechannel--evaluation_batch_predict_max_replica_count - pipelinechannel--evaluation_batch_predict_starting_replica_count: - componentInputParameter: pipelinechannel--evaluation_batch_predict_starting_replica_count - pipelinechannel--evaluation_dataflow_disk_size_gb: - componentInputParameter: pipelinechannel--evaluation_dataflow_disk_size_gb - pipelinechannel--evaluation_dataflow_machine_type: - componentInputParameter: pipelinechannel--evaluation_dataflow_machine_type - pipelinechannel--evaluation_dataflow_max_num_workers: - componentInputParameter: pipelinechannel--evaluation_dataflow_max_num_workers - pipelinechannel--evaluation_dataflow_starting_num_workers: - componentInputParameter: pipelinechannel--evaluation_dataflow_starting_num_workers - pipelinechannel--feature-transform-engine-bigquery_test_split_uri: - taskOutputParameter: - outputParameterKey: bigquery_test_split_uri - producerTask: feature-transform-engine - pipelinechannel--location: - componentInputParameter: pipelinechannel--location - pipelinechannel--prediction_type: - componentInputParameter: pipelinechannel--prediction_type - pipelinechannel--project: - componentInputParameter: pipelinechannel--project - pipelinechannel--root_dir: - componentInputParameter: pipelinechannel--root_dir - pipelinechannel--target_column: - componentInputParameter: pipelinechannel--target_column - taskInfo: - name: run-evaluation - triggerPolicy: - condition: inputs.parameter_values['pipelinechannel--bool-identity-Output'] - == 'true' - feature-transform-engine: - cachingOptions: - enableCache: true - componentRef: - name: comp-feature-transform-engine - inputs: - parameters: - bigquery_staging_full_dataset_id: - componentInputParameter: pipelinechannel--bigquery_staging_full_dataset_id - data_source_bigquery_table_path: - componentInputParameter: 
pipelinechannel--set-optional-inputs-data_source_bigquery_table_path - data_source_csv_filenames: - componentInputParameter: pipelinechannel--set-optional-inputs-data_source_csv_filenames - dataflow_disk_size_gb: - componentInputParameter: pipelinechannel--transform_dataflow_disk_size_gb - dataflow_machine_type: - componentInputParameter: pipelinechannel--transform_dataflow_machine_type - dataflow_max_num_workers: - componentInputParameter: pipelinechannel--transform_dataflow_max_num_workers - dataflow_service_account: - componentInputParameter: pipelinechannel--dataflow_service_account - dataflow_subnetwork: - componentInputParameter: pipelinechannel--dataflow_subnetwork - dataflow_use_public_ips: - componentInputParameter: pipelinechannel--dataflow_use_public_ips - dataset_level_custom_transformation_definitions: - componentInputParameter: pipelinechannel--dataset_level_custom_transformation_definitions - dataset_level_transformations: - componentInputParameter: pipelinechannel--dataset_level_transformations - encryption_spec_key_name: - componentInputParameter: pipelinechannel--encryption_spec_key_name - feature_selection_algorithm: - componentInputParameter: pipelinechannel--feature_selection_algorithm - location: - componentInputParameter: pipelinechannel--location - materialized_examples_format: - componentInputParameter: pipelinechannel--materialized_examples_format - max_selected_features: - componentInputParameter: pipelinechannel--max_selected_features - model_type: - runtimeValue: - constant: neural_network - predefined_split_key: - componentInputParameter: pipelinechannel--predefined_split_key - prediction_type: - componentInputParameter: pipelinechannel--prediction_type - project: - componentInputParameter: pipelinechannel--project - root_dir: - componentInputParameter: pipelinechannel--root_dir - run_feature_selection: - componentInputParameter: pipelinechannel--run_feature_selection - stratified_split_key: - componentInputParameter: pipelinechannel--stratified_split_key - target_column: - componentInputParameter: pipelinechannel--target_column - test_fraction: - componentInputParameter: pipelinechannel--test_fraction - tf_auto_transform_features: - componentInputParameter: pipelinechannel--tf_auto_transform_features - tf_custom_transformation_definitions: - componentInputParameter: pipelinechannel--tf_custom_transformation_definitions - tf_transform_execution_engine: - componentInputParameter: pipelinechannel--tf_transform_execution_engine - tf_transformations_path: - componentInputParameter: pipelinechannel--tf_transformations_path - training_fraction: - componentInputParameter: pipelinechannel--training_fraction - validation_fraction: - componentInputParameter: pipelinechannel--validation_fraction - weight_column: - componentInputParameter: pipelinechannel--weight_column - taskInfo: - name: feature-transform-engine - model-upload: - cachingOptions: - enableCache: true - componentRef: - name: comp-model-upload - dependentTasks: - - automl-tabular-infra-validator - - wide-and-deep-trainer - inputs: - artifacts: - parent_model: - componentInputArtifact: pipelinechannel--parent_model - unmanaged_container_model: - taskOutputArtifact: - outputArtifactKey: unmanaged_container_model - producerTask: wide-and-deep-trainer - parameters: - description: - componentInputParameter: pipelinechannel--model_description - display_name: - componentInputParameter: pipelinechannel--get-model-display-name-model_display_name - encryption_spec_key_name: - componentInputParameter: 
pipelinechannel--encryption_spec_key_name - location: - componentInputParameter: pipelinechannel--location - project: - componentInputParameter: pipelinechannel--project - taskInfo: - name: model-upload - parse-worker-pool-specs-override: - cachingOptions: - enableCache: true - componentRef: - name: comp-parse-worker-pool-specs-override - inputs: - parameters: - worker_pool_specs_override: - componentInputParameter: pipelinechannel--worker_pool_specs_override - taskInfo: - name: parse-worker-pool-specs-override - split-materialized-data: - cachingOptions: - enableCache: true - componentRef: - name: comp-split-materialized-data - dependentTasks: - - feature-transform-engine - inputs: - artifacts: - materialized_data: - taskOutputArtifact: - outputArtifactKey: materialized_data - producerTask: feature-transform-engine - taskInfo: - name: split-materialized-data - training-configurator-and-validator: - cachingOptions: - enableCache: true - componentRef: - name: comp-training-configurator-and-validator - dependentTasks: - - feature-transform-engine - inputs: - artifacts: - dataset_stats: - taskOutputArtifact: - outputArtifactKey: dataset_stats - producerTask: feature-transform-engine - instance_schema: - taskOutputArtifact: - outputArtifactKey: instance_schema - producerTask: feature-transform-engine - training_schema: - taskOutputArtifact: - outputArtifactKey: training_schema - producerTask: feature-transform-engine - parameters: - prediction_type: - componentInputParameter: pipelinechannel--prediction_type - run_evaluation: - componentInputParameter: pipelinechannel--run_evaluation - split_example_counts: - taskOutputParameter: - outputParameterKey: split_example_counts - producerTask: feature-transform-engine - target_column: - componentInputParameter: pipelinechannel--target_column - weight_column: - componentInputParameter: pipelinechannel--weight_column - taskInfo: - name: training-configurator-and-validator - wide-and-deep-trainer: - cachingOptions: - enableCache: true - componentRef: - name: comp-wide-and-deep-trainer - dependentTasks: - - feature-transform-engine - - parse-worker-pool-specs-override - - split-materialized-data - - training-configurator-and-validator - inputs: - artifacts: - instance_baseline: - taskOutputArtifact: - outputArtifactKey: instance_baseline - producerTask: training-configurator-and-validator - materialized_eval_split: - taskOutputArtifact: - outputArtifactKey: materialized_eval_split - producerTask: split-materialized-data - materialized_train_split: - taskOutputArtifact: - outputArtifactKey: materialized_train_split - producerTask: split-materialized-data - metadata: - taskOutputArtifact: - outputArtifactKey: metadata - producerTask: training-configurator-and-validator - training_schema_uri: - taskOutputArtifact: - outputArtifactKey: training_schema - producerTask: feature-transform-engine - transform_output: - taskOutputArtifact: - outputArtifactKey: transform_output - producerTask: feature-transform-engine - parameters: - batch_size: - componentInputParameter: pipelinechannel--batch_size - beta_1: - componentInputParameter: pipelinechannel--beta_1 - beta_2: - componentInputParameter: pipelinechannel--beta_2 - cache_data: - componentInputParameter: pipelinechannel--cache_data - dnn_beta_1: - componentInputParameter: pipelinechannel--dnn_beta_1 - dnn_beta_2: - componentInputParameter: pipelinechannel--dnn_beta_2 - dnn_dropout: - componentInputParameter: pipelinechannel--dnn_dropout - dnn_l1_regularization_strength: - componentInputParameter: 
pipelinechannel--dnn_l1_regularization_strength - dnn_l2_regularization_strength: - componentInputParameter: pipelinechannel--dnn_l2_regularization_strength - dnn_l2_shrinkage_regularization_strength: - componentInputParameter: pipelinechannel--dnn_l2_shrinkage_regularization_strength - dnn_learning_rate: - componentInputParameter: pipelinechannel--dnn_learning_rate - dnn_optimizer_type: - componentInputParameter: pipelinechannel--dnn_optimizer_type - embed_categories: - componentInputParameter: pipelinechannel--embed_categories - enable_profiler: - componentInputParameter: pipelinechannel--enable_profiler - encryption_spec_key_name: - componentInputParameter: pipelinechannel--encryption_spec_key_name - eval_frequency_secs: - componentInputParameter: pipelinechannel--eval_frequency_secs - eval_steps: - componentInputParameter: pipelinechannel--eval_steps - hidden_units: - componentInputParameter: pipelinechannel--hidden_units - l1_regularization_strength: - componentInputParameter: pipelinechannel--l1_regularization_strength - l2_regularization_strength: - componentInputParameter: pipelinechannel--l2_regularization_strength - l2_shrinkage_regularization_strength: - componentInputParameter: pipelinechannel--l2_shrinkage_regularization_strength - learning_rate: - componentInputParameter: pipelinechannel--learning_rate - location: - componentInputParameter: pipelinechannel--location - max_steps: - componentInputParameter: pipelinechannel--max_steps - max_train_secs: - componentInputParameter: pipelinechannel--max_train_secs - measurement_selection_type: - componentInputParameter: pipelinechannel--measurement_selection_type - optimization_metric: - componentInputParameter: pipelinechannel--optimization_metric - optimizer_type: - componentInputParameter: pipelinechannel--optimizer_type - prediction_type: - componentInputParameter: pipelinechannel--prediction_type - project: - componentInputParameter: pipelinechannel--project - root_dir: - componentInputParameter: pipelinechannel--root_dir - seed: - componentInputParameter: pipelinechannel--seed - target_column: - componentInputParameter: pipelinechannel--target_column - training_disk_spec: - taskOutputParameter: - outputParameterKey: training_disk_spec - producerTask: parse-worker-pool-specs-override - training_machine_spec: - taskOutputParameter: - outputParameterKey: training_machine_spec - producerTask: parse-worker-pool-specs-override - use_wide: - componentInputParameter: pipelinechannel--use_wide - weight_column: - componentInputParameter: pipelinechannel--weight_column - taskInfo: - name: wide-and-deep-trainer - inputDefinitions: - artifacts: - pipelinechannel--parent_model: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - parameters: - pipelinechannel--batch_size: - parameterType: NUMBER_INTEGER - pipelinechannel--beta_1: - parameterType: NUMBER_DOUBLE - pipelinechannel--beta_2: - parameterType: NUMBER_DOUBLE - pipelinechannel--bigquery_staging_full_dataset_id: - parameterType: STRING - pipelinechannel--cache_data: - parameterType: STRING - pipelinechannel--dataflow_service_account: - parameterType: STRING - pipelinechannel--dataflow_subnetwork: - parameterType: STRING - pipelinechannel--dataflow_use_public_ips: - parameterType: BOOLEAN - pipelinechannel--dataset_level_custom_transformation_definitions: - parameterType: LIST - pipelinechannel--dataset_level_transformations: - parameterType: LIST - pipelinechannel--dnn_beta_1: - parameterType: NUMBER_DOUBLE - pipelinechannel--dnn_beta_2: - parameterType: 
NUMBER_DOUBLE - pipelinechannel--dnn_dropout: - parameterType: NUMBER_DOUBLE - pipelinechannel--dnn_l1_regularization_strength: - parameterType: NUMBER_DOUBLE - pipelinechannel--dnn_l2_regularization_strength: - parameterType: NUMBER_DOUBLE - pipelinechannel--dnn_l2_shrinkage_regularization_strength: - parameterType: NUMBER_DOUBLE - pipelinechannel--dnn_learning_rate: - parameterType: NUMBER_DOUBLE - pipelinechannel--dnn_optimizer_type: - parameterType: STRING - pipelinechannel--embed_categories: - parameterType: BOOLEAN - pipelinechannel--enable_profiler: - parameterType: BOOLEAN - pipelinechannel--encryption_spec_key_name: - parameterType: STRING - pipelinechannel--eval_frequency_secs: - parameterType: NUMBER_INTEGER - pipelinechannel--eval_steps: - parameterType: NUMBER_INTEGER - pipelinechannel--evaluation_batch_predict_machine_type: - parameterType: STRING - pipelinechannel--evaluation_batch_predict_max_replica_count: - parameterType: NUMBER_INTEGER - pipelinechannel--evaluation_batch_predict_starting_replica_count: - parameterType: NUMBER_INTEGER - pipelinechannel--evaluation_dataflow_disk_size_gb: - parameterType: NUMBER_INTEGER - pipelinechannel--evaluation_dataflow_machine_type: - parameterType: STRING - pipelinechannel--evaluation_dataflow_max_num_workers: - parameterType: NUMBER_INTEGER - pipelinechannel--evaluation_dataflow_starting_num_workers: - parameterType: NUMBER_INTEGER - pipelinechannel--feature_selection_algorithm: - parameterType: STRING - pipelinechannel--get-model-display-name-model_display_name: - parameterType: STRING - pipelinechannel--hidden_units: - parameterType: STRING - pipelinechannel--l1_regularization_strength: - parameterType: NUMBER_DOUBLE - pipelinechannel--l2_regularization_strength: - parameterType: NUMBER_DOUBLE - pipelinechannel--l2_shrinkage_regularization_strength: - parameterType: NUMBER_DOUBLE - pipelinechannel--learning_rate: - parameterType: NUMBER_DOUBLE - pipelinechannel--location: - parameterType: STRING - pipelinechannel--materialized_examples_format: - parameterType: STRING - pipelinechannel--max_selected_features: - parameterType: NUMBER_INTEGER - pipelinechannel--max_steps: - parameterType: NUMBER_INTEGER - pipelinechannel--max_train_secs: - parameterType: NUMBER_INTEGER - pipelinechannel--measurement_selection_type: - parameterType: STRING - pipelinechannel--model_description: - parameterType: STRING - pipelinechannel--optimization_metric: - parameterType: STRING - pipelinechannel--optimizer_type: - parameterType: STRING - pipelinechannel--predefined_split_key: - parameterType: STRING - pipelinechannel--prediction_type: - parameterType: STRING - pipelinechannel--project: - parameterType: STRING - pipelinechannel--root_dir: - parameterType: STRING - pipelinechannel--run_evaluation: - parameterType: BOOLEAN - pipelinechannel--run_feature_selection: - parameterType: BOOLEAN - pipelinechannel--seed: - parameterType: NUMBER_INTEGER - pipelinechannel--set-optional-inputs-data_source_bigquery_table_path: - parameterType: STRING - pipelinechannel--set-optional-inputs-data_source_csv_filenames: - parameterType: STRING - pipelinechannel--stratified_split_key: - parameterType: STRING - pipelinechannel--target_column: - parameterType: STRING - pipelinechannel--test_fraction: - parameterType: NUMBER_DOUBLE - pipelinechannel--tf_auto_transform_features: - parameterType: STRUCT - pipelinechannel--tf_custom_transformation_definitions: - parameterType: LIST - pipelinechannel--tf_transform_execution_engine: - parameterType: STRING - 
pipelinechannel--tf_transformations_path: - parameterType: STRING - pipelinechannel--training_fraction: - parameterType: NUMBER_DOUBLE - pipelinechannel--transform_dataflow_disk_size_gb: - parameterType: NUMBER_INTEGER - pipelinechannel--transform_dataflow_machine_type: - parameterType: STRING - pipelinechannel--transform_dataflow_max_num_workers: - parameterType: NUMBER_INTEGER - pipelinechannel--use_wide: - parameterType: BOOLEAN - pipelinechannel--validation_fraction: - parameterType: NUMBER_DOUBLE - pipelinechannel--weight_column: - parameterType: STRING - pipelinechannel--worker_pool_specs_override: - parameterType: LIST - outputDefinitions: - artifacts: - model-evaluation-evaluation_metrics: - artifactType: - schemaTitle: system.Metrics - schemaVersion: 0.0.1 - comp-feature-transform-engine: - executorLabel: exec-feature-transform-engine - inputDefinitions: - parameters: - autodetect_csv_schema: - defaultValue: false - description: 'If True, infers the column types - - when importing CSVs into BigQuery.' - isOptional: true - parameterType: BOOLEAN - bigquery_staging_full_dataset_id: - defaultValue: '' - description: Dataset in "projectId.datasetId" format for storing intermediate-FTE - BigQuery tables. If the specified dataset does not exist in BigQuery, - FTE will create the dataset. If no bigquery_staging_full_dataset_id is - specified, all intermediate tables will be stored in a dataset created - under the provided project in the input data source's location during - FTE execution called "vertex_feature_transform_engine_staging_{location.replace('-', - '_')}". All tables generated by FTE will have a 30 day TTL. - isOptional: true - parameterType: STRING - data_source_bigquery_table_path: - defaultValue: '' - description: BigQuery input data source to run feature transform on. - isOptional: true - parameterType: STRING - data_source_csv_filenames: - defaultValue: '' - description: CSV input data source to run feature transform on. - isOptional: true - parameterType: STRING - dataflow_disk_size_gb: - defaultValue: 40.0 - description: The disk size, in gigabytes, to use on each Dataflow worker - instance. If not set, default to 40. - isOptional: true - parameterType: NUMBER_INTEGER - dataflow_machine_type: - defaultValue: n1-standard-16 - description: The machine type used for dataflow jobs. If not set, default - to n1-standard-16. - isOptional: true - parameterType: STRING - dataflow_max_num_workers: - defaultValue: 25.0 - description: The number of workers to run the dataflow job. If not set, - default to 25. - isOptional: true - parameterType: NUMBER_INTEGER - dataflow_service_account: - defaultValue: '' - description: Custom service account to run Dataflow jobs. - isOptional: true - parameterType: STRING - dataflow_subnetwork: - defaultValue: '' - description: 'Dataflow''s fully qualified subnetwork name, when empty the - default subnetwork will be used. More details: https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications' - isOptional: true - parameterType: STRING - dataflow_use_public_ips: - defaultValue: true - description: Specifies whether Dataflow workers use public IP addresses. - isOptional: true - parameterType: BOOLEAN - dataset_level_custom_transformation_definitions: - defaultValue: [] - description: 'List of dataset-level custom transformation definitions. 
Custom, - bring-your-own dataset-level transform functions, where users can define - and import their own transform function and use it with FTE''s built-in - transformations. Using custom transformations is an experimental feature - and it is currently not supported during batch prediction. - - [ { "transformation": "ConcatCols", "module_path": "/path/to/custom_transform_fn_dlt.py", - "function_name": "concat_cols" } ] Using custom transform function together - with FTE''s built-in transformations: .. code-block:: python [ { "transformation": - "Join", "right_table_uri": "bq://test-project.dataset_test.table", "join_keys": - [["join_key_col", "join_key_col"]] },{ "transformation": "ConcatCols", - "cols": ["feature_1", "feature_2"], "output_col": "feature_1_2" } ]' - isOptional: true - parameterType: LIST - dataset_level_transformations: - defaultValue: [] - description: "List of dataset-level transformations.\n[ { \"transformation\"\ - : \"Join\", \"right_table_uri\": \"bq://test-project.dataset_test.table\"\ - , \"join_keys\": [[\"join_key_col\", \"join_key_col\"]] }, ... ] Additional\ - \ information about FTE's currently supported built-in\n transformations:\n\ - \ Join: Joins features from right_table_uri. For each join key, the\ - \ left table keys will be included and the right table keys will be dropped.\n\ - \ Example: .. code-block:: python { \"transformation\": \"Join\"\ - , \"right_table_uri\": \"bq://test-project.dataset_test.table\", \"join_keys\"\ - : [[\"join_key_col\", \"join_key_col\"]] }\n Arguments:\n \ - \ right_table_uri: Right table BigQuery uri to join with input_full_table_id.\n\ - \ join_keys: Features to join on. For each nested list, the\ - \ first element is a left table column and the second is its corresponding\ - \ right table column.\n TimeAggregate: Creates a new feature composed\ - \ of values of an existing feature from a fixed time period ago or in\ - \ the future.\n Ex: A feature for sales by store 1 year ago.\n \ - \ Example: .. code-block:: python { \"transformation\": \"TimeAggregate\"\ - , \"time_difference\": 40, \"time_difference_units\": \"DAY\", \"time_series_identifier_columns\"\ - : [\"store_id\"], \"time_column\": \"time_col\", \"time_difference_target_column\"\ - : \"target_col\", \"output_column\": \"output_col\" }\n Arguments:\n\ - \ time_difference: Number of time_difference_units to look\ - \ back or into the future on our time_difference_target_column.\n \ - \ time_difference_units: Units of time_difference to look back\ - \ or into the future on our time_difference_target_column. Must be one\ - \ of * 'DAY' * 'WEEK' (Equivalent to 7 DAYs) * 'MONTH' * 'QUARTER' * 'YEAR'\n\ - \ time_series_identifier_columns: Names of the time series\ - \ identifier columns.\n time_column: Name of the time column.\n\ - \ time_difference_target_column: Column we wish to get the\ - \ value of time_difference time_difference_units in the past or future.\n\ - \ output_column: Name of our new time aggregate feature.\n\ - \ is_future: Whether we wish to look forward in time. Defaults\ - \ to False. PartitionByMax/PartitionByMin/PartitionByAvg/PartitionBySum:\ - \ Performs a partition by reduce operation (one of max, min, avg, or sum)\ - \ with a fixed historic time period. Ex: Getting avg sales (the reduce\ - \ column) for each store (partition_by_column) over the previous 5 days\ - \ (time_column, time_ago_units, and time_ago).\n Example: .. 
code-block::\ - \ python { \"transformation\": \"PartitionByMax\", \"reduce_column\"\ - : \"sell_price\", \"partition_by_columns\": [\"store_id\", \"state_id\"\ - ], \"time_column\": \"date\", \"time_ago\": 1, \"time_ago_units\": \"\ - WEEK\", \"output_column\": \"partition_by_reduce_max_output\" }\n \ - \ Arguments:\n reduce_column: Column to apply the reduce\ - \ operation on. Reduce operations include the\n following:\ - \ Max, Min, Avg, Sum.\n partition_by_columns: List of columns\ - \ to partition by.\n time_column: Time column for the partition\ - \ by operation's window function.\n time_ago: Number of time_ago_units\ - \ to look back on our target_column, starting from time_column (inclusive).\n\ - \ time_ago_units: Units of time_ago to look back on our target_column.\ - \ Must be one of * 'DAY' * 'WEEK'\n output_column: Name of\ - \ our output feature." - isOptional: true - parameterType: LIST - encryption_spec_key_name: - defaultValue: '' - description: Customer-managed encryption key. - isOptional: true - parameterType: STRING - feature_selection_algorithm: - defaultValue: AMI - description: "The algorithm of feature selection. One of \"AMI\", \"CMIM\"\ - , \"JMIM\", \"MRMR\", default to be \"AMI\". The algorithms available\ - \ are: AMI(Adjusted Mutual Information):\nReference: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_mutual_info_score.html\ - \ Arrays are not yet supported in this algorithm. CMIM(Conditional Mutual\ - \ Information Maximization): Reference paper: Mohamed Bennasar, Yulia\ - \ Hicks, Rossitza Setchi, \u201CFeature selection using Joint Mutual Information\ - \ Maximisation,\u201D Expert Systems with Applications, vol. 42, issue\ - \ 22, 1 December 2015, Pages 8520-8532. JMIM(Joint Mutual Information\ - \ Maximization\nReference:\n paper: Mohamed Bennasar, Yulia Hicks, Rossitza\ - \ Setchi, \u201CFeature selection using Joint Mutual Information Maximisation,\u201D\ - \ Expert Systems with Applications, vol. 42, issue 22, 1 December 2015,\ - \ Pages 8520-8532. MRMR(MIQ Minimum-redundancy Maximum-relevance): Reference\ - \ paper: Hanchuan Peng, Fuhui Long, and Chris Ding. \"Feature selection\ - \ based on mutual information criteria of max-dependency, max-relevance,\ - \ and min-redundancy.\" IEEE Transactions on pattern analysis and machine\ - \ intelligence 27, no.\n 8: 1226-1238." - isOptional: true - parameterType: STRING - feature_selection_execution_engine: - defaultValue: dataflow - description: Execution engine to run feature selection, value can be dataflow, - bigquery. - isOptional: true - parameterType: STRING - forecasting_apply_windowing: - defaultValue: true - description: Whether to apply window strategy. - isOptional: true - parameterType: BOOLEAN - forecasting_available_at_forecast_columns: - defaultValue: [] - description: Forecasting available at forecast columns. - isOptional: true - parameterType: LIST - forecasting_context_window: - defaultValue: -1.0 - description: Forecasting context window. - isOptional: true - parameterType: NUMBER_INTEGER - forecasting_forecast_horizon: - defaultValue: -1.0 - description: Forecasting horizon. - isOptional: true - parameterType: NUMBER_INTEGER - forecasting_holiday_regions: - defaultValue: [] - description: 'The geographical region based on which the holiday effect - is applied in modeling by adding holiday categorical array feature that - include all holidays matching the date. This option only allowed when - data granularity is day. 
By default, holiday effect modeling is disabled. - To turn it on, specify the holiday region using this option. - - Top level: * ''GLOBAL'' - - Second level: continental regions: * ''NA'': North America - - * ''JAPAC'': Japan and Asia Pacific - - * ''EMEA'': Europe, the Middle East and Africa - - * ''LAC'': Latin America and the Caribbean - - Third level: countries from ISO 3166-1 Country codes. - - Valid regions: * ''GLOBAL'' * ''NA'' * ''JAPAC'' * ''EMEA'' * ''LAC'' - * ''AE'' - - * ''AR'' * ''AT'' * ''AU'' * ''BE'' * ''BR'' * ''CA'' * ''CH'' * ''CL'' - * ''CN'' * ''CO'' - - * ''CZ'' * ''DE'' * ''DK'' * ''DZ'' * ''EC'' * ''EE'' * ''EG'' * ''ES'' - * ''FI'' * ''FR'' - - * ''GB'' * ''GR'' * ''HK'' * ''HU'' * ''ID'' * ''IE'' * ''IL'' * ''IN'' - * ''IR'' * ''IT'' - - * ''JP'' * ''KR'' * ''LV'' * ''MA'' * ''MX'' * ''MY'' * ''NG'' * ''NL'' - * ''NO'' * ''NZ'' - - * ''PE'' * ''PH'' * ''PK'' * ''PL'' * ''PT'' * ''RO'' * ''RS'' * ''RU'' - * ''SA'' * ''SE'' - - * ''SG'' * ''SI'' * ''SK'' * ''TH'' * ''TR'' * ''TW'' * ''UA'' * ''US'' - * ''VE'' * ''VN'' - - * ''ZA''' - isOptional: true - parameterType: LIST - forecasting_predefined_window_column: - defaultValue: '' - description: Forecasting predefined window column. - isOptional: true - parameterType: STRING - forecasting_time_column: - defaultValue: '' - description: Forecasting time column. - isOptional: true - parameterType: STRING - forecasting_time_series_attribute_columns: - defaultValue: [] - description: Forecasting time series attribute columns. - isOptional: true - parameterType: LIST - forecasting_time_series_identifier_column: - description: '[Deprecated] A forecasting time series identifier column. - Raises an exception if used - use the "time_series_identifier_column" - field instead.' - isOptional: true - parameterType: STRING - forecasting_time_series_identifier_columns: - defaultValue: [] - description: The list of forecasting time series identifier columns. - isOptional: true - parameterType: LIST - forecasting_unavailable_at_forecast_columns: - defaultValue: [] - description: Forecasting unavailable at forecast columns. - isOptional: true - parameterType: LIST - forecasting_window_max_count: - defaultValue: -1.0 - description: Forecasting window max count. - isOptional: true - parameterType: NUMBER_INTEGER - forecasting_window_stride_length: - defaultValue: -1.0 - description: Forecasting window stride length. - isOptional: true - parameterType: NUMBER_INTEGER - group_columns: - isOptional: true - parameterType: LIST - group_temporal_total_weight: - defaultValue: 0.0 - isOptional: true - parameterType: NUMBER_DOUBLE - group_total_weight: - defaultValue: 0.0 - isOptional: true - parameterType: NUMBER_DOUBLE - legacy_transformations_path: - defaultValue: '' - isOptional: true - parameterType: STRING - location: - description: Location for the created GCP services. - parameterType: STRING - materialized_examples_format: - defaultValue: tfrecords_gzip - description: The format to use for the materialized examples. Should be - either 'tfrecords_gzip' (default) or 'parquet'. - isOptional: true - parameterType: STRING - max_selected_features: - defaultValue: 1000.0 - description: Maximum number of features to select. If specified, the transform - config will be purged by only using the selected features that ranked - top in the feature ranking, which has the ranking value for all supported - features. 
If the number of input features is smaller than max_selected_features - specified, we will still run the feature selection process and generate - the feature ranking, no features will be excluded. The value will be - set to 1000 by default if run_feature_selection is enabled. - isOptional: true - parameterType: NUMBER_INTEGER - model_type: - description: 'Model type, which we wish to engineer features for. Can be - one of: neural_network, boosted_trees, l2l, seq2seq, tft, or tide. Defaults - to the empty value, `None`.' - isOptional: true - parameterType: STRING - multimodal_image_columns: - defaultValue: [] - description: List of multimodal image columns. Defaults to an empty list. - isOptional: true - parameterType: LIST - multimodal_tabular_columns: - defaultValue: [] - description: List of multimodal tabular columns. Defaults to an empty list - isOptional: true - parameterType: LIST - multimodal_text_columns: - defaultValue: [] - description: List of multimodal text columns. Defaults to an empty list - isOptional: true - parameterType: LIST - multimodal_timeseries_columns: - defaultValue: [] - description: List of multimodal timeseries columns. Defaults to an empty - list - isOptional: true - parameterType: LIST - predefined_split_key: - defaultValue: '' - description: Predefined split key. - isOptional: true - parameterType: STRING - prediction_type: - defaultValue: '' - description: Model prediction type. One of "classification", "regression", - "time_series". - isOptional: true - parameterType: STRING - project: - description: Project to run feature transform engine. - parameterType: STRING - root_dir: - description: The Cloud Storage location to store the output. - parameterType: STRING - run_distill: - defaultValue: false - description: (deprecated) Whether the distillation should be applied to - the training. - isOptional: true - parameterType: BOOLEAN - run_feature_selection: - defaultValue: false - description: Whether the feature selection should be applied to the dataset. - isOptional: true - parameterType: BOOLEAN - stats_gen_execution_engine: - defaultValue: dataflow - description: 'Execution engine to perform statistics generation. Can be - one of: "dataflow" (by default) or "bigquery". Using "bigquery" as the - execution engine is experimental.' - isOptional: true - parameterType: STRING - stratified_split_key: - defaultValue: '' - description: Stratified split key. - isOptional: true - parameterType: STRING - target_column: - defaultValue: '' - description: Target column of input data. - isOptional: true - parameterType: STRING - temporal_total_weight: - defaultValue: 0.0 - isOptional: true - parameterType: NUMBER_DOUBLE - test_fraction: - defaultValue: -1.0 - description: Fraction of input data for testing. - isOptional: true - parameterType: NUMBER_DOUBLE - tf_auto_transform_features: - defaultValue: {} - description: 'Dict mapping auto and/or type-resolutions to TF transform - features. FTE will automatically configure a set of built-in transformations - for each feature based on its data statistics. If users do not want auto - type resolution, but want the set of transformations for a given type - to be automatically generated, they may specify pre-resolved transformations - types. The following type hint dict keys are supported: * ''auto'' * ''categorical'' - * ''numeric'' * ''text'' * ''timestamp'' Example: `{ "auto": ["feature1"], - "categorical": ["feature2", "feature3"], }`. 
Note that the target and - weight column may not be included as an auto transformation unless users - are running forecasting.' - isOptional: true - parameterType: STRUCT - tf_custom_transformation_definitions: - defaultValue: [] - description: 'List of TensorFlow-based custom transformation definitions. Custom, - bring-your-own transform functions, where users can define and import - their own transform function and use it with FTE''s built-in transformations. - `[ { "transformation": "PlusOne", "module_path": "gs://bucket/custom_transform_fn.py", - "function_name": "plus_one_transform" }, { "transformation": "MultiplyTwo", - "module_path": "gs://bucket/custom_transform_fn.py", "function_name": - "multiply_two_transform" } ] Using custom transform function together - with FTE''s built-in transformations: .. code-block:: python [ { "transformation": - "CastToFloat", "input_columns": ["feature_1"], "output_columns": ["feature_1"] - },{ "transformation": "PlusOne", "input_columns": ["feature_1"] "output_columns": - ["feature_1_plused_one"] },{ "transformation": "MultiplyTwo", "input_columns": - ["feature_1"] "output_columns": ["feature_1_multiplied_two"] } ]' - isOptional: true - parameterType: LIST - tf_transform_execution_engine: - defaultValue: dataflow - description: 'Execution engine to perform row-level TF transformations. - Can be one of: "dataflow" (by default) or "bigquery". Using "bigquery" - as the execution engine is experimental and is for allowlisted customers - only. In addition, executing on "bigquery" only supports auto transformations - (i.e., specified by tf_auto_transform_features) and will raise an error - when tf_custom_transformation_definitions or tf_transformations_path is - set.' - isOptional: true - parameterType: STRING - tf_transformations_path: - defaultValue: '' - description: "Path to TensorFlow-based transformation configuration. Path\ - \ to a JSON file used to specified FTE's TF transformation configurations.\ - \ In the following, we provide some sample transform configurations to\ - \ demonstrate FTE's capabilities. All transformations on input columns\ - \ are explicitly specified with FTE's built-in transformations. Chaining\ - \ of multiple transformations on a single column is also supported. For\ - \ example: .. code-block:: python [ { \"transformation\": \"ZScale\"\ - , \"input_columns\": [\"feature_1\"] }, { \"transformation\": \"ZScale\"\ - , \"input_columns\": [\"feature_2\"] } ]`. Additional information about\ - \ FTE's currently supported built-in\ntransformations:\nDatetime: Extracts\ - \ datetime featues from a column containing timestamp strings.\n Example:\ - \ .. code-block:: python { \"transformation\": \"Datetime\", \"input_columns\"\ - : [\"feature_1\"], \"time_format\": \"%Y-%m-%d\" }\n Arguments:\n \ - \ input_columns: A list with a single column to perform the datetime\ - \ transformation on.\n output_columns: Names of output columns,\ - \ one for each datetime_features element.\n time_format: Datetime\ - \ format string. Time format is a combination of Date + Time Delimiter\ - \ (optional) + Time (optional) directives. 
Valid date directives are as\ - \ follows * '%Y-%m-%d' # 2018-11-30 * '%Y/%m/%d' # 2018/11/30 * '%y-%m-%d'\ - \ # 18-11-30 * '%y/%m/%d' # 18/11/30 * '%m-%d-%Y' # 11-30-2018 * '%m/%d/%Y'\ - \ # 11/30/2018 * '%m-%d-%y' # 11-30-18 * '%m/%d/%y' # 11/30/18 * '%d-%m-%Y'\ - \ # 30-11-2018 * '%d/%m/%Y' # 30/11/2018 * '%d-%B-%Y' # 30-November-2018\ - \ * '%d-%m-%y' # 30-11-18 * '%d/%m/%y' # 30/11/18 * '%d-%B-%y' # 30-November-18\ - \ * '%d%m%Y' # 30112018 * '%m%d%Y' # 11302018 * '%Y%m%d' # 20181130\ - \ Valid time delimiters are as follows * 'T' * ' ' Valid time directives\ - \ are as follows * '%H:%M' # 23:59 * '%H:%M:%S' #\n \ - \ 23:59:58 * '%H:%M:%S.%f' # 23:59:58[.123456] * '%H:%M:%S.%f%z'\ - \ # 23:59:58[.123456]+0000 * '%H:%M:%S%z', # 23:59:58+0000\n \ - \ datetime_features: List of datetime features to be extract. Each entry\ - \ must be one of * 'YEAR' * 'MONTH' * 'DAY' * 'DAY_OF_WEEK' * 'DAY_OF_YEAR'\ - \ * 'WEEK_OF_YEAR' * 'QUARTER' * 'HOUR' * 'MINUTE' * 'SECOND' Defaults\ - \ to ['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'DAY_OF_YEAR', 'WEEK_OF_YEAR']\n\ - Log: Performs the natural log on a numeric column.\n Example: .. code-block::\ - \ python { \"transformation\": \"Log\", \"input_columns\": [\"feature_1\"\ - ] }\n Arguments:\n input_columns: A list with a single column\ - \ to perform the log transformation on.\n output_columns: A list\ - \ with a single output column name, corresponding to the output of our\ - \ transformation.\nZScale: Performs Z-scale normalization on a numeric\ - \ column.\n Example: .. code-block:: python { \"transformation\"\ - : \"ZScale\", \"input_columns\": [\"feature_1\"] }\n Arguments:\n \ - \ input_columns: A list with a single column to perform the z-scale\ - \ transformation on.\n output_columns: A list with a single output\ - \ column name, corresponding to the output of our transformation.\nVocabulary:\ - \ Converts strings to integers, where each unique string gets a unique\ - \ integer representation.\n Example: .. code-block:: python { \"\ - transformation\": \"Vocabulary\", \"input_columns\": [\"feature_1\"] }\n\ - \ Arguments:\n input_columns: A list with a single column to\ - \ perform the vocabulary transformation on.\n output_columns: A\ - \ list with a single output column name, corresponding to the output of\ - \ our transformation.\n top_k: Number of the most frequent words\ - \ in the vocabulary to use for generating dictionary lookup indices. If\ - \ not specified, all words in the vocabulary will be used. Defaults to\ - \ None.\n frequency_threshold: Limit the vocabulary only to words\ - \ whose number of occurrences in the input exceeds frequency_threshold.\ - \ If not specified, all words in the vocabulary will be included. If both\ - \ top_k and frequency_threshold are specified, a word must satisfy both\ - \ conditions to be included. Defaults to None.\nCategorical: Transforms\ - \ categorical columns to integer columns.\n Example: .. code-block::\ - \ python { \"transformation\": \"Categorical\", \"input_columns\": [\"\ - feature_1\"], \"top_k\": 10 }\n Arguments:\n input_columns:\ - \ A list with a single column to perform the categorical transformation\ - \ on.\n output_columns: A list with a single output column name,\ - \ corresponding to the output of our transformation.\n top_k: Number\ - \ of the most frequent words in the vocabulary to use for generating dictionary\ - \ lookup indices. 
If not specified, all words in the vocabulary will be\ - \ used.\n frequency_threshold: Limit the vocabulary only to words\ - \ whose number of occurrences in the input exceeds frequency_threshold.\ - \ If not specified, all words in the vocabulary will be included. If both\ - \ top_k and frequency_threshold are specified, a word must satisfy both\ - \ conditions to be included.\nReduce: Given a column where each entry\ - \ is a numeric array, reduces arrays according to our reduce_mode.\n \ - \ Example: .. code-block:: python { \"transformation\": \"Reduce\"\ - , \"input_columns\": [\"feature_1\"], \"reduce_mode\": \"MEAN\", \"output_columns\"\ - : [\"feature_1_mean\"] }\n Arguments:\n input_columns: A list\ - \ with a single column to perform the reduce transformation on.\n \ - \ output_columns: A list with a single output column name, corresponding\ - \ to the output of our transformation.\n reduce_mode: One of *\ - \ 'MAX' * 'MIN' * 'MEAN' * 'LAST_K' Defaults to 'MEAN'.\n last_k:\ - \ The number of last k elements when 'LAST_K' reduce mode is used. Defaults\ - \ to 1.\nSplitString: Given a column of strings, splits strings into token\ - \ arrays.\n Example: .. code-block:: python { \"transformation\"\ - : \"SplitString\", \"input_columns\": [\"feature_1\"], \"separator\":\ - \ \"$\" }\n Arguments:\n input_columns: A list with a single\ - \ column to perform the split string transformation on.\n output_columns:\ - \ A list with a single output column name, corresponding to the output\ - \ of our transformation.\n separator: Separator to split input\ - \ string into tokens. Defaults to ' '.\n missing_token: Missing\ - \ token to use when no string is included. Defaults to ' _MISSING_ '.\n\ - NGram: Given a column of strings, splits strings into token arrays where\ - \ each token is an integer.\n Example: .. code-block:: python { \"\ - transformation\": \"NGram\", \"input_columns\": [\"feature_1\"], \"min_ngram_size\"\ - : 1, \"max_ngram_size\": 2, \"separator\": \" \" }\n Arguments:\n \ - \ input_columns: A list with a single column to perform the n-gram\ - \ transformation on.\n output_columns: A list with a single output\ - \ column name, corresponding to the output of our transformation.\n \ - \ min_ngram_size: Minimum n-gram size. Must be a positive number\ - \ and <= max_ngram_size. Defaults to 1.\n max_ngram_size: Maximum\ - \ n-gram size. Must be a positive number and >= min_ngram_size. Defaults\ - \ to 2.\n top_k: Number of the most frequent words in the vocabulary\ - \ to use for generating dictionary lookup indices. If not specified, all\ - \ words in the vocabulary will be used. Defaults to None.\n frequency_threshold:\ - \ Limit the dictionary's vocabulary only to words whose number of occurrences\ - \ in the input exceeds frequency_threshold. If not specified, all words\ - \ in the vocabulary will be included. If both top_k and frequency_threshold\ - \ are specified, a word must satisfy both conditions to be included. Defaults\ - \ to None.\n separator: Separator to split input string into tokens.\ - \ Defaults to ' '.\n missing_token: Missing token to use when no\ - \ string is included. Defaults to ' _MISSING_ '.\nClip: Given a numeric\ - \ column, clips elements such that elements < min_value are assigned min_value,\ - \ and elements > max_value are assigned max_value.\n Example: .. 
code-block::\ - \ python { \"transformation\": \"Clip\", \"input_columns\": [\"col1\"\ - ], \"output_columns\": [\"col1_clipped\"], \"min_value\": 1., \"max_value\"\ - : 10., }\n Arguments:\n input_columns: A list with a single\ - \ column to perform the n-gram transformation on.\n output_columns:\ - \ A list with a single output column name, corresponding to the output\ - \ of our transformation.\n min_value: Number where all values below\ - \ min_value are set to min_value. If no min_value is provided, min clipping\ - \ will not occur. Defaults to None.\n max_value: Number where all\ - \ values above max_value are set to max_value If no max_value is provided,\ - \ max clipping will not occur. Defaults to None.\nMultiHotEncoding: Performs\ - \ multi-hot encoding on a categorical array column.\n Example: ..\ - \ code-block:: python { \"transformation\": \"MultiHotEncoding\", \"\ - input_columns\": [\"col1\"], } The number of classes is determened by\ - \ the largest number included in the input if it is numeric or the total\ - \ number of unique values of the input if it is type str. If the input\ - \ is has type str and an element contians separator tokens, the input\ - \ will be split at separator indices, and the each element of the split\ - \ list will be considered a seperate class. For example,\n Input: \ - \ .. code-block:: python [ [\"foo bar\"], # Example 0 [\"foo\",\ - \ \"bar\"], # Example 1 [\"foo\"], # Example 2 [\"bar\"], \ - \ # Example 3 ] Output (with default separator=\" \"): .. code-block::\ - \ python [ [1, 1], # Example 0 [1, 1], # Example 1 [1,\ - \ 0], # Example 2 [0, 1], # Example 3 ]\n Arguments:\n\ - \ input_columns: A list with a single column to perform the multi-hot-encoding\ - \ on.\n output_columns: A list with a single output column name,\ - \ corresponding to the output of our transformation.\n top_k: Number\ - \ of the most frequent words in the vocabulary to use for generating dictionary\ - \ lookup indices. If not specified, all words in the vocabulary will be\ - \ used. Defaults to None.\n frequency_threshold: Limit the dictionary's\ - \ vocabulary only to words whose number of occurrences in the input exceeds\ - \ frequency_threshold. If not specified, all words in the vocabulary will\ - \ be included. If both top_k and frequency_threshold are specified, a\ - \ word must satisfy both conditions to be included. Defaults to None.\n\ - \ separator: Separator to split input string into tokens. Defaults\ - \ to ' '.\nMaxAbsScale: Performs maximum absolute scaling on a numeric\ - \ column.\n Example: .. code-block:: python { \"transformation\"\ - : \"MaxAbsScale\", \"input_columns\": [\"col1\"], \"output_columns\":\ - \ [\"col1_max_abs_scaled\"] }\n Arguments:\n input_columns:\ - \ A list with a single column to perform max-abs-scale on.\n output_columns:\ - \ A list with a single output column name, corresponding to the output\ - \ of our transformation.\nCustom: Transformations defined in tf_custom_transformation_definitions\ - \ are included here in the TensorFlow-based transformation configuration.\ - \ For example, given the following tf_custom_transformation_definitions:\ - \ .. code-block:: python [ { \"transformation\": \"PlusX\", \"module_path\"\ - : \"gs://bucket/custom_transform_fn.py\", \"function_name\": \"plus_one_transform\"\ - \ } ] We can include the following transformation: .. 
code-block:: python\ - \ { \"transformation\": \"PlusX\", \"input_columns\": [\"col1\"], \"\ - output_columns\": [\"col1_max_abs_scaled\"] \"x\": 5 } Note that input_columns\ - \ must still be included in our arguments and output_columns is optional.\ - \ All other arguments are those defined in custom_transform_fn.py, which\ - \ includes `\"x\"` in this case. See tf_custom_transformation_definitions\ - \ above. legacy_transformations_path (Optional[str]) Deprecated. Prefer\ - \ tf_auto_transform_features. Path to a GCS file containing JSON string\ - \ for legacy style transformations. Note that legacy_transformations_path\ - \ and tf_auto_transform_features cannot both be specified." - isOptional: true - parameterType: STRING - timestamp_split_key: - defaultValue: '' - description: Timestamp split key. - isOptional: true - parameterType: STRING - training_fraction: - defaultValue: -1.0 - description: Fraction of input data for training. - isOptional: true - parameterType: NUMBER_DOUBLE - validation_fraction: - defaultValue: -1.0 - description: Fraction of input data for validation. - isOptional: true - parameterType: NUMBER_DOUBLE - weight_column: - defaultValue: '' - description: Weight column of input data. - isOptional: true - parameterType: STRING - outputDefinitions: - artifacts: - dataset_stats: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The stats of the dataset. - feature_ranking: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The ranking of features, all features supported in the dataset - will be included. For "AMI" algorithm, array features won't be available - in the ranking as arrays are not supported yet. - instance_schema: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - materialized_data: - artifactType: - schemaTitle: system.Dataset - schemaVersion: 0.0.1 - description: The materialized dataset. - training_schema: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - transform_output: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The transform output artifact. - parameters: - bigquery_downsampled_test_split_uri: - description: BigQuery URI for the downsampled test split to pass to the - batch prediction component during batch explain. - parameterType: STRING - bigquery_test_split_uri: - description: BigQuery URI for the test split to pass to the batch prediction - component during evaluation. - parameterType: STRING - bigquery_train_split_uri: - description: BigQuery URI for the train split to pass to the batch prediction - component during distillation. - parameterType: STRING - bigquery_validation_split_uri: - description: BigQuery URI for the validation split to pass to the batch - prediction component during distillation. - parameterType: STRING - gcp_resources: - description: GCP resources created by this component. For more details, - see https://github.com/kubeflow/pipelines/blob/master/components/google-cloud/google_cloud_pipeline_components/proto/README.md. - parameterType: STRING - split_example_counts: - description: JSON string of data split example counts for train, validate, - and test splits. 
- parameterType: STRING - comp-get-model-display-name: - executorLabel: exec-get-model-display-name - inputDefinitions: - parameters: - model_display_name: - parameterType: STRING - outputDefinitions: - parameters: - model_display_name: - parameterType: STRING - comp-model-batch-predict: - executorLabel: exec-model-batch-predict - inputDefinitions: - artifacts: - model: - artifactType: - schemaTitle: google.VertexModel - schemaVersion: 0.0.1 - description: 'The Model used to get predictions via this job. Must share - the same - - ancestor Location. Starting this job has no impact on any existing - - deployments of the Model and their resources. Either this or - - `unmanaged_container_model` must be specified.' - isOptional: true - unmanaged_container_model: - artifactType: - schemaTitle: google.UnmanagedContainerModel - schemaVersion: 0.0.1 - description: 'The unmanaged container model used to get predictions via - this job. - - This should be used for models that are not uploaded to Vertex. Either - - this or model must be specified.' - isOptional: true - parameters: - accelerator_count: - defaultValue: 0.0 - description: 'The number of accelerators to attach - - to the `machine_type`. Only used if `machine_type` is set. For more - - details about the machine spec, see - - https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec' - isOptional: true - parameterType: NUMBER_INTEGER - accelerator_type: - defaultValue: '' - description: 'The type of accelerator(s) that may be - - attached to the machine as per `accelerator_count`. Only used if - - `machine_type` is set. For more details about the machine spec, see - - https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec' - isOptional: true - parameterType: STRING - bigquery_destination_output_uri: - defaultValue: '' - description: 'The BigQuery project location where the output is to be written - to. In - - the given project a new dataset is created with name - - `prediction__` where is made - - BigQuery-dataset-name compatible (for example, most special characters - - become underscores), and timestamp is in YYYY_MM_DDThh_mm_ss_sssZ - - "based on ISO-8601" format. In the dataset two tables will be created, - - `predictions`, and `errors`. If the Model has both `instance` - - and `prediction` schemata defined then the tables have columns as - - follows: The `predictions` table contains instances for which the - - prediction succeeded, it has columns as per a concatenation of the - - Model''s instance and prediction schemata. The `errors` table - - contains rows for which the prediction has failed, it has instance - - columns, as per the instance schema, followed by a single "errors" - - column, which as values has [google.rpc.Status](Status) - - represented as a STRUCT, and containing only `code` and - - `message`. For more details about this output config, see - - https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#OutputConfig.' - isOptional: true - parameterType: STRING - bigquery_source_input_uri: - defaultValue: '' - description: 'BigQuery URI to a table, up to 2000 characters long. For example: - - `projectId.bqDatasetId.bqTableId` For more details about this input - - config, see - - https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#InputConfig.' 
- isOptional: true - parameterType: STRING - encryption_spec_key_name: - defaultValue: '' - description: 'Customer-managed encryption - - key options for a BatchPredictionJob. If this is set, then all - - resources created by the BatchPredictionJob will be encrypted with the - - provided encryption key. Has the form: - - `projects/my-project/locations/my-location/keyRings/my-kr/cryptoKeys/my-key`. - - The key needs to be in the same region as where the compute resource - - is created.' - isOptional: true - parameterType: STRING - excluded_fields: - defaultValue: [] - description: 'Fields that will be excluded in the prediction instance that - is - - sent to the Model. - - Excluded will be attached to the batch prediction output if - - key_field is not specified. - - When `excluded_fields` is populated, `included_fields` must be empty. - - The input must be JSONL with objects at each line, CSV, BigQuery - - or TfRecord. - - may be specified via the Model''s `parameters_schema_uri`.' - isOptional: true - parameterType: LIST - explanation_metadata: - defaultValue: {} - description: 'Explanation metadata - - configuration for this BatchPredictionJob. Can be specified only if - - `generate_explanation` is set to `True`. This value overrides the - - value of `Model.explanation_metadata`. All fields of - - `explanation_metadata` are optional in the request. If a field of the - - `explanation_metadata` object is not populated, the corresponding - - field of the `Model.explanation_metadata` object is inherited. For - - more details, see - - https://cloud.google.com/vertex-ai/docs/reference/rest/v1/ExplanationSpec#explanationmetadata.' - isOptional: true - parameterType: STRUCT - explanation_parameters: - defaultValue: {} - description: 'Parameters to configure - - explaining for Model''s predictions. Can be specified only if - - `generate_explanation` is set to `True`. This value overrides the - - value of `Model.explanation_parameters`. All fields of - - `explanation_parameters` are optional in the request. If a field of - - the `explanation_parameters` object is not populated, the - - corresponding field of the `Model.explanation_parameters` object is - - inherited. For more details, see - - https://cloud.google.com/vertex-ai/docs/reference/rest/v1/ExplanationSpec#ExplanationParameters.' - isOptional: true - parameterType: STRUCT - gcs_destination_output_uri_prefix: - defaultValue: '' - description: 'The Google Cloud - - Storage location of the directory where the output is to be written - - to. In the given directory a new directory is created. Its name is - - `prediction--`, where timestamp - - is in YYYY-MM-DDThh:mm:ss.sssZ ISO-8601 format. Inside of it files - - `predictions_0001.`, `predictions_0002.`, - - ..., `predictions_N.` are created where `` - - depends on chosen `predictions_format`, and N may equal 0001 and - - depends on the total number of successfully predicted instances. If - - the Model has both `instance` and `prediction` schemata defined - - then each such file contains predictions as per the - - `predictions_format`. If prediction for any instance failed - - (partially or completely), then an additional - - `errors_0001.`, `errors_0002.`,..., - - `errors_N.` files are created (N depends on total number - - of failed predictions). These files contain the failed instances, as - - per their schema, followed by an additional `error` field which as - - value has `google.rpc.Status` containing only `code` and - - `message` fields. 
For more details about this output config, see - - https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#OutputConfig.' - isOptional: true - parameterType: STRING - gcs_source_uris: - defaultValue: [] - description: 'Google Cloud Storage URI(-s) to your instances to run batch - prediction - - on. They must match `instances_format`. May contain wildcards. For more - - information on wildcards, see [WildcardNames](https://cloud.google.com/storage/docs/gsutil/addlhelp/WildcardNames). - - For more details about this input config, see [InputConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#InputConfig).' - isOptional: true - parameterType: LIST - generate_explanation: - defaultValue: false - description: 'Generate explanation along with - - the batch prediction results. This will cause the batch prediction - - output to include explanations based on the `prediction_format`: - - - `bigquery`: output includes a column named `explanation`. The value is - - a struct that conforms to the [aiplatform.gapic.Explanation] object. - - - `jsonl`: The JSON objects on each line include an additional entry - - keyed `explanation`. The value of the entry is a JSON object that - - conforms to the [aiplatform.gapic.Explanation] object. - `csv`: - - Generating explanations for CSV format is not supported. If this - - field is set to true, either the Model.explanation_spec or - - explanation_metadata and explanation_parameters must be populated.' - isOptional: true - parameterType: BOOLEAN - included_fields: - defaultValue: [] - description: 'Fields that will be included in the prediction instance that - is - - sent to the Model. - - If `instance_type` is `array`, the order of field names in - - `included_fields` also determines the order of the values in the array. - - When `included_fields` is populated, `excluded_fields` must be empty. - - The input must be JSONL with objects at each line, CSV, BigQuery - - or TfRecord.' - isOptional: true - parameterType: LIST - instance_type: - defaultValue: '' - description: "The format of the instance that the Model\naccepts. Vertex\ - \ AI will convert compatible\n[InstancesFormat](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#InputConfig)\n\ - to the specified format. Supported values are:\n`object`: Each input is\ - \ converted to JSON object format.\n * For `bigquery`, each row is converted\ - \ to an object.\n * For `jsonl`, each line of the JSONL input must be\ - \ an object.\n * Does not apply to `csv`, `file-list`, `tf-record`, or\ - \ `tf-record-gzip`.\n`array`: Each input is converted to JSON array format.\n\ - \ * For `bigquery`, each row is converted to an array. The order\n \ - \ of columns is determined by the BigQuery column order, unless\n \ - \ [included_fields](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#InputConfig)\ - \ is populated.\n `included_fields` must be populated for specifying\ - \ field orders.\n * For `jsonl`, if each line of the JSONL input is an\ - \ object,\n `included_fields` must be populated for specifying field\ - \ orders.\n * Does not apply to `csv`, `file-list`, `tf-record`, or\n\ - \ `tf-record-gzip`.\nIf not specified, Vertex AI converts the batch\ - \ prediction input as\nfollows:\n * For `bigquery` and `csv`, the behavior\ - \ is the same as `array`. 
The\n order of columns is the same as defined\ - \ in the file or table, unless\n included_fields is populated.\n * For\ - \ `jsonl`, the prediction instance format is determined by\n each line\ - \ of the input.\n * For `tf-record`/`tf-record-gzip`, each record will\ - \ be converted to\n an object in the format of `{\"b64\": }`,\ - \ where `` is\n the Base64-encoded string of the content of the\ - \ record.\n * For `file-list`, each file in the list will be converted\ - \ to an\n object in the format of `{\"b64\": }`, where ``\ - \ is\n the Base64-encoded string of the content of the file." - isOptional: true - parameterType: STRING - instances_format: - defaultValue: jsonl - description: 'The format in which instances are - - given, must be one of the [Model](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.models)''s - supportedInputStorageFormats. - - For more details about this input config, see - - [InputConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#InputConfig.)' - isOptional: true - parameterType: STRING - job_display_name: - description: The user-defined name of this BatchPredictionJob. - parameterType: STRING - key_field: - defaultValue: '' - description: "The name of the field that is considered as a key.\nThe values\ - \ identified by the key field is not included in the\ntransformed instances\ - \ that is sent to the Model. This is similar to\nspecifying this name\ - \ of the field in [excluded_fields](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#InputConfig).\ - \ In addition,\nthe batch prediction output will not include the instances.\ - \ Instead the\noutput will only include the value of the key field, in\ - \ a field named\n`key` in the output:\n * For `jsonl` output format, the\ - \ output will have a `key` field\n instead of the `instance` field.\n\ - \ * For `csv`/`bigquery` output format, the output will have have a `key`\n\ - \ column instead of the instance feature columns.\nThe input must be\ - \ JSONL with objects at each line, CSV, BigQuery\nor TfRecord." - isOptional: true - parameterType: STRING - labels: - defaultValue: {} - description: 'The labels with user-defined metadata to - - organize your BatchPredictionJobs. Label keys and values can be no - - longer than 64 characters (Unicode codepoints), can only contain - - lowercase letters, numeric characters, underscores and dashes. - - International characters are allowed. See https://goo.gl/xmQnxf for - - more information and examples of labels.' - isOptional: true - parameterType: STRUCT - location: - defaultValue: us-central1 - description: Location for creating the BatchPredictionJob. - isOptional: true - parameterType: STRING - machine_type: - defaultValue: '' - description: 'The type of machine for running batch - - prediction on dedicated resources. If the Model supports - - DEDICATED_RESOURCES this config may be provided (and the job will use - - these resources). If the Model doesn''t support AUTOMATIC_RESOURCES, - - this config must be provided. For more details about the - - BatchDedicatedResources, see - - https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#BatchDedicatedResources. 
- - For more details about the machine spec, see - - https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec' - isOptional: true - parameterType: STRING - manual_batch_tuning_parameters_batch_size: - defaultValue: 0.0 - description: 'The number of - - the records (e.g. instances) of the operation given in each batch to a - - machine replica. Machine type, and size of a single record should be - - considered when setting this parameter, higher value speeds up the - - batch operation''s execution, but too high value will result in a whole - - batch not fitting in a machine''s memory, and the whole operation will - - fail.' - isOptional: true - parameterType: NUMBER_INTEGER - max_replica_count: - defaultValue: 0.0 - description: 'The maximum number of machine replicas the batch operation - may be scaled - - to. Only used if `machine_type` is set.' - isOptional: true - parameterType: NUMBER_INTEGER - model_parameters: - defaultValue: {} - description: The parameters that govern the predictions. The schema of the - parameters - isOptional: true - parameterType: STRUCT - predictions_format: - defaultValue: jsonl - description: 'The format in which Vertex AI gives the predictions. Must - be one of the - - Model''s supportedOutputStorageFormats. - - For more details about this output config, see [OutputConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#OutputConfig).' - isOptional: true - parameterType: STRING - project: - defaultValue: '{{$.pipeline_google_cloud_project_id}}' - description: Project to create the BatchPredictionJob. Defaults to the project - in which the PipelineJob is run. - isOptional: true - parameterType: STRING - starting_replica_count: - defaultValue: 0.0 - description: 'The number of machine replicas - - used at the start of the batch operation. If not set, Vertex AI - - decides starting number, not greater than `max_replica_count`. Only - - used if `machine_type` is set.' - isOptional: true - parameterType: NUMBER_INTEGER - outputDefinitions: - artifacts: - batchpredictionjob: - artifactType: - schemaTitle: google.VertexBatchPredictionJob - schemaVersion: 0.0.1 - description: '[**Deprecated. Use gcs_output_directory and bigquery_output_table - - instead.**] Artifact - - representation of the created batch prediction job.' - bigquery_output_table: - artifactType: - schemaTitle: google.BQTable - schemaVersion: 0.0.1 - description: 'Artifact tracking the batch prediction job output. This is - only - - available if - - bigquery_output_table is specified.' - gcs_output_directory: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: 'Artifact tracking the batch prediction job output. This is - only - - available if - - gcs_destination_output_uri_prefix is specified.' - parameters: - gcp_resources: - description: 'Serialized gcp_resources proto tracking the batch prediction - job. - - For more details, see - - https://github.com/kubeflow/pipelines/blob/master/components/google-cloud/google_cloud_pipeline_components/proto/README.md.' 
- parameterType: STRING - comp-model-evaluation: - executorLabel: exec-model-evaluation - inputDefinitions: - artifacts: - batch_prediction_job: - artifactType: - schemaTitle: google.VertexBatchPredictionJob - schemaVersion: 0.0.1 - parameters: - dataflow_disk_size: - defaultValue: 50.0 - isOptional: true - parameterType: NUMBER_INTEGER - dataflow_machine_type: - defaultValue: n1-standard-4 - isOptional: true - parameterType: STRING - dataflow_max_workers_num: - defaultValue: 100.0 - isOptional: true - parameterType: NUMBER_INTEGER - dataflow_service_account: - defaultValue: '' - isOptional: true - parameterType: STRING - dataflow_subnetwork: - defaultValue: '' - isOptional: true - parameterType: STRING - dataflow_use_public_ips: - defaultValue: true - isOptional: true - parameterType: BOOLEAN - dataflow_workers_num: - defaultValue: 10.0 - isOptional: true - parameterType: NUMBER_INTEGER - encryption_spec_key_name: - defaultValue: '' - isOptional: true - parameterType: STRING - example_weight_column: - defaultValue: '' - isOptional: true - parameterType: STRING - ground_truth_column: - parameterType: STRING - ground_truth_format: - defaultValue: jsonl - isOptional: true - parameterType: STRING - location: - defaultValue: us-central1 - isOptional: true - parameterType: STRING - prediction_id_column: - defaultValue: '' - isOptional: true - parameterType: STRING - prediction_label_column: - defaultValue: '' - isOptional: true - parameterType: STRING - prediction_score_column: - defaultValue: '' - isOptional: true - parameterType: STRING - predictions_format: - defaultValue: jsonl - isOptional: true - parameterType: STRING - problem_type: - parameterType: STRING - project: - parameterType: STRING - root_dir: - parameterType: STRING - outputDefinitions: - artifacts: - evaluation_metrics: - artifactType: - schemaTitle: system.Metrics - schemaVersion: 0.0.1 - parameters: - gcp_resources: - parameterType: STRING - comp-model-upload: - executorLabel: exec-model-upload - inputDefinitions: - artifacts: - parent_model: - artifactType: - schemaTitle: google.VertexModel - schemaVersion: 0.0.1 - description: An artifact of a model which to upload a new version to. Only - specify this field when uploading a new version. [More information.](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.models/upload#request-body) - isOptional: true - unmanaged_container_model: - artifactType: - schemaTitle: google.UnmanagedContainerModel - schemaVersion: 0.0.1 - description: "The unmanaged container model to be uploaded. The Model can\ - \ be passed from an upstream step or imported via a KFP `dsl.importer`.\n\ - :Examples:\n ::\n\n from kfp import dsl\n from google_cloud_pipeline_components.google_cloud_pipeline_components.types\ - \ import artifact_types\n\n importer_spec = dsl.importer(\n artifact_uri='gs://managed-pipeline-gcpc-e2e-test/automl-tabular/model',\n\ - \ artifact_class=artifact_types.UnmanagedContainerModel,\n metadata={\n\ - \ 'containerSpec': { 'imageUri':\n 'us-docker.pkg.dev/vertex-ai/automl-tabular/prediction-server:prod'\n\ - \ }\n })" - isOptional: true - parameters: - description: - defaultValue: '' - description: The description of the Model. [More information.](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.models#Model) - isOptional: true - parameterType: STRING - display_name: - description: 'The display name of the Model. The name - - can be up to 128 characters long and can be consist of any UTF-8 - - characters. 
[More information.](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.models#Model)' - parameterType: STRING - encryption_spec_key_name: - defaultValue: '' - description: 'Customer-managed encryption - - key spec for a Model. If set, this Model and all sub-resources of this - - Model will be secured by this key. Has the form: - - `projects/my-project/locations/my-location/keyRings/my-kr/cryptoKeys/my-key`. - - The key needs to be in the same region as where the compute resource - - is created.' - isOptional: true - parameterType: STRING - explanation_metadata: - defaultValue: {} - description: 'Metadata describing the Model''s - - input and output for explanation. Both `explanation_metadata` and `explanation_parameters` - must be passed together when used. [More information.](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/ExplanationSpec#explanationmetadata)' - isOptional: true - parameterType: STRUCT - explanation_parameters: - defaultValue: {} - description: 'Parameters to configure - - explaining for Model''s predictions. [More information.](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/ExplanationSpec#ExplanationParameters)' - isOptional: true - parameterType: STRUCT - labels: - defaultValue: {} - description: 'The labels with user-defined metadata to - - organize your model. Label keys and values can be no longer than 64 - - characters (Unicode codepoints), can only contain lowercase letters, - - numeric characters, underscores and dashes. International characters - - are allowed. See https://goo.gl/xmQnxf for more information and - - examples of labels.' - isOptional: true - parameterType: STRUCT - location: - defaultValue: us-central1 - description: 'Optional location to upload this Model to. If - - not set, defaults to `us-central1`.' - isOptional: true - parameterType: STRING - project: - defaultValue: '{{$.pipeline_google_cloud_project_id}}' - description: Project to upload this Model to. Defaults to the project in - which the PipelineJob is run. - isOptional: true - parameterType: STRING - outputDefinitions: - artifacts: - model: - artifactType: - schemaTitle: google.VertexModel - schemaVersion: 0.0.1 - description: Artifact tracking the created Model. - parameters: - gcp_resources: - description: Serialized JSON of `gcp_resources` [proto](https://github.com/kubeflow/pipelines/tree/master/components/google-cloud/google_cloud_pipeline_components/proto) - which tracks the upload Model's long-running operation. - parameterType: STRING - comp-parse-worker-pool-specs-override: - executorLabel: exec-parse-worker-pool-specs-override - inputDefinitions: - parameters: - worker_pool_specs_override: - description: 'The list of dictionaries for overriding training - - and evaluation worker pool specs.' - parameterType: LIST - outputDefinitions: - parameters: - eval_machine_spec: - description: The eval machine spec. - parameterType: STRUCT - eval_replica_count: - description: The replica count for eval. - parameterType: NUMBER_INTEGER - training_disk_spec: - description: The training disk spec. - parameterType: STRUCT - training_machine_spec: - description: The training machine spec. - parameterType: STRUCT - comp-set-optional-inputs: - executorLabel: exec-set-optional-inputs - inputDefinitions: - artifacts: - vertex_dataset: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The Vertex dataset when data source is Vertex dataset. 
- parameters: - data_source_bigquery_table_path: - description: The BigQuery table when data source is BQ. - parameterType: STRING - data_source_csv_filenames: - description: The CSV GCS path when data source is CSV. - parameterType: STRING - location: - description: The GCP region that runs the pipeline components. - parameterType: STRING - project: - description: The GCP project that runs the pipeline components. - parameterType: STRING - outputDefinitions: - parameters: - data_source_bigquery_table_path: - parameterType: STRING - data_source_csv_filenames: - parameterType: STRING - comp-split-materialized-data: - executorLabel: exec-split-materialized-data - inputDefinitions: - artifacts: - materialized_data: - artifactType: - schemaTitle: system.Dataset - schemaVersion: 0.0.1 - description: 'Materialized dataset output by the Feature - - Transform Engine.' - outputDefinitions: - artifacts: - materialized_eval_split: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path patern to materialized eval split. - materialized_test_split: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path patern to materialized test split. - materialized_train_split: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path patern to materialized train split. - comp-training-configurator-and-validator: - executorLabel: exec-training-configurator-and-validator - inputDefinitions: - artifacts: - dataset_stats: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Dataset stats generated by feature transform engine. - instance_schema: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Schema of input data to the tf_model at serving time. - training_schema: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - parameters: - available_at_forecast_columns: - defaultValue: [] - description: The names of the columns that are available at forecast time. - isOptional: true - parameterType: LIST - context_window: - defaultValue: -1.0 - description: The length of the context window. - isOptional: true - parameterType: NUMBER_INTEGER - enable_probabilistic_inference: - defaultValue: false - description: If probabilistic inference is enabled, the model will fit a - distribution that captures the uncertainty of a prediction. At inference - time, the predictive distribution is used to make a point prediction that - minimizes the optimization objective. For example, the mean of a predictive - distribution is the point prediction that minimizes RMSE loss. If quantiles - are specified, then the quantiles of the distribution are also returned. - isOptional: true - parameterType: BOOLEAN - forecast_horizon: - defaultValue: -1.0 - description: The length of the forecast horizon. - isOptional: true - parameterType: NUMBER_INTEGER - forecasting_model_type: - defaultValue: '' - description: The model types, e.g. l2l, seq2seq, tft. - isOptional: true - parameterType: STRING - forecasting_transformations: - defaultValue: {} - description: Dict mapping auto and/or type-resolutions to feature columns. - The supported types are auto, categorical, numeric, text, and timestamp. - isOptional: true - parameterType: STRUCT - group_columns: - description: A list of time series attribute column names that define the - time series hierarchy. 
- isOptional: true - parameterType: LIST - group_temporal_total_weight: - defaultValue: 0.0 - description: The weight of the loss for predictions aggregated over both - the horizon and time series in the same hierarchy group. - isOptional: true - parameterType: NUMBER_DOUBLE - group_total_weight: - defaultValue: 0.0 - description: The weight of the loss for predictions aggregated over time - series in the same group. - isOptional: true - parameterType: NUMBER_DOUBLE - optimization_objective: - defaultValue: '' - description: 'Objective function the model is optimizing towards. The training - process creates a model that maximizes/minimizes the value of the objective - function over the validation set. The supported optimization objectives - depend on the prediction type. If the field is not set, a default objective - function is used. classification: "maximize-au-roc" (default) - Maximize - the area under the receiver operating characteristic (ROC) curve. "minimize-log-loss" - - Minimize log loss. "maximize-au-prc" - Maximize the area under the precision-recall - curve. "maximize-precision-at-recall" - Maximize precision for a specified - recall value. "maximize-recall-at-precision" - Maximize recall for a specified - precision value. classification (multi-class): "minimize-log-loss" (default) - - Minimize log loss. regression: "minimize-rmse" (default) - Minimize - root-mean-squared error (RMSE). "minimize-mae" - Minimize mean-absolute - error (MAE). "minimize-rmsle" - Minimize root-mean-squared log error - (RMSLE).' - isOptional: true - parameterType: STRING - optimization_objective_precision_value: - defaultValue: -1.0 - description: Required when optimization_objective is "maximize-recall-at-precision". - Must be between 0 and 1, inclusive. - isOptional: true - parameterType: NUMBER_DOUBLE - optimization_objective_recall_value: - defaultValue: -1.0 - description: Required when optimization_objective is "maximize-precision-at-recall". - Must be between 0 and 1, inclusive. - isOptional: true - parameterType: NUMBER_DOUBLE - prediction_type: - defaultValue: '' - description: Model prediction type. One of "classification", "regression", - "time_series". - isOptional: true - parameterType: STRING - quantiles: - defaultValue: [] - description: All quantiles that the model need to predict. - isOptional: true - parameterType: LIST - run_distill: - defaultValue: false - description: Whether the distillation should be applied to the training. - isOptional: true - parameterType: BOOLEAN - run_evaluation: - defaultValue: false - description: Whether we are running evaluation in the training pipeline. - isOptional: true - parameterType: BOOLEAN - split_example_counts: - description: JSON string of data split example counts for train, validate, - and test splits. - parameterType: STRING - stage_1_deadline_hours: - description: Stage 1 training budget in hours. - isOptional: true - parameterType: NUMBER_DOUBLE - stage_2_deadline_hours: - description: Stage 2 training budget in hours. - isOptional: true - parameterType: NUMBER_DOUBLE - target_column: - defaultValue: '' - description: Target column of input data. - isOptional: true - parameterType: STRING - temporal_total_weight: - defaultValue: 0.0 - description: The weight of the loss for predictions aggregated over the - horizon for a single time series. - isOptional: true - parameterType: NUMBER_DOUBLE - time_column: - defaultValue: '' - description: The column that indicates the time. Used by forecasting only. 
- isOptional: true - parameterType: STRING - time_series_attribute_columns: - defaultValue: [] - description: The column names of the time series attributes. - isOptional: true - parameterType: LIST - time_series_identifier_column: - description: '[Deprecated] The time series identifier column. Used by forecasting - only. Raises exception if used - use the "time_series_identifier_column" - field instead.' - isOptional: true - parameterType: STRING - time_series_identifier_columns: - defaultValue: [] - description: The list of time series identifier columns. Used by forecasting - only. - isOptional: true - parameterType: LIST - unavailable_at_forecast_columns: - defaultValue: [] - description: The names of the columns that are not available at forecast - time. - isOptional: true - parameterType: LIST - weight_column: - defaultValue: '' - description: Weight column of input data. - isOptional: true - parameterType: STRING - outputDefinitions: - artifacts: - instance_baseline: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - metadata: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The tabular example gen metadata. - comp-wide-and-deep-trainer: - executorLabel: exec-wide-and-deep-trainer - inputDefinitions: - artifacts: - instance_baseline: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The path to a JSON file for baseline values. - materialized_eval_split: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The path to the materialized validation split. - materialized_train_split: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The path to the materialized train split. - metadata: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Amount of time in seconds to run the trainer for. - training_schema_uri: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The path to the training schema. - transform_output: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The path to transform output. - parameters: - batch_size: - defaultValue: 100.0 - description: Batch size for training. - isOptional: true - parameterType: NUMBER_INTEGER - beta_1: - defaultValue: 0.9 - description: Beta 1 value for optimizer_type="adam". - isOptional: true - parameterType: NUMBER_DOUBLE - beta_2: - defaultValue: 0.999 - description: Beta 2 value for optimizer_type="adam". - isOptional: true - parameterType: NUMBER_DOUBLE - cache_data: - defaultValue: auto - description: Whether to cache data or not. If set to 'auto', caching is - determined based on the dataset size. - isOptional: true - parameterType: STRING - dnn_beta_1: - defaultValue: 0.9 - description: Beta 1 value for dnn_optimizer_type="adam". - isOptional: true - parameterType: NUMBER_DOUBLE - dnn_beta_2: - defaultValue: 0.999 - description: Beta 2 value for dnn_optimizer_type="adam". - isOptional: true - parameterType: NUMBER_DOUBLE - dnn_dropout: - defaultValue: 0.0 - description: The probability we will drop out a given coordinate. - isOptional: true - parameterType: NUMBER_DOUBLE - dnn_l1_regularization_strength: - defaultValue: 0.0 - description: L1 regularization strength for dnn_optimizer_type="ftrl". - isOptional: true - parameterType: NUMBER_DOUBLE - dnn_l2_regularization_strength: - defaultValue: 0.0 - description: L2 regularization strength for dnn_optimizer_type="ftrl". 
- isOptional: true - parameterType: NUMBER_DOUBLE - dnn_l2_shrinkage_regularization_strength: - defaultValue: 0.0 - description: L2 shrinkage regularization strength for dnn_optimizer_type="ftrl". - isOptional: true - parameterType: NUMBER_DOUBLE - dnn_learning_rate: - description: The learning rate for training the deep part of the model. - parameterType: NUMBER_DOUBLE - dnn_optimizer_type: - defaultValue: ftrl - description: The type of optimizer to use for the deep part of the model. - Choices are "adam", "ftrl" and "sgd". for the Adam, FTRL, and Gradient - Descent Optimizers, respectively. - isOptional: true - parameterType: STRING - embed_categories: - defaultValue: true - description: If set to true, the categorical columns will be used embedded - and used in the deep part of the model. Embedding size is the square root - of the column cardinality. - isOptional: true - parameterType: BOOLEAN - enable_profiler: - defaultValue: false - description: Enables profiling and saves a trace during evaluation. - isOptional: true - parameterType: BOOLEAN - encryption_spec_key_name: - defaultValue: '' - description: The KMS key name. - isOptional: true - parameterType: STRING - eval_frequency_secs: - defaultValue: 600.0 - description: Frequency at which evaluation and checkpointing will take place. - isOptional: true - parameterType: NUMBER_INTEGER - eval_steps: - defaultValue: 0.0 - description: Number of steps to run evaluation for. If not specified or - negative, it means run evaluation on the whole validation dataset. If - set to 0, it means run evaluation for a fixed number of samples. - isOptional: true - parameterType: NUMBER_INTEGER - hidden_units: - defaultValue: 30,30,30 - description: Hidden layer sizes to use for DNN feature columns, provided - in comma-separated layers. - isOptional: true - parameterType: STRING - l1_regularization_strength: - defaultValue: 0.0 - description: L1 regularization strength for optimizer_type="ftrl". - isOptional: true - parameterType: NUMBER_DOUBLE - l2_regularization_strength: - defaultValue: 0.0 - description: L2 regularization strength for optimizer_type="ftrl" - isOptional: true - parameterType: NUMBER_DOUBLE - l2_shrinkage_regularization_strength: - defaultValue: 0.0 - description: L2 shrinkage regularization strength for optimizer_type="ftrl". - isOptional: true - parameterType: NUMBER_DOUBLE - learning_rate: - description: The learning rate used by the linear optimizer. - parameterType: NUMBER_DOUBLE - location: - description: The GCP region that runs the pipeline components. - parameterType: STRING - max_steps: - defaultValue: -1.0 - description: Number of steps to run the trainer for. - isOptional: true - parameterType: NUMBER_INTEGER - max_train_secs: - defaultValue: -1.0 - description: Amount of time in seconds to run the trainer for. - isOptional: true - parameterType: NUMBER_INTEGER - measurement_selection_type: - defaultValue: BEST_MEASUREMENT - description: Which measurement to use if/when the service automatically - selects the final measurement from previously reported intermediate measurements. - One of "BEST_MEASUREMENT" or "LAST_MEASUREMENT". - isOptional: true - parameterType: STRING - optimization_metric: - defaultValue: '' - description: Optimization metric used for `measurement_selection_type`. - Default is "rmse" for regression and "auc" for classification. - isOptional: true - parameterType: STRING - optimizer_type: - defaultValue: adam - description: The type of optimizer to use. 
Choices are "adam", "ftrl" and - "sgd" for the Adam, FTRL, and Gradient Descent Optimizers, respectively. - isOptional: true - parameterType: STRING - prediction_type: - description: The type of prediction the model is to produce. "classification" - or "regression". - parameterType: STRING - project: - description: The GCP project that runs the pipeline components. - parameterType: STRING - root_dir: - description: The root GCS directory for the pipeline components. - parameterType: STRING - seed: - defaultValue: 1.0 - description: Seed to be used for this run. - isOptional: true - parameterType: NUMBER_INTEGER - target_column: - description: The target column name. - parameterType: STRING - training_disk_spec: - defaultValue: - boot_disk_size_gb: 100.0 - boot_disk_type: pd-ssd - description: The training disk spec. - isOptional: true - parameterType: STRUCT - training_machine_spec: - defaultValue: - machine_type: c2-standard-16 - description: The training machine spec. See https://cloud.google.com/compute/docs/machine-types - for options. - isOptional: true - parameterType: STRUCT - use_wide: - defaultValue: true - description: If set to true, the categorical columns will be used in the - wide part of the DNN model. - isOptional: true - parameterType: BOOLEAN - weight_column: - defaultValue: '' - description: The weight column name. - isOptional: true - parameterType: STRING - outputDefinitions: - artifacts: - unmanaged_container_model: - artifactType: - schemaTitle: google.UnmanagedContainerModel - schemaVersion: 0.0.1 - description: The UnmanagedContainerModel artifact. - parameters: - gcp_resources: - description: Serialized gcp_resources proto tracking the custom training - job. - parameterType: STRING -deploymentSpec: - executors: - exec-automl-tabular-finalizer: - container: - args: - - --type - - CustomJob - - --project - - '{{$.inputs.parameters[''project'']}}' - - --location - - '{{$.inputs.parameters[''location'']}}' - - --gcp_resources - - '{{$.outputs.parameters[''gcp_resources''].output_file}}' - - --payload - - '{"Concat": ["{\"display_name\": \"automl-tabular-finalizer-{{$.pipeline_job_uuid}}-{{$.pipeline_task_uuid}}\", - \"encryption_spec\": {\"kms_key_name\":\"", "{{$.inputs.parameters[''encryption_spec_key_name'']}}", - "\"}, \"job_spec\": {\"worker_pool_specs\": [{\"replica_count\": 1, \"machine_spec\": - {\"machine_type\": \"n1-standard-8\"}, \"container_spec\": {\"image_uri\":\"", - "us-docker.pkg.dev/vertex-ai-restricted/automl-tabular/training:20251102_1045", "\", - \"args\": [\"cancel_l2l_tuner\", \"--error_file_path=", "{{$.inputs.parameters[''root_dir'']}}", - "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/error.pb\", \"--cleanup_lro_job_infos=", - "{{$.inputs.parameters[''root_dir'']}}", "/{{$.pipeline_job_uuid}}/lro\"]}}]}}"]}' - command: - - python3 - - -u - - -m - - google_cloud_pipeline_components.container.v1.custom_job.launcher - image: gcr.io/ml-pipeline/google-cloud-pipeline-components:1.0.44 - exec-automl-tabular-infra-validator: - container: - args: - - --executor_input - - '{{$}}' - image: us-docker.pkg.dev/vertex-ai/automl-tabular/prediction-server:20251102_1045 - resources: - cpuLimit: 8.0 - memoryLimit: 52.0 - exec-bool-identity: - container: - args: - - --executor_input - - '{{$}}' - - --function_to_execute - - _bool_identity - command: - - sh - - -ec - - 'program_path=$(mktemp -d) - - printf "%s" "$0" > "$program_path/ephemeral_component.py" - - python3 -m kfp.components.executor_main --component_module_path 
"$program_path/ephemeral_component.py" "$@" - - ' - - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef _bool_identity(value: bool) -> str:\n \"\"\"Returns boolean\ - \ value.\n\n Args:\n value: Boolean value to return\n\n Returns:\n\ - \ Boolean value.\n \"\"\"\n return 'true' if value else 'false'\n\n" - image: us-docker.pkg.dev/vertex-ai/automl-tabular/kfp-v2-base:20251102_1045 - exec-feature-transform-engine: - container: - args: - - feature_transform_engine - - '{"Concat": ["--project=", "{{$.inputs.parameters[''project'']}}"]}' - - '{"Concat": ["--location=", "{{$.inputs.parameters[''location'']}}"]}' - - '{"Concat": ["--dataset_level_custom_transformation_definitions=", "{{$.inputs.parameters[''dataset_level_custom_transformation_definitions'']}}"]}' - - '{"Concat": ["--dataset_level_transformations=", "{{$.inputs.parameters[''dataset_level_transformations'']}}"]}' - - '{"Concat": ["--forecasting_time_column=", "{{$.inputs.parameters[''forecasting_time_column'']}}"]}' - - '{"IfPresent": {"InputName": "forecasting_time_series_identifier_column", - "Then": {"Concat": ["--forecasting_time_series_identifier_column=", "{{$.inputs.parameters[''forecasting_time_series_identifier_column'']}}"]}}}' - - '{"Concat": ["--forecasting_time_series_identifier_columns=", "{{$.inputs.parameters[''forecasting_time_series_identifier_columns'']}}"]}' - - '{"Concat": ["--forecasting_time_series_attribute_columns=", "{{$.inputs.parameters[''forecasting_time_series_attribute_columns'']}}"]}' - - '{"Concat": ["--forecasting_unavailable_at_forecast_columns=", "{{$.inputs.parameters[''forecasting_unavailable_at_forecast_columns'']}}"]}' - - '{"Concat": ["--forecasting_available_at_forecast_columns=", "{{$.inputs.parameters[''forecasting_available_at_forecast_columns'']}}"]}' - - '{"Concat": ["--forecasting_forecast_horizon=", "{{$.inputs.parameters[''forecasting_forecast_horizon'']}}"]}' - - '{"Concat": ["--forecasting_context_window=", "{{$.inputs.parameters[''forecasting_context_window'']}}"]}' - - '{"Concat": ["--forecasting_predefined_window_column=", "{{$.inputs.parameters[''forecasting_predefined_window_column'']}}"]}' - - '{"Concat": ["--forecasting_window_stride_length=", "{{$.inputs.parameters[''forecasting_window_stride_length'']}}"]}' - - '{"Concat": ["--forecasting_window_max_count=", "{{$.inputs.parameters[''forecasting_window_max_count'']}}"]}' - - '{"Concat": ["--forecasting_holiday_regions=", "{{$.inputs.parameters[''forecasting_holiday_regions'']}}"]}' - - '{"Concat": ["--forecasting_apply_windowing=", "{{$.inputs.parameters[''forecasting_apply_windowing'']}}"]}' - - '{"Concat": ["--predefined_split_key=", "{{$.inputs.parameters[''predefined_split_key'']}}"]}' - - '{"Concat": ["--stratified_split_key=", "{{$.inputs.parameters[''stratified_split_key'']}}"]}' - - '{"Concat": ["--timestamp_split_key=", "{{$.inputs.parameters[''timestamp_split_key'']}}"]}' - - '{"Concat": ["--training_fraction=", "{{$.inputs.parameters[''training_fraction'']}}"]}' - - '{"Concat": ["--validation_fraction=", "{{$.inputs.parameters[''validation_fraction'']}}"]}' - - '{"Concat": ["--test_fraction=", "{{$.inputs.parameters[''test_fraction'']}}"]}' - - '{"Concat": ["--stats_gen_execution_engine=", "{{$.inputs.parameters[''stats_gen_execution_engine'']}}"]}' - - '{"Concat": ["--tf_transform_execution_engine=", "{{$.inputs.parameters[''tf_transform_execution_engine'']}}"]}' - - '{"IfPresent": {"InputName": "tf_auto_transform_features", "Then": {"Concat": - 
["--tf_auto_transform_features=", "{{$.inputs.parameters[''tf_auto_transform_features'']}}"]}}}' - - '{"Concat": ["--tf_custom_transformation_definitions=", "{{$.inputs.parameters[''tf_custom_transformation_definitions'']}}"]}' - - '{"Concat": ["--tf_transformations_path=", "{{$.inputs.parameters[''tf_transformations_path'']}}"]}' - - '{"Concat": ["--legacy_transformations_path=", "{{$.inputs.parameters[''legacy_transformations_path'']}}"]}' - - '{"Concat": ["--data_source_csv_filenames=", "{{$.inputs.parameters[''data_source_csv_filenames'']}}"]}' - - '{"Concat": ["--data_source_bigquery_table_path=", "{{$.inputs.parameters[''data_source_bigquery_table_path'']}}"]}' - - '{"Concat": ["--bigquery_staging_full_dataset_id=", "{{$.inputs.parameters[''bigquery_staging_full_dataset_id'']}}"]}' - - '{"Concat": ["--target_column=", "{{$.inputs.parameters[''target_column'']}}"]}' - - '{"Concat": ["--weight_column=", "{{$.inputs.parameters[''weight_column'']}}"]}' - - '{"Concat": ["--prediction_type=", "{{$.inputs.parameters[''prediction_type'']}}"]}' - - '{"IfPresent": {"InputName": "model_type", "Then": {"Concat": ["--model_type=", - "{{$.inputs.parameters[''model_type'']}}"]}}}' - - '{"Concat": ["--multimodal_tabular_columns=", "{{$.inputs.parameters[''multimodal_tabular_columns'']}}"]}' - - '{"Concat": ["--multimodal_timeseries_columns=", "{{$.inputs.parameters[''multimodal_timeseries_columns'']}}"]}' - - '{"Concat": ["--multimodal_text_columns=", "{{$.inputs.parameters[''multimodal_text_columns'']}}"]}' - - '{"Concat": ["--multimodal_image_columns=", "{{$.inputs.parameters[''multimodal_image_columns'']}}"]}' - - '{"Concat": ["--run_distill=", "{{$.inputs.parameters[''run_distill'']}}"]}' - - '{"Concat": ["--run_feature_selection=", "{{$.inputs.parameters[''run_feature_selection'']}}"]}' - - '{"Concat": ["--materialized_examples_format=", "{{$.inputs.parameters[''materialized_examples_format'']}}"]}' - - '{"Concat": ["--max_selected_features=", "{{$.inputs.parameters[''max_selected_features'']}}"]}' - - '{"Concat": ["--feature_selection_staging_dir=", "{{$.inputs.parameters[''root_dir'']}}", - "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/feature_selection_staging_dir"]}' - - '{"Concat": ["--feature_selection_algorithm=", "{{$.inputs.parameters[''feature_selection_algorithm'']}}"]}' - - '{"Concat": ["--feature_selection_execution_engine=", "{{$.inputs.parameters[''feature_selection_execution_engine'']}}"]}' - - '{"Concat": ["--feature_ranking_path=", "{{$.outputs.artifacts[''feature_ranking''].uri}}"]}' - - '{"Concat": ["--error_file_path=", "{{$.inputs.parameters[''root_dir'']}}", - "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/error.txt"]}' - - '{"Concat": ["--stats_result_path=", "{{$.outputs.artifacts[''dataset_stats''].uri}}"]}' - - '{"Concat": ["--transform_output_artifact_path=", "{{$.outputs.artifacts[''transform_output''].uri}}"]}' - - '{"Concat": ["--transform_output_path=", "{{$.inputs.parameters[''root_dir'']}}", - "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/transform"]}' - - '{"Concat": ["--materialized_examples_path=", "{{$.inputs.parameters[''root_dir'']}}", - "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/materialized"]}' - - '{"Concat": ["--export_data_path=", "{{$.inputs.parameters[''root_dir'']}}", - "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/export"]}' - - '{"Concat": ["--materialized_data_path=", "{{$.inputs.parameters[''root_dir'']}}", - "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/materialized_data"]}' - - '{"Concat": 
["--materialized_data_artifact_path=", "{{$.outputs.artifacts[''materialized_data''].uri}}"]}' - - '{"Concat": ["--bigquery_train_split_uri_path=", "{{$.outputs.parameters[''bigquery_train_split_uri''].output_file}}"]}' - - '{"Concat": ["--bigquery_validation_split_uri_path=", "{{$.outputs.parameters[''bigquery_validation_split_uri''].output_file}}"]}' - - '{"Concat": ["--bigquery_test_split_uri_path=", "{{$.outputs.parameters[''bigquery_test_split_uri''].output_file}}"]}' - - '{"Concat": ["--bigquery_downsampled_test_split_uri_path=", "{{$.outputs.parameters[''bigquery_downsampled_test_split_uri''].output_file}}"]}' - - '{"Concat": ["--split_example_counts_path=", "{{$.outputs.parameters[''split_example_counts''].output_file}}"]}' - - '{"Concat": ["--instance_schema_path=", "{{$.outputs.artifacts[''instance_schema''].path}}"]}' - - '{"Concat": ["--training_schema_path=", "{{$.outputs.artifacts[''training_schema''].path}}"]}' - - --job_name=feature-transform-engine-{{$.pipeline_job_uuid}}-{{$.pipeline_task_uuid}} - - '{"Concat": ["--dataflow_project=", "{{$.inputs.parameters[''project'']}}"]}' - - '{"Concat": ["--dataflow_staging_dir=", "{{$.inputs.parameters[''root_dir'']}}", - "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/dataflow_staging"]}' - - '{"Concat": ["--dataflow_tmp_dir=", "{{$.inputs.parameters[''root_dir'']}}", - "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/dataflow_tmp"]}' - - '{"Concat": ["--dataflow_max_num_workers=", "{{$.inputs.parameters[''dataflow_max_num_workers'']}}"]}' - - '{"Concat": ["--dataflow_machine_type=", "{{$.inputs.parameters[''dataflow_machine_type'']}}"]}' - - --dataflow_worker_container_image=us-docker.pkg.dev/vertex-ai/automl-tabular/dataflow-worker:20251102_1045 - - --feature_transform_engine_docker_uri=us-docker.pkg.dev/vertex-ai/automl-tabular/feature-transform-engine:20251102_1045 - - '{"Concat": ["--dataflow_disk_size_gb=", "{{$.inputs.parameters[''dataflow_disk_size_gb'']}}"]}' - - '{"Concat": ["--dataflow_subnetwork_fully_qualified=", "{{$.inputs.parameters[''dataflow_subnetwork'']}}"]}' - - '{"Concat": ["--dataflow_use_public_ips=", "{{$.inputs.parameters[''dataflow_use_public_ips'']}}"]}' - - '{"Concat": ["--dataflow_service_account=", "{{$.inputs.parameters[''dataflow_service_account'']}}"]}' - - '{"Concat": ["--dataflow_kms_key=", "{{$.inputs.parameters[''encryption_spec_key_name'']}}"]}' - - '{"Concat": ["--autodetect_csv_schema=", "{{$.inputs.parameters[''autodetect_csv_schema'']}}"]}' - - '{"Concat": ["--gcp_resources_path=", "{{$.outputs.parameters[''gcp_resources''].output_file}}"]}' - - '{"IfPresent": {"InputName": "group_columns", "Then": {"Concat": ["--group_columns=", - "{{$.inputs.parameters[''group_columns'']}}"]}}}' - - '{"IfPresent": {"InputName": "group_total_weight", "Then": {"Concat": ["--group_total_weight=", - "{{$.inputs.parameters[''group_total_weight'']}}"]}}}' - - '{"IfPresent": {"InputName": "temporal_total_weight", "Then": {"Concat": - ["--temporal_total_weight=", "{{$.inputs.parameters[''temporal_total_weight'']}}"]}}}' - - '{"IfPresent": {"InputName": "group_temporal_total_weight", "Then": {"Concat": - ["--group_temporal_total_weight=", "{{$.inputs.parameters[''group_temporal_total_weight'']}}"]}}}' - - '{"Concat": ["--encryption_spec_key_name=", "{{$.inputs.parameters[''encryption_spec_key_name'']}}"]}' - image: us-docker.pkg.dev/vertex-ai/automl-tabular/feature-transform-engine:20251102_1045 - resources: - cpuLimit: 8.0 - memoryLimit: 30.0 - exec-get-model-display-name: - container: - args: - - 
--executor_input - - '{{$}}' - - --function_to_execute - - _get_model_display_name - command: - - sh - - -ec - - 'program_path=$(mktemp -d) - - printf "%s" "$0" > "$program_path/ephemeral_component.py" - - python3 -m kfp.components.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" - - ' - - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef _get_model_display_name(\n model_display_name: str,\n) ->\ - \ NamedTuple('Outputs', [('model_display_name', str),]):\n \"\"\"Returns\ - \ the model display name.\"\"\"\n # pylint: disable=g-import-not-at-top,import-outside-toplevel,redefined-outer-name\n\ - \ import collections\n import uuid\n # pylint: enable=g-import-not-at-top,import-outside-toplevel,redefined-outer-name\n\ - \n if not model_display_name:\n model_display_name = f'tabular-workflow-model-{uuid.uuid4()}'\n\ - \n return collections.namedtuple(\n 'Outputs',\n [\n \ - \ 'model_display_name',\n ],\n )(\n model_display_name,\n )\n\ - \n" - image: us-docker.pkg.dev/vertex-ai/automl-tabular/kfp-v2-base:20251102_1045 - exec-model-batch-predict: - container: - args: - - --type - - BatchPredictionJob - - --payload - - '{"Concat": ["{", "\"display_name\": \"", "{{$.inputs.parameters[''job_display_name'']}}", - "\", ", {"IfPresent": {"InputName": "model", "Then": {"Concat": ["\"model\": - \"", "{{$.inputs.artifacts[''model''].metadata[''resourceName'']}}", "\","]}}}, - " \"input_config\": {", "\"instances_format\": \"", "{{$.inputs.parameters[''instances_format'']}}", - "\"", ", \"gcs_source\": {", "\"uris\":", "{{$.inputs.parameters[''gcs_source_uris'']}}", - "}", ", \"bigquery_source\": {", "\"input_uri\": \"", "{{$.inputs.parameters[''bigquery_source_input_uri'']}}", - "\"", "}", "}", ", \"instance_config\": {", "\"instance_type\": \"", "{{$.inputs.parameters[''instance_type'']}}", - "\"", ", \"key_field\": \"", "{{$.inputs.parameters[''key_field'']}}", "\" - ", {"IfPresent": {"InputName": "included_fields", "Then": {"Concat": [", - \"included_fields\": ", "{{$.inputs.parameters[''included_fields'']}}"]}}}, - {"IfPresent": {"InputName": "excluded_fields", "Then": {"Concat": [", \"excluded_fields\": - ", "{{$.inputs.parameters[''excluded_fields'']}}"]}}}, "}", ", \"model_parameters\": - ", "{{$.inputs.parameters[''model_parameters'']}}", ", \"output_config\": - {", "\"predictions_format\": \"", "{{$.inputs.parameters[''predictions_format'']}}", - "\"", ", \"gcs_destination\": {", "\"output_uri_prefix\": \"", "{{$.inputs.parameters[''gcs_destination_output_uri_prefix'']}}", - "\"", "}", ", \"bigquery_destination\": {", "\"output_uri\": \"", "{{$.inputs.parameters[''bigquery_destination_output_uri'']}}", - "\"", "}", "}", ", \"dedicated_resources\": {", "\"machine_spec\": {", "\"machine_type\": - \"", "{{$.inputs.parameters[''machine_type'']}}", "\"", ", \"accelerator_type\": - \"", "{{$.inputs.parameters[''accelerator_type'']}}", "\"", ", \"accelerator_count\": - ", "{{$.inputs.parameters[''accelerator_count'']}}", "}", ", \"starting_replica_count\": - ", "{{$.inputs.parameters[''starting_replica_count'']}}", ", \"max_replica_count\": - ", "{{$.inputs.parameters[''max_replica_count'']}}", "}", ", \"manual_batch_tuning_parameters\": - {", "\"batch_size\": ", "{{$.inputs.parameters[''manual_batch_tuning_parameters_batch_size'']}}", - "}", ", \"generate_explanation\": ", "{{$.inputs.parameters[''generate_explanation'']}}", - ", \"explanation_spec\": {", "\"parameters\": ", "{{$.inputs.parameters[''explanation_parameters'']}}", 
- ", \"metadata\": ", "{{$.inputs.parameters[''explanation_metadata'']}}", - "}", ", \"labels\": ", "{{$.inputs.parameters[''labels'']}}", ", \"encryption_spec\": - {\"kms_key_name\":\"", "{{$.inputs.parameters[''encryption_spec_key_name'']}}", - "\"}", "}"]}' - - --project - - '{{$.inputs.parameters[''project'']}}' - - --location - - '{{$.inputs.parameters[''location'']}}' - - --gcp_resources - - '{{$.outputs.parameters[''gcp_resources''].output_file}}' - - --executor_input - - '{{$}}' - command: - - python3 - - -u - - -m - - google_cloud_pipeline_components.container.v1.batch_prediction_job.launcher - image: gcr.io/ml-pipeline/google-cloud-pipeline-components:2.3.1 - exec-model-evaluation: - container: - args: - - --setup_file - - /setup.py - - --json_mode - - 'true' - - --project_id - - '{{$.inputs.parameters[''project'']}}' - - --location - - '{{$.inputs.parameters[''location'']}}' - - --problem_type - - '{{$.inputs.parameters[''problem_type'']}}' - - --batch_prediction_format - - '{{$.inputs.parameters[''predictions_format'']}}' - - --batch_prediction_gcs_source - - '{{$.inputs.artifacts[''batch_prediction_job''].metadata[''gcsOutputDirectory'']}}' - - --ground_truth_format - - '{{$.inputs.parameters[''ground_truth_format'']}}' - - --key_prefix_in_prediction_dataset - - instance - - --root_dir - - '{{$.inputs.parameters[''root_dir'']}}/{{$.pipeline_job_uuid}}-{{$.pipeline_task_uuid}}' - - --classification_type - - multiclass - - --ground_truth_column - - instance.{{$.inputs.parameters['ground_truth_column']}} - - --prediction_score_column - - '{{$.inputs.parameters[''prediction_score_column'']}}' - - --prediction_label_column - - '{{$.inputs.parameters[''prediction_label_column'']}}' - - --prediction_id_column - - '' - - --example_weight_column - - '' - - --generate_feature_attribution - - 'false' - - --dataflow_job_prefix - - evaluation-{{$.pipeline_job_uuid}}-{{$.pipeline_task_uuid}} - - --dataflow_service_account - - '{{$.inputs.parameters[''dataflow_service_account'']}}' - - --dataflow_disk_size - - '{{$.inputs.parameters[''dataflow_disk_size'']}}' - - --dataflow_machine_type - - '{{$.inputs.parameters[''dataflow_machine_type'']}}' - - --dataflow_workers_num - - '{{$.inputs.parameters[''dataflow_workers_num'']}}' - - --dataflow_max_workers_num - - '{{$.inputs.parameters[''dataflow_max_workers_num'']}}' - - --dataflow_subnetwork - - '{{$.inputs.parameters[''dataflow_subnetwork'']}}' - - --dataflow_use_public_ips - - '{{$.inputs.parameters[''dataflow_use_public_ips'']}}' - - --kms_key_name - - '{{$.inputs.parameters[''encryption_spec_key_name'']}}' - - --output_metrics_gcs_path - - '{{$.outputs.artifacts[''evaluation_metrics''].uri}}' - - --gcp_resources - - '{{$.outputs.parameters[''gcp_resources''].output_file}}' - - --executor_input - - '{{$}}' - command: - - python - - /main.py - image: gcr.io/ml-pipeline/model-evaluation:v0.4 - exec-model-upload: - container: - args: - - --type - - UploadModel - - --payload - - '{"Concat": ["{", "\"display_name\": \"", "{{$.inputs.parameters[''display_name'']}}", - "\"", ", \"description\": \"", "{{$.inputs.parameters[''description'']}}", - "\"", ", \"explanation_spec\": {", "\"parameters\": ", "{{$.inputs.parameters[''explanation_parameters'']}}", - ", \"metadata\": ", "{{$.inputs.parameters[''explanation_metadata'']}}", - "}", ", \"encryption_spec\": {\"kms_key_name\":\"", "{{$.inputs.parameters[''encryption_spec_key_name'']}}", - "\"}", ", \"labels\": ", "{{$.inputs.parameters[''labels'']}}", ", \"pipeline_job\": - \"", 
"projects/{{$.inputs.parameters[''project'']}}/locations/{{$.inputs.parameters[''location'']}}/pipelineJobs/{{$.pipeline_job_uuid}}", - "\"", "}"]}' - - --project - - '{{$.inputs.parameters[''project'']}}' - - --location - - '{{$.inputs.parameters[''location'']}}' - - --gcp_resources - - '{{$.outputs.parameters[''gcp_resources''].output_file}}' - - --executor_input - - '{{$}}' - - '{"IfPresent": {"InputName": "parent_model", "Then": ["--parent_model_name", - "{{$.inputs.artifacts[''parent_model''].metadata[''resourceName'']}}"]}}' - command: - - python3 - - -u - - -m - - google_cloud_pipeline_components.container.v1.model.upload_model.launcher - image: gcr.io/ml-pipeline/google-cloud-pipeline-components:2.3.1 - exec-parse-worker-pool-specs-override: - container: - args: - - --executor_input - - '{{$}}' - - --function_to_execute - - _parse_worker_pool_specs_override - command: - - sh - - -ec - - 'program_path=$(mktemp -d) - - printf "%s" "$0" > "$program_path/ephemeral_component.py" - - python3 -m kfp.components.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" - - ' - - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef _parse_worker_pool_specs_override(\n worker_pool_specs_override:\ - \ list, # pylint:disable=g-bare-generic\n) -> NamedTuple(\n 'Outputs',\n\ - \ [\n ('training_machine_spec', dict), # pylint:disable=g-bare-generic\n\ - \ ('training_disk_spec', dict),\n ('eval_machine_spec', dict),\ - \ # pylint:disable=g-bare-generic\n ('eval_replica_count', int),\n\ - \ ],\n):\n \"\"\"Parses worker_pool_specs_override and returns training\ - \ and evaluation machine specifications.\n\n Args:\n worker_pool_specs_override:\ - \ The list of dictionaries for overriding training\n and evaluation\ - \ worker pool specs.\n\n Returns:\n training_machine_spec: The training\ - \ machine spec.\n training_disk_spec: The training disk spec.\n \ - \ eval_machine_spec: The eval machine spec.\n eval_replica_count:\ - \ The replica count for eval.\n \"\"\"\n # pylint: disable=g-import-not-at-top,import-outside-toplevel,redefined-outer-name\n\ - \ import collections\n # pylint: enable=g-import-not-at-top,import-outside-toplevel,redefined-outer-name\n\ - \n training_machine_spec = {'machine_type': 'c2-standard-16'}\n training_disk_spec\ - \ = {'boot_disk_type': 'pd-ssd', 'boot_disk_size_gb': 100}\n eval_machine_spec\ - \ = {'machine_type': 'c2-standard-8'}\n eval_replica_count = 1\n\n if\ - \ worker_pool_specs_override:\n if len(worker_pool_specs_override) >=\ - \ 1 and isinstance(\n worker_pool_specs_override[0], dict\n ):\n\ - \ training_machine_spec = worker_pool_specs_override[0].get(\n \ - \ 'machine_spec', training_machine_spec\n )\n training_disk_spec\ - \ = worker_pool_specs_override[0].get(\n 'disk_spec', training_disk_spec\n\ - \ )\n if len(worker_pool_specs_override) == 4 and isinstance(\n\ - \ worker_pool_specs_override[3], dict\n ):\n eval_machine_spec\ - \ = worker_pool_specs_override[3].get(\n 'machine_spec', eval_machine_spec\n\ - \ )\n eval_replica_count = worker_pool_specs_override[3].get(\n\ - \ 'replica_count', eval_replica_count\n )\n\n return collections.namedtuple(\n\ - \ 'Outputs',\n [\n 'training_machine_spec',\n \ - \ 'training_disk_spec',\n 'eval_machine_spec',\n 'eval_replica_count',\n\ - \ ],\n )(\n training_machine_spec,\n training_disk_spec,\n\ - \ eval_machine_spec,\n eval_replica_count,\n )\n\n" - image: us-docker.pkg.dev/vertex-ai/automl-tabular/kfp-v2-base:20251102_1045 - exec-set-optional-inputs: - 
container: - args: - - --executor_input - - '{{$}}' - - --function_to_execute - - _set_optional_inputs - command: - - sh - - -ec - - 'program_path=$(mktemp -d) - - printf "%s" "$0" > "$program_path/ephemeral_component.py" - - python3 -m kfp.components.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" - - ' - - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef _set_optional_inputs(\n project: str,\n location: str,\n\ - \ data_source_csv_filenames: str,\n data_source_bigquery_table_path:\ - \ str,\n vertex_dataset: dsl.Input[dsl.Artifact],\n) -> NamedTuple(\n\ - \ 'Outputs',\n [\n ('data_source_csv_filenames', str),\n \ - \ ('data_source_bigquery_table_path', str),\n ],\n):\n \"\"\"Get\ - \ the data source URI.\n\n Args:\n project: The GCP project that runs\ - \ the pipeline components.\n location: The GCP region that runs the pipeline\ - \ components.\n data_source_csv_filenames: The CSV GCS path when data\ - \ source is CSV.\n data_source_bigquery_table_path: The BigQuery table\ - \ when data source is BQ.\n vertex_dataset: The Vertex dataset when data\ - \ source is Vertex dataset.\n\n Returns:\n A named tuple of CSV or BQ\ - \ URI.\n \"\"\"\n # pylint: disable=g-import-not-at-top,import-outside-toplevel,redefined-outer-name\n\ - \ import collections\n from google.cloud import aiplatform\n from google.cloud\ - \ import aiplatform_v1beta1 as aip\n # pylint: enable=g-import-not-at-top,import-outside-toplevel,redefined-outer-name\n\ - \n if vertex_dataset is not None:\n # of format\n # projects/294348452381/locations/us-central1/datasets/7104764862735056896\n\ - \ dataset_name = vertex_dataset.metadata['resourceName']\n\n aiplatform.init(project=project,\ - \ location=location)\n client = aip.DatasetServiceClient(\n client_options={'api_endpoint':\ - \ f'{location}-aiplatform.googleapis.com'}\n )\n dataset = client.get_dataset(name=dataset_name)\n\ - \ input_config = dataset.metadata['inputConfig']\n if 'gcsSource'\ - \ in input_config:\n data_source_csv_filenames = ','.join(input_config['gcsSource']['uri'])\n\ - \ elif 'bigquerySource' in input_config:\n data_source_bigquery_table_path\ - \ = input_config['bigquerySource']['uri']\n elif data_source_csv_filenames:\n\ - \ pass\n elif data_source_bigquery_table_path:\n pass\n else:\n\ - \ raise ValueError(\n 'One of vertex_dataset, data_source_csv_filenames,'\n\ - \ ' data_source_bigquery_table_path must be specified'\n )\n\n\ - \ return collections.namedtuple(\n 'Outputs',\n [\n \ - \ 'data_source_csv_filenames',\n 'data_source_bigquery_table_path',\n\ - \ ],\n )(\n data_source_csv_filenames,\n data_source_bigquery_table_path,\n\ - \ )\n\n" - image: us-docker.pkg.dev/vertex-ai/automl-tabular/kfp-v2-base:20251102_1045 - exec-split-materialized-data: - container: - args: - - --executor_input - - '{{$}}' - - --function_to_execute - - _split_materialized_data - command: - - sh - - -ec - - 'program_path=$(mktemp -d) - - printf "%s" "$0" > "$program_path/ephemeral_component.py" - - python3 -m kfp.components.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" - - ' - - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef _split_materialized_data(\n materialized_data: Input[Dataset],\n\ - \ materialized_train_split: OutputPath('MaterializedSplit'),\n materialized_eval_split:\ - \ OutputPath('MaterializedSplit'),\n materialized_test_split: OutputPath('MaterializedSplit')):\n\ - \ \"\"\"Splits materialized_data 
into materialized_data test, train, and\ - \ eval splits.\n\n Necessary adapter between FTE pipeline and trainer.\n\ - \n Args:\n materialized_data: materialized_data dataset output by FTE.\n\ - \ materialized_train_split: Path patern to materialized_train_split.\n\ - \ materialized_eval_split: Path patern to materialized_eval_split.\n\ - \ materialized_test_split: Path patern to materialized_test_split.\n\ - \ \"\"\"\n # pylint: disable=g-import-not-at-top,import-outside-toplevel,redefined-outer-name,reimported\n\ - \ import json\n import tensorflow as tf\n # pylint: enable=g-import-not-at-top,import-outside-toplevel,redefined-outer-name,reimported\n\ - \n with tf.io.gfile.GFile(materialized_data.path, 'r') as f:\n artifact_path\ - \ = f.read()\n\n # needed to import tf because this is a path in gs://\n\ - \ with tf.io.gfile.GFile(artifact_path, 'r') as f:\n materialized_data_json\ - \ = json.load(f)\n\n if 'tf_record_data_source' in materialized_data_json:\n\ - \ file_patterns = materialized_data_json['tf_record_data_source'][\n\ - \ 'file_patterns']\n elif 'avro_data_source' in materialized_data_json:\n\ - \ file_patterns = materialized_data_json['avro_data_source'][\n \ - \ 'file_patterns']\n elif 'parquet_data_source' in materialized_data_json:\n\ - \ file_patterns = materialized_data_json['parquet_data_source'][\n \ - \ 'file_patterns']\n else:\n raise ValueError(f'Unsupported training\ - \ data source: {materialized_data_json}')\n\n # we map indices to file\ - \ patterns based on the ordering of insertion order\n # in our transform_data\ - \ (see above in _generate_analyze_and_transform_data)\n with tf.io.gfile.GFile(materialized_train_split,\ - \ 'w') as f:\n f.write(file_patterns[0])\n\n with tf.io.gfile.GFile(materialized_eval_split,\ - \ 'w') as f:\n f.write(file_patterns[1])\n\n with tf.io.gfile.GFile(materialized_test_split,\ - \ 'w') as f:\n f.write(file_patterns[2])\n\n" - image: us-docker.pkg.dev/vertex-ai/automl-tabular/dataflow-worker:20251102_1045 - exec-training-configurator-and-validator: - container: - args: - - training_configurator_and_validator - - '{"Concat": ["--instance_schema_path=", "{{$.inputs.artifacts[''instance_schema''].uri}}"]}' - - '{"Concat": ["--training_schema_path=", "{{$.inputs.artifacts[''training_schema''].uri}}"]}' - - '{"Concat": ["--dataset_stats_path=", "{{$.inputs.artifacts[''dataset_stats''].uri}}"]}' - - '{"Concat": ["--split_example_counts=", "{{$.inputs.parameters[''split_example_counts'']}}"]}' - - '{"Concat": ["--target_column=", "{{$.inputs.parameters[''target_column'']}}"]}' - - '{"Concat": ["--weight_column=", "{{$.inputs.parameters[''weight_column'']}}"]}' - - '{"Concat": ["--prediction_type=", "{{$.inputs.parameters[''prediction_type'']}}"]}' - - '{"Concat": ["--optimization_objective=", "{{$.inputs.parameters[''optimization_objective'']}}"]}' - - '{"Concat": ["--optimization_objective_recall_value=", "{{$.inputs.parameters[''optimization_objective_recall_value'']}}"]}' - - '{"Concat": ["--optimization_objective_precision_value=", "{{$.inputs.parameters[''optimization_objective_precision_value'']}}"]}' - - '{"Concat": ["--metadata_path=", "{{$.outputs.artifacts[''metadata''].uri}}"]}' - - '{"Concat": ["--instance_baseline_path=", "{{$.outputs.artifacts[''instance_baseline''].uri}}"]}' - - '{"Concat": ["--run_evaluation=", "{{$.inputs.parameters[''run_evaluation'']}}"]}' - - '{"Concat": ["--run_distill=", "{{$.inputs.parameters[''run_distill'']}}"]}' - - '{"Concat": ["--enable_probabilistic_inference=", 
"{{$.inputs.parameters[''enable_probabilistic_inference'']}}"]}' - - '{"IfPresent": {"InputName": "time_series_identifier_column", "Then": {"Concat": - ["--time_series_identifier_column=", "{{$.inputs.parameters[''time_series_identifier_column'']}}"]}}}' - - '{"Concat": ["--time_series_identifier_columns=", "{{$.inputs.parameters[''time_series_identifier_columns'']}}"]}' - - '{"Concat": ["--time_column=", "{{$.inputs.parameters[''time_column'']}}"]}' - - '{"Concat": ["--time_series_attribute_columns=", "{{$.inputs.parameters[''time_series_attribute_columns'']}}"]}' - - '{"Concat": ["--available_at_forecast_columns=", "{{$.inputs.parameters[''available_at_forecast_columns'']}}"]}' - - '{"Concat": ["--unavailable_at_forecast_columns=", "{{$.inputs.parameters[''unavailable_at_forecast_columns'']}}"]}' - - '{"IfPresent": {"InputName": "quantiles", "Then": {"Concat": ["--quantiles=", - "{{$.inputs.parameters[''quantiles'']}}"]}}}' - - '{"Concat": ["--context_window=", "{{$.inputs.parameters[''context_window'']}}"]}' - - '{"Concat": ["--forecast_horizon=", "{{$.inputs.parameters[''forecast_horizon'']}}"]}' - - '{"Concat": ["--forecasting_model_type=", "{{$.inputs.parameters[''forecasting_model_type'']}}"]}' - - '{"Concat": ["--forecasting_transformations=", "{{$.inputs.parameters[''forecasting_transformations'']}}"]}' - - '{"IfPresent": {"InputName": "stage_1_deadline_hours", "Then": {"Concat": - ["--stage_1_deadline_hours=", "{{$.inputs.parameters[''stage_1_deadline_hours'']}}"]}}}' - - '{"IfPresent": {"InputName": "stage_2_deadline_hours", "Then": {"Concat": - ["--stage_2_deadline_hours=", "{{$.inputs.parameters[''stage_2_deadline_hours'']}}"]}}}' - - '{"IfPresent": {"InputName": "group_columns", "Then": {"Concat": ["--group_columns=", - "{{$.inputs.parameters[''group_columns'']}}"]}}}' - - '{"IfPresent": {"InputName": "group_total_weight", "Then": {"Concat": ["--group_total_weight=", - "{{$.inputs.parameters[''group_total_weight'']}}"]}}}' - - '{"IfPresent": {"InputName": "temporal_total_weight", "Then": {"Concat": - ["--temporal_total_weight=", "{{$.inputs.parameters[''temporal_total_weight'']}}"]}}}' - - '{"IfPresent": {"InputName": "group_temporal_total_weight", "Then": {"Concat": - ["--group_temporal_total_weight=", "{{$.inputs.parameters[''group_temporal_total_weight'']}}"]}}}' - image: us-docker.pkg.dev/vertex-ai/automl-tabular/feature-transform-engine:20251102_1045 - exec-wide-and-deep-trainer: - container: - args: - - --type - - CustomJob - - --project - - '{{$.inputs.parameters[''project'']}}' - - --location - - '{{$.inputs.parameters[''location'']}}' - - --gcp_resources - - '{{$.outputs.parameters[''gcp_resources''].output_file}}' - - --payload - - '{"Concat": ["{\"display_name\": \"wide-and-deep-trainer-{{$.pipeline_job_uuid}}-{{$.pipeline_task_uuid}}\", - \"encryption_spec\": {\"kms_key_name\":\"", "{{$.inputs.parameters[''encryption_spec_key_name'']}}", - "\"}, \"job_spec\": {\"worker_pool_specs\": [{\"replica_count\":\"", "1", - "\", \"machine_spec\": ", "{{$.inputs.parameters[''training_machine_spec'']}}", - ", \"disk_spec\": ", "{{$.inputs.parameters[''training_disk_spec'']}}", - ", \"container_spec\": {\"image_uri\":\"", "us-docker.pkg.dev/vertex-ai-restricted/automl-tabular/wide-and-deep-training:20251102_1045", - "\", \"args\": [\"--target_column=", "{{$.inputs.parameters[''target_column'']}}", - "\", \"--weight_column=", "{{$.inputs.parameters[''weight_column'']}}", - "\", \"--model_type=", "{{$.inputs.parameters[''prediction_type'']}}", "\", - \"--prediction_docker_uri=", 
"us-docker.pkg.dev/vertex-ai/automl-tabular/prediction-server:20251102_1045", - "\", \"--baseline_path=", "{{$.inputs.artifacts[''instance_baseline''].uri}}", - "\", \"--metadata_path=", "{{$.inputs.artifacts[''metadata''].uri}}", "\", - \"--transform_output_path=", "{{$.inputs.artifacts[''transform_output''].uri}}", - "\", \"--training_schema_path=", "{{$.inputs.artifacts[''training_schema_uri''].uri}}", - "\", \"--job_dir=", "{{$.inputs.parameters[''root_dir'']}}", "/{{$.pipeline_job_uuid}}/{{$.pipeline_task_uuid}}/train\", - \"--training_data_path=", "{{$.inputs.artifacts[''materialized_train_split''].uri}}", - "\", \"--validation_data_path=", "{{$.inputs.artifacts[''materialized_eval_split''].uri}}", - "\", \"--max_steps=", "{{$.inputs.parameters[''max_steps'']}}", "\", \"--max_train_secs=", - "{{$.inputs.parameters[''max_train_secs'']}}", "\", \"--learning_rate=", - "{{$.inputs.parameters[''learning_rate'']}}", "\", \"--optimizer_type=", - "{{$.inputs.parameters[''optimizer_type'']}}", "\", \"--l1_regularization_strength=", - "{{$.inputs.parameters[''l1_regularization_strength'']}}", "\", \"--l2_regularization_strength=", - "{{$.inputs.parameters[''l2_regularization_strength'']}}", "\", \"--l2_shrinkage_regularization_strength=", - "{{$.inputs.parameters[''l2_shrinkage_regularization_strength'']}}", "\", - \"--beta_1=", "{{$.inputs.parameters[''beta_1'']}}", "\", \"--beta_2=", - "{{$.inputs.parameters[''beta_2'']}}", "\", \"--hidden_units=", "{{$.inputs.parameters[''hidden_units'']}}", - "\", \"--use_wide=", "{{$.inputs.parameters[''use_wide'']}}", "\", \"--embed_categories=", - "{{$.inputs.parameters[''embed_categories'']}}", "\", \"--dnn_dropout=", - "{{$.inputs.parameters[''dnn_dropout'']}}", "\", \"--dnn_learning_rate=", - "{{$.inputs.parameters[''dnn_learning_rate'']}}", "\", \"--dnn_optimizer_type=", - "{{$.inputs.parameters[''dnn_optimizer_type'']}}", "\", \"--dnn_l1_regularization_strength=", - "{{$.inputs.parameters[''dnn_l1_regularization_strength'']}}", "\", \"--dnn_l2_regularization_strength=", - "{{$.inputs.parameters[''dnn_l2_regularization_strength'']}}", "\", \"--dnn_l2_shrinkage_regularization_strength=", - "{{$.inputs.parameters[''dnn_l2_shrinkage_regularization_strength'']}}", - "\", \"--dnn_beta_1=", "{{$.inputs.parameters[''dnn_beta_1'']}}", "\", \"--dnn_beta_2=", - "{{$.inputs.parameters[''dnn_beta_2'']}}", "\", \"--enable_profiler=", "{{$.inputs.parameters[''enable_profiler'']}}", - "\", \"--cache_data=", "{{$.inputs.parameters[''cache_data'']}}", "\", \"--seed=", - "{{$.inputs.parameters[''seed'']}}", "\", \"--eval_steps=", "{{$.inputs.parameters[''eval_steps'']}}", - "\", \"--batch_size=", "{{$.inputs.parameters[''batch_size'']}}", "\", \"--measurement_selection_type=", - "{{$.inputs.parameters[''measurement_selection_type'']}}", "\", \"--optimization_metric=", - "{{$.inputs.parameters[''optimization_metric'']}}", "\", \"--eval_frequency_secs=", - "{{$.inputs.parameters[''eval_frequency_secs'']}}", "\", \"--executor_input={{$.json_escape[1]}}\"]}}]}}"]}' - command: - - python3 - - -u - - -m - - google_cloud_pipeline_components.container.v1.custom_job.launcher - image: gcr.io/ml-pipeline/google-cloud-pipeline-components:1.0.44 -pipelineInfo: - description: 'Train a model using the Tabular Workflow for Wide & Deep pipelines. - - Wide & Deep jointly trains wide linear models and deep neural networks. It - - combines the benefits of memorization and generalization.' 
- name: automl-tabular-wide-and-deep-trainer -root: - dag: - outputs: - artifacts: - model-evaluation-evaluation_metrics: - artifactSelectors: - - outputArtifactKey: model-evaluation-evaluation_metrics - producerSubtask: exit-handler-1 - tasks: - automl-tabular-finalizer: - cachingOptions: - enableCache: true - componentRef: - name: comp-automl-tabular-finalizer - dependentTasks: - - exit-handler-1 - inputs: - parameters: - location: - componentInputParameter: location - project: - componentInputParameter: project - root_dir: - componentInputParameter: root_dir - taskInfo: - name: automl-tabular-finalizer - triggerPolicy: - strategy: ALL_UPSTREAM_TASKS_COMPLETED - exit-handler-1: - componentRef: - name: comp-exit-handler-1 - dependentTasks: - - get-model-display-name - - set-optional-inputs - inputs: - artifacts: - pipelinechannel--parent_model: - componentInputArtifact: parent_model - parameters: - pipelinechannel--batch_size: - componentInputParameter: batch_size - pipelinechannel--beta_1: - componentInputParameter: beta_1 - pipelinechannel--beta_2: - componentInputParameter: beta_2 - pipelinechannel--bigquery_staging_full_dataset_id: - componentInputParameter: bigquery_staging_full_dataset_id - pipelinechannel--cache_data: - componentInputParameter: cache_data - pipelinechannel--dataflow_service_account: - componentInputParameter: dataflow_service_account - pipelinechannel--dataflow_subnetwork: - componentInputParameter: dataflow_subnetwork - pipelinechannel--dataflow_use_public_ips: - componentInputParameter: dataflow_use_public_ips - pipelinechannel--dataset_level_custom_transformation_definitions: - componentInputParameter: dataset_level_custom_transformation_definitions - pipelinechannel--dataset_level_transformations: - componentInputParameter: dataset_level_transformations - pipelinechannel--dnn_beta_1: - componentInputParameter: dnn_beta_1 - pipelinechannel--dnn_beta_2: - componentInputParameter: dnn_beta_2 - pipelinechannel--dnn_dropout: - componentInputParameter: dnn_dropout - pipelinechannel--dnn_l1_regularization_strength: - componentInputParameter: dnn_l1_regularization_strength - pipelinechannel--dnn_l2_regularization_strength: - componentInputParameter: dnn_l2_regularization_strength - pipelinechannel--dnn_l2_shrinkage_regularization_strength: - componentInputParameter: dnn_l2_shrinkage_regularization_strength - pipelinechannel--dnn_learning_rate: - componentInputParameter: dnn_learning_rate - pipelinechannel--dnn_optimizer_type: - componentInputParameter: dnn_optimizer_type - pipelinechannel--embed_categories: - componentInputParameter: embed_categories - pipelinechannel--enable_profiler: - componentInputParameter: enable_profiler - pipelinechannel--encryption_spec_key_name: - componentInputParameter: encryption_spec_key_name - pipelinechannel--eval_frequency_secs: - componentInputParameter: eval_frequency_secs - pipelinechannel--eval_steps: - componentInputParameter: eval_steps - pipelinechannel--evaluation_batch_predict_machine_type: - componentInputParameter: evaluation_batch_predict_machine_type - pipelinechannel--evaluation_batch_predict_max_replica_count: - componentInputParameter: evaluation_batch_predict_max_replica_count - pipelinechannel--evaluation_batch_predict_starting_replica_count: - componentInputParameter: evaluation_batch_predict_starting_replica_count - pipelinechannel--evaluation_dataflow_disk_size_gb: - componentInputParameter: evaluation_dataflow_disk_size_gb - pipelinechannel--evaluation_dataflow_machine_type: - componentInputParameter: 
evaluation_dataflow_machine_type - pipelinechannel--evaluation_dataflow_max_num_workers: - componentInputParameter: evaluation_dataflow_max_num_workers - pipelinechannel--evaluation_dataflow_starting_num_workers: - componentInputParameter: evaluation_dataflow_starting_num_workers - pipelinechannel--feature_selection_algorithm: - componentInputParameter: feature_selection_algorithm - pipelinechannel--get-model-display-name-model_display_name: - taskOutputParameter: - outputParameterKey: model_display_name - producerTask: get-model-display-name - pipelinechannel--hidden_units: - componentInputParameter: hidden_units - pipelinechannel--l1_regularization_strength: - componentInputParameter: l1_regularization_strength - pipelinechannel--l2_regularization_strength: - componentInputParameter: l2_regularization_strength - pipelinechannel--l2_shrinkage_regularization_strength: - componentInputParameter: l2_shrinkage_regularization_strength - pipelinechannel--learning_rate: - componentInputParameter: learning_rate - pipelinechannel--location: - componentInputParameter: location - pipelinechannel--materialized_examples_format: - componentInputParameter: materialized_examples_format - pipelinechannel--max_selected_features: - componentInputParameter: max_selected_features - pipelinechannel--max_steps: - componentInputParameter: max_steps - pipelinechannel--max_train_secs: - componentInputParameter: max_train_secs - pipelinechannel--measurement_selection_type: - componentInputParameter: measurement_selection_type - pipelinechannel--model_description: - componentInputParameter: model_description - pipelinechannel--optimization_metric: - componentInputParameter: optimization_metric - pipelinechannel--optimizer_type: - componentInputParameter: optimizer_type - pipelinechannel--predefined_split_key: - componentInputParameter: predefined_split_key - pipelinechannel--prediction_type: - componentInputParameter: prediction_type - pipelinechannel--project: - componentInputParameter: project - pipelinechannel--root_dir: - componentInputParameter: root_dir - pipelinechannel--run_evaluation: - componentInputParameter: run_evaluation - pipelinechannel--run_feature_selection: - componentInputParameter: run_feature_selection - pipelinechannel--seed: - componentInputParameter: seed - pipelinechannel--set-optional-inputs-data_source_bigquery_table_path: - taskOutputParameter: - outputParameterKey: data_source_bigquery_table_path - producerTask: set-optional-inputs - pipelinechannel--set-optional-inputs-data_source_csv_filenames: - taskOutputParameter: - outputParameterKey: data_source_csv_filenames - producerTask: set-optional-inputs - pipelinechannel--stratified_split_key: - componentInputParameter: stratified_split_key - pipelinechannel--target_column: - componentInputParameter: target_column - pipelinechannel--test_fraction: - componentInputParameter: test_fraction - pipelinechannel--tf_auto_transform_features: - componentInputParameter: tf_auto_transform_features - pipelinechannel--tf_custom_transformation_definitions: - componentInputParameter: tf_custom_transformation_definitions - pipelinechannel--tf_transform_execution_engine: - componentInputParameter: tf_transform_execution_engine - pipelinechannel--tf_transformations_path: - componentInputParameter: tf_transformations_path - pipelinechannel--training_fraction: - componentInputParameter: training_fraction - pipelinechannel--transform_dataflow_disk_size_gb: - componentInputParameter: transform_dataflow_disk_size_gb - 
pipelinechannel--transform_dataflow_machine_type: - componentInputParameter: transform_dataflow_machine_type - pipelinechannel--transform_dataflow_max_num_workers: - componentInputParameter: transform_dataflow_max_num_workers - pipelinechannel--use_wide: - componentInputParameter: use_wide - pipelinechannel--validation_fraction: - componentInputParameter: validation_fraction - pipelinechannel--weight_column: - componentInputParameter: weight_column - pipelinechannel--worker_pool_specs_override: - componentInputParameter: worker_pool_specs_override - taskInfo: - name: exit-handler-1 - get-model-display-name: - cachingOptions: - enableCache: true - componentRef: - name: comp-get-model-display-name - inputs: - parameters: - model_display_name: - componentInputParameter: model_display_name - taskInfo: - name: get-model-display-name - set-optional-inputs: - cachingOptions: - enableCache: true - componentRef: - name: comp-set-optional-inputs - inputs: - artifacts: - vertex_dataset: - componentInputArtifact: vertex_dataset - parameters: - data_source_bigquery_table_path: - componentInputParameter: data_source_bigquery_table_path - data_source_csv_filenames: - componentInputParameter: data_source_csv_filenames - location: - componentInputParameter: location - project: - componentInputParameter: project - taskInfo: - name: set-optional-inputs - inputDefinitions: - artifacts: - parent_model: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Parent model if this model is uploaded as a version. - isOptional: true - vertex_dataset: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: The Vertex dataset artifact. - parameters: - batch_size: - defaultValue: 100.0 - description: Batch size for training. - isOptional: true - parameterType: NUMBER_INTEGER - beta_1: - defaultValue: 0.9 - description: Beta 1 value for optimizer_type='adam'. - isOptional: true - parameterType: NUMBER_DOUBLE - beta_2: - defaultValue: 0.999 - description: Beta 2 value for optimizer_type='adam'. - isOptional: true - parameterType: NUMBER_DOUBLE - bigquery_staging_full_dataset_id: - defaultValue: '' - description: Staging directory for BigQuery tables. - isOptional: true - parameterType: STRING - cache_data: - defaultValue: auto - description: 'Whether to cache data or not. If set to ''auto'', caching is - - determined based on the dataset size.' - isOptional: true - parameterType: STRING - data_source_bigquery_table_path: - defaultValue: '' - description: 'The BigQuery table path of format - - bq://bq_project.bq_dataset.bq_table' - isOptional: true - parameterType: STRING - data_source_csv_filenames: - defaultValue: '' - description: 'A string that represents a list of comma - - separated CSV filenames.' - isOptional: true - parameterType: STRING - dataflow_service_account: - defaultValue: '' - description: Custom service account to run dataflow jobs. - isOptional: true - parameterType: STRING - dataflow_subnetwork: - defaultValue: '' - description: 'Dataflow''s fully qualified subnetwork name, when empty - - the default subnetwork will be used. Example: - - https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications' - isOptional: true - parameterType: STRING - dataflow_use_public_ips: - defaultValue: true - description: 'Specifies whether Dataflow workers use public IP - - addresses.' 
- isOptional: true - parameterType: BOOLEAN - dataset_level_custom_transformation_definitions: - description: 'Dataset-level custom - - transformation definitions in string format.' - isOptional: true - parameterType: LIST - dataset_level_transformations: - description: 'Dataset-level transformation configuration in - - string format.' - isOptional: true - parameterType: LIST - dnn_beta_1: - defaultValue: 0.9 - description: Beta 1 value for dnn_optimizer_type='adam'. - isOptional: true - parameterType: NUMBER_DOUBLE - dnn_beta_2: - defaultValue: 0.999 - description: Beta 2 value for dnn_optimizer_type='adam'. - isOptional: true - parameterType: NUMBER_DOUBLE - dnn_dropout: - defaultValue: 0.0 - description: The probability we will drop out a given coordinate. - isOptional: true - parameterType: NUMBER_DOUBLE - dnn_l1_regularization_strength: - defaultValue: 0.0 - description: 'L1 regularization strength for - - dnn_optimizer_type=''ftrl''.' - isOptional: true - parameterType: NUMBER_DOUBLE - dnn_l2_regularization_strength: - defaultValue: 0.0 - description: 'L2 regularization strength for - - dnn_optimizer_type=''ftrl''.' - isOptional: true - parameterType: NUMBER_DOUBLE - dnn_l2_shrinkage_regularization_strength: - defaultValue: 0.0 - description: 'L2 shrinkage regularization - - strength for dnn_optimizer_type=''ftrl''.' - isOptional: true - parameterType: NUMBER_DOUBLE - dnn_learning_rate: - description: 'The learning rate for training the deep part of the - - model.' - parameterType: NUMBER_DOUBLE - dnn_optimizer_type: - defaultValue: adam - description: 'The type of optimizer to use for the deep part of the - - model. Choices are ''adam'', ''ftrl'' and ''sgd''. for the Adam, FTRL, and - - Gradient Descent Optimizers, respectively.' - isOptional: true - parameterType: STRING - embed_categories: - defaultValue: true - description: 'If set to true, the categorical columns will be used - - embedded and used in the deep part of the model. Embedding size is the - - square root of the column cardinality.' - isOptional: true - parameterType: BOOLEAN - enable_profiler: - defaultValue: false - description: Enables profiling and saves a trace during evaluation. - isOptional: true - parameterType: BOOLEAN - encryption_spec_key_name: - defaultValue: '' - description: The KMS key name. - isOptional: true - parameterType: STRING - eval_frequency_secs: - defaultValue: 600.0 - description: 'Frequency at which evaluation and checkpointing will - - take place.' - isOptional: true - parameterType: NUMBER_INTEGER - eval_steps: - defaultValue: 0.0 - description: 'Number of steps to run evaluation for. If not specified or - - negative, it means run evaluation on the whole validation dataset. If set - - to 0, it means run evaluation for a fixed number of samples.' - isOptional: true - parameterType: NUMBER_INTEGER - evaluation_batch_predict_machine_type: - defaultValue: n1-highmem-8 - description: 'The prediction server machine type - - for batch predict components during evaluation.' - isOptional: true - parameterType: STRING - evaluation_batch_predict_max_replica_count: - defaultValue: 20.0 - description: 'The max number of prediction - - server for batch predict components during evaluation.' - isOptional: true - parameterType: NUMBER_INTEGER - evaluation_batch_predict_starting_replica_count: - defaultValue: 20.0 - description: 'The initial number of - - prediction server for batch predict components during evaluation.' 
- isOptional: true - parameterType: NUMBER_INTEGER - evaluation_dataflow_disk_size_gb: - defaultValue: 50.0 - description: 'Dataflow worker''s disk size in GB for - - evaluation components.' - isOptional: true - parameterType: NUMBER_INTEGER - evaluation_dataflow_machine_type: - defaultValue: n1-standard-4 - description: 'The dataflow machine type for evaluation - - components.' - isOptional: true - parameterType: STRING - evaluation_dataflow_max_num_workers: - defaultValue: 100.0 - description: 'The max number of Dataflow workers for - - evaluation components.' - isOptional: true - parameterType: NUMBER_INTEGER - evaluation_dataflow_starting_num_workers: - defaultValue: 10.0 - description: 'The initial number of Dataflow - - workers for evaluation components.' - isOptional: true - parameterType: NUMBER_INTEGER - feature_selection_algorithm: - defaultValue: AMI - description: Feature selection algorithm. - isOptional: true - parameterType: STRING - hidden_units: - defaultValue: 30,30,30 - description: 'Hidden layer sizes to use for DNN feature columns, provided - in - - comma-separated layers.' - isOptional: true - parameterType: STRING - l1_regularization_strength: - defaultValue: 0.0 - description: 'L1 regularization strength for - - optimizer_type=''ftrl''.' - isOptional: true - parameterType: NUMBER_DOUBLE - l2_regularization_strength: - defaultValue: 0.0 - description: 'L2 regularization strength for - - optimizer_type=''ftrl''.' - isOptional: true - parameterType: NUMBER_DOUBLE - l2_shrinkage_regularization_strength: - defaultValue: 0.0 - description: 'L2 shrinkage regularization strength - - for optimizer_type=''ftrl''.' - isOptional: true - parameterType: NUMBER_DOUBLE - learning_rate: - description: The learning rate used by the linear optimizer. - parameterType: NUMBER_DOUBLE - location: - description: The GCP region that runs the pipeline components. - parameterType: STRING - materialized_examples_format: - defaultValue: tfrecords_gzip - description: The format for the materialized examples. - isOptional: true - parameterType: STRING - max_selected_features: - defaultValue: -1.0 - description: Maximum number of features to select. - isOptional: true - parameterType: NUMBER_INTEGER - max_steps: - defaultValue: -1.0 - description: Number of steps to run the trainer for. - isOptional: true - parameterType: NUMBER_INTEGER - max_train_secs: - defaultValue: -1.0 - description: Amount of time in seconds to run the trainer for. - isOptional: true - parameterType: NUMBER_INTEGER - measurement_selection_type: - defaultValue: BEST_MEASUREMENT - description: 'Which measurement to use if/when the service - - automatically selects the final measurement from previously reported - - intermediate measurements. One of "BEST_MEASUREMENT" or - - "LAST_MEASUREMENT".' - isOptional: true - parameterType: STRING - model_description: - defaultValue: '' - description: The description name of the uploaded Vertex model. - isOptional: true - parameterType: STRING - model_display_name: - defaultValue: '' - description: The display name of the uploaded Vertex model. - isOptional: true - parameterType: STRING - optimization_metric: - defaultValue: '' - description: 'Optimization metric used for - - `measurement_selection_type`. Default is "rmse" for regression and "auc" - - for classification.' - isOptional: true - parameterType: STRING - optimizer_type: - defaultValue: adam - description: 'The type of optimizer to use. 
Choices are "adam", "ftrl" and - - "sgd" for the Adam, FTRL, and Gradient Descent Optimizers, respectively.' - isOptional: true - parameterType: STRING - predefined_split_key: - defaultValue: '' - description: Predefined split key. - isOptional: true - parameterType: STRING - prediction_type: - description: 'The type of prediction the model is to produce. - - "classification" or "regression".' - parameterType: STRING - project: - description: The GCP project that runs the pipeline components. - parameterType: STRING - root_dir: - description: The root GCS directory for the pipeline components. - parameterType: STRING - run_evaluation: - defaultValue: false - description: Whether to run evaluation steps during training. - isOptional: true - parameterType: BOOLEAN - run_feature_selection: - defaultValue: false - description: Whether to enable feature selection. - isOptional: true - parameterType: BOOLEAN - seed: - defaultValue: 1.0 - description: Seed to be used for this run. - isOptional: true - parameterType: NUMBER_INTEGER - stratified_split_key: - defaultValue: '' - description: Stratified split key. - isOptional: true - parameterType: STRING - target_column: - description: The target column name. - parameterType: STRING - test_fraction: - defaultValue: -1.0 - description: Test fraction. - isOptional: true - parameterType: NUMBER_DOUBLE - tf_auto_transform_features: - description: List of auto transform features. - isOptional: true - parameterType: STRUCT - tf_custom_transformation_definitions: - description: 'TF custom transformation definitions - - in string format.' - isOptional: true - parameterType: LIST - tf_transform_execution_engine: - defaultValue: bigquery - description: 'Execution engine to run TF-based - - transformations. Currently supports "dataflow" or "bigquery"' - isOptional: true - parameterType: STRING - tf_transformations_path: - defaultValue: '' - description: Path to TF transformation configuration. - isOptional: true - parameterType: STRING - training_fraction: - defaultValue: -1.0 - description: Training fraction. - isOptional: true - parameterType: NUMBER_DOUBLE - transform_dataflow_disk_size_gb: - defaultValue: 40.0 - description: 'Dataflow worker''s disk size in GB for - - transform component.' - isOptional: true - parameterType: NUMBER_INTEGER - transform_dataflow_machine_type: - defaultValue: n1-standard-16 - description: 'The dataflow machine type for transform - - component.' - isOptional: true - parameterType: STRING - transform_dataflow_max_num_workers: - defaultValue: 25.0 - description: 'The max number of Dataflow workers for - - transform component.' - isOptional: true - parameterType: NUMBER_INTEGER - use_wide: - defaultValue: true - description: 'If set to true, the categorical columns will be used in the - wide - - part of the DNN model.' - isOptional: true - parameterType: BOOLEAN - validation_fraction: - defaultValue: -1.0 - description: Validation fraction. - isOptional: true - parameterType: NUMBER_DOUBLE - weight_column: - defaultValue: '' - description: The weight column name. - isOptional: true - parameterType: STRING - worker_pool_specs_override: - description: 'The dictionary for overriding training and - - evaluation worker pool specs. The dictionary should be of format - - https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.' 
- isOptional: true - parameterType: LIST - outputDefinitions: - artifacts: - model-evaluation-evaluation_metrics: - artifactType: - schemaTitle: system.Metrics - schemaVersion: 0.0.1 -schemaVersion: 2.1.0 -sdkVersion: kfp-2.0.0-rc.2