Skip to content

Commit dc7a217

Browse files
[PRO-16170] Give ability to set keys, create submission with cluster report, and expose cluster events (#99)
* Give ability to set keys, create submission with cluster report, and expose cluster events --------- Co-authored-by: Brandon Kaplan <Bkaplan31@gmail.com>
1 parent 102cd8a commit dc7a217

File tree

7 files changed

+219
-19
lines changed

7 files changed

+219
-19
lines changed

sync/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
"""Library for leveraging the power of Sync"""
2-
__version__ = "1.0.3"
2+
__version__ = "1.1.0"
33

44
TIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

sync/_databricks.py

Lines changed: 86 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,14 @@
1818
from sync.api import projects
1919
from sync.clients.databricks import get_default_client
2020
from sync.config import CONFIG # noqa F401
21-
from sync.models import DatabricksAPIError, DatabricksClusterReport, DatabricksError, Response
21+
from sync.models import (
22+
DatabricksAPIError,
23+
DatabricksClusterReport,
24+
DatabricksError,
25+
DatabricksComputeType,
26+
DatabricksPlanType,
27+
Response
28+
)
2229
from sync.utils.dbfs import format_dbfs_filepath, read_dbfs_file
2330

2431
logger = logging.getLogger(__name__)
@@ -56,6 +63,58 @@ def get_cluster(cluster_id: str) -> Response[dict]:
5663
return Response(result=cluster)
5764

5865

66+
def create_submission_with_cluster_info(
67+
run_id: str,
68+
project_id: str,
69+
cluster: Dict,
70+
cluster_info: Dict,
71+
cluster_activity_events: Dict,
72+
plan_type: DatabricksPlanType,
73+
compute_type: DatabricksComputeType,
74+
) -> Response[str]:
75+
"""Create a Submission for the specified Databricks run given a cluster report"""
76+
77+
run = get_default_client().get_run(run_id)
78+
79+
if "error_code" in run:
80+
return Response(error=DatabricksAPIError(**run))
81+
82+
project_response = projects.get_project(project_id)
83+
if project_response.error:
84+
return project_response
85+
cluster_path = project_response.result.get("cluster_path")
86+
87+
project_cluster_tasks = _get_project_cluster_tasks(run, project_id, cluster_path)
88+
89+
cluster_tasks = project_cluster_tasks.get(project_id)
90+
if not cluster_tasks:
91+
return Response(
92+
error=DatabricksError(
93+
message=f"Failed to locate cluster in run {run_id} for project {project_id}"
94+
)
95+
)
96+
97+
_, tasks = cluster_tasks
98+
99+
cluster_report = _create_cluster_report(
100+
cluster=cluster,
101+
cluster_info=cluster_info,
102+
cluster_activity_events=cluster_activity_events,
103+
tasks=tasks,
104+
plan_type=plan_type,
105+
compute_type=compute_type
106+
)
107+
eventlog = _get_event_log_from_cluster(cluster, tasks).result
108+
109+
return projects.create_project_submission_with_eventlog_bytes(
110+
get_default_client().get_platform(),
111+
cluster_report.dict(exclude_none=True),
112+
"eventlog.zip",
113+
eventlog,
114+
project_id,
115+
)
116+
117+
59118
def create_submission_for_run(
60119
run_id: str,
61120
plan_type: str,
@@ -160,19 +219,27 @@ def _get_run_information(
160219
cluster_report = cluster_report_response.result
161220
if cluster_report:
162221
cluster = cluster_report.cluster
163-
spark_context_id = _get_run_spark_context_id(tasks)
164-
end_time = max(task["end_time"] for task in tasks)
165-
eventlog_response = _get_eventlog(cluster, spark_context_id.result, end_time)
166-
222+
eventlog_response = _get_event_log_from_cluster(cluster, tasks)
167223
eventlog = eventlog_response.result
168224
if eventlog:
169-
# TODO - allow submissions w/out eventlog. Best way to make eventlog optional?..
170225
return Response(result=(cluster_report, eventlog))
171226

172-
return eventlog_response
173227
return cluster_report_response
174228

175229

230+
def _get_event_log_from_cluster(cluster: Dict, tasks: List[Dict]) -> Response[bytes]:
231+
spark_context_id = _get_run_spark_context_id(tasks)
232+
end_time = max(task["end_time"] for task in tasks)
233+
eventlog_response = _get_eventlog(cluster, spark_context_id.result, end_time)
234+
235+
eventlog = eventlog_response.result
236+
if eventlog:
237+
# TODO - allow submissions w/out eventlog. Best way to make eventlog optional?..
238+
return Response(result=eventlog)
239+
240+
return eventlog_response # return eventlog response with errors
241+
242+
176243
def get_cluster_report(
177244
run_id: str,
178245
plan_type: str,
@@ -240,6 +307,17 @@ def _get_cluster_report(
240307
raise NotImplementedError()
241308

242309

310+
def _create_cluster_report(
311+
cluster: dict,
312+
cluster_info: dict,
313+
cluster_activity_events: dict,
314+
tasks: List[dict],
315+
plan_type: DatabricksPlanType,
316+
compute_type: DatabricksComputeType
317+
) -> DatabricksClusterReport:
318+
raise NotImplementedError()
319+
320+
243321
def _get_cluster_instances_from_dbfs(filepath: str):
244322
filepath = format_dbfs_filepath(filepath)
245323
dbx_client = get_default_client()
@@ -1493,7 +1571,7 @@ def _get_eventlog(
14931571
return Response(error=DatabricksError(message=f"Unknown log destination: {filesystem}"))
14941572

14951573

1496-
def _get_all_cluster_events(cluster_id: str):
1574+
def get_all_cluster_events(cluster_id: str):
14971575
"""Fetches all ClusterEvents for a given Databricks cluster, optionally within a time window.
14981576
Pages will be followed and returned as 1 object
14991577
"""

sync/awsdatabricks.py

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import sync._databricks
1313
from sync._databricks import (
1414
_cluster_log_destination,
15-
_get_all_cluster_events,
15+
get_all_cluster_events,
1616
_get_cluster_instances_from_dbfs,
1717
_update_monitored_timelines,
1818
_wait_for_cluster_termination,
@@ -22,6 +22,7 @@
2222
create_cluster,
2323
create_run,
2424
create_submission_for_run,
25+
create_submission_with_cluster_info,
2526
get_cluster,
2627
get_cluster_report,
2728
get_project_cluster,
@@ -48,6 +49,8 @@
4849
AccessStatusCode,
4950
AWSDatabricksClusterReport,
5051
DatabricksError,
52+
DatabricksPlanType,
53+
DatabricksComputeType,
5154
Response,
5255
)
5356
from sync.utils.dbfs import format_dbfs_filepath, write_dbfs_file
@@ -57,7 +60,9 @@
5760
"get_access_report",
5861
"run_and_record_job",
5962
"create_submission_for_run",
63+
"create_submission_with_cluster_info",
6064
"get_cluster_report",
65+
"get_all_cluster_events",
6166
"monitor_cluster",
6267
"create_cluster",
6368
"get_cluster",
@@ -217,7 +222,7 @@ def _get_cluster_report(
217222
else:
218223
timelines = timeline_response.result
219224

220-
cluster_events = _get_all_cluster_events(cluster_id)
225+
cluster_events = get_all_cluster_events(cluster_id)
221226
return Response(
222227
result=AWSDatabricksClusterReport(
223228
plan_type=plan_type,
@@ -232,12 +237,33 @@ def _get_cluster_report(
232237
)
233238

234239

240+
def _create_cluster_report(
241+
cluster: dict,
242+
cluster_info: dict,
243+
cluster_activity_events: dict,
244+
tasks: List[dict],
245+
plan_type: DatabricksPlanType,
246+
compute_type: DatabricksComputeType,
247+
) -> AWSDatabricksClusterReport:
248+
return AWSDatabricksClusterReport(
249+
plan_type=plan_type,
250+
compute_type=compute_type,
251+
cluster=cluster,
252+
cluster_events=cluster_activity_events,
253+
tasks=tasks,
254+
instances=cluster_info.get("instances"),
255+
volumes=cluster_info.get("volumes"),
256+
instance_timelines=cluster_info.get("instance_timelines"),
257+
)
258+
259+
235260
if getattr(sync._databricks, "__claim", __name__) != __name__:
236261
raise RuntimeError(
237262
"Databricks modules for different cloud providers cannot be used in the same context"
238263
)
239264

240265
sync._databricks._get_cluster_report = _get_cluster_report
266+
sync._databricks._create_cluster_report = _create_cluster_report
241267
setattr(sync._databricks, "__claim", __name__)
242268

243269

@@ -328,6 +354,7 @@ def monitor_cluster(
328354
cluster_id: str,
329355
polling_period: int = 20,
330356
cluster_report_destination_override: dict = None,
357+
kill_on_termination: bool = False,
331358
) -> None:
332359
cluster = get_default_client().get_cluster(cluster_id)
333360
spark_context_id = cluster.get("spark_context_id")
@@ -350,6 +377,7 @@ def monitor_cluster(
350377
cluster_id,
351378
spark_context_id,
352379
polling_period,
380+
kill_on_termination,
353381
)
354382
else:
355383
logger.warning("Unable to monitor cluster due to missing cluster log destination - exiting")
@@ -360,6 +388,7 @@ def _monitor_cluster(
360388
cluster_id: str,
361389
spark_context_id: int,
362390
polling_period: int,
391+
kill_on_termination: bool = False,
363392
) -> None:
364393

365394
(log_url, filesystem, bucket, base_prefix) = cluster_log_destination
@@ -377,14 +406,16 @@ def _monitor_cluster(
377406
active_timelines_by_id = {}
378407
retired_timelines = []
379408
recorded_volumes_by_id = {}
380-
while True:
409+
410+
while_condition = True
411+
while while_condition:
381412
try:
382413
current_insts = _get_ec2_instances(cluster_id, ec2)
383414
recorded_volumes_by_id.update(
384415
{v["VolumeId"]: v for v in _get_ebs_volumes_for_instances(current_insts, ec2)}
385416
)
386417

387-
# Record new (or overrwite) existing instances.
418+
# Record new (or overwrite) existing instances.
388419
# Separately record the ids of those that are in the "running" state.
389420
running_inst_ids = set({})
390421
for inst in current_insts:
@@ -412,6 +443,11 @@ def _monitor_cluster(
412443
"utf-8",
413444
)
414445
)
446+
447+
if kill_on_termination:
448+
cluster_state = get_default_client().get_cluster(cluster_id).get("state")
449+
if cluster_state == "TERMINATED":
450+
while_condition = False
415451
except Exception as e:
416452
logger.error(f"Exception encountered while polling cluster: {e}")
417453

sync/azuredatabricks.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import sync._databricks
1616
from sync._databricks import (
1717
_cluster_log_destination,
18-
_get_all_cluster_events,
18+
get_all_cluster_events,
1919
_get_cluster_instances_from_dbfs,
2020
_update_monitored_timelines,
2121
_wait_for_cluster_termination,
@@ -25,6 +25,7 @@
2525
create_cluster,
2626
create_run,
2727
create_submission_for_run,
28+
create_submission_with_cluster_info,
2829
get_cluster,
2930
get_cluster_report,
3031
get_project_cluster,
@@ -50,6 +51,8 @@
5051
AccessStatusCode,
5152
AzureDatabricksClusterReport,
5253
DatabricksError,
54+
DatabricksPlanType,
55+
DatabricksComputeType,
5356
Response,
5457
)
5558
from sync.utils.dbfs import format_dbfs_filepath, write_dbfs_file
@@ -62,7 +65,9 @@
6265
"create_cluster",
6366
"get_cluster",
6467
"create_submission_for_run",
68+
"create_submission_with_cluster_info",
6569
"get_cluster_report",
70+
"get_all_cluster_events",
6671
"handle_successful_job_run",
6772
"record_run",
6873
"get_project_cluster",
@@ -209,7 +214,7 @@ def _get_cluster_report(
209214
else:
210215
return instances
211216

212-
cluster_events = _get_all_cluster_events(cluster_id)
217+
cluster_events = get_all_cluster_events(cluster_id)
213218
return Response(
214219
result=AzureDatabricksClusterReport(
215220
plan_type=plan_type,
@@ -223,6 +228,25 @@ def _get_cluster_report(
223228
)
224229

225230

231+
def _create_cluster_report(
232+
cluster: dict,
233+
cluster_info: dict,
234+
cluster_activity_events: dict,
235+
tasks: List[dict],
236+
plan_type: DatabricksPlanType,
237+
compute_type: DatabricksComputeType
238+
) -> AzureDatabricksClusterReport:
239+
return AzureDatabricksClusterReport(
240+
plan_type=plan_type,
241+
compute_type=compute_type,
242+
cluster=cluster,
243+
cluster_events=cluster_activity_events,
244+
tasks=tasks,
245+
instances=cluster_info.get("instances"),
246+
instance_timelines=cluster_info.get("timelines")
247+
)
248+
249+
226250
if getattr(sync._databricks, "__claim", __name__) != __name__:
227251
# Unless building documentation you can't load both databricks modules in the same program
228252
if not sys.argv[0].endswith("sphinx-build"):
@@ -231,6 +255,7 @@ def _get_cluster_report(
231255
)
232256

233257
sync._databricks._get_cluster_report = _get_cluster_report
258+
sync._databricks._create_cluster_report = _create_cluster_report
234259
setattr(sync._databricks, "__claim", __name__)
235260

236261

sync/config.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,14 @@ def get_api_key() -> APIKey:
116116
return _api_key
117117

118118

119+
def set_api_key(api_key: APIKey):
120+
global _api_key
121+
if _api_key is not None:
122+
raise RuntimeError("Sync API key/secret has already been set and the library does not support resetting "
123+
"credentials")
124+
_api_key = api_key
125+
126+
119127
def get_config() -> Configuration:
120128
"""Gets configuration
121129
@@ -138,6 +146,14 @@ def get_databricks_config() -> DatabricksConf:
138146
return _db_config
139147

140148

149+
def set_databricks_config(db_config: DatabricksConf):
150+
global _db_config
151+
if _db_config is not None:
152+
raise RuntimeError("Databricks config has already been set and the library does not support resetting "
153+
"credentials")
154+
_db_config = db_config
155+
156+
141157
CONFIG: Configuration
142158
_config = None
143159
API_KEY: APIKey

0 commit comments

Comments (0)