|
18 | 18 | from sync.api import projects |
19 | 19 | from sync.clients.databricks import get_default_client |
20 | 20 | from sync.config import CONFIG # noqa F401 |
21 | | -from sync.models import DatabricksAPIError, DatabricksClusterReport, DatabricksError, Response |
| 21 | +from sync.models import ( |
| 22 | + DatabricksAPIError, |
| 23 | + DatabricksClusterReport, |
| 24 | + DatabricksError, |
| 25 | + DatabricksComputeType, |
| 26 | + DatabricksPlanType, |
| 27 | + Response |
| 28 | +) |
22 | 29 | from sync.utils.dbfs import format_dbfs_filepath, read_dbfs_file |
23 | 30 |
|
24 | 31 | logger = logging.getLogger(__name__) |
@@ -56,6 +63,58 @@ def get_cluster(cluster_id: str) -> Response[dict]: |
56 | 63 | return Response(result=cluster) |
57 | 64 |
|
58 | 65 |
|
def create_submission_with_cluster_info(
    run_id: str,
    project_id: str,
    cluster: Dict,
    cluster_info: Dict,
    cluster_activity_events: Dict,
    plan_type: DatabricksPlanType,
    compute_type: DatabricksComputeType,
) -> Response[str]:
    """Create a Submission for the specified Databricks run given pre-fetched cluster data.

    :param run_id: Databricks run ID
    :param project_id: Sync project ID the submission belongs to
    :param cluster: cluster definition for the run
    :param cluster_info: instance/provider information for the cluster
    :param cluster_activity_events: cluster event payload for the run's time window
    :param plan_type: Databricks plan type
    :param compute_type: Databricks compute type
    :return: Response wrapping the created submission ID, or the first error encountered
    """
    run = get_default_client().get_run(run_id)
    if "error_code" in run:
        return Response(error=DatabricksAPIError(**run))

    project_response = projects.get_project(project_id)
    if project_response.error:
        return project_response
    cluster_path = project_response.result.get("cluster_path")

    project_cluster_tasks = _get_project_cluster_tasks(run, project_id, cluster_path)

    cluster_tasks = project_cluster_tasks.get(project_id)
    if not cluster_tasks:
        return Response(
            error=DatabricksError(
                message=f"Failed to locate cluster in run {run_id} for project {project_id}"
            )
        )

    _, tasks = cluster_tasks

    cluster_report = _create_cluster_report(
        cluster=cluster,
        cluster_info=cluster_info,
        cluster_activity_events=cluster_activity_events,
        tasks=tasks,
        plan_type=plan_type,
        compute_type=compute_type,
    )

    # Propagate event-log retrieval failures instead of silently submitting a
    # None eventlog (previously `.result` was read without checking `.error`).
    eventlog_response = _get_event_log_from_cluster(cluster, tasks)
    if eventlog_response.error:
        return eventlog_response

    return projects.create_project_submission_with_eventlog_bytes(
        get_default_client().get_platform(),
        cluster_report.dict(exclude_none=True),
        "eventlog.zip",
        eventlog_response.result,
        project_id,
    )
| 116 | + |
| 117 | + |
59 | 118 | def create_submission_for_run( |
60 | 119 | run_id: str, |
61 | 120 | plan_type: str, |
@@ -160,19 +219,27 @@ def _get_run_information( |
160 | 219 | cluster_report = cluster_report_response.result |
161 | 220 | if cluster_report: |
162 | 221 | cluster = cluster_report.cluster |
163 | | - spark_context_id = _get_run_spark_context_id(tasks) |
164 | | - end_time = max(task["end_time"] for task in tasks) |
165 | | - eventlog_response = _get_eventlog(cluster, spark_context_id.result, end_time) |
166 | | - |
| 222 | + eventlog_response = _get_event_log_from_cluster(cluster, tasks) |
167 | 223 | eventlog = eventlog_response.result |
168 | 224 | if eventlog: |
169 | | - # TODO - allow submissions w/out eventlog. Best way to make eventlog optional?.. |
170 | 225 | return Response(result=(cluster_report, eventlog)) |
171 | 226 |
|
172 | | - return eventlog_response |
173 | 227 | return cluster_report_response |
174 | 228 |
|
175 | 229 |
|
def _get_event_log_from_cluster(cluster: Dict, tasks: List[Dict]) -> Response[bytes]:
    """Fetch the Spark event log for the cluster that executed ``tasks``.

    :param cluster: cluster definition the tasks ran on
    :param tasks: run tasks; each is expected to carry an ``end_time`` key
    :return: Response with the event log bytes, or the error from any failed step
    """
    if not tasks:
        # max() below would raise ValueError on an empty sequence
        return Response(error=DatabricksError(message="No tasks to locate an event log for"))

    spark_context_id = _get_run_spark_context_id(tasks)
    if spark_context_id.error:
        # Without a Spark context ID the event log cannot be located;
        # previously `.result` (None) was passed through unchecked.
        return spark_context_id

    end_time = max(task["end_time"] for task in tasks)
    # TODO - allow submissions w/out eventlog. Best way to make eventlog optional?..
    return _get_eventlog(cluster, spark_context_id.result, end_time)
| 241 | + |
| 242 | + |
176 | 243 | def get_cluster_report( |
177 | 244 | run_id: str, |
178 | 245 | plan_type: str, |
@@ -240,6 +307,17 @@ def _get_cluster_report( |
240 | 307 | raise NotImplementedError() |
241 | 308 |
|
242 | 309 |
|
def _create_cluster_report(
    cluster: dict,
    cluster_info: dict,
    cluster_activity_events: dict,
    tasks: List[dict],
    plan_type: DatabricksPlanType,
    compute_type: DatabricksComputeType
) -> DatabricksClusterReport:
    """Build a DatabricksClusterReport from pre-fetched cluster data.

    NOTE(review): placeholder — not yet implemented. Any caller (e.g.
    ``create_submission_with_cluster_info``) will currently raise
    NotImplementedError when it reaches this call.
    """
    raise NotImplementedError()
| 319 | + |
| 320 | + |
243 | 321 | def _get_cluster_instances_from_dbfs(filepath: str): |
244 | 322 | filepath = format_dbfs_filepath(filepath) |
245 | 323 | dbx_client = get_default_client() |
@@ -1493,7 +1571,7 @@ def _get_eventlog( |
1493 | 1571 | return Response(error=DatabricksError(message=f"Unknown log destination: {filesystem}")) |
1494 | 1572 |
|
1495 | 1573 |
|
1496 | | -def _get_all_cluster_events(cluster_id: str): |
| 1574 | +def get_all_cluster_events(cluster_id: str): |
1497 | 1575 | """Fetches all ClusterEvents for a given Databricks cluster, optionally within a time window. |
1498 | 1576 | Pages will be followed and returned as 1 object |
1499 | 1577 | """ |
|
0 commit comments