Skip to content

Commit a8b6555

Browse files
authored
Update cluster when config file changes (#205)
Move the code that expands the config file template to a separate lambda and custom resource so we can get its sha512 hash and use it to determine when the cluster needs to be updated. Resolves #202
1 parent 9d3c16f commit a8b6555

File tree

4 files changed

+226
-48
lines changed

4 files changed

+226
-48
lines changed

source/cdk/cdk_slurm_stack.py

Lines changed: 79 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1049,11 +1049,59 @@ def create_parallel_cluster_lambdas(self):
10491049
)
10501050
)
10511051

1052+
createParallelClusterConfigLambdaAsset = s3_assets.Asset(self, "CreateParallelClusterConfigAsset", path="resources/lambdas/CreateParallelClusterConfig")
1053+
self.create_parallel_cluster_config_lambda = aws_lambda.Function(
1054+
self, "CreateParallelClusterConfigLambda",
1055+
function_name=f"{self.stack_name}-CreateParallelClusterConfig",
1056+
description="Create ParallelCluster config",
1057+
memory_size=2048,
1058+
runtime=aws_lambda.Runtime.PYTHON_3_9,
1059+
architecture=aws_lambda.Architecture.X86_64,
1060+
timeout=Duration.minutes(15),
1061+
log_retention=logs.RetentionDays.INFINITE,
1062+
handler="CreateParallelClusterConfig.lambda_handler",
1063+
code=aws_lambda.Code.from_bucket(createParallelClusterConfigLambdaAsset.bucket, createParallelClusterConfigLambdaAsset.s3_object_key),
1064+
layers=[self.parallel_cluster_lambda_layer],
1065+
environment = {
1066+
'ClusterName': self.config['slurm']['ClusterName'],
1067+
'ErrorSnsTopicArn': self.config.get('ErrorSnsTopicArn', ''),
1068+
'ParallelClusterConfigS3Bucket': self.assets_bucket,
1069+
'ParallelClusterConfigYamlTemplateS3Key': self.parallel_cluster_config_template_yaml_s3_key,
1070+
'ParallelClusterConfigYamlS3Key': self.parallel_cluster_config_yaml_s3_key,
1071+
'Region': self.cluster_region
1072+
}
1073+
)
1074+
self.create_parallel_cluster_config_lambda.add_to_role_policy(
1075+
statement=iam.PolicyStatement(
1076+
effect=iam.Effect.ALLOW,
1077+
actions=[
1078+
's3:DeleteObject',
1079+
's3:GetObject',
1080+
's3:PutObject'
1081+
],
1082+
resources=[
1083+
f"arn:{Aws.PARTITION}:s3:::{self.assets_bucket}/{self.config['slurm']['ClusterName']}/*",
1084+
f"arn:{Aws.PARTITION}:s3:::{self.assets_bucket}/{self.config['slurm']['ClusterName']}/{self.parallel_cluster_config_template_yaml_s3_key}",
1085+
f"arn:{Aws.PARTITION}:s3:::{self.assets_bucket}/{self.config['slurm']['ClusterName']}/{self.parallel_cluster_config_yaml_s3_key}"
1086+
]
1087+
)
1088+
)
1089+
if 'ErrorSnsTopicArn' in self.config:
1090+
self.create_parallel_cluster_config_lambda.add_to_role_policy(
1091+
statement=iam.PolicyStatement(
1092+
effect=iam.Effect.ALLOW,
1093+
actions=[
1094+
'sns:Publish'
1095+
],
1096+
resources=[self.config['ErrorSnsTopicArn']]
1097+
)
1098+
)
1099+
10521100
createParallelClusterLambdaAsset = s3_assets.Asset(self, "CreateParallelClusterAsset", path="resources/lambdas/CreateParallelCluster")
10531101
self.create_parallel_cluster_lambda = aws_lambda.Function(
10541102
self, "CreateParallelClusterLambda",
10551103
function_name=f"{self.stack_name}-CreateParallelCluster",
1056-
description="Create ParallelCluster from json string",
1104+
description="Create ParallelCluster",
10571105
memory_size=2048,
10581106
runtime=aws_lambda.Runtime.PYTHON_3_9,
10591107
architecture=aws_lambda.Architecture.X86_64,
@@ -2380,7 +2428,7 @@ def create_parallel_cluster_config(self):
23802428
index = 0
23812429
for extra_mount_sg_name, extra_mount_sg in self.extra_mount_security_groups[fs_type].items():
23822430
template_var = f"ExtraMountSecurityGroupId{index}"
2383-
self.create_parallel_cluster_lambda.add_environment(
2431+
self.create_parallel_cluster_config_lambda.add_environment(
23842432
key = template_var,
23852433
value = extra_mount_sg.security_group_id
23862434
)
@@ -2838,50 +2886,64 @@ def create_parallel_cluster_config(self):
28382886
self.parallel_cluster_config['SharedStorage'].append(parallel_cluster_storage_dict)
28392887

28402888
# Save the config template to s3.
2889+
self.parallel_cluster_config_template_yaml = yaml.dump(self.parallel_cluster_config)
2890+
self.parallel_cluster_config_template_yaml_hash = sha512()
2891+
self.parallel_cluster_config_template_yaml_hash.update(bytes(self.parallel_cluster_config_template_yaml, 'utf-8'))
2892+
self.assets_hash.update(bytes(self.parallel_cluster_config_template_yaml, 'utf-8'))
28412893
self.s3_client.put_object(
28422894
Bucket = self.assets_bucket,
28432895
Key = self.parallel_cluster_config_template_yaml_s3_key,
2844-
Body = yaml.dump(self.parallel_cluster_config)
2896+
Body = self.parallel_cluster_config_template_yaml
28452897
)
28462898

28472899
self.build_config_files = CustomResource(
28482900
self, "BuildConfigFiles",
28492901
service_token = self.create_build_files_lambda.function_arn
28502902
)
28512903

2852-
self.create_parallel_cluster_lambda.add_environment(
2904+
self.create_parallel_cluster_config_lambda.add_environment(
28532905
key = 'ParallelClusterAssetReadPolicyArn',
28542906
value = self.parallel_cluster_asset_read_policy.managed_policy_arn
28552907
)
2856-
self.create_parallel_cluster_lambda.add_environment(
2908+
self.create_parallel_cluster_config_lambda.add_environment(
28572909
key = 'ParallelClusterJwtWritePolicyArn',
28582910
value = self.parallel_cluster_jwt_write_policy.managed_policy_arn
28592911
)
2860-
self.create_parallel_cluster_lambda.add_environment(
2912+
self.create_parallel_cluster_config_lambda.add_environment(
28612913
key = 'ParallelClusterMungeKeyWritePolicyArn',
28622914
value = self.parallel_cluster_munge_key_write_policy.managed_policy_arn
28632915
)
2864-
self.create_parallel_cluster_lambda.add_environment(
2916+
self.create_parallel_cluster_config_lambda.add_environment(
28652917
key = 'ParallelClusterSnsPublishPolicyArn',
28662918
value = self.parallel_cluster_sns_publish_policy.managed_policy_arn
28672919
)
2868-
self.create_parallel_cluster_lambda.add_environment(
2920+
self.create_parallel_cluster_config_lambda.add_environment(
28692921
key = 'SlurmCtlSecurityGroupId',
28702922
value = self.slurmctl_sg.security_group_id
28712923
)
2872-
self.create_parallel_cluster_lambda.add_environment(
2924+
self.create_parallel_cluster_config_lambda.add_environment(
28732925
key = 'SlurmNodeSecurityGroupId',
28742926
value = self.slurmnode_sg.security_group_id
28752927
)
2928+
self.parallel_cluster_config = CustomResource(
2929+
self, "ParallelClusterConfig",
2930+
service_token = self.create_parallel_cluster_config_lambda.function_arn,
2931+
properties = {
2932+
'ParallelClusterConfigTemplateYamlHash': self.parallel_cluster_config_template_yaml_hash.hexdigest()
2933+
}
2934+
)
2935+
self.parallel_cluster_config_template_yaml_s3_url = self.parallel_cluster_config.get_att_string('ConfigTemplateYamlS3Url')
2936+
self.parallel_cluster_config_yaml_s3_url = self.parallel_cluster_config.get_att_string('ConfigYamlS3Url')
2937+
self.parallel_cluster_config_yaml_hash = self.parallel_cluster_config.get_att_string('ConfigYamlHash')
2938+
self.assets_hash.update(bytes(self.parallel_cluster_config_yaml_hash, 'utf-8'))
2939+
28762940
self.parallel_cluster = CustomResource(
28772941
self, "ParallelCluster",
28782942
service_token = self.create_parallel_cluster_lambda.function_arn,
28792943
properties = {
2880-
'ParallelClusterConfigHash': self.assets_hash.hexdigest()
2944+
'ParallelClusterConfigHash': self.parallel_cluster_config_yaml_hash
28812945
}
28822946
)
2883-
self.parallel_cluster_config_template_yaml_s3_url = self.parallel_cluster.get_att_string('ConfigTemplateYamlS3Url')
2884-
self.parallel_cluster_config_yaml_s3_url = self.parallel_cluster.get_att_string('ConfigYamlS3Url')
28852947
# The lambda to create an A record for the head node must be built before the parallel cluster.
28862948
self.parallel_cluster.node.add_dependency(self.create_head_node_a_record_lambda)
28872949
self.parallel_cluster.node.add_dependency(self.update_head_node_lambda)
@@ -2891,6 +2953,7 @@ def create_parallel_cluster_config(self):
28912953
self.parallel_cluster.node.add_dependency(self.configure_res_submitters_lambda)
28922954
# Build config files need to be created before cluster so that they can be downloaded as part of on_head_node_configures
28932955
self.parallel_cluster.node.add_dependency(self.build_config_files)
2956+
self.parallel_cluster.node.add_dependency(self.parallel_cluster_config)
28942957

28952958
self.call_slurm_rest_api_lambda.node.add_dependency(self.parallel_cluster)
28962959

@@ -2899,7 +2962,7 @@ def create_parallel_cluster_config(self):
28992962
self, "UpdateHeadNode",
29002963
service_token = self.update_head_node_lambda.function_arn,
29012964
properties = {
2902-
'ParallelClusterConfigHash': self.assets_hash.hexdigest(),
2965+
'ParallelClusterConfigHash': self.parallel_cluster_config_yaml_hash,
29032966
}
29042967
)
29052968
self.update_head_node.node.add_dependency(self.parallel_cluster)
@@ -2929,6 +2992,9 @@ def create_parallel_cluster_config(self):
29292992
CfnOutput(self, "ParallelClusterConfigYamlS3Url",
29302993
value = self.parallel_cluster_config_yaml_s3_url
29312994
)
2995+
CfnOutput(self, "ParallelClusterConfigHash",
2996+
value = self.parallel_cluster_config_yaml_hash
2997+
)
29322998
CfnOutput(self, "PlaybookS3Url",
29332999
value = self.playbooks_asset.s3_object_url
29343000
)

source/resources/lambdas/CreateParallelCluster/CreateParallelCluster.py

Lines changed: 15 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -76,49 +76,16 @@ def lambda_handler(event, context):
7676
else:
7777
raise KeyError(error_message)
7878

79-
s3_resource = boto3.resource('s3')
80-
81-
yaml_template_key = environ['ParallelClusterConfigYamlTemplateS3Key']
82-
yaml_template_s3_url = f"s3://{environ['ParallelClusterConfigS3Bucket']}/{yaml_template_key}"
83-
yaml_template_config_object = s3_resource.Object(
84-
bucket_name = environ['ParallelClusterConfigS3Bucket'],
85-
key = yaml_template_key
86-
)
87-
parallel_cluster_config_yaml_template = Template(yaml_template_config_object.get()['Body'].read().decode('utf-8'))
88-
89-
template_vars = {}
90-
for template_var in environ:
91-
template_vars[template_var] = environ[template_var]
92-
parallel_cluster_config_yaml = parallel_cluster_config_yaml_template.render(**template_vars)
93-
parallel_cluster_config = yaml.load(parallel_cluster_config_yaml, Loader=yaml.FullLoader)
94-
logger.info(f"HeadNode config:\n{json.dumps(parallel_cluster_config['HeadNode'], indent=4)}")
95-
96-
yaml_key = f"{environ['ParallelClusterConfigYamlS3Key']}"
97-
yaml_s3_url = f"s3://{environ['ParallelClusterConfigS3Bucket']}/{yaml_key}"
98-
yaml_config_object = s3_resource.Object(
99-
bucket_name = environ['ParallelClusterConfigS3Bucket'],
100-
key = yaml_key
101-
)
102-
if requestType == 'Delete':
103-
logging.info(f"Deleting Parallel Cluster yaml config in {yaml_s3_url}")
104-
try:
105-
yaml_config_object.delete()
106-
except:
107-
pass
108-
else:
109-
logging.info(f"Saving Parallel Cluster yaml config in {yaml_s3_url}")
110-
yaml_config_object.put(Body=yaml.dump(parallel_cluster_config, sort_keys=False))
111-
11279
cluster_name = environ['ClusterName']
11380
cluster_region = environ['Region']
114-
11581
logger.info(f"{requestType} request for {cluster_name} in {cluster_region}")
11682

11783
cluster_status = get_cluster_status(cluster_name, cluster_region)
11884
if cluster_status:
11985
valid_statuses = ['CREATE_COMPLETE', 'UPDATE_COMPLETE', 'UPDATE_ROLLBACK_COMPLETE']
12086
invalid_statuses = ['CREATE_IN_PROGRESS', 'UPDATE_IN_PROGRESS', 'DELETE_IN_PROGRESS']
12187
if cluster_status in invalid_statuses:
88+
logger.error(f"{cluster_name} has invalid status: {cluster_status}")
12289
cfnresponse.send(event, context, cfnresponse.FAILED, {'error': f"{cluster_name} in {cluster_status} state."}, physicalResourceId=cluster_name)
12390
return
12491
if requestType == 'Create':
@@ -135,6 +102,19 @@ def lambda_handler(event, context):
135102
else:
136103
logger.info(f"{cluster_name} doesn't exist.")
137104

105+
yaml_key = f"{environ['ParallelClusterConfigYamlS3Key']}"
106+
yaml_s3_url = f"s3://{environ['ParallelClusterConfigS3Bucket']}/{yaml_key}"
107+
108+
logger.info(f"Getting Parallel Cluster yaml config from {yaml_s3_url}")
109+
s3_client = boto3.client('s3')
110+
parallel_cluster_config_yaml = s3_client.get_object(
111+
Bucket = environ['ParallelClusterConfigS3Bucket'],
112+
Key = yaml_key
113+
)['Body'].read().decode('utf-8')
114+
115+
parallel_cluster_config = yaml.load(parallel_cluster_config_yaml, Loader=yaml.FullLoader)
116+
logger.info(f"HeadNode config:\n{json.dumps(parallel_cluster_config['HeadNode'], indent=4)}")
117+
138118
if requestType == "Create":
139119
logger.info(f"Creating {cluster_name}")
140120
try:
@@ -277,4 +257,4 @@ def lambda_handler(event, context):
277257
logger.info(f"Published error to {environ['ErrorSnsTopicArn']}")
278258
raise
279259

280-
cfnresponse.send(event, context, cfnresponse.SUCCESS, {'ConfigTemplateYamlS3Url': yaml_template_s3_url, 'ConfigYamlS3Url': yaml_s3_url}, physicalResourceId=cluster_name)
260+
cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physicalResourceId=cluster_name)
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
"""
2+
Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3+
SPDX-License-Identifier: MIT-0
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy of this
6+
software and associated documentation files (the "Software"), to deal in the Software
7+
without restriction, including without limitation the rights to use, copy, modify,
8+
merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
9+
permit persons to whom the Software is furnished to do so.
10+
11+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
12+
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
13+
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
14+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
15+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
16+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
17+
"""
18+
19+
'''
20+
Create/update/delete the ParallelCluster config file: render the yaml template stored in S3 and save the resulting yaml config to S3.
21+
'''
22+
import boto3
23+
import cfnresponse
24+
from hashlib import sha512
25+
from jinja2 import Template as Template
26+
import json
27+
import logging
28+
from os import environ as environ
29+
import pcluster.lib as pc
30+
from pcluster.api.errors import BadRequestException, UpdateClusterBadRequestException
31+
from time import sleep
32+
import yaml
33+
34+
logger=logging.getLogger(__file__)
35+
logger_formatter = logging.Formatter('%(levelname)s: %(message)s')
36+
logger_streamHandler = logging.StreamHandler()
37+
logger_streamHandler.setFormatter(logger_formatter)
38+
logger.addHandler(logger_streamHandler)
39+
logger.setLevel(logging.INFO)
40+
logger.propagate = False
41+
42+
def lambda_handler(event, context):
    """
    CloudFormation custom resource handler for the ParallelCluster config file.

    Create/Update: read the jinja2 yaml template from S3, render it with the
    lambda's environment variables, save the rendered yaml to S3, and return
    the S3 URLs plus the sha512 hash of the rendered yaml.

    Delete: remove both the template and the rendered yaml from S3 on a
    best-effort basis.

    Args:
        event: CloudFormation custom resource request. 'RequestType' and
            'ResourceProperties' are read from it.
        context: Lambda context object; only forwarded to cfnresponse.send.

    Returns:
        None. The outcome is reported to CloudFormation with cfnresponse.send.

    Raises:
        Exception: Any failure is reported to CloudFormation as FAILED,
            published to the error SNS topic when one is configured, and
            then re-raised.
    """
    # Resolved before the try block so that the error handler can always use it.
    # NOTE(review): this used to be None for every response, so the physical
    # resource id was whatever cfnresponse.send falls back to. CloudFormation
    # treats a changed physical id as a replacement, so confirm the effect of
    # the first update on existing stacks.
    cluster_name = environ.get('ClusterName')
    try:
        logger.info(f"event:\n{json.dumps(event, indent=4)}")
        requestType = event['RequestType']
        properties = event['ResourceProperties']
        required_properties = [
            'ParallelClusterConfigTemplateYamlHash'
        ]
        error_message = "".join(
            f"Missing {required_property} property. "
            for required_property in required_properties
            if required_property not in properties
        )
        if error_message:
            logger.info(error_message)
            if requestType == 'Delete':
                # Never block stack deletion because of missing properties.
                cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physicalResourceId=cluster_name)
                return
            raise KeyError(error_message)

        s3_client = boto3.client('s3')
        bucket = environ['ParallelClusterConfigS3Bucket']

        yaml_template_key = environ['ParallelClusterConfigYamlTemplateS3Key']
        yaml_template_s3_url = f"s3://{bucket}/{yaml_template_key}"

        yaml_key = environ['ParallelClusterConfigYamlS3Key']
        yaml_s3_url = f"s3://{bucket}/{yaml_key}"

        # Stays the hash of the empty string for Delete requests.
        parallel_cluster_config_hash = sha512()

        if requestType == 'Delete':
            logger.info(f"Deleting Parallel Cluster yaml config template in {yaml_template_s3_url}")
            _delete_s3_object_best_effort(s3_client, bucket, yaml_template_key)

            logger.info(f"Deleting Parallel Cluster yaml config in {yaml_s3_url}")
            _delete_s3_object_best_effort(s3_client, bucket, yaml_key)
        else:  # Create or Update
            template_text = s3_client.get_object(
                Bucket = bucket,
                Key = yaml_template_key
            )['Body'].read().decode('utf-8')
            parallel_cluster_config_yaml_template = Template(template_text)

            # Every environment variable is available to the template.
            template_vars = dict(environ)
            logger.info(f"template_vars:\n{json.dumps(_redact_credentials(template_vars), indent=4, sort_keys=True)}")
            parallel_cluster_config_yaml = parallel_cluster_config_yaml_template.render(**template_vars)

            parallel_cluster_config_hash.update(bytes(parallel_cluster_config_yaml, 'utf-8'))
            logger.info(f"Config hash: {parallel_cluster_config_hash.hexdigest()}")

            # Parsing validates that the rendered config is yaml with a HeadNode section.
            # NOTE(review): yaml.load with FullLoader is kept as-is; consider
            # yaml.safe_load if the template can never contain python tags.
            parallel_cluster_config = yaml.load(parallel_cluster_config_yaml, Loader=yaml.FullLoader)
            logger.info(f"HeadNode config:\n{json.dumps(parallel_cluster_config['HeadNode'], indent=4)}")

            logger.info(f"Saving Parallel Cluster yaml config in {yaml_s3_url}")
            s3_client.put_object(
                Bucket = bucket,
                Key = yaml_key,
                Body = parallel_cluster_config_yaml
            )

    except Exception as e:
        logger.exception(str(e))
        cfnresponse.send(event, context, cfnresponse.FAILED, {'error': str(e)}, physicalResourceId=cluster_name)
        _publish_error(cluster_name, e)
        raise

    cfnresponse.send(event, context, cfnresponse.SUCCESS, {'ConfigTemplateYamlS3Url': yaml_template_s3_url, 'ConfigYamlS3Url': yaml_s3_url, 'ConfigYamlHash': parallel_cluster_config_hash.hexdigest()}, physicalResourceId=cluster_name)


def _delete_s3_object_best_effort(s3_client, bucket, key):
    """Delete s3://bucket/key, logging instead of raising if the delete fails."""
    try:
        s3_client.delete_object(
            Bucket = bucket,
            Key = key
        )
    except Exception:
        # Deletion is deliberately best-effort so that stack deletion is not blocked.
        logger.warning(f"Could not delete s3://{bucket}/{key}", exc_info=True)


def _redact_credentials(variables):
    """Return a copy of variables that is safe to log, with AWS credential values masked."""
    credential_names = ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_SESSION_TOKEN')
    return {
        name: ('****' if name in credential_names else value)
        for name, value in variables.items()
    }


def _publish_error(cluster_name, error):
    """Publish error to the ErrorSnsTopicArn topic if one is configured; never raises."""
    topic_arn = environ.get('ErrorSnsTopicArn')
    if not topic_arn:
        # The variable is empty when no error topic was configured for the stack.
        logger.info("ErrorSnsTopicArn is not set. Not publishing error to SNS.")
        return
    try:
        boto3.client('sns').publish(
            TopicArn = topic_arn,
            Subject = f"{cluster_name} CreateParallelClusterConfig failed",
            Message = str(error)
        )
    except Exception:
        # A notification failure must not hide the original error from the caller.
        logger.exception(f"Failed to publish error to {topic_arn}")
        return
    logger.info(f"Published error to {topic_arn}")
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../cfnresponse.py

0 commit comments

Comments
 (0)