
Commit 3717730

Compile Slurm for the OS of the submitter (#182)
So that it can submit jobs to a cluster that uses a different OS.

- Automate the configuration of the cluster manager and submitters in RES.
- Fix errors in configuration scripts.
- Add a parameter to the ParallelCluster custom resource that holds a hash of all of the config file contents, so that the resource gets updated whenever anything changes. The current issue with this implementation is that even though the custom resource is updated, the cluster doesn't change because its configuration is unchanged.
- Fix a bug in configuring licenses: after initial configuration, the playbook was failing on subsequent updates.
- Support a custom munge key, which must be specified in Secrets Manager.
- Try to allow the RES /home directory to be mounted; this currently causes a validation error.
- Update EDA instance types to the latest generations and ensure that all memory sizes get built.
- Restrict Lambda function IAM roles and clean up cfn_nag errors and warnings.

Resolves #177
Resolves #178
Resolves #180
Resolves #181
1 parent: 82243b3
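A minimal sketch of the config-hash idea from the commit message, assuming a hypothetical helper named config_files_hash; the commit's actual property name and hashing scheme may differ:

import hashlib
from pathlib import Path

def config_files_hash(config_dir: str) -> str:
    """Hash every config file (relative names and contents, in sorted order) into one digest."""
    digest = hashlib.sha256()
    for path in sorted(Path(config_dir).rglob('*')):
        if path.is_file():
            digest.update(str(path.relative_to(config_dir)).encode())
            digest.update(path.read_bytes())
    return digest.hexdigest()

# Passing the digest as a property of the ParallelCluster custom resource, e.g.
# properties={'ConfigHash': config_files_hash('config/')} (illustrative name),
# changes a resource property whenever any config file changes, which is what
# forces CloudFormation to call the resource's update handler.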

File tree: 43 files changed, +2447 −331 lines

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -4,3 +4,4 @@ _site
 site/
 .vscode/
 source/resources/parallel-cluster/config/build-files/*/*/parallelcluster-*.yml
+security_scan/cfn_nag.log

Makefile

Lines changed: 3 additions & 0 deletions

@@ -16,6 +16,9 @@ local-docs: .mkdocs_venv/bin/activate
 github-docs: .mkdocs_venv/bin/activate
 	source .mkdocs_venv/bin/activate; mkdocs gh-deploy --strict
 
+security_scan:
+	security_scan/security_scan.sh
+
 test:
 	pytest -x -v tests

security_scan/security_scan.sh

Lines changed: 2 additions & 2 deletions

@@ -5,9 +5,9 @@
 scriptdir=$(dirname $(readlink -f $0))
 
 cd $scriptdir/..
-./install.sh --config-file ~/slurm/slurm_eda_az1.yml --cdk-cmd synth
+./install.sh --config-file ~/slurm/res-eda/res-eda-pc-3-7-2-centos7-x86-config.yml --cdk-cmd synth
 
-cfn_nag_scan --input-path $scriptdir/../source/cdk.out/slurmedaaz1.template.json --deny-list-path $scriptdir/cfn_nag-deny-list.yml --fail-on-warnings &> $scriptdir/cfn_nag.log
+cfn_nag_scan --input-path $scriptdir/../source/cdk.out/res-eda-pc-3-7-2-centos7-x86-config.template.json --deny-list-path $scriptdir/cfn_nag-deny-list.yml --fail-on-warnings &> $scriptdir/cfn_nag.log
 
 cd $scriptdir
 if [ ! -e $scriptdir/bandit-env ]; then

source/cdk/cdk_slurm_stack.py

Lines changed: 708 additions & 104 deletions
Large diffs are not rendered by default.

source/cdk/config_schema.py

Lines changed: 113 additions & 36 deletions

@@ -35,8 +35,11 @@
 logger.setLevel(logging.INFO)
 
 # MIN_PARALLEL_CLUSTER_VERSION
+# 3.2.0:
+#     * Add support for memory-based job scheduling in Slurm
 # 3.3.0:
 #     * Add support for multiple instance types in a compute resource
+#     * Add new configuration section Scheduling/SlurmSettings/Database to enable accounting functionality in Slurm.
 # 3.4.0:
 #     * Add support for launching nodes across multiple availability zones to increase capacity availability
 #     * Add support for specifying multiple subnets for each queue to increase capacity availability
@@ -53,6 +56,7 @@
 # 3.7.0:
 #     * Login Nodes
 #     * Add support for configurable node weights within queue
+#     * Allow memory-based scheduling when multiple instance types are specified for a Slurm Compute Resource.
 # 3.7.1:
 #     * Fix pmix CVE
 #     * Use Slurm 23.02.5
@@ -84,11 +88,19 @@
 }
 PARALLEL_CLUSTER_SLURM_VERSIONS = {
     # This can be found on the head node at /etc/chef/local-mode-cache/cache/
-    '3.6.0': '23-02-2-1', # confirmed
-    '3.6.1': '23-02-2-1', # confirmed
-    '3.7.0': '23-02-4-1', # confirmed
-    '3.7.1': '23-02-5-1', # confirmed
-    '3.7.2': '23-02-6-1', # confirmed
+    '3.6.0': '23.02.2', # confirmed
+    '3.6.1': '23.02.2', # confirmed
+    '3.7.0': '23.02.4', # confirmed
+    '3.7.1': '23.02.5', # confirmed
+    '3.7.2': '23.02.6', # confirmed
+}
+PARALLEL_CLUSTER_PC_SLURM_VERSIONS = {
+    # This can be found on the head node at /etc/chef/local-mode-cache/cache/
+    '3.6.0': '23-02-2-1', # confirmed
+    '3.6.1': '23-02-2-1', # confirmed
+    '3.7.0': '23-02-4-1', # confirmed
+    '3.7.1': '23-02-5-1', # confirmed
+    '3.7.2': '23-02-6-1', # confirmed
 }
 SLURM_REST_API_VERSIONS = {
     '23-02-2-1': '0.0.39',
@@ -110,23 +122,45 @@ def get_parallel_cluster_version(config):
 
 def get_PARALLEL_CLUSTER_MUNGE_VERSION(config):
     parallel_cluster_version = get_parallel_cluster_version(config)
-    munge_version = PARALLEL_CLUSTER_MUNGE_VERSIONS[parallel_cluster_version]
-    return munge_version
+    return PARALLEL_CLUSTER_MUNGE_VERSIONS[parallel_cluster_version]
 
 def get_PARALLEL_CLUSTER_PYTHON_VERSION(config):
     parallel_cluster_version = get_parallel_cluster_version(config)
-    python_version = PARALLEL_CLUSTER_PYTHON_VERSIONS[parallel_cluster_version]
-    return python_version
+    return PARALLEL_CLUSTER_PYTHON_VERSIONS[parallel_cluster_version]
 
 def get_SLURM_VERSION(config):
     parallel_cluster_version = get_parallel_cluster_version(config)
-    slurm_version = PARALLEL_CLUSTER_SLURM_VERSIONS[parallel_cluster_version]
-    return slurm_version
+    return PARALLEL_CLUSTER_SLURM_VERSIONS[parallel_cluster_version]
+
+def get_PC_SLURM_VERSION(config):
+    parallel_cluster_version = get_parallel_cluster_version(config)
+    return PARALLEL_CLUSTER_PC_SLURM_VERSIONS[parallel_cluster_version]
 
 def get_slurm_rest_api_version(config):
-    slurm_version = get_SLURM_VERSION(config)
-    slurm_rest_api_version = SLURM_REST_API_VERSIONS.get(slurm_version, )
-    return slurm_rest_api_version
+    slurm_version = get_PC_SLURM_VERSION(config)
+    return SLURM_REST_API_VERSIONS.get(slurm_version, )
+
+# Feature support
+
+# Version 3.7.0:
+PARALLEL_CLUSTER_SUPPORTS_LOGIN_NODES_VERSION = parse_version('3.7.0')
+def PARALLEL_CLUSTER_SUPPORTS_LOGIN_NODES(parallel_cluster_version):
+    return parallel_cluster_version >= PARALLEL_CLUSTER_SUPPORTS_LOGIN_NODES_VERSION
+
+PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_COMPUTE_RESOURCES_PER_QUEUE_VERSION = parse_version('3.7.0')
+def PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_COMPUTE_RESOURCES_PER_QUEUE(parallel_cluster_version):
+    return parallel_cluster_version >= PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_COMPUTE_RESOURCES_PER_QUEUE_VERSION
+
+PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_INSTANCE_TYPES_PER_COMPUTE_RESOURCE_VERSION = parse_version('3.7.0')
+def PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_INSTANCE_TYPES_PER_COMPUTE_RESOURCE(parallel_cluster_version):
+    return parallel_cluster_version >= PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_INSTANCE_TYPES_PER_COMPUTE_RESOURCE_VERSION
+
+# Unsupported
+def PARALLEL_CLUSTER_SUPPORTS_CUSTOM_MUNGE_KEY(parallel_cluster_version):
+    return False
+
+def PARALLEL_CLUSTER_SUPPORTS_HOME_MOUNT(parallel_cluster_version):
+    return False
 
 # Determine all AWS regions available on the account.
 default_region = environ.get("AWS_DEFAULT_REGION", "us-east-1")
@@ -147,48 +181,88 @@ def get_slurm_rest_api_version(config):
     'AFTER_90_DAYS'
 ]
 
+# By default I've chosen to exclude *7i instance types because they have 50% of the cores as *7z instances with the same memory.
 default_eda_instance_families = [
-    #'c5',   # Mixed depending on size
-    #'c5a',  # AMD EPYC 7R32 3.3 GHz
-    #'c5ad', # AMD EPYC 7R32 3.3 GHz
+    'c7a',    # AMD EPYC 9R14 Processor 3.7 GHz
+
+    'c7g',    # AWS Graviton3 Processor 2.6 GHz
+    # 'c7gd', # AWS Graviton3 Processor 2.6 GHz
+    # 'c7gn', # AWS Graviton3 Processor 2.6 GHz
+
+    # 'c7i',  # Intel Xeon Scalable (Sapphire Rapids) 3.2 GHz
+
+    #'f1',    # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz
+
+    'm5zn',   # Intel Xeon Platinum 8252 4.5 GHz
+
+    'm7a',    # AMD EPYC 9R14 Processor 3.7 GHz
+
+    # 'm7i',  # Intel Xeon Scalable (Sapphire Rapids) 3.2 GHz
+
+    'm7g',    # AWS Graviton3 Processor 2.6 GHz
+    # 'm7gd', # AWS Graviton3 Processor 2.6 GHz
+
+    'r7a',    # AMD EPYC 9R14 Processor 3.7 GHz
+
+    'r7g',    # AWS Graviton3 Processor 2.6 GHz
+    # 'r7gd', # AWS Graviton3 Processor 2.6 GHz
+
+    # 'r7i',  # Intel Xeon Scalable (Sapphire Rapids) 3.2 GHz
+
+    'r7iz',   # Intel Xeon Scalable (Sapphire Rapids) 3.2 GHz
+
+    'x2gd',   # AWS Graviton2 Processor 2.5 GHz 1TB
+
+    'x2idn',  # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB
+
+    'x2iedn', # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB
+
+    'x2iezn', # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB
+
+    #'u-6tb1',  # Intel Xeon Scalable (Skylake) 6 TB
+    #'u-9tb1',  # Intel Xeon Scalable (Skylake) 9 TB
+    #'u-12tb1', # Intel Xeon Scalable (Skylake) 12 TB
+]
+
+old_eda_instance_families = [
+    'c5',    # Mixed depending on size
+    'c5a',   # AMD EPYC 7R32 3.3 GHz
+    'c5ad',  # AMD EPYC 7R32 3.3 GHz
     'c6a',
     'c6ad',
     'c6i',   # Intel Xeon 8375C (Ice Lake) 3.5 GHz
     'c6id',
     'c6g',   # AWS Graviton2 Processor 2.5 GHz
-    #'c6gd', # AWS Graviton2 Processor 2.5 GHz
-    #'f1',   # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz
-    #'m5',   # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz
-    #'m5d',  # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz
-    #'m5a',  # AMD EPYC 7571 2.5 GHz
-    #'m5ad', # AMD EPYC 7571 2.5 GHz
+    'c6gd',  # AWS Graviton2 Processor 2.5 GHz
+    'f1',    # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz
+    'm5',    # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz
+    'm5d',   # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz
+    'm5a',   # AMD EPYC 7571 2.5 GHz
+    'm5ad',  # AMD EPYC 7571 2.5 GHz
     'm5zn',  # Intel Xeon Platinum 8252 4.5 GHz
     'm6a',   # AMD EPYC 7R13 Processor 3.6 GHz
     'm6ad',
     'm6i',   # Intel Xeon 8375C (Ice Lake) 3.5 GHz
     'm6id',
     'm6g',   # AWS Graviton2 Processor 2.5 GHz
-    #'m6gd', # AWS Graviton2 Processor 2.5 GHz
+    'm6gd',  # AWS Graviton2 Processor 2.5 GHz
     'r5',    # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz
     'r5d',   # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz
-    #'r5b',  # Intel Xeon Platinum 8259 (Cascade Lake) 3.1 GHz
+    'r5b',   # Intel Xeon Platinum 8259 (Cascade Lake) 3.1 GHz
     'r5a',   # AMD EPYC 7571 2.5 GHz
     'r5ad',  # AMD EPYC 7571 2.5 GHz
     'r6a',
     'r6i',   # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB
     'r6id',
     'r6g',   # AWS Graviton2 Processor 2.5 GHz
-    #'r6gd', # AWS Graviton2 Processor 2.5 GHz
-    #'x1',   # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 2TB
-    #'x1e',  # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 4TB
+    'r6gd',  # AWS Graviton2 Processor 2.5 GHz
+    'x1',    # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 2TB
+    'x1e',   # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 4TB
     'x2gd',  # AWS Graviton2 Processor 2.5 GHz 1TB
     'x2idn', # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB
     'x2iedn', # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB
     'x2iezn', # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB
     'z1d',   # Intel Xeon Platinum 8151 4.0 GHz
-    #'u-6tb1', # Intel Xeon Scalable (Skylake) 6 TB
-    #'u-9tb1', # Intel Xeon Scalable (Skylake) 9 TB
-    #'u-12tb1', # Intel Xeon Scalable (Skylake) 12 TB
 ]
 
 default_eda_instance_types = [
@@ -219,7 +293,7 @@ def get_slurm_rest_api_version(config):
 
 default_excluded_instance_types = [
     '.+\.(micro|nano)', # Not enough memory
-    '.*\.metal'
+    '.*\.metal.*'
 ]
 
 architectures = [
@@ -259,6 +333,7 @@ def get_config_schema(config):
     # Optional, but highly recommended
     Optional('ErrorSnsTopicArn'): str,
     Optional('TimeZone', default='US/Central'): str,
+    Optional('RESEnvironmentName'): str,
     'slurm': {
         Optional('ParallelClusterConfig'): {
             Optional('Enable', default=True): And(bool, lambda s: s == True),
@@ -328,11 +403,13 @@ def get_config_schema(config):
             # Default to StackName-cl
             Optional('ClusterName'): And(str, lambda s: s != config['StackName']),
             #
-            # MungeKeySsmParameter:
-            #     SSM String Parameter with a base64 encoded munge key to use for the cluster.
+            # MungeKeySecret:
+            #     AWS secret with a base64 encoded munge key to use for the cluster.
+            #     For an existing secret can be the secret name or the ARN.
+            #     If the secret doesn't exist one will be created, but won't be part of the cloudformation stack
+            #     so that it won't be deleted when the stack is deleted.
             #     Required if your submitters need to use more than 1 cluster.
-            #     Will be created if it doesn't exist to save the value in Parameter Store.
-            Optional('MungeKeySsmParameter', default='/slurm/munge_key'): str,
+            Optional('MungeKeySecret'): str,
             #
             # SlurmCtl:
             #     Required, but can be an empty dict to accept all of the defaults
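The MungeKeySecret behavior described in the schema comments (use an existing secret by name or ARN, otherwise create one outside the CloudFormation stack) could look roughly like this sketch; get_or_create_munge_key_secret is an illustrative name, not the commit's actual helper:

import base64
import os

import boto3

def get_or_create_munge_key_secret(secret_id: str, region: str) -> str:
    """Return the base64-encoded munge key from Secrets Manager, creating it if missing."""
    sm_client = boto3.client('secretsmanager', region_name=region)
    try:
        return sm_client.get_secret_value(SecretId=secret_id)['SecretString']
    except sm_client.exceptions.ResourceNotFoundException:
        # 1024 random bytes, the size the classic create-munge-key script uses.
        munge_key = base64.b64encode(os.urandom(1024)).decode()
        # Created outside of CloudFormation on purpose, so deleting the stack
        # doesn't delete a key that other clusters' submitters may share.
        sm_client.create_secret(Name=secret_id, SecretString=munge_key)
        return munge_key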
Lines changed: 84 additions & 0 deletions

@@ -0,0 +1,84 @@
+"""
+Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+SPDX-License-Identifier: MIT-0
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this
+software and associated documentation files (the "Software"), to deal in the Software
+without restriction, including without limitation the rights to use, copy, modify,
+merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+'''
+Call /opt/slurm/{{ClusterName}}/config/bin/create_users_groups_json_configure.sh using ssm run command.
+'''
+import boto3
+import json
+import logging
+from os import environ as environ
+
+logger=logging.getLogger(__file__)
+logger_formatter = logging.Formatter('%(levelname)s: %(message)s')
+logger_streamHandler = logging.StreamHandler()
+logger_streamHandler.setFormatter(logger_formatter)
+logger.addHandler(logger_streamHandler)
+logger.setLevel(logging.INFO)
+logger.propagate = False
+
+def lambda_handler(event, context):
+    try:
+        logger.info(f"event:\n{json.dumps(event, indent=4)}")
+
+        cluster_name = environ['ClusterName']
+        cluster_region = environ['Region']
+        environment_name = environ['RESEnvironmentName']
+        logger.info(f"Update RES cluster={environment_name} manager for {cluster_name} in {cluster_region}")
+
+        ec2_client = boto3.client('ec2', region_name=cluster_region)
+
+        cluster_manager_info = ec2_client.describe_instances(
+            Filters = [
+                {'Name': 'tag:res:EnvironmentName', 'Values': [environment_name]},
+                {'Name': 'tag:res:ModuleId', 'Values': ['cluster-manager']}
+            ]
+        )['Reservations'][0]['Instances'][0]
+        cluster_manager_instance_id = cluster_manager_info['InstanceId']
+        logger.info(f"cluster manager instance id: {cluster_manager_instance_id}")
+
+        ssm_client = boto3.client('ssm', region_name=cluster_region)
+        commands = f"""
+set -ex
+
+if ! [ -e /opt/slurm/{cluster_name} ]; then
+    sudo mkdir -p /opt/slurm/{cluster_name}
+fi
+if ! mountpoint /opt/slurm/{cluster_name} ; then
+    sudo mount head_node.{cluster_name}.pcluster:/opt/slurm /opt/slurm/{cluster_name} || true
+fi
+
+script="/opt/slurm/{cluster_name}/config/bin/create_users_groups_json_configure.sh"
+if ! [ -e $script ]; then
+    echo "$script doesn't exist"
+    exit 1
+fi
+
+sudo $script
+"""
+        response = ssm_client.send_command(
+            DocumentName = 'AWS-RunShellScript',
+            InstanceIds = [cluster_manager_instance_id],
+            Parameters = {'commands': [commands]},
+            Comment = f"Configure {environment_name} cluster manager for {cluster_name}"
+        )
+        logger.info(f"Sent SSM command {response['Command']['CommandId']}")
+
+    except Exception as e:
+        logger.exception(str(e))
+        raise
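Note that the handler above sends the SSM command and returns without waiting for it to finish. A possible follow-up (an assumption on my part, not part of this commit) would be to poll the command so configuration failures surface in the Lambda logs:

import time

import boto3

def wait_for_command(command_id: str, instance_id: str, region: str, timeout: int = 300) -> str:
    """Poll an SSM command until it leaves the in-progress states or the timeout expires."""
    ssm_client = boto3.client('ssm', region_name=region)
    deadline = time.time() + timeout
    while time.time() < deadline:
        invocation = ssm_client.get_command_invocation(CommandId=command_id, InstanceId=instance_id)
        if invocation['Status'] not in ('Pending', 'InProgress', 'Delayed'):
            return invocation['Status']  # e.g. 'Success' or 'Failed'
        time.sleep(5)
    raise TimeoutError(f"SSM command {command_id} did not finish within {timeout}s")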
