3535logger .setLevel (logging .INFO )
3636
3737# MIN_PARALLEL_CLUSTER_VERSION
38+ # 3.2.0:
39+ # * Add support for memory-based job scheduling in Slurm
3840# 3.3.0:
3941# * Add support for multiple instance types in a compute resource
42+ # * Add new configuration section Scheduling/SlurmSettings/Database to enable accounting functionality in Slurm.
4043# 3.4.0:
4144# * Add support for launching nodes across multiple availability zones to increase capacity availability
4245# * Add support for specifying multiple subnets for each queue to increase capacity availability
5356# 3.7.0:
5457# * Login Nodes
5558# * Add support for configurable node weights within queue
59+ # * Allow memory-based scheduling when multiple instance types are specified for a Slurm Compute Resource.
5660# 3.7.1:
5761# * Fix pmix CVE
5862# * Use Slurm 23.02.5
8488}
# Slurm release, in dotted form, keyed by ParallelCluster version.
PARALLEL_CLUSTER_SLURM_VERSIONS = {
    '3.6.0': '23.02.2',  # confirmed
    '3.6.1': '23.02.2',  # confirmed
    '3.7.0': '23.02.4',  # confirmed
    '3.7.1': '23.02.5',  # confirmed
    '3.7.2': '23.02.6',  # confirmed
}

# The same Slurm releases in the hyphenated form (with a trailing release number)
# that is used as the key of SLURM_REST_API_VERSIONS.
# This can be found on the head node at /etc/chef/local-mode-cache/cache/
PARALLEL_CLUSTER_PC_SLURM_VERSIONS = {
    '3.6.0': '23-02-2-1',  # confirmed
    '3.6.1': '23-02-2-1',  # confirmed
    '3.7.0': '23-02-4-1',  # confirmed
    '3.7.1': '23-02-5-1',  # confirmed
    '3.7.2': '23-02-6-1',  # confirmed
}
93105SLURM_REST_API_VERSIONS = {
94106 '23-02-2-1' : '0.0.39' ,
@@ -110,23 +122,45 @@ def get_parallel_cluster_version(config):
110122
def get_PARALLEL_CLUSTER_MUNGE_VERSION(config):
    """Return the munge version for the ParallelCluster version selected by config.

    The ParallelCluster version is resolved with get_parallel_cluster_version()
    and used as the key into PARALLEL_CLUSTER_MUNGE_VERSIONS. A version without
    an entry in that table propagates the lookup error to the caller.
    """
    pc_version = get_parallel_cluster_version(config)
    return PARALLEL_CLUSTER_MUNGE_VERSIONS[pc_version]
115126
def get_PARALLEL_CLUSTER_PYTHON_VERSION(config):
    """Return the Python version for the ParallelCluster version selected by config.

    The ParallelCluster version is resolved with get_parallel_cluster_version()
    and used as the key into PARALLEL_CLUSTER_PYTHON_VERSIONS. A version without
    an entry in that table propagates the lookup error to the caller.
    """
    pc_version = get_parallel_cluster_version(config)
    return PARALLEL_CLUSTER_PYTHON_VERSIONS[pc_version]
120130
def get_SLURM_VERSION(config):
    """Return the dotted Slurm version (e.g. '23.02.6') for the configured ParallelCluster version.

    The ParallelCluster version is resolved with get_parallel_cluster_version()
    and used as the key into PARALLEL_CLUSTER_SLURM_VERSIONS. A version without
    an entry in that table raises KeyError.
    """
    pc_version = get_parallel_cluster_version(config)
    return PARALLEL_CLUSTER_SLURM_VERSIONS[pc_version]
134+
def get_PC_SLURM_VERSION(config):
    """Return the hyphenated Slurm version (e.g. '23-02-6-1') for the configured ParallelCluster version.

    The ParallelCluster version is resolved with get_parallel_cluster_version()
    and used as the key into PARALLEL_CLUSTER_PC_SLURM_VERSIONS. A version
    without an entry in that table raises KeyError.
    """
    pc_version = get_parallel_cluster_version(config)
    return PARALLEL_CLUSTER_PC_SLURM_VERSIONS[pc_version]
125138
def get_slurm_rest_api_version(config):
    """Return the Slurm REST API version for the configured ParallelCluster version.

    The hyphenated Slurm version from get_PC_SLURM_VERSION() is the key into
    SLURM_REST_API_VERSIONS.

    Returns None when the Slurm version has no entry in SLURM_REST_API_VERSIONS.
    """
    pc_slurm_version = get_PC_SLURM_VERSION(config)
    # .get() without a default: an unmapped Slurm version gives None, not an exception.
    return SLURM_REST_API_VERSIONS.get(pc_slurm_version)
142+
143+ # Feature support
144+
145+ # Version 3.7.0:
# First ParallelCluster version with login node support.
PARALLEL_CLUSTER_SUPPORTS_LOGIN_NODES_VERSION = parse_version('3.7.0')

def PARALLEL_CLUSTER_SUPPORTS_LOGIN_NODES(parallel_cluster_version):
    """Return True if parallel_cluster_version is at least the version that added login nodes.

    parallel_cluster_version is compared directly against a parse_version() result,
    so it is presumably expected to be a parsed version as well -- verify against callers.
    """
    minimum_version = PARALLEL_CLUSTER_SUPPORTS_LOGIN_NODES_VERSION
    return parallel_cluster_version >= minimum_version
149+
# First ParallelCluster version treated as supporting multiple compute resources per queue.
PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_COMPUTE_RESOURCES_PER_QUEUE_VERSION = parse_version('3.7.0')

def PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_COMPUTE_RESOURCES_PER_QUEUE(parallel_cluster_version):
    """Return True if parallel_cluster_version is at least the multiple-compute-resources threshold.

    parallel_cluster_version is compared directly against a parse_version() result,
    so it is presumably expected to be a parsed version as well -- verify against callers.
    """
    minimum_version = PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_COMPUTE_RESOURCES_PER_QUEUE_VERSION
    return parallel_cluster_version >= minimum_version
153+
# First ParallelCluster version treated as supporting multiple instance types per compute resource.
PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_INSTANCE_TYPES_PER_COMPUTE_RESOURCE_VERSION = parse_version('3.7.0')

def PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_INSTANCE_TYPES_PER_COMPUTE_RESOURCE(parallel_cluster_version):
    """Return True if parallel_cluster_version is at least the multiple-instance-types threshold.

    parallel_cluster_version is compared directly against a parse_version() result,
    so it is presumably expected to be a parsed version as well -- verify against callers.
    """
    minimum_version = PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_INSTANCE_TYPES_PER_COMPUTE_RESOURCE_VERSION
    return parallel_cluster_version >= minimum_version
157+
158+ # Unsupported
def PARALLEL_CLUSTER_SUPPORTS_CUSTOM_MUNGE_KEY(parallel_cluster_version):
    """Report whether a custom munge key is supported: always False.

    parallel_cluster_version is accepted so the signature matches the other
    feature checks, but it is not consulted.
    """
    return False
161+
def PARALLEL_CLUSTER_SUPPORTS_HOME_MOUNT(parallel_cluster_version):
    """Report whether the home mount feature is supported: always False.

    parallel_cluster_version is accepted so the signature matches the other
    feature checks, but it is not consulted.
    """
    return False
130164
131165# Determine all AWS regions available on the account.
132166default_region = environ .get ("AWS_DEFAULT_REGION" , "us-east-1" )
@@ -147,48 +181,88 @@ def get_slurm_rest_api_version(config):
147181 'AFTER_90_DAYS'
148182 ]
149183
# Instance families used by default for EDA workloads.
# The *7i instance types are excluded by default because they have 50% of the cores
# of the *7z instances with the same memory.
# NOTE(review): "*7z" matches no family listed here; possibly "*7a" was meant -- confirm.
# Commented-out entries are families that are deliberately left out of the defaults.
default_eda_instance_families = [
    'c7a',        # AMD EPYC 9R14 Processor 3.7 GHz
    'c7g',        # AWS Graviton3 Processor 2.6 GHz
    # 'c7gd',     # AWS Graviton3 Processor 2.6 GHz
    # 'c7gn',     # AWS Graviton3 Processor 2.6 GHz
    # 'c7i',      # Intel Xeon Scalable (Sapphire Rapids) 3.2 GHz
    # 'f1',       # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz
    'm5zn',       # Intel Xeon Platinum 8252 4.5 GHz
    'm7a',        # AMD EPYC 9R14 Processor 3.7 GHz
    # 'm7i',      # Intel Xeon Scalable (Sapphire Rapids) 3.2 GHz
    'm7g',        # AWS Graviton3 Processor 2.6 GHz
    # 'm7gd',     # AWS Graviton3 Processor 2.6 GHz
    'r7a',        # AMD EPYC 9R14 Processor 3.7 GHz
    'r7g',        # AWS Graviton3 Processor 2.6 GHz
    # 'r7gd',     # AWS Graviton3 Processor 2.6 GHz
    # 'r7i',      # Intel Xeon Scalable (Sapphire Rapids) 3.2 GHz
    'r7iz',       # Intel Xeon Scalable (Sapphire Rapids) 3.2 GHz
    'x2gd',       # AWS Graviton2 Processor 2.5 GHz 1TB
    'x2idn',      # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB
    'x2iedn',     # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB
    'x2iezn',     # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB
    # 'u-6tb1',   # Intel Xeon Scalable (Skylake) 6 TB
    # 'u-9tb1',   # Intel Xeon Scalable (Skylake) 9 TB
    # 'u-12tb1',  # Intel Xeon Scalable (Skylake) 12 TB
]
226+
# Older instance families, kept in a list of their own, separate from
# default_eda_instance_families.
old_eda_instance_families = [
    # Compute optimized
    'c5',      # Mixed depending on size
    'c5a',     # AMD EPYC 7R32 3.3 GHz
    'c5ad',    # AMD EPYC 7R32 3.3 GHz
    'c6a',
    'c6ad',
    'c6i',     # Intel Xeon 8375C (Ice Lake) 3.5 GHz
    'c6id',
    'c6g',     # AWS Graviton2 Processor 2.5 GHz
    'c6gd',    # AWS Graviton2 Processor 2.5 GHz
    # FPGA
    'f1',      # Intel Xeon E5-2686 v4 (Broadwell) 2.3 GHz
    # General purpose
    'm5',      # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz
    'm5d',     # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz
    'm5a',     # AMD EPYC 7571 2.5 GHz
    'm5ad',    # AMD EPYC 7571 2.5 GHz
    'm5zn',    # Intel Xeon Platinum 8252 4.5 GHz
    'm6a',     # AMD EPYC 7R13 Processor 3.6 GHz
    'm6ad',
    'm6i',     # Intel Xeon 8375C (Ice Lake) 3.5 GHz
    'm6id',
    'm6g',     # AWS Graviton2 Processor 2.5 GHz
    'm6gd',    # AWS Graviton2 Processor 2.5 GHz
    # Memory optimized
    'r5',      # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz
    'r5d',     # Intel Xeon Platinum 8175 (Skylake) 3.1 GHz
    'r5b',     # Intel Xeon Platinum 8259 (Cascade Lake) 3.1 GHz
    'r5a',     # AMD EPYC 7571 2.5 GHz
    'r5ad',    # AMD EPYC 7571 2.5 GHz
    'r6a',
    'r6i',     # Intel Xeon 8375C (Ice Lake) 3.5 GHz 1TB
    'r6id',
    'r6g',     # AWS Graviton2 Processor 2.5 GHz
    'r6gd',    # AWS Graviton2 Processor 2.5 GHz
    # High memory
    'x1',      # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 2TB
    'x1e',     # High Frequency Intel Xeon E7-8880 v3 (Haswell) 2.3 GHz 4TB
    'x2gd',    # AWS Graviton2 Processor 2.5 GHz 1TB
    'x2idn',   # Intel Xeon Scalable (Icelake) 3.5 GHz 2 TB
    'x2iedn',  # Intel Xeon Scalable (Icelake) 3.5 GHz 4 TB
    'x2iezn',  # Intel Xeon Platinum 8252 4.5 GHz 1.5 TB
    'z1d',     # Intel Xeon Platinum 8151 4.0 GHz
]
193267
194268default_eda_instance_types = [
@@ -219,7 +293,7 @@ def get_slurm_rest_api_version(config):
219293
# Regular expressions for instance types that are excluded by default.
# Raw strings are used so that `\.` reaches the regex engine as an escaped dot
# without relying on Python passing an unrecognized string escape through.
default_excluded_instance_types = [
    r'.+\.(micro|nano)',  # Not enough memory
    # Bare metal sizes, including suffixed names such as "metal-24xl".
    # The pattern must not end in whitespace: instance type names never contain
    # a space, so a trailing blank would keep it from ever matching.
    r'.*\.metal.*',
]
224298
225299architectures = [
@@ -259,6 +333,7 @@ def get_config_schema(config):
259333 # Optional, but highly recommended
260334 Optional ('ErrorSnsTopicArn' ): str ,
261335 Optional ('TimeZone' , default = 'US/Central' ): str ,
336+ Optional ('RESEnvironmentName' ): str ,
262337 'slurm' : {
263338 Optional ('ParallelClusterConfig' ): {
264339 Optional ('Enable' , default = True ): And (bool , lambda s : s == True ),
@@ -328,11 +403,13 @@ def get_config_schema(config):
328403 # Default to StackName-cl
329404 Optional ('ClusterName' ): And (str , lambda s : s != config ['StackName' ]),
330405 #
331- # MungeKeySsmParameter:
332- # SSM String Parameter with a base64 encoded munge key to use for the cluster.
406+ # MungeKeySecret:
407+ # AWS secret with a base64 encoded munge key to use for the cluster.
408+ # For an existing secret can be the secret name or the ARN.
409+ # If the secret doesn't exist one will be created, but won't be part of the cloudformation stack
410+ # so that it won't be deleted when the stack is deleted.
333411 # Required if your submitters need to use more than 1 cluster.
334- # Will be created if it doesn't exist to save the value in Parameter Store.
335- Optional ('MungeKeySsmParameter' , default = '/slurm/munge_key' ): str ,
412+ Optional ('MungeKeySecret' ): str ,
336413 #
337414 # SlurmCtl:
338415 # Required, but can be an empty dict to accept all of the defaults
0 commit comments