Skip to content

Commit 82243b3

Browse files
authored
Change default ParallelCluster version to 3.7.2 (#176)
* Fix bug in create_slurm_accounts.py
* Add support for ParallelCluster 3.7.2
  Clean up schema.
  Clean up unused playbook variables.
  Fix a bug in create_slurm_accounts.py.

Resolves #172
1 parent 13bc31e commit 82243b3

File tree

5 files changed

+99
-81
lines changed

5 files changed

+99
-81
lines changed

source/cdk/cdk_slurm_stack.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,7 @@
5151
import base64
5252
import boto3
5353
from botocore.exceptions import ClientError
54+
from config_schema import get_PARALLEL_CLUSTER_MUNGE_VERSION, get_PARALLEL_CLUSTER_PYTHON_VERSION, get_SLURM_VERSION
5455
from constructs import Construct
5556
from copy import copy, deepcopy
5657
from jinja2 import Template as Template
@@ -1262,16 +1263,16 @@ def get_instance_template_vars(self, instance_role):
12621263
instance_template_vars['ParallelClusterVersion'] = self.config['slurm']['ParallelClusterConfig']['Version']
12631264
instance_template_vars['SlurmBaseDir'] = '/opt/slurm'
12641265
instance_template_vars['SlurmOSDir'] = '/opt/slurm'
1265-
instance_template_vars['SlurmVersion'] = self.config['slurm']['SlurmVersion']
1266+
instance_template_vars['SlurmVersion'] = get_SLURM_VERSION(self.config)
12661267

12671268
if instance_role == 'ParallelClusterHeadNode':
12681269
if 'Database' in self.config['slurm']['ParallelClusterConfig']:
12691270
instance_template_vars['AccountingStorageHost'] = 'pcvluster-head-node'
12701271
else:
12711272
instance_template_vars['AccountingStorageHost'] = ''
12721273
instance_template_vars['Licenses'] = self.config['Licenses']
1273-
instance_template_vars['ParallelClusterMungeVersion'] = self.config['slurm']['ParallelClusterConfig']['MungeVersion']
1274-
instance_template_vars['ParallelClusterPythonVersion'] = self.config['slurm']['ParallelClusterConfig']['PythonVersion']
1274+
instance_template_vars['ParallelClusterMungeVersion'] = get_PARALLEL_CLUSTER_MUNGE_VERSION(self.config)
1275+
instance_template_vars['ParallelClusterPythonVersion'] = get_PARALLEL_CLUSTER_PYTHON_VERSION(self.config)
12751276
instance_template_vars['PrimaryController'] = True
12761277
instance_template_vars['SlurmctldPort'] = self.slurmctld_port
12771278
instance_template_vars['SlurmctldPortMin'] = self.slurmctld_port_min
@@ -1284,7 +1285,7 @@ def get_instance_template_vars(self, instance_role):
12841285
instance_template_vars['SlurmrestdUid'] = self.config['slurm']['SlurmCtl']['SlurmrestdUid']
12851286
elif instance_role == 'ParallelClusterSubmitter':
12861287
instance_template_vars['FileSystemMountPath'] = f'/opt/slurm/{cluster_name}'
1287-
instance_template_vars['ParallelClusterMungeVersion'] = self.config['slurm']['ParallelClusterConfig']['MungeVersion']
1288+
instance_template_vars['ParallelClusterMungeVersion'] = get_PARALLEL_CLUSTER_MUNGE_VERSION(self.config)
12881289
instance_template_vars['SlurmBaseDir'] = f'/opt/slurm/{cluster_name}'
12891290
instance_template_vars['SlurmOSDir'] = f'/opt/slurm/{cluster_name}'
12901291

source/cdk/config_schema.py

Lines changed: 59 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -57,37 +57,46 @@
5757
# * Fix pmix CVE
5858
# * Use Slurm 23.02.5
5959
MIN_PARALLEL_CLUSTER_VERSION = parse_version('3.6.0')
60-
DEFAULT_PARALLEL_CLUSTER_VERSION = parse_version('3.7.1')
61-
DEFAULT_PARALLEL_CLUSTER_MUNGE_VERSION = '0.5.15'
62-
DEFAULT_PARALLEL_CLUSTER_MUNGE_VERSIONS = {
63-
'3.6.0': '0.5.15',
64-
'3.6.1': '0.5.15',
65-
'3.7.0': '0.5.15',
66-
'3.7.1': '0.5.15',
60+
DEFAULT_PARALLEL_CLUSTER_VERSION = parse_version('3.7.2')
61+
PARALLEL_CLUSTER_VERSIONS = [
62+
'3.6.0',
63+
'3.6.1',
64+
'3.7.0',
65+
'3.7.1',
66+
'3.7.2',
67+
]
68+
PARALLEL_CLUSTER_MUNGE_VERSIONS = {
69+
# This can be found on the head node at /opt/parallelcluster/sources
70+
# This can be found on the head node at /etc/chef/local-mode-cache/cache/
71+
'3.6.0': '0.5.15', # confirmed
72+
'3.6.1': '0.5.15', # confirmed
73+
'3.7.0': '0.5.15', # confirmed
74+
'3.7.1': '0.5.15', # confirmed
75+
'3.7.2': '0.5.15', # confirmed
6776
}
68-
DEFAULT_PARALLEL_CLUSTER_PYTHON_VERSION = '3.9.16'
69-
DEFAULT_PARALLEL_CLUSTER_PYTHON_VERSIONS = {
70-
'3.6.0': '3.9.16',
71-
'3.6.1': '3.9.16',
72-
'3.7.0': '3.9.16',
73-
'3.7.1': '3.9.16',
77+
PARALLEL_CLUSTER_PYTHON_VERSIONS = {
78+
# This can be found on the head node at /opt/parallelcluster/pyenv/versions
79+
'3.6.0': '3.9.16', # confirmed
80+
'3.6.1': '3.9.16', # confirmed
81+
'3.7.0': '3.9.16', # confirmed
82+
'3.7.1': '3.9.16', # confirmed
83+
'3.7.2': '3.9.16', # confirmed
7484
}
75-
DEFAULT_PARALLEL_CLUSTER_SLURM_VERSION = '23-02-3-1'
76-
DEFAULT_PARALLEL_CLUSTER_SLURM_VERSIONS = {
77-
'3.7.0': DEFAULT_PARALLEL_CLUSTER_SLURM_VERSION,
78-
'3.7.1': DEFAULT_PARALLEL_CLUSTER_SLURM_VERSION,
85+
PARALLEL_CLUSTER_SLURM_VERSIONS = {
86+
# This can be found on the head node at /etc/chef/local-mode-cache/cache/
87+
'3.6.0': '23-02-2-1', # confirmed
88+
'3.6.1': '23-02-2-1', # confirmed
89+
'3.7.0': '23-02-4-1', # confirmed
90+
'3.7.1': '23-02-5-1', # confirmed
91+
'3.7.2': '23-02-6-1', # confirmed
92+
}
93+
SLURM_REST_API_VERSIONS = {
94+
'23-02-2-1': '0.0.39',
95+
'23-02-3-1': '0.0.39',
96+
'23-02-4-1': '0.0.39',
97+
'23-02-5-1': '0.0.39',
98+
'23-02-6-1': '0.0.39',
7999
}
80-
81-
def get_DEFAULT_PARALLEL_CLUSTER_MUNGE_VERSION(config):
82-
parallel_cluster_version = config.get('slurm', {}).get('ParallelClusterConfig', {}).get('Version', DEFAULT_PARALLEL_CLUSTER_VERSION)
83-
munge_version = DEFAULT_PARALLEL_CLUSTER_MUNGE_VERSIONS.get(parallel_cluster_version, str(DEFAULT_PARALLEL_CLUSTER_MUNGE_VERSION))
84-
return munge_version
85-
86-
def get_DEFAULT_PARALLEL_CLUSTER_PYTHON_VERSION(config):
87-
parallel_cluster_version = config.get('slurm', {}).get('ParallelClusterConfig', {}).get('Version', DEFAULT_PARALLEL_CLUSTER_VERSION)
88-
python_version = DEFAULT_PARALLEL_CLUSTER_PYTHON_VERSIONS.get(parallel_cluster_version, str(DEFAULT_PARALLEL_CLUSTER_PYTHON_VERSION))
89-
return python_version
90-
91100
PARALLEL_CLUSTER_ALLOWED_OSES = [
92101
'alinux2',
93102
'centos7',
@@ -96,24 +105,28 @@ def get_DEFAULT_PARALLEL_CLUSTER_PYTHON_VERSION(config):
96105
'ubuntu2204'
97106
]
98107

99-
DEFAULT_SLURM_VERSION = '23.02.1'
100-
def get_DEFAULT_SLURM_VERSION(config):
101-
if config.get('slurm', {}).get('ParallelClusterConfig', {}).get('Enable', False):
102-
parallel_cluster_version = config.get('slurm', {}).get('ParallelClusterConfig', {}).get('Version', DEFAULT_PARALLEL_CLUSTER_VERSION)
103-
slurm_version = DEFAULT_PARALLEL_CLUSTER_SLURM_VERSIONS.get(parallel_cluster_version, DEFAULT_PARALLEL_CLUSTER_SLURM_VERSION)
104-
else:
105-
slurm_version = DEFAULT_SLURM_VERSION
108+
def get_parallel_cluster_version(config):
109+
return config['slurm']['ParallelClusterConfig']['Version']
110+
111+
def get_PARALLEL_CLUSTER_MUNGE_VERSION(config):
112+
parallel_cluster_version = get_parallel_cluster_version(config)
113+
munge_version = PARALLEL_CLUSTER_MUNGE_VERSIONS[parallel_cluster_version]
114+
return munge_version
115+
116+
def get_PARALLEL_CLUSTER_PYTHON_VERSION(config):
117+
parallel_cluster_version = get_parallel_cluster_version(config)
118+
python_version = PARALLEL_CLUSTER_PYTHON_VERSIONS[parallel_cluster_version]
119+
return python_version
120+
121+
def get_SLURM_VERSION(config):
122+
parallel_cluster_version = get_parallel_cluster_version(config)
123+
slurm_version = PARALLEL_CLUSTER_SLURM_VERSIONS[parallel_cluster_version]
106124
return slurm_version
107125

108-
DEFAULT_SLURM_REST_API_VERSION = '0.0.39'
109-
DEFAULT_SLURM_REST_API_VERSIONs = {
110-
'23.02.1': '0.0.39',
111-
'23-02-3-1': '0.0.39',
112-
}
113-
def get_default_slurm_rest_api_version(config):
114-
slurm_version = config.get('slurm', {}).get('SlurmVersion', get_DEFAULT_SLURM_VERSION(config))
115-
default_slurm_rest_api_version = DEFAULT_SLURM_REST_API_VERSIONs.get(slurm_version, DEFAULT_SLURM_REST_API_VERSION)
116-
return default_slurm_rest_api_version
126+
def get_slurm_rest_api_version(config):
127+
slurm_version = get_SLURM_VERSION(config)
128+
slurm_rest_api_version = SLURM_REST_API_VERSIONS.get(slurm_version, )
129+
return slurm_rest_api_version
117130

118131
# Determine all AWS regions available on the account.
119132
default_region = environ.get("AWS_DEFAULT_REGION", "us-east-1")
@@ -249,9 +262,7 @@ def get_config_schema(config):
249262
'slurm': {
250263
Optional('ParallelClusterConfig'): {
251264
Optional('Enable', default=True): And(bool, lambda s: s == True),
252-
Optional('Version', default=str(DEFAULT_PARALLEL_CLUSTER_VERSION)): And(str, lambda s: parse_version(s) >= MIN_PARALLEL_CLUSTER_VERSION),
253-
Optional('MungeVersion', default=get_DEFAULT_PARALLEL_CLUSTER_MUNGE_VERSION(config)): str,
254-
Optional('PythonVersion', default=get_DEFAULT_PARALLEL_CLUSTER_PYTHON_VERSION(config)): str,
265+
Optional('Version', default=str(DEFAULT_PARALLEL_CLUSTER_VERSION)): And(str, lambda version: version in PARALLEL_CLUSTER_VERSIONS, lambda version: parse_version(version) >= MIN_PARALLEL_CLUSTER_VERSION),
255266
Optional('Image', default={'Os': 'centos7'}): {
256267
'Os': And(str, lambda s: s in PARALLEL_CLUSTER_ALLOWED_OSES, ),
257268
Optional('CustomAmi'): And(str, lambda s: s.startswith('ami-')),
@@ -311,10 +322,6 @@ def get_config_schema(config):
311322
]
312323
}
313324
},
314-
# SlurmVersion:
315-
# Latest tested version
316-
# Critical security fix released in 21.08.8. Must be later than that.
317-
Optional('SlurmVersion', default=get_DEFAULT_SLURM_VERSION(config)): str,
318325
#
319326
# ClusterName:
320327
# Name of the ParallelCluster cluster.
@@ -342,7 +349,7 @@ def get_config_schema(config):
342349
# File that will be included at end of slurm.conf to override configuration parameters.
343350
Optional('SlurmConfOverrides'): str,
344351
Optional('SlurmrestdUid', default=901): int,
345-
Optional('SlurmRestApiVersion', default=get_default_slurm_rest_api_version(config)): str,
352+
Optional('SlurmRestApiVersion', default=get_slurm_rest_api_version(config)): str,
346353
},
347354
#
348355
# SubmitterSecurityGroupIds:

source/resources/playbooks/inventories/group_vars/all

Lines changed: 3 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -33,19 +33,13 @@ centos7_7_to_9: "{{distribution in ['CentOS', 'RedHat'] and distribution_version
3333

3434
# Create separate build and release dirs because binaries built on AmazonLinux2 don't run on CentOS 7
3535
SlurmBaseDir: "{{FileSystemMountPath}}"
36-
SlurmAnsibleDir: "{{SlurmBaseDir}}/ansible"
37-
SlurmSrcDir: "{{SlurmBaseDir}}/src/{{SlurmVersion}}/{{distribution}}/{{distribution_major_version}}/{{Architecture}}"
38-
SlurmVersionDir: "{{SlurmBaseDir}}/slurm-{{SlurmVersion}}"
39-
SlurmOSDir: "{{SlurmVersionDir}}/{{distribution}}/{{distribution_major_version}}/{{Architecture}}"
40-
SlurmSbinDir: "{{SlurmOSDir}}/sbin"
41-
SlurmBinDir: "{{SlurmOSDir}}/bin"
36+
SlurmSbinDir: "{{SlurmBaseDir}}/sbin"
37+
SlurmBinDir: "{{SlurmBaseDir}}/bin"
4238
SlurmScriptsDir: "{{SlurmBaseDir}}/bin"
43-
SlurmRoot: "{{SlurmOSDir}}"
39+
SlurmRoot: "{{SlurmBaseDir}}"
4440

4541
# Cluster specific directories
46-
SlurmAccountingDir: "{{SlurmBaseDir}}/accounting"
4742
SlurmConfigDir: "{{SlurmBaseDir}}/config"
48-
SlurmLocalConfigDir: "/opt/slurm/config"
4943
SlurmEtcDir: "{{SlurmBaseDir}}/etc"
5044
SlurmLogsDir: "{{SlurmBaseDir}}/logs"
5145
SlurmrestdSocketDir: "{{SlurmBaseDir}}/com"

source/resources/playbooks/roles/ParallelClusterHeadNode/files/opt/slurm/config/bin/create_slurm_accounts.py

Lines changed: 32 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,7 @@ def __init__(self, accounts_filename, users_filename, default_account):
6161

6262
self.devnull = open(os.devnull, 'w')
6363

64-
logger.debug(f"Configured accounts:\n{json.dumps(self.accounts, indent=4)}")
64+
logger.debug(f"Configured accounts:\n{json.dumps(self.accounts, indent=4, sort_keys=True)}")
6565

6666
# Get all mapped users
6767
self.users_to_accounts_map = {}
@@ -85,12 +85,12 @@ def __init__(self, accounts_filename, users_filename, default_account):
8585
self.accounts[self.default_account]['users'].append(user)
8686

8787
self.slurm_user_account_dict = self.get_slurm_user_account_dict()
88-
logger.debug(f"Current users and accounts in slurmdb:\n{json.dumps(self.slurm_user_account_dict, indent=4)}")
88+
logger.debug(f"Current users and accounts in slurmdb:\n{json.dumps(self.slurm_user_account_dict, indent=4, sort_keys=True)}")
8989

9090
number_of_changes = self.update_slurm()
9191

9292
self.slurm_user_account_dict = self.get_slurm_user_account_dict()
93-
logger.debug(f"Current users and accounts in slurmdb:\n{json.dumps(self.slurm_user_account_dict, indent=4)}")
93+
logger.debug(f"Current users and accounts in slurmdb:\n{json.dumps(self.slurm_user_account_dict, indent=4, sort_keys=True)}")
9494

9595
number_of_changes = self.update_slurm()
9696

@@ -163,6 +163,10 @@ def update_slurm(self):
163163
logger.info(f"Creating user {user} with account={default_account}")
164164
try:
165165
subprocess.run([self.sacctmgr, '-i', 'add', 'user', user, f'Account={default_account}'], check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding='UTF-8') # nosec
166+
self.slurm_user_account_dict['users'][user] = {
167+
'default-account': default_account,
168+
'accounts': {default_account: 1}
169+
}
166170
except subprocess.CalledProcessError as e:
167171
logger.exception(f"Couldn't add user {user}.\ncommand: {e.cmd}\noutput:\n{e.output}")
168172
number_of_errors += 1
@@ -177,10 +181,11 @@ def update_slurm(self):
177181
try:
178182
subprocess.run([self.sacctmgr, '-i', 'add', 'user', user, f'account={account}'], check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding='UTF-8') # nosec
179183
except subprocess.CalledProcessError as e:
180-
logger.info(f"Default account of {user} already {account}.")
181-
if 'Nothing new added' not in e.output:
182-
logger.exception(f"Couldn't change default account of {user} to {account}.\ncommand: {e.cmd}\noutput:\n{e.output}")
183-
number_of_errors += 1
184+
if 'Nothing new added' in e.output:
185+
logger.info(f" Default account of {user} already {account}.")
186+
else:
187+
logger.exception(f" Couldn't change default account of {user} to {account}.\ncommand: {e.cmd}\noutput:\n{e.output}")
188+
number_of_errors += 1
184189
number_of_changes += 1
185190

186191
# Make sure default account of users is correct
@@ -229,8 +234,11 @@ def update_slurm(self):
229234
try:
230235
subprocess.run([self.sacctmgr, '-i', 'delete', 'user', user, 'where', f'Account={account}'], check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding='UTF-8') #nosec
231236
except subprocess.CalledProcessError as e:
232-
logger.error(f"Couldn't delete user {user} from account {account}.\ncommand: {e.cmd}\noutput:\n{e.output}")
233-
number_of_errors += 1
237+
if 'Nothing deleted' in e.output:
238+
logger.warning(f" Couldn't delete user {user} from account {account}.\ncommand: {e.cmd}\noutput:\n{e.output}")
239+
else:
240+
logger.error(f" Couldn't delete user {user} from account {account}.\ncommand: {e.cmd}\noutput:\n{e.output}")
241+
number_of_errors += 1
234242
number_of_changes += 1
235243

236244
# Delete unused accounts
@@ -243,9 +251,22 @@ def update_slurm(self):
243251
number_of_errors += 1
244252
number_of_changes += 1
245253

246-
logger.debug(f"Delete unconfigured users")
247-
for user in sorted(self.slurm_user_account_dict['users']):
254+
# Update the state of slurmdbd users and accounts
255+
self.slurm_user_account_dict = self.get_slurm_user_account_dict()
248256

257+
logger.info(f"Delete unconfigured users")
258+
for user in sorted(self.slurm_user_account_dict['users']):
259+
if user in self.SYSTEM_USERS:
260+
logger.debug(f" Skipping system user {user}")
261+
continue
262+
if user not in self.users_to_accounts_map:
263+
logger.info(f" Deleting user {user} from slurmdbd")
264+
try:
265+
subprocess.run([self.sacctmgr, '-i', 'delete', 'user', user], check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding='UTF-8') #nosec
266+
except subprocess.CalledProcessError as e:
267+
logger.error(f"Couldn't delete user {user} from slurmdbd.\ncommand: {e.cmd}\noutput:\n{e.output}")
268+
number_of_errors += 1
269+
number_of_changes += 1
249270

250271
if number_of_errors:
251272
raise RuntimeError("Some slurm updates failed")

source/resources/playbooks/roles/all/tasks/main.yml

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -33,11 +33,7 @@
3333
ansible_ssh_user: {{ansible_ssh_user}}
3434
ansible_ssh_common_args: {{ansible_ssh_common_args}}
3535
36-
SlurmVersion: {{SlurmVersion}}
3736
SlurmBaseDir: {{SlurmBaseDir}}
38-
SlurmSrcDir: {{SlurmSrcDir}}
39-
SlurmVersionDir: {{SlurmVersionDir}}
40-
SlurmOSDir: {{SlurmOSDir}}
4137
SlurmSbinDir: {{SlurmSbinDir}}
4238
SlurmBinDir: {{SlurmBinDir}}
4339
SlurmScriptsDir: {{SlurmScriptsDir}}
@@ -46,7 +42,6 @@
4642
SupportedDistributions: {{SupportedDistributions}}
4743
4844
Cluster Specific Vars
49-
SlurmAccountingDir: {{SlurmAccountingDir}}
5045
SlurmConfigDir: {{SlurmConfigDir}}
5146
SlurmEtcDir: {{SlurmEtcDir}}
5247
SlurmLogsDir: {{SlurmLogsDir}}

0 commit comments

Comments
 (0)