From 072dce6f2c2999c6b7c3dcef581f7cba287d0574 Mon Sep 17 00:00:00 2001 From: Francis Dutil Date: Fri, 8 Sep 2017 17:44:48 -0400 Subject: [PATCH 01/18] Adding a check to the number of requested gpus to give a meaningful message --- scripts/smart-dispatch | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/smart-dispatch b/scripts/smart-dispatch index 86904fa..87377c5 100755 --- a/scripts/smart-dispatch +++ b/scripts/smart-dispatch @@ -155,6 +155,13 @@ def main(): .format(req_cores=args.coresPerCommand, node_cores=queue.nb_cores_per_node)) sys.exit(2) + # Check that requested gpu number does not exceed node total + if args.gpusPerCommand > queue.nb_gpus_per_node: + sys.stderr.write("smart-dispatch: error: gpusPerCommand exceeds nodes total: asked {req_gpus} gpus, nodes have {node_gpus}\n" + .format(req_gpus=args.gpusPerCommand, node_gpus=queue.nb_gpus_per_node)) + sys.exit(2) + + command_params = {'nb_cores_per_command': args.coresPerCommand, 'nb_gpus_per_command': args.gpusPerCommand, 'mem_per_command': None # args.memPerCommand From f6ab791ffd42c237651a40bea029906f7c717fdb Mon Sep 17 00:00:00 2001 From: Francis Dutil Date: Mon, 11 Sep 2017 14:46:02 -0400 Subject: [PATCH 02/18] The package is now installed using entry_points. The old scripts, "scripts/smart-dispatch" and "scripts/sd-launch-pbs", have been moved to "smartdispatch/smartdispatch_script.py" and "smartdispatch/sd_launch_pbs_script.py". 
The last commit of "scripts/smart-dispatch" was: 072dce6f2c2999c6b7c3dcef581f7cba287d0574 --- scripts/sd-launch-pbs | 32 +--- scripts/smart-dispatch | 237 +------------------------ setup.py | 5 +- smartdispatch/sd_launch_pbs_script.py | 33 ++++ smartdispatch/smartdispatch_script.py | 239 ++++++++++++++++++++++++++ 5 files changed, 282 insertions(+), 264 deletions(-) create mode 100644 smartdispatch/sd_launch_pbs_script.py create mode 100755 smartdispatch/smartdispatch_script.py diff --git a/scripts/sd-launch-pbs b/scripts/sd-launch-pbs index 0b38733..e545e52 100644 --- a/scripts/sd-launch-pbs +++ b/scripts/sd-launch-pbs @@ -1,36 +1,10 @@ #!/usr/bin/env python2 # -*- coding: utf-8 -*- -import argparse import logging -from smartdispatch import launch_jobs -from smartdispatch import utils - -LOGS_FOLDERNAME = "SMART_DISPATCH_LOGS" -CLUSTER_NAME = utils.detect_cluster() -LAUNCHER = utils.get_launcher(CLUSTER_NAME) - - -def main(): - # Necessary if we want 'logging.info' to appear in stderr. - logging.root.setLevel(logging.INFO) - - args = parse_arguments() - - launch_jobs(LAUNCHER if args.launcher is None else args.launcher, [args.pbs], CLUSTER_NAME, args.path_job) - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument('-L', '--launcher', choices=['qsub', 'msub'], required=False, help='Which launcher to use. Default: qsub') - parser.add_argument('pbs', type=str, help='PBS filename to launch.') - parser.add_argument('path_job', type=str, help='Path to the job folder.') - - args = parser.parse_args() - - return args - +from smartdispatch import sd_launch_pbs_script if __name__ == "__main__": - main() + logging.warn("This file is deprecated. 
The script 'sd-launch-pbs' should be use instead.") + sd_launch_pbs_script.main() diff --git a/scripts/smart-dispatch b/scripts/smart-dispatch index 87377c5..e223ec5 100755 --- a/scripts/smart-dispatch +++ b/scripts/smart-dispatch @@ -1,240 +1,11 @@ #!/usr/bin/env python2 # -*- coding: utf-8 -*- -import os -import sys -import argparse -import time as t -from os.path import join as pjoin -from textwrap import dedent - -from smartdispatch.command_manager import CommandManager - -from smartdispatch.queue import Queue -from smartdispatch.job_generator import job_generator_factory -from smartdispatch import get_available_queues -from smartdispatch import launch_jobs -from smartdispatch import utils - import logging -import smartdispatch - -LOGS_FOLDERNAME = "SMART_DISPATCH_LOGS" -CLUSTER_NAME = utils.detect_cluster() -AVAILABLE_QUEUES = get_available_queues(CLUSTER_NAME) -LAUNCHER = utils.get_launcher(CLUSTER_NAME) - -# Autoresume settings. -TIMEOUT_EXIT_CODE = 124 -AUTORESUME_TRIGGER_AFTER = '$(($PBS_WALLTIME - 60))' # By default, 60s before the maximum walltime. -AUTORESUME_WORKER_CALL_PREFIX = 'timeout -s TERM {trigger_after} '.format(trigger_after=AUTORESUME_TRIGGER_AFTER) -AUTORESUME_WORKER_CALL_SUFFIX = ' WORKER_PIDS+=" $!"' -AUTORESUME_PROLOG = 'WORKER_PIDS=""' -AUTORESUME_EPILOG = """\ -NEED_TO_RESUME=false -for WORKER_PID in $WORKER_PIDS; do - wait "$WORKER_PID" - RETURN_CODE=$? - if [ $RETURN_CODE -eq {timeout_exit_code} ]; then - NEED_TO_RESUME=true - fi -done -if [ "$NEED_TO_RESUME" = true ]; then - echo "Autoresuming using: {{launcher}} $PBS_FILENAME" - sd-launch-pbs --launcher {{launcher}} $PBS_FILENAME {{path_job}} -fi -""".format(timeout_exit_code=TIMEOUT_EXIT_CODE) - - -def main(): - # Necessary if we want 'logging.info' to appear in stderr. 
- logging.root.setLevel(logging.INFO) - - args = parse_arguments() - path_smartdispatch_logs = pjoin(os.getcwd(), LOGS_FOLDERNAME) - - # Check if RESUME or LAUNCH mode - if args.mode == "launch": - if args.commandsFile is not None: - # Commands are listed in a file. - jobname = smartdispatch.generate_logfolder_name(os.path.basename(args.commandsFile.name), max_length=235) - commands = smartdispatch.get_commands_from_file(args.commandsFile) - else: - # Command that needs to be parsed and unfolded. - command = " ".join(args.commandAndOptions) - jobname = smartdispatch.generate_name_from_command(command, max_length=235) - commands = smartdispatch.unfold_command(command) - - commands = smartdispatch.replace_uid_tag(commands) - nb_commands = len(commands) # For print at the end - - if args.batchName: - jobname = smartdispatch.generate_logfolder_name(utils.slugify(args.batchName), max_length=235) - - elif args.mode == "resume": - jobname = args.batch_uid - if os.path.isdir(jobname): - # We assume `jobname` is `path_job` repo, we extract the real `jobname`. - jobname = os.path.basename(os.path.abspath(jobname)) - - if not os.path.isdir(pjoin(path_smartdispatch_logs, jobname)): - raise LookupError("Batch UID ({0}) does not exist! Cannot resume.".format(jobname)) - else: - raise ValueError("Unknown subcommand!") - - job_folders_paths = smartdispatch.get_job_folders(path_smartdispatch_logs, jobname) - path_job, path_job_logs, path_job_commands = job_folders_paths - - # Keep a log of the command line in the job folder. 
- command_line = " ".join(sys.argv) - smartdispatch.log_command_line(path_job, command_line) - - command_manager = CommandManager(pjoin(path_job_commands, "commands.txt")) - - # If resume mode, reset running jobs - if args.mode == "launch": - command_manager.set_commands_to_run(commands) - elif args.mode == "resume": - # Verifying if there are failed commands - failed_commands = command_manager.get_failed_commands() - if len(failed_commands) > 0: - FAILED_COMMAND_MESSAGE = dedent("""\ - {nb_failed} command(s) are in a failed state. They won't be resumed. - Failed commands: - {failed_commands} - The actual errors can be found in the log folder under: - {failed_commands_err_file}""") - utils.print_boxed(FAILED_COMMAND_MESSAGE.format( - nb_failed=len(failed_commands), - failed_commands=''.join(failed_commands), - failed_commands_err_file='\n'.join([utils.generate_uid_from_string(c[:-1]) + '.err' for c in failed_commands]) - )) - - if not utils.yes_no_prompt("Do you want to continue?", 'n'): - exit() - - if args.expandPool is None: - command_manager.reset_running_commands() - - nb_commands = command_manager.get_nb_commands_to_run() - - if args.expandPool is not None: - args.pool = min(nb_commands, args.expandPool) - - # If no pool size is specified the number of commands is taken - if args.pool is None: - args.pool = command_manager.get_nb_commands_to_run() - - # Generating all the worker commands - worker_script = pjoin(os.path.dirname(smartdispatch.__file__), 'workers', 'base_worker.py') - worker_script_flags = '' - if args.autoresume: - worker_script_flags = '-r' - - worker_call_prefix = '' - worker_call_suffix = '' - if args.autoresume: - worker_call_prefix = AUTORESUME_WORKER_CALL_PREFIX - worker_call_suffix = AUTORESUME_WORKER_CALL_SUFFIX - - COMMAND_STRING = 'cd "{cwd}"; {worker_call_prefix}python2 {worker_script} {worker_script_flags} "{commands_file}" "{log_folder}" '\ - '1>> "{log_folder}/worker/$PBS_JOBID\"\"_worker_{{ID}}.o" '\ - '2>> 
"{log_folder}/worker/$PBS_JOBID\"\"_worker_{{ID}}.e" &'\ - '{worker_call_suffix}' - COMMAND_STRING = COMMAND_STRING.format(cwd=os.getcwd(), worker_call_prefix=worker_call_prefix, worker_script=worker_script, - worker_script_flags=worker_script_flags, commands_file=command_manager._commands_filename, - log_folder=path_job_logs, worker_call_suffix=worker_call_suffix) - commands = [COMMAND_STRING.format(ID=i) for i in range(args.pool)] - - # TODO: use args.memPerNode instead of args.memPerNode - queue = Queue(args.queueName, CLUSTER_NAME, args.walltime, args.coresPerNode, args.gpusPerNode, float('inf'), args.modules) - - # Check that requested core number does not exceed node total - if args.coresPerCommand > queue.nb_cores_per_node: - sys.stderr.write("smart-dispatch: error: coresPerCommand exceeds nodes total: asked {req_cores} cores, nodes have {node_cores}\n" - .format(req_cores=args.coresPerCommand, node_cores=queue.nb_cores_per_node)) - sys.exit(2) - - # Check that requested gpu number does not exceed node total - if args.gpusPerCommand > queue.nb_gpus_per_node: - sys.stderr.write("smart-dispatch: error: gpusPerCommand exceeds nodes total: asked {req_gpus} gpus, nodes have {node_gpus}\n" - .format(req_gpus=args.gpusPerCommand, node_gpus=queue.nb_gpus_per_node)) - sys.exit(2) - - - command_params = {'nb_cores_per_command': args.coresPerCommand, - 'nb_gpus_per_command': args.gpusPerCommand, - 'mem_per_command': None # args.memPerCommand - } - - prolog = [] - epilog = ['wait'] - if args.autoresume: - prolog = [AUTORESUME_PROLOG] - epilog = [AUTORESUME_EPILOG.format(launcher=LAUNCHER if args.launcher is None else args.launcher, path_job=path_job)] - - job_generator = job_generator_factory(queue, commands, prolog, epilog, command_params, CLUSTER_NAME, path_job) - - # generating default names per each jobs in each batch - for pbs_id, pbs in enumerate(job_generator.pbs_list): - proper_size_name = utils.jobname_generator(jobname, pbs_id) - 
pbs.add_options(N=proper_size_name) - - if args.pbsFlags is not None: - job_generator.add_pbs_flags(args.pbsFlags.split(' ')) - pbs_filenames = job_generator.write_pbs_files(path_job_commands) - - # Launch the jobs - print "## {nb_commands} command(s) will be executed in {nb_jobs} job(s) ##".format(nb_commands=nb_commands, nb_jobs=len(pbs_filenames)) - print "Batch UID:\n{batch_uid}".format(batch_uid=jobname) - if not args.doNotLaunch: - launch_jobs(LAUNCHER if args.launcher is None else args.launcher, pbs_filenames, CLUSTER_NAME, path_job) - print "\nLogs, command, and jobs id related to this batch will be in:\n {smartdispatch_folder}".format(smartdispatch_folder=path_job) - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument('-q', '--queueName', required=True, help='Queue used (ex: qwork@mp2, qfat256@mp2, gpu_1)') - parser.add_argument('-n', '--batchName', required=False, help='The name of the batch. Default: The commands launched.') - parser.add_argument('-t', '--walltime', required=False, help='Set the estimated running time of your jobs using the DD:HH:MM:SS format. Note that they will be killed when this time limit is reached.') - parser.add_argument('-L', '--launcher', choices=['qsub', 'msub'], required=False, help='Which launcher to use. 
Default: qsub') - parser.add_argument('-C', '--coresPerNode', type=int, required=False, help='How many cores there are per node.') - parser.add_argument('-G', '--gpusPerNode', type=int, required=False, help='How many gpus there are per node.') - # parser.add_argument('-M', '--memPerNode', type=int, required=False, help='How much memory there are per node (in Gb).') - - parser.add_argument('-c', '--coresPerCommand', type=int, required=False, help='How many cores a command needs.', default=1) - parser.add_argument('-g', '--gpusPerCommand', type=int, required=False, help='How many gpus a command needs.', default=1) - # parser.add_argument('-m', '--memPerCommand', type=float, required=False, help='How much memory a command needs (in Gb).') - parser.add_argument('-f', '--commandsFile', type=file, required=False, help='File containing commands to launch. Each command must be on a seperate line. (Replaces commandAndOptions)') - - parser.add_argument('-l', '--modules', type=str, required=False, help='List of additional modules to load.', nargs='+') - parser.add_argument('-x', '--doNotLaunch', action='store_true', help='Generate all the files without launching the job.') - parser.add_argument('-r', '--autoresume', action='store_true', help='Requeue the job when the running time hits the maximum walltime allowed on the cluster. Assumes that commands are resumable.') - - parser.add_argument('-p', '--pool', type=int, help="Number of workers that will be consuming commands. Default: Nb commands") - parser.add_argument('--pbsFlags', type=str, help='ADVANCED USAGE: Allow to pass a space seperated list of PBS flags. 
Ex:--pbsFlags="-lfeature=k80 -t0-4"') - subparsers = parser.add_subparsers(dest="mode") - - launch_parser = subparsers.add_parser('launch', help="Launch jobs.") - launch_parser.add_argument("commandAndOptions", help="Options for the commands.", nargs=argparse.REMAINDER) - - resume_parser = subparsers.add_parser('resume', help="Resume jobs from batch UID.") - resume_parser.add_argument('--expandPool', type=int, nargs='?', const=sys.maxsize, help='Add workers to the given batch. Default: # pending jobs.') - resume_parser.add_argument("batch_uid", help="Batch UID of the jobs to resume.") - - args = parser.parse_args() - - # Check for invalid arguments in - if args.mode == "launch": - if args.commandsFile is None and len(args.commandAndOptions) < 1: - parser.error("You need to specify a command to launch.") - if args.queueName not in AVAILABLE_QUEUES and ((args.coresPerNode is None and args.gpusPerNode is None) or args.walltime is None): - parser.error("Unknown queue, --coresPerNode/--gpusPerNode and --walltime must be set.") - if args.coresPerCommand < 1: - parser.error("coresPerNode must be at least 1") - - return args +from smartdispatch import smartdispatch_script if __name__ == "__main__": - main() + + logging.warn("This file is deprecated. 
The script 'smart-dispatch' should be use instead.") + smartdispatch_script.main() diff --git a/setup.py b/setup.py index 1da3ffe..6e708a6 100644 --- a/setup.py +++ b/setup.py @@ -8,8 +8,9 @@ author_email='smart-udes-dev@googlegroups.com', packages=['smartdispatch', 'smartdispatch/workers'], - scripts=['scripts/smart-dispatch', - 'scripts/sd-launch-pbs'], + entry_points={ + 'console_scripts': ['smart-dispatch = smartdispatch.smartdispatch_script:main', + 'sd-launch-pbs = smartdispatch.sd_launch_pbs_script:main']}, url='https://github.com/SMART-Lab/smartdispatch', license='LICENSE.txt', description='An easy to use job launcher for supercomputers with PBS compatible job manager.', diff --git a/smartdispatch/sd_launch_pbs_script.py b/smartdispatch/sd_launch_pbs_script.py new file mode 100644 index 0000000..9e537b9 --- /dev/null +++ b/smartdispatch/sd_launch_pbs_script.py @@ -0,0 +1,33 @@ +import argparse +import logging + +from smartdispatch import launch_jobs +from smartdispatch import utils + +LOGS_FOLDERNAME = "SMART_DISPATCH_LOGS" +CLUSTER_NAME = utils.detect_cluster() +LAUNCHER = utils.get_launcher(CLUSTER_NAME) + + +def main(): + # Necessary if we want 'logging.info' to appear in stderr. + logging.root.setLevel(logging.INFO) + + args = parse_arguments() + + launch_jobs(LAUNCHER if args.launcher is None else args.launcher, [args.pbs], CLUSTER_NAME, args.path_job) + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('-L', '--launcher', choices=['qsub', 'msub'], required=False, help='Which launcher to use. 
Default: qsub') + parser.add_argument('pbs', type=str, help='PBS filename to launch.') + parser.add_argument('path_job', type=str, help='Path to the job folder.') + + args = parser.parse_args() + + return args + + +if __name__ == "__main__": + main() diff --git a/smartdispatch/smartdispatch_script.py b/smartdispatch/smartdispatch_script.py new file mode 100755 index 0000000..9bae0fa --- /dev/null +++ b/smartdispatch/smartdispatch_script.py @@ -0,0 +1,239 @@ +import os +import sys +import argparse +import time as t +from os.path import join as pjoin +from textwrap import dedent + + +import smartdispatch +from command_manager import CommandManager + +from queue import Queue +from job_generator import job_generator_factory +from smartdispatch import get_available_queues +from smartdispatch import launch_jobs +from smartdispatch import utils + +import logging +import smartdispatch + +LOGS_FOLDERNAME = "SMART_DISPATCH_LOGS" +CLUSTER_NAME = utils.detect_cluster() +AVAILABLE_QUEUES = get_available_queues(CLUSTER_NAME) +LAUNCHER = utils.get_launcher(CLUSTER_NAME) + +# Autoresume settings. +TIMEOUT_EXIT_CODE = 124 +AUTORESUME_TRIGGER_AFTER = '$(($PBS_WALLTIME - 60))' # By default, 60s before the maximum walltime. +AUTORESUME_WORKER_CALL_PREFIX = 'timeout -s TERM {trigger_after} '.format(trigger_after=AUTORESUME_TRIGGER_AFTER) +AUTORESUME_WORKER_CALL_SUFFIX = ' WORKER_PIDS+=" $!"' +AUTORESUME_PROLOG = 'WORKER_PIDS=""' +AUTORESUME_EPILOG = """\ +NEED_TO_RESUME=false +for WORKER_PID in $WORKER_PIDS; do + wait "$WORKER_PID" + RETURN_CODE=$? + if [ $RETURN_CODE -eq {timeout_exit_code} ]; then + NEED_TO_RESUME=true + fi +done +if [ "$NEED_TO_RESUME" = true ]; then + echo "Autoresuming using: {{launcher}} $PBS_FILENAME" + sd-launch-pbs --launcher {{launcher}} $PBS_FILENAME {{path_job}} +fi +""".format(timeout_exit_code=TIMEOUT_EXIT_CODE) + + +def main(): + # Necessary if we want 'logging.info' to appear in stderr. 
+ logging.root.setLevel(logging.INFO) + + args = parse_arguments() + path_smartdispatch_logs = pjoin(os.getcwd(), LOGS_FOLDERNAME) + + # Check if RESUME or LAUNCH mode + if args.mode == "launch": + if args.commandsFile is not None: + # Commands are listed in a file. + jobname = smartdispatch.generate_logfolder_name(os.path.basename(args.commandsFile.name), max_length=235) + commands = smartdispatch.get_commands_from_file(args.commandsFile) + else: + # Command that needs to be parsed and unfolded. + command = " ".join(args.commandAndOptions) + jobname = smartdispatch.generate_name_from_command(command, max_length=235) + commands = smartdispatch.unfold_command(command) + + commands = smartdispatch.replace_uid_tag(commands) + nb_commands = len(commands) # For print at the end + + if args.batchName: + jobname = smartdispatch.generate_logfolder_name(utils.slugify(args.batchName), max_length=235) + + elif args.mode == "resume": + jobname = args.batch_uid + if os.path.isdir(jobname): + # We assume `jobname` is `path_job` repo, we extract the real `jobname`. + jobname = os.path.basename(os.path.abspath(jobname)) + + if not os.path.isdir(pjoin(path_smartdispatch_logs, jobname)): + raise LookupError("Batch UID ({0}) does not exist! Cannot resume.".format(jobname)) + else: + raise ValueError("Unknown subcommand!") + + job_folders_paths = smartdispatch.get_job_folders(path_smartdispatch_logs, jobname) + path_job, path_job_logs, path_job_commands = job_folders_paths + + # Keep a log of the command line in the job folder. 
+ command_line = " ".join(sys.argv) + smartdispatch.log_command_line(path_job, command_line) + + command_manager = CommandManager(pjoin(path_job_commands, "commands.txt")) + + # If resume mode, reset running jobs + if args.mode == "launch": + command_manager.set_commands_to_run(commands) + elif args.mode == "resume": + # Verifying if there are failed commands + failed_commands = command_manager.get_failed_commands() + if len(failed_commands) > 0: + FAILED_COMMAND_MESSAGE = dedent("""\ + {nb_failed} command(s) are in a failed state. They won't be resumed. + Failed commands: + {failed_commands} + The actual errors can be found in the log folder under: + {failed_commands_err_file}""") + utils.print_boxed(FAILED_COMMAND_MESSAGE.format( + nb_failed=len(failed_commands), + failed_commands=''.join(failed_commands), + failed_commands_err_file='\n'.join([utils.generate_uid_from_string(c[:-1]) + '.err' for c in failed_commands]) + )) + + if not utils.yes_no_prompt("Do you want to continue?", 'n'): + exit() + + if args.expandPool is None: + command_manager.reset_running_commands() + + nb_commands = command_manager.get_nb_commands_to_run() + + if args.expandPool is not None: + args.pool = min(nb_commands, args.expandPool) + + # If no pool size is specified the number of commands is taken + if args.pool is None: + args.pool = command_manager.get_nb_commands_to_run() + + # Generating all the worker commands + worker_script = pjoin(os.path.dirname(smartdispatch.__file__), 'workers', 'base_worker.py') + worker_script_flags = '' + if args.autoresume: + worker_script_flags = '-r' + + worker_call_prefix = '' + worker_call_suffix = '' + if args.autoresume: + worker_call_prefix = AUTORESUME_WORKER_CALL_PREFIX + worker_call_suffix = AUTORESUME_WORKER_CALL_SUFFIX + + COMMAND_STRING = 'cd "{cwd}"; {worker_call_prefix}python2 {worker_script} {worker_script_flags} "{commands_file}" "{log_folder}" '\ + '1>> "{log_folder}/worker/$PBS_JOBID\"\"_worker_{{ID}}.o" '\ + '2>> 
"{log_folder}/worker/$PBS_JOBID\"\"_worker_{{ID}}.e" &'\ + '{worker_call_suffix}' + COMMAND_STRING = COMMAND_STRING.format(cwd=os.getcwd(), worker_call_prefix=worker_call_prefix, worker_script=worker_script, + worker_script_flags=worker_script_flags, commands_file=command_manager._commands_filename, + log_folder=path_job_logs, worker_call_suffix=worker_call_suffix) + commands = [COMMAND_STRING.format(ID=i) for i in range(args.pool)] + + # TODO: use args.memPerNode instead of args.memPerNode + queue = Queue(args.queueName, CLUSTER_NAME, args.walltime, args.coresPerNode, args.gpusPerNode, float('inf'), args.modules) + + # Check that requested core number does not exceed node total + if args.coresPerCommand > queue.nb_cores_per_node: + sys.stderr.write("smart-dispatch: error: coresPerCommand exceeds nodes total: asked {req_cores} cores, nodes have {node_cores}\n" + .format(req_cores=args.coresPerCommand, node_cores=queue.nb_cores_per_node)) + sys.exit(2) + + # Check that requested gpu number does not exceed node total + if args.gpusPerCommand > queue.nb_gpus_per_node: + sys.stderr.write("smart-dispatch: error: gpusPerCommand exceeds nodes total: asked {req_gpus} gpus, nodes have {node_gpus}\n" + .format(req_gpus=args.gpusPerCommand, node_gpus=queue.nb_gpus_per_node)) + sys.exit(2) + + + command_params = {'nb_cores_per_command': args.coresPerCommand, + 'nb_gpus_per_command': args.gpusPerCommand, + 'mem_per_command': None # args.memPerCommand + } + + prolog = [] + epilog = ['wait'] + if args.autoresume: + prolog = [AUTORESUME_PROLOG] + epilog = [AUTORESUME_EPILOG.format(launcher=LAUNCHER if args.launcher is None else args.launcher, path_job=path_job)] + + job_generator = job_generator_factory(queue, commands, prolog, epilog, command_params, CLUSTER_NAME, path_job) + + # generating default names per each jobs in each batch + for pbs_id, pbs in enumerate(job_generator.pbs_list): + proper_size_name = utils.jobname_generator(jobname, pbs_id) + 
pbs.add_options(N=proper_size_name) + + if args.pbsFlags is not None: + job_generator.add_pbs_flags(args.pbsFlags.split(' ')) + pbs_filenames = job_generator.write_pbs_files(path_job_commands) + + # Launch the jobs + print "## {nb_commands} command(s) will be executed in {nb_jobs} job(s) ##".format(nb_commands=nb_commands, nb_jobs=len(pbs_filenames)) + print "Batch UID:\n{batch_uid}".format(batch_uid=jobname) + if not args.doNotLaunch: + launch_jobs(LAUNCHER if args.launcher is None else args.launcher, pbs_filenames, CLUSTER_NAME, path_job) + print "\nLogs, command, and jobs id related to this batch will be in:\n {smartdispatch_folder}".format(smartdispatch_folder=path_job) + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('-q', '--queueName', required=True, help='Queue used (ex: qwork@mp2, qfat256@mp2, gpu_1)') + parser.add_argument('-n', '--batchName', required=False, help='The name of the batch. Default: The commands launched.') + parser.add_argument('-t', '--walltime', required=False, help='Set the estimated running time of your jobs using the DD:HH:MM:SS format. Note that they will be killed when this time limit is reached.') + parser.add_argument('-L', '--launcher', choices=['qsub', 'msub'], required=False, help='Which launcher to use. 
Default: qsub') + parser.add_argument('-C', '--coresPerNode', type=int, required=False, help='How many cores there are per node.') + parser.add_argument('-G', '--gpusPerNode', type=int, required=False, help='How many gpus there are per node.') + # parser.add_argument('-M', '--memPerNode', type=int, required=False, help='How much memory there are per node (in Gb).') + + parser.add_argument('-c', '--coresPerCommand', type=int, required=False, help='How many cores a command needs.', default=1) + parser.add_argument('-g', '--gpusPerCommand', type=int, required=False, help='How many gpus a command needs.', default=1) + # parser.add_argument('-m', '--memPerCommand', type=float, required=False, help='How much memory a command needs (in Gb).') + parser.add_argument('-f', '--commandsFile', type=file, required=False, help='File containing commands to launch. Each command must be on a seperate line. (Replaces commandAndOptions)') + + parser.add_argument('-l', '--modules', type=str, required=False, help='List of additional modules to load.', nargs='+') + parser.add_argument('-x', '--doNotLaunch', action='store_true', help='Generate all the files without launching the job.') + parser.add_argument('-r', '--autoresume', action='store_true', help='Requeue the job when the running time hits the maximum walltime allowed on the cluster. Assumes that commands are resumable.') + + parser.add_argument('-p', '--pool', type=int, help="Number of workers that will be consuming commands. Default: Nb commands") + parser.add_argument('--pbsFlags', type=str, help='ADVANCED USAGE: Allow to pass a space seperated list of PBS flags. 
Ex:--pbsFlags="-lfeature=k80 -t0-4"') + subparsers = parser.add_subparsers(dest="mode") + + launch_parser = subparsers.add_parser('launch', help="Launch jobs.") + launch_parser.add_argument("commandAndOptions", help="Options for the commands.", nargs=argparse.REMAINDER) + + resume_parser = subparsers.add_parser('resume', help="Resume jobs from batch UID.") + resume_parser.add_argument('--expandPool', type=int, nargs='?', const=sys.maxsize, help='Add workers to the given batch. Default: # pending jobs.') + resume_parser.add_argument("batch_uid", help="Batch UID of the jobs to resume.") + + args = parser.parse_args() + + # Check for invalid arguments in + if args.mode == "launch": + if args.commandsFile is None and len(args.commandAndOptions) < 1: + parser.error("You need to specify a command to launch.") + if args.queueName not in AVAILABLE_QUEUES and ((args.coresPerNode is None and args.gpusPerNode is None) or args.walltime is None): + parser.error("Unknown queue, --coresPerNode/--gpusPerNode and --walltime must be set.") + if args.coresPerCommand < 1: + parser.error("coresPerNode must be at least 1") + + return args + + +if __name__ == "__main__": + main() From d9c4e9cec4bb57bcf025b4b5597cc5d39ea1e8d6 Mon Sep 17 00:00:00 2001 From: Francis Dutil Date: Tue, 12 Sep 2017 18:42:39 -0400 Subject: [PATCH 03/18] changed the call to check_output to be able to mock it. 
--- smartdispatch/smartdispatch.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/smartdispatch/smartdispatch.py b/smartdispatch/smartdispatch.py index b38e8cf..9b058aa 100644 --- a/smartdispatch/smartdispatch.py +++ b/smartdispatch/smartdispatch.py @@ -5,8 +5,7 @@ import itertools import time as t from os.path import join as pjoin -from subprocess import check_output - +import subprocess import smartdispatch from smartdispatch import utils from smartdispatch.filelock import open_with_lock @@ -183,13 +182,13 @@ def launch_jobs(launcher, pbs_filenames, cluster_name, path_job): # pragma: no ''' jobs_id = [] for pbs_filename in pbs_filenames: - launcher_output = check_output('PBS_FILENAME={pbs_filename} {launcher} {pbs_filename}'.format( + launcher_output = subprocess.check_output('PBS_FILENAME={pbs_filename} {launcher} {pbs_filename}'.format( launcher=launcher, pbs_filename=pbs_filename), shell=True) jobs_id += [launcher_output.strip()] # On some clusters, SRMJID and PBS_JOBID don't match if cluster_name in ['helios']: - launcher_output = check_output(['qstat', '-f']).split('Job Id: ') + launcher_output = subprocess.check_output(['qstat', '-f']).split('Job Id: ') for job in launcher_output: if re.search(r"SRMJID:{job_id}".format(job_id=jobs_id[-1]), job): pbs_job_id = re.match(r"[0-9a-zA-Z.-]*", job).group() From 4a0d63d3551f82e827bd06a2722e07bff3ab0a71 Mon Sep 17 00:00:00 2001 From: Francis Dutil Date: Tue, 12 Sep 2017 18:43:20 -0400 Subject: [PATCH 04/18] Adding a check for the launcher error. 
--- smartdispatch/smartdispatch_script.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/smartdispatch/smartdispatch_script.py b/smartdispatch/smartdispatch_script.py index 9bae0fa..9e4b9c8 100755 --- a/smartdispatch/smartdispatch_script.py +++ b/smartdispatch/smartdispatch_script.py @@ -8,7 +8,7 @@ import smartdispatch from command_manager import CommandManager - +import subprocess from queue import Queue from job_generator import job_generator_factory from smartdispatch import get_available_queues @@ -45,11 +45,11 @@ """.format(timeout_exit_code=TIMEOUT_EXIT_CODE) -def main(): +def main(argv=None): # Necessary if we want 'logging.info' to appear in stderr. logging.root.setLevel(logging.INFO) - args = parse_arguments() + args = parse_arguments(argv) path_smartdispatch_logs = pjoin(os.getcwd(), LOGS_FOLDERNAME) # Check if RESUME or LAUNCH mode @@ -187,11 +187,21 @@ def main(): print "## {nb_commands} command(s) will be executed in {nb_jobs} job(s) ##".format(nb_commands=nb_commands, nb_jobs=len(pbs_filenames)) print "Batch UID:\n{batch_uid}".format(batch_uid=jobname) if not args.doNotLaunch: - launch_jobs(LAUNCHER if args.launcher is None else args.launcher, pbs_filenames, CLUSTER_NAME, path_job) + + try: + launch_jobs(LAUNCHER if args.launcher is None else args.launcher, pbs_filenames, CLUSTER_NAME, path_job) + except subprocess.CalledProcessError as e: + sys.stderr.write("smart-dispatch: error: The launcher wasn't able the launch the job(s) properly. 
Maybe the pbs file(s) generated were invalid: \n{}".format(e.output)) + sys.exit(2) + print "\nLogs, command, and jobs id related to this batch will be in:\n {smartdispatch_folder}".format(smartdispatch_folder=path_job) -def parse_arguments(): +def parse_arguments(argv=None): + + if argv is None: + argv = sys.argv[1:] + parser = argparse.ArgumentParser() parser.add_argument('-q', '--queueName', required=True, help='Queue used (ex: qwork@mp2, qfat256@mp2, gpu_1)') parser.add_argument('-n', '--batchName', required=False, help='The name of the batch. Default: The commands launched.') @@ -221,7 +231,7 @@ def parse_arguments(): resume_parser.add_argument('--expandPool', type=int, nargs='?', const=sys.maxsize, help='Add workers to the given batch. Default: # pending jobs.') resume_parser.add_argument("batch_uid", help="Batch UID of the jobs to resume.") - args = parser.parse_args() + args = parser.parse_args(argv) # Check for invalid arguments in if args.mode == "launch": From 9f637717bf6d479b4f49b149ae1e1f604949121c Mon Sep 17 00:00:00 2001 From: Francis Dutil Date: Tue, 12 Sep 2017 19:02:17 -0400 Subject: [PATCH 05/18] Adding test to test the well behaviour of the script --- .../tests/test_smartdispatch_script.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 smartdispatch/tests/test_smartdispatch_script.py diff --git a/smartdispatch/tests/test_smartdispatch_script.py b/smartdispatch/tests/test_smartdispatch_script.py new file mode 100644 index 0000000..6f70297 --- /dev/null +++ b/smartdispatch/tests/test_smartdispatch_script.py @@ -0,0 +1,49 @@ +import unittest +from smartdispatch import smartdispatch_script +import subprocess +from mock import patch +import tempfile as tmp +import shutil + + +class TestSmartScript(unittest.TestCase): + + def setUp(self): + self._base_dir = tmp.mkdtemp() + smartdispatch_script.LOGS_FOLDERNAME = self._base_dir + + def tearDown(self): + shutil.rmtree(self._base_dir) + + def test_gpu_check(self): + + argv = 
['-x', '-g', '2', '-G', '1', '-q', 'gpu_1', 'launch', 'echo', 'testing123'] + + with self.assertRaises(SystemExit) as context: + smartdispatch_script.main(argv=argv) + + self.assertTrue(context.exception.code, 2) + + def test_cpu_check(self): + + argv = ['-x', '-c', '2', '-C', '1', '-q', 'gpu_1', 'launch', 'echo', 'testing123'] + + with self.assertRaises(SystemExit) as context: + smartdispatch_script.main(argv=argv) + + self.assertTrue(context.exception.code, 2) + + @patch('subprocess.check_output') + def test_launch_job_check(self, mock_check_output): + + mock_check_output.side_effect = subprocess.CalledProcessError(1, 1, "A wild error appeared!") + argv = ['-q', 'gpu_1', 'launch', 'echo', 'testing123'] + + try: + with self.assertRaises(SystemExit) as context: + smartdispatch_script.main(argv=argv) + + self.assertTrue(context.exception.code, 2) + + except subprocess.CalledProcessError: + self.fail("smartdispatch_script.main() raised CalledProcessError unexpectedly!") From cc07bc65974578184a07ca31e7996665f2b70ae2 Mon Sep 17 00:00:00 2001 From: Francis Dutil Date: Wed, 13 Sep 2017 10:20:54 -0400 Subject: [PATCH 06/18] Using an unexisting queue. 
--- smartdispatch/tests/test_smartdispatch_script.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/smartdispatch/tests/test_smartdispatch_script.py b/smartdispatch/tests/test_smartdispatch_script.py index 6f70297..b0cd29f 100644 --- a/smartdispatch/tests/test_smartdispatch_script.py +++ b/smartdispatch/tests/test_smartdispatch_script.py @@ -17,7 +17,7 @@ def tearDown(self): def test_gpu_check(self): - argv = ['-x', '-g', '2', '-G', '1', '-q', 'gpu_1', 'launch', 'echo', 'testing123'] + argv = ['-x', '-g', '2', '-G', '1', '-C', '1', '-q', 'random', '-t', '10' ,'launch', 'echo', 'testing123'] with self.assertRaises(SystemExit) as context: smartdispatch_script.main(argv=argv) @@ -26,7 +26,7 @@ def test_gpu_check(self): def test_cpu_check(self): - argv = ['-x', '-c', '2', '-C', '1', '-q', 'gpu_1', 'launch', 'echo', 'testing123'] + argv = ['-x', '-c', '2', '-C', '1', '-G', '1', '-t', '10', '-q', 'random', 'launch', 'echo', 'testing123'] with self.assertRaises(SystemExit) as context: smartdispatch_script.main(argv=argv) @@ -37,7 +37,7 @@ def test_cpu_check(self): def test_launch_job_check(self, mock_check_output): mock_check_output.side_effect = subprocess.CalledProcessError(1, 1, "A wild error appeared!") - argv = ['-q', 'gpu_1', 'launch', 'echo', 'testing123'] + argv = ['-t', '0:0:1', '-G', '1', '-C', '1', '-q', 'random', 'launch', 'echo', 'testing123'] try: with self.assertRaises(SystemExit) as context: From 4d76f5ac7fbc7bc6924cfce8c8fca1349b3ee6e5 Mon Sep 17 00:00:00 2001 From: Francis Dutil Date: Wed, 13 Sep 2017 11:05:42 -0400 Subject: [PATCH 07/18] more precise error message. 
--- smartdispatch/smartdispatch_script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartdispatch/smartdispatch_script.py b/smartdispatch/smartdispatch_script.py index 9e4b9c8..a4f78be 100755 --- a/smartdispatch/smartdispatch_script.py +++ b/smartdispatch/smartdispatch_script.py @@ -191,7 +191,7 @@ def main(argv=None): try: launch_jobs(LAUNCHER if args.launcher is None else args.launcher, pbs_filenames, CLUSTER_NAME, path_job) except subprocess.CalledProcessError as e: - sys.stderr.write("smart-dispatch: error: The launcher wasn't able the launch the job(s) properly. Maybe the pbs file(s) generated were invalid: \n{}".format(e.output)) + sys.stderr.write("smart-dispatch: error: The launcher wasn't able the launch the job(s) properly. Maybe the pbs file(s) generated were invalid. The following error message was returned: \n{}".format(e.output)) sys.exit(2) print "\nLogs, command, and jobs id related to this batch will be in:\n {smartdispatch_folder}".format(smartdispatch_folder=path_job) From 1b7ffbb6987a84372713e784f8baf91212df539e Mon Sep 17 00:00:00 2001 From: Francis Dutil Date: Wed, 13 Sep 2017 12:16:10 -0400 Subject: [PATCH 08/18] Puttig all the tests on the script in the same file. 
--- tests/test_smart_dispatch.py | 43 +++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/tests/test_smart_dispatch.py b/tests/test_smart_dispatch.py index 9e514c1..f64c9dd 100644 --- a/tests/test_smart_dispatch.py +++ b/tests/test_smart_dispatch.py @@ -3,11 +3,11 @@ import tempfile import shutil from os.path import join as pjoin, abspath - +from mock import patch from subprocess import call - +import subprocess from nose.tools import assert_true, assert_equal - +from smartdispatch import smartdispatch_script class TestSmartdispatcher(unittest.TestCase): @@ -23,17 +23,23 @@ def setUp(self): self.nb_commands = len(self.commands) scripts_path = abspath(pjoin(os.path.dirname(__file__), os.pardir, "scripts")) - self.smart_dispatch_command = '{} -C 1 -q test -t 5:00 -x'.format(pjoin(scripts_path, 'smart-dispatch')) + self.smart_dispatch_command = '{} -C 1 -G 1 -q test -t 5:00 -x'.format(pjoin(scripts_path, 'smart-dispatch')) self.launch_command = "{0} launch {1}".format(self.smart_dispatch_command, self.folded_commands) self.resume_command = "{0} resume {{0}}".format(self.smart_dispatch_command) - smart_dispatch_command_with_pool = '{} --pool 10 -C 1 -q test -t 5:00 -x {{0}}'.format(pjoin(scripts_path, 'smart-dispatch')) + self.smart_dispatch_launcher_command = '{} -C 1 -G 1 -q test -t 5:00'.format(pjoin(scripts_path, 'smart-dispatch')) + self.launcher_command = "{0} launch {1}".format(self.smart_dispatch_launcher_command, self.folded_commands) + + smart_dispatch_command_with_pool = '{} --pool 10 -C 1 -G 1 -q test -t 5:00 -x {{0}}'.format(pjoin(scripts_path, 'smart-dispatch')) self.launch_command_with_pool = smart_dispatch_command_with_pool.format('launch ' + self.folded_commands) self.nb_workers = 10 - smart_dispatch_command_with_cores = '{} -C 1 -c {{cores}} -q test -t 5:00 -x {{0}}'.format(pjoin(scripts_path, 'smart-dispatch')) + smart_dispatch_command_with_cores = '{} -C 1 -G 1 -c {{cores}} -q test -t 5:00 -x 
{{0}}'.format(pjoin(scripts_path, 'smart-dispatch')) self.launch_command_with_cores = smart_dispatch_command_with_cores.format('launch ' + self.folded_commands, cores='{cores}') + smart_dispatch_command_with_gpus = '{} -C 1 -G 1 -g {{gpus}} -q test -t 5:00 -x {{0}}'.format(pjoin(scripts_path, 'smart-dispatch')) + self.launch_command_with_gpus = smart_dispatch_command_with_gpus.format('launch ' + self.folded_commands, gpus='{gpus}') + self._cwd = os.getcwd() os.chdir(self.testing_dir) @@ -95,6 +101,31 @@ def test_main_launch_with_cores_command(self): assert_equal(exit_status_100, 2) assert_true(os.path.isdir(self.logs_dir)) + def test_main_launch_with_gpus_command(self): + # Actual test + exit_status_100 = call(self.launch_command_with_gpus.format(gpus=100), shell=True) + + # Test validation + assert_equal(exit_status_100, 2) + assert_true(os.path.isdir(self.logs_dir)) + + @patch('subprocess.check_output') + def test_launch_job_check(self, mock_check_output): + + #For this test, we won't call the script directly, since we want to mock subprocess.check_output + mock_check_output.side_effect = subprocess.CalledProcessError(1, 1, "A wild error appeared!") + argv = ['-t', '0:0:1', '-G', '1', '-C', '1', '-q', 'random', 'launch', 'echo', 'testing123'] + + try: + with self.assertRaises(SystemExit) as context: + smartdispatch_script.main(argv=argv) + + self.assertTrue(context.exception.code, 2) + + except subprocess.CalledProcessError: + self.fail("smartdispatch_script.main() raised CalledProcessError unexpectedly!") + + def test_main_resume(self): # Setup call(self.launch_command, shell=True) From 90c4f4d451e8e128ab4090df4af73ca00313781b Mon Sep 17 00:00:00 2001 From: Francis Dutil Date: Thu, 21 Sep 2017 12:37:51 -0400 Subject: [PATCH 09/18] The tests cover more cases and are more explicit. 
--- .../tests/test_smartdispatch_script.py | 41 ++++++++++++++++--- tests/test_smart_dispatch.py | 16 +++++++- 2 files changed, 51 insertions(+), 6 deletions(-) diff --git a/smartdispatch/tests/test_smartdispatch_script.py b/smartdispatch/tests/test_smartdispatch_script.py index b0cd29f..278f3b4 100644 --- a/smartdispatch/tests/test_smartdispatch_script.py +++ b/smartdispatch/tests/test_smartdispatch_script.py @@ -4,7 +4,7 @@ from mock import patch import tempfile as tmp import shutil - +import traceback class TestSmartScript(unittest.TestCase): @@ -17,33 +17,64 @@ def tearDown(self): def test_gpu_check(self): - argv = ['-x', '-g', '2', '-G', '1', '-C', '1', '-q', 'random', '-t', '10' ,'launch', 'echo', 'testing123'] + argv = ['-x', '-g', '2', '-G', '1', '-C', '1', '-q', 'random', '-t', '00:00:10' ,'launch', 'echo', 'testing123'] + # Test if the check fail with self.assertRaises(SystemExit) as context: smartdispatch_script.main(argv=argv) self.assertTrue(context.exception.code, 2) + # Test if the test pass + argv[2] = '0' + + try: + smartdispatch_script.main(argv=argv) + except SystemExit as e: + self.fail("The command failed the check, but it was supposed to pass.") + + def test_cpu_check(self): - argv = ['-x', '-c', '2', '-C', '1', '-G', '1', '-t', '10', '-q', 'random', 'launch', 'echo', 'testing123'] + argv = ['-x', '-c', '2', '-C', '1', '-G', '1', '-t', '00:00:10', '-q', 'random', 'launch', 'echo', 'testing123'] + # Test if the check fail with self.assertRaises(SystemExit) as context: smartdispatch_script.main(argv=argv) self.assertTrue(context.exception.code, 2) + # Test if the test pass + argv[2] = '1' + + try: + smartdispatch_script.main(argv=argv) + except SystemExit as e: + self.fail("The command failed the check, but it was supposed to pass.") + + + @patch('subprocess.check_output') def test_launch_job_check(self, mock_check_output): mock_check_output.side_effect = subprocess.CalledProcessError(1, 1, "A wild error appeared!") argv = ['-t', '0:0:1', '-G', 
'1', '-C', '1', '-q', 'random', 'launch', 'echo', 'testing123'] + # Test if the test fail. try: with self.assertRaises(SystemExit) as context: smartdispatch_script.main(argv=argv) self.assertTrue(context.exception.code, 2) - + except subprocess.CalledProcessError: - self.fail("smartdispatch_script.main() raised CalledProcessError unexpectedly!") + self.fail("smartdispatch_script.main() raised CalledProcessError unexpectedly:\n {}".format(traceback.format_exc())) + + # Test if the test pass (i.e the script run normaly) + mock_check_output.side_effect = None + mock_check_output.return_value = "" + + try: + smartdispatch_script.main(argv=argv) + except SystemExit as e: + self.fail("The launcher had no problem, but the script failed nonetheless.") diff --git a/tests/test_smart_dispatch.py b/tests/test_smart_dispatch.py index f64c9dd..f32f650 100644 --- a/tests/test_smart_dispatch.py +++ b/tests/test_smart_dispatch.py @@ -8,6 +8,7 @@ import subprocess from nose.tools import assert_true, assert_equal from smartdispatch import smartdispatch_script +import traceback class TestSmartdispatcher(unittest.TestCase): @@ -103,9 +104,11 @@ def test_main_launch_with_cores_command(self): def test_main_launch_with_gpus_command(self): # Actual test + exit_status_0 = call(self.launch_command_with_gpus.format(gpus=0), shell=True) exit_status_100 = call(self.launch_command_with_gpus.format(gpus=100), shell=True) # Test validation + assert_equal(exit_status_0, 0) assert_equal(exit_status_100, 2) assert_true(os.path.isdir(self.logs_dir)) @@ -116,6 +119,8 @@ def test_launch_job_check(self, mock_check_output): mock_check_output.side_effect = subprocess.CalledProcessError(1, 1, "A wild error appeared!") argv = ['-t', '0:0:1', '-G', '1', '-C', '1', '-q', 'random', 'launch', 'echo', 'testing123'] + + #Test if the test fail. 
try: with self.assertRaises(SystemExit) as context: smartdispatch_script.main(argv=argv) @@ -123,7 +128,16 @@ def test_launch_job_check(self, mock_check_output): self.assertTrue(context.exception.code, 2) except subprocess.CalledProcessError: - self.fail("smartdispatch_script.main() raised CalledProcessError unexpectedly!") + self.fail("smartdispatch_script.main() raised CalledProcessError unexpectedly:\n {}".format(traceback.format_exc())) + + # Test if the test pass (i.e the script run normaly) + mock_check_output.side_effect = None + mock_check_output.return_value = "" + + try: + smartdispatch_script.main(argv=argv) + except SystemExit as e: + self.fail("The launcher had no problem, but the script failed nonetheless.") def test_main_resume(self): From fba5bf33fcd7ec1770dffdc7716b61276deefd60 Mon Sep 17 00:00:00 2001 From: Francis Dutil Date: Thu, 21 Sep 2017 12:38:26 -0400 Subject: [PATCH 10/18] removing useless lines. --- smartdispatch/smartdispatch_script.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/smartdispatch/smartdispatch_script.py b/smartdispatch/smartdispatch_script.py index a4f78be..c44e7b3 100755 --- a/smartdispatch/smartdispatch_script.py +++ b/smartdispatch/smartdispatch_script.py @@ -199,9 +199,6 @@ def main(argv=None): def parse_arguments(argv=None): - if argv is None: - argv = sys.argv[1:] - parser = argparse.ArgumentParser() parser.add_argument('-q', '--queueName', required=True, help='Queue used (ex: qwork@mp2, qfat256@mp2, gpu_1)') parser.add_argument('-n', '--batchName', required=False, help='The name of the batch. Default: The commands launched.') From 5d99e79fc83fedda3463c8a61de2b59571dd4ac5 Mon Sep 17 00:00:00 2001 From: Francis Dutil Date: Thu, 21 Sep 2017 15:25:39 -0400 Subject: [PATCH 11/18] Raising a new Exception instead with the stack instead of falling the test. 
--- .../tests/test_smartdispatch_script.py | 33 +++++++++++-------- tests/test_smart_dispatch.py | 31 +++++++++-------- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/smartdispatch/tests/test_smartdispatch_script.py b/smartdispatch/tests/test_smartdispatch_script.py index 278f3b4..493df6a 100644 --- a/smartdispatch/tests/test_smartdispatch_script.py +++ b/smartdispatch/tests/test_smartdispatch_script.py @@ -4,7 +4,8 @@ from mock import patch import tempfile as tmp import shutil -import traceback +import six +import sys class TestSmartScript(unittest.TestCase): @@ -57,19 +58,8 @@ def test_cpu_check(self): @patch('subprocess.check_output') def test_launch_job_check(self, mock_check_output): - mock_check_output.side_effect = subprocess.CalledProcessError(1, 1, "A wild error appeared!") argv = ['-t', '0:0:1', '-G', '1', '-C', '1', '-q', 'random', 'launch', 'echo', 'testing123'] - # Test if the test fail. - try: - with self.assertRaises(SystemExit) as context: - smartdispatch_script.main(argv=argv) - - self.assertTrue(context.exception.code, 2) - - except subprocess.CalledProcessError: - self.fail("smartdispatch_script.main() raised CalledProcessError unexpectedly:\n {}".format(traceback.format_exc())) - # Test if the test pass (i.e the script run normaly) mock_check_output.side_effect = None mock_check_output.return_value = "" @@ -77,4 +67,21 @@ def test_launch_job_check(self, mock_check_output): try: smartdispatch_script.main(argv=argv) except SystemExit as e: - self.fail("The launcher had no problem, but the script failed nonetheless.") + self.fail("The launcher had no problem, but the script failed nonetheless.") + + mock_check_output.side_effect = subprocess.CalledProcessError(1, "echo blabla", "A wild error appeared!") + + # Test if the test fail. 
+ try: + with self.assertRaises(SystemExit) as context: + smartdispatch_script.main(argv=argv) + + self.assertTrue(context.exception.code, 2) + + except subprocess.CalledProcessError as e: + # Rerasing the exception + orig_exc_type, orig_exc_value, orig_exc_traceback = sys.exc_info() + + new_exc = Exception("smartdispatch_script.main() raised subprocess.CalledProcessError unexpectedly") + new_exc.reraised = True + six.reraise(type(new_exc), new_exc, orig_exc_traceback) diff --git a/tests/test_smart_dispatch.py b/tests/test_smart_dispatch.py index f32f650..be53d54 100644 --- a/tests/test_smart_dispatch.py +++ b/tests/test_smart_dispatch.py @@ -8,7 +8,8 @@ import subprocess from nose.tools import assert_true, assert_equal from smartdispatch import smartdispatch_script -import traceback +import six +import sys class TestSmartdispatcher(unittest.TestCase): @@ -116,20 +117,8 @@ def test_main_launch_with_gpus_command(self): def test_launch_job_check(self, mock_check_output): #For this test, we won't call the script directly, since we want to mock subprocess.check_output - mock_check_output.side_effect = subprocess.CalledProcessError(1, 1, "A wild error appeared!") argv = ['-t', '0:0:1', '-G', '1', '-C', '1', '-q', 'random', 'launch', 'echo', 'testing123'] - - #Test if the test fail. 
- try: - with self.assertRaises(SystemExit) as context: - smartdispatch_script.main(argv=argv) - - self.assertTrue(context.exception.code, 2) - - except subprocess.CalledProcessError: - self.fail("smartdispatch_script.main() raised CalledProcessError unexpectedly:\n {}".format(traceback.format_exc())) - # Test if the test pass (i.e the script run normaly) mock_check_output.side_effect = None mock_check_output.return_value = "" @@ -139,6 +128,22 @@ def test_launch_job_check(self, mock_check_output): except SystemExit as e: self.fail("The launcher had no problem, but the script failed nonetheless.") + # Test if the check fail + mock_check_output.side_effect = subprocess.CalledProcessError(1, 1, "A wild error appeared!") + + try: + with self.assertRaises(SystemExit) as context: + smartdispatch_script.main(argv=argv) + + self.assertTrue(context.exception.code, 2) + + except subprocess.CalledProcessError: + # Rerasing the exception + orig_exc_type, orig_exc_value, orig_exc_traceback = sys.exc_info() + + new_exc = Exception("smartdispatch_script.main() raised subprocess.CalledProcessError unexpectedly") + new_exc.reraised = True + six.reraise(type(new_exc), new_exc, orig_exc_traceback) def test_main_resume(self): # Setup From 7e357d84070fc77a0ab869124366136931d30b09 Mon Sep 17 00:00:00 2001 From: Francis Dutil Date: Tue, 26 Sep 2017 11:06:11 -0400 Subject: [PATCH 12/18] Removing duplicate test. Putting all the tests that have to do with the script in the same file. 
--- tests/test_smart_dispatch.py | 39 ++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/test_smart_dispatch.py b/tests/test_smart_dispatch.py index be53d54..6d85e84 100644 --- a/tests/test_smart_dispatch.py +++ b/tests/test_smart_dispatch.py @@ -113,6 +113,45 @@ def test_main_launch_with_gpus_command(self): assert_equal(exit_status_100, 2) assert_true(os.path.isdir(self.logs_dir)) + + def test_gpu_check(self): + + argv = ['-x', '-g', '2', '-G', '1', '-C', '1', '-q', 'random', '-t', '00:00:10' ,'launch', 'echo', 'testing123'] + + # Test if the check fail + with self.assertRaises(SystemExit) as context: + smartdispatch_script.main(argv=argv) + + self.assertTrue(context.exception.code, 2) + + # Test if the test pass + argv[2] = '0' + + try: + smartdispatch_script.main(argv=argv) + except SystemExit as e: + self.fail("The command failed the check, but it was supposed to pass.") + + + def test_cpu_check(self): + + argv = ['-x', '-c', '2', '-C', '1', '-G', '1', '-t', '00:00:10', '-q', 'random', 'launch', 'echo', 'testing123'] + + # Test if the check fail + with self.assertRaises(SystemExit) as context: + smartdispatch_script.main(argv=argv) + + self.assertTrue(context.exception.code, 2) + + # Test if the test pass + argv[2] = '1' + + try: + smartdispatch_script.main(argv=argv) + except SystemExit as e: + self.fail("The command failed the check, but it was supposed to pass.") + + @patch('subprocess.check_output') def test_launch_job_check(self, mock_check_output): From c33292b9a12bb0cf2b2eac1eb8985967254a49f4 Mon Sep 17 00:00:00 2001 From: Francis Dutil Date: Tue, 26 Sep 2017 11:06:11 -0400 Subject: [PATCH 13/18] Removing duplicate test. Putting all the tests that have to do with the script in the same file. 
--- .../tests/test_smartdispatch_script.py | 87 ------------------- tests/test_smart_dispatch.py | 39 +++++++++ 2 files changed, 39 insertions(+), 87 deletions(-) delete mode 100644 smartdispatch/tests/test_smartdispatch_script.py diff --git a/smartdispatch/tests/test_smartdispatch_script.py b/smartdispatch/tests/test_smartdispatch_script.py deleted file mode 100644 index 493df6a..0000000 --- a/smartdispatch/tests/test_smartdispatch_script.py +++ /dev/null @@ -1,87 +0,0 @@ -import unittest -from smartdispatch import smartdispatch_script -import subprocess -from mock import patch -import tempfile as tmp -import shutil -import six -import sys - -class TestSmartScript(unittest.TestCase): - - def setUp(self): - self._base_dir = tmp.mkdtemp() - smartdispatch_script.LOGS_FOLDERNAME = self._base_dir - - def tearDown(self): - shutil.rmtree(self._base_dir) - - def test_gpu_check(self): - - argv = ['-x', '-g', '2', '-G', '1', '-C', '1', '-q', 'random', '-t', '00:00:10' ,'launch', 'echo', 'testing123'] - - # Test if the check fail - with self.assertRaises(SystemExit) as context: - smartdispatch_script.main(argv=argv) - - self.assertTrue(context.exception.code, 2) - - # Test if the test pass - argv[2] = '0' - - try: - smartdispatch_script.main(argv=argv) - except SystemExit as e: - self.fail("The command failed the check, but it was supposed to pass.") - - - def test_cpu_check(self): - - argv = ['-x', '-c', '2', '-C', '1', '-G', '1', '-t', '00:00:10', '-q', 'random', 'launch', 'echo', 'testing123'] - - # Test if the check fail - with self.assertRaises(SystemExit) as context: - smartdispatch_script.main(argv=argv) - - self.assertTrue(context.exception.code, 2) - - # Test if the test pass - argv[2] = '1' - - try: - smartdispatch_script.main(argv=argv) - except SystemExit as e: - self.fail("The command failed the check, but it was supposed to pass.") - - - - @patch('subprocess.check_output') - def test_launch_job_check(self, mock_check_output): - - argv = ['-t', '0:0:1', '-G', 
'1', '-C', '1', '-q', 'random', 'launch', 'echo', 'testing123'] - - # Test if the test pass (i.e the script run normaly) - mock_check_output.side_effect = None - mock_check_output.return_value = "" - - try: - smartdispatch_script.main(argv=argv) - except SystemExit as e: - self.fail("The launcher had no problem, but the script failed nonetheless.") - - mock_check_output.side_effect = subprocess.CalledProcessError(1, "echo blabla", "A wild error appeared!") - - # Test if the test fail. - try: - with self.assertRaises(SystemExit) as context: - smartdispatch_script.main(argv=argv) - - self.assertTrue(context.exception.code, 2) - - except subprocess.CalledProcessError as e: - # Rerasing the exception - orig_exc_type, orig_exc_value, orig_exc_traceback = sys.exc_info() - - new_exc = Exception("smartdispatch_script.main() raised subprocess.CalledProcessError unexpectedly") - new_exc.reraised = True - six.reraise(type(new_exc), new_exc, orig_exc_traceback) diff --git a/tests/test_smart_dispatch.py b/tests/test_smart_dispatch.py index be53d54..6d85e84 100644 --- a/tests/test_smart_dispatch.py +++ b/tests/test_smart_dispatch.py @@ -113,6 +113,45 @@ def test_main_launch_with_gpus_command(self): assert_equal(exit_status_100, 2) assert_true(os.path.isdir(self.logs_dir)) + + def test_gpu_check(self): + + argv = ['-x', '-g', '2', '-G', '1', '-C', '1', '-q', 'random', '-t', '00:00:10' ,'launch', 'echo', 'testing123'] + + # Test if the check fail + with self.assertRaises(SystemExit) as context: + smartdispatch_script.main(argv=argv) + + self.assertTrue(context.exception.code, 2) + + # Test if the test pass + argv[2] = '0' + + try: + smartdispatch_script.main(argv=argv) + except SystemExit as e: + self.fail("The command failed the check, but it was supposed to pass.") + + + def test_cpu_check(self): + + argv = ['-x', '-c', '2', '-C', '1', '-G', '1', '-t', '00:00:10', '-q', 'random', 'launch', 'echo', 'testing123'] + + # Test if the check fail + with self.assertRaises(SystemExit) as 
context: + smartdispatch_script.main(argv=argv) + + self.assertTrue(context.exception.code, 2) + + # Test if the test pass + argv[2] = '1' + + try: + smartdispatch_script.main(argv=argv) + except SystemExit as e: + self.fail("The command failed the check, but it was supposed to pass.") + + @patch('subprocess.check_output') def test_launch_job_check(self, mock_check_output): From 94f58d0b441b7b800f75a40c07343b80abaa4b1e Mon Sep 17 00:00:00 2001 From: Francis Dutil Date: Mon, 9 Oct 2017 14:04:56 -0400 Subject: [PATCH 14/18] refactoring the test to catch and reraise the exceptions. --- tests/test_smart_dispatch.py | 58 +++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/tests/test_smart_dispatch.py b/tests/test_smart_dispatch.py index 6d85e84..ddcbe9c 100644 --- a/tests/test_smart_dispatch.py +++ b/tests/test_smart_dispatch.py @@ -10,6 +10,28 @@ from smartdispatch import smartdispatch_script import six import sys +import traceback + +def rethrow_exception(exception, new_message): + + def func_wraper(func): + + def test_func(*args, **kwargs): + try: + return func(*args, **kwargs) + except exception as e: + + orig_exc_type, orig_exc_value, orig_exc_traceback = sys.exc_info() + new_exc = Exception(new_message) + new_exc.reraised = True + new_exc.__cause__ = orig_exc_value + + new_traceback = orig_exc_traceback + six.reraise(type(new_exc), new_exc, new_traceback) + + + return test_func + return func_wraper class TestSmartdispatcher(unittest.TestCase): @@ -113,7 +135,7 @@ def test_main_launch_with_gpus_command(self): assert_equal(exit_status_100, 2) assert_true(os.path.isdir(self.logs_dir)) - + @rethrow_exception(SystemExit, "smartdispatch_script.main() raised SystemExit unexpectedly.") def test_gpu_check(self): argv = ['-x', '-g', '2', '-G', '1', '-C', '1', '-q', 'random', '-t', '00:00:10' ,'launch', 'echo', 'testing123'] @@ -125,14 +147,10 @@ def test_gpu_check(self): self.assertTrue(context.exception.code, 2) # Test if the 
test pass - argv[2] = '0' - - try: - smartdispatch_script.main(argv=argv) - except SystemExit as e: - self.fail("The command failed the check, but it was supposed to pass.") - + argv[2] = '1' + smartdispatch_script.main(argv=argv) + @rethrow_exception(SystemExit, "smartdispatch_script.main() raised SystemExit unexpectedly.") def test_cpu_check(self): argv = ['-x', '-c', '2', '-C', '1', '-G', '1', '-t', '00:00:10', '-q', 'random', 'launch', 'echo', 'testing123'] @@ -145,13 +163,9 @@ def test_cpu_check(self): # Test if the test pass argv[2] = '1' + smartdispatch_script.main(argv=argv) - try: - smartdispatch_script.main(argv=argv) - except SystemExit as e: - self.fail("The command failed the check, but it was supposed to pass.") - - + @rethrow_exception(subprocess.CalledProcessError, "smartdispatch_script.main() raised subprocess.CalledProcessError unexpectedly") @patch('subprocess.check_output') def test_launch_job_check(self, mock_check_output): @@ -170,19 +184,9 @@ def test_launch_job_check(self, mock_check_output): # Test if the check fail mock_check_output.side_effect = subprocess.CalledProcessError(1, 1, "A wild error appeared!") - try: - with self.assertRaises(SystemExit) as context: - smartdispatch_script.main(argv=argv) - - self.assertTrue(context.exception.code, 2) - - except subprocess.CalledProcessError: - # Rerasing the exception - orig_exc_type, orig_exc_value, orig_exc_traceback = sys.exc_info() - - new_exc = Exception("smartdispatch_script.main() raised subprocess.CalledProcessError unexpectedly") - new_exc.reraised = True - six.reraise(type(new_exc), new_exc, orig_exc_traceback) + with self.assertRaises(SystemExit) as context: + smartdispatch_script.main(argv=argv) + self.assertTrue(context.exception.code, 2) def test_main_resume(self): # Setup From 7eb4b84ef5dc7ce462a4e6d52ae2d851d53b04be Mon Sep 17 00:00:00 2001 From: Francis Dutil Date: Mon, 9 Oct 2017 14:05:36 -0400 Subject: [PATCH 15/18] Adding specific advices depending on the cluster we are 
currently on. --- smartdispatch/smartdispatch_script.py | 5 ++++- smartdispatch/utils.py | 23 +++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/smartdispatch/smartdispatch_script.py b/smartdispatch/smartdispatch_script.py index c44e7b3..99fb586 100755 --- a/smartdispatch/smartdispatch_script.py +++ b/smartdispatch/smartdispatch_script.py @@ -191,7 +191,10 @@ def main(argv=None): try: launch_jobs(LAUNCHER if args.launcher is None else args.launcher, pbs_filenames, CLUSTER_NAME, path_job) except subprocess.CalledProcessError as e: - sys.stderr.write("smart-dispatch: error: The launcher wasn't able the launch the job(s) properly. Maybe the pbs file(s) generated were invalid. The following error message was returned: \n{}".format(e.output)) + + cluster_advice = utils.get_advice(CLUSTER_NAME) + + sys.stderr.write("smart-dispatch: error: The launcher wasn't able the launch the job(s) properly. The following error message was returned: \n\n{}\n\nMaybe the pbs file(s) generated were invalid. {}\n\n".format(e.output, cluster_advice)) sys.exit(2) print "\nLogs, command, and jobs id related to this batch will be in:\n {smartdispatch_folder}".format(smartdispatch_folder=path_job) diff --git a/smartdispatch/utils.py b/smartdispatch/utils.py index 9135780..c8e89e6 100644 --- a/smartdispatch/utils.py +++ b/smartdispatch/utils.py @@ -136,3 +136,26 @@ def get_launcher(cluster_name): return "msub" else: return "qsub" + +def get_advice(cluster_name): + + helios_advice = """On Helios, don't forget that the queue gpu_1, gpu_2, gpu_4 and gpu_8 give access to a specific amount of gpus. +For more advices, please refer to the official documentation: 'https://wiki.calculquebec.ca/w/Helios/en'""" + mammouth_advice = "On Mammouth, please refer to the official documentation for more information: 'https://wiki.ccs.usherbrooke.ca/Accueil/en'" + hades_advice = """On Hades, don't forget that the queue name '@hades' needs to be use. 
+For more advices, please refer to the official documentation: 'https://wiki.calculquebec.ca/w/Ex%C3%A9cuter_une_t%C3%A2che/en#tab=tab5'""" + guillimin_advice = """On Guillimin, please refer to the official documentation for more information: 'http://www.hpc.mcgill.ca/index.php/starthere'""" + + if cluster_name == "helios": + return helios_advice + elif cluster_name == 'mammouth': + return mammouth_advice + elif cluster_name == 'hades': + return hades_advice + elif cluster_name == "guillimin": + return guillimin_advice + + return '' + + + From cec39e66cf7cde2fac24c515ac2c16050785be34 Mon Sep 17 00:00:00 2001 From: Francis Dutil Date: Tue, 10 Oct 2017 16:46:43 -0400 Subject: [PATCH 16/18] refactoring the test utils, and testing le script when no gpus are available on the queue. pep8fy some long strings. --- smartdispatch/smartdispatch_script.py | 27 +++++++++-- smartdispatch/utils.py | 66 ++++++++++++++++++++------- tests/test_smart_dispatch.py | 45 ++++++++---------- 3 files changed, 91 insertions(+), 47 deletions(-) diff --git a/smartdispatch/smartdispatch_script.py b/smartdispatch/smartdispatch_script.py index 99fb586..def6a35 100755 --- a/smartdispatch/smartdispatch_script.py +++ b/smartdispatch/smartdispatch_script.py @@ -148,6 +148,13 @@ def main(argv=None): # TODO: use args.memPerNode instead of args.memPerNode queue = Queue(args.queueName, CLUSTER_NAME, args.walltime, args.coresPerNode, args.gpusPerNode, float('inf'), args.modules) + # Change the default value of the gpusPerCommand depending on the value of + if args.gpusPerCommand is None: + if queue.nb_gpus_per_node == 0: + args.gpusPerCommand = 0 + else: + args.gpusPerCommand = 1 + # Check that requested core number does not exceed node total if args.coresPerCommand > queue.nb_cores_per_node: sys.stderr.write("smart-dispatch: error: coresPerCommand exceeds nodes total: asked {req_cores} cores, nodes have {node_cores}\n" @@ -156,8 +163,11 @@ def main(argv=None): # Check that requested gpu number does not 
exceed node total if args.gpusPerCommand > queue.nb_gpus_per_node: - sys.stderr.write("smart-dispatch: error: gpusPerCommand exceeds nodes total: asked {req_gpus} gpus, nodes have {node_gpus}\n" - .format(req_gpus=args.gpusPerCommand, node_gpus=queue.nb_gpus_per_node)) + + error_message = ("smart-dispatch: error: gpusPerCommand exceeds nodes total:" + "asked {req_gpus} gpus, nodes have {node_gpus}. Make sure you have specified the correct queue.\n") + + sys.stderr.write(error_message.format(req_gpus=args.gpusPerCommand, node_gpus=queue.nb_gpus_per_node)) sys.exit(2) @@ -194,7 +204,14 @@ def main(argv=None): cluster_advice = utils.get_advice(CLUSTER_NAME) - sys.stderr.write("smart-dispatch: error: The launcher wasn't able the launch the job(s) properly. The following error message was returned: \n\n{}\n\nMaybe the pbs file(s) generated were invalid. {}\n\n".format(e.output, cluster_advice)) + error_message = ("smart-dispatch: error: The launcher wasn't" + " able the launch the job(s) properly. The" + " following error message was returned: \n\n{}" + "\n\nMaybe the pbs file(s) generated were" + " invalid. {}\n\n") + + + sys.stderr.write(error_message.format(e.output, cluster_advice)) sys.exit(2) print "\nLogs, command, and jobs id related to this batch will be in:\n {smartdispatch_folder}".format(smartdispatch_folder=path_job) @@ -212,7 +229,7 @@ def parse_arguments(argv=None): # parser.add_argument('-M', '--memPerNode', type=int, required=False, help='How much memory there are per node (in Gb).') parser.add_argument('-c', '--coresPerCommand', type=int, required=False, help='How many cores a command needs.', default=1) - parser.add_argument('-g', '--gpusPerCommand', type=int, required=False, help='How many gpus a command needs.', default=1) + parser.add_argument('-g', '--gpusPerCommand', type=int, required=False, help='How many gpus a command needs. 
The value is 1 by default if GPUs are available on the specified queue, 0 otherwise.') # parser.add_argument('-m', '--memPerCommand', type=float, required=False, help='How much memory a command needs (in Gb).') parser.add_argument('-f', '--commandsFile', type=file, required=False, help='File containing commands to launch. Each command must be on a seperate line. (Replaces commandAndOptions)') @@ -241,7 +258,7 @@ def parse_arguments(argv=None): parser.error("Unknown queue, --coresPerNode/--gpusPerNode and --walltime must be set.") if args.coresPerCommand < 1: parser.error("coresPerNode must be at least 1") - + return args diff --git a/smartdispatch/utils.py b/smartdispatch/utils.py index c8e89e6..205a39b 100644 --- a/smartdispatch/utils.py +++ b/smartdispatch/utils.py @@ -2,10 +2,45 @@ import hashlib import unicodedata import json +import sys +import six from distutils.util import strtobool from subprocess import Popen, PIPE +HELIOS_ADVICE = ("On Helios, don't forget that the queue gpu_1, gpu_2, gpu_4" + " and gpu_8 give access to a specific amount of gpus." 
+ "\nFor more advices, please refer to the official" + " documentation 'https://wiki.calculquebec.ca/w/Helios/en'") + +MAMMOUTH_ADVICE = ("On Mammouth, please refer to the official documentation" + " for more information:" + " 'https://wiki.ccs.usherbrooke.ca/Accueil/en'") + +HADES_ADVICE = ("On Hades, don't forget that the queue name '@hades' needs" + " to be used.\nFor more advices, please refer to the" + " official documentation: 'https://wiki.calculquebec.ca/w" + "/Ex%C3%A9cuter_une_t%C3%A2che/en#tab=tab5'") + +GUILLIMIN_ADVICE = ("On Guillimin, please refer to the official documentation" + " for more information: 'http://www.hpc.mcgill.ca/" + "index.php/starthere'") + + +def get_advice(cluster_name): + + if cluster_name == "helios": + return HELIOS_ADVICE + elif cluster_name == 'mammouth': + return MAMMOUTH_ADVICE + elif cluster_name == 'hades': + return HADES_ADVICE + elif cluster_name == "guillimin": + return GUILLIMIN_ADVICE + + return '' + + def jobname_generator(jobname, job_id): '''Crop the jobname to a maximum of 64 characters. Parameters @@ -137,25 +172,24 @@ def get_launcher(cluster_name): else: return "qsub" -def get_advice(cluster_name): - helios_advice = """On Helios, don't forget that the queue gpu_1, gpu_2, gpu_4 and gpu_8 give access to a specific amount of gpus. -For more advices, please refer to the official documentation: 'https://wiki.calculquebec.ca/w/Helios/en'""" - mammouth_advice = "On Mammouth, please refer to the official documentation for more information: 'https://wiki.ccs.usherbrooke.ca/Accueil/en'" - hades_advice = """On Hades, don't forget that the queue name '@hades' needs to be use. 
-For more advices, please refer to the official documentation: 'https://wiki.calculquebec.ca/w/Ex%C3%A9cuter_une_t%C3%A2che/en#tab=tab5'""" - guillimin_advice = """On Guillimin, please refer to the official documentation for more information: 'http://www.hpc.mcgill.ca/index.php/starthere'""" +def rethrow_exception(exception, new_message): - if cluster_name == "helios": - return helios_advice - elif cluster_name == 'mammouth': - return mammouth_advice - elif cluster_name == 'hades': - return hades_advice - elif cluster_name == "guillimin": - return guillimin_advice + def func_wraper(func): - return '' + def test_func(*args, **kwargs): + try: + return func(*args, **kwargs) + except exception as e: + + orig_exc_type, orig_exc_value, orig_exc_traceback = sys.exc_info() + new_exc = Exception(new_message) + new_exc.reraised = True + new_exc.__cause__ = orig_exc_value + new_traceback = orig_exc_traceback + six.reraise(type(new_exc), new_exc, new_traceback) + return test_func + return func_wraper diff --git a/tests/test_smart_dispatch.py b/tests/test_smart_dispatch.py index ddcbe9c..6318629 100644 --- a/tests/test_smart_dispatch.py +++ b/tests/test_smart_dispatch.py @@ -9,29 +9,7 @@ from nose.tools import assert_true, assert_equal from smartdispatch import smartdispatch_script import six -import sys -import traceback - -def rethrow_exception(exception, new_message): - - def func_wraper(func): - - def test_func(*args, **kwargs): - try: - return func(*args, **kwargs) - except exception as e: - - orig_exc_type, orig_exc_value, orig_exc_traceback = sys.exc_info() - new_exc = Exception(new_message) - new_exc.reraised = True - new_exc.__cause__ = orig_exc_value - - new_traceback = orig_exc_traceback - six.reraise(type(new_exc), new_exc, new_traceback) - - - return test_func - return func_wraper +from smartdispatch import utils class TestSmartdispatcher(unittest.TestCase): @@ -135,7 +113,7 @@ def test_main_launch_with_gpus_command(self): assert_equal(exit_status_100, 2) 
assert_true(os.path.isdir(self.logs_dir)) - @rethrow_exception(SystemExit, "smartdispatch_script.main() raised SystemExit unexpectedly.") + @utils.rethrow_exception(SystemExit, "smartdispatch_script.main() raised SystemExit unexpectedly.") def test_gpu_check(self): argv = ['-x', '-g', '2', '-G', '1', '-C', '1', '-q', 'random', '-t', '00:00:10' ,'launch', 'echo', 'testing123'] @@ -150,7 +128,22 @@ def test_gpu_check(self): argv[2] = '1' smartdispatch_script.main(argv=argv) - @rethrow_exception(SystemExit, "smartdispatch_script.main() raised SystemExit unexpectedly.") + # Test if we don't have gpus. (and spicified in script). + argv[2] = '0' + argv[4] = '0' + smartdispatch_script.main(argv=argv) + + # Don't have gpus, but the user specofy 1 anyway. + argv[2] = '1' + with self.assertRaises(SystemExit) as context: + smartdispatch_script.main(argv=argv) + self.assertTrue(context.exception.code, 2) + + # Test if the user didn't specified anything. + argv = ['-x', '-C', '1', '-q', 'random', '-t', '00:00:10' ,'launch', 'echo', 'testing123'] + smartdispatch_script.main(argv=argv) + + @utils.rethrow_exception(SystemExit, "smartdispatch_script.main() raised SystemExit unexpectedly.") def test_cpu_check(self): argv = ['-x', '-c', '2', '-C', '1', '-G', '1', '-t', '00:00:10', '-q', 'random', 'launch', 'echo', 'testing123'] @@ -165,7 +158,7 @@ def test_cpu_check(self): argv[2] = '1' smartdispatch_script.main(argv=argv) - @rethrow_exception(subprocess.CalledProcessError, "smartdispatch_script.main() raised subprocess.CalledProcessError unexpectedly") + @utils.rethrow_exception(subprocess.CalledProcessError, "smartdispatch_script.main() raised subprocess.CalledProcessError unexpectedly") @patch('subprocess.check_output') def test_launch_job_check(self, mock_check_output): From 3b27597bce33f08180caae6c49cad91ff37524e7 Mon Sep 17 00:00:00 2001 From: Francis Dutil Date: Tue, 17 Oct 2017 12:02:08 -0400 Subject: [PATCH 17/18] Correcting some typo and use better decorator helper 
functions. --- smartdispatch/utils.py | 2 ++ tests/test_smart_dispatch.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/smartdispatch/utils.py b/smartdispatch/utils.py index 205a39b..7096224 100644 --- a/smartdispatch/utils.py +++ b/smartdispatch/utils.py @@ -4,6 +4,7 @@ import json import sys import six +import functools from distutils.util import strtobool from subprocess import Popen, PIPE @@ -177,6 +178,7 @@ def rethrow_exception(exception, new_message): def func_wraper(func): + @functools.wraps(func) def test_func(*args, **kwargs): try: return func(*args, **kwargs) diff --git a/tests/test_smart_dispatch.py b/tests/test_smart_dispatch.py index 6318629..60c4725 100644 --- a/tests/test_smart_dispatch.py +++ b/tests/test_smart_dispatch.py @@ -128,12 +128,12 @@ def test_gpu_check(self): argv[2] = '1' smartdispatch_script.main(argv=argv) - # Test if we don't have gpus. (and spicified in script). + # Test if we don't have gpus. (and specified in script). argv[2] = '0' argv[4] = '0' smartdispatch_script.main(argv=argv) - # Don't have gpus, but the user specofy 1 anyway. + # Don't have gpus, but the user specify 1 anyway. argv[2] = '1' with self.assertRaises(SystemExit) as context: smartdispatch_script.main(argv=argv) From 71af7a33e884c9bbf2b22f3d268732d5fe179c6c Mon Sep 17 00:00:00 2001 From: Francis Dutil Date: Tue, 17 Oct 2017 13:54:28 -0400 Subject: [PATCH 18/18] Adding some comments to make the tests easier to understand. --- tests/test_smart_dispatch.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_smart_dispatch.py b/tests/test_smart_dispatch.py index 60c4725..b46727e 100644 --- a/tests/test_smart_dispatch.py +++ b/tests/test_smart_dispatch.py @@ -125,16 +125,16 @@ def test_gpu_check(self): self.assertTrue(context.exception.code, 2) # Test if the test pass - argv[2] = '1' + argv[2] = '1' # -g 1 smartdispatch_script.main(argv=argv) # Test if we don't have gpus. (and specified in script). 
- argv[2] = '0' - argv[4] = '0' + argv[2] = '0' # -g 0 + argv[4] = '0' # -G 0 smartdispatch_script.main(argv=argv) - # Don't have gpus, but the user specify 1 anyway. - argv[2] = '1' + # Don't have gpus, but the user specified 1 anyway. + argv[2] = '1' # -g 1 with self.assertRaises(SystemExit) as context: smartdispatch_script.main(argv=argv) self.assertTrue(context.exception.code, 2) @@ -155,7 +155,7 @@ def test_cpu_check(self): self.assertTrue(context.exception.code, 2) # Test if the test pass - argv[2] = '1' + argv[2] = '1'# -c 1 smartdispatch_script.main(argv=argv) @utils.rethrow_exception(subprocess.CalledProcessError, "smartdispatch_script.main() raised subprocess.CalledProcessError unexpectedly")