Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,12 @@ submit_command = /usr/bin/sbatch

`submit_command` is the full path to the Slurm job submission command used for submitting batch jobs. You may want to verify if `sbatch` is provided at that path or determine its actual location (using `which sbatch`).

```ini
cancel_command = /usr/bin/scancel
```

`cancel_command` is the full path to the Slurm command used for cancelling batch jobs. You may want to verify if `scancel` is provided at that path or determine its actual location (using `which scancel`).

```ini
build_permission = -NOT_ALLOWED_GH_ACCOUNT_NAME- [...]
```
Expand All @@ -560,11 +566,11 @@ name on GitHub. Thus, one could not - by accident - give build permissions to an
unknown account.

```ini
no_build_permission_comment = The `bot: build ...` command has been used by user `{build_labeler}`, but this person does not have permission to trigger builds.
no_build_permission_comment = GH account `{build_labeler}` is not authorized to trigger or cancel build jobs.
```

`no_build_permission_comment` defines a comment (template) that is used when
the account trying to trigger build jobs has no permission to do so.
the account trying to trigger or cancel build jobs has no permission to do so.

```ini
allow_update_submit_opts = false
Expand Down
5 changes: 4 additions & 1 deletion app.cfg.example
Original file line number Diff line number Diff line change
Expand Up @@ -155,13 +155,16 @@ slurm_params = --hold
# full path to the job submission command
submit_command = /usr/bin/sbatch

# full path to the job cancellation command
cancel_command = /usr/bin/scancel

# defines which GitHub accounts have the permission to trigger build jobs
# (and to cancel build jobs they started), i.e., for which accounts the bot
# acts on `bot: build ...` and `bot: cancel ...` commands. If the value is
# left empty, everyone can trigger build jobs.
build_permission = -NOT_ALLOWED_GH_ACCOUNT_NAME-

# template for comment when a GitHub account has no permission to trigger or cancel build jobs
no_build_permission_comment = Label `bot:build` has been set by user `{build_labeler}`, but this person does not have permission to trigger builds
no_build_permission_comment = GH account `{build_labeler}` is not authorized to trigger or cancel build jobs.

# whether or not to allow updating the submit options via custom module det_submit_opts
# Should only be enabled (true) with care because this will result in code from the target
Expand Down
63 changes: 60 additions & 3 deletions eessi_bot_event_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@

# Local application imports (anything from EESSI/eessi-bot-software-layer)
from connections import github
from tasks.build import check_build_permission, get_node_types, request_bot_build_issue_comments, \
submit_build_jobs
from tasks.build import cancel_jobs, check_build_permission, get_job_ids, get_node_types, \
get_work_dirs, request_bot_build_issue_comments, submit_build_jobs
from tasks.deploy import deploy_built_artefacts, determine_job_dirs
from tasks.clean_up import move_to_trash_bin
from tools import config
Expand All @@ -53,6 +53,7 @@
config.BUILDENV_SETTING_BUILD_JOB_SCRIPT, # required
config.BUILDENV_SETTING_BUILD_LOGS_DIR, # optional+recommended
config.BUILDENV_SETTING_BUILD_PERMISSION, # optional+recommended
config.BUILDENV_SETTING_CANCEL_COMMAND, # required
config.BUILDENV_SETTING_CONTAINER_CACHEDIR, # optional+recommended
# config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA, # optional
# config.BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS, # optional
Expand Down Expand Up @@ -102,6 +103,7 @@
# the poll interval setting is required for the alternative job handover
# protocol (delayed_begin)
config.SECTION_JOB_MANAGER: [
config.JOB_MANAGER_SETTING_POLL_COMMAND, # required
config.JOB_MANAGER_SETTING_POLL_INTERVAL], # required
config.SECTION_REPO_TARGETS: [
config.REPO_TARGETS_SETTING_REPOS_CFG_DIR], # required
Expand Down Expand Up @@ -507,7 +509,7 @@ def handle_bot_command_help(self, event_info, bot_command):
help_msg += "\n - Commands must be sent with a **new** comment (edits of existing comments are ignored)."
help_msg += "\n - A comment may contain multiple commands, one per line."
help_msg += "\n - Every command begins at the start of a line and has the syntax `bot: COMMAND [ARGUMENTS]*`"
help_msg += "\n - Currently supported COMMANDs are: `help`, `build`, `show_config`, `status`"
help_msg += "\n - Currently supported COMMANDs are: `help`, `build`, `show_config`, `status`, `cancel`"
Comment thread
trz42 marked this conversation as resolved.
help_msg += "\n"
help_msg += "\n For more information, see https://www.eessi.io/docs/bot"
return help_msg
Expand Down Expand Up @@ -679,6 +681,61 @@ def handle_bot_command_status(self, event_info, bot_command):
else:
return "\n - failed to create status comment"

def handle_bot_command_cancel(self, event_info, bot_command):
    """
    Handles bot command 'cancel' by parsing 'jobid:' arguments and
    cancelling the jobs.

    The command is processed in four steps, each of which may end the
    handling early with an explanatory message:
      1. check that the commenting account has build permission,
      2. extract the valid job IDs from the command's 'jobid:' arguments,
      3. look up the working directories of those jobs,
      4. cancel the jobs that were found.

    Args:
        event_info (dict): event received by event_handler
        bot_command (EESSIBotCommand): command to be handled

    Returns:
        comment (string): list of cancelled jobs if any, error message if not
    """
    self.log("processing bot command 'cancel'")

    request_body = event_info["raw_request_body"]
    repo_name = request_body["repository"]["full_name"]
    pr_number = request_body["issue"]["number"]
    user = request_body["comment"]["user"]["login"]

    gh = github.get_instance()
    pr = gh.get_repo(repo_name).get_pull(pr_number)

    # Jobs can only be cancelled by the user who submitted the job
    # -> No need to proceed if user cannot submit jobs
    if not check_build_permission(pr, event_info):
        self.log(f"User '{user}' does not have build permission - skipping cancellation.")
        return f"\n - User `{user}` cannot submit or cancel build jobs."

    # Get valid 'jobid:' arguments
    job_ids = get_job_ids(bot_command.action_filters)
    if not job_ids:
        self.log("Got no valid job IDs")
        return "\n - No valid job IDs were given."

    # Get working directories of jobs; jobs without an entry in the result
    # could not be looked up and are therefore not cancelled
    work_dirs = get_work_dirs(job_ids, self.cfg)
    if not work_dirs:
        self.log("None of the given jobs are cancellable")
        return "\n - No cancellable jobs were given."

    # Log skipped jobs (via self.log, like every other message of this handler)
    for job_id in job_ids:
        if job_id not in work_dirs:
            self.log(f"Skipping job {job_id} - not found")

    # Cancel jobs; cancel_jobs returns the IDs it actually cancelled
    cancelled_jobs = cancel_jobs(work_dirs, user, pr, self.cfg)
    if not cancelled_jobs:
        return "\n - No jobs were cancelled."

    return "".join(f"\n - cancelled job `{job_id}`" for job_id in cancelled_jobs)

def start(self, app, port=3000):
"""
Logs startup information to shell and log file and starts the app using
Expand Down
134 changes: 131 additions & 3 deletions tasks/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
# Local application imports (anything from EESSI/eessi-bot-software-layer)
from tools import config, cvmfs_repository, job_metadata, pr_comments, run_cmd
import tools.filter as tools_filter
from tools.pr_comments import ChatLevels, create_comment
from tools.pr_comments import ChatLevels, create_comment, update_comment
from tools.build_params import BUILD_PARAM_ARCH, BUILD_PARAM_ACCEL

# defaults (used if not specified via, eg, 'app.cfg')
Expand All @@ -51,7 +51,9 @@
# other constants
EXPORT_VARS_FILE = 'export_vars.sh'

Job = namedtuple('Job', ('working_dir', 'arch_target', 'repo_id', 'slurm_opts', 'year_month', 'pr_id', 'accelerator'))

Job = namedtuple('Job',
('working_dir', 'arch_target', 'repo_id', 'slurm_opts', 'year_month', 'pr_id', 'accelerator', 'owner'))

# global repo_cfg
repo_cfg = {}
Expand Down Expand Up @@ -108,6 +110,10 @@ def get_build_env_cfg(cfg):
log(f"{fn}(): submit_command '{submit_command}'")
config_data[config.BUILDENV_SETTING_SUBMIT_COMMAND] = submit_command

cancel_command = buildenv.get(config.BUILDENV_SETTING_CANCEL_COMMAND)
log(f"{fn}(): cancel_command '{cancel_command}'")
config_data[config.BUILDENV_SETTING_CANCEL_COMMAND] = cancel_command

job_handover_protocol = buildenv.get(config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL)
slurm_params = buildenv.get(config.BUILDENV_SETTING_SLURM_PARAMS)
if job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_HOLD_RELEASE:
Expand Down Expand Up @@ -582,6 +588,8 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params):
base_branch_name = pr.base.ref
log(f"{fn}(): pr.base.repo.ref '{base_branch_name}'")

job_owner = event_info['raw_request_body']['sender']['login']

# create run dir (base directory for potentially several jobs)
# TODO may still be too early (before we get to any actual job being
# prepared below when calling 'download_pr')
Expand Down Expand Up @@ -689,7 +697,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params):

# enlist jobs to proceed
job = Job(job_dir, partition_info['cpu_subdir'], repo_id, partition_info['slurm_params'], year_month,
pr_id, accelerator)
pr_id, accelerator, job_owner)
jobs.append(job)

log(f"{fn}(): {len(jobs)} jobs to proceed after applying white list")
Expand Down Expand Up @@ -1358,3 +1366,123 @@ def request_bot_build_issue_comments(repo_name, pr_number):
if len(comments) != 100:
break
return status_table


def get_job_ids(action_filter):
    """
    Gets and validates 'jobid:' arguments.

    A job ID is accepted only if it consists solely of the ASCII digits 0-9
    and denotes a positive number. Relying on int() alone is too lenient:
    it also parses values such as '1_2', ' 12 ', '+5' or non-ASCII digits,
    and the accepted string is later interpolated verbatim into job manager
    command lines.

    Args:
        action_filter (EESSIBotActionFilter): Instance containing 'jobid:' arguments

    Returns:
        job_ids (list): valid 'jobid:' arguments (as strings, in the order given)
    """
    fn = sys._getframe().f_code.co_name

    # Get 'jobid:' arguments
    job_filter = action_filter.get_filter_by_component(tools_filter.FILTER_COMPONENT_JOBID)
    if not job_filter:
        log(f"{fn}(): 'bot: cancel' command needs at least one 'jobid:' argument.")
        return []

    # Validate job IDs
    job_ids = []
    for job_id in job_filter:
        candidate = str(job_id)
        only_ascii_digits = bool(candidate) and all(char in "0123456789" for char in candidate)
        # int() cannot fail here because candidate contains nothing but digits
        if only_ascii_digits and int(candidate) > 0:
            job_ids.append(candidate)
        else:
            log(f"{fn}(): Invalid job ID: '{job_id}'")

    return job_ids


def get_work_dirs(job_ids, cfg):
    """
    Gets working directories of build jobs.

    Runs the configured poll command restricted to the given job IDs and
    parses its output. Jobs that the poll command does not report are
    simply absent from the result.

    Args:
        job_ids (list): list of job_ids to check.
        cfg (ConfigParser): Instance containing full configuration from app.cfg

    Returns:
        work_dirs (dict): dict mapping each job_id to its work_dir
    """
    fn = sys._getframe().f_code.co_name

    # Nothing to look up -> do not run the poll command with an empty job list
    if not job_ids:
        return {}

    poll_command = cfg[config.SECTION_JOB_MANAGER][config.JOB_MANAGER_SETTING_POLL_COMMAND]

    # squeue only the given job IDs
    cs_jobs = ",".join(job_ids)
    command_line = f"{poll_command} --noheader --Format=JobId:0@,WorkDir:0 --job={cs_jobs}"
    # raise_on_error=False (as used when cancelling jobs): a failing poll
    # command is logged here and whatever output exists is still parsed
    out, err, exit_code = run_cmd(command_line, "Get WorkDirs of jobs", raise_on_error=False)
    if exit_code != 0:
        log(f"{fn}(): poll command exited with code {exit_code}: {err}")

    # All output lines are formatted as '{job_id}@{work_dir}'
    work_dirs = {}
    for line in out.split("\n"):
        # split at the first '@' only, so that a working directory which
        # itself contains '@' is kept instead of being dropped
        job_id, separator, work_dir = line.partition("@")
        if not separator:
            continue
        work_dirs[job_id.strip()] = work_dir.strip()

    return work_dirs


def cancel_jobs(jobs, user, pr, cfg):
    """
    Cancels the given build jobs.

    For each job the owner and the PR comment ID are read from the job's
    metadata file in its working directory. A job is only cancelled if its
    recorded owner equals the given user. After a successful cancellation
    the job's status comment in the pull request is updated.

    Args:
        jobs (dict): dictionary mapping each job_id to cancel to its work_dir
        user (str): The user who sent the 'bot: cancel' command
        pr (github.PullRequest.PullRequest): instance representing the pull request
        cfg (ConfigParser): Instance containing full configuration from app.cfg

    Returns:
        cancelled_jobs (list): job_ids of successfully cancelled jobs
    """
    fn = sys._getframe().f_code.co_name

    buildenv = get_build_env_cfg(cfg)
    cancel_command = buildenv[config.BUILDENV_SETTING_CANCEL_COMMAND]

    cancelled_jobs = []
    for job_id, work_dir in jobs.items():
        # Get job owner and PR comment ID from metadata
        metadata_path = os.path.join(work_dir, f"_bot_job{job_id}.metadata")
        metadata = job_metadata.get_section_from_file(
            filepath=metadata_path,
            section=job_metadata.JOB_PR_SECTION,
        )
        # Without metadata the job owner is unknown, so ownership cannot be
        # verified (presumably the file or section is missing - TODO confirm)
        if not metadata:
            log(f"{fn}(): No metadata found in '{metadata_path}' for job {job_id} - skipping cancellation")
            continue
        job_owner = metadata.get(job_metadata.JOB_PR_JOB_OWNER)
        pr_comment_id = metadata.get(job_metadata.JOB_PR_PR_COMMENT_ID)

        # Only the job owner should be able to cancel a job
        if job_owner != user:
            log(f"{fn}(): User {user} did not start job {job_id} - skipping cancellation")
            continue
        log(f"{fn}(): Job {job_id} was started by user {user} - cancelling job")

        # Cancel job
        command_line = f"{cancel_command} --verbose {job_id}"
        out, err, exit_code = run_cmd(command_line, f"cancel job {job_id}", raise_on_error=False)

        # Check if command was successful
        if exit_code != 0:
            log(f"{fn}(): scancel resulted in a non-zero exit code for job {job_id}.")
            continue
        if any(line.startswith("scancel: error: ") for line in err.split("\n")):
            log(f"{fn}(): Unable to cancel job {job_id}.")
            continue

        log(f"{fn}(): Cancelled job {job_id}")

        # Update job status table. The job is already cancelled at this
        # point, so an unusable comment ID must not abort the loop or hide
        # the cancellation from the caller.
        try:
            comment_id = int(pr_comment_id)
        except (TypeError, ValueError):
            log(f"{fn}(): No valid PR comment ID for job {job_id} ('{pr_comment_id}') - status table not updated")
        else:
            dt = datetime.now(timezone.utc)
            update = f"\n|{dt.strftime('%b %d %X %Z %Y')}|finished|job id `{job_id}` was cancelled|"
            update_comment(comment_id, pr, update)

        cancelled_jobs.append(job_id)

    return cancelled_jobs
1 change: 1 addition & 0 deletions tests/test_bot_job123.metadata
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
repo = test_repo
pr_number = 999
pr_comment_id = 77
job_owner = user01

Loading