These commands result in remote procedure calls (RPCs) to the primary Slurm controller. This controller is not set up the way a web server would be and can easily become overwhelmed by too many RPCs. When this happens, Slurm will begin killing jobs. Do not use these commands to poll the real-time state of the cluster — something as seemingly innocent as `watch squeue`
(or similar with any of these commands) can cause jobs to be killed!
Some useful slurm/squeue tools to add to your .bashrc
# List unique `who` output
# Prints each currently logged-in username exactly once, sorted.
unique_users() {
  # Extract the username column first, then dedupe in one pass. The original
  # piled up redundant passes (sort -u on whole lines, then cut, then uniq);
  # cut | sort -u produces the same sorted unique usernames directly.
  who | cut -f 1 -d " " | sort -u
}
# Get account group GPU limits
# Output: one "<account> <gpu-cap>" line per association that carries a gres limit.
account_caps(){
  # sacctmgr -nop: parseable (pipe-separated), no header. Splitting on '|' and
  # '=' turns "lab|cpu=80,gres/gpu=10|" into fields; the awk keeps only lines
  # mentioning gres and prints account plus the value after the first '='.
  sacctmgr -nop show assoc format=account,grptres | awk -F"[|=]" '/gres/ {print $1" "$3}'
}
# Compute total number of GPUs
# Sums the gres GPU counts across every node known to Slurm.
total_gpus(){
  # Field 9 of each one-line node record is assumed to hold the Gres value
  # (e.g. "Gres=gpu:8" — TODO confirm field position against scontrol output);
  # the second colon-separated piece of each gpu line is the count.
  scontrol show nodes -o | awk '{print $9}' | awk -F: '/gpu/ {sum += $2} END {print sum}'
}
# Compute maximum number of GPUs available through slurm
# Same sum as total_gpus, but skipping nodes whose record mentions DOWN or DRAIN.
avail_gpus(){
  scontrol show nodes -o | awk '/DOWN|DRAIN/ {next} {print $9}' | awk -F: '/gpu/ {sum += $2} END {print sum}'
}
# Compute number of GPUs in use using squeue.
# No args: prints an "IN-USE / AVAILABLE / TOTAL" summary for the cluster.
# One arg: prints the GPU count for squeue lines matching that string (user).
srun_gpus() {
  if [ $# -eq 0 ]; then
    echo "IN-USE / AVAILABLE / TOTAL"
    AVAIL_GPUS=$(avail_gpus)
    TOTAL_GPUS=$(total_gpus)
    # %b is the job's gres request ("gpu:4" or typed, "gpu:a100:4"). Strip the
    # "gpu:" prefix and any "type:" component so only the count remains.
    # Fix: the old class [a-zA-Z_:$]* had no digits, so "gpu:a100:4" lost only
    # "gpu:a" and 100 was summed instead of 4.
    squeue -o '%b %t' | grep gpu | grep R | sed 's/gpu:\([a-zA-Z0-9_]*:\)*//g' | awk '{i += $1;} END {printf "%d",i}'; echo " / $AVAIL_GPUS / $TOTAL_GPUS"
  elif [ $# -eq 1 ]; then
    # Quote "$1" so patterns with spaces/globs don't word-split.
    squeue -o '%b %t %u' | grep gpu | grep "$1" | grep R | sed 's/gpu:\([a-zA-Z0-9_]*:\)*//g' | awk '{i += $1;} END {print i}'
  else
    echo "srun_gpus does not accept more than 1 argument" >&2
  fi
}
# Compute number of GPUs which are in line to be scheduled (running + pending).
# No args: cluster-wide count. One arg: count for matching squeue lines (user).
scheduled_gpus() {
  if [ $# -eq 0 ]; then
    # Fix: 's/gpu://g' left typed gres as "a100:4", which awk coerced to 0;
    # strip "gpu:" plus any "type:" component so only the count remains.
    squeue -o '%b %t' | grep gpu | grep "R\|PD" | sed 's/gpu:\([a-zA-Z0-9_]*:\)*//g' | awk '{i += $1;} END {print i}'
  elif [ $# -eq 1 ]; then
    squeue -o '%b %t %u' | grep gpu | grep "$1" | grep "R\|PD" | sed 's/gpu:\([a-zA-Z0-9_]*:\)*//g' | awk '{i += $1;} END {print i}'
  else
    # Fix: the original message named srun_gpus (copy-paste error).
    echo "scheduled_gpus does not accept more than 1 argument" >&2
  fi
}
# Alternate name for scheduled_gpus.
alias gpus_scheduled=scheduled_gpus
# GPU Usage by user
# One "<user> <gpus>" line per user; gpus = gres count * node count (%D).
usage_by_user() {
  # Input lines: "<user> <state> gpu[:type]:<count> <nodes>", sorted so each
  # user's jobs are adjacent; awk groups by user and sums count*nodes.
  # Fix: guard on name != "" so no spurious empty line is printed before the
  # first group (the original always emitted one blank-ish first line).
  squeue -o "%u %t %b %D" -h | grep gpu | grep R | sort | awk -F'[ :]' '{if(name==$1){ count += $(NF) * $(NF - 1);}else{ if (name != "") print name" "count; name=$1; count=$(NF) * $(NF - 1);}} END{ if (name != "") print name" "count;}'
}
# GPU Usage by lab
# One "<account> <gpus>" line per lab; gpus = gres count * node count (%D).
usage_by_lab() {
  # Same grouping as usage_by_user, keyed on the account (%a) instead.
  # Fix: guard on name != "" so no spurious empty first line is emitted —
  # important because lab_use feeds this output straight into join(1).
  squeue -o "%a %t %b %D" -h | grep gpu | grep R | sort | awk -F'[ :]' '{if(name==$1){ count += $(NF) * $(NF - 1);}else{ if (name != "") print name" "count; name=$1; count=$(NF) * $(NF - 1);}} END{ if (name != "") print name" "count;}'
}
#GPUs scheduled by lab
scheduled_gpus_by_lab() {
# One "<account> <gpus>" line per lab, counting running (R) and pending (PD)
# jobs. The sed strips "gpu:" plus any type name (so typed gres gpu:a100:4
# becomes 4); the awk groups adjacent sorted lines by account; tail -n +2
# drops the empty group the awk prints before the first account is seen.
squeue --format="%a %b %t" | grep gpu | grep "R\|PD" | sed 's/gpu[:A-Za-z0-9_]*://g' |sort | awk -F'[ :]' '{if(name==$1){ count+=$2;}else{ print name" "count; name=$1; count=$2;}} END{print name" "count;}' | tail -n +2
}
# Automatically generate the node list for slurm's --exclude argument by
# filtering out nodes whose CPU load average exceeds a threshold.
# The threshold defaults to 28 (the per-machine core count) but can now be
# overridden: filter_cpu_avail_nodes [threshold]   (backward compatible).
filter_cpu_avail_nodes() {
  local threshold="${1:-28}"
  # NR>1 skips the sinfo header row; "+0" forces numeric comparison so awk
  # does not fall back to string comparison against the -v variable.
  sinfo --format '%10n %8O' | awk -v thr="$threshold" 'NR > 1 && ($2 + 0) > (thr + 0) {print $1}' | paste -s -d, -
}
# An example of blacklisted nodes — a comma-separated list in the format
# accepted by srun/sbatch --exclude (should be changed to your preference).
export BLACK_LIST_NODES="hal,calculon"
# Emit "--exclude <node,node,...>" combining the static $BLACK_LIST_NODES
# blacklist with any currently-overloaded nodes; prints nothing when both
# lists are empty.
exclude_list() {
  local exclude_str="$BLACK_LIST_NODES"
  local filt_nodes
  filt_nodes=$(filter_cpu_avail_nodes)
  if [ -n "$filt_nodes" ]; then
    if [ -z "$exclude_str" ]; then
      exclude_str="$filt_nodes"
    else
      exclude_str="$exclude_str,$filt_nodes"
    fi
  fi
  if [ -n "$exclude_str" ]; then
    printf -- '--exclude '
    # Dedupe: one node per line, sort -u, rejoin with commas.
    # Fix: use printf '%s\n' "$var" — the original "printf $exclude_str"
    # treated node names as the printf FORMAT string (breaks on '%') and
    # errored with zero arguments when the list was empty.
    printf '%s\n' "$exclude_str" | tr ',' '\n' | sort -u | paste -s -d, -
  fi
}
# Short names for the reporting helpers above.
alias gpus_running=srun_gpus
alias gpus_users=usage_by_user
alias gpus_labs=usage_by_lab
# Per-node hostname, CPU load (%O) and free memory (%e) — see sinfo(1).
alias sload="sinfo --format '%10n %8O %e'"
# Compute total number of free GPUs:
# schedulable GPUs (avail_gpus) minus GPUs running or pending (scheduled_gpus).
free_gpus(){
  local avail sched
  avail=$(avail_gpus)
  # Fix: call scheduled_gpus directly — the gpus_scheduled alias is not
  # expanded inside functions in non-interactive shells.
  sched=$(scheduled_gpus)
  # Default empty results to 0 so $(( )) does not raise a syntax error when
  # either helper prints nothing (e.g. no gpu lines at all).
  echo $(( ${avail:-0} - ${sched:-0} ))
}
# Get per lab usage and their associated caps
lab_use(){
# Joins three reports on the account name in column 1:
#   usage_by_lab          -> GPUs currently running per lab
#   scheduled_gpus_by_lab -> GPUs running+pending per lab
#   account_caps          -> per-account GrpTRES GPU cap
# then formats them as "Account   Used / Sched / Cap".
join <(usage_by_lab) <(scheduled_gpus_by_lab) | join - <(account_caps) | awk 'BEGIN {print "Account \t Used / Sched / Cap"} {print $1" \t "$2" / " $3 " / " $4}'
}
The same set of commands, for the Fish shell. Add them to ~/.config/fish/conf.d/ (e.g. init.fish or omf.fish):
# List unique `who` output
# Prints each currently logged-in username exactly once, sorted.
function unique_users
    # Extract the username column first, then dedupe in one pass (the original
    # ran redundant sort/uniq passes around the cut).
    who | cut -f 1 -d " " | sort -u
end
# Get account group GPU limits
# Output: one "<account> <gpu-cap>" line per association with a gres limit.
function account_caps
    # Merge the grep into awk: keep /gres/ lines, split on '|' and '=',
    # print the account name and the value after the first '='.
    sacctmgr -nop show assoc format=account,grptres | awk -F"[|=]" '/gres/ {print $1" "$3}'
end
# Compute total number of GPUs
# Sums gres GPU counts across every node known to Slurm (field 9 of the
# one-line node records is assumed to hold the Gres value — TODO confirm).
function total_gpus
    scontrol show nodes -o | awk '{print $9}' | awk -F: '/gpu/ {sum += $2} END {print sum}'
end
# Compute maximum number of GPUs available through slurm
# Same sum as total_gpus, but skipping nodes whose record mentions DOWN or DRAIN.
function avail_gpus
    scontrol show nodes -o | awk '/DOWN|DRAIN/ {next} {print $9}' | awk -F: '/gpu/ {sum += $2} END {print sum}'
end
# Compute number of GPUs in use using squeue.
# No args: cluster-wide "IN-USE / AVAILABLE / TOTAL". One arg: GPUs used by
# squeue lines matching that string (user).
function gpus_running
    if test (count $argv) -eq 0
        echo "IN-USE / AVAILABLE / TOTAL"
        # Fix: VAR=`cmd` is bash syntax and a runtime error in fish; use
        # fish's "set VAR (cmd)" command substitution.
        set AVAIL_GPUS (avail_gpus)
        set TOTAL_GPUS (total_gpus)
        # Strip "gpu:" plus any "type:" component (the old class lacked digits
        # and mis-counted typed gres like gpu:a100:4).
        squeue -o '%b %t' | grep gpu | grep R | sed 's/gpu:\([a-zA-Z0-9_]*:\)*//g' | awk '{i += $1;} END {printf "%d",i}'; echo " / $AVAIL_GPUS / $TOTAL_GPUS"
    else if test (count $argv) -eq 1
        # Fix: fish has no $1 — positional arguments live in $argv.
        squeue -o '%b %t %u' | grep gpu | grep $argv[1] | grep R | sed 's/gpu:\([a-zA-Z0-9_]*:\)*//g' | awk '{i += $1;} END {print i}'
    else
        # Fix: message named srun_gpus, but this function is gpus_running.
        echo "gpus_running does not accept more than 1 argument"
    end
end
# Compute number of GPUs which are in line to be scheduled (running + pending).
# No args: cluster-wide count. One arg: count for matching squeue lines (user).
function scheduled_gpus
    if test (count $argv) -eq 0
        # Fix: 's/gpu://g' left typed gres as "a100:4" (counted as 0); strip
        # "gpu:" plus any "type:" component instead.
        squeue -o '%b %t' | grep gpu | grep "R\|PD" | sed 's/gpu:\([a-zA-Z0-9_]*:\)*//g' | awk '{i += $1;} END {print i}'
    else if test (count $argv) -eq 1
        # Fix: fish has no $1 — use $argv[1].
        squeue -o '%b %t %u' | grep gpu | grep $argv[1] | grep "R\|PD" | sed 's/gpu:\([a-zA-Z0-9_]*:\)*//g' | awk '{i += $1;} END {print i}'
    else
        # Fix: message named srun_gpus (copy-paste error).
        echo "scheduled_gpus does not accept more than 1 argument"
    end
end
# GPU Usage by user
# One "<user> <gpus>" line per user; gpus = gres count * node count (%D).
function usage_by_user
    # Fix: aligned with the bash version — include %D (node count) and
    # multiply, so multi-node jobs are no longer under-counted; guard on
    # name != "" so no spurious empty first line is printed.
    squeue -o "%u %t %b %D" -h | grep gpu | grep R | sort | awk -F'[ :]' '{if(name==$1){ count += $(NF) * $(NF - 1);}else{ if (name != "") print name" "count; name=$1; count=$(NF) * $(NF - 1);}} END{ if (name != "") print name" "count;}'
end
# GPU Usage by lab
# One "<account> <gpus>" line per lab; gpus = gres count * node count (%D).
function usage_by_lab
    # Fix: aligned with the bash version — include %D and multiply so
    # multi-node jobs are counted fully; suppress the empty first group.
    squeue -o "%a %t %b %D" -h | grep gpu | grep R | sort | awk -F'[ :]' '{if(name==$1){ count += $(NF) * $(NF - 1);}else{ if (name != "") print name" "count; name=$1; count=$(NF) * $(NF - 1);}} END{ if (name != "") print name" "count;}'
end
# GPUs scheduled (running + pending) by lab.
function scheduled_gpus_by_lab
    # Fix: use the same sed as the bash version — 's/gpu://g' left typed gres
    # as "a100:4", which awk counted as 0. tail -n +2 drops the empty group
    # printed before the first account is seen.
    squeue --format="%a %b %t" | grep gpu | grep "R\|PD" | sed 's/gpu[:A-Za-z0-9_]*://g' | sort | awk -F'[ :]' '{if(name==$1){ count+=$2;}else{ print name" "count; name=$1; count=$2;}} END{print name" "count;}' | tail -n +2
end
# Automatically generate the string for slurm's --exclude argument by listing
# nodes whose CPU load average (sinfo %O) exceeds 28 (the per-machine core
# count), joined with commas.
function high_cpuload
    # NR>1 skips the sinfo header row.
    sinfo --format '%10n %8O' | awk 'NR > 1 && $2 > 28.0 {print $1}' | paste -s -d, -
end
# Short names for the reporting helpers above (fish accepts alias name=body).
alias gpus_users=usage_by_user
alias gpus_labs=usage_by_lab
# Per-node hostname, CPU load (%O) and free memory (%e) — see sinfo(1).
alias sload="sinfo --format '%10n %8O %e'"
# Compute total number of free GPUs:
# schedulable GPUs (avail_gpus) minus GPUs running or pending (scheduled_gpus).
function free_gpus
    # Fix: VAR=`cmd` is bash syntax and a runtime error in fish; use
    # fish's "set VAR (cmd)" command substitution.
    set AVAIL_GPUS (avail_gpus)
    # Fix: the gpus_scheduled alias is only defined in the bash config,
    # never in the fish one — call the scheduled_gpus function directly.
    set RUN_GPUS (scheduled_gpus)
    echo (math $AVAIL_GPUS - $RUN_GPUS)
end
# Get per lab usage and their associated caps
function lab_use
# psub exposes each command's output as a temporary file path — fish's
# stand-in for bash process substitution. Everything is joined on the
# account name in column 1, then formatted as "Account  Used / Sched / Cap".
join (usage_by_lab | psub) (scheduled_gpus_by_lab | psub) | join - (account_caps | psub) | awk 'BEGIN {print "Account \t Used / Sched / Cap"} {print $1" \t "$2" / " $3 " / " $4}'
end
gpus_users (command by Nirbhay)

gpus_users
lists GPU usage on SLURM by running/pending/terminated jobs for each user and individually for each lab. Also shows total GPUs for each lab.

gpus_users -q
lists only running jobs in three categories: normal, overcap and total (normal+overcap). Overcap jobs are executed using the --account=overcap flag.

gpus_users -v
a verbose version of the first and second commands combined, showing in detail all GPU usage in all labs by category (overcap, normal, total) and state (running/pending/terminated).

Add the following lines to your .bashrc
# Feed the combined report stream into the awk script given as $1:
#   1. sacctmgr association lines (account|user|grptres), minus root/test-lab
#   2. squeue GPU lines tagged with a leading "G>" sentinel, sorted
# The awk scripts below rely on the sacctmgr lines arriving first.
usage_by_lab2() {
  # Fix: quote "$1" so paths containing spaces reach awk -f intact.
  { sacctmgr -nop show assoc format=account,user,grptres | grep -v 'root' | grep -v 'test-lab'; squeue -o "G> %u %t %q %b" -h | grep gpu | sort; } | awk -f "$1" -
}
# Per-lab GPU usage report.
#   gpus_users      default report (lab_usage.awk)
#   gpus_users -q   running jobs by QOS: normal/overcap/total (lab_usage_qos.awk)
#   gpus_users -v   verbose: QOS x state breakdown (lab_usage_verbose.awk)
gpus_users() {
  local awk_dir="$HOME/rcfiles/shell"
  if [ $# -eq 1 ]; then
    case "$1" in
      -q) usage_by_lab2 "$awk_dir/lab_usage_qos.awk" ;;
      -v) usage_by_lab2 "$awk_dir/lab_usage_verbose.awk" ;;
      # Fix: an unknown flag used to do nothing silently; report it instead.
      *)  echo "gpus_users: unknown option '$1' (expected -q or -v)" >&2; return 1 ;;
    esac
  else
    usage_by_lab2 "$awk_dir/lab_usage.awk"
  fi
}
These shell commands point to the following three awk files, which should be added to either the default location ~/rcfiles/shell/*.awk
or anywhere else by changing the path in the above bash code snippet.
lab_usage.awk
#!/bin/awk -f
# lab_usage.awk — per-lab GPU usage table, one row per user with R/PD/CG
# (running/pending/completing) GPU counts.
#
# Input (produced by usage_by_lab2): two kinds of records on one stream,
# with the sacctmgr lines arriving BEFORE the squeue lines:
#   1. sacctmgr assoc lines:  account|user|cpu=<n>,gres/gpu=<n>|
#   2. squeue lines tagged:   G> <user> <state> <qos> gpu[:type]:<count>
# FS splits on space, colon, pipe, equals and comma, so "gres/gpu=10"
# becomes the two fields "gres/gpu" and "10".
# NOTE: uses arrays of arrays (counts[a][b]) — requires gawk.
BEGIN {
FS="[ :|=,]";
printf("| %14s |","Username");
printf(" %3s | %3s | %3s |", "R", "PD", "CG");
printf("\n");
printf("| %14s |","--------------");
printf(" %3s + %3s + %3s |", "---", "---", "---");
printf("\n");
}
{
if ($1 == "G>") {
# squeue record: $2=user, $3=state (R/PD/CG), $4=qos, $NF=gpu count.
counts[$2][$3]+=$NF;
counts[$2][$3,$4]+=$NF;
# Touch the three state buckets so unused ones print as 0, not empty.
counts[$2]["R"]+=0;
counts[$2]["PD"]+=0;
counts[$2]["CG"]+=0;
# Relies on user_to_lab having been filled by earlier sacctmgr lines.
labs_to_gpus_used[user_to_lab[$2]][$3]+=$NF;
} else {
if ($5 == "gres/gpu") {
# Association line with a gres limit: $1=account, $4=cpu cap, $6=gpu cap.
labs_to_gpus[$1] += $6;
labs_to_cpus[$1] += $4;
} else {
# Plain association line: remember each user's home lab, skipping the
# shared overcap account so it never masks the real lab.
if ($1 != "overcap") {
user_to_lab[$2]=$1;
}
}
}
}
END {
# One section per lab: a header with "[ running/pending/cap ]", then a row
# for every user mapped to that lab.
for (lab in labs_to_gpus) {
if (lab == "guest-lab") {
continue;
}
print_str = sprintf("[ %d/%d/%d ]", labs_to_gpus_used[lab]["R"], labs_to_gpus_used[lab]["PD"], labs_to_gpus[lab])
printf("| %14s | %-15s |\n", lab, print_str);
for (name in counts){
if (user_to_lab[name] == lab) {
printf("| %14s |",name);
printf(" %3d |",counts[name]["R"]);
printf(" %3d |",counts[name]["PD"]);
printf(" %3d |",counts[name]["CG"]);
printf("\n");
}
}
printf("| %14s |","--------------");
printf(" %3s + %3s + %3s |", "---", "---", "---");
printf("\n");
}
}
lab_usage_qos.awk
#!/bin/awk -f
# lab_usage_qos.awk — per-lab GPU usage split by QOS.
# Columns: running GPUs under the normal QOS (N-R), the overcap QOS (O-R)
# and in total (R), plus a per-lab totals row.
# Input stream is identical to lab_usage.awk (sacctmgr assoc lines followed
# by "G>"-tagged squeue lines); see that file for the field layout.
# NOTE: mixes gawk arrays-of-arrays with ("a","b") SUBSEP keys — requires gawk.
BEGIN {
FS="[ :|=,]";
printf("| %14s |","Username");
printf(" %3s | %3s | %3s |", "N-R", "O-R", "R");
printf("\n");
printf("| %14s |","--------------");
printf(" %3s + %3s + %3s |", "---", "---", "---");
printf("\n");
}
{
if ($1 == "G>") {
# squeue record: $2=user, $3=state, $4=qos (normal/overcap), $NF=gpu count.
counts[$2][$3]+=$NF;
counts[$2][$3,$4]+=$NF;
# Touch the printed buckets so missing combinations render as 0.
counts[$2]["R"]+=0;
counts[$2]["R","normal"]+=0;
counts[$2]["R","overcap"]+=0;
labs_to_gpus_used[user_to_lab[$2]][$3]+=$NF;
# Per-lab running sums, overall and per QOS, for the totals row.
lab_totals[user_to_lab[$2]][$3]+=$NF;
lab_totals[user_to_lab[$2]][$3,$4]+=$NF;
} else {
if ($5 == "gres/gpu") {
# Association line with a gres limit: $1=account, $4=cpu cap, $6=gpu cap.
labs_to_gpus[$1] += $6;
labs_to_cpus[$1] += $4;
} else {
# Plain association line: map user -> home lab (ignore overcap account).
if ($1 != "overcap") {
user_to_lab[$2]=$1;
}
}
}
}
END {
for (lab in labs_to_gpus) {
if (lab == "guest-lab") {
continue;
}
print_str = sprintf("[ %d/%d/%d ]", labs_to_gpus_used[lab]["R"], labs_to_gpus_used[lab]["PD"], labs_to_gpus[lab])
printf("| %14s | %-15s |\n", lab, print_str);
for (name in counts){
if (user_to_lab[name] == lab) {
printf("| %14s |",name);
printf(" %3d |",counts[name]["R","normal"]);
printf(" %3d |",counts[name]["R","overcap"]);
printf(" %3d |",counts[name]["R"]);
printf("\n");
}
}
printf("| %14s |","");
printf(" %3s + %3s + %3s |", "---", "---", "---");
printf("\n");
printf("| %14s :","totals");
printf(" %3d |",lab_totals[lab]["R","normal"]);
printf(" %3d |",lab_totals[lab]["R","overcap"]);
printf(" %3d |",lab_totals[lab]["R"]);
printf("\n");
printf("| %14s |","--------------");
printf(" %3s + %3s + %3s |", "---", "---", "---");
printf("\n");
}
}
lab_usage_verbose.awk
#!/bin/awk -f
# lab_usage_verbose.awk — per-lab GPU usage broken down by QOS (normal 'N',
# overcap 'O', and combined) crossed with job state (R running, PD pending,
# CG completing), plus a per-lab totals row.
# Input stream is identical to lab_usage.awk (sacctmgr assoc lines followed
# by "G>"-tagged squeue lines); see that file for the field layout.
# NOTE: mixes gawk arrays-of-arrays with ("a","b") SUBSEP keys — requires gawk.
BEGIN {
FS="[ :|=,]";
printf("| %14s |","Username");
printf(" %3s | %3s| %3s|", "N-R", "N-PD", "N-CG");
printf(" %3s | %3s| %3s|", "O-R", "O-PD", "O-CG");
printf(" %3s | %3s | %3s |", "R", "PD", "CG");
printf("\n");
printf("| %14s |","--------------");
printf(" %3s + %3s + %3s |", "---", "---", "---");
printf(" %3s + %3s + %3s |", "---", "---", "---");
printf(" %3s + %3s + %3s |", "---", "---", "---");
printf("\n");
printf("| %-68s |\n"," 'N': normal, 'O': overcap, none: total");
# printf("| %-68s |\n"," 'R': running, 'PD': pending, 'CG': interrupted");
printf("| %14s |","--------------");
printf(" %3s + %3s + %3s |", "---", "---", "---");
printf(" %3s + %3s + %3s |", "---", "---", "---");
printf(" %3s + %3s + %3s |", "---", "---", "---");
printf("\n");
}
{
if ($1 == "G>") {
# squeue record: $2=user, $3=state, $4=qos (normal/overcap), $NF=gpu count.
counts[$2][$3]+=$NF;
counts[$2][$3,$4]+=$NF;
# Touch all nine printed buckets so missing combinations render as 0.
counts[$2]["R"]+=0;
counts[$2]["PD"]+=0;
counts[$2]["CG"]+=0;
counts[$2]["R","normal"]+=0;
counts[$2]["PD","normal"]+=0;
counts[$2]["CG","normal"]+=0;
counts[$2]["R","overcap"]+=0;
counts[$2]["PD","overcap"]+=0;
counts[$2]["CG","overcap"]+=0;
labs_to_gpus_used[user_to_lab[$2]][$3]+=$NF;
# Per-lab sums, overall and per QOS, for the totals row.
lab_totals[user_to_lab[$2]][$3]+=$NF;
lab_totals[user_to_lab[$2]][$3,$4]+=$NF;
} else {
if ($5 == "gres/gpu") {
# Association line with a gres limit: $1=account, $4=cpu cap, $6=gpu cap.
labs_to_gpus[$1] += $6;
labs_to_cpus[$1] += $4;
} else {
# Plain association line: map user -> home lab (ignore overcap account).
if ($1 != "overcap") {
user_to_lab[$2]=$1;
}
}
}
}
END {
for (lab in labs_to_gpus) {
if (lab == "guest-lab") {
continue;
}
print_str = sprintf("[ %d / %d / %d (Run/Sched/Total) GPUS]", labs_to_gpus_used[lab]["R"], labs_to_gpus_used[lab]["PD"], labs_to_gpus[lab])
printf("| %14s = %-51s |\n", lab, print_str);
for (name in counts){
if (user_to_lab[name] == lab) {
printf("| %14s |",name);
printf(" %3d |",counts[name]["R","normal"]);
printf(" %3d |",counts[name]["PD","normal"]);
printf(" %3d |",counts[name]["CG","normal"]);
printf(" %3d |",counts[name]["R","overcap"]);
printf(" %3d |",counts[name]["PD","overcap"]);
printf(" %3d |",counts[name]["CG","overcap"]);
printf(" %3d |",counts[name]["R"]);
printf(" %3d |",counts[name]["PD"]);
printf(" %3d |",counts[name]["CG"]);
printf("\n");
}
}
printf("| %14s |","");
printf(" %3s + %3s + %3s |", "---", "---", "---");
printf(" %3s + %3s + %3s |", "---", "---", "---");
printf(" %3s + %3s + %3s |", "---", "---", "---");
printf("\n");
printf("| %14s :","totals");
printf(" %3d |",lab_totals[lab]["R","normal"]);
printf(" %3d |",lab_totals[lab]["PD","normal"]);
printf(" %3d |",lab_totals[lab]["CG","normal"]);
printf(" %3d |",lab_totals[lab]["R","overcap"]);
printf(" %3d |",lab_totals[lab]["PD","overcap"]);
printf(" %3d |",lab_totals[lab]["CG","overcap"]);
printf(" %3d |",lab_totals[lab]["R"]);
printf(" %3d |",lab_totals[lab]["PD"]);
printf(" %3d |",lab_totals[lab]["CG"]);
printf("\n");
printf("| %14s |","--------------");
printf(" %3s + %3s + %3s |", "---", "---", "---");
printf(" %3s + %3s + %3s |", "---", "---", "---");
printf(" %3s + %3s + %3s |", "---", "---", "---");
printf("\n");
}
}