Warning

These commands result in remote procedure calls (RPCs) to the primary Slurm controller. The controller is not set up like a web server and can easily become overwhelmed by too many RPCs. When this happens, Slurm will begin killing jobs. Do not use these commands to poll the real-time state of the cluster: even something as seemingly innocent as watch squeue (or watch with any of these commands) can cause jobs to be killed!

.bashrc

Some useful slurm/squeue tools to add to your .bashrc

# List the distinct user names reported by `who`, one per line, sorted.
unique_users() {
  # Reduce to the user-name column first and de-duplicate afterwards.
  # Sorting whole `who` lines and then running `uniq` on the names can leave
  # duplicates when the locale collation does not keep equal names adjacent.
  who | cut -d ' ' -f 1 | sort -u
}

# Get account group GPU limits.
# Prints one "<account> <gpu-limit>" line for every association whose
# GrpTRES column contains a gres/gpu entry.
account_caps() {
  # The GrpTRES column may list several TRES ("cpu=100,gres/gpu=8"), so pick
  # the gres/gpu entry by name instead of assuming it is the first one.
  sacctmgr -nop show assoc format=account,grptres \
    | awk -F'|' '
        {
          n = split($2, tres, ",")
          for (i = 1; i <= n; i++) {
            if (tres[i] ~ /^gres\/gpu(:[^=]*)?=/) {
              sub(/^[^=]*=/, "", tres[i])   # keep only the value
              print $1 " " tres[i]
              break
            }
          }
        }'
}

# Compute total number of GPUs configured on all nodes.
# Prints a single integer (0 when no node advertises a GPU count).
total_gpus() {
  # Find the Gres= field by name rather than by position, and use the
  # trailing count of every gpu entry so that typed GRES such as
  # "gpu:a40:4(S:0-1)" are counted as well as plain "gpu:8".
  scontrol show nodes -o | awk '
    {
      for (f = 1; f <= NF; f++) {
        if ($f !~ /^Gres=/) continue
        spec = substr($f, 6)
        gsub(/\([^)]*\)/, "", spec)        # drop "(S:0-1)" style suffixes
        n = split(spec, gres, ",")
        for (g = 1; g <= n; g++) {
          if (gres[g] !~ /^gpu:/) continue
          m = split(gres[g], part, ":")
          if (part[m] ~ /^[0-9]+$/) total += part[m]
        }
        break
      }
    }
    END { print total + 0 }'
}

# Compute maximum number of GPUs available through slurm, i.e. the GPUs on
# nodes whose State does not contain DOWN or DRAIN.
# Prints a single integer (0 when nothing is available).
avail_gpus() {
  # Inspect the State= field itself so that the words DOWN/DRAIN appearing
  # elsewhere on the line (for example in a Reason text) do not hide a node.
  # The Gres= field is located by name and typed entries ("gpu:a40:4") are
  # counted by their trailing number.
  scontrol show nodes -o | awk '
    {
      state = ""; spec = ""
      for (f = 1; f <= NF; f++) {
        if (state == "" && $f ~ /^State=/) state = $f
        else if (spec == "" && $f ~ /^Gres=/) spec = substr($f, 6)
      }
      if (state ~ /DOWN|DRAIN/) next
      gsub(/\([^)]*\)/, "", spec)          # drop "(S:0-1)" style suffixes
      n = split(spec, gres, ",")
      for (g = 1; g <= n; g++) {
        if (gres[g] !~ /^gpu:/) continue
        m = split(gres[g], part, ":")
        if (part[m] ~ /^[0-9]+$/) total += part[m]
      }
    }
    END { print total + 0 }'
}

# Compute number of GPUs in use (jobs in state R) using squeue.
# Usage: srun_gpus          -> prints "IN-USE / AVAILABLE / TOTAL" and the values
#        srun_gpus <user>   -> prints the number of GPUs in use by <user>
# Returns 1 (message on stderr) when more than one argument is given.
srun_gpus() {
  if [ "$#" -gt 1 ]; then
    echo "srun_gpus does not accept more than 1 argument" >&2
    return 1
  fi

  local in_use
  # State and user are compared exactly (no substring matches), the GPU count
  # is the trailing number of each gpu entry so typed requests such as
  # "gpu:a40:2" are counted correctly, and the per-node count is multiplied
  # by the node count (%D) as usage_by_user does.
  in_use=$(
    squeue -h -o '%b %t %D %u' | awk -v user="${1-}" -v nargs="$#" '
      function gpus(spec,    n, g, m, gres, part, sum) {
        sum = 0
        n = split(spec, gres, ",")
        for (g = 1; g <= n; g++) {
          # an optional "gres:" or "gres/" prefix is tolerated
          if (gres[g] !~ /(^|[^A-Za-z0-9_])gpu:/) continue
          m = split(gres[g], part, ":")
          if (part[m] ~ /^[0-9]+$/) sum += part[m]
        }
        return sum
      }
      $2 == "R" && (nargs == 0 || $4 == user) { total += gpus($1) * $3 }
      END { printf "%d", total }'
  )

  if [ "$#" -eq 0 ]; then
    echo "IN-USE / AVAILABLE / TOTAL"
    echo "$in_use / $(avail_gpus) / $(total_gpus)"
  else
    echo "$in_use"
  fi
}


# Compute number of GPUs which are in line to be scheduled, i.e. requested by
# jobs in state R (running) or PD (pending).
# Usage: scheduled_gpus [user]
# Prints a single integer; returns 1 (message on stderr) on too many arguments.
scheduled_gpus() {
  if [ "$#" -gt 1 ]; then
    echo "scheduled_gpus does not accept more than 1 argument" >&2
    return 1
  fi

  # Exact state/user comparison, trailing-number GPU count (handles typed
  # requests such as "gpu:a40:2") and multiplication by the node count (%D).
  squeue -h -o '%b %t %D %u' | awk -v user="${1-}" -v nargs="$#" '
    function gpus(spec,    n, g, m, gres, part, sum) {
      sum = 0
      n = split(spec, gres, ",")
      for (g = 1; g <= n; g++) {
        if (gres[g] !~ /(^|[^A-Za-z0-9_])gpu:/) continue
        m = split(gres[g], part, ":")
        if (part[m] ~ /^[0-9]+$/) sum += part[m]
      }
      return sum
    }
    ($2 == "R" || $2 == "PD") && (nargs == 0 || $4 == user) {
      total += gpus($1) * $3
    }
    END { print total + 0 }'
}
alias gpus_scheduled=scheduled_gpus

# GPU Usage by user.
# Prints one "<user> <gpus>" line per user with running (state R) GPU jobs,
# sorted by user name. Prints nothing when there are no such jobs.
usage_by_user() {
  # Totals are accumulated per user in an array, so no empty placeholder line
  # is printed before the first user. The state is compared exactly and the
  # per-node GPU count is multiplied by the node count (%D).
  squeue -h -o '%u %t %b %D' | awk '
    function gpus(spec,    n, g, m, gres, part, sum) {
      sum = 0
      n = split(spec, gres, ",")
      for (g = 1; g <= n; g++) {
        if (gres[g] !~ /(^|[^A-Za-z0-9_])gpu:/) continue
        m = split(gres[g], part, ":")
        if (part[m] ~ /^[0-9]+$/) sum += part[m]
      }
      return sum
    }
    $2 == "R" {
      used = gpus($3) * $4
      if (used > 0) count[$1] += used
    }
    END { for (name in count) print name " " count[name] }' \
    | sort -k1,1
}

# GPU Usage by lab (Slurm account).
# Prints one "<account> <gpus>" line per account with running (state R) GPU
# jobs, sorted by account name. Prints nothing when there are no such jobs.
usage_by_lab() {
  # Totals are accumulated per account in an array, so no empty placeholder
  # line is printed before the first account. The state is compared exactly
  # and the per-node GPU count is multiplied by the node count (%D).
  squeue -h -o '%a %t %b %D' | awk '
    function gpus(spec,    n, g, m, gres, part, sum) {
      sum = 0
      n = split(spec, gres, ",")
      for (g = 1; g <= n; g++) {
        if (gres[g] !~ /(^|[^A-Za-z0-9_])gpu:/) continue
        m = split(gres[g], part, ":")
        if (part[m] ~ /^[0-9]+$/) sum += part[m]
      }
      return sum
    }
    $2 == "R" {
      used = gpus($3) * $4
      if (used > 0) count[$1] += used
    }
    END { for (name in count) print name " " count[name] }' \
    | sort -k1,1
}

# GPUs scheduled by lab (Slurm account): GPUs requested by jobs in state R or
# PD. Prints one "<account> <gpus>" line per account, sorted by account name.
scheduled_gpus_by_lab() {
  # Exact state comparison instead of substring greps, per-account totals in
  # an array (no placeholder line to strip afterwards) and multiplication by
  # the node count (%D) so the numbers are comparable with usage_by_lab.
  squeue -h --format='%a %b %t %D' | awk '
    function gpus(spec,    n, g, m, gres, part, sum) {
      sum = 0
      n = split(spec, gres, ",")
      for (g = 1; g <= n; g++) {
        if (gres[g] !~ /(^|[^A-Za-z0-9_])gpu:/) continue
        m = split(gres[g], part, ":")
        if (part[m] ~ /^[0-9]+$/) sum += part[m]
      }
      return sum
    }
    $3 == "R" || $3 == "PD" {
      wanted = gpus($2) * $4
      if (wanted > 0) count[$1] += wanted
    }
    END { for (name in count) print name " " count[name] }' \
    | sort -k1,1
}

# Automatically generate the node list for slurm's --exclude argument by
# selecting the nodes whose CPU load average is greater than a threshold.
# Usage: filter_cpu_avail_nodes [max_load]
#   max_load  load threshold; defaults to 28 (there are 28 cores per machine)
# Prints a comma-separated, de-duplicated list of node names.
filter_cpu_avail_nodes() {
  local max_load=${1:-28}
  # No field widths in the format, so long host names are printed in full.
  # sort -u removes repeats (a node may be listed more than once, e.g. once
  # per partition — TODO confirm against your sinfo output).
  sinfo --noheader --format '%n %O' \
    | awk -v max="$max_load" '$2 > max { print $1 }' \
    | sort -u \
    | paste -s -d, -
}
# An example of blacklisted nodes (should be changed according to your preference)
export BLACK_LIST_NODES="hal,calculon"

# Print "--exclude <nodes>" for use on an srun/sbatch command line, where
# <nodes> is the sorted, de-duplicated union of $BLACK_LIST_NODES and the
# nodes reported by filter_cpu_avail_nodes. Prints nothing when both are empty.
exclude_list() {
  local nodes busy
  nodes=${BLACK_LIST_NODES-}
  busy=$(filter_cpu_avail_nodes)
  if [ -n "$busy" ]; then
    nodes=${nodes:+$nodes,}$busy
  fi
  if [ -z "$nodes" ]; then
    return 0
  fi
  # The node list is always passed as printf data, never as the format string.
  printf -- '--exclude %s\n' \
    "$(printf '%s\n' "$nodes" | tr ',' '\n' | sort -u | paste -s -d, -)"
}

# Shorter names for the helper functions defined above.
alias gpus_running=srun_gpus
# NOTE(review): a function that is also called gpus_users is defined further
# down on this page; in an interactive shell this alias would be expanded when
# that definition is parsed — confirm only one of the two is installed.
alias gpus_users=usage_by_user
alias gpus_labs=usage_by_lab
# sinfo overview using the %n, %O and %e format fields. %n and %O are the node
# name and CPU load columns also used by filter_cpu_avail_nodes; %e is
# presumably free memory — confirm in `man sinfo`.
alias sload="sinfo --format '%10n %8O %e'"

# Compute total number of free GPUs: available GPUs (avail_gpus) minus the
# GPUs requested by running and pending jobs (scheduled_gpus).
free_gpus() {
  local avail scheduled
  avail=$(avail_gpus)
  # Call the function directly: the gpus_scheduled alias is only expanded in
  # interactive shells, and only if it existed when this function was parsed.
  scheduled=$(scheduled_gpus)
  # Treat empty helper output as 0 so the arithmetic never sees a blank operand.
  echo $(( ${avail:-0} - ${scheduled:-0} ))
}

# Get per lab usage and their associated caps.
# Prints a header followed by "<account> \t <used> / <scheduled> / <cap>" for
# every account present in usage_by_lab, scheduled_gpus_by_lab and account_caps.
lab_use() {
  # join needs every input sorted on the join field, so each helper output is
  # sorted on the account name first; lines without both an account and a
  # value are dropped.
  join <(usage_by_lab | awk 'NF >= 2' | sort -k1,1) \
       <(scheduled_gpus_by_lab | awk 'NF >= 2' | sort -k1,1) \
    | join - <(account_caps | awk 'NF >= 2' | sort -k1,1) \
    | awk 'BEGIN {print "Account \t Used / Sched / Cap"} {print $1" \t "$2" / " $3 " / " $4}'
}

Fish shell

Same set of commands, for Fish Shell. Add them to .config/fish/conf.d/{init.fish|omf.fish}:

# List the distinct user names reported by `who`, one per line, sorted.
function unique_users
  # Reduce to the user-name column first and de-duplicate afterwards.
  # Sorting whole `who` lines and then running `uniq` on the names can leave
  # duplicates when the locale collation does not keep equal names adjacent.
  who | cut -d ' ' -f 1 | sort -u
end

# Get account group GPU limits.
# Prints one "<account> <gpu-limit>" line for every association whose
# GrpTRES column contains a gres/gpu entry.
function account_caps
  # The GrpTRES column may list several TRES ("cpu=100,gres/gpu=8"), so pick
  # the gres/gpu entry by name instead of assuming it is the first one.
  sacctmgr -nop show assoc format=account,grptres \
    | awk -F'|' '
        {
          n = split($2, tres, ",")
          for (i = 1; i <= n; i++) {
            if (tres[i] ~ /^gres\/gpu(:[^=]*)?=/) {
              sub(/^[^=]*=/, "", tres[i])   # keep only the value
              print $1 " " tres[i]
              break
            }
          }
        }'
end

# Compute total number of GPUs configured on all nodes.
# Prints a single integer (0 when no node advertises a GPU count).
function total_gpus
  # Find the Gres= field by name rather than by position, and use the
  # trailing count of every gpu entry so that typed GRES such as
  # "gpu:a40:4(S:0-1)" are counted as well as plain "gpu:8".
  scontrol show nodes -o | awk '
    {
      for (f = 1; f <= NF; f++) {
        if ($f !~ /^Gres=/) continue
        spec = substr($f, 6)
        gsub(/\([^)]*\)/, "", spec)        # drop "(S:0-1)" style suffixes
        n = split(spec, gres, ",")
        for (g = 1; g <= n; g++) {
          if (gres[g] !~ /^gpu:/) continue
          m = split(gres[g], part, ":")
          if (part[m] ~ /^[0-9]+$/) total += part[m]
        }
        break
      }
    }
    END { print total + 0 }'
end

# Compute maximum number of GPUs available through slurm, i.e. the GPUs on
# nodes whose State does not contain DOWN or DRAIN.
# Prints a single integer (0 when nothing is available).
function avail_gpus
  # Inspect the State= field itself so that the words DOWN/DRAIN appearing
  # elsewhere on the line (for example in a Reason text) do not hide a node.
  # The Gres= field is located by name and typed entries ("gpu:a40:4") are
  # counted by their trailing number.
  scontrol show nodes -o | awk '
    {
      state = ""; spec = ""
      for (f = 1; f <= NF; f++) {
        if (state == "" && $f ~ /^State=/) state = $f
        else if (spec == "" && $f ~ /^Gres=/) spec = substr($f, 6)
      }
      if (state ~ /DOWN|DRAIN/) next
      gsub(/\([^)]*\)/, "", spec)          # drop "(S:0-1)" style suffixes
      n = split(spec, gres, ",")
      for (g = 1; g <= n; g++) {
        if (gres[g] !~ /^gpu:/) continue
        m = split(gres[g], part, ":")
        if (part[m] ~ /^[0-9]+$/) total += part[m]
      }
    }
    END { print total + 0 }'
end

# Compute number of GPUs in use (jobs in state R) using squeue.
# Usage: gpus_running          -> prints "IN-USE / AVAILABLE / TOTAL" and the values
#        gpus_running <user>   -> prints the number of GPUs in use by <user>
# Returns 1 (message on stderr) when more than one argument is given.
function gpus_running
  if test (count $argv) -gt 1
    echo "gpus_running does not accept more than 1 argument" >&2
    return 1
  end

  set -l user ''
  if test (count $argv) -eq 1
    set user $argv[1]
  end

  # State and user are compared exactly (no substring matches), the GPU count
  # is the trailing number of each gpu entry so typed requests such as
  # "gpu:a40:2" are counted correctly, and the per-node count is multiplied
  # by the node count (%D).
  set -l in_use (squeue -h -o '%b %t %D %u' | awk -v user="$user" -v nargs=(count $argv) '
    function gpus(spec,    n, g, m, gres, part, sum) {
      sum = 0
      n = split(spec, gres, ",")
      for (g = 1; g <= n; g++) {
        if (gres[g] !~ /(^|[^A-Za-z0-9_])gpu:/) continue
        m = split(gres[g], part, ":")
        if (part[m] ~ /^[0-9]+$/) sum += part[m]
      }
      return sum
    }
    $2 == "R" && (nargs == 0 || $4 == user) { total += gpus($1) * $3 }
    END { printf "%d", total }')

  if test (count $argv) -eq 0
    set -l avail (avail_gpus)
    set -l total (total_gpus)
    echo "IN-USE / AVAILABLE / TOTAL"
    echo "$in_use / $avail / $total"
  else
    echo $in_use
  end
end

# Compute number of GPUs which are in line to be scheduled, i.e. requested by
# jobs in state R (running) or PD (pending).
# Usage: scheduled_gpus [user]
# Prints a single integer; returns 1 (message on stderr) on too many arguments.
function scheduled_gpus
  if test (count $argv) -gt 1
    echo "scheduled_gpus does not accept more than 1 argument" >&2
    return 1
  end

  set -l user ''
  if test (count $argv) -eq 1
    set user $argv[1]
  end

  # Exact state/user comparison, trailing-number GPU count (handles typed
  # requests such as "gpu:a40:2") and multiplication by the node count (%D).
  squeue -h -o '%b %t %D %u' | awk -v user="$user" -v nargs=(count $argv) '
    function gpus(spec,    n, g, m, gres, part, sum) {
      sum = 0
      n = split(spec, gres, ",")
      for (g = 1; g <= n; g++) {
        if (gres[g] !~ /(^|[^A-Za-z0-9_])gpu:/) continue
        m = split(gres[g], part, ":")
        if (part[m] ~ /^[0-9]+$/) sum += part[m]
      }
      return sum
    }
    ($2 == "R" || $2 == "PD") && (nargs == 0 || $4 == user) {
      total += gpus($1) * $3
    }
    END { print total + 0 }'
end

# GPU Usage by user.
# Prints one "<user> <gpus>" line per user with running (state R) GPU jobs,
# sorted by user name. Prints nothing when there are no such jobs.
function usage_by_user
  # Totals are accumulated per user in an array, so no empty placeholder line
  # is printed before the first user. The state is compared exactly and the
  # per-node GPU count is multiplied by the node count (%D), as the bash
  # version of this helper does.
  squeue -h -o '%u %t %b %D' | awk '
    function gpus(spec,    n, g, m, gres, part, sum) {
      sum = 0
      n = split(spec, gres, ",")
      for (g = 1; g <= n; g++) {
        if (gres[g] !~ /(^|[^A-Za-z0-9_])gpu:/) continue
        m = split(gres[g], part, ":")
        if (part[m] ~ /^[0-9]+$/) sum += part[m]
      }
      return sum
    }
    $2 == "R" {
      used = gpus($3) * $4
      if (used > 0) count[$1] += used
    }
    END { for (name in count) print name " " count[name] }' \
    | sort -k1,1
end

# GPU Usage by lab (Slurm account).
# Prints one "<account> <gpus>" line per account with running (state R) GPU
# jobs, sorted by account name. Prints nothing when there are no such jobs.
function usage_by_lab
  # Totals are accumulated per account in an array, so no empty placeholder
  # line is printed before the first account. The state is compared exactly
  # and the per-node GPU count is multiplied by the node count (%D), as the
  # bash version of this helper does.
  squeue -h -o '%a %t %b %D' | awk '
    function gpus(spec,    n, g, m, gres, part, sum) {
      sum = 0
      n = split(spec, gres, ",")
      for (g = 1; g <= n; g++) {
        if (gres[g] !~ /(^|[^A-Za-z0-9_])gpu:/) continue
        m = split(gres[g], part, ":")
        if (part[m] ~ /^[0-9]+$/) sum += part[m]
      }
      return sum
    }
    $2 == "R" {
      used = gpus($3) * $4
      if (used > 0) count[$1] += used
    }
    END { for (name in count) print name " " count[name] }' \
    | sort -k1,1
end

# GPUs scheduled by lab (Slurm account): GPUs requested by jobs in state R or
# PD. Prints one "<account> <gpus>" line per account, sorted by account name.
function scheduled_gpus_by_lab
  # Exact state comparison instead of substring greps, trailing-number GPU
  # count (handles typed requests such as "gpu:a40:2"), per-account totals in
  # an array (no placeholder line to strip afterwards) and multiplication by
  # the node count (%D).
  squeue -h --format='%a %b %t %D' | awk '
    function gpus(spec,    n, g, m, gres, part, sum) {
      sum = 0
      n = split(spec, gres, ",")
      for (g = 1; g <= n; g++) {
        if (gres[g] !~ /(^|[^A-Za-z0-9_])gpu:/) continue
        m = split(gres[g], part, ":")
        if (part[m] ~ /^[0-9]+$/) sum += part[m]
      }
      return sum
    }
    $3 == "R" || $3 == "PD" {
      wanted = gpus($2) * $4
      if (wanted > 0) count[$1] += wanted
    }
    END { for (name in count) print name " " count[name] }' \
    | sort -k1,1
end

# Automatically generate the node list for slurm's --exclude argument by
# selecting the nodes whose CPU load average is greater than a threshold.
# Usage: high_cpuload [max_load]
#   max_load  load threshold; defaults to 28 (there are 28 cores per machine)
# Prints a comma-separated, de-duplicated list of node names.
function high_cpuload
  set -l max_load 28
  if test (count $argv) -ge 1
    set max_load $argv[1]
  end
  # No field widths in the format, so long host names are printed in full.
  # sort -u removes repeats (a node may be listed more than once, e.g. once
  # per partition — TODO confirm against your sinfo output).
  sinfo --noheader --format '%n %O' \
    | awk -v max="$max_load" '$2 > max { print $1 }' \
    | sort -u \
    | paste -s -d, -
end

# Shorter names for the helper functions defined above.
alias gpus_users=usage_by_user
alias gpus_labs=usage_by_lab
# sinfo overview using the %n, %O and %e format fields. %n and %O are the node
# name and CPU load columns also used by high_cpuload; %e is presumably free
# memory — confirm in `man sinfo`.
alias sload="sinfo --format '%10n %8O %e'"

# Compute total number of free GPUs: available GPUs (avail_gpus) minus the
# GPUs requested by running and pending jobs (scheduled_gpus).
function free_gpus
  # fish has no backtick substitution or VAR=value assignment, so the values
  # are captured with `set` and command substitution. scheduled_gpus is called
  # directly because no gpus_scheduled alias is defined in the fish section.
  set -l avail (avail_gpus)
  set -l scheduled (scheduled_gpus)
  # Treat empty helper output as 0 so math always receives two operands.
  test -n "$avail"; or set avail 0
  test -n "$scheduled"; or set scheduled 0
  math "$avail - $scheduled"
end

# Get per lab usage and their associated caps.
# Prints a header followed by "<account> \t <used> / <scheduled> / <cap>" for
# every account present in usage_by_lab, scheduled_gpus_by_lab and account_caps.
function lab_use
  # join needs every input sorted on the join field, so each helper output is
  # sorted on the account name first; lines without both an account and a
  # value are dropped.
  join (usage_by_lab | awk 'NF >= 2' | sort -k1,1 | psub) \
       (scheduled_gpus_by_lab | awk 'NF >= 2' | sort -k1,1 | psub) \
    | join - (account_caps | awk 'NF >= 2' | sort -k1,1 | psub) \
    | awk 'BEGIN {print "Account \t Used / Sched / Cap"} {print $1" \t "$2" / " $3 " / " $4}'
end

New gpus_users command by Nirbhay.

IMPORTANT: Code for this is now managed at this Git Repo as of Oct 17th, 2020. No further updates will be posted here.

Last Updated: Sept 16th, 2020.

Features:

  • gpus_users lists GPU usage on SLURM by running/pending/terminated jobs for each user and individually for each lab. Also shows total GPUS for each lab.
  • gpus_users -q lists only running jobs in three categories: normal, overcap and total (normal+overcap). Overcap jobs are executed using the --account=overcap flag.
  • gpus_users -v a verbose version of the first command and second command combined, showing in detail all gpus usage in all labs by category (overcap, normal, total) and state (running/pending/terminated).

Add the following lines to your .bashrc

# Feed Slurm association and queue data to one of the lab_usage*.awk reports.
# Usage: usage_by_lab2 <awk-script>
# The awk script receives, on stdin, the sacctmgr association records
# (account|user|grptres, without lines containing "root" or "test-lab")
# followed by the sorted squeue records of GPU jobs, each prefixed with "G>".
# Returns 2 (message on stderr) when the script argument is missing or unreadable.
usage_by_lab2() {
  if [ "$#" -ne 1 ] || [ ! -r "$1" ]; then
    echo "usage: usage_by_lab2 <awk-script>" >&2
    return 2
  fi
  {
    sacctmgr -nop show assoc format=account,user,grptres \
      | grep -v 'root' \
      | grep -v 'test-lab'
    squeue -o "G> %u %t %q %b" -h | grep gpu | sort
  } | awk -f "$1" -   # quoted so script paths containing spaces work
}

# An alias called gpus_users (from the older helpers) would be expanded while
# the definition below is parsed in an interactive shell, so drop it first.
unalias gpus_users 2>/dev/null || true

# Report GPU usage per lab and per user.
# Usage: gpus_users        running/pending/terminated GPUs per user and lab
#        gpus_users -q     running GPUs only: normal, overcap and total
#        gpus_users -v     verbose combination of the two reports above
# The awk scripts are read from $GPUS_USERS_AWK_DIR (default ~/rcfiles/shell).
# Returns 2 (usage on stderr) for any other argument list.
gpus_users() {
  local awk_dir=${GPUS_USERS_AWK_DIR:-$HOME/rcfiles/shell}
  local script
  case "$#:${1-}" in
    0:)   script=lab_usage.awk ;;
    1:-q) script=lab_usage_qos.awk ;;
    1:-v) script=lab_usage_verbose.awk ;;
    *)
      # Unknown options used to be ignored silently; report them instead.
      echo "usage: gpus_users [-q|-v]" >&2
      return 2
      ;;
  esac
  usage_by_lab2 "$awk_dir/$script"
}
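These shell commands point to the following three awk files, which should be placed either in the default location ~/rcfiles/shell/ or anywhere else, provided the paths in the bash snippet above are changed to match. The scripts use arrays of arrays (e.g. counts[user][state]), so they must be run with GNU awk (gawk).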
  1. lab_usage.awk

    #!/bin/awk -f
    # Per-lab / per-user GPU usage table with one column per job state
    # (R, PD, CG).
    # Input (as produced by usage_by_lab2): sacctmgr association records
    # "account|user|grptres|" followed by squeue records "G> user state qos gres".
    # NOTE(review): counts[a][b] is array-of-arrays syntax, which to my
    # knowledge only GNU awk accepts — confirm /bin/awk is gawk on the host.
    
    BEGIN {
        # Split on space, ':', '|', '=' and ',' so both record types break
        # into simple fields.
        FS="[ :|=,]";
        # Table header and the separator row below it.
        printf("| %14s |","Username");
        printf(" %3s | %3s | %3s |", "R", "PD", "CG");
        printf("\n");
    
        printf("| %14s |","--------------");
        printf(" %3s + %3s + %3s |", "---", "---", "---");
        printf("\n");
    }
    {
        if ($1 == "G>") {
            # squeue record: $2 user, $3 job state, $4 QOS; $NF is the last
            # field, i.e. the text after the final ':' of the gres column,
            # which is used as the GPU count.
            counts[$2][$3]+=$NF;
            # Same count keyed by (state, QOS) joined with SUBSEP.
            counts[$2][$3,$4]+=$NF;
    
            # Touch the three displayed states so END prints 0 rather than
            # finding a missing entry.
            counts[$2]["R"]+=0;
            counts[$2]["PD"]+=0;
            counts[$2]["CG"]+=0;
    
            # Add the GPUs to the user's lab (user_to_lab is filled from the
            # association records that precede the squeue records).
            labs_to_gpus_used[user_to_lab[$2]][$3]+=$NF;
    
        } else {
            # Association record. When field 5 is "gres/gpu", field 6 is
            # added to the lab's GPU cap.
            if ($5 == "gres/gpu") {
                labs_to_gpus[$1] += $6;
                # labs_to_cpus is not read anywhere in this script;
                # presumably field 4 is the CPU cap — TODO confirm.
                labs_to_cpus[$1] += $4;
            } else {
                # Any other association record maps user ($2) to account
                # ($1), except records of the "overcap" account.
                if ($1 != "overcap") {
                    user_to_lab[$2]=$1;
                }
            }
        }
    }
    END {
        # One section per lab that has a GPU cap; "guest-lab" is skipped.
        # for-in iteration order is not specified, so labs and users are
        # printed in no particular order.
        for (lab in labs_to_gpus) {
            if (lab == "guest-lab") {
                continue;
            }
            # Lab summary: [ running / pending / cap ].
            print_str = sprintf("[ %d/%d/%d ]", labs_to_gpus_used[lab]["R"], labs_to_gpus_used[lab]["PD"], labs_to_gpus[lab])
            printf("| %14s | %-15s |\n", lab, print_str);
            # One row per user of this lab that appeared in a squeue record.
            for (name in counts){
                if (user_to_lab[name] == lab) {
                    printf("| %14s |",name);
                    printf(" %3d |",counts[name]["R"]);
                    printf(" %3d |",counts[name]["PD"]);
                    printf(" %3d |",counts[name]["CG"]);
                    printf("\n");
                }
            }
            # Separator row closing the lab section.
            printf("| %14s |","--------------");
            printf(" %3s + %3s + %3s |", "---", "---", "---");
            printf("\n");
        }
    }
  2. lab_usage_qos.awk

    #!/bin/awk -f
    # Per-lab / per-user table of running GPUs split by QOS: normal (N-R),
    # overcap (O-R) and all running jobs (R).
    # Input (as produced by usage_by_lab2): sacctmgr association records
    # "account|user|grptres|" followed by squeue records "G> user state qos gres".
    # NOTE(review): counts[a][b] is array-of-arrays syntax, which to my
    # knowledge only GNU awk accepts — confirm /bin/awk is gawk on the host.
    
    BEGIN {
        # Split on space, ':', '|', '=' and ',' so both record types break
        # into simple fields.
        FS="[ :|=,]";
        # Table header and the separator row below it.
        printf("| %14s |","Username");
        printf(" %3s | %3s | %3s |", "N-R", "O-R", "R");
        printf("\n");
    
        printf("| %14s |","--------------");
        printf(" %3s + %3s + %3s |", "---", "---", "---");
        printf("\n");
    }
    {
        if ($1 == "G>") {
            # squeue record: $2 user, $3 job state, $4 QOS; $NF is the last
            # field, i.e. the text after the final ':' of the gres column,
            # which is used as the GPU count.
            counts[$2][$3]+=$NF;
            # Same count keyed by (state, QOS) joined with SUBSEP.
            counts[$2][$3,$4]+=$NF;
    
            # Touch the displayed entries so END prints 0 rather than
            # finding a missing entry.
            counts[$2]["R"]+=0;
            counts[$2]["R","normal"]+=0;
            counts[$2]["R","overcap"]+=0;
    
            # Per-lab totals, by state and by (state, QOS); the lab comes
            # from user_to_lab, filled from the preceding association records.
            labs_to_gpus_used[user_to_lab[$2]][$3]+=$NF;
            lab_totals[user_to_lab[$2]][$3]+=$NF;
            lab_totals[user_to_lab[$2]][$3,$4]+=$NF;
    
        } else {
            # Association record. When field 5 is "gres/gpu", field 6 is
            # added to the lab's GPU cap.
            if ($5 == "gres/gpu") {
                labs_to_gpus[$1] += $6;
                # labs_to_cpus is not read anywhere in this script;
                # presumably field 4 is the CPU cap — TODO confirm.
                labs_to_cpus[$1] += $4;
            } else {
                # Any other association record maps user ($2) to account
                # ($1), except records of the "overcap" account.
                if ($1 != "overcap") {
                    user_to_lab[$2]=$1;
                }
            }
        }
    }
    END {
        # One section per lab that has a GPU cap; "guest-lab" is skipped.
        # for-in iteration order is not specified, so labs and users are
        # printed in no particular order.
        for (lab in labs_to_gpus) {
            if (lab == "guest-lab") {
                continue;
            }
            # Lab summary: [ running / pending / cap ].
            print_str = sprintf("[ %d/%d/%d ]", labs_to_gpus_used[lab]["R"], labs_to_gpus_used[lab]["PD"], labs_to_gpus[lab])
            printf("| %14s | %-15s |\n", lab, print_str);
            # One row per user of this lab: normal, overcap, total running.
            for (name in counts){
                if (user_to_lab[name] == lab) {
                    printf("| %14s |",name);
                    printf(" %3d |",counts[name]["R","normal"]);
                    printf(" %3d |",counts[name]["R","overcap"]);
                    printf(" %3d |",counts[name]["R"]);
                    printf("\n");
                }
            }
            # Rule above the totals row (name column left blank).
            printf("| %14s |","");
            printf(" %3s + %3s + %3s |", "---", "---", "---");
            printf("\n");
    
            # Lab totals in the same column order as the user rows.
            printf("| %14s :","totals");
    
            printf(" %3d |",lab_totals[lab]["R","normal"]);
            printf(" %3d |",lab_totals[lab]["R","overcap"]);
            printf(" %3d |",lab_totals[lab]["R"]);
    
            printf("\n");
    
            # Separator row closing the lab section.
            printf("| %14s |","--------------");
            printf(" %3s + %3s + %3s |", "---", "---", "---");
            printf("\n");
        }
    }
  3. lab_usage_verbose.awk

    #!/bin/awk -f
    # Verbose per-lab / per-user GPU table: for each of the states R, PD and
    # CG it shows the normal-QOS count (N-*), the overcap-QOS count (O-*)
    # and the count over all QOS.
    # Input (as produced by usage_by_lab2): sacctmgr association records
    # "account|user|grptres|" followed by squeue records "G> user state qos gres".
    # NOTE(review): counts[a][b] is array-of-arrays syntax, which to my
    # knowledge only GNU awk accepts — confirm /bin/awk is gawk on the host.
    
    BEGIN {
        # Split on space, ':', '|', '=' and ',' so both record types break
        # into simple fields.
        FS="[ :|=,]";
        # Header row: normal columns, overcap columns, then totals.
        printf("| %14s |","Username");
        printf(" %3s | %3s| %3s|", "N-R", "N-PD", "N-CG");
        printf(" %3s | %3s| %3s|", "O-R", "O-PD", "O-CG");
        printf(" %3s | %3s | %3s |", "R", "PD", "CG");
        printf("\n");
    
        printf("| %14s |","--------------");
        printf(" %3s + %3s + %3s |", "---", "---", "---");
        printf(" %3s + %3s + %3s |", "---", "---", "---");
        printf(" %3s + %3s + %3s |", "---", "---", "---");
        printf("\n");
    
        # Legend line explaining the column prefixes.
        printf("| %-68s |\n","  'N': normal, 'O': overcap, none: total");
        # printf("| %-68s |\n","  'R': running, 'PD': pending, 'CG': interrupted");
    
        printf("| %14s |","--------------");
        printf(" %3s + %3s + %3s |", "---", "---", "---");
        printf(" %3s + %3s + %3s |", "---", "---", "---");
        printf(" %3s + %3s + %3s |", "---", "---", "---");
        printf("\n");
    }
    {
        if ($1 == "G>") {
            # squeue record: $2 user, $3 job state, $4 QOS; $NF is the last
            # field, i.e. the text after the final ':' of the gres column,
            # which is used as the GPU count.
            counts[$2][$3]+=$NF;
            # Same count keyed by (state, QOS) joined with SUBSEP.
            counts[$2][$3,$4]+=$NF;
    
            # Touch every displayed entry so END prints 0 rather than
            # finding a missing entry.
            counts[$2]["R"]+=0;
            counts[$2]["PD"]+=0;
            counts[$2]["CG"]+=0;
    
            counts[$2]["R","normal"]+=0;
            counts[$2]["PD","normal"]+=0;
            counts[$2]["CG","normal"]+=0;
            counts[$2]["R","overcap"]+=0;
            counts[$2]["PD","overcap"]+=0;
            counts[$2]["CG","overcap"]+=0;
    
            # Per-lab totals, by state and by (state, QOS); the lab comes
            # from user_to_lab, filled from the preceding association records.
            labs_to_gpus_used[user_to_lab[$2]][$3]+=$NF;
            lab_totals[user_to_lab[$2]][$3]+=$NF;
            lab_totals[user_to_lab[$2]][$3,$4]+=$NF;
    
        } else {
            # Association record. When field 5 is "gres/gpu", field 6 is
            # added to the lab's GPU cap.
            if ($5 == "gres/gpu") {
                labs_to_gpus[$1] += $6;
                # labs_to_cpus is not read anywhere in this script;
                # presumably field 4 is the CPU cap — TODO confirm.
                labs_to_cpus[$1] += $4;
            } else {
                # Any other association record maps user ($2) to account
                # ($1), except records of the "overcap" account.
                if ($1 != "overcap") {
                    user_to_lab[$2]=$1;
                }
            }
        }
    }
    END {
        # One section per lab that has a GPU cap; "guest-lab" is skipped.
        # for-in iteration order is not specified, so labs and users are
        # printed in no particular order.
        for (lab in labs_to_gpus) {
            if (lab == "guest-lab") {
                continue;
            }
            # Lab summary line: running / pending / cap.
            print_str = sprintf("[ %d / %d / %d (Run/Sched/Total) GPUS]", labs_to_gpus_used[lab]["R"], labs_to_gpus_used[lab]["PD"], labs_to_gpus[lab])
            printf("| %14s = %-51s |\n", lab, print_str);
            # One row per user of this lab: normal, overcap, then all QOS.
            for (name in counts){
                if (user_to_lab[name] == lab) {
                    printf("| %14s |",name);
                    printf(" %3d |",counts[name]["R","normal"]);
                    printf(" %3d |",counts[name]["PD","normal"]);
                    printf(" %3d |",counts[name]["CG","normal"]);
    
                    printf(" %3d |",counts[name]["R","overcap"]);
                    printf(" %3d |",counts[name]["PD","overcap"]);
                    printf(" %3d |",counts[name]["CG","overcap"]);
    
                    printf(" %3d |",counts[name]["R"]);
                    printf(" %3d |",counts[name]["PD"]);
                    printf(" %3d |",counts[name]["CG"]);
                    printf("\n");
                }
            }
            # Rule above the totals row (name column left blank).
            printf("| %14s |","");
            printf(" %3s + %3s + %3s |", "---", "---", "---");
            printf(" %3s + %3s + %3s |", "---", "---", "---");
            printf(" %3s + %3s + %3s |", "---", "---", "---");
            printf("\n");
    
            # Lab totals in the same column order as the user rows.
            printf("| %14s :","totals");
            printf(" %3d |",lab_totals[lab]["R","normal"]);
            printf(" %3d |",lab_totals[lab]["PD","normal"]);
            printf(" %3d |",lab_totals[lab]["CG","normal"]);
    
            printf(" %3d |",lab_totals[lab]["R","overcap"]);
            printf(" %3d |",lab_totals[lab]["PD","overcap"]);
            printf(" %3d |",lab_totals[lab]["CG","overcap"]);
    
            printf(" %3d |",lab_totals[lab]["R"]);
            printf(" %3d |",lab_totals[lab]["PD"]);
            printf(" %3d |",lab_totals[lab]["CG"]);
            printf("\n");
    
            # Separator row closing the lab section.
            printf("| %14s |","--------------");
            printf(" %3s + %3s + %3s |", "---", "---", "---");
            printf(" %3s + %3s + %3s |", "---", "---", "---");
            printf(" %3s + %3s + %3s |", "---", "---", "---");
            printf("\n");
        }
    }