Commit 30ec36c3 authored by Kurt Lust's avatar Kurt Lust
Browse files

Replaced old job scripts with new ones that include verification of the results.

parent 4a2eb4aa
#!/bin/bash
# An affinity wrapper for KNCs.
# As arguments, it expects first the number of processes per node
# followed by the command to run (and any possible arguments to it).
#
# The wrapper derives a core-selection string "<cores>c,<threads>t,<offset>o"
# for the current rank, exports it to the command through
# PYMIC_KMP_PLACE_THREADS and logs everything to affinity-<rank>.log.

# get some information about the job
ppn=$1
shift
# PMI_RANK / PMI_SIZE are read from the environment (presumably set by the
# MPI launcher -- confirm); a missing rank is treated as rank 0.
rank=${PMI_RANK:-0}
nmpi=$PMI_SIZE
# echo "RANK", $PMI_RANK

# number of devices in the system
ndev=2
# number of cores per device; one core is kept out of the partitioning
# NOTE(review): presumably reserved for the coprocessor OS -- confirm
nphcores=61
nphcores=$((nphcores - 1))
# number of threads per physical core
tpc=4

# ranks per device (never less than one, e.g. when ppn < ndev)
rpd=$((ppn / ndev))
if (( rpd == 0 )); then
  rpd=1
fi
# physical cores per device
ncores=$((nphcores / rpd))
# partition number of the current rank on its device
partition=$((rank % rpd))
# offset for the current rank
offset=$((ncores * partition))

# build core selection string
select="${ncores}c,${tpc}t,${offset}o"

# fire up the actual run
printf -v rank_tag '%03d' "$rank"
log="affinity-${rank_tag}.log"
rm -f -- "$log"
echo "host $(hostname) rank ${rank_tag} - $select " |& tee -a "$log"
env | grep PYMIC |& tee -a "$log"
# "$@" is quoted so that command arguments containing whitespace reach the
# program unchanged.
PYMIC_KMP_AFFINITY=compact,verbose PYMIC_KMP_PLACE_THREADS=$select "$@" |& tee -a "$log"
#! /bin/bash -l
#SBATCH -N 21
#SBATCH -n 1000 -c 1
#SBATCH --time 2:00:00
#SBATCH -J GPAWbench
#SBATCH --qos prace
#
# Job settings and environment report for the "large" GPAW benchmark.
# Slurm only honours directive lines that start with exactly "#SBATCH";
# the node count above was previously spelled "#SBARCH" and thus ignored.
#

# Benchmark input (relative to the submit directory) and result naming.
inputfile=../input.py
benchmark_size='large'
csv_summary="BSC_${benchmark_size}_${SLURM_JOB_ID}_${SLURM_NTASKS}.csv"

# The two GPAW builds that are run one after the other.
module_20_1=GPAW-UEABS/20.1.0-Python38-FFTW-icc
module_20_10=GPAW-UEABS/20.10.0-Python39-FFTW-icc

# The next one is not a system module but one that was used to do the settings
# for the project.
module load UEABS/2.2

# Extra options for the benchmark srun calls (none by default).
srun_options=''

# Record the context of the run in the job output.
echo -e "\nWorking in: $(pwd)\n"
echo -e "Modules loaded:\n"
module list
echo -e "Slurm environment:\n$(env | grep SLURM_)\n"
echo -e "\nJob script:\n"
cat "$0"
echo -e "\n\n"
#
# Check the results
#
#######################################
# Create (or truncate) the CSV summary file and write its header row.
# Arguments: $1 - path of the summary file
#######################################
function print_header {
    # The target is quoted so that paths containing whitespace work.
    printf '%s\n' '"Module", "python/gpaw/ase/numpy/scipy", "tasks", "time", "iterations", "dipole", "fermi", "energy", "check", "Job ID"' > "$1"
}
# Print "OK" when lower <= value <= upper according to bc, "not OK" otherwise.
# Arguments: $1 - value (may be empty), $2 - lower bound, $3 - upper bound
function _within_bounds {
    local verdict
    # "value-0" keeps the expression valid for an empty value: it then reads
    # "(-0)" instead of being a syntax error.
    verdict=$(bc -l <<< "(($1-0) >= $2) && (($1-0) <= $3)")
    if [[ "$verdict" == "1" ]]; then
        echo "OK"
    else
        echo "not OK"
    fi
}

#######################################
# Extract the key results of a GPAW run, compare them with the reference
# bounds and report them.
# Globals:   SLURM_NTASKS, SLURM_JOB_ID (read); the lower_*/upper_* variables
#            that ../bounds.sh is expected to define are set globally.
# Arguments: $1 - GPAW output file to analyse
#            $2 - CSV summary file to append one row to
#            $3 - name of the GPAW module that was used
# Outputs:   human-readable report on stdout, one CSV row appended to $2
#######################################
function print_results {
    local output=$1
    local summary=$2
    local module=$3
    local python_version gpaw_version ase_version numpy_version scipy_version
    local bmtime iterations dipole fermi energy
    local iterations_ok dipole_ok fermi_ok energy_ok bounds_check

    # Reference bounds for this benchmark case (path relative to the cwd).
    . ../bounds.sh

    # Versions of the software stack; gpaw is queried through a 1-task srun.
    python_version=$(python -V | awk '{print $2}')
    gpaw_version=$(srun -n 1 -c 1 python -c "import gpaw ; print( gpaw.__version__ )")
    ase_version=$(python -c "import ase ; print( ase.__version__ )")
    numpy_version=$(python -c "import numpy ; print( numpy.__version__ )")
    scipy_version=$(python -c "import scipy ; print( scipy.__version__ )")

    # Extract some data to report from the output file.
    bmtime=$(grep "Total:" "$output" | sed -e 's/Total: *//' | cut -d " " -f 1)
    iterations=$(grep "Converged after" "$output" | cut -d " " -f 3)
    dipole=$(grep "Dipole" "$output" | cut -d " " -f 5 | sed -e 's/)//')
    fermi=$(grep "Fermi level:" "$output" | cut -d ":" -f 2 | sed -e 's/ //g')
    energy=$(grep "Extrapolated: " "$output" | cut -d ":" -f 2 | sed -e 's/ //g')

    # Check the bounds; the overall verdict is OK only if every quantity is.
    iterations_ok=$(_within_bounds "$iterations" "$lower_iterations" "$upper_iterations")
    dipole_ok=$(_within_bounds "$dipole" "$lower_dipole" "$upper_dipole")
    fermi_ok=$(_within_bounds "$fermi" "$lower_fermi" "$upper_fermi")
    energy_ok=$(_within_bounds "$energy" "$lower_energy" "$upper_energy")
    if [[ "$iterations_ok" == "OK" && "$dipole_ok" == "OK" \
          && "$fermi_ok" == "OK" && "$energy_ok" == "OK" ]]; then
        bounds_check="OK"
    else
        bounds_check="not OK"
    fi

    # Output to the slurm.out file
    echo -e "\nResult information:\n" \
        " * Time: $bmtime s\n" \
        " * Number of iterations: $iterations (lower: $lower_iterations, upper: $upper_iterations, $iterations_ok)\n" \
        " * Dipole (3rd component): $dipole (lower: $lower_dipole, upper: $upper_dipole, $dipole_ok)\n" \
        " * Fermi level: $fermi (lower: $lower_fermi, upper: $upper_fermi, $fermi_ok)\n" \
        " * Extrapolated energy: $energy (lower: $lower_energy, upper: $upper_energy, $energy_ok)\n" \
        " * Boundary check: $bounds_check"

    # Output to the summary spreadsheet
    echo "\"$module\", \"$python_version/$gpaw_version/$ase_version/$numpy_version/$scipy_version\"," \
        "\"$SLURM_NTASKS\", \"$bmtime\", \"$iterations\", \"$dipole\", \"$fermi\", \"$energy\", \"$bounds_check\", \"$SLURM_JOB_ID\"" >> "$summary"
}
#
# Running with GPAW 20.1.0
#
# Start a fresh CSV summary (print_header truncates the file), load the
# 20.1.0 build and launch the benchmark on all tasks of the job.
print_header $csv_summary
module load $module_20_1
echo -e "\n\nStarting GPAW for $module_20_1\n"
# $srun_options is deliberately unquoted so that it can hold several options.
srun $srun_options gpaw python $inputfile
echo -e "\n\nGPAW terminated\n"
# output.txt is analysed here; presumably it is written by the run of
# $inputfile -- confirm. It is then renamed so the next run cannot clobber it.
print_results output.txt $csv_summary $module_20_1
mv output.txt BSC_${benchmark_size}_${SLURM_JOB_ID}_${SLURM_NTASKS}_20.1.0.txt
#
# Running with GPAW 20.10.0
#
# Same sequence for the 20.10.0 build; the result row is appended to the
# summary started above. The previous GPAW module is not unloaded explicitly.
module load $module_20_10
echo -e "\n\n"
echo -e "\n\nStarting GPAW for $module_20_10\n"
srun $srun_options gpaw python $inputfile
echo -e "\n\nGPAW terminated\n"
print_results output.txt $csv_summary $module_20_10
mv output.txt BSC_${benchmark_size}_${SLURM_JOB_ID}_${SLURM_NTASKS}_20.10.0.txt
#! /bin/bash -l
#PBS -l select=8:node_type=rome:mpiprocs=128
#PBS -l walltime=1:00:00
#PBS -N GPPAWbench
#
# Job settings and environment report for the "large" GPAW benchmark (PBS).
#

# Number of MPI ranks handed to mpirun; also used in the result file names.
numranks=1000
inputfile=../../input.py
benchmark_size='large'
csv_summary="HLRS_${benchmark_size}_${PBS_JOBID}_${numranks}.csv"

# The two GPAW builds that are run one after the other.
module_20_1=GPAW-UEABS/20.1.0-Python38-FFTW-icc
module_20_10=GPAW-UEABS/20.10.0-Python39-FFTW-icc

# All relative paths below are relative to the submit directory, so running
# on from a different directory after a failed cd would be meaningless.
cd "$PBS_O_WORKDIR" || { echo "Cannot cd to PBS_O_WORKDIR '$PBS_O_WORKDIR'" >&2; exit 1; }

compiler_module='intel/19.1.3'
mpi_module='mpt/2.23'
math_module='mkl/19.1.0'

# File with the reference bounds, located next to the input file.
bounds="$(dirname "${inputfile}")/bounds.sh"

# The first module is not a system module but one that was used to do the settings
# for the project.
module load UEABS
module load $compiler_module
module load $mpi_module
module load $math_module

export MKL_DEBUG_CPU_TYPE=5
export OMP_NUM_THREADS=1

# Record the context of the run in the job output.
echo -e "\nWorking in: $(pwd)\n"
echo -e "Modules loaded:\n"
module list
echo -e "PBS environment:\n$(env | grep PBS_)\n"
echo -e "\nJob script:\n"
cat "$0"
echo -e "\n\n"
#
# Check the results
#
#######################################
# Create (or truncate) the CSV summary file and write its header row.
# Arguments: $1 - path of the summary file
#######################################
function print_header {
    # The target is quoted so that paths containing whitespace work.
    printf '%s\n' '"Module", "python/gpaw/ase/numpy/scipy", "tasks", "time", "iterations", "dipole", "fermi", "energy", "check", "Job ID"' > "$1"
}
# Extract the key results of a GPAW run, compare them with the reference
# bounds and report them on stdout and in the CSV summary.
# Arguments: $1 - GPAW output file to analyse
#            $2 - CSV summary file to append one row to
#            $3 - name of the GPAW module that was used
#            $4 - bounds file (stored but currently not used, see below)
# Globals:   numranks, PBS_JOBID (read). None of the variables assigned here
#            is declared local, so all of them end up in the global scope.
function print_results {
output=$1
summary=$2
module=$3
# NOTE(review): this overwrites the global variable "bounds"; when the
# function is called with only three arguments it becomes empty.
bounds=$4
# The bounds are read from a fixed path rather than from $bounds.
#source ${bounds}
source ../../bounds.sh
# Versions of the software stack, combined into one CSV column below.
python_version=$(python -V | awk '{print $2}')
gpaw_version=$(python -c "import gpaw ; print( gpaw.__version__ )")
ase_version=$(python -c "import ase ; print( ase.__version__ )")
numpy_version=$(python -c "import numpy ; print( numpy.__version__ )")
scipy_version=$(python -c "import scipy ; print( scipy.__version__ )")
# Extract some data to report from the output file.
bmtime=$(grep "Total:" $output | sed -e 's/Total: *//' | cut -d " " -f 1)
iterations=$(grep "Converged after" $output | cut -d " " -f 3)
dipole=$(grep "Dipole" $output | cut -d " " -f 5 | sed -e 's/)//')
fermi=$(grep "Fermi level:" $output | cut -d ":" -f 2 | sed -e 's/ //g')
energy=$(grep "Extrapolated: " $output | cut -d ":" -f 2 | sed -e 's/ //g')
# Check the bounds
# Each value is written as "value-0" so that an empty value still gives a
# valid bc expression, "(-0)". bc prints 1 when the value lies within
# [lower, upper] and 0 otherwise.
if (( $(bc -l <<< "(($iterations-0) >= $lower_iterations) && (($iterations-0) <= $upper_iterations)") == 1 )); then iterations_ok="OK"; else iterations_ok="not OK"; fi
if (( $(bc -l <<< "(($dipole-0) >= $lower_dipole) && (($dipole-0) <= $upper_dipole)") == 1 )); then dipole_ok="OK"; else dipole_ok="not OK"; fi
if (( $(bc -l <<< "(($fermi-0) >= $lower_fermi) && (($fermi-0) <= $upper_fermi)") == 1 )); then fermi_ok="OK"; else fermi_ok="not OK"; fi
if (( $(bc -l <<< "(($energy-0) >= $lower_energy) && (($energy-0) <= $upper_energy)") == 1 )); then energy_ok="OK"; else energy_ok="not OK"; fi
# Overall verdict: the conjunction of the four range checks above, evaluated
# by bc in a single expression.
compare=""
compare+="(($iterations-0) >= $lower_iterations) && (($iterations-0) <= $upper_iterations) && "
compare+="(($dipole-0) >= $lower_dipole) && (($dipole-0) <= $upper_dipole) && "
compare+="(($fermi-0) >= $lower_fermi) && (($fermi-0) <= $upper_fermi) && "
compare+="(($energy-0) >= $lower_energy) && (($energy-0) <= $upper_energy)"
if (( $(bc -l <<< "$compare") ));
then
bounds_check="OK";
else
bounds_check="not OK"
fi
# Output to the PBS output file
echo -e "\nResult information:\n" \
" * Time: $bmtime s\n" \
" * Number of iterations: $iterations (lower: $lower_iterations, upper: $upper_iterations, $iterations_ok)\n" \
" * Dipole (3rd component): $dipole (lower: $lower_dipole, upper: $upper_dipole, $dipole_ok)\n" \
" * Fermi level: $fermi (lower: $lower_fermi, upper: $upper_fermi, $fermi_ok)\n" \
" * Extrapolated energy: $energy (lower: $lower_energy, upper: $upper_energy, $energy_ok)\n" \
" * Boundary check: $bounds_check"
# Output to the summary spreadsheet
# One row is appended; the column order matches the header of print_header.
echo "\"$module\", \"$python_version/$gpaw_version/$ase_version/$numpy_version/$scipy_version\"," \
"\"$numranks\", \"$bmtime\", \"$iterations\", \"$dipole\", \"$fermi\", \"$energy\", \"$bounds_check\", \"$PBS_JOBID\"" >> $summary
}
#
# Running with GPAW 20.1.0
#
# Start a fresh CSV summary, switch to a clean environment with the 20.1.0
# build and launch the benchmark.
print_header "$csv_summary"
module purge
module load UEABS/2.2
module load "$module_20_1"
echo -e "\n\nGPAW run with $module_20_1\nModules loaded:\n"
module list 2>&1
echo -e "\n\nStarting GPAW...\n"
mpirun -n "$numranks" gpaw python "$inputfile"
echo -e "\nGPAW ended, checking results...\n"
# The bounds file is passed in both calls so that $bounds keeps its value
# (print_results stores its 4th argument in the global variable "bounds").
print_results output.txt "$csv_summary" "$module_20_1" "$bounds"
mv output.txt "HLRS_${benchmark_size}_${PBS_JOBID}_${numranks}_20.1.0.txt"
#
# Running with GPAW 20.10.0
#
module purge
module load UEABS/2.2
module load "$module_20_10"
echo -e "\n\nGPAW run with $module_20_10\nModules loaded:\n"
module list 2>&1
echo -e "\n\nStarting GPAW...\n"
echo -e "\n\n"
# The benchmark is started exactly as for 20.1.0: the bounds file is input
# for print_results only and is not an argument of the benchmark script.
mpirun -n "$numranks" gpaw python "$inputfile"
echo -e "\nGPAW ended, checking results...\n"
print_results output.txt "$csv_summary" "$module_20_10" "$bounds"
mv output.txt "HLRS_${benchmark_size}_${PBS_JOBID}_${numranks}_20.10.0.txt"
#! /bin/bash -l
#SBATCH -A prpb101
#SBATCH -N 21
#SBATCH -n 1000 -c 1
#SBATCH --time 1:00:00
#SBATCH -J GPAWbench_large
#SBATCH -p batch
#SBATCH --hint=nomultithread
#SBATCH -o %x-%j.out
#
# Job settings and environment report for the "large" GPAW benchmark.
# Slurm only honours directive lines that start with exactly "#SBATCH";
# the node count above was previously spelled "#SBARCH" and thus ignored.
#

# Benchmark input (relative to the submit directory) and result naming.
inputfile=../input.py
benchmark_size='large'
csv_summary="JSC_${benchmark_size}_${SLURM_JOB_ID}_${SLURM_NTASKS}.csv"

# The two GPAW builds that are run one after the other.
module_20_1=GPAW-UEABS/20.1.0-Python38-FFTW-icc
module_20_10=GPAW-UEABS/20.10.0-Python39-FFTW-icc

module load Intel/2021.2.0-GCC-10.3.0
module load IntelMPI/2021.2.0

# Record the context of the run in the job output.
echo -e "\nWorking in: $(pwd)\n"
echo -e "Modules loaded:\n"
module list
echo -e "Slurm environment:\n$(env | grep SLURM_)\n"
echo -e "\nJob script:\n"
cat "$0"
echo -e "\n\n"
#
# Check the results
#
#######################################
# Create (or truncate) the CSV summary file and write its header row.
# Arguments: $1 - path of the summary file
#######################################
function print_header {
    # The target is quoted so that paths containing whitespace work.
    printf '%s\n' '"Module", "python/gpaw/ase/numpy/scipy", "tasks", "time", "iterations", "dipole", "fermi", "energy", "check", "Job ID"' > "$1"
}
# Print "OK" when lower <= value <= upper according to bc, "not OK" otherwise.
# Arguments: $1 - value (may be empty), $2 - lower bound, $3 - upper bound
function _within_bounds {
    local verdict
    # "value-0" keeps the expression valid for an empty value: it then reads
    # "(-0)" instead of being a syntax error.
    verdict=$(bc -l <<< "(($1-0) >= $2) && (($1-0) <= $3)")
    if [[ "$verdict" == "1" ]]; then
        echo "OK"
    else
        echo "not OK"
    fi
}

#######################################
# Extract the key results of a GPAW run, compare them with the reference
# bounds and report them.
# Globals:   SLURM_NTASKS, SLURM_JOB_ID (read); the lower_*/upper_* variables
#            that ../bounds.sh is expected to define are set globally.
# Arguments: $1 - GPAW output file to analyse
#            $2 - CSV summary file to append one row to
#            $3 - name of the GPAW module that was used
# Outputs:   human-readable report on stdout, one CSV row appended to $2
#######################################
function print_results {
    local output=$1
    local summary=$2
    local module=$3
    local python_version gpaw_version ase_version numpy_version scipy_version
    local bmtime iterations dipole fermi energy
    local iterations_ok dipole_ok fermi_ok energy_ok bounds_check

    # Reference bounds for this benchmark case (path relative to the cwd).
    . ../bounds.sh

    # Versions of the software stack; gpaw is queried through a 1-task srun.
    python_version=$(python -V | awk '{print $2}')
    gpaw_version=$(srun -n 1 -c 1 python -c "import gpaw ; print( gpaw.__version__ )")
    ase_version=$(python -c "import ase ; print( ase.__version__ )")
    numpy_version=$(python -c "import numpy ; print( numpy.__version__ )")
    scipy_version=$(python -c "import scipy ; print( scipy.__version__ )")

    # Extract some data to report from the output file.
    bmtime=$(grep "Total:" "$output" | sed -e 's/Total: *//' | cut -d " " -f 1)
    iterations=$(grep "Converged after" "$output" | cut -d " " -f 3)
    dipole=$(grep "Dipole" "$output" | cut -d " " -f 5 | sed -e 's/)//')
    fermi=$(grep "Fermi level:" "$output" | cut -d ":" -f 2 | sed -e 's/ //g')
    energy=$(grep "Extrapolated: " "$output" | cut -d ":" -f 2 | sed -e 's/ //g')

    # Check the bounds; the overall verdict is OK only if every quantity is.
    iterations_ok=$(_within_bounds "$iterations" "$lower_iterations" "$upper_iterations")
    dipole_ok=$(_within_bounds "$dipole" "$lower_dipole" "$upper_dipole")
    fermi_ok=$(_within_bounds "$fermi" "$lower_fermi" "$upper_fermi")
    energy_ok=$(_within_bounds "$energy" "$lower_energy" "$upper_energy")
    if [[ "$iterations_ok" == "OK" && "$dipole_ok" == "OK" \
          && "$fermi_ok" == "OK" && "$energy_ok" == "OK" ]]; then
        bounds_check="OK"
    else
        bounds_check="not OK"
    fi

    # Output to the slurm.out file
    echo -e "\nResult information:\n" \
        " * Time: $bmtime s\n" \
        " * Number of iterations: $iterations (lower: $lower_iterations, upper: $upper_iterations, $iterations_ok)\n" \
        " * Dipole (3rd component): $dipole (lower: $lower_dipole, upper: $upper_dipole, $dipole_ok)\n" \
        " * Fermi level: $fermi (lower: $lower_fermi, upper: $upper_fermi, $fermi_ok)\n" \
        " * Extrapolated energy: $energy (lower: $lower_energy, upper: $upper_energy, $energy_ok)\n" \
        " * Boundary check: $bounds_check"

    # Output to the summary spreadsheet
    echo "\"$module\", \"$python_version/$gpaw_version/$ase_version/$numpy_version/$scipy_version\"," \
        "\"$SLURM_NTASKS\", \"$bmtime\", \"$iterations\", \"$dipole\", \"$fermi\", \"$energy\", \"$bounds_check\", \"$SLURM_JOB_ID\"" >> "$summary"
}
#
# Running with GPAW 20.1.0
#
# Start a fresh CSV summary (print_header truncates the file) and switch to a
# clean module environment before loading the 20.1.0 build.
print_header $csv_summary
module purge
# UEABS/2.2 is not a system module but one of our own to basically do some settings
# needed for the project (and to adjust the MODULEPATH). It is not needed if
# $module_20_1 (generated by the build scripts) is in the MODULEPATH
module load UEABS/2.2
module load $module_20_1
echo -e "\n\n"
# Launch the benchmark on all tasks of the job.
srun gpaw python $inputfile
# output.txt is analysed here; presumably it is written by the run of
# $inputfile -- confirm. It is then renamed so the next run cannot clobber it.
print_results output.txt $csv_summary $module_20_1
mv output.txt JSC_${benchmark_size}_${SLURM_JOB_ID}_${SLURM_NTASKS}_20.1.0.txt
#
# Running with GPAW 20.10.0
#
# Same sequence for the 20.10.0 build; the result row is appended to the
# summary started above.
module purge
module load UEABS/2.2
module load $module_20_10
echo -e "\n\n"
srun gpaw python $inputfile
print_results output.txt $csv_summary $module_20_10
mv output.txt JSC_${benchmark_size}_${SLURM_JOB_ID}_${SLURM_NTASKS}_20.10.0.txt
#! /bin/bash -l
#SBATCH -A pn73ye
#SBATCH -N 21
#SBATCH -n 1000 -c 1
#SBATCH --time 40:00
#SBATCH -J GPAWbench
#SBATCH -p general
#SBATCH --hint=nomultithread
#SBATCH -o %x-%j.out
#
# Job settings and environment report for the "large" GPAW benchmark.
# Slurm only honours directive lines that start with exactly "#SBATCH";
# the node count above was previously spelled "#SBARCH" and thus ignored.
#

# Benchmark input (relative to the submit directory) and result naming.
inputfile=../input.py
benchmark_size='large'
csv_summary="LRZ_${benchmark_size}_${SLURM_JOB_ID}_${SLURM_NTASKS}.csv"

# The two GPAW builds that are run one after the other.
module_20_1=GPAW-UEABS/20.1.0-Python38-FFTW-icc
module_20_10=GPAW-UEABS/20.10.0-Python39-FFTW-icc

# Remove these Intel modules before the project modules are loaded.
module unload intel-mkl
module unload intel-mpi
module unload intel

# UEABS/2.2 is not a system module but one of our own to basically do some settings
# needed for the project (and to adjust the MODULEPATH). It is not needed if
# $module_20_1 (generated by the build scripts) is in the MODULEPATH
module load UEABS/2.2

# Record the context of the run in the job output.
echo -e "\nWorking in: $(pwd)\n"
echo -e "Modules loaded:\n"
module list
echo -e "Slurm environment:\n$(env | grep SLURM_)\n"
echo -e "\nJob script:\n"
cat "$0"
echo -e "\n\n"
echo -e "\nCalled script:\n"
cat ../../LRZ_run_GPAW.slurm
echo -e "\n\n"

# For jobs of up to 512 tasks, first run a small MPI test program.
if (( $SLURM_NTASKS <= 512 )); then
    #srun --distribution=block:block /hppfs/work/pn73ye/di46ras/UEABS/Run/TEST-JOB/mpi_hello.exe
    srun /hppfs/work/pn73ye/di46ras/UEABS/Run/TEST-JOB/mpi_hello.exe
fi
#
# Check the results
#
#######################################
# Create (or truncate) the CSV summary file and write its header row.
# Arguments: $1 - path of the summary file
#######################################
function print_header {
    # The target is quoted so that paths containing whitespace work.
    printf '%s\n' '"Module", "python/gpaw/ase/numpy/scipy", "tasks", "time", "iterations", "dipole", "fermi", "energy", "check", "Job ID"' > "$1"
}
# Print "OK" when lower <= value <= upper according to bc, "not OK" otherwise.
# Arguments: $1 - value (may be empty), $2 - lower bound, $3 - upper bound
function _within_bounds {
    local verdict
    # "value-0" keeps the expression valid for an empty value: it then reads
    # "(-0)" instead of being a syntax error.
    verdict=$(bc -l <<< "(($1-0) >= $2) && (($1-0) <= $3)")
    if [[ "$verdict" == "1" ]]; then
        echo "OK"
    else
        echo "not OK"
    fi
}

#######################################
# Extract the key results of a GPAW run, compare them with the reference
# bounds and report them.
# Globals:   SLURM_NTASKS, SLURM_JOB_ID (read); the lower_*/upper_* variables
#            that ../bounds.sh is expected to define are set globally.
# Arguments: $1 - GPAW output file to analyse
#            $2 - CSV summary file to append one row to
#            $3 - name of the GPAW module that was used
# Outputs:   human-readable report on stdout, one CSV row appended to $2
#######################################
function print_results {
    local output=$1
    local summary=$2
    local module=$3
    local python_version gpaw_version ase_version numpy_version scipy_version
    local bmtime iterations dipole fermi energy
    local iterations_ok dipole_ok fermi_ok energy_ok bounds_check

    # Reference bounds for this benchmark case (path relative to the cwd).
    . ../bounds.sh

    # Versions of the software stack; gpaw is queried through a 1-task srun.
    python_version=$(python -V | awk '{print $2}')
    gpaw_version=$(srun -n 1 -c 1 python -c "import gpaw ; print( gpaw.__version__ )")
    ase_version=$(python -c "import ase ; print( ase.__version__ )")
    numpy_version=$(python -c "import numpy ; print( numpy.__version__ )")
    scipy_version=$(python -c "import scipy ; print( scipy.__version__ )")

    # Extract some data to report from the output file.
    bmtime=$(grep "Total:" "$output" | sed -e 's/Total: *//' | cut -d " " -f 1)
    iterations=$(grep "Converged after" "$output" | cut -d " " -f 3)
    dipole=$(grep "Dipole" "$output" | cut -d " " -f 5 | sed -e 's/)//')
    fermi=$(grep "Fermi level:" "$output" | cut -d ":" -f 2 | sed -e 's/ //g')
    energy=$(grep "Extrapolated: " "$output" | cut -d ":" -f 2 | sed -e 's/ //g')

    # Check the bounds; the overall verdict is OK only if every quantity is.
    iterations_ok=$(_within_bounds "$iterations" "$lower_iterations" "$upper_iterations")
    dipole_ok=$(_within_bounds "$dipole" "$lower_dipole" "$upper_dipole")
    fermi_ok=$(_within_bounds "$fermi" "$lower_fermi" "$upper_fermi")
    energy_ok=$(_within_bounds "$energy" "$lower_energy" "$upper_energy")
    if [[ "$iterations_ok" == "OK" && "$dipole_ok" == "OK" \
          && "$fermi_ok" == "OK" && "$energy_ok" == "OK" ]]; then
        bounds_check="OK"
    else
        bounds_check="not OK"
    fi

    # Output to the slurm.out file
    echo -e "\nResult information:\n" \
        " * Time: $bmtime s\n" \
        " * Number of iterations: $iterations (lower: $lower_iterations, upper: $upper_iterations, $iterations_ok)\n" \
        " * Dipole (3rd component): $dipole (lower: $lower_dipole, upper: $upper_dipole, $dipole_ok)\n" \
        " * Fermi level: $fermi (lower: $lower_fermi, upper: $upper_fermi, $fermi_ok)\n" \
        " * Extrapolated energy: $energy (lower: $lower_energy, upper: $upper_energy, $energy_ok)\n" \
        " * Boundary check: $bounds_check"

    # Output to the summary spreadsheet
    echo "\"$module\", \"$python_version/$gpaw_version/$ase_version/$numpy_version/$scipy_version\"," \
        "\"$SLURM_NTASKS\", \"$bmtime\", \"$iterations\", \"$dipole\", \"$fermi\", \"$energy\", \"$bounds_check\", \"$SLURM_JOB_ID\"" >> "$summary"
}
#
# Running with GPAW 20.1.0
#
# Start a fresh CSV summary (print_header truncates the file), load the
# 20.1.0 build and launch the benchmark on all tasks of the job.
print_header $csv_summary
module load $module_20_1
echo -e "\n\nStarting GPAW for $module_20_1\n"
#srun --distribution=block:block gpaw python $inputfile
srun gpaw python $inputfile
echo -e "\n\nGPAW terminated\n"
# output.txt is analysed here; presumably it is written by the run of
# $inputfile -- confirm. It is then renamed so the next run cannot clobber it.
print_results output.txt $csv_summary $module_20_1
mv output.txt LRZ_${benchmark_size}_${SLURM_JOB_ID}_${SLURM_NTASKS}_20.1.0.txt
#
# Running with GPAW 20.10.0
#
# Same sequence for the 20.10.0 build; the result row is appended to the
# summary started above. The previous GPAW module is not unloaded explicitly.
module load $module_20_10
echo -e "\n\n"
echo -e "\n\nStarting GPAW for $module_20_10\n"
#srun --distribution=block:block gpaw python $inputfile
srun gpaw python $inputfile
echo -e "\n\nGPAW terminated\n"
print_results output.txt $csv_summary $module_20_10
mv output.txt LRZ_${benchmark_size}_${SLURM_JOB_ID}_${SLURM_NTASKS}_20.10.0.txt
#! /bin/bash -l
#MSUB -A pa5772
#MSUB -q rome
#MSUB -Q normal
#MSUB -n 1024
#MSUB -c 1
#MSUB -T 5000
#MSUB -r GPAWbench
#
inputfile=../input.py
boundsfile=../bounds.sh
benchmark_size='large'