###
### README - QCD Accelerator Benchmarksuite Part 2
###
### 2017 - Jacob Finkenrath - CaSToRC - The Cyprus Institute (j.finkenrath@cyi.ac.cy)
###
KNCs    GFLOPS
16 368.196
32 605.882
64 847.566
#########################################################
Results from PRACE 5IP (see White paper for more details)
Results in GFLOP/s for V=96x32x32x32
Nodes  Irene SKL  Juwels  Marconi-KNL  MareNostrum  PizDaint  Davide  Frioul  Deep  Mont-Blanc 3
1      134.382  132.26  101.815  142.336  387.659  392.763  184.729  41.7832  99.6378
2      240.853  245.599  145.608  263.355  755.308  773.901  269.705  40.7721  214.549
4      460.044  456.228  202.135  480.516  1400.06  1509.46  441.534  59.6317  410.902
8      754.657  864.959  223.082  895.277  1654.21  2902.83  614.466  67.3355  715.699
16     1366.21  1700.95  214.705  1632.87  2145.69  5394.16  644.303  91.5139  1.17E+03
32     2603.9  3199.98  183.327  2923.7  2923.98  9650.91  937.755
64     4122.76  5167.48  232.788  4118.7  2332.71  800.514
128    4703.46  7973.9  37.8003  4050.41
256    --  3130.42
512    --  3421.25
Code:  Qphix  Qphix  Qphix  Qphix  QUDA  QUDA  Qphix  Qphix  Grid
Arch:  Skylake  Skylake  KNL  Skylake  P100  P100  KNL  Xeons  ARM
Results in GFLOP/s for V=128x64x64x64
Nodes  Irene SKL  Juwels  Marconi-KNL  MareNostrum  PizDaint
1      141.306  134.972  64.2657  144.32
2      267.278  263.636  153.008  280.68
4      503.041  496.465  420.936  514.956
8      922.187  954.659  783.39  930.95  2694
16     1607.92  1787.43  1109.95  1778.23  5731.56
32     3088.02  3289.02  1486.79  2635.74  7779.29
64     4787.89  5952.8  1087.01  5264.16  10607.2
128    5750.35  10315.3  601.615  7998.56  13560.5
256    15370.9  18177.2
512    26972.6
Code:  Qphix  Qphix  Qphix  Qphix  QUDA
Arch:  Skylake  Skylake  KNL  Skylake  P100
##
## prepare_submit_job.sh
## Fill the placeholders of submit_job.sh.template and write the
## resulting job script to the file name given as argument 8 ($scr).
##
time=$1       ## wall-clock time limit
Node=$2       ## number of nodes
n=$3          ## total number of MPI tasks
g=$4          ## MPI tasks per node
openmp=$5     ## OpenMP threads per task
cpuptask=$6   ## CPUs per task
perm=$7       ## 1: make the generated script executable
scr=$8        ## name of the generated submit script
pt=${9}       ## process-grid extent in t
pz=${10}      ## process-grid extent in z
py=${11}      ## process-grid extent in y
px=${12}      ## process-grid extent in x
exe=${13}     ## path to the benchmark executable
name=${14}    ## name of the output file
lx=${15}      ## global lattice extent in x
lz=${16}      ## global lattice extent in z
ly=${17}      ## global lattice extent in y
lt=${18}      ## global lattice extent in t
prec=${19}    ## precision flag (f/d/h)
sed 's/#NODES#/'${Node}'/g' submit_job.sh.template > test
mv test submit_job.temp
sed 's/#NTASK#/'${n}'/g' submit_job.temp > test
mv test submit_job.temp
sed 's/#NTASKPERNODE#/'${g}'/g' submit_job.temp > test
mv test submit_job.temp
sed 's/#OPENMP#/'${openmp}'/g' submit_job.temp > test
mv test submit_job.temp
sed 's/#CPUSPERTASK#/'${cpuptask}'/g' submit_job.temp > test
mv test submit_job.temp
wrc=$(pwd)
echo $wrc
sed 's #WRC# '${wrc}' g' submit_job.temp > test
mv test submit_job.temp
sed 's/#PT#/'${pt}'/g' submit_job.temp > test
mv test submit_job.temp
sed 's/#PZ#/'${pz}'/g' submit_job.temp > test
mv test submit_job.temp
sed 's/#PY#/'${py}'/g' submit_job.temp > test
mv test submit_job.temp
sed 's/#PX#/'${px}'/g' submit_job.temp > test
mv test submit_job.temp
sed 's #EXE# '${exe}' g' submit_job.temp > test
mv test submit_job.temp
sed 's/#NAME#/'${name}'/g' submit_job.temp > test
mv test submit_job.temp
sed 's/#LT#/'${lt}'/g' submit_job.temp > test
mv test submit_job.temp
sed 's/#LZ#/'${lz}'/g' submit_job.temp > test
mv test submit_job.temp
sed 's/#LY#/'${ly}'/g' submit_job.temp > test
mv test submit_job.temp
sed 's/#LX#/'${lx}'/g' submit_job.temp > test
mv test submit_job.temp
sed 's/#PREC#/'${prec}'/g' submit_job.temp > test
mv test submit_job.temp
sed 's/#TIME#/'${time}'/g' submit_job.temp > test
mv test $scr
if [ $perm -eq 1 ];then
chmod +x $scr
fi
rm submit_job.temp
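##
## Example usage (illustrative values; the run scripts below call it in this form):
##   ./prepare_submit_job.sh '00:10:00' 2 16 8 6 6 1 submit_job_N16_d.sh 8 2 1 1 ./time_clov_noqdp run_N16_d 32 32 32 96 d
##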
##
## RUN - Strong -scaling
##
## Before starting this job-script, replace "SUBMIT" with the submission command of the local queuing system.
## In addition, the execution command in the script submit_job has to be adjusted to the local machine.
##
##
## Script for a parallelization of 2 4 8 16 32 64 KNLs
##
#!/bin/bash
EXE=/ccc/cont005/home/unicy/finkenrj/run/qphix/time_clov_noqdp
## Set scaling-mode: Strong or Weak
sca_mode="Strong"
#sca_mode="OneNode"
#sca_mode="Weak"
mode="Analysis"
##mode="Run"
## sbatch_on=1
exe_perm=1 ## use chmod to allow execution of submit_job_Nx_Gx.sh
g=8 ## MPI tasks per node
openmp=6 ## OpenMP threads per task
cpuptask=6 ## CPUs per task = total number of CPUs per node / MPI tasks per node (= openmp if hyperthreaded cores are used, = 2*openmp if hyperthreading is enabled but not used)
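## Example (assuming a 48-core node with 2 hardware threads per core):
##   g=8 tasks per node with openmp=6 threads per task fill the 48 cores;
##   if the hyperthreads are exposed but should stay unused, set cpuptask=12 (= 2*openmp)
##   so that each task is pinned to 6 full cores.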
## lattice size (size strong 1)
gx=32
gy=32
gz=32
gt=96
## lattice size (size strong 2)
#gx=8
#gy=4
#gz=4
#gt=8
## lattice size (size weak 1)
#gx=48
#gt=24
## use smaller lattice size of weak scaling mode: like gx=24 gt=24
##
#gy=$gx
#gz=$gx
lt=$gt
lx=$gx
ly=$gy
lz=$gz
# for gpus_per_node in 1 2; do
cards_per_node=1
#n=1
# for n in 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072 262144 524288 1048576; do
for n in 8 16 32 64 128 256 512 1024 2048 4096 8192 16384; do
# for n in 8 16 32 64 128 256 512 1024 2048 4096 8192 16384; do
# for n in 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384; do
# for n in 8; do
for p in "s" "d" ; do
# p="d"
case $p in
"s" )
prec="f"
;;
"d" )
prec="d"
;;
"h" )
prec="h"
;;
esac
px=1
py=1
pz=1
pt=$n
if [ $n -eq 16 ];then
pz=2
pt=8
fi
if [ $n -eq 32 ];then
pz=4
pt=8
fi
if [ $n -eq 64 ];then
py=2
pz=4
pt=8
fi
if [ $n -eq 128 ];then
py=2
pz=8
pt=8
fi
if [ $n -eq 256 ];then
py=4
pz=8
pt=8
fi
if [ $n -eq 512 ];then
px=2
py=4
pz=8
pt=8
fi
if [ $n -eq 1024 ];then
px=4
py=4
pz=8
pt=8
fi
if [ $n -eq 2048 ];then
px=8
py=4
pz=8
pt=8
fi
if [ $n -eq 4096 ];then
px=8
py=8
pz=8
pt=8
fi
if [ $n -eq 8192 ];then
px=8
py=8
pz=8
pt=16
fi
if [ $n -eq 16384 ];then
px=8
py=8
pz=16
pt=16
fi
if [ $n -eq 32768 ];then
px=8
py=16
pz=16
pt=16
fi
if [ $n -eq 65536 ];then
px=16
py=16
pz=16
pt=16
fi
if [ $n -eq 131072 ];then
px=16
py=16
pz=16
pt=32
fi
if [ $n -eq 262144 ];then
px=16
py=16
pz=32
pt=32
fi
if [ $n -eq 524288 ];then
px=16
py=32
pz=32
pt=32
fi
if [ $n -eq 1048576 ];then
px=32
py=32
pz=32
pt=32
fi
nt=$n
if [ $sca_mode = "Strong" ];then
lt1=$((gt/pt))
lx1=$((gx/px))
ly1=$((gy/py))
lz1=$((gz/pz))
elif [ $sca_mode = "OneNode" ]; then
lx1=$((gx*px))
ly1=$((gy*py))
lz1=$((gz*pz))
lt1=$((gt*pt/g))
nt=$g
lx=$((gx*px))
ly=$((gy*py))
lz=$((gz*pz))
lt=$((gt*pt))
px=1
py=1
pz=1
pt=$g
else
lt1=$lt
lx1=$lx
ly1=$ly
lz1=$lz
lt=$((gt*pt))
lx=$((gx*px))
ly=$((gy*py))
lz=$((gz*pz))
fi
Node=$((nt/g))
name=${sca_mode}_qphix_${pt}x${pz}x${py}x${px}_${lt}x${lz}x${ly}x${lx}_${p}
if [ $mode != "Analysis" ];then
echo $name
submitscript=submit_job_N${nt}_${p}.sh
./prepare_submit_job.sh '00:10:00' ${Node} ${nt} ${g} ${openmp} ${cpuptask} ${exe_perm} ${submitscript} ${pt} ${pz} ${py} $px $EXE $name $lx $lz $ly $lt $prec
ccc_msub ./$submitscript
sleep 1
## Scan the output and save the data in the Sca_*.log files
else
case $p in
"s" )
echo $name >> Sca_s.log
less $name | grep "Time" -A 1 >> Sca_s.log
;;
"d" )
echo $name >> Sca_d.log
less $name | grep "Time" -A 1 >> Sca_d.log
;;
"h" )
echo $name >> Sca_h.log
less $name | grep "Time" -A 1 >> Sca_h.log
;;
esac
fi
done
done
##
## RUN - Strong -scaling
##
## Before starting this job-script, replace "SUBMIT" with the submission command of the local queuing system.
## In addition, the execution command in the script submit_job has to be adjusted to the local machine.
##
##
## Script for a parallelization of 2 4 8 16 32 64 KNLs
##
#!/bin/bash
EXE=/ccc/cont005/home/unicy/finkenrj/run/qphix/time_clov_noqdp
## Set scaling-mode: Strong or Weak
sca_mode="Strong"
#sca_mode="OneNode"
#sca_mode="Weak"
## mode="Analysis"
mode="Run"
## sbatch_on=1
exe_perm=1 ## use chmod to allow execution of submit_job_Nx_Gx.sh
g=8 ## MPI tasks per node
openmp=6 ## OpenMP threads per task
cpuptask=6 ## CPUs per task = total number of CPUs per node / MPI tasks per node (= openmp if hyperthreaded cores are used, = 2*openmp if hyperthreading is enabled but not used)
## lattice size (size strong 1)
gx=96
gy=96
gz=96
gt=192
## lattice size (size strong 2)
#gx=8
#gy=4
#gz=4
#gt=8
## lattice size (size weak 1)
#gx=48
#gt=24
## use smaller lattice size of weak scaling mode: like gx=24 gt=24
##
#gy=$gx
#gz=$gx
lt=$gt
lx=$gx
ly=$gy
lz=$gz
# for gpus_per_node in 1 2; do
cards_per_node=1
#n=1
# for n in 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072 262144 524288 1048576; do
for n in 8 16 32 64 128 256 512 1024 2048 4096 8192 16384; do
# for n in 8 16 32 64 128 256 512 1024 2048 4096 8192 16384; do
# for n in 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384; do
# for n in 8; do
for p in "s" "d" ; do
# p="d"
case $p in
"s" )
prec="f"
;;
"d" )
prec="d"
;;
"h" )
prec="h"
;;
esac
px=1
py=1
pz=1
pt=$n
if [ $n -eq 16 ];then
pz=2
pt=8
fi
if [ $n -eq 32 ];then
pz=4
pt=8
fi
if [ $n -eq 64 ];then
py=2
pz=4
pt=8
fi
if [ $n -eq 128 ];then
py=2
pz=8
pt=8
fi
if [ $n -eq 256 ];then
py=4
pz=8
pt=8
fi
if [ $n -eq 512 ];then
px=2
py=4
pz=8
pt=8
fi
if [ $n -eq 1024 ];then
px=4
py=4
pz=8
pt=8
fi
if [ $n -eq 2048 ];then
px=8
py=4
pz=8
pt=8
fi
if [ $n -eq 4096 ];then
px=8
py=8
pz=8
pt=8
fi
if [ $n -eq 8192 ];then
px=8
py=8
pz=8
pt=16
fi
if [ $n -eq 16384 ];then
px=8
py=8
pz=16
pt=16
fi
if [ $n -eq 32768 ];then
px=8
py=16
pz=16
pt=16
fi
if [ $n -eq 65536 ];then
px=16
py=16
pz=16
pt=16
fi
if [ $n -eq 131072 ];then
px=16
py=16
pz=16
pt=32
fi
if [ $n -eq 262144 ];then
px=16
py=16
pz=32
pt=32
fi
if [ $n -eq 524288 ];then
px=16
py=32
pz=32
pt=32
fi
if [ $n -eq 1048576 ];then
px=32
py=32
pz=32
pt=32
fi
nt=$n
if [ $sca_mode = "Strong" ];then
lt1=$((gt/pt))
lx1=$((gx/px))
ly1=$((gy/py))
lz1=$((gz/pz))
elif [ $sca_mode = "OneNode" ]; then
lx1=$((gx*px))
ly1=$((gy*py))
lz1=$((gz*pz))
lt1=$((gt*pt/g))
nt=$g
lx=$((gx*px))
ly=$((gy*py))
lz=$((gz*pz))
lt=$((gt*pt))
px=1
py=1
pz=1
pt=$g
else
lt1=$lt
lx1=$lx
ly1=$ly
lz1=$lz
lt=$((gt*pt))
lx=$((gx*px))
ly=$((gy*py))
lz=$((gz*pz))
fi
Node=$((nt/g))
name=${sca_mode}_qphix_${pt}x${pz}x${py}x${px}_${lt}x${lz}x${ly}x${lx}_${p}
if [ $mode != "Analysis" ];then
echo $name
submitscript=submit_job_N${nt}_${p}.sh
./prepare_submit_job.sh '00:10:00' ${Node} ${nt} ${g} ${openmp} ${cpuptask} ${exe_perm} ${submitscript} ${pt} ${pz} ${py} $px $EXE $name $lx $lz $ly $lt $prec
ccc_msub ./$submitscript
sleep 1
## Scan the output and save the data in the Sca_*.log files
else
case $p in
"s" )
echo $name >> Sca_s.log
less $name | grep "Time" -A 1 >> Sca_s.log
;;
"d" )
echo $name >> Sca_d.log
less $name | grep "Time" -A 1 >> Sca_d.log
;;
"h" )
echo $name >> Sca_h.log
less $name | grep "Time" -A 1 >> Sca_h.log
;;
esac
fi
done
done
#! /bin/bash
#MSUB -r Test1 # Request name
#MSUB -n #NTASK# # Number of tasks
#MSUB -c #OPENMP# # Number of threads per task
#MSUB -N #NODES# # Number of nodes
#MSUB -T 1800 # Elapsed time limit in seconds
#MSUB -o bench_out_%I.o # Standard output; %I is the job ID
#MSUB -e bench_out_%I.e # Error output; %I is the job ID
#MSUB -q skylake # Partition; see ccc_mpinfo for the available partitions
#MSUB -A pa4564 # Project ID
set -x
cd ${BRIDGE_MSUB_PWD}
export OMP_NUM_THREADS=#OPENMP#
export BRIDGE_MSUB_NCORE=#CPUSPERTASK# # number of requested cores per process
export I_MPI_PIN=1
export I_MPI_PIN_DOMAIN=#CPUSPERTASK#
#set -e
#export OMP_NUM_THREADS=#OPENMP#
#export KMP_AFFINITY=compact,1,0,granularity=fine,verbose
#export KMP_HW_SUBSET=1T
module unload feature/openmpi/net/auto feature/openmpi/mpi_compiler/intel mpi/openmpi/2.0.4
module unload mpi/openmpi/2.0.4
module unload .tuning/openmpi/2.0.4
module unload feature/openmpi/net/auto feature/openmpi/mpi_compiler/intel mpi/openmpi/2.0.4
module load mpi/intelmpi/2018.0.3.222
module load python3
echo "ccc_mrun -E '--exclusive' -n #NTASK# #EXE# -x #LX# -y #LY# -z #LZ# -t #LT# -by 4 -bz 4 -pxy 1 -pxyz 0 -c #OPENMP# -sy 1 -sz 1 -minct 1 -compress12 -geom #PX# #PY# #PZ# #PT# -prec #PREC# > #NAME#"
ccc_mprun -E '--exclusive' -n #NTASK# #EXE# -x #LX# -y #LY# -z #LZ# -t #LT# -by 4 -bz 4 -pxy 1 -pxyz 0 -c #OPENMP# -sy 1 -sz 1 -minct 1 -compress12 -geom #PX# #PY# #PZ# #PT# -prec #PREC# -cg > #NAME#
##
## prepare_submit_job.sh
## Fill the placeholders of submit_job.sh.template and write the
## resulting job script to the file name given as argument 8 ($scr).
##
time=$1       ## wall-clock time limit
Node=$2       ## number of nodes
n=$3          ## total number of MPI tasks
g=$4          ## MPI tasks per node
openmp=$5     ## OpenMP threads per task
cpuptask=$6   ## CPUs per task
perm=$7       ## 1: make the generated script executable
scr=$8        ## name of the generated submit script
sed 's/#NODES#/'${Node}'/g' submit_job.sh.template > test
mv test submit_job.temp
sed 's/#NTASK#/'${n}'/g' submit_job.temp > test
mv test submit_job.temp
sed 's/#NTASKPERNODE#/'${g}'/g' submit_job.temp > test
mv test submit_job.temp
sed 's/#OPENMP#/'${openmp}'/g' submit_job.temp > test
mv test submit_job.temp
sed 's/#CPUSPERTASK#/'${cpuptask}'/g' submit_job.temp > test
mv test submit_job.temp
wrc=$(pwd)
echo $wrc
sed 's #WRC# '${wrc}' g' submit_job.temp > test
mv test submit_job.temp
sed 's/#TIME#/'${time}'/g' submit_job.temp > test
mv test $scr
if [ $perm -eq 1 ];then
chmod +x $scr
fi
rm submit_job.temp
##
## RUN - Strong -scaling
##
## Before starting this job-script, replace "SUBMIT" with the submission command of the local queuing system.
## In addition, the execution command in the script submit_job has to be adjusted to the local machine.
##
##
## Script for a parallelization of 2 4 8 16 32 64 KNLs
##
#!/bin/bash
EXE=/gpfs/projects/pr1ehq00/bench/qphix/time_clov_noqdp
## Set scaling-mode: Strong or Weak
sca_mode="Strong"
#sca_mode="Weak"
mode="Analysis"
#mode="Run"
## sbatch_on=1
exe_perm=1 ## use chmod to allow execution of submit_job_Nx_Gx.sh
g=8 ## MPI tasks per node
openmp=6 ## OpenMP threads per task
cpuptask=6 ## CPUs per task = total number of CPUs per node / MPI tasks per node (= openmp if hyperthreaded cores are used, = 2*openmp if hyperthreading is enabled but not used)
## lattice size (size strong 1)
#gx=32
#gt=96
## lattice size (size strong 2)
gx=64
gt=128
## lattice size (size weak 1)
#gx=48
#gt=24
## use smaller lattice size of weak scaling mode: like gx=24 gt=24
##
lt=$gt
lx=$gx
ly=$gx
lz=$gx
# for gpus_per_node in 1 2; do
cards_per_node=1
# n=1
# for n in 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384; do
for n in 8 16 32 64 128 256 512 1024 2048 4096 8192 16384; do
# for n in 8; do
for p in "s" "d" ; do
# p="d"
case $p in
"s" )
prec="f"
;;
"d" )
prec="d"
;;
"h" )
prec="h"
;;
esac
px=1
py=1
pz=1
pt=$n
if [ $n -eq 16 ];then
pz=2
pt=8
fi
if [ $n -eq 32 ];then
pz=4
pt=8
fi
if [ $n -eq 64 ];then
py=2
pz=4
pt=8
fi
if [ $n -eq 128 ];then
py=2
pz=8
pt=8
fi
if [ $n -eq 256 ];then
py=4
pz=8
pt=8
fi
if [ $n -eq 512 ];then
px=2
py=4
pz=8
pt=8
fi
if [ $n -eq 1024 ];then
px=4
py=4
pz=8
pt=8
fi
if [ $n -eq 2048 ];then
px=8
py=4
pz=8
pt=8
fi
if [ $n -eq 4096 ];then
px=8
py=8
pz=8
pt=8
fi
if [ $n -eq 8192 ];then
px=8
py=8
pz=8
pt=16
fi
if [ $n -eq 16384 ];then
px=8
py=8
pz=16
pt=16
fi
if [ $sca_mode = "Strong" ];then
lt1=$((gt/pt))
lx1=$((gx/px))
ly1=$((gx/py))
lz1=$((gx/pz))
else
lt1=$lt
lx1=$lx
ly1=$ly
lz1=$lz
lt=$((gt*pt))
lx=$((gx*px))
ly=$((gx*py))
lz=$((gx*pz))
fi
Node=$((n/g))
name=${sca_mode}_qphix_${pt}x${pz}x${py}x${px}_${lt}x${lz}x${ly}x${lx}_${p}
if [ $mode != "Analysis" ];then
echo $name
submitscript=submit_job_N${n}_${p}.sh
./prepare_submit_job.sh '00:10:00' ${Node} ${n} ${g} ${openmp} ${cpuptask} ${exe_perm} ${submitscript}
sbatch ./$submitscript ${pt} ${pz} ${py} $px $EXE $name $lx $lz $ly $lt $prec
sleep 1
## Scan the output and save the data in the Sca_*.log files
else
case $p in
"s" )
echo $name >> Sca_s.log
less $name | grep "Time" -A 1 >> Sca_s.log
;;
"d" )
echo $name >> Sca_d.log
less $name | grep "Time" -A 1 >> Sca_d.log
;;
"h" )
echo $name >> Sca_h.log
less $name | grep "Time" -A 1 >> Sca_h.log
;;
esac
fi
done
done
##
## RUN - Strong -scaling
##
## Before starting this job-script, replace "SUBMIT" with the submission command of the local queuing system.
## In addition, the execution command in the script submit_job has to be adjusted to the local machine.
##
##
## Script for a parallelization of 2 4 8 16 32 64 KNLs
##
#!/bin/bash
EXE=/gpfs/projects/pr1ehq00/bench/qphix/time_clov_noqdp
## Set scaling-mode: Strong or Weak
sca_mode="Strong"
#sca_mode="Weak"
## mode="Analysis"
mode="Run"
## sbatch_on=1
exe_perm=1 ## use chmod to allow execution of submit_job_Nx_Gx.sh
g=8 ## MPI tasks per node
openmp=6 ## OpenMP threads per task
cpuptask=6 ## CPUs per task = total number of CPUs per node / MPI tasks per node (= openmp if hyperthreaded cores are used, = 2*openmp if hyperthreading is enabled but not used)
## lattice size (size strong 1)
#gx=32
#gt=96
## lattice size (size strong 2)
gx=64
gt=128
## lattice size (size weak 1)
#gx=48
#gt=24
## use smaller lattice size of weak scaling mode: like gx=24 gt=24
##
lt=$gt
lx=$gx
ly=$gx
lz=$gx
# for gpus_per_node in 1 2; do
cards_per_node=1
# n=1
# for n in 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384; do
for n in 8 16 32 64 128 256 512 1024 2048 4096 8192 16384; do
# for n in 8; do
for p in "s" "d" ; do
# p="d"
case $p in
"s" )
prec="f"
;;
"d" )
prec="d"
;;
"h" )
prec="h"
;;
esac
px=1
py=1
pz=1
pt=$n
if [ $n -eq 16 ];then
pz=2
pt=8
fi
if [ $n -eq 32 ];then
pz=4
pt=8
fi
if [ $n -eq 64 ];then
py=2
pz=4
pt=8
fi
if [ $n -eq 128 ];then
py=2
pz=8
pt=8
fi
if [ $n -eq 256 ];then
py=4
pz=8
pt=8
fi
if [ $n -eq 512 ];then
px=2
py=4
pz=8
pt=8
fi
if [ $n -eq 1024 ];then
px=4
py=4
pz=8
pt=8
fi
if [ $n -eq 2048 ];then
px=8
py=4
pz=8
pt=8
fi
if [ $n -eq 4096 ];then
px=8
py=8
pz=8
pt=8
fi
if [ $n -eq 8192 ];then
px=8
py=8
pz=8
pt=16
fi
if [ $n -eq 16384 ];then
px=8
py=8
pz=16
pt=16
fi
if [ $sca_mode = "Strong" ];then
lt1=$((gt/pt))
lx1=$((gx/px))
ly1=$((gx/py))
lz1=$((gx/pz))
else
lt1=$lt
lx1=$lx
ly1=$ly
lz1=$lz
lt=$((gt*pt))
lx=$((gx*px))
ly=$((gx*py))
lz=$((gx*pz))
fi
Node=$((n/g))
name=${sca_mode}_qphix_${pt}x${pz}x${py}x${px}_${lt}x${lz}x${ly}x${lx}_${p}
if [ $mode != "Analysis" ];then
echo $name
submitscript=submit_job_N${n}_${p}.sh
./prepare_submit_job.sh '00:10:00' ${Node} ${n} ${g} ${openmp} ${cpuptask} ${exe_perm} ${submitscript}
sbatch ./$submitscript ${pt} ${pz} ${py} $px $EXE $name $lx $lz $ly $lt $prec
sleep 1
## Scan the output and save the data in the Sca_*.log files
else
case $p in
"s" )
echo $name >> Sca_s.log
less $name | grep "Time" -A 1 >> Sca_s.log
;;
"d" )
echo $name >> Sca_d.log
less $name | grep "Time" -A 1 >> Sca_d.log
;;
"h" )
echo $name >> Sca_h.log
less $name | grep "Time" -A 1 >> Sca_h.log
;;
esac
fi
done
done
#!/bin/bash
#SBATCH --job-name=QCDBENCHMARK
#SBATCH --workdir=#WRC#
#SBATCH --output=mpi_%j_#NODES#.out
#SBATCH --error=mpi_%j_#NODES#.err
#SBATCH --time=00:10:00
#SBATCH --constraint=highmem
#SBATCH --nodes=#NODES#
#SBATCH --ntasks-per-node=#NTASKPERNODE#
#SBATCH --cpus-per-task=#CPUSPERTASK#
#SBATCH --ntasks=#NTASK#
#SBATCH --exclusive
#set -e
export OMP_NUM_THREADS=#OPENMP#
export KMP_AFFINITY=compact,1,0,granularity=fine,verbose
export KMP_HW_SUBSET=1T
#export KMP_AFFINITY=balanced,granularity=fine
export I_MPI_PIN=1
export I_MPI_PIN_DOMAIN=#CPUSPERTASK#
module load intel/2018.4 impi/2018.4
module load hdf5
n=$1       ## pt: process-grid extent in t
m=$2       ## pz: process-grid extent in z
g=$3       ## py: process-grid extent in y
v=$4       ## px: process-grid extent in x
EXE=$5     ## path to the benchmark executable
name=$6    ## name of the output file
lx=$7      ## global lattice extent in x
lz=$8      ## global lattice extent in z
ly=$9      ## global lattice extent in y
lt=${10}   ## global lattice extent in t
prec=${11} ## precision flag (f/d/h)
echo "mpirun -n #NTASK# $EXE -x $lx -y $ly -z $lz -t $lt -by 4 -bz 4 -pxy 1 -pxyz 0 -c #OPENMP# -sy 1 -sz 1 -minct 1 -compress12 -geom $v $g $m $n -prec $prec -cg"
mpirun -n #NTASK# $EXE -x $lx -y $ly -z $lz -t $lt -by 4 -bz 4 -pxy 1 -pxyz 0 -c #OPENMP# -sy 1 -sz 1 -minct 1 -compress12 -geom $v $g $m $n -prec $prec -cg > $name
Copyright and Disclaimer
Copyright (C) 2008, Forschungszentrum Juelich GmbH, Federal Republic of Germany. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Any publications that result from the use of this software shall
reasonably refer to the Research Centre's development.
* All advertising materials mentioning features or use of this
software must display the following acknowledgement:
This product includes software developed by Forschungszentrum
Juelich GmbH, Federal Republic of Germany.
* Forschungszentrum Juelich GmbH is not obligated to provide the
user with any support, consulting, training or assistance of any
kind with regard to the use, operation and performance of this
software or to provide the user with any updates, revisions or
new versions.
THIS SOFTWARE IS PROVIDED BY FORSCHUNGSZENTRUM JUELICH GMBH "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL FORSCHUNGSZENTRUM JUELICH
GMBH BE LIABLE FOR ANY SPECIAL, DIRECT OR CONSEQUENTIAL DAMAGES OR ANY
DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE ACCESS, USE OR PERFORMANCE OF
THIS SOFTWARE.
#################
################# UEABS - QCD - BENCHMARKSUITE -- QUICK-USERGUIDE
#################
This is a very short summary of the general steps that have to
be performed to run the UEABS QCD Benchmarksuite on a new
machine. More information can be found in the documentation of
the UEABS-QCD Benchmarksuite, which is located in the folder
./PABS/doc/*
or under the web-links
http://www.prace-ri.eu/UEABS/QCD/QCD_Build_README.txt
http://www.prace-ri.eu/UEABS/QCD/QCD_Run_README.txt
The suite works with Jube, which handles the compilation,
the submission and the analysis of the benchmarks. On a new
machine several xml-files have to be added or created.
This guide gives a short and very quick overview of
the different steps.
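In short, once the new platform has been added as described below, the whole
workflow boils down to the following commands (illustrative; the scaling and
analysis xml-files depend on the platform and runs set up in the steps below):
cd ./PABS/applications/QCD/
perl ../../bench/jube prace-functional-"NEW-PLATFORM".xml
perl ../../bench/jube prace-scaling-"NEW-PLATFORM".xml
perl ../../bench/jube analyse.xml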
The FIRST STEP on a new machine is to add information about the
system to the platform-folder located in:
./PABS/platform
Here, the new platform has to be added to the xml-file "platform.xml"
similar to the existing xml-templates:
..
<platform name="NEW-PLATFORM">
<params
make = "make"
rm = "rm -f"
ar = "ar"
..
module_cmd = "module load"
/>
</platform>
The SECOND STEP is to provide a dummy submit script, which has to
be added to a new subdirectory given by:
./PABS/platform/"NEW-PLATFORM"
In the THIRD STEP: Go to the home-directory of the UEABS-QCD-Benchmarksuite
located in:
./PABS/applications/QCD/
Note that the source-files of the kernels are located in "./PABS/applications/QCD/src".
Here, similar to the FIRST STEP, the xml-files
compile.xml, execute.xml and analyse.xml
have to be edited, i.e. new xml-templates with the new platform information
have to be added.
In the FOURTH STEP the runs are set up by creating run-scripts similar to
"prace-functional-NEW-PLATFORM.xml" for a functional test
and
"prace-scaling-NEW-PLATFORM.xml" for a scaling run.
Here, several limitations of the different codes have to be taken into account; see
the section "LIMITATION" at the end of this quick user guide.
In the FIFTH STEP the benchmark can be compiled and run by using the command:
perl ../../bench/jube prace-functional-"NEW-PLATFORM".xml
in the directory:
"./PABS/applications/QCD/".
This will generate a folder "tmp" with subfolders in "./PABS/applications/QCD/",
where the source files are compiled and executed. If the compilation or the submission
fails, more information can be found in the subdirectories of "tmp". In any case,
after the folder "tmp" has been generated, compilation and submission can,
in principle, be done without Jube.
In the LAST STEP, the scaling results can be analyzed by using
perl ../../bench/jube analyse.xml
LIMITATION:
The different kernels consist of lattice QCD production codes and have several limitations
on parallelization and lattice volume. Kernels A, C, D and E use a four-dimensional
lattice, while kernel B uses a three-dimensional lattice. All kernels
can be parallelized in all directions. The different lattice sizes and parallelizations
have to be declared in the scripts 'prace-functional-"NEW-PLATFORM".xml' or
'prace-scaling-NEW-PLATFORM.xml'. The limitations for the different kernels are given by:
"pt * px * py * pz = tasks"
and, additionally for kernels A, D and E,
"(nt / pt) modulo 2 = 0" and "nt >= 4"
with the same conditions for the other pairs
"{nx,px}, {ny,py}, {nz,pz}". Moreover,
the lattice extents nt, nx, ny and nz have to be even and not smaller
than 4.
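A chosen lattice/process-grid combination can be checked against these
constraints before setting up a run; the following is a minimal bash sketch
(the script name and argument order are illustrative, not part of the suite):
#!/bin/bash
## check_decomposition.sh - sanity check of a lattice / process-grid combination
## usage: ./check_decomposition.sh nt nx ny nz pt px py pz tasks
nt=$1; nx=$2; ny=$3; nz=$4
pt=$5; px=$6; py=$7; pz=$8
tasks=$9
## the process grid has to match the number of MPI tasks
if [ $((pt*px*py*pz)) -ne $tasks ]; then
   echo "error: pt*px*py*pz != tasks"; exit 1
fi
## kernels A, D and E: global extents even and at least 4, local extents even
for pair in "$nt $pt" "$nx $px" "$ny $py" "$nz $pz"; do
   set -- $pair
   if [ $1 -lt 4 ] || [ $(($1 % 2)) -ne 0 ]; then
      echo "error: lattice extent $1 has to be even and at least 4"; exit 1
   fi
   if [ $(($1 % $2)) -ne 0 ] || [ $((($1 / $2) % 2)) -ne 0 ]; then
      echo "error: local extent $1/$2 has to be an even integer"; exit 1
   fi
done
echo "decomposition ok"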
#######
####### Please see for further information the Readme-files
####### which are provided under
#######
####### http://www.prace-ri.eu/UEABS/QCD/QCD_Build_README.txt
####### http://www.prace-ri.eu/UEABS/QCD/QCD_Run_README.txt
####### or in
####### ./PABS/doc/*
#######
####### Jacob Finkenrath, 2017
#######
<analyser>
<analyse cname="Cray-XT4-Louhi">
<precommand>(cd $outdir; bash collectData.sh)</precommand>
<input addfiles="$subdir/IHPCT.log $subdir/GPROF.log" />
<includepattern file="patterns-jube-qcd.xml" />
<includepattern file="patterns-ihpct-qcd.xml" />
<includepattern file="patterns-gprof-qcd.xml" />
</analyse>
<analyse cname="Intel-Nehalem-JUROPA">
<precommand>(cd $outdir; bash collectData.sh)</precommand>
<input addfiles="$subdir/IHPCT.log $subdir/GPROF.log" />
<includepattern file="patterns-jube-qcd.xml" />
</analyse>
<analyse cname="Cray-XE6-HERMIT">
<precommand>(cd $outdir; bash collectData.sh)</precommand>
<input addfiles="$subdir/IHPCT.log $subdir/GPROF.log" />
<includepattern file="patterns-jube-qcd.xml" />
</analyse>
<analyse cname="Cray-XE6-HECToR">
<precommand>(cd $outdir; bash collectData.sh)</precommand>
<input addfiles="$subdir/IHPCT.log $subdir/GPROF.log" />
<includepattern file="patterns-jube-qcd.xml" />
</analyse>
<analyse cname="IBM-SP6-Jump">
<precommand>(cd $outdir; bash collectData.sh)</precommand>
<input addfiles="$subdir/IHPCT.log $subdir/GPROF.log" />
<includepattern file="patterns-jube-qcd.xml" />
<includepattern file="patterns-ihpct-qcd.xml" />
<includepattern file="patterns-gprof-qcd.xml" />
</analyse>
<analyse cname="IBM-BGP-Jugene">
<precommand>(cd $outdir; bash collectData.sh)</precommand>
<input addfiles="$subdir/IHPCT.log $subdir/GPROF.log" />
<includepattern file="patterns-jube-qcd.xml" />
<includepattern file="patterns-ihpct-qcd.xml" />
<includepattern file="patterns-gprof-qcd.xml" />
</analyse>
<analyse cname="IBM-BGQ-Juqueen">
<precommand>(cd $outdir; bash collectData.sh)</precommand>
<input addfiles="$subdir/IHPCT.log $subdir/GPROF.log" />
<includepattern file="patterns-jube-qcd.xml" />
<includepattern file="patterns-ihpct-qcd.xml" />
<includepattern file="patterns-gprof-qcd.xml" />
</analyse>
<analyse cname="IBM-SP6-Huygens">
<precommand>(cd $outdir; bash collectData.sh)</precommand>
<input addfiles="$subdir/IHPCT.log $subdir/GPROF.log" />
<includepattern file="patterns-jube-qcd.xml" />
<includepattern file="patterns-ihpct-qcd.xml" />
<includepattern file="patterns-gprof-qcd.xml" />
</analyse>
<analyse cname=" Intel-Haswell-Cartesius">
<precommand>(cd $outdir; bash collectData.sh)</precommand>
<input addfiles="$subdir/IHPCT.log $subdir/GPROF.log" />
<includepattern file="patterns-jube-qcd.xml" />
<includepattern file="patterns-ihpct-qcd.xml" />
<includepattern file="patterns-gprof-qcd.xml" />
</analyse>
<analyse cname="Intel-Broadwell-Marconi">
<precommand>(cd $outdir; bash collectData.sh)</precommand>
<input addfiles="$subdir/IHPCT.log $subdir/GPROF.log" />
<includepattern file="patterns-jube-qcd.xml" />
<includepattern file="patterns-ihpct-qcd.xml" />
<includepattern file="patterns-gprof-qcd.xml" />
</analyse>
</analyser>
<execution>
<!-- ******************************************************************************** -->
<execute cname="Intel-SNB-supermuc">
<input files="$pdir/Intel-SNB-supermuc/ibm_llsubmit.job.in" />
<substitute infile="ibm_llsubmit.job.in" outfile="ibm_llsubmit.job">
<sub from="#OUTDIR#" to="$outdir" />
<sub from="#STDOUTLOGFILE#" to="$stdoutlogfile" />
<sub from="#STDERRLOGFILE#" to="$stderrlogfile" />
<sub from="#JOB_CLASS#" to="$job_class" />
<sub from="#DATA_LIMIT#" to="0.75GB" />
<sub from="#STACK_LIMIT#" to="0.25GB" />
<sub from="#MEMORYPERTASK#" to="1.0GB" />
<sub from="#BENCHNAME#" to="$benchname" />
<sub from="#NODEUSAGE#" to="shared" />
<sub from="#TIME_LIMIT#" to="00:15:00" />
<sub from="#NODES#" to="$nodes" />
<sub from="#TASKS#" to="$tasks" />
<sub from="#TAFFINITY#" to="$taffinity" />
<sub from="#TASKSPERNODE#" to="$taskspernode" />
<sub from="#NOTIFICATION#" to="never" />
<sub from="#NOTIFY_EMAIL#" to="XXX@sd.ds" />
<sub from="#MY_ENERGYTAG#" to="$my_energytag" />
<sub from="#THREADSPERTASK#" to="$threadspertask" />
<sub from="#EXECUTABLE#" to="$executable" />
<sub from="#ENV#" to="$env" />
<sub from="#PREPROCESS#" to="" />
<sub from="#POSTPROCESS#" to="echo 'JuBE: $COMMENT'" />
<sub from="#STARTER#" to="mpiexec -n $tasks" />
<sub from="#ARGS_STARTER#" to="" />
<sub from="#MEASUREMENT#" to="" />
<sub from="#ARGS_EXECUTABLE#" to="-nodes ${nodes}" />
</substitute>
<environment>
<env var = "MP_LABELIO" value="yes" />
<env var = "MP_INFOLEVEL" value="2" />
<env var = "MP_SHARED_MEMORY" value="yes" />
<env var = "MP_TASK_AFFINITY" value="MCM" />
<env var = "MEMORY_AFFINITY" value="MCM" />
<env var = "HPM_STDOUT" value="0" />
<env var = "HPM_UNIQUE_FILE_NAME" value="1" />
<env var = "HPM_EVENT_SET" value="$IHPCT_HWC_GRP" />
</environment>
<command>llsubmit ibm_llsubmit.job</command>
</execute>
<!-- ******************************************************************************** -->
<execute cname="Cray-XT4-Louhi">
<input files="$pdir/Cray-XT4-Louhi/cray_qsub.job.in" />
<substitute infile="cray_qsub.job.in" outfile="cray_qsub.job">
<sub from="#BENCHNAME#" to="$benchname" />
<sub from="#OUTDIR#" to="$outdir" />
<sub from="#STDOUTLOGFILE#" to="$stdoutlogfile" />
<sub from="#STDERRLOGFILE#" to="$stderrlogfile" />
<sub from="#TIME_LIMIT#" to="00:10:00" />
<sub from="#TASKS#" to="$tasks" />
<sub from="#TASKSPERNODE#" to="$taskspernode" />
<sub from="#THREADSPERTASK#" to="$threadspertask" />
<sub from="#NOTIFICATION#" to="n" />
<sub from="#NOTIFY_EMAIL#" to="l.arnold@fz-juelich.de" />
<sub from="#MEMORYPERTASK#" to="`($datalimit+$stacklimit)*$threadspertask`M" />
<sub from="#EXECUTABLE#" to="${outdir}/qcd-submit.exe.tmp" />
<sub from="#ENV#" to="$env" />
<sub from="#PREPROCESS#" to="module load xt-craypat; cp $executable ${outdir}/qcd-submit.exe.tmp" />
<sub from="#POSTPROCESS#" to="" />
<sub from="#STARTER#" to="aprun -n $tasks -m ${datalimit}M -N ${taskspernode}" />
<sub from="#ARGS_STARTER#" to="" />
<sub from="#MEASUREMENT#" to="time" />
<sub from="#ARGS_EXECUTABLE#" to="" />
</substitute>
<environment>
<env var="PAT_RT_HWPC" value="$CRAYPAT_HWC" />
</environment>
<command>qsub -q prace cray_qsub.job</command>
</execute>
<!-- ******************************************************************************** -->
<execute cname="Cray-XE6-HECToR">
<input files="$pdir/Cray-XE6-HECToR/cray_PBSsubmit.job.in" />
<substitute infile="cray_PBSsubmit.job.in" outfile="cray_PBSsubmit.job">
<sub from="#BENCHNAME#" to="$benchname" />
<sub from="#OUTDIR#" to="$outdir" />
<sub from="#STDOUTLOGFILE#" to="$stdoutlogfile" />
<sub from="#STDERRLOGFILE#" to="$stderrlogfile" />
<sub from="#ACCOUNTING#" to="pr1uqqcd" />
<sub from="#TIME_LIMIT#" to="00:10:00" />
<sub from="#TASKS#" to="$tasks" />
<sub from="#TASKSPERNODE#" to="$taskspernode" />
<sub from="#THREADSPERTASK#" to="$threadspertask" />
<sub from="#EXECUTABLE#" to="${outdir}/qcd-submit.exe.tmp" />
<sub from="#ENV#" to="$env" />
<sub from="#PREPROCESS#" to="cp $executable ${outdir}/qcd-submit.exe.tmp" />
<sub from="#POSTPROCESS#" to="" />
<sub from="#STARTER#" to="aprun" />
<sub from="#ARGS_STARTER#" to="-n $tasks -N ${taskspernode}" />
<sub from="#MEASUREMENT#" to="time" />
<sub from="#ARGS_EXECUTABLE#" to="" />
</substitute>
<environment>
</environment>
<command>qsub cray_PBSsubmit.job</command>
</execute>
<!-- ******************************************************************************** -->
<execute cname="Cray-XE6-HERMIT">
<input files="$pdir/Cray-XE6-HERMIT/cray_PBSsubmit.job.in" />
<substitute infile="cray_PBSsubmit.job.in" outfile="cray_PBSsubmit.job">
<sub from="#BENCHNAME#" to="$benchname" />
<sub from="#OUTDIR#" to="$outdir" />
<sub from="#STDOUTLOGFILE#" to="$stdoutlogfile" />
<sub from="#STDERRLOGFILE#" to="$stderrlogfile" />
<sub from="#TIME_LIMIT#" to="00:10:00" />
<sub from="#TASKS#" to="$tasks" />
<sub from="#TASKSPERNODE#" to="$taskspernode" />
<sub from="#THREADSPERTASK#" to="$threadspertask" />
<sub from="#EXECUTABLE#" to="${outdir}/qcd-submit.exe.tmp" />
<sub from="#ENV#" to="$env" />
<sub from="#PREPROCESS#" to="cp $executable ${outdir}/qcd-submit.exe.tmp" />
<sub from="#POSTPROCESS#" to="" />
<sub from="#STARTER#" to="aprun" />
<sub from="#ARGS_STARTER#" to="-n $tasks -N ${taskspernode}" />
<sub from="#MEASUREMENT#" to="time" />
<sub from="#ARGS_EXECUTABLE#" to="" />
</substitute>
<environment>
</environment>
<command>qsub cray_PBSsubmit.job</command>
</execute>
<!-- ******************************************************************************** -->
<execute cname="Intel-Nehalem-JUROPA">
<input files="$pdir/Intel-Nehalem-JUROPA/intel_PBSsubmit.job.in" />
<substitute infile="intel_PBSsubmit.job.in" outfile="intel_PBSsubmit.job">
<sub from="#BENCHNAME#" to="$benchname" />
<sub from="#OUTDIR#" to="$outdir" />
<sub from="#STDOUTLOGFILE#" to="$stdoutlogfile" />
<sub from="#STDERRLOGFILE#" to="$stderrlogfile" />
<sub from="#TIME_LIMIT#" to="00:10:00" />
<sub from="#PPN" to="8" />
<sub from="#TASKS#" to="$tasks" />
<sub from="#TASKSPERNODE#" to="$taskspernode" />
<sub from="#NODES#" to="$nodes" />
<sub from="#THREADSPERTASK#" to="$threadspertask" />
<sub from="#NOTIFICATION#" to="n" />
<sub from="#NOTIFY_EMAIL#" to="l.arnold@fz-juelich.de" />
<sub from="#EXECUTABLE#" to="${outdir}/qcd-submit.exe.tmp" />
<sub from="#ENV#" to="$env" />
<sub from="#PREPROCESS#" to="cp $executable ${outdir}/qcd-submit.exe.tmp" />
<sub from="#POSTPROCESS#" to="" />
<sub from="#STARTER#" to="mpiexec" />
<sub from="#ARGS_STARTER#" to="-np $ncpus" />
<sub from="#MEASUREMENT#" to="time" />
<sub from="#ARGS_EXECUTABLE#" to="" />
</substitute>
<environment>
</environment>
<command>msub intel_PBSsubmit.job</command>
</execute>
<!-- ******************************************************************************** -->
<execute cname="IBM-SP6-Huygens">
<input files="$pdir/IBM-SP6-Huygens/ibm_llsubmit.job.in" />
<substitute infile="ibm_llsubmit.job.in" outfile="ibm_llsubmit.job">
<sub from="#OUTDIR#" to="$outdir" />
<sub from="#STDOUTLOGFILE#" to="$stdoutlogfile" />
<sub from="#STDERRLOGFILE#" to="$stderrlogfile" />
<sub from="#BENCHNAME#" to="$benchname" />
<sub from="#NODEUSAGE#" to="not_shared" />
<sub from="#TIME_LIMIT#" to="00:30:00" />
<sub from="#DATA_LIMIT#" to="0.75GB" />
<sub from="#STACK_LIMIT#" to="0.25GB" />
<sub from="#NODES#" to="$nodes" />
<sub from="#TASKS#" to="$tasks" />
<sub from="#TASKSPERNODE#" to="$taskspernode" />
<sub from="#NOTIFICATION#" to="never" />
<sub from="#NOTIFY_EMAIL#" to="l.arnold@fz-juelich.de" />
<sub from="#THREADSPERTASK#" to="$threadspertask" />
<sub from="#EXECUTABLE#" to="$executable" />
<sub from="#ENV#" to="$env" />
<sub from="#PREPROCESS#" to="`index('$IHPCT_HWC','on')==0 || index('$IHPCT_MPITR','on')==0 ? 'module load hpct;' : ' '`" />
<sub from="#POSTPROCESS#" to="" />
<sub from="#STARTER#" to="poe" />
<sub from="#ARGS_STARTER#" to="" />
<sub from="#MEASUREMENT#" to="time" />
<sub from="#ARGS_EXECUTABLE#" to="" />
</substitute>
<environment>
<env var = "MP_LABELIO" value="yes" />
<env var = "MP_INFOLEVEL" value="2" />
<env var = "MP_SHARED_MEMORY" value="yes" />
<env var = "MP_TASK_AFFINITY" value="MCM" />
<env var = "MEMORY_AFFINITY" value="MCM" />
<env var = "HPM_STDOUT" value="0" />
<env var = "HPM_UNIQUE_FILE_NAME" value="1" />
<env var = "HPM_EVENT_SET" value="$IHPCT_HWC_GRP" />
</environment>
<command>llsubmit ibm_llsubmit.job</command>
</execute>
<!-- ******************************************************************************** -->
<execute cname="IBM-BGP-Jugene">
<input files="$pdir/IBM-BGP-Jugene/ibm_llsubmit.job.in" />
<substitute infile="ibm_llsubmit.job.in" outfile="ibm_llsubmit.job">
<sub from="#OUTDIR#" to="$outdir" />
<sub from="#STDOUTLOGFILE#" to="$stdoutlogfile" />
<sub from="#STDERRLOGFILE#" to="$stderrlogfile" />
<sub from="#BENCHNAME#" to="$benchname" />
<sub from="#TIME_LIMIT#" to="00:30:00" />
<sub from="#BGSIZE#" to="$nodes" />
<sub from="#BGCONNECTION#" to="`int('$nodes')>256 ? 'TORUS' : 'MESH'`" />
<sub from="#NOTIFICATION#" to="never" />
<sub from="#NOTIFY_EMAIL#" to="l.arnold@fz-juelich.de" />
<sub from="#EXECUTABLE#" to="${outdir}/qcd-submit.exe.tmp" />
<sub from="#ENV#" to="$env" />
<sub from="#PREPROCESS#" to="cp $executable ${outdir}/qcd-submit.exe.tmp" />
<sub from="#POSTPROCESS#" to="" />
<sub from="#STARTER#" to="mpirun -env 'BG_MAPPING=TXYZ'" />
<sub from="#ARGS_STARTER#" to="`$taskspernode == 1 ? '-mode SMP' : ($taskspernode == 2 ? '-mode DUAL' : '-mode VN')`
-np $ncpus -verbose 1" />
<sub from="#MEASUREMENT#" to="time" />
<sub from="#ARGS_EXECUTABLE#" to="" />
</substitute>
<environment>
<env var = "MP_LABELIO" value="yes" />
<env var = "MP_INFOLEVEL" value="2" />
<env var = "MP_SHARED_MEMORY" value="yes" />
<env var = "MP_TASK_AFFINITY" value="MCM" />
<env var = "MEMORY_AFFINITY" value="MCM" />
<env var = "HPM_STDOUT" value="0" />
<env var = "HPM_UNIQUE_FILE_NAME" value="1" />
<env var = "HPM_EVENT_SET" value="$IHPCT_HWC_GRP" />
</environment>
<command>llsubmit ibm_llsubmit.job</command>
</execute>
<!-- ******************************************************************************** -->
<execute cname="IBM-BGQ-Juqueen">
<input files="$pdir/IBM-BGQ-Juqueen/ibm_llsubmit.job.in" />
<substitute infile="ibm_llsubmit.job.in" outfile="ibm_llsubmit.job">
<sub from="#OUTDIR#" to="$outdir" />
<sub from="#STDOUTLOGFILE#" to="$stdoutlogfile" />
<sub from="#STDERRLOGFILE#" to="$stderrlogfile" />
<sub from="#BENCHNAME#" to="$benchname" />
<sub from="#TIME_LIMIT#" to="00:30:00" />
<sub from="#BGSIZE#" to="$nodes" />
<sub from="#BGCONNECTIVITY#" to="`int('$nodes')>256 ? 'TORUS' : 'MESH'`" />
<sub from="#NOTIFICATION#" to="never" />
<sub from="#NOTIFY_EMAIL#" to="st.janetzko@fz-juelich.de" />
<sub from="#EXECUTABLE#" to="${outdir}/qcd-submit.exe.tmp" />
<sub from="#ENV#" to="$env" />
<sub from="#PREPROCESS#" to="cp $executable ${outdir}/qcd-submit.exe.tmp" />
<sub from="#POSTPROCESS#" to="" />
<sub from="#STARTER#" to="runjob" />
<sub from="#ARGS_STARTER#" to="--ranks-per-node $taskspernode --np $ncpus" />
<sub from="#MEASUREMENT#" to="time" />
<sub from="#ARGS_EXECUTABLE#" to="" />
</substitute>
<environment>
<env var = "MP_LABELIO" value="yes" />
<env var = "MP_INFOLEVEL" value="2" />
<env var = "MP_SHARED_MEMORY" value="yes" />
<env var = "MP_TASK_AFFINITY" value="MCM" />
<env var = "MEMORY_AFFINITY" value="MCM" />
<env var = "HPM_STDOUT" value="0" />
<env var = "HPM_UNIQUE_FILE_NAME" value="1" />
<env var = "HPM_EVENT_SET" value="$IHPCT_HWC_GRP" />
</environment>
<command>llsubmit ibm_llsubmit.job</command>
</execute>
<!-- ******************************************************************************** -->
<execute cname="IBM-SP6-Jump">
<input files="$pdir/IBM-SP6-Jump/ibm_llsubmit.job.in" />
<substitute infile="ibm_llsubmit.job.in" outfile="ibm_llsubmit.job">
<sub from="#OUTDIR#" to="$outdir" />
<sub from="#STDOUTLOGFILE#" to="$stdoutlogfile" />
<sub from="#STDERRLOGFILE#" to="$stderrlogfile" />
<sub from="#CLASS#" to="$class" />
<sub from="#DATA_LIMIT#" to="0.75GB" />
<sub from="#STACK_LIMIT#" to="0.25GB" />
<sub from="#MEMORYPERTASK#" to="1.0GB" />
<sub from="#BENCHNAME#" to="$benchname" />
<sub from="#NODEUSAGE#" to="shared" />
<sub from="#TIME_LIMIT#" to="00:05:00" />
<sub from="#NODES#" to="$nodes" />
<sub from="#TASKS#" to="$tasks" />
<sub from="#TAFFINITY#" to="$taffinity" />
<sub from="#TASKSPERNODE#" to="$taskspernode" />
<sub from="#NOTIFICATION#" to="never" />
<sub from="#NOTIFY_EMAIL#" to="l.arnold@fz-juelich.de" />
<sub from="#THREADSPERTASK#" to="$threadspertask" />
<sub from="#EXECUTABLE#" to="$executable" />
<sub from="#ENV#" to="$env" />
<sub from="#PREPROCESS#" to="" />
<sub from="#POSTPROCESS#" to="echo 'JuBE: $COMMENT'" />
<sub from="#STARTER#" to="poe" />
<sub from="#ARGS_STARTER#" to="" />
<sub from="#MEASUREMENT#" to="time" />
<sub from="#ARGS_EXECUTABLE#" to="" />
</substitute>
<environment>
<env var = "MP_LABELIO" value="yes" />
<env var = "MP_INFOLEVEL" value="2" />
<env var = "MP_SHARED_MEMORY" value="yes" />
<env var = "MP_TASK_AFFINITY" value="MCM" />
<env var = "MEMORY_AFFINITY" value="MCM" />
<env var = "HPM_STDOUT" value="0" />
<env var = "HPM_UNIQUE_FILE_NAME" value="1" />
<env var = "HPM_EVENT_SET" value="$IHPCT_HWC_GRP" />
</environment>
<command>llsubmit ibm_llsubmit.job</command>
</execute>
<!-- ******************************************************************************** -->
<execute cname="Intel-Haswell-Cartesius">
<input files="$pdir/Intel-Haswell-Cartesius/intel_SLURMsubmit.job.in" />
<substitute infile="intel_SLURMsubmit.job.in" outfile="intel_SLURMsubmit.job">
<sub from="#BENCHNAME#" to="$benchname" />
<sub from="#OUTDIR#" to="$outdir" />
<sub from="#STDOUTLOGFILE#" to="$stdoutlogfile" />
<sub from="#STDERRLOGFILE#" to="$stderrlogfile" />
<sub from="#TIME_LIMIT#" to="00:10:00" />
<sub from="#PARTITION#" to="short" />
<sub from="#CONSTRAINT#" to="haswell" />
<sub from="#TASKS#" to="$tasks" />
<sub from="#TASKSPERNODE#" to="$taskspernode" />
<sub from="#NODES#" to="$nodes" />
<sub from="#THREADSPERTASK#" to="$threadspertask" />
<sub from="#EXECUTABLE#" to="${outdir}/qcd-submit.exe.tmp" />
<sub from="#ENV#" to="$env" />
<sub from="#PREPROCESS#" to="cp $executable ${outdir}/qcd-submit.exe.tmp" />
<sub from="#POSTPROCESS#" to="" />
<sub from="#STARTER#" to="srun" />
<sub from="#ARGS_STARTER#" to="" />
<sub from="#MEASUREMENT#" to="time" />
<sub from="#ARGS_EXECUTABLE#" to="" />
</substitute>
<environment>
</environment>
<command>sbatch intel_SLURMsubmit.job</command>
</execute>
<!-- ******************************************************************************** -->
<execute cname="Intel-Broadwell-Marconi">
<input files="$pdir/Intel-Broadwell-Marconi/intel_PBSsubmit.job.in" />
<substitute infile="intel_PBSsubmit.job.in" outfile="intel_PBSsubmit.job">
<sub from="#BENCHNAME#" to="$benchname" />
<sub from="#OUTDIR#" to="$outdir" />
<sub from="#STDOUTLOGFILE#" to="$stdoutlogfile" />
<sub from="#STDERRLOGFILE#" to="$stderrlogfile" />
<sub from="#TIME_LIMIT#" to="00:10:00" />
<sub from="#PARTITION#" to="short" />
<sub from="#CONSTRAINT#" to="broadwell" />
<sub from="#TASKS#" to="$tasks" />
<sub from="#TASKSPERNODE#" to="$taskspernode" />
<sub from="#NODES#" to="$nodes" />
<sub from="#THREADSPERTASK#" to="$threadspertask" />
<sub from="#EXECUTABLE#" to="${outdir}/qcd-submit.exe.tmp" />
<sub from="#ENV#" to="$env" />
<sub from="#PREPROCESS#" to="cp $executable ${outdir}/qcd-submit.exe.tmp" />
<sub from="#POSTPROCESS#" to="" />
<sub from="#STARTER#" to="mpirun" />
<sub from="#ARGS_STARTER#" to="" />
<sub from="#MEASUREMENT#" to="time" />
<sub from="#ARGS_EXECUTABLE#" to="" />
</substitute>
<environment>
</environment>
<command>qsub intel_PBSsubmit.job</command>
</execute>
</execution>
run 0
lattice 2 2 2 2
processes 1 1 1 1
boundary_conditions_fermions 1 1 1 -1
beta 5
kappa 0.13
csw 2.3327
h 0
hmc_test 0
hmc_model C
hmc_rho 0.1
hmc_trajectory_length 0.2
hmc_steps 10
hmc_accept_first 1
hmc_m_scale 3
start_configuration cold
start_random default
mc_steps 1
mc_total_steps 100
solver_rest 1e-99
solver_maxiter 50
solver_ignore_no_convergence 2
solver_mre_vectors 7
run 0
lattice #KA_LATTICE#
processes #KA_PROCESSES#
boundary_conditions_fermions 1 1 1 -1
beta 5
kappa 0.13
csw 2.3327
h 0
hmc_test 0
hmc_model C
hmc_rho 0.1
hmc_trajectory_length 0.2
hmc_steps 10
hmc_accept_first 1
hmc_m_scale 3
start_configuration cold
start_random default
mc_steps 1
mc_total_steps 100
solver_rest 1e-99
solver_maxiter #KA_MAXITER#
solver_ignore_no_convergence 2
solver_mre_vectors 7
nx #KB_NX#
ny #KB_NY#
nz #KB_NZ#
micro steps 4
n_measurement 1
n_correlation 10000
w_correlation 100000
n_save -1000
blocking levels 1
level 0 1
level 1 1
restart 0
n_iteration #KB_MAXITER#
n_thermal 0
seed 989357013
run status
iteration
time: gauge
time: higgs
time: rest