Commit b3d9d655 authored by Jacob Finkenrath's avatar Jacob Finkenrath
Browse files

Update Readme and scripts of Part2

parent 6371d555
......@@ -192,16 +192,20 @@ NVIDIA P100 GPU 5.60E+01
Prace 5IP - Results (see White Paper for more):
Irene KNL Irene SKL Juwels Marconi-KNL MareNostrum PizDaint Davide Frioul Deep Mont-Blanc 3
1 148,68 219,68 182,49 133,38 186,40 53,73 53.4c 151 656,41 206,17
2 79,35 114,22 91,83 186,14 94,63 32,38 113 86.9 432,93 93,48
4 48,07 58,11 46,58 287,17 47,22 19,13 21.4 52.7 277,67 49,95
8 28,42 32,09 25,37 533,49 25,86 12,78 14.8 36.5 189,83 25,19
16 17,08 14,35 11,77 1365,72 11,64 9,20 10.1 27.8 119,14 12,55
32 10,56 7,28 5,43 2441,29 5,59 6,35 6.94 15.6
64 9,01 4,18 2,65 -- 2,65 6,41 -- 11.7
128 5,08 -- 1,39 -- 2,48 5,95
256 -- -- 1,38 -- -- 5,84
512 -- -- 0,89 --
Irene KNL Irene SKL Juwels Marconi-KNL MareNostrum PizDaint Davide Frioul Deep Mont-Blanc 3
1 148,68 219,6 182,49 133,38 186,40 53,73 53.4 151 656,41 206,17
2 79,35 114,22 91,83 186,14 94,63 32,38 113 86.9 432,93 93,48
4 48,07 58,11 46,58 287,17 47,22 19,13 21.4 52.7 277,67 49,95
8 28,42 32,09 25,37 533,49 25,86 12,78 14.8 36.5 189,83 25,19
16 17,08 14,35 11,77 1365,72 11,64 9,20 10.1 17.8 119,14 12,55
32 10,56 7,28 5,43 2441,29 5,59 6,35 6.94 15.6
64 9,01 4,18 2,65 2,65 6,41 11.7
128 5,08 1,39 2,48 5,95
256 1,38 5,84
512 0,89
Results in [sec]
for V=8x64x64x64
......@@ -272,11 +272,9 @@ and for QPhix
git clone https://github.com/JeffersonLab/qphix
```
Note that the AVX512 instructions, which are needed for an optimal run on
KNLs, are not yet part of the main branch. The AVX512 instructions are available
in the avx512 branch ("git checkout avx512"). The provided
source file uses the avx512 branch (Status 01/2017).
Note that for running on Skylake chips it is recommended to use
the develop branch of QPhix, which needs additional packages
such as qdp++ (Status 04/2019).
#### 2.1 Compile
......@@ -313,8 +311,23 @@ or for KNL's
by using the previous variable `QMP_INSTALL_DIR` which links to the install-folder
of QMP. The executable `time_clov_noqdp` can be found now in the subfolder `./qphix/test`.
Note that the avx512 branch will compile an additional executable that has dependencies
on the package QDP (which will generate an error at the end of the compilation process).
Note that for the develop branch the package QDP++ has to be compiled.
QDP++ can be configured using (here for a Skylake chip):
``` shell
./configure --with-qmp=$QMP_INSTALL_DIR --enable-parallel-arch=parscalar CC=mpiicc CFLAGS="-xCORE-AVX512 -mtune=skylake-avx512 -std=c99" CXX=mpiicpc CXXFLAGS="-axCORE-AVX512 -mtune=skylake-avx512 -std=c++14 -qopenmp" --enable-openmp --host=x86_64-linux-gnu --build=none-none-none --prefix=$QDPXX_INSTALL_DIR
```
Now QPhix executable can be compiled by using:
``` shell
cmake -DQDPXX_DIR=$QDP_INSTALL_DIR -DQMP_DIR=$QMP_INSTALL_DIR -Disa=avx512 -Dparallel_arch=parscalar -Dhost_cxx=mpiicpc -Dhost_cxxflags="-std=c++17 -O3 -axCORE-AVX512 -mtune=skylake-avx512" -Dtm_clover=ON -Dtwisted_mass=ON -Dtesting=ON -DCMAKE_CXX_COMPILER=mpiicpc -DCMAKE_CXX_FLAGS="-std=c++17 -O3 -axCORE-AVX512 -mtune=skylake-avx512" -DCMAKE_C_COMPILER=mpiicc -DCMAKE_C_FLAGS="-std=c99 -O3 -axCORE-AVX512 -mtune=skylake-avx512" ..
```
The executable `time_clov_noqdp` can be found now in the subfolder `./qphix/test`.
##### 2.1.1 Example compilation on PRACE machines
......
......@@ -243,7 +243,7 @@ KNCs GFLOPS
Results from PRACE 5IP (see White paper for more details)
Results in GFLOP/s for V=96x32x32x32
Nodes Irene SKL Juwels Marconi-KNL MareNostrum PizDaint Davide Frioul Deep Mont-Blanc 3
1 134,382 132,26 101,815 142,336 387,659 392,763 184,729 41,7832 99,6378
2 240,853 245,599 145,608 263,355 755,308 773,901 269,705 40,7721 214,549
......@@ -256,6 +256,23 @@ Nodes Irene SKL Juwels Marconi-KNL MareNostrum PizDaint Davide
256 -- 3130,42
512 -- 3421,25
Qphix Qphix Qphix Qphix QUDA QUDA Qphix Qphix Grid
Skylake Skylake KNL Skylake P100 P100 KNL Xeons ARM
Skylake Skylake KNL Skylake P100 P100 KNL Xeons ARM
Results in GFLOP/s
\ No newline at end of file
Results in GFLOP/s for V=128x64x64x64
Node Irene SKL Juwels Marconi-KNL MareNostrum PizDaint
1 141,306 134,972 64,2657 144,32
2 267,278 263,636 153,008 280,68
4 503,041 496,465 420,936 514,956
8 922,187 954,659 783,39 930,95 2694
16 1607,92 1787,43 1109,95 1778,23 5731,56
32 3088,02 3289,02 1486,79 2635,74 7779,29
64 4787,89 5952,8 1087,01 5264,16 10607,2
128 5750,35 10315,3 601,615 7998,56 13560,5
256 15370,9 18177,2
512 26972,6
Qphix Qphix Qphix QPhix QUDA
Skylake Skylake KNL Skylake P100
##
##
##
##
##
#!/bin/bash
##
## prepare_submit_job.sh — generate a batch submit script from
## submit_job.sh.template by replacing its #KEY# placeholders with the
## values given on the command line, writing the result to $scr.
##
## Usage: ./prepare_submit_job.sh time Node n g openmp cpuptask perm scr \
##                                pt pz py px exe name lx lz ly lt prec
##
main() {
  local time=$1       ## wall-clock limit            -> #TIME#
  local Node=$2       ## number of nodes             -> #NODES#
  local n=$3          ## total number of MPI tasks   -> #NTASK#
  local g=$4          ## MPI tasks per node          -> #NTASKPERNODE#
  local openmp=$5     ## OpenMP threads per task     -> #OPENMP#
  local cpuptask=$6   ## CPUs per MPI task           -> #CPUSPERTASK#
  local perm=$7       ## 1 -> chmod +x the generated script
  local scr=$8        ## name of the generated submit script
  local pt=${9}  pz=${10} py=${11} px=${12}   ## MPI task grid -> #PT#..#PX#
  local exe=${13}     ## benchmark executable        -> #EXE#
  local name=${14}    ## job/output name             -> #NAME#
  local lx=${15} lz=${16} ly=${17} lt=${18}   ## global lattice -> #LX#..#LT#
  local prec=${19}    ## precision flag              -> #PREC#

  local wrc
  wrc=$(pwd)
  echo "$wrc"

  ## Substitute all markers in a single sed pass instead of 18 passes
  ## through intermediate files.  '|' is used as the s-command delimiter
  ## so that values containing '/' (paths such as $wrc or $exe) do not
  ## break the expressions; all expansions are quoted so values with
  ## spaces survive as well.
  sed -e "s|#NODES#|${Node}|g" \
      -e "s|#NTASK#|${n}|g" \
      -e "s|#NTASKPERNODE#|${g}|g" \
      -e "s|#OPENMP#|${openmp}|g" \
      -e "s|#CPUSPERTASK#|${cpuptask}|g" \
      -e "s|#WRC#|${wrc}|g" \
      -e "s|#PT#|${pt}|g" \
      -e "s|#PZ#|${pz}|g" \
      -e "s|#PY#|${py}|g" \
      -e "s|#PX#|${px}|g" \
      -e "s|#EXE#|${exe}|g" \
      -e "s|#NAME#|${name}|g" \
      -e "s|#LT#|${lt}|g" \
      -e "s|#LZ#|${lz}|g" \
      -e "s|#LY#|${ly}|g" \
      -e "s|#LX#|${lx}|g" \
      -e "s|#PREC#|${prec}|g" \
      -e "s|#TIME#|${time}|g" \
      submit_job.sh.template > "$scr"

  if [ "$perm" -eq 1 ]; then
    chmod +x "$scr"
  fi
}
main "$@"
#!/bin/bash
##
## RUN - Strong scaling
##
## Before starting this job-script replace "SUBMIT" with the submission
## command of the local queuing system.  Additionally, in the script
## submit_job the execution command has to be adjusted to the local machine.
##
## Script for a parallelization of 2 4 8 16 32 64 KNLs
##
## Benchmark executable; can be overridden from the environment.
EXE=${EXE:-/ccc/cont005/home/unicy/finkenrj/run/qphix/time_clov_noqdp}
## Set scaling-mode: Strong, OneNode or Weak
sca_mode="Strong"
#sca_mode="OneNode"
#sca_mode="Weak"
## "Analysis" scans existing job output files; "Run" prepares and submits jobs.
mode="Analysis"
##mode="Run"
## sbatch_on=1
exe_perm=1   ## use chmod to allow execution of submit_job_Nx_Gx.sh
g=8          ## MPI tasks per node
openmp=6     ## OpenMP threads per task
cpuptask=6   ## Total number of CPUs / MPI tasks per node (= openmp if hyperthreaded cores are used, = 2*openmp if hyperthreading is enabled but not used)
## lattice size (size strong 1)
gx=32
gy=32
gz=32
gt=96
## lattice size (size strong 2)
#gx=8
#gy=4
#gz=4
#gt=8
## lattice size (size weak 1)
#gx=48
#gt=24
## use smaller lattice size of weak scaling mode: like gx=24 gt=24
##
#gy=$gx
#gz=$gx
## global lattice extents actually used below
lt=$gt
lx=$gx
ly=$gy
lz=$gz
# for gpus_per_node in 1 2; do
cards_per_node=1
# for n in 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072 262144 524288 1048576; do
for n in 8 16 32 64 128 256 512 1024 2048 4096 8192 16384; do
  for p in "s" "d"; do
    ## map the precision label used in file names to the executable's flag
    case $p in
      "s") prec="f" ;;
      "d") prec="d" ;;
      "h") prec="h" ;;
    esac
    ## MPI task grid (pt,pz,py,px) for the given total task count n;
    ## the default places all tasks along the t direction.
    px=1
    py=1
    pz=1
    pt=$n
    case $n in
      16)      pz=2  pt=8 ;;
      32)      pz=4  pt=8 ;;
      64)      py=2  pz=4  pt=8 ;;
      128)     py=2  pz=8  pt=8 ;;
      256)     py=4  pz=8  pt=8 ;;
      512)     px=2  py=4  pz=8  pt=8 ;;
      1024)    px=4  py=4  pz=8  pt=8 ;;
      2048)    px=8  py=4  pz=8  pt=8 ;;
      4096)    px=8  py=8  pz=8  pt=8 ;;
      8192)    px=8  py=8  pz=8  pt=16 ;;
      16384)   px=8  py=8  pz=16 pt=16 ;;
      32768)   px=8  py=16 pz=16 pt=16 ;;
      65536)   px=16 py=16 pz=16 pt=16 ;;
      131072)  px=16 py=16 pz=16 pt=32 ;;
      262144)  px=16 py=16 pz=32 pt=32 ;;
      524288)  px=16 py=32 pz=32 pt=32 ;;
      1048576) px=32 py=32 pz=32 pt=32 ;;
    esac
    nt=$n
    if [ "$sca_mode" = "Strong" ]; then
      ## local (per-task) lattice extents; global size stays fixed
      lt1=$((gt/pt))
      lx1=$((gx/px))
      ly1=$((gy/py))
      lz1=$((gz/pz))
    elif [ "$sca_mode" = "OneNode" ]; then
      ## grow the global lattice with the task grid, but run on a single
      ## node (order matters: lt1 uses pt before it is overwritten below)
      lx1=$((gx*px))
      ly1=$((gy*py))
      lz1=$((gz*pz))
      lt1=$((gt*pt/g))
      nt=$g
      lx=$((gx*px))
      ly=$((gy*py))
      lz=$((gz*pz))
      lt=$((gt*pt))
      px=1
      py=1
      pz=1
      pt=$g
    else
      ## Weak scaling: local size fixed, global lattice grows with the grid
      lt1=$lt
      lx1=$lx
      ly1=$ly
      lz1=$lz
      lt=$((gt*pt))
      lx=$((gx*px))
      ly=$((gy*py))
      lz=$((gz*pz))
    fi
    Node=$((nt/g))
    name=${sca_mode}_qphix_${pt}x${pz}x${py}x${px}_${lt}x${lz}x${ly}x${lx}_${p}
    if [ "$mode" != "Analysis" ]; then
      ## prepare a submit script for this run and hand it to the queue
      echo "$name"
      submitscript=submit_job_N${nt}_${p}.sh
      ./prepare_submit_job.sh '00:10:00' "$Node" "$nt" "$g" "$openmp" "$cpuptask" "$exe_perm" "$submitscript" "$pt" "$pz" "$py" "$px" "$EXE" "$name" "$lx" "$lz" "$ly" "$lt" "$prec"
      ccc_msub ./"$submitscript"
      sleep 1
    else
      ## scan the job output file and collect the timings per precision
      ## in Sca_<p>.log (missing output files just add no timing line)
      echo "$name" >> "Sca_${p}.log"
      grep -A 1 "Time" "$name" >> "Sca_${p}.log"
    fi
  done
done
##
## RUN - Strong -scaling
##
## Before starting this job-script replace "SUBMIT" with the submission command of the local queuing system.
## Additional in the script submit_job the execution command has to be adjusted to the local machine.
##
##
## Script for a parallelization of 2 4 8 16 32 64 KNLs
##
#!/bin/bash
EXE=/ccc/cont005/home/unicy/finkenrj/run/qphix/time_clov_noqdp
## Set scaling-mode: Strong or Weak
sca_mode="Strong"
#sca_mode="OneNode"
#sca_mode="Weak"
## mode="Analysis"
mode="Run"
## sbatch_on=1
exe_perm=1 ## use chmod to allow execution of submit_job_Nx_Gx.sh
g=8 ##MPItaskperNODE
openmp=6 ##OMP
cpuptask=6 ## Total number of CPUS / MpitaskperNODE (= openmp if Hyperthreaded Cores are used, = 2* openmp if Hyperthreading is enabled but not used )
## lattice size (size strong 1)
gx=96
gy=96
gz=96
gt=192
## lattice size (size strong 2)
#gx=8
#gy=4
#gz=4
#gt=8
## lattice size (size weak 1)
#gx=48
#gt=24
## use smaller lattice size of weak scaling mode: like gx=24 gt=24
##
#gy=$gx
#gz=$gx
lt=$gt
lx=$gx
ly=$gy
lz=$gz
# for gpus_per_node in 1 2; do
cards_per_node=1
#n=1
# for n in 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072 262144 524288 1048576; do
for n in 8 16 32 64 128 256 512 1024 2048 4096 8192 16384; do
# for n in 8 16 32 64 128 256 512 1024 2048 4096 8192 16384; do
# for n in 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384; do
# for n in 8; do
for p in "s" "d" ; do
# p="d"
case $p in
"s" )
prec="f"
;;
"d" )
prec="d"
;;
"h" )
prec="h"
;;
esac
px=1
py=1
pz=1
pt=$n
if [ $n -eq 16 ];then
pz=2
pt=8
fi
if [ $n -eq 32 ];then
pz=4
pt=8
fi
if [ $n -eq 64 ];then
py=2
pz=4
pt=8
fi
if [ $n -eq 128 ];then
py=2
pz=8
pt=8
fi
if [ $n -eq 256 ];then
py=4
pz=8
pt=8
fi
if [ $n -eq 512 ];then
px=2
py=4
pz=8
pt=8
fi
if [ $n -eq 1024 ];then
px=4
py=4
pz=8
pt=8
fi
if [ $n -eq 2048 ];then
px=8
py=4
pz=8
pt=8
fi
if [ $n -eq 4096 ];then
px=8
py=8
pz=8
pt=8
fi
if [ $n -eq 8192 ];then
px=8
py=8
pz=8
pt=16
fi
if [ $n -eq 16384 ];then
px=8
py=8
pz=16
pt=16
fi
if [ $n -eq 32768 ];then
px=8
py=16
pz=16
pt=16
fi
if [ $n -eq 65536 ];then
px=16
py=16
pz=16
pt=16
fi
if [ $n -eq 131072 ];then
px=16
py=16
pz=16
pt=32
fi
if [ $n -eq 262144 ];then
px=16
py=16
pz=32
pt=32
fi
if [ $n -eq 524288 ];then
px=16
py=32
pz=32
pt=32
fi
if [ $n -eq 1048576 ];then
px=32
py=32
pz=32
pt=32
fi
nt=$n
if [ $sca_mode = "Strong" ];then
lt1=$((gt/pt))
lx1=$((gx/px))
ly1=$((gy/py))
lz1=$((gz/pz))
elif [ $sca_mode = "OneNode" ]; then
lx1=$((gx*px))
ly1=$((gy*py))
lz1=$((gz*pz))
lt1=$((gt*pt/g))
nt=$g
lx=$((gx*px))
ly=$((gy*py))
lz=$((gz*pz))
lt=$((gt*pt))
px=1
py=1
pz=1
pt=$g
else
lt1=$lt
lx1=$lx
ly1=$ly
lz1=$lz