Commit 98f03664 authored by Cedric Jourdain's avatar Cedric Jourdain
Browse files

Merge branch 'r2.1-dev' of https://repository.prace-ri.eu/git/UEABS/ueabs into r2.1-dev

parents 9ff2bbcb 42482007
# Quantum Espresso v6.3 on ARM v8
## Benchmark system
CARMEN (CINECA) 8-node ARM v8 cluster, 2x32 cores + 256G RAM/node.
For more details:
[Cineca documentation](https://wiki.u-gov.it/confluence/pages/viewpage.action?spaceKey=SCAIIN&title=ARM+Demo+@+CINECA)
## Installation
Installed with ARM v19 compilers (flang and clang) and the ARM performance
library. This provides threaded BLAS, LAPACK and FFTW libraries. You will need to modify the make.inc file to make use of these.
Remember also to include the following flags:
```bash
-mcpu=native -armpl
```
Currently the OpenMP version (built with the ```-fopenmp``` flag) does not compile (internal compiler error).
## Execution
- Because of fairly long execution times and limited number of nodes only AUSURF has been tested.
- A run-time error causes QE to crash just after the final iteration, probably due to the FoX XML library (an issue has been raised). Walltimes are therefore estimated from the time of the final iteration.
See job files for example execution.
## Profiling
Use the ARM MAP profiler. See example job scripts.
#
# Environment set-up and configure step for building QE with the ARM
# compilers and an own-compiled OpenMPI.
# system supplied modules
module load autoload armcompiler/19.0--binary
# the following is the ARM performance library including BLAS, LAPACK + FFTW
module load armpl-for-armcompiler/19.0.0--binary
# own-compiled OpenMPI 4.0.0 (for ARM compilers)
# $HOME/modules is added to the module search path so that the private
# openmpi module below can be found
module use $HOME/modules
module load openmpi/openmpi-4.0.0_Arm
# QE configure
# CC and FC are set for this single command only and select the MPI
# compiler wrappers
CC=mpicc FC=mpifort ./configure
# you need also to modify the make.inc since configure does not set it correctly
#!/bin/bash
#SBATCH --tasks-per-node=64
#SBATCH -N 1
#SBATCH -A cin_staff
#SBATCH -t 3:00:00

# AUSURF run on a single node with 64 MPI tasks.

# Toolchain: ARM performance library, ARM compilers, ARM Forge and the
# privately built OpenMPI (found through $HOME/modules).
module load autoload armpl-for-armcompiler/19.0.0--binary
module load armcompiler/19.0--binary
module load arm-forge
module use ${HOME}/modules
module load openmpi/openmpi-4.0.0_Arm

# Wall-clock time is measured around the mpirun call, in whole seconds
# since the epoch.
t_begin=$(date +%s)
mpirun -np 64 ${HOME}/qe-6.3-fft/bin/pw.x -npool 2 -ndiag 16 -input ausurf.in
t_finish=$(date +%s)

echo "walltime $(( t_finish - t_begin ))"
#!/bin/bash
#SBATCH --tasks-per-node=64
#SBATCH -N 1
#SBATCH -A cin_staff
#SBATCH -t 3:00:00

# AUSURF run on a single node with 64 MPI tasks, launched under the
# ARM MAP profiler.

# Toolchain: ARM performance library, ARM compilers, ARM Forge (provides
# map) and the privately built OpenMPI (found through $HOME/modules).
module load autoload armpl-for-armcompiler/19.0.0--binary
module load armcompiler/19.0--binary
module load arm-forge
module use ${HOME}/modules
module load openmpi/openmpi-4.0.0_Arm

# Wall-clock time is measured around the profiled run, in whole seconds
# since the epoch.
t_begin=$(date +%s)
# Alternative invocation restricting the profiled window (kept for reference):
#map --verbose --start-after 60 --stop-after 600 --profile mpirun -np 64 $HOME/qe-6.3-fft/bin/pw.x -npool 2 -input ausurf.in
map --verbose --profile mpirun -np 64 ${HOME}/qe-6.3-fft/bin/pw.x -npool 2 -ndiag 16 -input ausurf.in
t_finish=$(date +%s)

echo "walltime $(( t_finish - t_begin ))"
# make.inc. Generated from make.inc.in by configure.
# compilation rules
# The first .SUFFIXES line clears the suffix list; the second registers only
# the suffixes used by the rules below.
.SUFFIXES :
.SUFFIXES : .o .c .f .f90
# most fortran compilers can directly preprocess c-like directives: use
# $(MPIF90) $(F90FLAGS) -c $<
# if explicit preprocessing by the C preprocessor is needed, use:
# $(CPP) $(CPPFLAGS) $< -o $*.F90
# $(MPIF90) $(F90FLAGS) -c $*.F90 -o $*.o
# remember the tabulator in the first column !!!
# .f90.o: preprocess the source with $(CPP) into <stem>_tmp.f90, then compile
# that file into <stem>.o. The two commands are joined with && so that a
# failed preprocessing step stops the rule instead of compiling a stale or
# partial <stem>_tmp.f90. Every recipe line must start with a TAB.
.f90.o:
	$(CPP) $(CPPFLAGS) $< -o $(*)_tmp.f90 && \
	$(MPIF90) $(F90FLAGS) -c $(*)_tmp.f90 -o $(*).o
# .f.o and .c.o: do not modify
.f.o:
	$(F77) $(FFLAGS) -c $<
.c.o:
	$(CC) $(CFLAGS) -c $<
# Top QE directory, useful for locating libraries, linking QE with plugins
# The following syntax should always point to TOPDIR:
# (it picks the entry of MAKEFILE_LIST that ends in make.inc, makes it an
# absolute path and keeps its directory part)
TOPDIR = $(dir $(abspath $(filter %make.inc,$(MAKEFILE_LIST))))
# if it doesn't work, uncomment the following line (edit if needed):
# TOPDIR = /home/userinternal/aemerson/qe-6.3
# DFLAGS = precompilation options (possible arguments to -D and -U)
# used by the C compiler and preprocessor
# To use libxc (v>=3.0.1), add -D__LIBXC to DFLAGS
# See include/defs.h.README for a list of options and their meaning
# With the exception of IBM xlf, FDFLAGS = $(DFLAGS)
# For IBM xlf, FDFLAGS is the same as DFLAGS with separating commas
# MANUAL_DFLAGS = additional precompilation option(s), if desired
# BEWARE: it does not work for IBM xlf! Manually edit FDFLAGS
MANUAL_DFLAGS =
# MPI build using the ARM library macro; no other precompilation options are set
DFLAGS = -D__MPI -D__ARM_LIB
# Fortran precompilation flags: DFLAGS followed by any manual additions
FDFLAGS = $(DFLAGS) $(MANUAL_DFLAGS)
# IFLAGS = how to locate directories with *.h or *.f90 file to be included
# typically -I$(TOPDIR)/include -I/some/other/directory/
# the latter contains .e.g. files needed by FFT libraries
# for libxc add -I/path/to/libxc/include/
IFLAGS = -I$(TOPDIR)/include -I$(TOPDIR)/FoX/finclude -I$(TOPDIR)/S3DE/iotk/include/
# MOD_FLAG = flag used by f90 compiler to locate modules
MOD_FLAG = -I
# BASEMOD_FLAGS points to directories containing basic modules,
# while BASEMODS points to the corresponding module libraries
# Each Makefile can add directories to MODFLAGS and libraries to QEMODS
# (one $(MOD_FLAG)<dir> entry per module directory under TOPDIR)
BASEMOD_FLAGS= $(MOD_FLAG)$(TOPDIR)/iotk/src \
$(MOD_FLAG)$(TOPDIR)/Modules \
$(MOD_FLAG)$(TOPDIR)/FFTXlib \
$(MOD_FLAG)$(TOPDIR)/LAXlib \
$(MOD_FLAG)$(TOPDIR)/UtilXlib \
$(MOD_FLAG)$(TOPDIR)/FoX/finclude
# Compilers: fortran-90, fortran-77, C
# If a parallel compilation is desired, MPIF90 should be a fortran-90
# compiler that produces executables for parallel execution using MPI
# (such as for instance mpif90, mpf90, mpxlf90,...);
# otherwise, an ordinary fortran-90 compiler (f90, g95, xlf90, ifort,...)
# If you have a parallel machine but no suitable candidate for MPIF90,
# try to specify the directory containing "mpif.h" in IFLAGS
# and to specify the location of MPI libraries in MPI_LIBS
# All four compiler variables below are set to MPI compiler wrappers.
MPIF90 = mpif90
F90 = mpifort
CC = mpicc
F77 = mpifort
# The GPU/CUDA settings below are left empty in this file.
# GPU architecture (Kepler: 35, Pascal: 60, Volta: 70 )
GPU_ARCH=
# CUDA runtime (Pascal: 8.0, Volta: 9.0)
CUDA_RUNTIME=
# CUDA F90 Flags
CUDA_F90FLAGS=
# C preprocessor and preprocessing flags - for explicit preprocessing,
# if needed (see the compilation rules above)
# preprocessing flags must include DFLAGS and IFLAGS
CPP = cpp
CPPFLAGS = -P -traditional $(DFLAGS) $(IFLAGS)
# compiler flags: C, F90, F77
# C flags must include DFLAGS and IFLAGS
# F90 flags must include MODFLAGS, IFLAGS, and FDFLAGS with appropriate syntax
# NOTE(review): -fopenmp is enabled in CFLAGS, FFLAGS and LDFLAGS - confirm
# this is intended for the build being benchmarked.
CFLAGS = -O3 -mcpu=native -armpl -fopenmp $(DFLAGS) $(IFLAGS)
# F90FLAGS refers to FFLAGS, which is assigned on the next line; this works
# because "=" assignments are expanded when used, not when defined.
F90FLAGS = $(FFLAGS) $(FDFLAGS) $(CUDA_F90FLAGS) $(IFLAGS) $(MODFLAGS)
FFLAGS = -O2 -g -mcpu=native -armpl -fopenmp
# compiler flags without optimization for fortran-77
# the latter is NEEDED to properly compile dlamch.f, used by lapack
FFLAGS_NOOPT = -O0
# compiler flag needed by some compilers when the main program is not fortran
# Currently used for Yambo
FFLAGS_NOMAIN =
# Linker, linker-specific flags (if any)
# Typically LD coincides with F90 or MPIF90, LD_LIBS is empty
# for libxc, set LD_LIBS=-L/path/to/libxc/lib/ -lxcf90 -lxc
LD = mpif90
LDFLAGS = -mcpu=native -armpl -fopenmp
LD_LIBS =
# External Libraries (if any) : blas, lapack, fft, MPI
# BLAS, LAPACK and FFT are all taken from the same static ARM Performance
# Library archive, so its location is defined once here.
# Edit ARMPL_DIR if the library is installed elsewhere.
ARMPL_DIR = /cineca/prod/opt/tools/arm-compiler-for-hpc/19.0/none/opt/arm/armpl-19.0.0_ThunderX2CN99_RHEL-7_arm-hpc-compiler_19.0_aarch64-linux
ARMPL_LIB = $(ARMPL_DIR)/lib/libarmpl_lp64.a
# If you have nothing better, use the local copy via "--with-netlib" :
# BLAS_LIBS = /your/path/to/espresso/LAPACK/blas.a
# BLAS_LIBS_SWITCH = internal
BLAS_LIBS = $(ARMPL_LIB)
BLAS_LIBS_SWITCH = external
# If you have nothing better, use the local copy via "--with-netlib" :
# LAPACK_LIBS = /your/path/to/espresso/LAPACK/lapack.a
# LAPACK_LIBS_SWITCH = internal
# For IBM machines with essl (-D__ESSL): load essl BEFORE lapack !
# remember that LAPACK_LIBS precedes BLAS_LIBS in loading order
LAPACK_LIBS = $(ARMPL_LIB)
LAPACK_LIBS_SWITCH = external
SCALAPACK_LIBS =
# nothing needed here if the internal copy of FFTW is compiled
# (needs -D__FFTW in DFLAGS)
FFT_LIBS = $(ARMPL_LIB)
# HDF5
# (left empty in this file)
HDF5_LIB =
# FoX libraries, taken from the copy under $(TOPDIR)/FoX/lib
FOX_LIB = -L$(TOPDIR)/FoX/lib -lFoX_dom -lFoX_sax -lFoX_wxml -lFoX_common\
-lFoX_utils -lFoX_fsys
FOX_FLAGS =
# For parallel execution, the correct path to MPI libraries must
# be specified in MPI_LIBS (except for IBM if you use mpxlf)
# (left empty here; the compilers above are MPI wrappers)
MPI_LIBS =
# IBM-specific: MASS libraries, if available and if -D__MASS is defined in FDFLAGS
MASS_LIBS =
# CUDA libraries
CUDA_LIBS=
CUDA_EXTLIBS =
# ar command and flags - for most architectures: AR = ar, ARFLAGS = ruv
AR = ar
ARFLAGS = ruv
# ranlib command. If ranlib is not needed (it isn't in most cases) use
# RANLIB = echo
RANLIB = ranlib
# all internal and external libraries - do not modify
FLIB_TARGETS = all
LIBOBJS = $(TOPDIR)/clib/clib.a $(TOPDIR)/iotk/src/libiotk.a
# Link order of the external libraries; LAPACK_LIBS comes before BLAS_LIBS
LIBS = $(CUDA_LIBS) $(SCALAPACK_LIBS) $(LAPACK_LIBS) $(FOX_LIB) $(FFT_LIBS) $(BLAS_LIBS) $(MPI_LIBS) $(MASS_LIBS) $(HDF5_LIB) $(LD_LIBS)
# wget or curl - useful to download from network
WGET = wget -O
# Install directory - not currently used
PREFIX = /usr/local
#!/bin/bash
# script for DAVIDE 16 cores/node
# - 13 nodes, 4 tasks/node, 4 OMP threads/task
# Below <account> represents the budget
#SBATCH -N13
#SBATCH --gres=gpu:4
#SBATCH -A <account>
#SBATCH --tasks-per-node=4
#SBATCH -p dvd_usr_prod
#SBATCH -t 1:00:00
export OMP_NUM_THREADS=4
# 13 nodes x 4 tasks/node = 52 MPI tasks.
# srun takes the task count with -n; "-np" is mpirun syntax, which srun
# reads as "-n p" and rejects as an invalid task count.
srun -v -n 52 ./pw.x -input ./Ta2O5.in -npool 26
# Quantum Espresso on the Juwels system (JSC)
## Notes
- before using ```configure``` perform the following command
```bash
unset ARCH
```
otherwise configure will not correctly detect that Juwels is a Linux system
- On Juwels both Intel and Parastation MPI libraries are available. Parastation may be
more stable
#!/bin/bash
#SBATCH --nodes=50
#SBATCH --ntasks=650
#SBATCH --ntasks-per-node=13
#SBATCH --cpus-per-task=4
#SBATCH --output=ta205-out.%j
#SBATCH --error=ta205-err.%j
#SBATCH --mem=90GB
#SBATCH --time=01:30:00
#SBATCH --partition=batch
#SBATCH --account=prpb74

# Ta2O5 run: 50 nodes x 13 tasks/node = 650 MPI tasks, 4 OpenMP threads each.

# Intel compiler, Intel MPI and MKL; the commented line is the alternative
# ParaStation-based toolchain.
module load Intel IntelMPI imkl
#module load intel-para/2018b-mt

# Location of the QE 6.3 installation inside the project directory.
QE_HOME=${PROJECT_cprpb74}/prpb7400/q-e-qe-6.3
pw_exe=${QE_HOME}/bin/pw.x

export OMP_NUM_THREADS=4

srun -n 650 ${pw_exe} -npool 26 -input Ta2O5-2x2xz-552.in
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=16
#SBATCH --ntasks-per-node=16
#SBATCH --output=ausurf-out.%j
#SBATCH --error=ausurf-err.%j
#SBATCH --time=00:30:00
#SBATCH --partition=batch
#SBATCH --account=prpb74

# AUSURF run: one node, 16 MPI tasks, 2 k-point pools and 4 tasks for the
# diagonalisation.

# Intel compiler, Intel MPI and MKL.
module load Intel IntelMPI imkl

# Location of the QE 6.3 installation inside the project directory.
QE_HOME=${PROJECT_cprpb74}/prpb7400/q-e-qe-6.3
pw_exe=${QE_HOME}/bin/pw.x

srun -n 16 ${pw_exe} -npool 2 -ndiag 4 -input ausurf.in
#!/bin/bash
# Environment set-up and configure step for building QE on JUWELS.
# Alternative toolchain (Intel MPI), kept for reference:
#module load Intel IntelMPI imkl
# the parastation MPI seems to work better than Intel MPI
module load intel-para/2018b-mt
# the next line is important because otherwise the QE configure does not
# recognise JUWELS as a Linux system
unset ARCH
# Configure with the Intel compilers, OpenMP enabled and Intel ScaLAPACK;
# CC, FC and MPIF90 are set for this single command only.
CC=icc FC=ifort MPIF90=mpiifort ./configure --enable-openmp --with-scalapack=intel
#!/bin/bash
# Batch file for using the Intel APS trace facility
# on Marconi KNL
#
#SBATCH -N 4
#SBATCH --ntasks-per-node=64
#SBATCH --error=ta205-err.%j
#SBATCH --mem=80GB
#SBATCH --time=00:30:00
#SBATCH -A cin_staff
#SBATCH -p knl_usr_prod

# The timer starts before the environment is set up, so the reported
# walltime covers the module loads as well as the run itself.
t_begin=$(date +%s)

# Start from a clean module environment, then load the KNL profile and QE.
module purge
module load profile/knl
module load autoload qe/6.3_knl

# MPI-only run: one OpenMP thread per task.
export OMP_NUM_THREADS=1

# Make the aps command available in this shell.
. ${INTEL_HOME}/performance_snapshots/apsvars.sh

srun --cpu-bind=cores aps pw.x -npool 2 -ndiag 32 -input pw.in

t_finish=$(date +%s)
echo "walltime $(( t_finish - t_begin ))"
# PizDaint (CSCS, Switzerland)
#!/bin/bash
#
# QuantumESPRESSO on Piz Daint: 4 nodes, 12 MPI tasks per node,
# 2 OpenMP threads per task using hyperthreading (--ntasks-per-core=2)
# (node count as requested by --nodes below)
#
#SBATCH --job-name=espresso
#SBATCH --time=01:00:00
#SBATCH --nodes=4
#SBATCH --ntasks-per-core=2
#SBATCH --ntasks-per-node=12
#SBATCH --cpus-per-task=2
#SBATCH --constraint=gpu
# One OpenMP thread per CPU allocated to each task (follows --cpus-per-task)
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
# Remove the stack size limit for this shell and the processes it starts
ulimit -s unlimited
# No task count is given: srun uses the allocation requested above
srun pw.x -npool 2 -in input.in
#!/bin/bash
# Load the following modules
#module swap PrgEnv-cray PrgEnv-pgi
# module load intel cudatoolkit
# and then install with the make.inc in the QE-GPU distribution
# Quantum Espresso in the United European Applications Benchmark Suite (UEABS)
## Document Author: A. Emerson (a.emerson@cineca.it) , Cineca.
## Introduction
Quantum Espresso is an integrated suite of Open-Source computer codes for electronic-structure calculations and materials modeling at the nanoscale. It is based on density-functional theory, plane waves, and pseudopotentials.
Full documentation is available from the project website [QuantumEspresso](https://www.quantum-espresso.org/).
In this README we give information relevant for its use in the UEABS.
### Standard CPU version
For the UEABS activity we have used mainly version v6.0 but later versions are now available.
### GPU version
The GPU port of Quantum Espresso is a version of the program which has been
completely re-written in CUDA FORTRAN by Filippo Spiga. The version of the program used in these
experiments is v6.0, even though further versions became available later during the
activity.
## Installation and requirements
### Standard
The Quantum Espresso source can be downloaded from the project's GitHub repository, [QE](https://github.com/QEF/q-e/tags). Requirements can be found from the website but you will need a good FORTRAN and C compiler with an MPI library and optionally (but highly recommended) an optimised linear algebra library.
### GPU version
For complete build requirements and information see the following GitHub site:
[QE-GPU](https://github.com/fspiga/qe-gpu)
A short summary is given below:
Essential
* The PGI compiler version 17.4 or above.
* You need NVIDIA TESLA GPUS such as Kepler (K20, K40, K80) or Pascal (P100) or Volta (V100).
No other cards are supported. NVIDIA TESLA P100 and V100 are strongly recommended
for their on-board memory capacity and double precision performance.
Optional
* A parallel linear algebra library such as Scalapack, Intel MKL or IBM ESSL. If
none is available on your system then the installation can use a version supplied
with the distribution.
## Downloading the software
### Standard
From the website, for example:
```bash
wget https://github.com/QEF/q-e/releases/download/qe-6.3/qe-6.3.tar.gz
```
### GPU
Available from the web site given above. You can use, for example, ```git clone```
to download the software:
```bash
git clone https://github.com/fspiga/qe-gpu.git
```
## Compiling and installing the application
### Standard installation
Installation is achieved by the usual ```configure, make, make install ``` procedure.
However, it is recommended that the user checks the __make.inc__ file created by this procedure before performing the make.
For example, using the Intel compilers,
```bash
module load intel intelmpi
CC=icc FC=ifort MPIF90=mpiifort ./configure --enable-openmp --with-scalapack=intel
```
Assuming the __make.inc__ file is acceptable, the user can then do:
```bash
make; make install
```
### GPU
Check the __README.md__ file in the downloaded files since the
procedure varies from distribution to distribution.
Most distributions do not have a ```configure``` command. Instead you copy a __make.inc__
file from the __install__ directory, and modify that directly before running make.
A number of templates are available in the distribution:
- make.inc_x86-64
- make.inc_CRAY_PizDaint
- make.inc_POWER_DAVIDE
- make.inc_POWER_SUMMITDEV
The second and third are particularly relevant in the PRACE infrastructure (ie. for CSCS
PizDaint and CINECA DAVIDE).
Run __make__ to see the options available. For the UEABS you should select the
pw program (the only module currently available)
```
make pw
```
The QE-GPU executable will appear in the directory `GPU/PW` and is called `pw-gpu.x`.
## Running the program - general procedure
Of course you need some input before you can run calculations. The
input files are of two types:
1. A control file usually called `pw.in`
2. One or more pseudopotential files with extension `.UPF`
The pseudopotential files are placed in a directory specified in the
control file with the tag pseudo\_dir. Thus if we have
```shell
pseudo_dir=./
```
then QE-GPU will look for the pseudopotential
files in the current directory.
If using the PRACE benchmark suite the data files can be
downloaded from the QE website or the PRACE repository. For example,
```shell
wget http://www.prace-ri.eu/UEABS/Quantum\_Espresso/QuantumEspresso_TestCaseA.tar.gz
```
Once uncompressed you can then run the program like this (e.g. using
MPI over 16 cores):
```shell
mpirun -n 16 pw-gpu.x -input pw.in
```
but check your system documentation since mpirun may be replaced by
`mpiexec, runjob, aprun, srun,` etc. Note also that normally you are not
allowed to run MPI programs interactively without using the
batch system.
### Parallelisation options
Quantum Espresso uses various levels of parallelisation, the most important being MPI parallelisation
over the *k points* available in the input system. This is achieved with the ```-npool``` program option.
Thus for the AUSURF input which has 2 k points we can run:
```bash
srun -n 64 pw.x -npool 2 -input pw.in
```
which would allocate 32 MPI tasks per k-point.
The number of MPI tasks must be a multiple of the number of k-points. For the TA2O5 input, which has 26 k-points, we could try:
```bash
srun -n 52 pw.x -npool 26 -input pw.in
```
but we may wish to use fewer pools but with more tasks per pool:
```bash
srun -n 52 pw.x -npool 13 -input pw.in
```
It is also possible to control the number of MPI tasks used in the diagonalization of the
subspace Hamiltonian. This is possible with the ```-ndiag``` parameter which must be a square number.
For example, with the AUSURF input (2 k-points) we can assign 4 processes to the Hamiltonian diagonalisation:
```bash
srun -n 64 pw.x -npool 2 -ndiag 4 -input pw.in
```
### Hints for running the GPU version
#### Memory limitations
The GPU port of Quantum Espresso runs almost entirely in the GPU memory. This means that jobs are restricted
by the memory of the GPU device, normally 16-32 GB, regardless of the main node memory. Thus, unless many nodes are used the user is likely to see job failures due to lack of memory, even for small datasets.
For example, on the CSCS Piz Daint supercomputer each node has only 1 NVIDIA Tesla P100 (16GB) which means that you will need at least 4 nodes to run even the smallest dataset (AUSURF in the UEABS).
## Execution
In the UEABS repository you will find a directory for each computer system tested, together with installation
instructions and job scripts.
In the following we describe in detail the execution procedure for the Marconi computer system.
### Execution on the Cineca Marconi KNL system
Quantum Espresso has already been installed for the KNL nodes of
Marconi and can be accessed via a specific module:
``` shell
module load profile/knl
module load autoload qe/6.0_knl
```
On Marconi the default is to use the MCDRAM as cache, and have the
cache mode set as quadrant. Other settings for the KNLs on Marconi
haven't been substantially tested for Quantum Espresso (e.g. flat
mode) but significant differences in performance for most inputs are
not expected.
An example SLURM batch script for the A2 partition is given below:
``` shell
#!/bin/bash
#SBATCH -N2
#SBATCH --tasks-per-node=64
#SBATCH -A <accountno>
#SBATCH -t 1:00:00
module purge
module load profile/knl
module load autoload qe/6.0_knl
export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=${OMP_NUM_THREADS}
srun pw.x -npool 2 -ndiag 16 -input file.in > file.out
```
In the above with the SLURM directives we have asked for 2 KNL nodes (each with 68 cores) in
cache/quadrant mode and 93 Gb main memory each. We are running QE in MPI-only
mode using 64 MPI processes/node with the k-points in 2 pools; the diagonalisation of the Hamiltonian
will be done by 16 (4x4) tasks.
Note that this script needs to be submitted using the KNL scheduler as follows:
``` shell
module load env-knl
sbatch myjob