Commit 42482007 authored by Andrew Emerson's avatar Andrew Emerson
Browse files

JUWELS and ARM files added

parent 6abed49d
# Quantum Espresso v6.3 on ARM v8
## Benchmark system
CARMEN (CINECA) 8-node ARM v8 cluster, 2x32 cores + 256G RAM/node.
For more details:
[Cineca documentation](https://wiki.u-gov.it/confluence/pages/viewpage.action?spaceKey=SCAIIN&title=ARM+Demo+@+CINECA)
## Installation
Installed with ARM v19 compilers (flang and clang) and the ARM performance
library. This provides threaded BLAS, LAPACK and FFTW libraries. You will need to modify the make.inc file to make use of these.
Remember also to include the following flags:
```bash
-mcpu=native -armpl
```
Currently the OpenMP version (built with the ```-fopenmp``` flag) does not compile (internal compiler error).
## Execution
- Because of fairly long execution times and the limited number of nodes, only AUSURF has been tested.
- A run-time error causes QE to crash just after the final iteration, probably due to the FoX XML library (an issue has been raised). Walltimes are therefore estimated from the time of the final iteration.
See job files for example execution.
## Profiling
Use the ARM MAP profiler. See example job scripts.
#
# Environment setup and configure step for building Quantum Espresso 6.3
# with the ARM HPC toolchain on the CARMEN cluster.
#
# system supplied modules: ARM 19.0 compilers (clang/flang)
module load autoload armcompiler/19.0--binary
# the following is the ARM performance library including BLAS, LAPACK + FFTW
module load armpl-for-armcompiler/19.0.0--binary
# own-compiled OpenMPI 4.0.0 (for ARM compilers), published in a private module tree
module use $HOME/modules
module load openmpi/openmpi-4.0.0_Arm
# QE configure, run with the MPI compiler wrappers
CC=mpicc FC=mpifort ./configure
# you need also to modify the make.inc since configure does not set it correctly
#!/bin/bash
#SBATCH --tasks-per-node=64
#SBATCH -N 1
#SBATCH -A cin_staff
#SBATCH -t 3:00:00

# AUSURF benchmark: Quantum Espresso 6.3 pw.x on one 64-core ARM node,
# 2 k-point pools, 16-task diagonalisation group.
module load autoload armpl-for-armcompiler/19.0.0--binary
module load armcompiler/19.0--binary
module load arm-forge
module use $HOME/modules
module load openmpi/openmpi-4.0.0_Arm

# Measure elapsed wall-clock time around the solver run.
t_begin=$(date +%s)
mpirun -np 64 $HOME/qe-6.3-fft/bin/pw.x -npool 2 -ndiag 16 -input ausurf.in
t_end=$(date +%s)
walltime=$((t_end - t_begin))
echo "walltime $walltime"
#!/bin/bash
#SBATCH --tasks-per-node=64
#SBATCH -N 1
#SBATCH -A cin_staff
#SBATCH -t 3:00:00

# AUSURF benchmark run under the ARM MAP profiler (arm-forge),
# same configuration as the plain run: 64 tasks, 2 pools, ndiag 16.
module load autoload armpl-for-armcompiler/19.0.0--binary
module load armcompiler/19.0--binary
module load arm-forge
module use $HOME/modules
module load openmpi/openmpi-4.0.0_Arm

# Measure elapsed wall-clock time around the profiled run.
t_begin=$(date +%s)
# Alternative invocation: profile only a 60s-600s window of the run:
#   map --verbose --start-after 60 --stop-after 600 --profile mpirun -np 64 $HOME/qe-6.3-fft/bin/pw.x -npool 2 -input ausurf.in
map --verbose --profile mpirun -np 64 $HOME/qe-6.3-fft/bin/pw.x -npool 2 -ndiag 16 -input ausurf.in
t_end=$(date +%s)
walltime=$((t_end - t_begin))
echo "walltime $walltime"
# make.inc. Generated from make.inc.in by configure.
# compilation rules
# NOTE(review): recipe lines below MUST begin with a hard tab character;
# leading tabs may have been lost in copy/paste — verify before use.
.SUFFIXES :
.SUFFIXES : .o .c .f .f90
# most fortran compilers can directly preprocess c-like directives: use
# $(MPIF90) $(F90FLAGS) -c $<
# if explicit preprocessing by the C preprocessor is needed, use:
# $(CPP) $(CPPFLAGS) $< -o $*.F90
# $(MPIF90) $(F90FLAGS) -c $*.F90 -o $*.o
# remember the tabulator in the first column !!!
# This build uses the explicit-preprocessing form: cpp writes *_tmp.f90,
# which the MPI Fortran wrapper then compiles to the .o file.
.f90.o:
$(CPP) $(CPPFLAGS) $< -o $(*)_tmp.f90 ; \
$(MPIF90) $(F90FLAGS) -c $(*)_tmp.f90 -o $(*).o
# .f.o and .c.o: do not modify
.f.o:
$(F77) $(FFLAGS) -c $<
.c.o:
$(CC) $(CFLAGS) -c $<
# Top QE directory, useful for locating libraries, linking QE with plugins
# The following syntax should always point to TOPDIR:
TOPDIR = $(dir $(abspath $(filter %make.inc,$(MAKEFILE_LIST))))
# if it doesn't work, uncomment the following line (edit if needed):
# TOPDIR = /home/userinternal/aemerson/qe-6.3
# DFLAGS = precompilation options (possible arguments to -D and -U)
# used by the C compiler and preprocessor
# To use libxc (v>=3.0.1), add -D__LIBXC to DFLAGS
# See include/defs.h.README for a list of options and their meaning
# With the exception of IBM xlf, FDFLAGS = $(DFLAGS)
# For IBM xlf, FDFLAGS is the same as DFLAGS with separating commas
# MANUAL_DFLAGS = additional precompilation option(s), if desired
# BEWARE: it does not work for IBM xlf! Manually edit FDFLAGS
MANUAL_DFLAGS =
# NOTE(review): -D__ARM_LIB presumably enables the ARM performance library
# code paths — confirm against include/defs.h.README for this QE version.
DFLAGS = -D__MPI -D__ARM_LIB
FDFLAGS = $(DFLAGS) $(MANUAL_DFLAGS)
# IFLAGS = how to locate directories with *.h or *.f90 file to be included
# typically -I$(TOPDIR)/include -I/some/other/directory/
# the latter contains .e.g. files needed by FFT libraries
# for libxc add -I/path/to/libxc/include/
IFLAGS = -I$(TOPDIR)/include -I$(TOPDIR)/FoX/finclude -I$(TOPDIR)/S3DE/iotk/include/
# MOD_FLAG = flag used by f90 compiler to locate modules
MOD_FLAG = -I
# BASEMOD_FLAGS points to directories containing basic modules,
# while BASEMODS points to the corresponding module libraries
# Each Makefile can add directories to MODFLAGS and libraries to QEMODS
BASEMOD_FLAGS= $(MOD_FLAG)$(TOPDIR)/iotk/src \
$(MOD_FLAG)$(TOPDIR)/Modules \
$(MOD_FLAG)$(TOPDIR)/FFTXlib \
$(MOD_FLAG)$(TOPDIR)/LAXlib \
$(MOD_FLAG)$(TOPDIR)/UtilXlib \
$(MOD_FLAG)$(TOPDIR)/FoX/finclude
# Compilers: fortran-90, fortran-77, C
# If a parallel compilation is desired, MPIF90 should be a fortran-90
# compiler that produces executables for parallel execution using MPI
# (such as for instance mpif90, mpf90, mpxlf90,...);
# otherwise, an ordinary fortran-90 compiler (f90, g95, xlf90, ifort,...)
# If you have a parallel machine but no suitable candidate for MPIF90,
# try to specify the directory containing "mpif.h" in IFLAGS
# and to specify the location of MPI libraries in MPI_LIBS
# All wrappers here come from the own-compiled OpenMPI 4.0.0 (ARM compilers).
MPIF90 = mpif90
F90 = mpifort
CC = mpicc
F77 = mpifort
# GPU architecture (Kepler: 35, Pascal: 60, Volta: 70 )
GPU_ARCH=
# CUDA runtime (Pascal: 8.0, Volta: 9.0)
CUDA_RUNTIME=
# CUDA F90 Flags
CUDA_F90FLAGS=
# C preprocessor and preprocessing flags - for explicit preprocessing,
# if needed (see the compilation rules above)
# preprocessing flags must include DFLAGS and IFLAGS
CPP = cpp
CPPFLAGS = -P -traditional $(DFLAGS) $(IFLAGS)
# compiler flags: C, F90, F77
# C flags must include DFLAGS and IFLAGS
# F90 flags must include MODFLAGS, IFLAGS, and FDFLAGS with appropriate syntax
# -mcpu=native -armpl selects the local CPU and the ARM performance library.
# NOTE(review): the accompanying README states the OpenMP build fails with an
# internal compiler error on the ARM 19 compilers, yet -fopenmp is set here —
# confirm this configuration actually compiles.
CFLAGS = -O3 -mcpu=native -armpl -fopenmp $(DFLAGS) $(IFLAGS)
F90FLAGS = $(FFLAGS) $(FDFLAGS) $(CUDA_F90FLAGS) $(IFLAGS) $(MODFLAGS)
FFLAGS = -O2 -g -mcpu=native -armpl -fopenmp
# compiler flags without optimization for fortran-77
# the latter is NEEDED to properly compile dlamch.f, used by lapack
FFLAGS_NOOPT = -O0
# compiler flag needed by some compilers when the main program is not fortran
# Currently used for Yambo
FFLAGS_NOMAIN =
# Linker, linker-specific flags (if any)
# Typically LD coincides with F90 or MPIF90, LD_LIBS is empty
# for libxc, set LD_LIBS=-L/path/to/libxc/lib/ -lxcf90 -lxc
LD = mpif90
LDFLAGS = -mcpu=native -armpl -fopenmp
LD_LIBS =
# External Libraries (if any) : blas, lapack, fft, MPI
# BLAS, LAPACK and FFT all resolve to the same static ARM performance
# library archive (libarmpl_lp64.a), which provides all three interfaces.
# If you have nothing better, use the local copy via "--with-netlib" :
# BLAS_LIBS = /your/path/to/espresso/LAPACK/blas.a
# BLAS_LIBS_SWITCH = internal
BLAS_LIBS = /cineca/prod/opt/tools/arm-compiler-for-hpc/19.0/none/opt/arm/armpl-19.0.0_ThunderX2CN99_RHEL-7_arm-hpc-compiler_19.0_aarch64-linux/lib/libarmpl_lp64.a
BLAS_LIBS_SWITCH = external
# If you have nothing better, use the local copy via "--with-netlib" :
# LAPACK_LIBS = /your/path/to/espresso/LAPACK/lapack.a
# LAPACK_LIBS_SWITCH = internal
# For IBM machines with essl (-D__ESSL): load essl BEFORE lapack !
# remember that LAPACK_LIBS precedes BLAS_LIBS in loading order
LAPACK_LIBS = /cineca/prod/opt/tools/arm-compiler-for-hpc/19.0/none/opt/arm/armpl-19.0.0_ThunderX2CN99_RHEL-7_arm-hpc-compiler_19.0_aarch64-linux/lib/libarmpl_lp64.a
LAPACK_LIBS_SWITCH = external
# No ScaLAPACK: parallel diagonalisation relies on the -ndiag runtime option.
SCALAPACK_LIBS =
# nothing needed here if the the internal copy of FFTW is compiled
# (needs -D__FFTW in DFLAGS)
FFT_LIBS = /cineca/prod/opt/tools/arm-compiler-for-hpc/19.0/none/opt/arm/armpl-19.0.0_ThunderX2CN99_RHEL-7_arm-hpc-compiler_19.0_aarch64-linux/lib/libarmpl_lp64.a
# HDF5
HDF5_LIB =
FOX_LIB = -L$(TOPDIR)/FoX/lib -lFoX_dom -lFoX_sax -lFoX_wxml -lFoX_common\
-lFoX_utils -lFoX_fsys
FOX_FLAGS =
# For parallel execution, the correct path to MPI libraries must
# be specified in MPI_LIBS (except for IBM if you use mpxlf)
MPI_LIBS =
# IBM-specific: MASS libraries, if available and if -D__MASS is defined in FDFLAGS
MASS_LIBS =
# CUDA libraries
CUDA_LIBS=
CUDA_EXTLIBS =
# ar command and flags - for most architectures: AR = ar, ARFLAGS = ruv
AR = ar
ARFLAGS = ruv
# ranlib command. If ranlib is not needed (it isn't in most cases) use
# RANLIB = echo
RANLIB = ranlib
# all internal and external libraries - do not modify
FLIB_TARGETS = all
LIBOBJS = $(TOPDIR)/clib/clib.a $(TOPDIR)/iotk/src/libiotk.a
LIBS = $(CUDA_LIBS) $(SCALAPACK_LIBS) $(LAPACK_LIBS) $(FOX_LIB) $(FFT_LIBS) $(BLAS_LIBS) $(MPI_LIBS) $(MASS_LIBS) $(HDF5_LIB) $(LD_LIBS)
# wget or curl - useful to download from network
WGET = wget -O
# Install directory - not currently used
PREFIX = /usr/local
......@@ -7,10 +7,12 @@
# JUWELS job fragment for the Ta2O5 (2x2xz, 552) benchmark with QE 6.3.
# (Script header lines precede this fragment and are not shown here.)
#SBATCH --error=ta205-err.%j
#SBATCH --mem=90GB
#SBATCH --time=01:30:00
#SBATCH --partition=batch
#SBATCH --account=prpb74
# NOTE(review): the next six lines look like flattened diff residue — both the
# old (intel-para) and new (Intel/IntelMPI/imkl) module environments appear.
# As written, intel-para/2018b-mt is loaded and then Intel IntelMPI imkl on
# top of it; confirm which environment is intended and drop the other pair.
#module load Intel IntelMPI imkl
module load intel-para/2018b-mt
QE_HOME=$HOME/q-e-qe-6.3
module load Intel IntelMPI imkl
#module load intel-para/2018b-mt
# NOTE(review): QE_HOME is assigned twice; this second assignment (the project
# tree) is the one that takes effect for the srun below.
QE_HOME=$PROJECT_cprpb74/prpb7400/q-e-qe-6.3
# Hybrid run: 4 OpenMP threads per MPI task.
export OMP_NUM_THREADS=4
# 650 MPI tasks split into 26 k-point pools (25 tasks per pool).
srun -n 650 $QE_HOME/bin/pw.x -npool 26 -input Ta2O5-2x2xz-552.in
#!/bin/bash
#
# Example job script for AUSURF (2 k points) on JUWELS with QE 6.3.
#
# Fix: the copied script contained flattened diff residue — two conflicting
# sets of #SBATCH directives (48-task vs 16-task versions) and TWO srun
# invocations of pw.x, so the benchmark would have run twice. Only the
# updated 16-task configuration is kept.
#SBATCH --nodes=1
#SBATCH --ntasks=16
#SBATCH --ntasks-per-node=16
#SBATCH --output=ausurf-out.%j
#SBATCH --error=ausurf-err.%j
#SBATCH --time=00:30:00
#SBATCH --partition=batch
#SBATCH --account=prpb74
# Intel toolchain: compilers, IntelMPI and MKL.
module load Intel IntelMPI imkl
# QE installation in the project tree (not $HOME).
QE_HOME=$PROJECT_cprpb74/prpb7400/q-e-qe-6.3
# 16 MPI tasks, 2 k-point pools, 4-task diagonalisation group.
srun -n 16 $QE_HOME/bin/pw.x -npool 2 -ndiag 4 -input ausurf.in
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment