Commit 3d7ee9b3 authored by Alan Gray

added code for accelerator benchmark 1

parent d0960e4d
##############################################################################
#
# Makefile.mk
#
# Please copy one configuration file from the ./config
# directory to this directory (top level Ludwig directory)
# and make any appropriate changes for your platform.
#
# No changes should be required in this file itself.
#
#
# Edinburgh Soft Matter and Statistical Physics Group and
# Edinburgh Parallel Computing Centre
#
# (c) 2015 The University of Edinburgh
# Contributing authors:
# Kevin Stratford (kevin@epcc.ed.ac.uk)
#
##############################################################################
ROOT_DIR := $(dir $(lastword $(MAKEFILE_LIST)))
ifneq ("","$(wildcard $(ROOT_DIR)/config.mk)") #then file exists
include $(ROOT_DIR)/config.mk
else
$(error $(ROOT_DIR)config.mk is missing: please copy one of the configuration files in the config directory to $(ROOT_DIR)config.mk)
endif
PRACE QCD Accelerator Benchmark 1
=================================
This benchmark is part of the QCD section of the Accelerator
Benchmarks Suite developed as part of a PRACE EU-funded project
(http://www.prace-ri.eu).
The suite is derived from the Unified European Applications
Benchmark Suite (UEABS), http://www.prace-ri.eu/ueabs/.
This specific component is a direct port of "QCD kernel E" from the
UEABS, which is based on the MILC code suite
(http://www.physics.utah.edu/~detar/milc/). The performance-portable
targetDP model has been used to allow the benchmark to utilise NVIDIA
GPUs, Intel Xeon Phi manycore CPUs and traditional multi-core
CPUs. The use of MPI (in conjunction with targetDP) allows multiple
nodes to be used in parallel.
For full details of this benchmark, and for results on NVIDIA GPU and
Intel Knights Corner Xeon Phi architectures (in addition to regular
CPUs), please see:
**********************************************************************
Gray, Alan, and Kevin Stratford. "A lightweight approach to
performance portability with targetDP." The International Journal of
High Performance Computing Applications (2016): 1094342016682071. Also
available at https://arxiv.org/abs/1609.01479
**********************************************************************
To Build
--------
Choose a configuration file from the "config" directory that best
matches your platform, and copy it to "config.mk" in this (the
top-level) directory. Then edit this file, if necessary, to set the
compilers and paths correctly for your system.
Note that if you are building for a GPU system, and the TARGETCC
variable in the configuration file is set to the NVIDIA compiler nvcc,
then the build process will automatically build the GPU
version. Otherwise, the threaded CPU version will be built, which can
run on Xeon Phi manycore CPUs or regular multi-core CPUs.
Then, build the targetDP performance-portable library:
cd targetDP
make clean
make
cd ..
And finally, build the benchmark code:
cd src
make clean
make
cd ..
To Validate
-----------
After building, an executable "bench" will exist in the src directory.
To run the default validation (64x64x64x8, 1 iteration) case:
cd src
./bench
The code will automatically self-validate by comparing against the
appropriate output reference file for this case, which exists in
output_ref, and will print to stdout, e.g.
Validating against output_ref/kernel_E.output.nx64ny64nz64nt8.i1.t1:
VALIDATION PASSED
The benchmark time is also printed to stdout, e.g.
******BENCHMARK TIME 1.6767786769196391e-01 seconds******
(This example time was obtained on an NVIDIA K40 GPU.)
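For a quick scripted check, the run and the filtering of the two
messages above can be combined, e.g. (illustrative only):
cd src
./bench | grep -E "VALIDATION|BENCHMARK TIME"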
To Run Different Cases
-----------------------
You can edit the input file
src/kernel_E.input
if you want to change the default system size or number of
iterations, and/or run with more than 1 MPI task. E.g. replacing
totnodes 1 1 1 1
with
totnodes 2 1 1 1
will run with 2 MPI tasks rather than 1, with the domain decomposed in
the "X" direction.
To Run using a Script
---------------------
The "run" directory contains an example script which
- sets up a temporary scratch directory
- copies in the input file, plus also some reference output files
- sets the number of OpenMP threads (for a multi/many core CPU run)
- runs the code (which will automatically validate if an
appropriate output reference file exists)
So, in the run directory, you should copy "run_example.sh" to
run.sh, which you can customise for your system.
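As a purely illustrative sketch (batch directives and launch commands
vary from site to site), a SLURM version of run.sh might begin
#!/bin/bash
#SBATCH --job-name=qcd_bench
#SBATCH --nodes=1
#SBATCH --time=00:20:00
with the remainder taken from run_example.sh, adjusting
OMP_NUM_THREADS and the launch command (e.g. srun) for your system.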
Known Issues
------------
The quantity used for validation (see congrad.C) becomes very small
after a few iterations. Therefore, only a small number of iterations
should be used for validation. This is not an issue specific to this
port of the benchmark, but is also true of the original version (see
above), with which this version is designed to be consistent.
Performance Results for Reference
---------------------------------
Here are some performance timings obtained using this benchmark.
From the paper cited above:
64x64x64x32x8, 1000 iterations, single chip
Chip                            Time (s)
Intel Ivy-Bridge 12-core CPU      361.55
Intel Haswell 8-core CPU          376.08
AMD Opteron 16-core CPU           618.19
Intel KNC Xeon Phi                139.94
NVIDIA K20X GPU                    96.84
NVIDIA K40 GPU                     90.90
Multi-node scaling (all times in seconds):
- Titan GPU:  one K20X per node
- Titan CPU:  one 16-core Interlagos per node
- ARCHER CPU: two 12-core Ivy-Bridge CPUs per node
Small Case: 64x64x32x8, 1000 iterations
Nodes   Titan GPU   Titan CPU   ARCHER CPU
    1    9.64E+01    6.01E+02     1.86E+02
    2    5.53E+01    3.14E+02     9.57E+01
    4    3.30E+01    1.65E+02     5.22E+01
    8    2.18E+01    8.33E+01     2.60E+01
   16    1.35E+01    4.02E+01     1.27E+01
   32    8.80E+00    2.06E+01     6.49E+00
   64    6.54E+00    9.90E+00     2.36E+00
  128    5.13E+00    4.31E+00     1.86E+00
  256    4.25E+00    2.95E+00     1.96E+00
Large Case: 64x64x64x192, 1000 iterations
Nodes   Titan GPU   Titan CPU   ARCHER CPU
   64    1.36E+02    5.19E+02     1.61E+02
  128    8.23E+01    2.75E+02     8.51E+01
  256    6.70E+01    1.61E+02     4.38E+01
  512    3.79E+01    8.80E+01     2.18E+01
 1024    2.41E+01    5.72E+01     1.46E+01
 2048    1.81E+01    3.88E+01     7.35E+00
 4096    1.56E+01    2.28E+01     6.53E+00
Preliminary results on new Pascal GPU and Intel KNL architectures:
Single chip, 64x64x64x8, 1000 iterations
Chip                        Time (s)
12-core Intel Ivy-Bridge    7.24E+02
Intel KNL Xeon Phi          9.72E+01
NVIDIA P100 GPU             5.60E+01
#CPU (CRAY XC30) configuration file
CFLAGS = $(DEFINES) -O2 -DARCH=0
LDFLAGS = -lm -openmp
CC=cc
TARGETCC=cc
TARGETCFLAGS=-x c -openmp $(CFLAGS) -DVVL=4 -DAoSoA
#NVIDIA GPU configuration file
MPIDIR=/opt/ibmhpc/pecurrent/mpich/gnu/
GPUS_PER_NODE=4
NVARCH=sm_60
CFLAGS = $(DEFINES) -O2 -DARCH=0 -I $(MPIDIR)/include64
LDFLAGS = -lm -arch=$(NVARCH) -L./targetDP -ltarget -L$(MPIDIR)/lib64 -lmpi -lmpl -lm -lgomp
CC=mpcc -compiler gnu
TARGETCC=nvcc
TARGETCFLAGS=-x cu -arch=$(NVARCH) -I. -DCUDA -DVVL=1 -DSoA -DGPUSPN=$(GPUS_PER_NODE) -dc -c $(CFLAGS)
#NVIDIA GPU configuration file
MPIDIR=/usr/local/packages/mpich2-1.5
OMPDIR=/usr/lib/gcc/x86_64-redhat-linux/4.4.4 #location of libgomp.a
GPUS_PER_NODE=1
NVARCH=sm_35
CFLAGS = $(DEFINES) -O2 -DARCH=0 -I $(MPIDIR)/include
LDFLAGS = -lm -arch=$(NVARCH) -L./targetDP -ltarget -L$(MPIDIR)/lib -lmpich -lmpl -lm -L$(OMPDIR) -lgomp
CC=mpicc
TARGETCC=nvcc
TARGETCFLAGS=-x cu -arch=$(NVARCH) -I. -DCUDA -DVVL=1 -DSoA -DGPUSPN=$(GPUS_PER_NODE) -dc -c $(CFLAGS)
#Intel Xeon Phi configuration file
CFLAGS = $(DEFINES) -O2 -DARCH=0 -mmic
LDFLAGS = -lm -mmic -openmp
CC=mpiicc
TARGETCC=mpiicc
TARGETCFLAGS=-x c -openmp $(CFLAGS) -DVVL=8 -DAoSoA
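#CPU configuration file (AoS data layout)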
CFLAGS = $(DEFINES) -O2 -DARCH=0 -DVVL=8 -DAoS
LDFLAGS = -lm -openmp
CC=cc
TARGETCC=cc
TARGETCFLAGS=-x c -openmp $(CFLAGS)
#!/bin/bash
# if you are running on a system with queuing facilities,
# add appropriate headers here
#select location of benchmark source
SRCDIR=$PWD/../src
#select a temporary scratch directory.
SCRATCHDIR=./scratch
#set up the scratch dir
rm -rf $SCRATCHDIR
mkdir -p $SCRATCHDIR
cd $SCRATCHDIR
#copy input file
cp $SRCDIR/kernel_E.input .
#copy output reference files
cp -r $SRCDIR/output_ref .
#set the number of OpenMP threads (if using threaded version)
export OMP_NUM_THREADS=4
#run the code
$SRCDIR/bench
SHELL = bash
ifeq ($(MYHOSTNAME),gpulab2)
CC = mpicc
endif
ifeq ($(MYHOSTNAME),archer)
CC = cc
endif
CFLAGS = -I/Users/alang/epcc/prace_accel/qcd/my_version/kernel_E/include
AR = ar -cru
ARFLAGS =
LD = mpicc
#LD = cc
LDFLAGS =
RM = rm
SHELL = #SHELL#
CC = #MPI_CC#
CFLAGS = #CFLAGS#
AR = #AR#
ARFLAGS = #ARFLAGS#
LDFLAGS = #LDFLAGS#
RM = #RM#
#include "includes.h"
#include <mpi.h>
//#define COM_BIT 0x40000000 //now defined in macros.h
int global_com_bit=COM_BIT;
/* Global variables */
int totnodes[4]; /* number of nodes in machine directions */
int Mynode[4], node_parity;
int **neighbor;
int offnode_even[8]; /* # of even sites that have off-node neighbors in a dir */
int offnode_odd[8]; /* # of odd sites that have off-node neighbors in a dir */
MPI_Comm comm_grid, comm_subgrid[4]; /* grid communicators */
/* print on 0 node only */
void node0_printf( const char *fmt, ... )
{
va_list argp;
if( this_node == 0 )
{
va_start( argp, fmt );
vprintf( fmt, argp );
va_end( argp );
}
fflush( 0 );
MPI_Barrier( MPI_COMM_WORLD );
}
void node0_fprintf( FILE * file, const char *fmt, ... )
{
va_list argp;
if( this_node == 0 )
{
va_start( argp, fmt );
vfprintf( file, fmt, argp );
va_end( argp );
}
fflush( 0 );
MPI_Barrier( MPI_COMM_WORLD );
}
void verbose_fprintf( FILE * file, const char *fmt, ... )
{
va_list argp;
if( verbose && this_node == 0 )
{
va_start( argp, fmt );
vfprintf( file, fmt, argp );
va_end( argp );
}
fflush( 0 );
MPI_Barrier( MPI_COMM_WORLD );
}
/* JuBE no args needed */
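/* Read the node grid (totnodes) from kernel_E.input, create the
   periodic 4-d Cartesian communicator and one sub-communicator
   per direction. */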
void initialize_machine_KE()
{
int free_coords[4], wrap_around[4];
MPI_Comm_size(MPI_COMM_WORLD, &number_of_nodes);
MPI_Comm_rank(MPI_COMM_WORLD, &this_node);
/* get the totnodes[dir] from parameters file */
FILE *fpar;
/* JuBE set para file to kernel_E.input*/
if( ( fpar = fopen( "kernel_E.input", "r" ) ) == 0 && this_node == 0 )
{
printf( "ERROR initialize_machine: missing parameter file\n" );
fflush(0);
exit( 1 );
}
if (get_totnodes( fpar, "totnodes" ) && this_node==0)
{
printf( "ERROR initialize_machine: missing totnodes\n" );
fflush(0);
exit(1);
}
if (totnodes[XUP]*totnodes[YUP]*totnodes[ZUP]*totnodes[TUP]!=number_of_nodes &&
this_node==0)
{
printf( "ERROR initialize_machine: bad total number of nodes\n" );
fflush(0);
exit(1);
}
fclose(fpar);
/* Cartesian grid */
wrap_around[0] = 1;
wrap_around[1] = 1;
wrap_around[2] = 1;
wrap_around[3] = 1;
MPI_Cart_create(MPI_COMM_WORLD, 4, totnodes, wrap_around, 1, &comm_grid);
/* new coordinates */
MPI_Comm_rank(comm_grid, &this_node);
MPI_Cart_coords(comm_grid, this_node, 4,Mynode);
node_parity = ( Mynode[XUP] + Mynode[YUP] + Mynode[ZUP] + Mynode[TUP] ) % 2;
node0_printf( "initialize_machine: topology: %dx%dx%dx%d, mynode: %d,%d,%d,%d, numnodes: %d\n",
totnodes[XUP], totnodes[YUP], totnodes[ZUP], totnodes[TUP],
Mynode[XUP], Mynode[YUP], Mynode[ZUP], Mynode[TUP], numnodes_KE( ) );
/* set up communicators */
free_coords[XUP] = 1;
free_coords[YUP] = 0;
free_coords[ZUP] = 0;
free_coords[TUP] = 0;
MPI_Cart_sub(comm_grid, free_coords,&comm_subgrid[XUP]);
free_coords[XUP] = 0;
free_coords[YUP] = 1;
free_coords[ZUP] = 0;
free_coords[TUP] = 0;
MPI_Cart_sub(comm_grid, free_coords,&comm_subgrid[YUP]);
free_coords[XUP] = 0;
free_coords[YUP] = 0;
free_coords[ZUP] = 1;
free_coords[TUP] = 0;
MPI_Cart_sub(comm_grid, free_coords,&comm_subgrid[ZUP]);
free_coords[XUP] = 0;
free_coords[YUP] = 0;
free_coords[ZUP] = 0;
free_coords[TUP] = 1;
MPI_Cart_sub(comm_grid, free_coords,&comm_subgrid[TUP]);
}
static char name[] = "Generic communication";
char *machine_type_KE( )
{
return ( name );
}
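/* Lexicographic rank of this node in the 4-d node grid
   (TUP varies fastest, XUP slowest). */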
int mynode_KE( )
{
return Mynode[TUP]+totnodes[TUP]*Mynode[ZUP]+
totnodes[TUP]*totnodes[ZUP]*Mynode[YUP]+
totnodes[TUP]*totnodes[ZUP]*totnodes[YUP]*Mynode[XUP];
}
void mynode4( int *n_x, int *n_y, int *n_z, int *n_t )
{
*n_x = Mynode[XUP];
*n_y = Mynode[YUP];
*n_z = Mynode[ZUP];
*n_t = Mynode[TUP];
}
int numnodes_KE( )
{
return ( totnodes[XUP] * totnodes[YUP] * totnodes[ZUP] * totnodes[TUP] );
}
void numnodes4( int *n_x, int *n_y, int *n_z, int *n_t )
{
*n_x = totnodes[XUP];
*n_y = totnodes[YUP];
*n_z = totnodes[ZUP];
*n_t = totnodes[TUP];
}
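/* Exchange size bytes with the neighbouring nodes along direction dir
   (0-3 positive, 4-7 negative, mapped via OPP_DIR) using MPI_Sendrecv
   on the corresponding Cartesian sub-communicator. */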
void gen_send_recv( int dir, char *sbuf, char *rbuf, int size )
{
MPI_Status status;
int source;
int dest;
if (dir<0 || dir>7)
{
node0_fprintf(file_o1, "ERROR gen_send_recv: Bad direction %d\n",dir);
exit(1);
}
if (dir<4)
{
/* positive direction */
source=(Mynode[dir]+1)%totnodes[dir];
dest=(Mynode[dir]-1+totnodes[dir])%totnodes[dir];
}
else
{
/* negative direction */
dir=OPP_DIR(dir);
dest=(Mynode[dir]+1)%totnodes[dir];
source=(Mynode[dir]-1+totnodes[dir])%totnodes[dir];
}
MPI_Sendrecv(sbuf,size,MPI_BYTE,dest,0,
rbuf,size,MPI_BYTE,source,0,
comm_subgrid[dir],&status);
}
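/* Build the neighbor[8][sites_on_node] nearest-neighbour index tables;
   sites whose neighbour in a given direction lives off-node have COM_BIT
   set in the index and are counted in offnode_even/offnode_odd. */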
void make_nn_gathers( )
{
int x, y, z, t, xp, yp, zp, tp, xm, ym, zm, tm;
int i, ixp, ixm, iyp, iym, izp, izm, itp, itm, p;
MEMALIGN(neighbor, int *, 8 );
for ( i = 0; i < 8; i++ )
MEMALIGN(neighbor[i],int, sites_on_node);
/* neighbor = malloc( 8 * sizeof( int * ) ); */
/* for ( i = 0; i < 8; i++ ) */
/* { */
/* neighbor[i] = malloc( sites_on_node * sizeof( int ) ); */
/* } */
for ( i = 0; i < 8; i++ )
{
offnode_even[i] = offnode_odd[i] = 0;
}
for ( x = 0; x < nx; x++ )
for ( y = 0; y < ny; y++ )
for ( z = 0; z < nz; z++ )
for ( t = 0; t < nt; t++ )
if( node_number_KE( x, y, z, t ) == mynode_KE( ) )
{
i = node_index_KE( x, y, z, t );
p = lattice[i].parity;
xp = ( x + 1 ) % nx;
xm = ( x - 1 + nx ) % nx;
yp = ( y + 1 ) % ny;
ym = ( y - 1 + ny ) % ny;
zp = ( z + 1 ) % nz;
zm = ( z - 1 + nz ) % nz;
tp = ( t + 1 ) % nt;
tm = ( t - 1 + nt ) % nt;
ixp = node_index_KE( xp, y, z, t );
ixm = node_index_KE( xm, y, z, t );
iyp = node_index_KE( x, yp, z, t );
iym = node_index_KE( x, ym, z, t );
izp = node_index_KE( x, y, zp, t );
izm = node_index_KE( x, y, zm, t );
itp = node_index_KE( x, y, z, tp );
itm = node_index_KE( x, y, z, tm );
if( node_number_KE( xp, y, z, t ) == mynode_KE( ) )
{
neighbor[XUP][i] = ixp;
}
else
{
neighbor[XUP][i] = ixp + COM_BIT;
if( p == EVEN )
{
offnode_even[XUP]++;
}
else
offnode_odd[XUP]++;
}
if( node_number_KE( xm, y, z, t ) == mynode_KE( ) )
{
neighbor[XDOWN][i] = ixm;
}
else
{