Commit a63e1065 authored by Jussi Enkovaara's avatar Jussi Enkovaara

Merge branch 'master' into 'master'

Chapel training material

See merge request !1
parents 55ade84f b633f9b2
# Makefile that builds each src/*.chpl file into a binary in bin/*
CC=chpl
CFLAGS=-g
LDFLAGS=
SRC=$(wildcard src/*.chpl)
PROGRAM=$(addprefix bin/, $(subst .chpl,, $(subst src/,,$(SRC))))
all: mkdir $(PROGRAM)
bin/% : src/%.chpl
$(CC) $(CFLAGS) -o $@ $<
.PHONY: clean mkdir
mkdir:
mkdir -p bin
clean:
rm -R bin
Heat Equation
=============
In this example, we solve the heat equation. The idea is to apply a 5-point stencil on a domain iteratively until equilibrium.
Sequential
----------
[sequential.chpl](src/sequential.chpl) is a sequential implementation of the heat equation written in Chapel. The stencil computation is the most time consuming part of the code and look like:
```
for (i,j) in Interior do//Iterate over all non-border cells
{
//Assign each cell in 'T' the mean of its neighboring cells in 'A'
T[i,j] = (A[i,j] + A[i-1,j] + A[i+1,j] + A[i,j-1] + A[i,j+1]) / 5;
}
```
Basically, each *interior* element in `T` gets the mean of the corresponding element in `A` as well as the neighboring elements. Since `for` is a sequential language construct in Chapel, a single CPU-core will execute this code.
Now, let's run it:
```
./bin/heat_equation -nl 1 --size=5000*10
Heat Equation (sequential) - n: 5000, iterations: 10, elapsed-time: 381.5 seconds
```
Multi-core
----------
In order to improve the performance, we can tell Chapel to use threads to execute the stencil operations in parallel ([single_machine.chpl](src/single_machine.chpl)). We do that by replacing `for` with `forall`, which tells Chapel to execute each iteration in `Interior` parallel.
It is our responsibility to make sure that each iteration in the `forall` loop is independent in order not to introduce race conditions.
Clearly in this case iteration is clearly independent since we do not read `T`:
```
forall (i,j) in Interior do//Iterate over all non-border cells
{
//Assign each cell in 'T' the mean of its neighboring cells in 'A'
T[i,j] = (A[i,j] + A[i-1,j] + A[i+1,j] + A[i,j-1] + A[i,j+1]) / 5;
}
```
Now, let's run it (note that `CHPL_RT_NUM_THREADS_PER_LOCALE` tells Chapel the number of threads to use)::
```
export CHPL_RT_NUM_THREADS_PER_LOCALE=16
./bin/heat_equation -nl 1 --size=5000*10
Heat Equation (single machine) - n: 5000, iterations: 10, elapsed-time: 25.7052 seconds
```
Multiple Machines
-----------------
In order to improve the performance even further, we can tell Chapel to execute the stencil operation in parallel on multiple machines ([multiple_machines.chpl](src/multiple_machines.chpl)).
We still use the `forall` loop construct, be we have to tell Chapel how to distributes `A` and `T` between the multiple machines. For that, we use the `dmapped` language construct when defining the `Grid` and `Interior` domain:
```
//A n+2 by n+2 domain.
const Grid = {0..n+1, 0..n+1} dmapped Block({1..n, 1..n});
//A n by n domain that represents the interior of 'Grid'
const Interior = {1..n, 1..n} dmapped Block({1..n, 1..n});
var A, T : [Grid] real;//Zero initialized as default
```
We tell Chapel to use the same *block* distribution of the `Grid` and `Interior` domain such that each index in `Grid` has the same location as the corresponding index in `Interior`. Because they use the same distribution, no communication is needed when accessing the same index. For example, the operations `A[2,4] + T[2,4]` can be done locally on the machine that *owns* index `[2,4]`. However, it also means that a operations such as `A[2,4] + T[3,4]` will generally require communication.
Now, let's run it (note that `-nl 8` tells Chapel to use eight locations):
```
export CHPL_RT_NUM_THREADS_PER_LOCALE=16
./bin/heat_equation -nl 8 --size=5000*10
Heat Equation (multiple machines) - n: 5000, iterations: 10, elapsed-time: 5.13 seconds
```
It is very importation that all arrays in the calculation uses similar `dmapped` layouts. For example, if we do not use `dmapped` when defines `Interior` we get horrible performance:
```
export CHPL_RT_NUM_THREADS_PER_LOCALE=16
./bin/heat_equation -nl 8 --size=5000*10
Heat Equation (multiple machines) - n: 5000, iterations: 10, elapsed-time: 1823.23 seconds
```
//The format of 'size' is two integers separated with a '*'.
//The first integer is the domain size squired and the second integer is
//the number of iterations.
config const size = "100*10";//Default, 100 by 100 domain and 10 iterations
//Stop condition in amount of change (ignored when 'iterations' are non-zero).
config const epsilon = 1.0e-10;
//Parse the --size argument into 'n' and 'iterations'
use Regexp;
const arg = size.matches(compile("(\\d+)*(\\d+)"));
const arg_n = arg[1][1];
const arg_i = arg[2][1];
const n = size[arg_n.offset+1..arg_n.offset+arg_n.length] : int;
const iterations = size[arg_i.offset+1..arg_i.offset+arg_i.length]: int;
//Initiate a Timer object
use Time;
var timer : Timer;
//Now, let's implement the heat equation!
//We will use the Block distribution
use BlockDist;
//A n+2 by n+2 domain.
const Grid = {0..n+1, 0..n+1} dmapped Block({1..n, 1..n});
//A n by n domain that represents the interior of 'Grid'
const Interior = {1..n, 1..n} dmapped Block({1..n, 1..n});
var A, T : [Grid] real;//Zero initialized as default
A[..,0] = -273.15; //Left column
A[..,n+1] = -273.15; //Right column
A[n+1,..] = -273.15; //Bottom row
A[0,..] = 40.0; //Top row
timer.start();
var iter_count = 0;
do{
//Since all iterations are independent, we can use 'forall', which allows
//the Chapel runtime system to calculate the iterations in parallel
forall (i,j) in Interior do//Iterate over all non-border cells
{
//Assign each cell in 'T' the mean of its neighboring cells in 'A'
T[i,j] = (A[i,j] + A[i-1,j] + A[i+1,j] + A[i,j-1] + A[i,j+1]) / 5;
}
//Delta is the total amount of change done in this iteration
const delta = + reduce abs(A[Interior] - T[Interior]);
//Copy back the non-border cells
A[Interior] = T[Interior];
//if 'iterations' is non-zero we stop after a fixed number of iterations
//otherwise we stop when the calculation has converged, i.e. 'delta' is smaller than 'epsilon'.
var stop = false;
if(iterations > 0)
{
if iter_count >= iterations then
stop = true;
}
else
{
if delta < epsilon then
stop = true;
}
} while (!stop);
timer.stop();
writeln("Heat Equation (multiple machines) - n: ",n,
", iterations: ", iterations,
", elapsed: ", timer.elapsed(), " seconds");
//The format of 'size' is two integers separated with a '*'.
//The first integer is the domain size squired and the second integer is
//the number of iterations.
config const size = "100*10";//Default, 100 by 100 domain and 10 iterations
//Stop condition in amount of change (ignored when 'iterations' are non-zero).
config const epsilon = 1.0e-10;
//Parse the --size argument into 'n' and 'iterations'
use Regexp;
const arg = size.matches(compile("(\\d+)*(\\d+)"));
const arg_n = arg[1][1];
const arg_i = arg[2][1];
const n = size[arg_n.offset+1..arg_n.offset+arg_n.length] : int;
const iterations = size[arg_i.offset+1..arg_i.offset+arg_i.length]: int;
//Initiate a Timer object
use Time;
var timer : Timer;
//Now, let's implement the heat equation!
//A n+2 by n+2 domain.
const Grid = {0..n+1, 0..n+1};
//A n by n domain that represents the interior of 'Grid'
const Interior = {1..n, 1..n};
var A, T : [Grid] real;//Zero initialized as default
A[..,0] = -273.15; //Left column
A[..,n+1] = -273.15; //Right column
A[n+1,..] = -273.15; //Bottom row
A[0,..] = 40.0; //Top row
timer.start();
var iter_count = 0;
do{
//Since all iterations are independent, we can use 'forall', which allows
//the Chapel runtime system to calculate the iterations in parallel
for (i,j) in Interior do//Iterate over all non-border cells
{
//Assign each cell in 'T' the mean of its neighboring cells in 'A'
T[i,j] = (A[i,j] + A[i-1,j] + A[i+1,j] + A[i,j-1] + A[i,j+1]) / 5;
}
//Delta is the total amount of change done in this iteration
const delta = + reduce abs(A[Interior] - T[Interior]);
//Copy back the non-border cells
A[Interior] = T[Interior];
//if 'iterations' is non-zero we stop after a fixed number of iterations
//otherwise we stop when the calculation has converged, i.e. 'delta' is smaller than 'epsilon'.
var stop = false;
if(iterations > 0)
{
if iter_count >= iterations then
stop = true;
}
else
{
if delta < epsilon then
stop = true;
}
} while (!stop);
timer.stop();
writeln("Heat Equation (sequential) - n: ",n,
", iterations: ", iterations,
", elapsed: ", timer.elapsed(), " seconds");
//The format of 'size' is two integers separated with a '*'.
//The first integer is the domain size squired and the second integer is
//the number of iterations.
config const size = "100*10";//Default, 100 by 100 domain and 10 iterations
//Stop condition in amount of change (ignored when 'iterations' are non-zero).
config const epsilon = 1.0e-10;
//Parse the --size argument into 'n' and 'iterations'
use Regexp;
const arg = size.matches(compile("(\\d+)*(\\d+)"));
const arg_n = arg[1][1];
const arg_i = arg[2][1];
const n = size[arg_n.offset+1..arg_n.offset+arg_n.length] : int;
const iterations = size[arg_i.offset+1..arg_i.offset+arg_i.length]: int;
//Initiate a Timer object
use Time;
var timer : Timer;
//Now, let's implement the heat equation!
//A n+2 by n+2 domain.
const Grid = {0..n+1, 0..n+1};
//A n by n domain that represents the interior of 'Grid'
const Interior = {1..n, 1..n};
var A, T : [Grid] real;//Zero initialized as default
A[..,0] = -273.15; //Left column
A[..,n+1] = -273.15; //Right column
A[n+1,..] = -273.15; //Bottom row
A[0,..] = 40.0; //Top row
timer.start();
var iter_count = 0;
do{
//Since all iterations are independent, we can use 'forall', which allows
//the Chapel runtime system to calculate the iterations in parallel
forall (i,j) in Interior do//Iterate over all non-border cells
{
//Assign each cell in 'T' the mean of its neighboring cells in 'A'
T[i,j] = (A[i,j] + A[i-1,j] + A[i+1,j] + A[i,j-1] + A[i,j+1]) / 5;
}
//Delta is the total amount of change done in this iteration
const delta = + reduce abs(A[Interior] - T[Interior]);
//Copy back the non-border cells
A[Interior] = T[Interior];
//if 'iterations' is non-zero we stop after a fixed number of iterations
//otherwise we stop when the calculation has converged, i.e. 'delta' is smaller than 'epsilon'.
var stop = false;
if(iterations > 0)
{
if iter_count >= iterations then
stop = true;
}
else
{
if delta < epsilon then
stop = true;
}
} while (!stop);
timer.stop();
writeln("Heat Equation (single machine) - n: ",n,
", iterations: ", iterations,
", elapsed: ", timer.elapsed(), " seconds");
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment