From d6c2a7c3306c00f19ddcb0c43a3078e51a30bf48 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen"
Date: Fri, 15 Apr 2016 14:25:10 +0200
Subject: [PATCH] Updated the chapel/heat_equation

 heat_equation/README.md                  | 85 ++++++++++++++++++++++++
 heat_equation/README.rst                 | 51 ------------------------
 heat_equation/src/multiple_machines.chpl | 28 ++++++----
 heat_equation/src/sequential.chpl        | 28 ++++++----
 heat_equation/src/single_machine.chpl    | 28 ++++++----
 5 files changed, 151 insertions(+), 69 deletions(-)
 create mode 100644 heat_equation/README.md
 delete mode 100644 heat_equation/README.rst
diff --git a/heat_equation/README.md b/heat_equation/README.md
new file mode 100644
index 0000000..62898aa
--- /dev/null
+++ b/heat_equation/README.md
@@ -0,0 +1,85 @@
+Heat Equation
+=============
+
+In this example, we solve the heat equation. The idea is to apply a 5-point stencil on a domain iteratively until equilibrium.
+
+Sequential
+----------
+
+[sequential.chpl](src/sequential.chpl) is a sequential implementation of the heat equation written in Chapel. The stencil computation is the most time-consuming part of the code and looks like:
+
+```
+    for (i,j) in Interior do//Iterate over all non-border cells
+    {
+        //Assign each cell in 'T' the mean of its neighboring cells in 'A'
+        T[i,j] = (A[i,j] + A[i-1,j] + A[i+1,j] + A[i,j-1] + A[i,j+1]) / 5;
+    }
+```
+
+Basically, each *interior* element in `T` gets the mean of the corresponding element in `A` as well as the neighboring elements. Since `for` is a sequential language construct in Chapel, a single CPU-core will execute this code.
+
+Now, let's run it:
+
+```
+    ./bin/heat_equation -nl 1 --size=5000*10
+    Heat Equation (sequential) - n: 5000, iterations: 10, elapsed-time: 381.5 seconds
+```
+
+Multicore
+---------
+
+In order to improve the performance, we can tell Chapel to use threads to execute the stencil operations in parallel ([single_machine.chpl](src/single_machine.chpl)). We do that by replacing `for` with `forall`, which tells Chapel to execute each iteration in `Interior` in parallel.
+It is our responsibility to make sure that each iteration in the `forall` loop is independent in order not to introduce race conditions.
+
+In this case each iteration is clearly independent since we do not read `T`:
+
+```
+    forall (i,j) in Interior do//Iterate over all non-border cells
+    {
+        //Assign each cell in 'T' the mean of its neighboring cells in 'A'
+        T[i,j] = (A[i,j] + A[i-1,j] + A[i+1,j] + A[i,j-1] + A[i,j+1]) / 5;
+    }
+```
+
+Now, let's run it (note that `CHPL_RT_NUM_THREADS_PER_LOCALE` tells Chapel the number of threads to use):
+
+```
+ export CHPL_RT_NUM_THREADS_PER_LOCALE=16
+    ./bin/heat_equation -nl 1 --size=5000*10
+    Heat Equation (single machine) - n: 5000, iterations: 10, elapsed-time: 25.7052 seconds
+```
+
+Multiple Machines
+-----------------
+
+In order to improve the performance even further, we can tell Chapel to execute the stencil operation in parallel on multiple machines ([multiple_machines.chpl](src/multiple_machines.chpl)).
+We still use the `forall` loop construct, but we have to tell Chapel how to distribute `A` and `T` between the multiple machines. For that, we use the `dmapped` language construct when defining the `Grid` and `Interior` domain:
+
+```
+ //A n+2 by n+2 domain.
+ const Grid = {0..n+1, 0..n+1} dmapped Block({1..n, 1..n});
+
+ //A n by n domain that represents the interior of 'Grid'
+ const Interior = {1..n, 1..n} dmapped Block({1..n, 1..n});
+
+ var A, T : [Grid] real;//Zero initialized as default
+```
+
+We tell Chapel to use the same *block* distribution of the `Grid` and `Interior` domain such that each index in `Grid` has the same location as the corresponding index in `Interior`. Because they use the same distribution, no communication is needed when accessing the same index. For example, the operations `A[2,4] + T[2,4]` can be done locally on the machine that *owns* index `[2,4]`. However, it also means that a operations such as `A[2,4] + T[3,4]` will generally require communication.
+
+Now, let's run it (note that `-nl 8` tells Chapel to use eight locales):
+
+```
+    export CHPL_RT_NUM_THREADS_PER_LOCALE=16
+    ./bin/heat_equation -nl 8 --size=5000*10
+    Heat Equation (multiple machines) - n: 5000, iterations: 10, elapsed-time: 5.13 seconds
+```
+
+It is very important that all arrays in the calculation use similar `dmapped` layouts. For example, if we do not use `dmapped` when defining `Interior` we get horrible performance:
+
+```
+    export CHPL_RT_NUM_THREADS_PER_LOCALE=16
+    ./bin/heat_equation -nl 8 --size=5000*10
+    Heat Equation (multiple machines) - n: 5000, iterations: 10, elapsed-time: 1823.23 seconds
+```
+
diff --git a/heat_equation/README.rst b/heat_equation/README.rst
deleted file mode 100644
index 440a205..0000000
--- a/heat_equation/README.rst
+++ /dev/null
@@ -1,51 +0,0 @@
-Heat Equation
-=============
-
-In this example, we solve the heat equation. The idea is to apply a 5-point stencil on a domain iteratively until equilibrium.
-
-Sequential
-----------
-
-`sequential.chpl <src/sequential.chpl>`_ is a sequential implementation of the heat equation written in Chapel. The stencil computation is the most time consuming part of the code and look like::
-
-    for (i,j) in Interior do//Iterate over all non-border cells
-    {
-        //Assign each cell in 'T' the mean of its neighboring cells in 'A'
-        T[i,j] = (A[i,j] + A[i-1,j] + A[i+1,j] + A[i,j-1] + A[i,j+1]) / 5;
-    }
-
-Basically, each *interior* element in ``T`` gets the mean of the corresponding element in ``A`` as well as the neighboring elements. Since ``for`` is a sequential language construct in Chapel, a single CPU-core will execute this code.
-
-
-Multicore
----------
-
-In order to improve the performance, we can tell Chapel to use threads to execute the stencil operations in parallel (`single_machine.chpl <src/single_machine.chpl>`_). We do that by replacing ``for`` with ``forall``, which tells Chapel to execute each iteration in ``Interior`` parallel.
-It is our responsibility to make sure that each iteration in the ``forall`` loop is independent in order not to introduce race conditions.
-
-Clearly in this case iteration is clearly independent since we do not read ``T``::
-
-    forall (i,j) in Interior do//Iterate over all non-border cells
-    {
-        //Assign each cell in 'T' the mean of its neighboring cells in 'A'
-        T[i,j] = (A[i,j] + A[i-1,j] + A[i+1,j] + A[i,j-1] + A[i,j+1]) / 5;
-    }
-
-
-Multiple Machines
------------------
-
-In order to improve the performance even further, we can tell Chapel to execute the stencil operation in parallel on multiple machines (`multiple_machines.chpl <src/multiple_machines.chpl>`_).
-We still use the ``forall`` loop construct, be we have to tell Chapel how to distributes ``A`` and ``T`` between the multiple machines. For that, we use the ``dmapped`` language construct when defining the ``Grid`` and ``Interior`` domain::
-
-    //A n+2 by n+2 domain.
-    const Grid = {0..n+1, 0..n+1} dmapped Block({1..n, 1..n});
-
-    //A n by n domain that represents the interior of 'Grid'
-    const Interior = {1..n, 1..n} dmapped Block({1..n, 1..n});
-
-    var A, T : [Grid] real;//Zero initialized as default
-
-We tell Chapel to use the same *block* distribution of the ``Grid`` and ``Interior`` domain such that each index in ``Grid`` has the same location as the corresponding index in ``Interior``. Because they use the same distribution, no communication is needed when accessing the same index. For example, the operations ``A[2,4] + T[2,4]`` can be done locally on the machine that *owns* index ``[2,4]``. However, it also means that a operations such as ``A[2,4] + T[3,4]`` will generally require communication.
-
-In relation to HPC, it is very importation use ``dmapped`` such that you minimize the communication requirements of your application.
diff --git a/heat_equation/src/multiple_machines.chpl b/heat_equation/src/multiple_machines.chpl
index a414ca9..0aa625f 100644
--- a/heat_equation/src/multiple_machines.chpl
+++ b/heat_equation/src/multiple_machines.chpl
@@ -2,13 +2,17 @@
 //The first integer is the domain size squired and the second integer is
 //the number of iterations.
 config const size = "100*10";//Default, 100 by 100 domain and 10 iterations
-config const epsilon = 1.0e-10;//Stop condition in amount of change
+
+//Stop condition in amount of change (ignored when 'iterations' are non-zero).
+config const epsilon = 1.0e-10;
 
 //Parse the size argument into 'n' and 'iterations'
 use Regexp;
 const arg = size.matches(compile("(\\d+)*(\\d+)"));
-const n = size.substring(arg[1][1]) : int;
-const iterations = size.substring(arg[2][1]) : int;
+const arg_n = arg[1][1];
+const arg_i = arg[2][1];
+const n = size[arg_n.offset+1..arg_n.offset+arg_n.length] : int;
+const iterations = size[arg_i.offset+1..arg_i.offset+arg_i.length]: int;
 
 //Initiate a Timer object
 use Time;
@@ -49,9 +53,21 @@ do{
     //Copy back the non-border cells
     A[Interior] = T[Interior];
 
-    //When 'delta' is smaller than 'epsilon' the calculation has converged
-    iter_count += 1;
-} while (delta > epsilon && iter_count >= iterations);
+    //if 'iterations' is non-zero we stop after a fixed number of iterations
+    //otherwise we stop when the calculation has converged, i.e. 'delta' is smaller than 'epsilon'.
+    var stop = false;
+    if(iterations > 0)
+    {
+        if iter_count >= iterations then
+            stop = true;
+    }
+    else
+    {
+        if delta < epsilon then
+            stop = true;
+    }
+
+} while (!stop);
 
 timer.stop();
 writeln("Heat Equation (multiple machines) - n: ",n,
diff --git a/heat_equation/src/sequential.chpl b/heat_equation/src/sequential.chpl
index 776e86e..5d110b6 100644
--- a/heat_equation/src/sequential.chpl
+++ b/heat_equation/src/sequential.chpl
@@ -2,13 +2,17 @@
 //The first integer is the domain size squired and the second integer is
 //the number of iterations.
 config const size = "100*10";//Default, 100 by 100 domain and 10 iterations
-config const epsilon = 1.0e-10;//Stop condition in amount of change
+
+//Stop condition in amount of change (ignored when 'iterations' are non-zero).
+config const epsilon = 1.0e-10;
 
 //Parse the size argument into 'n' and 'iterations'
 use Regexp;
 const arg = size.matches(compile("(\\d+)*(\\d+)"));
-const n = size.substring(arg[1][1]) : int;
-const iterations = size.substring(arg[2][1]) : int;
+const arg_n = arg[1][1];
+const arg_i = arg[2][1];
+const n = size[arg_n.offset+1..arg_n.offset+arg_n.length] : int;
+const iterations = size[arg_i.offset+1..arg_i.offset+arg_i.length]: int;
 
 //Initiate a Timer object
 use Time;
@@ -46,9 +50,21 @@ do{
     //Copy back the non-border cells
     A[Interior] = T[Interior];
 
-    //When 'delta' is smaller than 'epsilon' the calculation has converged
-    iter_count += 1;
-} while (delta > epsilon && iter_count >= iterations);
+    //if 'iterations' is non-zero we stop after a fixed number of iterations
+    //otherwise we stop when the calculation has converged, i.e. 'delta' is smaller than 'epsilon'.
+    var stop = false;
+    if(iterations > 0)
+    {
+        if iter_count >= iterations then
+            stop = true;
+    }
+    else
+    {
+        if delta < epsilon then
+            stop = true;
+    }
+
+} while (!stop);
 
 timer.stop();
 writeln("Heat Equation (sequential) - n: ",n,
diff --git a/heat_equation/src/single_machine.chpl b/heat_equation/src/single_machine.chpl
index e3f147c..54af20b 100644
--- a/heat_equation/src/single_machine.chpl
+++ b/heat_equation/src/single_machine.chpl
@@ -2,13 +2,17 @@
 //The first integer is the domain size squired and the second integer is
 //the number of iterations.
 config const size = "100*10";//Default, 100 by 100 domain and 10 iterations
-config const epsilon = 1.0e-10;//Stop condition in amount of change
+
+//Stop condition in amount of change (ignored when 'iterations' are non-zero).
+config const epsilon = 1.0e-10;
 
 //Parse the size argument into 'n' and 'iterations'
 use Regexp;
 const arg = size.matches(compile("(\\d+)*(\\d+)"));
-const n = size.substring(arg[1][1]) : int;
-const iterations = size.substring(arg[2][1]) : int;
+const arg_n = arg[1][1];
+const arg_i = arg[2][1];
+const n = size[arg_n.offset+1..arg_n.offset+arg_n.length] : int;
+const iterations = size[arg_i.offset+1..arg_i.offset+arg_i.length]: int;
 
 //Initiate a Timer object
 use Time;
@@ -46,9 +50,21 @@ do{
     //Copy back the non-border cells
     A[Interior] = T[Interior];
 
-    //When 'delta' is smaller than 'epsilon' the calculation has converged
-    iter_count += 1;
-} while (delta > epsilon && iter_count >= iterations);
+    //if 'iterations' is non-zero we stop after a fixed number of iterations
+    //otherwise we stop when the calculation has converged, i.e. 'delta' is smaller than 'epsilon'.
+    var stop = false;
+    if(iterations > 0)
+    {
+        if iter_count >= iterations then
+            stop = true;
+    }
+    else
+    {
+        if delta < epsilon then
+            stop = true;
+    }
+
+} while (!stop);
 
 timer.stop();
 writeln("Heat Equation (single machine) - n: ",n,

GitLab