From 4076693e8668d74af8685e1561e1c942f1f1e7bd Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen"
Date: Fri, 8 Apr 2016 13:06:02 +0200
Subject: [PATCH 1/6] Added training materials for the Chapel Programming
 Language

---
 README.md                                | 14 +++++++++
 heat_equation/Makefile                   | 20 ++++++++++++
 heat_equation/src/multiple_machines.chpl | 40 ++++++++++++++++++++++++
 heat_equation/src/single_machine.chpl    | 38 ++++++++++++++++++++++
 4 files changed, 112 insertions(+)
 create mode 100644 README.md
 create mode 100644 heat_equation/Makefile
 create mode 100644 heat_equation/src/multiple_machines.chpl
 create mode 100644 heat_equation/src/single_machine.chpl

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..084c70b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,14 @@
+Chapel
+======
+
+Compilation instructions
+------------------------
+There are no specific requirements for building the examples,
+just standard make, a working MPI environment (for the MPI examples) and
+an OpenMP enabled C or Fortran compiler (for the OpenMP examples).
+
+Move to the proper subfolder (C or Fortran) and modify the top of the **Makefile**
+according to your environment (proper compiler commands and compiler flags).
+
+All examples can be built with a simple **make**; **make mpi** builds the MPI
+examples and **make omp** the OpenMP examples.

diff --git a/heat_equation/Makefile b/heat_equation/Makefile
new file mode 100644
index 0000000..0768425
--- /dev/null
+++ b/heat_equation/Makefile
@@ -0,0 +1,20 @@
+# Makefile that builds each src/*.chpl file into a binary in bin/*
+
+CC=chpl
+CFLAGS=-g
+LDFLAGS=
+
+SRC=$(wildcard src/*.chpl)
+PROGRAM=$(addprefix bin/, $(subst .chpl,, $(subst src/,,$(SRC))))
+
+all: mkdir $(PROGRAM)
+
+bin/% : src/%.chpl
+	$(CC) $(CFLAGS) -o $@ $<
+
+.PHONY: clean mkdir
+
+mkdir:
+	mkdir -p bin
+clean:
+	rm -rf bin

diff --git a/heat_equation/src/multiple_machines.chpl b/heat_equation/src/multiple_machines.chpl
new file mode 100644
index 0000000..9418082
--- /dev/null
+++ b/heat_equation/src/multiple_machines.chpl
@@ -0,0 +1,40 @@
+use BlockDist;
+
+config const n = 8;//Size of the domain squared
+config const epsilon = 1.0e-10;//Stop condition in amount of change
+config var iterations = 1000;//Stop condition in number of iterations
+
+//An n+2 by n+2 domain.
+const Grid = {0..n+1, 0..n+1} dmapped Block({1..n, 1..n});
+
+//An n by n domain that represents the interior of 'Grid'
+const Interior = {1..n, 1..n};
+
+var A, T : [Grid] real;//Zero initialized as default
+
+A[..,0] = -273.15; //Left column
+A[..,n+1] = -273.15; //Right column
+A[n+1,..] = -273.15; //Bottom row
+A[0,..] = 40.0; //Top row
+
+do{
+
+    //Since all iterations are independent, we can use 'forall', which allows
+    //the Chapel runtime system to calculate the iterations in parallel
+    forall (i,j) in Interior do//Iterate over all non-border cells
+    {
+        //Assign each cell in 'T' the mean of its neighboring cells in 'A'
+        T[i,j] = (A[i,j] + A[i-1,j] + A[i+1,j] + A[i,j-1] + A[i,j+1]) / 5;
+    }
+
+    //Delta is the total amount of change done in this iteration
+    const delta = + reduce abs(A[Interior] - T[Interior]);
+
+    //Copy back the non-border cells
+    A[Interior] = T[Interior];
+
+    //When 'delta' is smaller than 'epsilon' the calculation has converged
+    iterations -= 1;
+} while (delta > epsilon && iterations > 0);
+
+

diff --git a/heat_equation/src/single_machine.chpl b/heat_equation/src/single_machine.chpl
new file mode 100644
index 0000000..9453d75
--- /dev/null
+++ b/heat_equation/src/single_machine.chpl
@@ -0,0 +1,38 @@
+config const n = 8;//Size of the domain squared
+config const epsilon = 1.0e-10;//Stop condition in amount of change
+config var iterations = 1000;//Stop condition in number of iterations
+
+//An n+2 by n+2 domain.
+const Grid = {0..n+1, 0..n+1};
+
+//An n by n domain that represents the interior of 'Grid'
+const Interior = {1..n, 1..n};
+
+var A, T : [Grid] real;//Zero initialized as default
+
+A[..,0] = -273.15; //Left column
+A[..,n+1] = -273.15; //Right column
+A[n+1,..] = -273.15; //Bottom row
+A[0,..] = 40.0; //Top row
+
+do{
+
+    //Since all iterations are independent, we can use 'forall', which allows
+    //the Chapel runtime system to calculate the iterations in parallel
+    forall (i,j) in Interior do//Iterate over all non-border cells
+    {
+        //Assign each cell in 'T' the mean of its neighboring cells in 'A'
+        T[i,j] = (A[i,j] + A[i-1,j] + A[i+1,j] + A[i,j-1] + A[i,j+1]) / 5;
+    }
+
+    //Delta is the total amount of change done in this iteration
+    const delta = + reduce abs(A[Interior] - T[Interior]);
+
+    //Copy back the non-border cells
+    A[Interior] = T[Interior];
+
+    //When 'delta' is smaller than 'epsilon' the calculation has converged
+    iterations -= 1;
+} while (delta > epsilon && iterations > 0);
+
+
--
GitLab

From d87e74ee4f5ebd817e7d6b5c2a9ec6cf3d908390 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen"
Date: Fri, 8 Apr 2016 16:22:36 +0200
Subject: [PATCH 2/6] added a sequential implementation of the heat equation

---
 README.md                                | 14 -------
 README.rst                               |  9 +++++
 heat_equation/README.rst                 | 51 ++++++++++++++++++++++++
 heat_equation/src/multiple_machines.chpl |  2 +-
 heat_equation/src/sequential.chpl        | 38 ++++++++++++++++++
 5 files changed, 99 insertions(+), 15 deletions(-)
 delete mode 100644 README.md
 create mode 100644 README.rst
 create mode 100644 heat_equation/README.rst
 create mode 100644 heat_equation/src/sequential.chpl

diff --git a/README.md b/README.md
deleted file mode 100644
index 084c70b..0000000
--- a/README.md
+++ /dev/null
@@ -1,14 +0,0 @@
-Chapel
-======
-
-Compilation instructions
-------------------------
-There are no specific requirements for building the examples,
-just standard make, a working MPI environment (for the MPI examples) and
-an OpenMP enabled C or Fortran compiler (for the OpenMP examples).
-
-Move to the proper subfolder (C or Fortran) and modify the top of the **Makefile**
-according to your environment (proper compiler commands and compiler flags).
-
-All examples can be built with a simple **make**; **make mpi** builds the MPI
-examples and **make omp** the OpenMP examples.
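The convergence measure used in both kernels above, ``+ reduce abs(A[Interior] - T[Interior])``, works because ``A - T`` and ``abs`` are *promoted* elementwise over the interior slice and ``+ reduce`` then sums the results into a scalar. A minimal standalone sketch of the same idiom (illustrative only; the 3 by 3 domain ``D`` is not part of the examples)::

    const D = {1..3, 1..3};
    var A, T : [D] real;
    A = 1.0;
    T = 3.0;

    //'A - T' and 'abs(...)' are applied elementwise; '+ reduce' sums all elements
    const delta = + reduce abs(A - T);
    writeln(delta); //prints 18.0: nine elements, each |1.0 - 3.0| = 2.0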
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..b1f7fe1
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,9 @@
+Chapel
+======
+
+Compilation instructions
+------------------------
+In order to compile and run these examples, the only requirement is a working Chapel compiler and make. You can download Chapel at http://chapel.cray.com/.
+
+All examples can be built with a simple **make**.
+

diff --git a/heat_equation/README.rst b/heat_equation/README.rst
new file mode 100644
index 0000000..440a205
--- /dev/null
+++ b/heat_equation/README.rst
@@ -0,0 +1,51 @@
+Heat Equation
+=============
+
+In this example, we solve the heat equation. The idea is to apply a 5-point stencil on a domain iteratively until equilibrium.
+
+Sequential
+----------
+
+`sequential.chpl <src/sequential.chpl>`_ is a sequential implementation of the heat equation written in Chapel. The stencil computation is the most time-consuming part of the code and looks like::
+
+    for (i,j) in Interior do//Iterate over all non-border cells
+    {
+        //Assign each cell in 'T' the mean of its neighboring cells in 'A'
+        T[i,j] = (A[i,j] + A[i-1,j] + A[i+1,j] + A[i,j-1] + A[i,j+1]) / 5;
+    }
+
+Basically, each *interior* element in ``T`` gets the mean of the corresponding element in ``A`` as well as the neighboring elements. Since ``for`` is a sequential language construct in Chapel, a single CPU-core will execute this code.
+
+
+Multi-core
+----------
+
+In order to improve the performance, we can tell Chapel to use threads to execute the stencil operations in parallel (`single_machine.chpl <src/single_machine.chpl>`_). We do that by replacing ``for`` with ``forall``, which tells Chapel to execute each iteration in ``Interior`` in parallel.
+It is our responsibility to make sure that each iteration in the ``forall`` loop is independent in order not to introduce race conditions.
+
+In this case, each iteration is clearly independent since we never read from ``T``::
+
+    forall (i,j) in Interior do//Iterate over all non-border cells
+    {
+        //Assign each cell in 'T' the mean of its neighboring cells in 'A'
+        T[i,j] = (A[i,j] + A[i-1,j] + A[i+1,j] + A[i,j-1] + A[i,j+1]) / 5;
+    }
+
+
+Multiple Machines
+-----------------
+
+In order to improve the performance even further, we can tell Chapel to execute the stencil operation in parallel on multiple machines (`multiple_machines.chpl <src/multiple_machines.chpl>`_).
+We still use the ``forall`` loop construct, but we have to tell Chapel how to distribute ``A`` and ``T`` between the multiple machines. For that, we use the ``dmapped`` language construct when defining the ``Grid`` and ``Interior`` domains::
+
+    //An n+2 by n+2 domain.
+    const Grid = {0..n+1, 0..n+1} dmapped Block({1..n, 1..n});
+
+    //An n by n domain that represents the interior of 'Grid'
+    const Interior = {1..n, 1..n} dmapped Block({1..n, 1..n});
+
+    var A, T : [Grid] real;//Zero initialized as default
+
+We tell Chapel to use the same *block* distribution for the ``Grid`` and ``Interior`` domains such that each index in ``Grid`` has the same location as the corresponding index in ``Interior``. Because they use the same distribution, no communication is needed when accessing the same index. For example, the operation ``A[2,4] + T[2,4]`` can be done locally on the machine that *owns* index ``[2,4]``. However, it also means that an operation such as ``A[2,4] + T[3,4]`` will generally require communication.
+
+In relation to HPC, it is very important to use ``dmapped`` such that you minimize the communication requirements of your application.
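The locality argument above is easy to observe directly. The following standalone sketch (illustrative only; it assumes a multi-locale Chapel build, and the names ``D`` and ``X`` do not appear in the examples) fills a Block-distributed array with the id of the locale that owns each index::

    use BlockDist;

    config const n = 4;

    //Each index of 'D' is owned by exactly one locale
    const D = {1..n, 1..n} dmapped Block({1..n, 1..n});
    var X : [D] int;

    //A forall over a distributed domain runs each iteration on the locale
    //that owns the index, so the write to X[i,j] is always local
    forall (i,j) in D do
        X[i,j] = here.id;

    writeln(X);

Run with, e.g., ``-nl 4``, and the output shows the domain split into contiguous blocks; this is exactly why ``A[2,4] + T[2,4]`` stays local when both arrays share the same distribution.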
diff --git a/heat_equation/src/multiple_machines.chpl b/heat_equation/src/multiple_machines.chpl
index 9418082..aaccf39 100644
--- a/heat_equation/src/multiple_machines.chpl
+++ b/heat_equation/src/multiple_machines.chpl
@@ -8,7 +8,7 @@ config var iterations = 1000;//Stop condition in number of iterations
 const Grid = {0..n+1, 0..n+1} dmapped Block({1..n, 1..n});
 
 //An n by n domain that represents the interior of 'Grid'
-const Interior = {1..n, 1..n};
+const Interior = {1..n, 1..n} dmapped Block({1..n, 1..n});
 
 var A, T : [Grid] real;//Zero initialized as default
 

diff --git a/heat_equation/src/sequential.chpl b/heat_equation/src/sequential.chpl
new file mode 100644
index 0000000..968d491
--- /dev/null
+++ b/heat_equation/src/sequential.chpl
@@ -0,0 +1,38 @@
+config const n = 8;//Size of the domain squared
+config const epsilon = 1.0e-10;//Stop condition in amount of change
+config var iterations = 1000;//Stop condition in number of iterations
+
+//An n+2 by n+2 domain.
+const Grid = {0..n+1, 0..n+1};
+
+//An n by n domain that represents the interior of 'Grid'
+const Interior = {1..n, 1..n};
+
+var A, T : [Grid] real;//Zero initialized as default
+
+A[..,0] = -273.15; //Left column
+A[..,n+1] = -273.15; //Right column
+A[n+1,..] = -273.15; //Bottom row
+A[0,..] = 40.0; //Top row
+
+do{
+
+    //Since 'for' is a sequential language construct, a single task
+    //executes the iterations one after another
+    for (i,j) in Interior do//Iterate over all non-border cells
+    {
+        //Assign each cell in 'T' the mean of its neighboring cells in 'A'
+        T[i,j] = (A[i,j] + A[i-1,j] + A[i+1,j] + A[i,j-1] + A[i,j+1]) / 5;
+    }
+
+    //Delta is the total amount of change done in this iteration
+    const delta = + reduce abs(A[Interior] - T[Interior]);
+
+    //Copy back the non-border cells
+    A[Interior] = T[Interior];
+
+    //When 'delta' is smaller than 'epsilon' the calculation has converged
+    iterations -= 1;
+} while (delta > epsilon && iterations > 0);
+
+
--
GitLab

From 1d8e39635510e8b843cfdba5d0795433eade6c67 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen"
Date: Tue, 12 Apr 2016 14:11:44 +0200
Subject: [PATCH 3/6] Now using the --size argument in the Heat Equation
 example

---
 heat_equation/src/multiple_machines.chpl | 35 +++++++++++++++++++-----
 heat_equation/src/sequential.chpl        | 30 ++++++++++++++++----
 heat_equation/src/single_machine.chpl    | 30 ++++++++++++++++----
 3 files changed, 78 insertions(+), 17 deletions(-)

diff --git a/heat_equation/src/multiple_machines.chpl b/heat_equation/src/multiple_machines.chpl
index aaccf39..a414ca9 100644
--- a/heat_equation/src/multiple_machines.chpl
+++ b/heat_equation/src/multiple_machines.chpl
@@ -1,8 +1,23 @@
-use BlockDist;
-
-config const n = 8;//Size of the domain squared
+//The format of 'size' is two integers separated with a '*'.
+//The first integer is the domain size squared and the second integer is
+//the number of iterations.
+config const size = "100*10";//Default, 100 by 100 domain and 10 iterations
 config const epsilon = 1.0e-10;//Stop condition in amount of change
-config var iterations = 1000;//Stop condition in number of iterations
+
+//Parse the --size argument into 'n' and 'iterations'
+use Regexp;
+const arg = size.matches(compile("(\\d+)\\*(\\d+)"));
+const n = size.substring(arg[1][1]) : int;
+const iterations = size.substring(arg[2][1]) : int;
+
+//Initialize a Timer object
+use Time;
+var timer : Timer;
+
+//Now, let's implement the heat equation!
+
+//We will use the Block distribution
+use BlockDist;
 
 //An n+2 by n+2 domain.
 const Grid = {0..n+1, 0..n+1} dmapped Block({1..n, 1..n});
@@ -17,8 +32,9 @@ A[..,n+1] = -273.15; //Right column
 A[n+1,..] = -273.15; //Bottom row
 A[0,..] = 40.0; //Top row
 
+timer.start();
+var iter_count = 0;
 do{
-
     //Since all iterations are independent, we can use 'forall', which allows
     //the Chapel runtime system to calculate the iterations in parallel
     forall (i,j) in Interior do//Iterate over all non-border cells
@@ -34,7 +50,12 @@ do{
     //Delta is the total amount of change done in this iteration
     const delta = + reduce abs(A[Interior] - T[Interior]);
     //Copy back the non-border cells
     A[Interior] = T[Interior];
     //When 'delta' is smaller than 'epsilon' the calculation has converged
-    iterations -= 1;
-} while (delta > epsilon && iterations > 0);
+    iter_count += 1;
+} while (delta > epsilon && iter_count < iterations);
+
+timer.stop();
+writeln("Heat Equation (multiple machines) - n: ",n,
+        ", iterations: ", iterations,
+        ", elapsed: ", timer.elapsed(), " seconds");

diff --git a/heat_equation/src/sequential.chpl b/heat_equation/src/sequential.chpl
index 968d491..776e86e 100644
--- a/heat_equation/src/sequential.chpl
+++ b/heat_equation/src/sequential.chpl
@@ -1,6 +1,20 @@
-config const n = 8;//Size of the domain squared
+//The format of 'size' is two integers separated with a '*'.
+//The first integer is the domain size squared and the second integer is
+//the number of iterations.
+config const size = "100*10";//Default, 100 by 100 domain and 10 iterations
 config const epsilon = 1.0e-10;//Stop condition in amount of change
-config var iterations = 1000;//Stop condition in number of iterations
+
+//Parse the --size argument into 'n' and 'iterations'
+use Regexp;
+const arg = size.matches(compile("(\\d+)\\*(\\d+)"));
+const n = size.substring(arg[1][1]) : int;
+const iterations = size.substring(arg[2][1]) : int;
+
+//Initialize a Timer object
+use Time;
+var timer : Timer;
+
+//Now, let's implement the heat equation!
 
 //An n+2 by n+2 domain.
 const Grid = {0..n+1, 0..n+1};
@@ -15,8 +29,9 @@ A[..,n+1] = -273.15; //Right column
 A[n+1,..] = -273.15; //Bottom row
 A[0,..] = 40.0; //Top row
 
+timer.start();
+var iter_count = 0;
 do{
-
     //Since 'for' is a sequential language construct, a single task
     //executes the iterations one after another
     for (i,j) in Interior do//Iterate over all non-border cells
@@ -32,7 +47,12 @@ do{
     //Delta is the total amount of change done in this iteration
     const delta = + reduce abs(A[Interior] - T[Interior]);
     //Copy back the non-border cells
     A[Interior] = T[Interior];
     //When 'delta' is smaller than 'epsilon' the calculation has converged
-    iterations -= 1;
-} while (delta > epsilon && iterations > 0);
+    iter_count += 1;
+} while (delta > epsilon && iter_count < iterations);
+
+timer.stop();
+writeln("Heat Equation (sequential) - n: ",n,
+        ", iterations: ", iterations,
+        ", elapsed: ", timer.elapsed(), " seconds");

diff --git a/heat_equation/src/single_machine.chpl b/heat_equation/src/single_machine.chpl
index 9453d75..e3f147c 100644
--- a/heat_equation/src/single_machine.chpl
+++ b/heat_equation/src/single_machine.chpl
@@ -1,6 +1,20 @@
-config const n = 8;//Size of the domain squared
+//The format of 'size' is two integers separated with a '*'.
+//The first integer is the domain size squared and the second integer is
+//the number of iterations.
+config const size = "100*10";//Default, 100 by 100 domain and 10 iterations
 config const epsilon = 1.0e-10;//Stop condition in amount of change
-config var iterations = 1000;//Stop condition in number of iterations
+
+//Parse the --size argument into 'n' and 'iterations'
+use Regexp;
+const arg = size.matches(compile("(\\d+)\\*(\\d+)"));
+const n = size.substring(arg[1][1]) : int;
+const iterations = size.substring(arg[2][1]) : int;
+
+//Initialize a Timer object
+use Time;
+var timer : Timer;
+
+//Now, let's implement the heat equation!
 
 //An n+2 by n+2 domain.
 const Grid = {0..n+1, 0..n+1};
@@ -15,8 +29,9 @@ A[..,n+1] = -273.15; //Right column
 A[n+1,..] = -273.15; //Bottom row
 A[0,..] = 40.0; //Top row
 
+timer.start();
+var iter_count = 0;
 do{
-
     //Since all iterations are independent, we can use 'forall', which allows
     //the Chapel runtime system to calculate the iterations in parallel
     forall (i,j) in Interior do//Iterate over all non-border cells
@@ -32,7 +47,12 @@ do{
     //Delta is the total amount of change done in this iteration
     const delta = + reduce abs(A[Interior] - T[Interior]);
     //Copy back the non-border cells
     A[Interior] = T[Interior];
     //When 'delta' is smaller than 'epsilon' the calculation has converged
-    iterations -= 1;
-} while (delta > epsilon && iterations > 0);
+    iter_count += 1;
+} while (delta > epsilon && iter_count < iterations);
+
+timer.stop();
+writeln("Heat Equation (single machine) - n: ",n,
+        ", iterations: ", iterations,
+        ", elapsed: ", timer.elapsed(), " seconds");
--
GitLab

From d6c2a7c3306c00f19ddcb0c43a3078e51a30bf48 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen"
Date: Fri, 15 Apr 2016 14:25:10 +0200
Subject: [PATCH 4/6] Updated the chapel/heat_equation

---
 heat_equation/README.md                  | 85 ++++++++++++++++++++++++
 heat_equation/README.rst                 | 51 --------------
 heat_equation/src/multiple_machines.chpl | 28 ++++++--
 heat_equation/src/sequential.chpl        | 28 ++++++--
 heat_equation/src/single_machine.chpl    | 28 ++++++--
 5 files changed, 151 insertions(+), 69 deletions(-)
 create mode 100644 heat_equation/README.md
 delete mode 100644 heat_equation/README.rst

diff --git a/heat_equation/README.md b/heat_equation/README.md
new file mode 100644
index 0000000..62898aa
--- /dev/null
+++ b/heat_equation/README.md
@@ -0,0 +1,85 @@
+Heat Equation
+=============
+
+In this example, we solve the heat equation. The idea is to apply a 5-point stencil on a domain iteratively until equilibrium.
+
+Sequential
+----------
+
+[sequential.chpl](src/sequential.chpl) is a sequential implementation of the heat equation written in Chapel. The stencil computation is the most time-consuming part of the code and looks like:
+
+```
+for (i,j) in Interior do//Iterate over all non-border cells
+{
+    //Assign each cell in 'T' the mean of its neighboring cells in 'A'
+    T[i,j] = (A[i,j] + A[i-1,j] + A[i+1,j] + A[i,j-1] + A[i,j+1]) / 5;
+}
+```
+
+Basically, each *interior* element in `T` gets the mean of the corresponding element in `A` as well as the neighboring elements. Since `for` is a sequential language construct in Chapel, a single CPU-core will execute this code.
+
+Now, let's run it:
+
+```
+./bin/sequential -nl 1 --size=5000*10
+Heat Equation (sequential) - n: 5000, iterations: 10, elapsed: 381.5 seconds
+```
+
+Multi-core
+----------
+
+In order to improve the performance, we can tell Chapel to use threads to execute the stencil operations in parallel ([single_machine.chpl](src/single_machine.chpl)). We do that by replacing `for` with `forall`, which tells Chapel to execute each iteration in `Interior` in parallel.
+It is our responsibility to make sure that each iteration in the `forall` loop is independent in order not to introduce race conditions.
+
+In this case, each iteration is clearly independent since we never read from `T`:
+
+```
+forall (i,j) in Interior do//Iterate over all non-border cells
+{
+    //Assign each cell in 'T' the mean of its neighboring cells in 'A'
+    T[i,j] = (A[i,j] + A[i-1,j] + A[i+1,j] + A[i,j-1] + A[i,j+1]) / 5;
+}
+```
+
+Now, let's run it (note that `CHPL_RT_NUM_THREADS_PER_LOCALE` tells Chapel the number of threads to use):
+
+```
+export CHPL_RT_NUM_THREADS_PER_LOCALE=16
+./bin/single_machine -nl 1 --size=5000*10
+Heat Equation (single machine) - n: 5000, iterations: 10, elapsed: 25.7052 seconds
+```
+
+Multiple Machines
+-----------------
+
+In order to improve the performance even further, we can tell Chapel to execute the stencil operation in parallel on multiple machines (`multiple_machines.chpl `).
+We still use the `forall` loop construct, but we have to tell Chapel how to distribute `A` and `T` between the multiple machines. For that, we use the `dmapped` language construct when defining the `Grid` and `Interior` domains:
+
+```
+//An n+2 by n+2 domain.
+const Grid = {0..n+1, 0..n+1} dmapped Block({1..n, 1..n});
+
+//An n by n domain that represents the interior of 'Grid'
+const Interior = {1..n, 1..n} dmapped Block({1..n, 1..n});
+
+var A, T : [Grid] real;//Zero initialized as default
+```
+
+We tell Chapel to use the same *block* distribution for the `Grid` and `Interior` domains such that each index in `Grid` has the same location as the corresponding index in `Interior`. Because they use the same distribution, no communication is needed when accessing the same index. For example, the operation `A[2,4] + T[2,4]` can be done locally on the machine that *owns* index `[2,4]`. However, it also means that an operation such as `A[2,4] + T[3,4]` will generally require communication.
+
+Now, let's run it (note that `-nl 8` tells Chapel to use eight locales):
+
+```
+export CHPL_RT_NUM_THREADS_PER_LOCALE=16
+./bin/multiple_machines -nl 8 --size=5000*10
+Heat Equation (multiple machines) - n: 5000, iterations: 10, elapsed: 5.13 seconds
+```
+
+It is very important that all arrays in the calculation use similar `dmapped` layouts. For example, if we do not use `dmapped` when defining `Interior`, we get horrible performance:
+
+```
+export CHPL_RT_NUM_THREADS_PER_LOCALE=16
+./bin/multiple_machines -nl 8 --size=5000*10
+Heat Equation (multiple machines) - n: 5000, iterations: 10, elapsed: 1823.23 seconds
+```
+

diff --git a/heat_equation/README.rst b/heat_equation/README.rst
deleted file mode 100644
index 440a205..0000000
--- a/heat_equation/README.rst
+++ /dev/null
@@ -1,51 +0,0 @@
-Heat Equation
-=============
-
-In this example, we solve the heat equation. The idea is to apply a 5-point stencil on a domain iteratively until equilibrium.
-
-Sequential
-----------
-
-`sequential.chpl <src/sequential.chpl>`_ is a sequential implementation of the heat equation written in Chapel. The stencil computation is the most time-consuming part of the code and looks like::
-
-    for (i,j) in Interior do//Iterate over all non-border cells
-    {
-        //Assign each cell in 'T' the mean of its neighboring cells in 'A'
-        T[i,j] = (A[i,j] + A[i-1,j] + A[i+1,j] + A[i,j-1] + A[i,j+1]) / 5;
-    }
-
-Basically, each *interior* element in ``T`` gets the mean of the corresponding element in ``A`` as well as the neighboring elements. Since ``for`` is a sequential language construct in Chapel, a single CPU-core will execute this code.
-
-
-Multi-core
-----------
-
-In order to improve the performance, we can tell Chapel to use threads to execute the stencil operations in parallel (`single_machine.chpl <src/single_machine.chpl>`_). We do that by replacing ``for`` with ``forall``, which tells Chapel to execute each iteration in ``Interior`` in parallel.
-It is our responsibility to make sure that each iteration in the ``forall`` loop is independent in order not to introduce race conditions.
-
-In this case, each iteration is clearly independent since we never read from ``T``::
-
-    forall (i,j) in Interior do//Iterate over all non-border cells
-    {
-        //Assign each cell in 'T' the mean of its neighboring cells in 'A'
-        T[i,j] = (A[i,j] + A[i-1,j] + A[i+1,j] + A[i,j-1] + A[i,j+1]) / 5;
-    }
-
-
-Multiple Machines
------------------
-
-In order to improve the performance even further, we can tell Chapel to execute the stencil operation in parallel on multiple machines (`multiple_machines.chpl <src/multiple_machines.chpl>`_).
-We still use the ``forall`` loop construct, but we have to tell Chapel how to distribute ``A`` and ``T`` between the multiple machines. For that, we use the ``dmapped`` language construct when defining the ``Grid`` and ``Interior`` domains::
-
-    //An n+2 by n+2 domain.
-    const Grid = {0..n+1, 0..n+1} dmapped Block({1..n, 1..n});
-
-    //An n by n domain that represents the interior of 'Grid'
-    const Interior = {1..n, 1..n} dmapped Block({1..n, 1..n});
-
-    var A, T : [Grid] real;//Zero initialized as default
-
-We tell Chapel to use the same *block* distribution for the ``Grid`` and ``Interior`` domains such that each index in ``Grid`` has the same location as the corresponding index in ``Interior``. Because they use the same distribution, no communication is needed when accessing the same index. For example, the operation ``A[2,4] + T[2,4]`` can be done locally on the machine that *owns* index ``[2,4]``. However, it also means that an operation such as ``A[2,4] + T[3,4]`` will generally require communication.
-
-In relation to HPC, it is very important to use ``dmapped`` such that you minimize the communication requirements of your application.

diff --git a/heat_equation/src/multiple_machines.chpl b/heat_equation/src/multiple_machines.chpl
index a414ca9..0aa625f 100644
--- a/heat_equation/src/multiple_machines.chpl
+++ b/heat_equation/src/multiple_machines.chpl
@@ -2,13 +2,17 @@
 //The first integer is the domain size squared and the second integer is
 //the number of iterations.
 config const size = "100*10";//Default, 100 by 100 domain and 10 iterations
-config const epsilon = 1.0e-10;//Stop condition in amount of change
+
+//Stop condition in amount of change (ignored when 'iterations' is non-zero).
+config const epsilon = 1.0e-10;
 
 //Parse the --size argument into 'n' and 'iterations'
 use Regexp;
 const arg = size.matches(compile("(\\d+)\\*(\\d+)"));
-const n = size.substring(arg[1][1]) : int;
-const iterations = size.substring(arg[2][1]) : int;
+const arg_n = arg[1][1];
+const arg_i = arg[2][1];
+const n = size[arg_n.offset+1..arg_n.offset+arg_n.length] : int;
+const iterations = size[arg_i.offset+1..arg_i.offset+arg_i.length] : int;
 
 //Initialize a Timer object
 use Time;
@@ -49,9 +53,21 @@ do{
     //Copy back the non-border cells
     A[Interior] = T[Interior];
 
-    //When 'delta' is smaller than 'epsilon' the calculation has converged
-    iter_count += 1;
-} while (delta > epsilon && iter_count < iterations);
+    //if 'iterations' is non-zero we stop after a fixed number of iterations;
+    //otherwise we stop when the calculation has converged, i.e. 'delta' is smaller than 'epsilon'.
+    iter_count += 1;
+    var stop = false;
+    if(iterations > 0)
+    {
+        if iter_count >= iterations then
+            stop = true;
+    }
+    else
+    {
+        if delta < epsilon then
+            stop = true;
+    }
+} while (!stop);
 
 timer.stop();
 writeln("Heat Equation (multiple machines) - n: ",n,

diff --git a/heat_equation/src/sequential.chpl b/heat_equation/src/sequential.chpl
index 776e86e..5d110b6 100644
--- a/heat_equation/src/sequential.chpl
+++ b/heat_equation/src/sequential.chpl
@@ -2,13 +2,17 @@
 //The first integer is the domain size squared and the second integer is
 //the number of iterations.
 config const size = "100*10";//Default, 100 by 100 domain and 10 iterations
-config const epsilon = 1.0e-10;//Stop condition in amount of change
+
+//Stop condition in amount of change (ignored when 'iterations' is non-zero).
+config const epsilon = 1.0e-10;
 
 //Parse the --size argument into 'n' and 'iterations'
 use Regexp;
 const arg = size.matches(compile("(\\d+)\\*(\\d+)"));
-const n = size.substring(arg[1][1]) : int;
-const iterations = size.substring(arg[2][1]) : int;
+const arg_n = arg[1][1];
+const arg_i = arg[2][1];
+const n = size[arg_n.offset+1..arg_n.offset+arg_n.length] : int;
+const iterations = size[arg_i.offset+1..arg_i.offset+arg_i.length] : int;
 
 //Initialize a Timer object
 use Time;
@@ -46,9 +50,21 @@ do{
     //Copy back the non-border cells
     A[Interior] = T[Interior];
 
-    //When 'delta' is smaller than 'epsilon' the calculation has converged
-    iter_count += 1;
-} while (delta > epsilon && iter_count < iterations);
+    //if 'iterations' is non-zero we stop after a fixed number of iterations;
+    //otherwise we stop when the calculation has converged, i.e. 'delta' is smaller than 'epsilon'.
+    iter_count += 1;
+    var stop = false;
+    if(iterations > 0)
+    {
+        if iter_count >= iterations then
+            stop = true;
+    }
+    else
+    {
+        if delta < epsilon then
+            stop = true;
+    }
+} while (!stop);
 
 timer.stop();
 writeln("Heat Equation (sequential) - n: ",n,

diff --git a/heat_equation/src/single_machine.chpl b/heat_equation/src/single_machine.chpl
index e3f147c..54af20b 100644
--- a/heat_equation/src/single_machine.chpl
+++ b/heat_equation/src/single_machine.chpl
@@ -2,13 +2,17 @@
 //The first integer is the domain size squared and the second integer is
 //the number of iterations.
 config const size = "100*10";//Default, 100 by 100 domain and 10 iterations
-config const epsilon = 1.0e-10;//Stop condition in amount of change
+
+//Stop condition in amount of change (ignored when 'iterations' is non-zero).
+config const epsilon = 1.0e-10;
 
 //Parse the --size argument into 'n' and 'iterations'
 use Regexp;
 const arg = size.matches(compile("(\\d+)\\*(\\d+)"));
-const n = size.substring(arg[1][1]) : int;
-const iterations = size.substring(arg[2][1]) : int;
+const arg_n = arg[1][1];
+const arg_i = arg[2][1];
+const n = size[arg_n.offset+1..arg_n.offset+arg_n.length] : int;
+const iterations = size[arg_i.offset+1..arg_i.offset+arg_i.length] : int;
 
 //Initialize a Timer object
 use Time;
@@ -46,9 +50,21 @@ do{
     //Copy back the non-border cells
     A[Interior] = T[Interior];
 
-    //When 'delta' is smaller than 'epsilon' the calculation has converged
-    iter_count += 1;
-} while (delta > epsilon && iter_count < iterations);
+    //if 'iterations' is non-zero we stop after a fixed number of iterations;
+    //otherwise we stop when the calculation has converged, i.e. 'delta' is smaller than 'epsilon'.
+    iter_count += 1;
+    var stop = false;
+    if(iterations > 0)
+    {
+        if iter_count >= iterations then
+            stop = true;
+    }
+    else
+    {
+        if delta < epsilon then
+            stop = true;
+    }
+} while (!stop);
 
 timer.stop();
 writeln("Heat Equation (single machine) - n: ",n,
--
GitLab

From 3d22aaeee799c8429b3b674a22a2069154bd3b8d Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen"
Date: Fri, 15 Apr 2016 14:27:15 +0200
Subject: [PATCH 5/6] fixed link typo

---
 heat_equation/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/heat_equation/README.md b/heat_equation/README.md
index 62898aa..980bc3d 100644
--- a/heat_equation/README.md
+++ b/heat_equation/README.md
@@ -52,7 +52,7 @@ Now, let's run it (note that `CHPL_RT_NUM_THREADS_PER_LOCALE` tells Chapel the n
 Multiple Machines
 -----------------
 
-In order to improve the performance even further, we can tell Chapel to execute the stencil operation in parallel on multiple machines (`multiple_machines.chpl `).
+In order to improve the performance even further, we can tell Chapel to execute the stencil operation in parallel on multiple machines ([multiple_machines.chpl](src/multiple_machines.chpl)).
 We still use the `forall` loop construct, but we have to tell Chapel how to distribute `A` and `T` between the multiple machines. For that, we use the `dmapped` language construct when defining the `Grid` and `Interior` domains:
 
--
GitLab

From b633f9b274a79daefcebf0f22d5dc6e3a9f7409a Mon Sep 17 00:00:00 2001
From: Jussi Enkovaara
Date: Thu, 21 Jun 2018 10:54:48 +0300
Subject: [PATCH 6/6] Removed the duplicate README

---
 README.rst | 9 ---------
 1 file changed, 9 deletions(-)
 delete mode 100644 README.rst

diff --git a/README.rst b/README.rst
deleted file mode 100644
index b1f7fe1..0000000
--- a/README.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-Chapel
-======
-
-Compilation instructions
-------------------------
-In order to compile and run these examples, the only requirement is a working Chapel compiler and make. You can download Chapel at http://chapel.cray.com/.
-
-All examples can be built with a simple **make**.
-
--
GitLab
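The stop logic that [PATCH 4/6] adds to all three programs repeats the same if/else cascade. A hedged sketch of the same decision factored into one function (illustrative only; ``shouldStop`` is not part of the patches)::

    //Stop after a fixed number of sweeps when 'iterations' is non-zero,
    //otherwise stop on convergence
    proc shouldStop(iterCount: int, iterations: int,
                    delta: real, epsilon: real) : bool {
        if iterations > 0 then
            return iterCount >= iterations; //fixed iteration budget
        else
            return delta < epsilon;         //convergence test
    }

    writeln(shouldStop(10, 10, 1.0, 1.0e-10));   //true: the budget of 10 sweeps is spent
    writeln(shouldStop(3, 0, 1.0e-12, 1.0e-10)); //true: converged

With such a helper, the loop tail in each program would reduce to ``} while (!shouldStop(iter_count, iterations, delta, epsilon));``.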