! Heat equation solver in 2D.
program heat_solve
use heat
use core
use io
use setup
use utilities
implicit none
real(dp), parameter :: a = 0.5 ! Diffusion constant
type(field) :: current, previous ! Current and previous temperature fields
real(dp) :: dt ! Time step
integer :: nsteps ! Number of time steps
integer, parameter :: image_interval = 10 ! Image output interval
integer :: iter
real :: start, stop ! Timers
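! One parallel region spans field setup and the whole time-stepping loop;
! file I/O, the time-step calculation and the timers are executed by a
! single thread inside OMP SINGLE blocks.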
!$OMP PARALLEL PRIVATE(iter)
call initialize(current, previous, nsteps)
!$OMP SINGLE
! Draw the picture of the initial state
call write_field(current, 0)
! Largest stable time step
dt = current%dx**2 * current%dy**2 / &
& (2.0 * a * (current%dx**2 + current%dy**2))
! Main iteration loop, save a picture every
! image_interval steps
call cpu_time(start)
!$OMP END SINGLE
do iter = 1, nsteps
call evolve(current, previous, a, dt)
!$OMP SINGLE
if (mod(iter, image_interval) == 0) then
call write_field(current, iter)
end if
call swap_fields(current, previous)
!$OMP END SINGLE
end do
!$OMP SINGLE
call cpu_time(stop)
!$OMP END SINGLE
!$OMP END PARALLEL
write(*,'(A,F7.3,A)') 'Iteration took ', stop - start, ' seconds.'
write(*,'(A,G0)') 'Reference value at 5,5: ', previous % data(5,5)
call finalize(current, previous)
end program heat_solve
! PNG writer for heat equation solver
module pngwriter
use heat
contains
function save_png(data, nx, ny, fname) result(stat)
use, intrinsic :: ISO_C_BINDING
implicit none
real(dp), dimension(:,:), intent(in) :: data
integer, intent(in) :: nx, ny
character(len=*), intent(in) :: fname
integer :: stat
! Interface for save_png C-function
interface
! The C-function definition is
! int save_png(double *data, const int nx, const int ny,
! const char *fname)
function save_png_c(data, nx, ny, fname, order) &
& bind(C,name="save_png") result(stat)
use, intrinsic :: ISO_C_BINDING
implicit none
real(kind=C_DOUBLE) :: data(*)
integer(kind=C_INT), value, intent(IN) :: nx, ny
character(kind=C_CHAR), intent(IN) :: fname(*)
character(kind=C_CHAR), value, intent(IN) :: order
integer(kind=C_INT) :: stat
end function save_png_c
end interface
stat = save_png_c(data, nx, ny, trim(fname) // C_NULL_CHAR, 'f')
if (stat /= 0) then
write(*,*) 'save_png returned error!'
end if
end function save_png
end module pngwriter
! Setup routines for heat equation solver
module setup
contains
subroutine initialize(previous, current, nsteps)
use heat
use utilities
use io
implicit none
type(field), intent(out) :: previous, current
integer, intent(out) :: nsteps
integer :: rows, cols
logical :: using_input_file
character(len=85) :: input_file, arg ! Input file name and command line arguments
! Default values for grid size and time steps
rows = 200
cols = 200
nsteps = 500
using_input_file = .false.
! Read in the command line arguments and
! set up the needed variables
select case(command_argument_count())
case(0) ! No arguments -> default values
case(1) ! One argument -> input file name
using_input_file = .true.
call get_command_argument(1, input_file)
case(2) ! Two arguments -> input file name and number of steps
using_input_file = .true.
call get_command_argument(1, input_file)
call get_command_argument(2, arg)
read(arg, *) nsteps
case(3) ! Three arguments -> rows, cols and nsteps
call get_command_argument(1, arg)
read(arg, *) rows
call get_command_argument(2, arg)
read(arg, *) cols
call get_command_argument(3, arg)
read(arg, *) nsteps
case default
call usage()
stop
end select
! Initialize the fields according to the command line arguments
if (using_input_file) then
!$OMP SINGLE
call read_field(previous, input_file)
call copy_fields(previous, current)
!$OMP END SINGLE
else
!$OMP SINGLE
call set_field_dimensions(previous, rows, cols)
call set_field_dimensions(current, rows, cols)
!$OMP END SINGLE
call generate_field(previous)
call generate_field(current)
end if
end subroutine initialize
! Generate the initial temperature field. The pattern is a disc with a radius
! of nx / 6 in the center of the grid.
! Boundary conditions are (different) constant temperatures outside the grid
subroutine generate_field(field0)
use heat
implicit none
type(field), intent(inout) :: field0
real(dp) :: radius2
integer :: i, j, ds2
! The field arrays also contain a halo region
!$OMP SINGLE
allocate(field0%data(0:field0%nx+1, 0:field0%ny+1))
!$OMP END SINGLE
! Square of the disk radius
radius2 = (field0%nx / 6.0_dp)**2
!$OMP DO
do j = 0, field0%ny + 1
do i = 0, field0%nx + 1
ds2 = int((i - field0%nx / 2.0_dp + 1)**2 + &
& (j - field0%ny / 2.0_dp + 1)**2)
if (ds2 < radius2) then
field0%data(i,j) = 5.0_dp
else
field0%data(i,j) = 65.0_dp
end if
end do
end do
!$OMP END DO
! Boundary conditions
!$OMP DO
do j = 0, field0%nx + 1
field0%data(j, 0) = 20.0_dp
end do
!$OMP END DO NOWAIT
!$OMP DO
do j = 0, field0%nx + 1
field0%data(j, field0%ny + 1) = 70.0_dp
end do
!$OMP END DO
!$OMP DO
do j = 0, field0%ny + 1
field0%data(0, j) = 85.0_dp
end do
!$OMP END DO NOWAIT
!$OMP DO
do j = 0, field0%ny+1
field0%data(field0%nx + 1, j) = 5.0_dp
end do
!$OMP END DO
end subroutine generate_field
! Clean up routine for field type
! Arguments:
! field0 (type(field)): field variable to be cleared
subroutine finalize(field0, field1)
use heat
implicit none
type(field), intent(inout) :: field0, field1
deallocate(field0%data)
deallocate(field1%data)
end subroutine finalize
! Helper routine that prints out simple usage instructions if the
! user gives more than three arguments
subroutine usage()
implicit none
character(len=256) :: buf
call get_command_argument(0, buf)
write (*,'(A)') 'Usage:'
write (*,'(A, " (default values will be used)")') trim(buf)
write (*,'(A, " <filename>")') trim(buf)
write (*,'(A, " <filename> <nsteps>")') trim(buf)
write (*,'(A, " <rows> <cols> <nsteps>")') trim(buf)
end subroutine usage
end module setup
! Utility routines for heat equation solver
! NOTE: This file does not need to be edited!
module utilities
contains
! Swap the data fields of two variables of type field
! Arguments:
! curr, prev (type(field)): the two variables that are swapped
subroutine swap_fields(curr, prev)
use heat
implicit none
type(field), intent(inout) :: curr, prev
real(dp), allocatable, dimension(:,:) :: tmp
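! move_alloc transfers the allocations without copying the array data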
call move_alloc(curr%data, tmp)
call move_alloc(prev%data, curr%data)
call move_alloc(tmp, prev%data)
end subroutine swap_fields
! Copy the data from one field to another
! Arguments:
! from_field (type(field)): variable to copy from
! to_field (type(field)): variable to copy to
subroutine copy_fields(from_field, to_field)
use heat
implicit none
type(field), intent(in) :: from_field
type(field), intent(out) :: to_field
! Consistency checks
if (.not.allocated(from_field%data)) then
write (*,*) "Can not copy from a field without allocated data"
stop
end if
if (.not.allocated(to_field%data)) then
! Target is not initialized, allocate memory
allocate(to_field%data(lbound(from_field%data, 1):ubound(from_field%data, 1), &
& lbound(from_field%data, 2):ubound(from_field%data, 2)))
else if (any(shape(from_field%data) /= shape(to_field%data))) then
write (*,*) "Wrong field data sizes in copy routine"
print *, shape(from_field%data), shape(to_field%data)
stop
end if
to_field%data = from_field%data
to_field%nx = from_field%nx
to_field%ny = from_field%ny
to_field%dx = from_field%dx
to_field%dy = from_field%dy
end subroutine copy_fields
end module utilities
Copyright (C) 2018 CSC - IT Center for Science Ltd.
Licensed under the terms of the GNU General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
Code is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
Copy of the GNU General Public License can be obtained from
<http://www.gnu.org/licenses/>.
## Hello World ##
In this exercise you will test that you are able to compile and run an
OpenMP application. Take a look at the simple example program in
[hello.c](hello.c) ([hello.F90](hello.F90) for
Fortran) that has been parallelized using OpenMP. It first prints
a hello message (in serial), after which each thread prints
an "X" character (in parallel).
Compile the program with OpenMP enabled and run it with one,
two and four threads. Do you get the expected number of "X"s?
program hello
implicit none
print *, 'Hello world!'
!$omp parallel
print *, 'X'
!$omp end parallel
end program hello
#include <stdio.h>
int main(int argc, char *argv[])
{
printf("Hello world!\n");
#pragma omp parallel
{
printf("X\n");
}
return 0;
}
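To check how many threads actually run, a slightly extended C variant can also
print each thread's id. This is only a sketch, assuming an OpenMP-capable
compiler (e.g. `gcc -fopenmp hello.c` or `gfortran -fopenmp hello.F90`) and
that the thread count is controlled with `OMP_NUM_THREADS`:
#include <stdio.h>
#include <omp.h>
int main(void)
{
    printf("Hello world!\n");
    /* Each thread prints one line together with its thread id */
    #pragma omp parallel
    {
        printf("X from thread %d of %d\n",
               omp_get_thread_num(), omp_get_num_threads());
    }
    return 0;
}
Running for example `OMP_NUM_THREADS=4 ./a.out` should then produce four "X" lines.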
## Vector sum and race condition ##
In [vectorsum.c](vectorsum.c) (or [vectorsum.F90](vectorsum.F90) for
Fortran) you will find a serial code that sums up all the elements of
a vector A, initialized as A = 1, 2, ..., N. For reference, the
arithmetic sum formula tells us that the result should be S = N(N+1)/2.
Try to parallelize the code by using only `omp parallel` or `omp for`
pragmas. Do you get the same result with different numbers of
threads and in different runs? Explain why the program does not work
correctly in parallel.
Next, use the `reduction` clause to compute the sum correctly.
Finally, implement an alternative version where each thread computes its
own partial sum into a private variable and a `critical` section after
the loop combines the partial sums into the global sum. Which version
(`reduction` or `critical`) gives better performance?
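To answer the performance question, each variant can be timed, for example
with `omp_get_wtime()`. The following is only a sketch of such a measurement
in C; it reuses the vector initialization of the exercise code, but the timing
scaffolding itself is not part of the exercise files:
#include <stdio.h>
#include <omp.h>
#define NX 102400

int main(void)
{
    static long vecA[NX];
    long sum = 0;

    for (int i = 0; i < NX; i++) {
        vecA[i] = (long) i + 1;
    }

    /* Time the reduction version */
    double t0 = omp_get_wtime();
    #pragma omp parallel for reduction(+:sum)
    for (int i = 0; i < NX; i++) {
        sum += vecA[i];
    }
    double t1 = omp_get_wtime();
    printf("Reduction sum: %ld (%.6f s)\n", sum, t1 - t0);

    /* Time the private partial sum + critical version */
    sum = 0;
    t0 = omp_get_wtime();
    #pragma omp parallel
    {
        long psum = 0;
        #pragma omp for
        for (int i = 0; i < NX; i++) {
            psum += vecA[i];
        }
        #pragma omp critical
        sum += psum;
    }
    t1 = omp_get_wtime();
    printf("Critical sum:  %ld (%.6f s)\n", sum, t1 - t0);
    return 0;
}
With only 102400 elements the loops are very short, so the timings are
dominated by parallel-region overhead; the two versions typically perform
similarly, with `reduction` usually at least as fast.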
program vectorsum
use iso_fortran_env, only: int64
implicit none
integer, parameter :: ik = int64
integer(kind=ik), parameter :: nx = 102400_ik
integer(kind=ik), dimension(nx) :: vecA
integer(kind=ik) :: sum, psum, sumex
integer(kind=ik) :: i
! Initialization of vector
do i = 1, nx
vecA(i) = i
end do
sumex = nx*(nx+1_ik)/2_ik
write(*,*) 'Arithmetic sum formula (exact): ', sumex
sum = 0
! Sum with data race
!$omp parallel do default(shared) private(i)
do i = 1, nx
sum = sum + vecA(i)
end do
!$omp end parallel do
write(*,*) 'Sum with data race: ', sum
sum = 0
! Sum using critical section = SERIAL CODE
!$omp parallel do default(shared) private(i)
do i = 1, nx
!$omp critical
sum = sum + vecA(i)
!$omp end critical
end do
!$omp end parallel do
write(*,*) 'Sum using critical section: ', sum
sum = 0
! Sum using reduction
!$omp parallel do default(shared) private(i) reduction(+:sum)
do i = 1, nx
sum = sum + vecA(i)
end do
!$omp end parallel do
write(*,*) 'Sum using reduction: ', sum
sum = 0
! Sum using private variable and critical section
!$omp parallel default(shared) private(i, psum)
psum = 0
!$omp do
do i = 1, nx
psum = psum + vecA(i)
end do
!$omp end do
!$omp critical
sum = sum + psum
!$omp end critical
!$omp end parallel
write(*,*) 'Sum using private variable and critical section: ', sum
end program vectorsum
#include <stdio.h>
#define NX 102400
int main(void)
{
long vecA[NX];
long sum, psum, sumex;
int i;
/* Initialization of the vectors */
for (i = 0; i < NX; i++) {
vecA[i] = (long) i + 1;
}
sumex = (long) NX * (NX + 1) / ((long) 2);
printf("Arithmetic sum formula (exact): %ld\n",
sumex);
sum = 0.0;
/* Version with data race */
#pragma omp parallel for default(shared) private(i)
for (i = 0; i < NX; i++) {
sum += vecA[i];
}
printf("Sum with data race: %ld\n",
sum);
sum = 0.0;
/* Sum using critical section = SERIAL CODE! */
#pragma omp parallel for default(shared) private(i)
for (i = 0; i < NX; i++) {
#pragma omp critical(dummy)
sum += vecA[i];
}
printf("Sum using critical section: %ld\n",
sum);
sum = 0.0;
/* Sum using private partial sums and critical section */
#pragma omp parallel default(shared) private(i, psum)
{
psum = 0.0;
#pragma omp for
for (i = 0; i < NX; i++) {
psum += vecA[i];
}
#pragma omp critical(par_sum)
sum += psum;
}
printf("Sum using private variable and critical section: %ld\n",
sum);
sum = 0.0;
/* Sum using reduction */
#pragma omp parallel for default(shared) private(i) reduction(+:sum)
for (i = 0; i < NX; i++) {
sum += vecA[i];
}
printf("Reduction sum: %ld\n",
sum);
return 0;
}
program vectorsum
use iso_fortran_env, only: int64
implicit none
integer, parameter :: ik = int64
integer(kind=ik), parameter :: nx = 102400_ik
integer(kind=ik), dimension(nx) :: vecA
integer(kind=ik) :: sum, psum, sumex
integer(kind=ik) :: i
! Initialization of vector
do i = 1, nx
vecA(i) = i
end do
sumex = nx*(nx+1_ik)/2_ik
write(*,*) 'Arithmetic sum formula (exact): ', sumex
sum = 0
! TODO: Parallelize the computation
do i = 1, nx
sum = sum + vecA(i)
end do
write(*,*) 'Sum: ', sum
end program vectorsum
#include <stdio.h>
#define NX 102400
int main(void)
{
long vecA[NX];
long sum, psum, sumex;
int i;
/* Initialization of the vectors */
for (i = 0; i < NX; i++) {
vecA[i] = (long) i + 1;
}
sumex = (long) NX * (NX + 1) / ((long) 2);
printf("Arithmetic sum formula (exact): %ld\n",
sumex);
sum = 0.0;
/* TODO: Parallelize computation */
for (i = 0; i < NX; i++) {
sum += vecA[i];
}
printf("Sum: %ld\n", sum);
return 0;
}
## Bonus: Using OpenMP tasks for dynamic parallelization ##
The Mandelbrot set of complex numbers can be rendered as a two-dimensional
fractal image. Computing the pixel values is relatively time
consuming, and the computational cost varies from pixel to pixel. Thus,
simply dividing the two-dimensional grid evenly among threads leads to
load imbalance and suboptimal performance.
The files [tasks/c/mandelbrot.c](c/mandelbrot.c) and
[tasks/fortran/mandelbrot.F90](fortran/mandelbrot.F90) contain a recursive
implementation for calculating the Mandelbrot fractal, which can be
parallelized dynamically with OpenMP tasks. Insert the missing pragmas to
parallelize the code (look for the TODOs in the source code), and
investigate the scalability with a varying number of threads.
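The usual pattern for this kind of recursion is to open the parallel region
once, let a single thread start the recursion, and turn each recursive call
into a task. The sketch below illustrates the pattern on a generic recursive
routine; the names `work_block`, `run`-style scaffolding and the halving split
are illustrative placeholders, not the exercise code itself:
#include <stdlib.h>
#include <omp.h>

/* Hypothetical recursive routine parallelized with OpenMP tasks */
void work_block(int *data, int start, int len, int depth)
{
    if (depth < 4 && len > 32) {
        int half = len / 2;
        /* Each recursive call becomes a task that any thread may execute */
        #pragma omp task shared(data)
        work_block(data, start, half, depth + 1);
        #pragma omp task shared(data)
        work_block(data, start + half, len - half, depth + 1);
        /* Wait for both subtasks before returning */
        #pragma omp taskwait
    } else {
        /* Leaf block: do the actual per-element work */
        for (int i = start; i < start + len; i++) {
            data[i] = i;
        }
    }
}

int main(void)
{
    int n = 1 << 16;
    int *data = malloc(n * sizeof(int));
    /* All threads are available, but only one starts the recursion;
     * the tasks it creates are then executed by the whole team. */
    #pragma omp parallel
    #pragma omp single
    work_block(data, 0, n, 0);
    free(data);
    return 0;
}
In the exercise the same idea applies to `mandelbrot_block()`: the recursive
calls over the subblocks become tasks, and the top-level call is made from
inside an `omp single` region so that only one thread creates the initial
tasks.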
ifeq ($(COMP),)
COMP=cray
endif
COMMONDIR=../../../heat/common
ifeq ($(COMP),cray)
FC=ftn
CC=cc
FCFLAGS=-O3
CCFLAGS=-O3 -I/appl/opt/libpng/include -I$(COMMONDIR)
LDFLAGS=-L/appl/opt/libpng/lib
LIBS=-lpng -lz
endif
ifeq ($(COMP),gnu)
FC=gfortran
CC=gcc
FCFLAGS=-fopenmp -O3 -Wall
CCFLAGS=-fopenmp -O3 -Wall -I$(COMMONDIR)
LDFLAGS=
LIBS=-lpng
endif
EXE=mandelbrot
OBJS=mandelbrot.o
OBJS_PNG=pngwriter.o
CORRECT_OBJS_PNG=$(COMMONDIR)/pngwriter.o
all: $(EXE)
$(COMMONDIR)/pngwriter.o: $(COMMONDIR)/pngwriter.c $(COMMONDIR)/pngwriter.h
mandelbrot.o: mandelbrot.c
pngwriter.o: pngwriter.c pngwriter.h
	$(CC) $(CCFLAGS) -c -o $@ $<
$(EXE): $(OBJS) $(OBJS_PNG)
	$(CC) $(CCFLAGS) $(OBJS) $(OBJS_PNG) -o $@ $(LDFLAGS) $(LIBS)
%.o: %.F90
	$(FC) $(FCFLAGS) -c $< -o $@
%.o: %.c
	$(CC) $(CCFLAGS) -c $< -o $@
.PHONY: clean
clean:
	-/bin/rm -f $(EXE) a.out *.o *.mod *.png *~ mandelbrot.png
/*
* This example is based on the code of Andrew V. Adinetz
* https://github.com/canonizer/mandelbrot-dyn
* Licensed under The MIT License
*/
#include <omp.h>
#include <complex.h>
#include <stdio.h>
#include <stdlib.h>
#include "pngwriter.h"
// Maximum number of iterations
const int MAX_ITER_COUNT = 512;
// Marker for different iteration counts
const int DIFF_ITER_COUNT = -1;
// Maximum recursion depth
const int MAX_DEPTH = 6;
// Region size below which the computation is done per pixel
const int MIN_SIZE = 32;
// Subdivision factor along each axis
const int SUBDIV = 4;
// |z|^2 of a complex number z
float abs2(complex v)
{
return creal(v) * creal(v) + cimag(v) * cimag(v);
}
// The kernel to count per-pixel values of the portion of the Mandelbrot set
// Does not need to be edited
int kernel(int w, int h, complex cmin, complex cmax,
int x, int y)
{
complex dc = cmax - cmin;
float fx = (float)x / w;
float fy = (float)y / h;
complex c = cmin + fx * creal(dc) + fy * cimag(dc) * I;
int iteration = 0;
complex z = c;
while (iteration < MAX_ITER_COUNT && abs2(z) < 2 * 2) {
z = z * z + c;
iteration++;
}
return iteration;
}
/* Computes the Mandelbrot image recursively.
* At each call, the image is divided into smaller blocks (by a factor of
* SUBDIV along each axis), and the function is called recursively for each
* subblock. When the maximum recursion depth is reached or the block size
* falls to or below a predefined minimum, the actual pixel values are computed.
*
* - - - - - - - - ----- -----
* | | | | | |
* | | ----- -----
* | | --> --> ...
* | | ----- -----
* | | | | | |
* | | ----- -----
* ---------------
*/
void mandelbrot_block(int *iter_counts, int w, int h, complex cmin,
complex cmax, int x0, int y0, int d, int depth)
{
// TODO Parallelize the recursive function call
// with OpenMP tasks
int block_size = d / SUBDIV;
if (depth + 1 < MAX_DEPTH && block_size > MIN_SIZE) {
// Subdivide recursively
for (int i = 0; i < SUBDIV; i++) {
for (int j = 0; j < SUBDIV; j++) {
mandelbrot_block(iter_counts, w, h, cmin, cmax,
x0 + i * block_size, y0 + j * block_size,
d / SUBDIV, depth + 1);
}
}
} else {
// Last recursion level reached, calculate the values
for (int i = x0; i < x0 + d; i++) {
for (int j = y0; j < y0 + d; j++) {
iter_counts[j * w + i] = kernel(w, h, cmin, cmax, i, j);
}
}
}
}
int main(int argc, char **argv)
{
// Picture size, should be power of two
const int w = 512;
const int h = w;
int *iter_counts;
complex cmin, cmax;
int pic_bytes = w * h * sizeof(int);
iter_counts = (int *)malloc(pic_bytes);
cmin = -1.5 + -1.0 * I;
cmax = 0.5 + 1.0 * I;
double t1 = omp_get_wtime();
// TODO create parallel region. How many threads should be calling
// mandelbrot_block in this uppermost level?
{
mandelbrot_block(iter_counts, w, h, cmin, cmax,
0, 0, w, 1);
}
double t2 = omp_get_wtime();
// Save the image to a PNG file
save_png(iter_counts, w, h, "mandelbrot.png");
double walltime = t2 - t1;
// Print the timings
printf("Mandelbrot set computed in %.3lf s, at %.3lf Mpix/s\n",
walltime, h * w * 1e-6 / walltime);
free(iter_counts);
return 0;
}