! Heat equation solver in 2D.
program heat_solve
use heat
use core
use io
use setup
use utilities
implicit none
real(dp), parameter :: a = 0.5 ! Diffusion constant
type(field) :: current, previous ! Current and previous temperature fields
real(dp) :: dt ! Time step
integer :: nsteps ! Number of time steps
integer, parameter :: image_interval = 10 ! Image output interval
integer :: iter
real :: start, stop ! Timers
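! One parallel region spans field setup and the whole time-stepping loop;
! file I/O, the time-step calculation and the timers are executed by a
! single thread inside OMP SINGLE blocks.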
!$OMP PARALLEL PRIVATE(iter)
call initialize(current, previous, nsteps)
!$OMP SINGLE
! Draw the picture of the initial state
call write_field(current, 0)
! Largest stable time step
dt = current%dx**2 * current%dy**2 / &
& (2.0 * a * (current%dx**2 + current%dy**2))
! Main iteration loop, save a picture every
! image_interval steps
call cpu_time(start)
!$OMP END SINGLE
do iter = 1, nsteps
call evolve(current, previous, a, dt)
!$OMP SINGLE
if (mod(iter, image_interval) == 0) then
call write_field(current, iter)
end if
call swap_fields(current, previous)
!$OMP END SINGLE
end do
!$OMP SINGLE
call cpu_time(stop)
!$OMP END SINGLE
!$OMP END PARALLEL
write(*,'(A,F7.3,A)') 'Iteration took ', stop - start, ' seconds.'
write(*,'(A,G0)') 'Reference value at 5,5: ', previous % data(5,5)
call finalize(current, previous)
end program heat_solve
! PNG writer for heat equation solver
module pngwriter
use heat
contains
function save_png(data, nx, ny, fname) result(stat)
use, intrinsic :: ISO_C_BINDING
implicit none
real(dp), dimension(:,:), intent(in) :: data
integer, intent(in) :: nx, ny
character(len=*), intent(in) :: fname
integer :: stat
! Interface for save_png C-function
interface
! The C-function definition is
! int save_png(double *data, const int nx, const int ny,
! const char *fname)
function save_png_c(data, nx, ny, fname, order) &
& bind(C,name="save_png") result(stat)
use, intrinsic :: ISO_C_BINDING
implicit none
real(kind=C_DOUBLE) :: data(*)
integer(kind=C_INT), value, intent(IN) :: nx, ny
character(kind=C_CHAR), intent(IN) :: fname(*)
character(kind=C_CHAR), value, intent(IN) :: order
integer(kind=C_INT) :: stat
end function save_png_c
end interface
stat = save_png_c(data, nx, ny, trim(fname) // C_NULL_CHAR, 'f')
if (stat /= 0) then
write(*,*) 'save_png returned error!'
end if
end function save_png
end module pngwriter
! Setup routines for heat equation solver
module setup
contains
subroutine initialize(previous, current, nsteps)
use heat
use utilities
use io
implicit none
type(field), intent(out) :: previous, current
integer, intent(out) :: nsteps
integer :: rows, cols
logical :: using_input_file
character(len=85) :: input_file, arg ! Input file name and command line arguments
! Default values for grid size and time steps
rows = 200
cols = 200
nsteps = 500
using_input_file = .false.
! Read in the command line arguments and
! set up the needed variables
select case(command_argument_count())
case(0) ! No arguments -> default values
case(1) ! One argument -> input file name
using_input_file = .true.
call get_command_argument(1, input_file)
case(2) ! Two arguments -> input file name and number of steps
using_input_file = .true.
call get_command_argument(1, input_file)
call get_command_argument(2, arg)
read(arg, *) nsteps
case(3) ! Three arguments -> rows, cols and nsteps
call get_command_argument(1, arg)
read(arg, *) rows
call get_command_argument(2, arg)
read(arg, *) cols
call get_command_argument(3, arg)
read(arg, *) nsteps
case default
call usage()
stop
end select
! Initialize the fields according to the command line arguments
if (using_input_file) then
!$OMP SINGLE
call read_field(previous, input_file)
call copy_fields(previous, current)
!$OMP END SINGLE
else
!$OMP SINGLE
call set_field_dimensions(previous, rows, cols)
call set_field_dimensions(current, rows, cols)
!$OMP END SINGLE
call generate_field(previous)
call generate_field(current)
end if
end subroutine initialize
! Generate the initial temperature field. The pattern is a disc with a radius
! of nx / 6 in the center of the grid.
! Boundary conditions are (different) constant temperatures outside the grid
subroutine generate_field(field0)
use heat
implicit none
type(field), intent(inout) :: field0
real(dp) :: radius2
integer :: i, j, ds2
! The field arrays also contain a halo region
!$OMP SINGLE
allocate(field0%data(0:field0%nx+1, 0:field0%ny+1))
!$OMP END SINGLE
! Square of the disk radius
radius2 = (field0%nx / 6.0_dp)**2
!$OMP DO
do j = 0, field0%ny + 1
do i = 0, field0%nx + 1
ds2 = int((i - field0%nx / 2.0_dp + 1)**2 + &
& (j - field0%ny / 2.0_dp + 1)**2)
if (ds2 < radius2) then
field0%data(i,j) = 5.0_dp
else
field0%data(i,j) = 65.0_dp
end if
end do
end do
!$OMP END DO
! Boundary conditions
!$OMP DO
do j = 0, field0%nx + 1
field0%data(j, 0) = 20.0_dp
end do
!$OMP END DO NOWAIT
!$OMP DO
do j = 0, field0%nx + 1
field0%data(j, field0%ny + 1) = 70.0_dp
end do
!$OMP END DO
!$OMP DO
do j = 0, field0%ny + 1
field0%data(0, j) = 85.0_dp
end do
!$OMP END DO NOWAIT
!$OMP DO
do j = 0, field0%ny+1
field0%data(field0%nx + 1, j) = 5.0_dp
end do
!$OMP END DO
end subroutine generate_field
! Clean up routine for field type
! Arguments:
! field0 (type(field)): field variable to be cleared
subroutine finalize(field0, field1)
use heat
implicit none
type(field), intent(inout) :: field0, field1
deallocate(field0%data)
deallocate(field1%data)
end subroutine finalize
! Helper routine that prints out simple usage instructions if the
! user gives more than three arguments
subroutine usage()
implicit none
character(len=256) :: buf
call get_command_argument(0, buf)
write (*,'(A)') 'Usage:'
write (*,'(A, " (default values will be used)")') trim(buf)
write (*,'(A, " <filename>")') trim(buf)
write (*,'(A, " <filename> <nsteps>")') trim(buf)
write (*,'(A, " <rows> <cols> <nsteps>")') trim(buf)
end subroutine usage
end module setup
! Utility routines for heat equation solver
! NOTE: This file does not need to be edited!
module utilities
contains
! Swap the data fields of two variables of type field
! Arguments:
! curr, prev (type(field)): the two variables that are swapped
subroutine swap_fields(curr, prev)
use heat
implicit none
type(field), intent(inout) :: curr, prev
real(dp), allocatable, dimension(:,:) :: tmp
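! move_alloc transfers the allocations without copying the array data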
call move_alloc(curr%data, tmp)
call move_alloc(prev%data, curr%data)
call move_alloc(tmp, prev%data)
end subroutine swap_fields
! Copy the data from one field to another
! Arguments:
! from_field (type(field)): variable to copy from
! to_field (type(field)): variable to copy to
subroutine copy_fields(from_field, to_field)
use heat
implicit none
type(field), intent(in) :: from_field
type(field), intent(out) :: to_field
! Consistency checks
if (.not.allocated(from_field%data)) then
write (*,*) "Can not copy from a field without allocated data"
stop
end if
if (.not.allocated(to_field%data)) then
! Target is not initialized, allocate memory
allocate(to_field%data(lbound(from_field%data, 1):ubound(from_field%data, 1), &
& lbound(from_field%data, 2):ubound(from_field%data, 2)))
else if (any(shape(from_field%data) /= shape(to_field%data))) then
write (*,*) "Wrong field data sizes in copy routine"
print *, shape(from_field%data), shape(to_field%data)
stop
end if
to_field%data = from_field%data
to_field%nx = from_field%nx
to_field%ny = from_field%ny
to_field%dx = from_field%dx
to_field%dy = from_field%dy
end subroutine copy_fields
end module utilities
Copyright (C) 2018 CSC - IT Center for Science Ltd.
Licensed under the terms of the GNU General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
Code is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
Copy of the GNU General Public License can be obtained from
<http://www.gnu.org/licenses/>.
## Hello World ##
In this exercise you will test that you are able to compile and run an
OpenMP application. Take a look at the simple example program in
[hello.c](hello.c) ([hello.F90](hello.F90) for
Fortran) that has been parallelized using OpenMP. It first prints
a hello message (in serial), after which each thread prints
an "X" character (in parallel).
Compile the program with OpenMP enabled and run it with one,
two and four threads. Do you get the expected number of "X"s?
program hello
implicit none
print *, 'Hello world!'
!$omp parallel
print *, 'X'
!$omp end parallel
end program hello
#include <stdio.h>
int main(int argc, char *argv[])
{
printf("Hello world!\n");
#pragma omp parallel
{
printf("X\n");
}
return 0;
}
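To check how many threads actually run, a slightly extended C variant can also
print each thread's id. This is only a sketch, assuming an OpenMP-capable
compiler (e.g. `gcc -fopenmp hello.c` or `gfortran -fopenmp hello.F90`) and
that the thread count is controlled with `OMP_NUM_THREADS`:
#include <stdio.h>
#include <omp.h>
int main(void)
{
    printf("Hello world!\n");
    /* Each thread prints one line together with its thread id */
    #pragma omp parallel
    {
        printf("X from thread %d of %d\n",
               omp_get_thread_num(), omp_get_num_threads());
    }
    return 0;
}
Running for example `OMP_NUM_THREADS=4 ./a.out` should then produce four "X" lines.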
## Vector sum and race condition ##
In [vectorsum.c](vectorsum.c) (or [vectorsum.F90](vectorsum.F90) for
Fortran) you will find a serial code that sums up all the elements of
a vector A, initialized as A = 1, 2, ..., N. For reference, the
arithmetic sum formula tells us that the result should be S = N(N+1)/2.
Try to parallelize the code by using only `omp parallel` or `omp for`
pragmas. Do you get the same result with different numbers of
threads and in different runs? Explain why the program does not work
correctly in parallel.
Next, use the `reduction` clause to compute the sum correctly.
Finally, implement an alternative version where each thread computes its
own partial sum into a private variable and a `critical` section after
the loop combines the partial sums into the global sum. Which version
(`reduction` or `critical`) gives better performance?
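To answer the performance question, each variant can be timed, for example
with `omp_get_wtime()`. The following is only a sketch of such a measurement
in C; it reuses the vector initialization of the exercise code, but the timing
scaffolding itself is not part of the exercise files:
#include <stdio.h>
#include <omp.h>
#define NX 102400

int main(void)
{
    static long vecA[NX];
    long sum = 0;

    for (int i = 0; i < NX; i++) {
        vecA[i] = (long) i + 1;
    }

    /* Time the reduction version */
    double t0 = omp_get_wtime();
    #pragma omp parallel for reduction(+:sum)
    for (int i = 0; i < NX; i++) {
        sum += vecA[i];
    }
    double t1 = omp_get_wtime();
    printf("Reduction sum: %ld (%.6f s)\n", sum, t1 - t0);

    /* Time the private partial sum + critical version */
    sum = 0;
    t0 = omp_get_wtime();
    #pragma omp parallel
    {
        long psum = 0;
        #pragma omp for
        for (int i = 0; i < NX; i++) {
            psum += vecA[i];
        }
        #pragma omp critical
        sum += psum;
    }
    t1 = omp_get_wtime();
    printf("Critical sum:  %ld (%.6f s)\n", sum, t1 - t0);
    return 0;
}
With only 102400 elements the loops are very short, so the timings are
dominated by parallel-region overhead; the two versions typically perform
similarly, with `reduction` usually at least as fast.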
program vectorsum
use iso_fortran_env, only: int64
implicit none
integer, parameter :: ik = int64
integer(kind=ik), parameter :: nx = 102400_ik
integer(kind=ik), dimension(nx) :: vecA
integer(kind=ik) :: sum, psum, sumex
integer(kind=ik) :: i
! Initialization of vector
do i = 1, nx
vecA(i) = i
end do
sumex = nx*(nx+1_ik)/2_ik
write(*,*) 'Arithmetic sum formula (exact): ', sumex
sum = 0
! Sum with data race
!$omp parallel do default(shared) private(i)
do i = 1, nx
sum = sum + vecA(i)
end do
!$omp end parallel do
write(*,*) 'Sum with data race: ', sum
sum = 0
! Sum using critical section = SERIAL CODE
!$omp parallel do default(shared) private(i)
do i = 1, nx
!$omp critical
sum = sum + vecA(i)
!$omp end critical
end do
!$omp end parallel do
write(*,*) 'Sum using critical section: ', sum
sum = 0
! Sum using reduction
!$omp parallel do default(shared) private(i) reduction(+:sum)
do i = 1, nx
sum = sum + vecA(i)
end do
!$omp end parallel do
write(*,*) 'Sum using reduction: ', sum
sum = 0
! Sum using private variable and critical section
!$omp parallel default(shared) private(i, psum)
psum = 0
!$omp do
do i = 1, nx
psum = psum + vecA(i)
end do
!$omp end do
!$omp critical
sum = sum + psum
!$omp end critical
!$omp end parallel
write(*,*) 'Sum using private variable and critical section: ', sum
end program vectorsum
#include <stdio.h>
#define NX 102400
int main(void)
{
long vecA[NX];
long sum, psum, sumex;
int i;
/* Initialization of the vectors */
for (i = 0; i < NX; i++) {
vecA[i] = (long) i + 1;
}
sumex = (long) NX * (NX + 1) / ((long) 2);
printf("Arithmetic sum formula (exact): %ld\n",
sumex);
sum = 0.0;
/* Version with data race */
#pragma omp parallel for default(shared) private(i)
for (i = 0; i < NX; i++) {
sum += vecA[i];
}
printf("Sum with data race: %ld\n",
sum);
sum = 0.0;
/* Sum using critical section = SERIAL CODE! */
#pragma omp parallel for default(shared) private(i)
for (i = 0; i < NX; i++) {
#pragma omp critical(dummy)
sum += vecA[i];
}
printf("Sum using critical section: %ld\n",
sum);
sum = 0.0;
/* Sum using private partial sums and critical section */
#pragma omp parallel default(shared) private(i, psum)
{
psum = 0.0;
#pragma omp for
for (i = 0; i < NX; i++) {
psum += vecA[i];
}
#pragma omp critical(par_sum)
sum += psum;
}
printf("Sum using private variable and critical section: %ld\n",
sum);
sum = 0.0;
/* Sum using reduction */
#pragma omp parallel for default(shared) private(i) reduction(+:sum)
for (i = 0; i < NX; i++) {
sum += vecA[i];
}
printf("Reduction sum: %ld\n",
sum);
return 0;
}
program vectorsum
use iso_fortran_env, only: int64
implicit none
integer, parameter :: ik = int64
integer(kind=ik), parameter :: nx = 102400_ik
integer(kind=ik), dimension(nx) :: vecA
integer(kind=ik) :: sum, psum, sumex
integer(kind=ik) :: i
! Initialization of vector
do i = 1, nx
vecA(i) = i
end do
sumex = nx*(nx+1_ik)/2_ik
write(*,*) 'Arithmetic sum formula (exact): ', sumex
sum = 0
! TODO: Parallelize the computation
do i = 1, nx
sum = sum + vecA(i)
end do
write(*,*) 'Sum: ', sum
end program vectorsum
#include <stdio.h>
#define NX 102400
int main(void)
{
long vecA[NX];
long sum, psum, sumex;
int i;
/* Initialization of the vectors */
for (i = 0; i < NX; i++) {
vecA[i] = (long) i + 1;
}
sumex = (long) NX * (NX + 1) / ((long) 2);
printf("Arithmetic sum formula (exact): %ld\n",
sumex);
sum = 0.0;
/* TODO: Parallelize computation */
for (i = 0; i < NX; i++) {
sum += vecA[i];
}
printf("Sum: %ld\n", sum);
return 0;
}
## Bonus: Using OpenMP tasks for dynamic parallelization ##
The Mandelbrot set of complex numbers can be rendered as a two-dimensional
fractal image. Computing the pixel values is relatively time
consuming, and the computational cost varies from pixel to pixel. Thus,
simply dividing the two-dimensional grid evenly among threads leads to
load imbalance and suboptimal performance.
The files [tasks/c/mandelbrot.c](c/mandelbrot.c) and
[tasks/fortran/mandelbrot.F90](fortran/mandelbrot.F90) contain a recursive
implementation for calculating the Mandelbrot fractal, which can be
parallelized dynamically with OpenMP tasks. Insert the missing pragmas to
parallelize the code (look for the TODOs in the source code), and
investigate the scalability with a varying number of threads.
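The usual pattern for this kind of recursion is to open the parallel region
once, let a single thread start the recursion, and turn each recursive call
into a task. The sketch below illustrates the pattern on a generic recursive
routine; the names `work_block`, `run`-style scaffolding and the halving split
are illustrative placeholders, not the exercise code itself:
#include <stdlib.h>
#include <omp.h>

/* Hypothetical recursive routine parallelized with OpenMP tasks */
void work_block(int *data, int start, int len, int depth)
{
    if (depth < 4 && len > 32) {
        int half = len / 2;
        /* Each recursive call becomes a task that any thread may execute */
        #pragma omp task shared(data)
        work_block(data, start, half, depth + 1);
        #pragma omp task shared(data)
        work_block(data, start + half, len - half, depth + 1);
        /* Wait for both subtasks before returning */
        #pragma omp taskwait
    } else {
        /* Leaf block: do the actual per-element work */
        for (int i = start; i < start + len; i++) {
            data[i] = i;
        }
    }
}

int main(void)
{
    int n = 1 << 16;
    int *data = malloc(n * sizeof(int));
    /* All threads are available, but only one starts the recursion;
     * the tasks it creates are then executed by the whole team. */
    #pragma omp parallel
    #pragma omp single
    work_block(data, 0, n, 0);
    free(data);
    return 0;
}
In the exercise the same idea applies to `mandelbrot_block()`: the recursive
calls over the subblocks become tasks, and the top-level call is made from
inside an `omp single` region so that only one thread creates the initial
tasks.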
ifeq ($(COMP),)
COMP=cray
endif
COMMONDIR=../../../heat/common
ifeq ($(COMP),cray)
FC=ftn
CC=cc
FCFLAGS=-O3
CCFLAGS=-O3 -I/appl/opt/libpng/include -I$(COMMONDIR)
LDFLAGS=-L/appl/opt/libpng/lib
LIBS=-lpng -lz
endif
ifeq ($(COMP),gnu)
FC=gfortran
CC=gcc
FCFLAGS=-fopenmp -O3 -Wall
CCFLAGS=-fopenmp -O3 -Wall -I$(COMMONDIR)
LDFLAGS=
LIBS=-lpng
endif
EXE=mandelbrot
OBJS=mandelbrot.o
OBJS_PNG=pngwriter.o
CORRECT_OBJS_PNG=$(COMMONDIR)/pngwriter.o
all: $(EXE)
$(COMMONDIR)/pngwriter.o: $(COMMONDIR)/pngwriter.c $(COMMONDIR)/pngwriter.h
mandelbrot.o: mandelbrot.c
pngwriter.o: pngwriter.c pngwriter.h
	$(CC) $(CCFLAGS) -c -o $@ $<
$(EXE): $(OBJS) $(OBJS_PNG)
	$(CC) $(CCFLAGS) $(OBJS) $(OBJS_PNG) -o $@ $(LDFLAGS) $(LIBS)
%.o: %.F90
	$(FC) $(FCFLAGS) -c $< -o $@
%.o: %.c
	$(CC) $(CCFLAGS) -c $< -o $@
.PHONY: clean
clean:
	-/bin/rm -f $(EXE) a.out *.o *.mod *.png *~ mandelbrot.png
/*
* This example is based on the code of Andrew V. Adinetz
* https://github.com/canonizer/mandelbrot-dyn
* Licensed under The MIT License
*/
#include <omp.h>
#include <complex.h>
#include <stdio.h>
#include <stdlib.h>
#include "pngwriter.h"
// Maximum number of iterations
const int MAX_ITER_COUNT = 512;
// Marker for different iteration counts
const int DIFF_ITER_COUNT = -1;
// Maximum recursion depth
const int MAX_DEPTH = 6;
// Region size below which the computation is done per pixel
const int MIN_SIZE = 32;
// Subdivision factor along each axis
const int SUBDIV = 4;
// |z|^2 of a complex number z
float abs2(complex v)
{
return creal(v) * creal(v) + cimag(v) * cimag(v);
}
// The kernel to count per-pixel values of the portion of the Mandelbrot set
// Does not need to be edited
int kernel(int w, int h, complex cmin, complex cmax,
int x, int y)
{
complex dc = cmax - cmin;
float fx = (float)x / w;
float fy = (float)y / h;
complex c = cmin + fx * creal(dc) + fy * cimag(dc) * I;
int iteration = 0;
complex z = c;
while (iteration < MAX_ITER_COUNT && abs2(z) < 2 * 2) {
z = z * z + c;
iteration++;
}
return iteration;
}
/* Computes the Mandelbrot image recursively.
* At each call, the image is divided into smaller blocks (by a factor of
* SUBDIV along each axis), and the function is called recursively for each
* subblock. When the maximum recursion depth is reached or the block size
* falls to or below a predefined minimum, the actual pixel values are computed.
*
* - - - - - - - - ----- -----
* | | | | | |
* | | ----- -----
* | | --> --> ...
* | | ----- -----
* | | | | | |
* | | ----- -----
* ---------------
*/
void mandelbrot_block(int *iter_counts, int w, int h, complex cmin,
complex cmax, int x0, int y0, int d, int depth)
{
// TODO Parallelize the recursive function call
// with OpenMP tasks
int block_size = d / SUBDIV;
if (depth + 1 < MAX_DEPTH && block_size > MIN_SIZE) {
// Subdivide recursively
for (int i = 0; i < SUBDIV; i++) {
for (int j = 0; j < SUBDIV; j++) {
mandelbrot_block(iter_counts, w, h, cmin, cmax,
x0 + i * block_size, y0 + j * block_size,
d / SUBDIV, depth + 1);
}
}
} else {
// Last recursion level reached, calculate the values
for (int i = x0; i < x0 + d; i++) {
for (int j = y0; j < y0 + d; j++) {
iter_counts[j * w + i] = kernel(w, h, cmin, cmax, i, j);
}
}
}
}
int main(int argc, char **argv)
{
// Picture size, should be power of two
const int w = 512;
const int h = w;
int *iter_counts;
complex cmin, cmax;
int pic_bytes = w * h * sizeof(int);
iter_counts = (int *)malloc(pic_bytes);
cmin = -1.5 + -1.0 * I;
cmax = 0.5 + 1.0 * I;
double t1 = omp_get_wtime();
// TODO create parallel region. How many threads should be calling
// mandelbrot_block in this uppermost level?
{
mandelbrot_block(iter_counts, w, h, cmin, cmax,
0, 0, w, 1);
}
double t2 = omp_get_wtime();
// Save the image to a PNG file
save_png(iter_counts, w, h, "mandelbrot.png");
double walltime = t2 - t1;
// Print the timings
printf("Mandelbrot set computed in %.3lf s, at %.3lf Mpix/s\n",
walltime, h * w * 1e-6 / walltime);
free(iter_counts);
return 0;
}