UW AMath High Performance Scientific Computing
 
Coursera Edition

Jacobi iteration using OpenMP with coarse-grain parallel block

The code below implements Jacobi iteration for solving the linear system arising from the steady state heat equation, using a single coarse-grain parallel block: the work is split up manually among the threads, and each thread updates its assigned block of grid points for all iterations.
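
For reference, each sweep applies the Jacobi update for the centered-difference discretization of u''(x) = -f(x) with u(0) = alpha and u(1) = beta (a brief recap of the scheme; this is exactly the formula in the inner loop of the code):

\[
u_i^{(k+1)} = \frac{1}{2}\left( u_{i-1}^{(k)} + u_{i+1}^{(k)} + \Delta x^2\, f_i \right)
\]

The iteration stops when the largest change dumax over all grid points falls below the tolerance.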

Compare to:

- Jacobi iteration using OpenMP with parallel do constructs
- Jacobi iteration using MPI

The code:

! $UWHPSC/codes/openmp/jacobi1d_omp2.f90
!
! Domain decomposition version of Jacobi iteration illustrating
! coarse grain parallelism with OpenMP.
!
! The grid points are split up into nthreads disjoint sets and each thread
! is assigned one set that it updates for all iterations.

program jacobi1d_omp2
    use omp_lib
    implicit none
    real(kind=8), dimension(:), allocatable :: x,u,uold,f
    real(kind=8) :: alpha, beta, dx, tol, dumax, dumax_thread
    real(kind=8), intrinsic :: exp
    real(kind=8) :: t1,t2
    integer :: n, nthreads, points_per_thread,thread_num
    integer :: i,iter,maxiter,istart,iend,nprint

    ! Specify number of threads to use:
    nthreads = 1       ! need this value in serial mode
    !$ nthreads = 2    
    !$ call omp_set_num_threads(nthreads)
    !$ print "('Using OpenMP with ',i3,' threads')", nthreads
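    ! (The !$ sentinel lines above are compiled only when OpenMP is
    !  enabled; otherwise they are ordinary comments and the serial
    !  defaults set above are used.)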

    nprint = 10000 ! print dumax every nprint iterations

    print *, "Input n ... "
    read *, n

    ! allocate storage for boundary points too:
    allocate(x(0:n+1), u(0:n+1), uold(0:n+1), f(0:n+1))

    open(unit=20, file="heatsoln.txt", status="unknown")

    call cpu_time(t1)

    ! grid spacing:
    dx = 1.d0 / (n+1.d0)
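    ! (n interior points split [0,1] into n+1 subintervals, so the
    !  boundaries x=0 and x=1 correspond to indices 0 and n+1)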

    ! boundary conditions:
    alpha = 20.d0
    beta = 60.d0

    ! tolerance and max number of iterations:
    tol = 0.1 * dx**2
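    ! (there is little point iterating below the O(dx**2) truncation
    !  error of the centered-difference discretization)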
    print *, "Convergence tolerance: tol = ",tol
    maxiter = 100000
    print *, "Maximum number of iterations: maxiter = ",maxiter

    ! Determine how many points to handle with each thread.
    ! Note that dividing two integers and assigning to an integer will
    ! round down if the result is not an integer.  
    ! This, together with the min(...) in the definition of iend below,
    ! ensures that all points will get distributed to some thread.
    points_per_thread = (n + nthreads - 1) / nthreads
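    ! For example, n = 100 with nthreads = 3 gives
    ! points_per_thread = 102/3 = 34, so threads 0, 1, 2 take
    ! i = 1-34, 35-68, and 69-100 (the last range capped at n).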
    print *, "points_per_thread = ",points_per_thread


    ! Start of the parallel block... 
    ! ------------------------------

    ! This is the only time threads are forked in this program:

    !$omp parallel private(thread_num, iter, istart, iend, i, dumax_thread) 
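    ! Variables in the private clause get a separate copy in each
    ! thread; the arrays x, u, uold, f and the scalar dumax are shared.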

    thread_num = 0     ! needed in serial mode
    !$ thread_num = omp_get_thread_num()    ! unique for each thread

    ! Determine start and end index for the set of points to be 
    ! handled by this thread:
    istart = thread_num * points_per_thread + 1
    iend = min((thread_num+1) * points_per_thread, n)

    !$omp critical
    print '("Thread ",i2," will take i = ",i6," through i = ",i6)', &
          thread_num, istart, iend
    !$omp end critical

    ! Initialize:
    ! -----------

    ! each thread sets part of these arrays:
    do i=istart, iend
        ! grid points:
        x(i) = i*dx
        ! source term:
        f(i) = 100.*exp(x(i))
        ! initial guess:
        u(i) = alpha + x(i)*(beta-alpha)
        enddo
    
    ! boundary points (not set in the loop above) need to be added:
    x(0) = 0.d0
    x(n+1) = 1.d0
    u(0) = alpha
    u(n+1) = beta

    uold = u   ! initialize, including boundary values


    ! Jacobi iteration:
    ! -------------------


    do iter=1,maxiter

        ! initialize uold to u (note each thread does part!)
        uold(istart:iend) = u(istart:iend) 

        !$omp single
        dumax = 0.d0     ! global max initialized by one thread
        !$omp end single

        ! Make sure all of uold is initialized before iterating
        !$omp barrier
        ! Make sure uold is consistent in memory:
        !$omp flush

        dumax_thread = 0.d0   ! max seen by this thread
        do i=istart,iend
            u(i) = 0.5d0*(uold(i-1) + uold(i+1) + dx**2*f(i))
            dumax_thread = max(dumax_thread, abs(u(i)-uold(i)))
            enddo

        !$omp critical
        ! update global dumax using value from this thread:
        dumax = max(dumax, dumax_thread)
        !$omp end critical

        ! make sure all threads are done with dumax:
        !$omp barrier

        !$omp single
        ! only one thread will do this print statement:
        if (mod(iter,nprint)==0) then
            print '("After ",i8," iterations, dumax = ",d16.6,/)', iter, dumax
            endif
        !$omp end single

        ! check for convergence:
        ! note that all threads have same value of dumax 
        ! at this point, so they will all exit on the same iteration.
        
        if (dumax .lt. tol) exit

        ! need to synchronize here so no thread resets dumax = 0
        ! at start of next iteration before all have done the test above.
        !$omp barrier

        enddo

    print '("Thread number ",i2," finished after ",i9, &
            " iterations, dumax = ", e16.6)', &
          thread_num,iter,dumax

    !$omp end parallel

    call cpu_time(t2)
    print '("CPU time = ",f12.8, " seconds")', t2-t1


    ! Write solution to heatsoln.txt:
    write(20,*) "          x                  u"
    do i=0,n+1
        write(20,'(2e20.10)') x(i), u(i)
        enddo

    print *, "Solution is in heatsoln.txt"

    close(20)

end program jacobi1d_omp2
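
To try the code, one possible compile-and-run session (assuming gfortran; any OpenMP-capable Fortran compiler works with its own flag):

    $ gfortran -fopenmp jacobi1d_omp2.f90 -o jacobi1d_omp2
    $ ./jacobi1d_omp2

Compiling without -fopenmp turns every !$ line into an ordinary comment, so the same source also runs serially with nthreads = 1. Note that cpu_time typically reports CPU time summed over all threads; omp_get_wtime() is the natural choice for measuring wall-clock speedup.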