!===============================================================================
!
! BQCD -- Berlin Quantum ChromoDynamics program
!
! Author: Hinnerk Stueben <stueben@zib.de>
!
! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
!
!-------------------------------------------------------------------------------
!
! D3.F90 - multiplication with the Wilson hopping matrix D (or D^\dagger)
!          (optimization for Hitachi SR8000: hybrid programming model, 
!           MPI + OpenMP + overlapping communication and computation)
!
!-------------------------------------------------------------------------------
# include "defs.h"

!-------------------------------------------------------------------------------
subroutine NAME(ee, oo, out, in, u)
 
! out := NAME in 
!
! NAME = d or d_dag
!
! out is of type "e" = EVEN or ODD
! in is of type "o" = ODD or EVEN
!
! Purpose:
!   Driver that applies the operator selected by the preprocessor macro NAME
!   to the field in and stores the result in out.  The work is organised as
!   four barrier-separated phases inside one OpenMP parallel region.  In the
!   first three phases thread 0 calls an xbound_* routine on in, while every
!   other thread calls the NAME_switch_0 or NAME_switch kernel on its own
!   index range.  In the fourth phase all threads call the kernel.
!
! Arguments:
!   ee, oo - integers, only read here (copied into thread-private e, o) and
!            forwarded to the kernels; per the note above they tag the
!            even or odd type of out and in respectively
!   out    - result field, passed through to the kernels
!   in     - source field, passed to the xbound_* routines and to the kernels
!   u      - gauge field, passed through to the kernels
!
! NOTE(review): the last integer argument of the kernels (1, 2, 3, 4)
! presumably selects the lattice direction handled in that phase, and the
! xbound_* routines presumably perform the boundary exchange mentioned in
! the file header (overlapping communication and computation) - confirm
! against their definitions, which are not part of this file.

  use module_nn
  use module_vol
  use module_thread
  implicit none
 
  integer :: ee, oo
  SPINCOL_FIELD :: out, in
  GAUGE_FIELD :: u

  ! thread : OpenMP thread number of the executing thread
  ! i1, i2 : first and last index of the range handled by this thread
  ! e, o   : thread-private copies of ee and oo
  integer :: thread, i1, i2, omp_get_thread_num, e, o

  TIMING_START(STRCAT(timing_bin_, NAME))

  ! Executed once, before the parallel region is opened.
  call xbound_fill_buffer_y(in)
  call xbound_fill_buffer_z(in)

  !$omp parallel private(thread, i1, i2, e, o)

  thread = omp_get_thread_num()
  e = ee
  o = oo

  !$omp barrier

  ! Index range of this thread for phases 1 to 3, read from the per-thread
  ! tables xyz_start and xyz_end (not declared here, so presumably supplied
  ! by one of the modules used above).
  ! NOTE(review): thread 0 never uses this range, so the tables presumably
  ! give it no work in phases 1 to 3 - confirm where they are set up.
  i1 = xyz_start(thread)
  i2 = xyz_end(thread)

  ! Phase 1: thread 0 handles the y buffer; all other threads run the
  ! _switch_0 variant of the kernel with selector 1.
  ! NOTE(review): only this phase uses the _switch_0 variant; presumably it
  ! initialises out whereas _switch accumulates into it - confirm.
  if (thread == 0) then
     TIMING_START(timing_bin_d_xf)
     call xbound_copy_buffer_y(in)
  else
     call STRCAT(NAME, _switch_0)(e, o, out, in, u, i1, i2, 1)
  endif

  !$omp barrier

  ! Phase 2: thread 0 handles the z buffer; all other threads run the
  ! kernel with selector 2.
  if (thread == 0) then
     TIMING_STOP(timing_bin_d_xf)
     TIMING_START(timing_bin_d_yf)
     call xbound_copy_buffer_z(in)
     ! The following call is disabled in the original source.
     !!call xbound_d3(in, 3)
  else
     call STRCAT(NAME, _switch)(e, o, out, in, u, i1, i2, 2)
  endif

  !$omp barrier

  ! Phase 3: thread 0 calls xbound_d3 with argument 4; all other threads
  ! run the kernel with selector 3.
  if (thread == 0) then
     TIMING_STOP(timing_bin_d_yf)
     TIMING_START(timing_bin_d_zf)
     call xbound_d3(in, 4)
  else
     call STRCAT(NAME, _switch)(e, o, out, in, u, i1, i2, 3)
  endif

  !$omp barrier

  ! The if block below contains nothing but timing macros, so it is only
  ! compiled when TIMING is defined.
#ifdef TIMING
  if (thread == 0) then
     TIMING_STOP(timing_bin_d_zf)
     TIMING_START(timing_bin_d_t)
  endif
#endif

  ! Phase 4: every thread, including thread 0, runs the kernel with
  ! selector 4 on a different index range taken from t_start and t_end.
  i1 = t_start(thread)
  i2 = t_end(thread)

     call STRCAT(NAME, _switch)(e, o, out, in, u, i1, i2, 4)

  !$omp end parallel

  ! The timer started by thread 0 inside the parallel region is stopped
  ! here, after the region has ended.
  TIMING_STOP(timing_bin_d_t)
  TIMING_STOP(STRCAT(timing_bin_, NAME))

end

!===============================================================================
