module dist_mat
! Form distributed sector Hamiltonian matrix and diagonalize
! Time-stamp: "2003-09-23 17:08:23 cjn"
  use io_units, only: fo
  use precisn, only: wp
!  use blacs, only: ctxt, myrow, mycol, p, q, nblock, mynumrows,&
!       mynumcols, p_error, dlen_, nb_, csrc_
  use error_out, only: error_check
!  use magma
  implicit none

  integer, save               :: ny           ! a dimension
  real(wp), allocatable, save :: a(:)         ! distributed matrix
  real(wp), allocatable, save :: z(:)         ! eigenvectors
  real(wp), allocatable, save :: evals(:)     ! eigenvalues
!  integer, save               :: desca(dlen_) ! Hamiltonian descriptor
!  integer, save               :: descz(dlen_) ! Eigenvector descriptor

  private
  public A_fill, sh_diag, kill_az
  public evals, z

  interface

     integer(c_int) function cudaSetDevice(dev) bind(C, name='cudaSetDevice')
        use, intrinsic :: iso_c_binding
        integer(c_int), value   :: dev
     end function cudaSetDevice

     integer(c_int) function magma_setdevice ( dev ) bind(C,name='magma_setdevice')
        use, intrinsic :: iso_c_binding
!       use iso_c_binding
!       integer(c_int64_t) :: magma_setdevice
       integer(c_int),value  :: dev
     end function magma_setdevice

  end interface

contains

  subroutine A_fill (n)
! fill lower triangle of symmetric matrix, A
    use sec_ham, only: h_el
    integer, intent(in)   :: n       ! matrix dimension
    integer               :: j, jj, jloc, jglob, i, ii, iloc, iglob
    integer               :: status, info, numroc
    integer :: ind

    ny = n

! Compute local dimensions of matrix A
!    mynumrows = NUMROC(n, nblock, myrow, 0, p)
!    mynumcols = NUMROC(n, nblock, mycol, 0, q)

! Allocate local part of A
    allocate (a(ny*ny), z(ny*ny), stat=status)
    call error_check (status, ' Allocation Error, A_fill')

    a = 0.0_wp        ! initialize A

! Form array descriptor od destributed matrix A
!    call descinit (desca, n, n, nblock, nblock, 0, 0, ctxt, mynumrows,&
!         info)
!    descz = desca

!    do j = 1, mynumcols, nblock                   ! local column blocks
!       do jj = 1, MIN(nblock, mynumcols-j+1)      ! all columns
!          jloc = j - 1 + jj                       ! local column index
!          jglob = (j-1) * q + mycol * nblock + jj ! global column index

!          do i = 1, mynumrows, nblock
!             do ii = 1, MIN(nblock, mynumrows-i+1)
!                iloc = i - 1 + ii                       ! local index
!                iglob = (i-1) * p + myrow * nblock + ii ! global index
!                if (iglob > jglob) cycle
                
!                a(iloc+(jloc-1)*mynumrows) = h_el(iglob, jglob)

!             end do
!          end do
!       end do
!    end do

! sequential version - full matrix
  ind = 0
  do j = 1, ny
    do i = 1, j
      ind = ind + 1
      a(ind) = h_el(i,j)
    end do
  end do

  end subroutine A_fill

  subroutine sh_diag (imesh,sect)
! diagonalize the matrix a
    real(wp), allocatable        :: work(:)
    integer, allocatable         :: iwork(:)
    integer                      :: lwork, liwork
    real(wp)                     :: alpha, beta
    character(len=1)             :: jobz, uplo
    logical                      :: ismycol
    real(wp)                     :: dnorm2
    integer                      :: status, ii, nstrt, trilwmin
    integer                      :: np, nq, lda, iproc, info, numroc
    integer                      :: indxg2p, lwork1, ip, iq, imyrow, imycol
    integer                      :: rows, cols
    real(wp)                     :: t0, t1
    integer                      :: c0, c1, cr
    integer                      :: imesh, sect
    real(wp), external :: dnrm2
! Magma settings
    integer :: ijobz = 999 ! default setting
    integer :: iuplo = 999 ! default setting
    magma_devptr_t :: dA, dEvals, dWork, dIwork
    integer :: nGPU=1
    integer :: sizeof_double=8
    integer :: sizeof_int=4
    integer :: queue = -66
      CHARACTER(len=255) :: c_nGPU
      integer :: i_nGPU, status_val

! initialize eigenvector distributed array, z.
     alpha = 0.0_wp
     beta = 1.0_wp
!     call pdlaset ('all', ny, ny, alpha, beta, z, 1, 1, descz)
!     np = mynumrows
!     nq = mynumcols5

! perform diagonalization:
     jobz = 'V'
     uplo = 'U'
     liwork = 3 + 5*ny
     lwork = 1 + 6*ny + 2*ny*ny

     i_nGPU=-1
     call get_environment_variable("RMX_NGPU", c_nGPU, STATUS=status_val)
     if(status_val==0) then
        read( c_nGPU, '(i10)' ) i_nGPU
        write (*,'(a,i0)') "Environmental variable RMX_NGPU (no. GPUs for each sector diag) = ",i_nGPU
     else if (status_val==1) then
        write (*,'(a)') "Environmental variable RMX_NGPU not set, using default no. GPUs (1) "
     else 
        write (*,'(a,i0)') "**** Error - env variable RMX_NGPU, using default (1) ****, status value = ", status_val
     end if

     if (i_nGPU > 0 ) then
       nGPU = i_nGPU
     else
       nGPU = 1
     end if

     allocate (work(lwork), iwork(liwork), evals(ny), stat=status)
     call error_check (status, 'sh_diag: allocation error')

     write (fo,'(/,a,/)') 'Calling MAGMA DSYEVD'
     call cpu_time (t0)
     call system_clock (count=c0)

     if (nGPU == 1 ) then
        call magmaf_dsyevd(jobz, uplo, ny, a, ny, evals, work, lwork, iwork,liwork, info)
    else
        call magmaf_dsyevd_m(nGPU, jobz, uplo, ny, a, ny, evals, work, lwork, iwork,liwork, info)
    end if    

     call cpu_time (t1)
     write (fo,'(a,f16.4,a)') 'MAGMA DSYEVD CPU time = ', t1 - t0, ' secs'
     call system_clock (count=c1, count_rate=cr)
     write (fo,'(a,f16.4,a)') 'MAGMA DSYEVD Elapsed time = ', REAL(c1-c0,wp) / &
             REAL(cr,wp), ' secs'

     call error_check (info, 'sh_diag: pdsyevd error')

     deallocate (work, iwork, stat=status)
     call error_check (status, 'sh_diag: deallocation error')

     write (fo,'(/,a,i0,a,i0,a,5f14.4)') ' Mesh ',imesh,', Sector ',sect,': first five eigenvalues = ', evals(1:5)
     write (fo,'(a,i0,a,i0,a,5f14.4,/)') ' Mesh ',imesh,', Sector ',sect,': final five eigenvalues = ', evals(ny-4:ny)

     z = a
     evecs: do ii = 1, ny ! normalize eigenvectors
! indxg2p computes process coord which posseses entry of a
! distributed matrix specified by a global index INDXGLOB.
!        iproc = indxg2p(ii, descz(nb_), mycol, descz(csrc_), q)
!        ismycol = (iproc == mycol)
!        if (ismycol) then
!           call pdnrm2 (ny, dnorm2, z, 1, ii, descz, 1)
            
           dnorm2 = dnrm2(ny,z,1)    
           alpha = 1.0_wp / REAL(dnorm2,wp)
           call dscal(ny, alpha, z, 1)
!        end if
     end do evecs
   end subroutine sh_diag

   subroutine kill_az
     integer       :: status
     deallocate (a, z, evals, stat=status)
     call error_check (status, 'kill_az: deallocation error')
   end subroutine kill_az

end module dist_mat
