/*************  mult_Hw.c *******************************/
/*
   mult_Hw(wilson_vector *src, wilson_vector *dest);
   dest(x)=g5*{src(x)-kappa*SUM_dirs( (1+g[dir])*U(x,dir)*src(x+dir) 
   +(1-g[dir])*U+(x-dir,dir)*src(x-dir))}       
 */

#include "./include/includes.h"
#include <omp.h>

/* Forward declarations of the two kernels every routine in this file calls
   first.  Neither is defined in the visible part of this file.
   Each takes a source field, a destination field, a sign flag (isign) and a
   parity selector; the call sites below pass EVEN, ODD or EVENANDODD as
   parity.  dslash works on wilson_vector fields, dslash_32 on raw float
   arrays.
   NOTE(review): judging by the file header formula these presumably compute
   the SUM_dirs(...) neighbour term that is later scaled by -kappa -- confirm
   against the definitions. */
void dslash( wilson_vector * src, wilson_vector * dest, int isign, int parity );
void dslash_32( float * src, float * dest, int isign, int parity );

/* WILSON */

/*
 * multiply_fmat_32 -- single precision matrix multiplication.
 *
 *   src    input field, passed as a raw float array
 *   dest   output field, passed as a raw float array
 *   isign  sign flag forwarded unchanged to dslash_32
 *
 * dslash_32 is run once with parity EVEN and once with parity ODD, after
 * which latutil_xpay_32 is called on src and dest with coefficient -kappa
 * (narrowed to float) over EVENANDODD.
 * NOTE(review): presumably this leaves dest = src - kappa * dslash(src);
 * confirm against latutil_xpay_32.
 */
void multiply_fmat_32( float * src, float * dest, int isign )
{
    float minus_kappa;

    /* The two parities are handled by separate kernel calls. */
    dslash_32( src, dest, isign, EVEN );
    dslash_32( src, dest, isign, ODD );

    /* kappa is read only after both dslash_32 calls have returned. */
    minus_kappa = -(float)kappa;
    latutil_xpay_32( src, minus_kappa, dest, EVENANDODD );
}
/*
 * multiply_hfmat_32 -- single precision counterpart of multiply_hfmat.
 *
 *   src   input field, passed as a raw float array
 *   dest  output field, passed as a raw float array
 *
 * Runs dslash_32 with isign = 1 over EVENANDODD, then calls
 * latutil_5xpay_32 on src and dest with coefficient -kappa (narrowed to
 * float) over EVENANDODD.
 * NOTE(review): the "5" presumably means the g5 factor of the file header
 * formula is applied inside latutil_5xpay_32 -- confirm.
 */
void multiply_hfmat_32( float * src, float * dest )
{
    float minus_kappa;

    dslash_32( src, dest, 1, EVENANDODD );

    /* kappa is read only after dslash_32 has returned. */
    minus_kappa = -(float)kappa;
    latutil_5xpay_32( src, minus_kappa, dest, EVENANDODD );
}
/*
 * multiply_hfmat -- wilson_vector version of the operator described in the
 * file header: dest = g5 * ( src - kappa * dslash(src) ).
 *
 *   src   input field, indexed by site
 *   dest  output field, indexed by site; overwritten
 *
 * dslash is run with isign = 1 over EVENANDODD.  Then, for every site,
 * scalar_mult_add_wvec combines src and dest with coefficient -kappa into
 * dest, and g5_mult_wvec is applied to dest in place.
 * NOTE(review): assumes scalar_mult_add_wvec(a, b, s, c) stores a + s*b in
 * c and tolerates b == c -- confirm against its definition.
 */
void multiply_hfmat( wilson_vector * src, wilson_vector * dest )
{
    int idx;
    site *sp;   /* needed by FORALLSITES; not referenced in the loop body */

    dslash( src, dest, 1, EVENANDODD );

    FORALLSITES(idx,sp)
    {
        wilson_vector *in  = src + idx;
        wilson_vector *out = dest + idx;

        scalar_mult_add_wvec( in, out, -kappa, out );
        g5_mult_wvec( out, out );
    }
}


/*
 * multiply_fmat -- wilson_vector matrix multiplication without the g5 factor.
 *
 *   src    input field
 *   dest   output field; overwritten
 *   isign  sign flag forwarded unchanged to dslash
 *
 * dslash is run over EVENANDODD, then scalar_mult_add_wvec_lattice is
 * launched over sites_on_node sites with arguments (src, dest, -kappa, dest),
 * followed by targetSynchronize().
 * NOTE(review): presumably this leaves dest = src - kappa * dslash(src);
 * confirm against scalar_mult_add_wvec_lattice.
 *
 * When VERBOSE_TIMINGS is defined, the wall-clock time of each of the two
 * steps is printed through node0_printf.  All timing state lives inside the
 * #ifdef so that a normal build carries no unused variables and makes no
 * timer calls.
 */
void multiply_fmat( wilson_vector * src, wilson_vector * dest, int isign) 
{
#ifdef VERBOSE_TIMINGS
  double t1, t2, elapsed;

  t1=omp_get_wtime();
#endif

  dslash( src, dest, isign, EVENANDODD );

#ifdef VERBOSE_TIMINGS
  t2=omp_get_wtime();
  node0_printf(" - dslash %1.16e s \n",t2-t1);
  t1=omp_get_wtime();
#endif

  scalar_mult_add_wvec_lattice __targetLaunch__(sites_on_node) (src,dest,-kappa,dest);
  /* Wait for the launched kernel so that dest is complete on return (and so
     that the timing below covers the whole kernel). */
  targetSynchronize();

#ifdef VERBOSE_TIMINGS
  t2=omp_get_wtime();
  elapsed=t2-t1;
  /* Bandwidth estimate: 576 bytes per site, divided by 2^30 bytes.
     NOTE(review): 576 is presumably 3 field accesses x 192 bytes per
     double precision wilson_vector -- confirm. */
  node0_printf(" -  scalar_mult_add_wvec %1.16e s %1.16e GB/s \n",elapsed,sites_on_node*576./(elapsed*1073741824.));
#endif
}
