#include "includes.h"
#include <omp.h>
 
/* matrix x matrix */
/*
 * Fill c entry by entry: c(row,col) is the sum over n of the CMUL_J product
 * of a(row,n) with b(col,n) -- note that b is read with its indices swapped.
 * NOTE(review): presumably CMUL_J(p,q,r) sets r = p * conj(q), which would
 * make this c = a * adjoint(b); confirm against the macro definition.
 * Entries of c are stored while a and b are still being read, so passing c
 * aliased to an input would feed partially updated data into later entries.
 */
void mult_su3_na_KE(  su3_matrix *a, su3_matrix *b, su3_matrix *c )
{
    int row, col, n;
    complex acc, term;

    for (row = 0; row < 3; row++) {
        for (col = 0; col < 3; col++) {
            /* running sum for this output entry */
            acc.real = 0.0;
            acc.imag = 0.0;
            for (n = 0; n < 3; n++) {
                CMUL_J( a->ROWCOL(row,n) , b->ROWCOL(col,n) , term );
                CSUM( acc , term );
            }
            c->ROWCOL(row,col) = acc;
        }
    }
}
/*
 * Plain 3x3 matrix product: c(row,col) is the sum over n of the CMUL product
 * of a(row,n) with b(n,col).
 * NOTE(review): presumably CMUL(p,q,r) sets r = p * q (complex multiply);
 * confirm against the macro definition.
 * Entries of c are stored while a and b are still being read, so passing c
 * aliased to an input would feed partially updated data into later entries.
 */
void mult_su3_nn_KE(  su3_matrix *a, su3_matrix *b, su3_matrix *c )
{
    int row, col, n;
    complex acc, term;

    for (row = 0; row < 3; row++) {
        for (col = 0; col < 3; col++) {
            /* running sum for this output entry */
            acc.real = 0.0;
            acc.imag = 0.0;
            for (n = 0; n < 3; n++) {
                CMUL( a->ROWCOL(row,n) , b->ROWCOL(n,col) , term );
                CSUM( acc , term );
            }
            /* stored component by component */
            c->ROWCOL(row,col).real = acc.real;
            c->ROWCOL(row,col).imag = acc.imag;
        }
    }
}
/*
 * Fill c entry by entry: c(row,col) is the sum over n of the CMULJ_ product
 * of a(n,row) with b(n,col) -- note that a is read with its indices swapped.
 * NOTE(review): presumably CMULJ_(p,q,r) sets r = conj(p) * q, which would
 * make this c = adjoint(a) * b; confirm against the macro definition.
 * Entries of c are stored while a and b are still being read, so passing c
 * aliased to an input would feed partially updated data into later entries.
 */
void mult_su3_an_KE(  su3_matrix *a, su3_matrix *b, su3_matrix *c )
{
    int row, col, n;
    complex acc, term;

    for (row = 0; row < 3; row++) {
        for (col = 0; col < 3; col++) {
            /* running sum for this output entry */
            acc.real = 0.0;
            acc.imag = 0.0;
            for (n = 0; n < 3; n++) {
                CMULJ_( a->ROWCOL(n,row) , b->ROWCOL(n,col), term );
                CSUM( acc , term );
            }
            c->ROWCOL(row,col) = acc;
        }
    }
}
/*
 * Fill c entry by entry: c(row,col) is the sum over n of the CMULJJ product
 * of a(n,row) with b(col,n) -- both inputs are read with swapped indices.
 * NOTE(review): presumably CMULJJ(p,q,r) sets r = conj(p) * conj(q), which
 * would make this c = adjoint(a) * adjoint(b); confirm against the macro
 * definition.
 * Entries of c are stored while a and b are still being read, so passing c
 * aliased to an input would feed partially updated data into later entries.
 */
void mult_su3_aa_KE(  su3_matrix *a, su3_matrix *b, su3_matrix *c )
{
    int row, col, n;
    complex acc, term;

    for (row = 0; row < 3; row++) {
        for (col = 0; col < 3; col++) {
            /* running sum for this output entry */
            acc.real = 0.0;
            acc.imag = 0.0;
            for (n = 0; n < 3; n++) {
                CMULJJ( a->ROWCOL(n,row) , b->ROWCOL(col,n), term );
                CSUM( acc , term );
            }
            c->ROWCOL(row,col) = acc;
        }
    }
}


//inline void mult_su3_32(float *A, float *x, float *y)
/*
 * y = A * x for a 3x3 complex matrix and a 3-component complex vector,
 * single precision.
 *
 *   A : 18 floats, row-major, (re,im) interleaved; entry (r,k) lives at
 *       A[2*(3*r+k)] (real) and A[2*(3*r+k)+1] (imaginary).
 *   x : 6 floats, three (re,im) pairs (input).
 *   y : 6 floats, three (re,im) pairs (output).
 *
 * Each output component is stored as soon as its row is finished, so y
 * overlapping x would corrupt the inputs of the remaining rows.
 */
void mult_su3_32(float *A, float *x, float *y)
{
    int row, k;

    for (row = 0; row < 3; row++) {
        const float *a = A + 6*row;   /* first entry of this matrix row */
        float sum_re = 0;
        float sum_im = 0;

        for (k = 0; k < 3; k++) {
            /* (a_re + i a_im) * (x_re + i x_im) */
            sum_re += a[2*k]*x[2*k] - a[2*k+1]*x[2*k+1];
            sum_im += a[2*k+1]*x[2*k] + a[2*k]*x[2*k+1];
        }
        y[2*row] = sum_re;
        y[2*row+1] = sum_im;
    }
}

/*
 * y = adjoint(A) * x for a 3x3 complex matrix and a 3-component complex
 * vector, single precision.
 *
 *   A : 18 floats, row-major, (re,im) interleaved; entry (r,k) lives at
 *       A[2*(3*r+k)] (real) and A[2*(3*r+k)+1] (imaginary).
 *   x : 6 floats, three (re,im) pairs (input).
 *   y : 6 floats, three (re,im) pairs (output).
 *
 * Output component `col` uses the complex conjugates of matrix column `col`.
 * Each output component is stored as soon as it is finished, so y
 * overlapping x would corrupt the inputs of the remaining components.
 */
void mult_adj_su3_32(float *A, float *x, float *y)
{
    int col, k;

    for (col = 0; col < 3; col++) {
        float sum_re = 0;
        float sum_im = 0;

        for (k = 0; k < 3; k++) {
            const float *a = A + 2*(3*k+col);   /* entry (k,col) of A */

            /* (a_re - i a_im) * (x_re + i x_im) */
            sum_re += a[0]*x[2*k] + a[1]*x[2*k+1];
            sum_im += -a[1]*x[2*k] + a[0]*x[2*k+1];
        }
        y[2*col] = sum_re;
        y[2*col+1] = sum_im;
    }
}

/* dslash */
/*
 * Project every site in the requested parity range of src onto four
 * two-spin "half vectors", one output buffer per direction
 * (b1, b2, b3, b4 <-> X, Y, Z, T).
 *
 * Layouts, as fixed by the indexing below:
 *   src : 24 floats per site, index 24*site + 8*color + 2*spin + part
 *   bN  : 12 floats per site, index 12*site + 6*halfspin + 2*color + part
 * with part = 0 for the real and 1 for the imaginary component.
 *
 * isign  : PLUS selects the first set of sign combinations; any other value
 *          selects the set with every combination sign flipped.
 * parity : EVEN restricts the work to sites [0, even_sites_on_node),
 *          ODD to [even_sites_on_node, sites_on_node); any other value
 *          covers all sites.
 *
 * The parity range applies to BOTH isign cases.  Previously the non-PLUS
 * case looped over all sites regardless of parity, doing redundant work and
 * writing entries of b1..b4 outside the requested range.
 */
void latutil_dslash0_32(float *src, 
        float *b1, float *b2, float *b3, float *b4, 
        int isign, int parity)
{
    int i, c;
    int lat_begin, lat_end;

    lat_begin = (parity == ODD) ? even_sites_on_node : 0;
    lat_end = (parity == EVEN) ? even_sites_on_node : sites_on_node;

    for (i = lat_begin; i < lat_end; i++)
    {
        for (c = 0; c < 3; c++)
        {
            /* s[2*spin + part] : the four spin components of color c */
            const float *s = src + 24*i + 8*c;
            /* hN[0], hN[1] : half-spin 0 (re, im);
               hN[6], hN[7] : half-spin 1 (re, im) */
            float *h1 = b1 + 12*i + 2*c;
            float *h2 = b2 + 12*i + 2*c;
            float *h3 = b3 + 12*i + 2*c;
            float *h4 = b4 + 12*i + 2*c;

            if (isign == PLUS)
            {
                /* X */
                h1[0] = s[0] - s[7];
                h1[1] = s[1] + s[6];
                h1[6] = s[2] - s[5];
                h1[7] = s[3] + s[4];

                /* Y */
                h2[0] = s[0] - s[6];
                h2[1] = s[1] - s[7];
                h2[6] = s[2] + s[4];
                h2[7] = s[3] + s[5];

                /* Z */
                h3[0] = s[0] - s[5];
                h3[1] = s[1] + s[4];
                h3[6] = s[2] + s[7];
                h3[7] = s[3] - s[6];

                /* T */
                h4[0] = s[0] + s[4];
                h4[1] = s[1] + s[5];
                h4[6] = s[2] + s[6];
                h4[7] = s[3] + s[7];
            }
            else
            {
                /* XDOWN */
                h1[0] = s[0] + s[7];
                h1[1] = s[1] - s[6];
                h1[6] = s[2] + s[5];
                h1[7] = s[3] - s[4];

                /* YDOWN */
                h2[0] = s[0] + s[6];
                h2[1] = s[1] + s[7];
                h2[6] = s[2] - s[4];
                h2[7] = s[3] - s[5];

                /* ZDOWN */
                h3[0] = s[0] + s[5];
                h3[1] = s[1] - s[4];
                h3[6] = s[2] - s[7];
                h3[7] = s[3] + s[6];

                /* TDOWN */
                h4[0] = s[0] - s[4];
                h4[1] = s[1] - s[5];
                h4[6] = s[2] - s[6];
                h4[7] = s[3] - s[7];
            }
        }
    }
}

/*
 * For every site in the requested parity range: project src onto four
 * two-spin "half vectors" (one per direction X, Y, Z, T), then multiply each
 * half vector by the adjoint of that site's link matrix for the direction
 * and store the result in b1..b4.
 *
 * Layouts, as fixed by the indexing below:
 *   src : 24 floats per site, index 24*site + 8*color + 2*spin + part
 *   u   : 18 floats per (site, direction), matrix at u + 18*(4*site + dir)
 *   bN  : 12 floats per site, index 12*site + 6*halfspin + 2*color + part
 * with part = 0 for the real and 1 for the imaginary component.
 *
 * isign  : PLUS selects the first set of sign combinations; any other value
 *          selects the set with every combination sign flipped.
 * parity : EVEN restricts the work to sites [0, even_sites_on_node),
 *          ODD to [even_sites_on_node, sites_on_node); any other value
 *          covers all sites.
 *
 * The parity range applies to BOTH isign cases.  Previously the non-PLUS
 * case looped over all sites regardless of parity, doing redundant work and
 * writing entries of b1..b4 outside the requested range.
 */
void latutil_dslash1_32(float *src, float *u,
        float *b1, float *b2, float *b3, float *b4, 
        int isign, int parity)
{
    int i, c;
    int lat_begin, lat_end;
    /* per-site projected half vectors, index 6*halfspin + 2*color + part */
    float b1temp[12], b2temp[12], b3temp[12], b4temp[12];

    lat_begin = (parity == ODD) ? even_sites_on_node : 0;
    lat_end = (parity == EVEN) ? even_sites_on_node : sites_on_node;

    for (i = lat_begin; i < lat_end; i++)
    {
        for (c = 0; c < 3; c++)
        {
            /* s[2*spin + part] : the four spin components of color c */
            const float *s = src + 24*i + 8*c;
            /* tN[0], tN[1] : half-spin 0 (re, im);
               tN[6], tN[7] : half-spin 1 (re, im) */
            float *t1 = b1temp + 2*c;
            float *t2 = b2temp + 2*c;
            float *t3 = b3temp + 2*c;
            float *t4 = b4temp + 2*c;

            if (isign == PLUS)
            {
                /* X */
                t1[0] = s[0] - s[7];
                t1[1] = s[1] + s[6];
                t1[6] = s[2] - s[5];
                t1[7] = s[3] + s[4];

                /* Y */
                t2[0] = s[0] - s[6];
                t2[1] = s[1] - s[7];
                t2[6] = s[2] + s[4];
                t2[7] = s[3] + s[5];

                /* Z */
                t3[0] = s[0] - s[5];
                t3[1] = s[1] + s[4];
                t3[6] = s[2] + s[7];
                t3[7] = s[3] - s[6];

                /* T */
                t4[0] = s[0] + s[4];
                t4[1] = s[1] + s[5];
                t4[6] = s[2] + s[6];
                t4[7] = s[3] + s[7];
            }
            else
            {
                /* XDOWN */
                t1[0] = s[0] + s[7];
                t1[1] = s[1] - s[6];
                t1[6] = s[2] + s[5];
                t1[7] = s[3] - s[4];

                /* YDOWN */
                t2[0] = s[0] + s[6];
                t2[1] = s[1] + s[7];
                t2[6] = s[2] - s[4];
                t2[7] = s[3] - s[5];

                /* ZDOWN */
                t3[0] = s[0] + s[5];
                t3[1] = s[1] - s[4];
                t3[6] = s[2] - s[7];
                t3[7] = s[3] + s[6];

                /* TDOWN */
                t4[0] = s[0] - s[4];
                t4[1] = s[1] - s[5];
                t4[6] = s[2] - s[6];
                t4[7] = s[3] - s[7];
            }
        }

        /* multiply by adjoint matrix: one call per half-spin component */
        mult_adj_su3_32(u+18*(4*i+XUP),b1temp+6*0,b1+12*i+6*0);
        mult_adj_su3_32(u+18*(4*i+XUP),b1temp+6*1,b1+12*i+6*1);
        mult_adj_su3_32(u+18*(4*i+YUP),b2temp+6*0,b2+12*i+6*0);
        mult_adj_su3_32(u+18*(4*i+YUP),b2temp+6*1,b2+12*i+6*1);
        mult_adj_su3_32(u+18*(4*i+ZUP),b3temp+6*0,b3+12*i+6*0);
        mult_adj_su3_32(u+18*(4*i+ZUP),b3temp+6*1,b3+12*i+6*1);
        mult_adj_su3_32(u+18*(4*i+TUP),b4temp+6*0,b4+12*i+6*0);
        mult_adj_su3_32(u+18*(4*i+TUP),b4temp+6*1,b4+12*i+6*1);
    }
}

/*
 * For every site in the requested parity range: fetch the four half vectors
 * addressed by pt1..pt4 (X, Y, Z, T), multiply each by the site's link
 * matrix for that direction, and expand the four results into the full
 * 4-spin vector at dest.  The X contribution is stored with plain
 * assignment, so dest is overwritten rather than accumulated into; the Y, Z
 * and T contributions are then added on top.
 *
 * Layouts, as fixed by the indexing below:
 *   dest  : 24 floats per site, index 24*site + 8*color + 2*spin + part
 *   u     : 18 floats per (site, direction), matrix at u + 18*(4*site + dir)
 *   ptN[site] : address of 12 floats, index 6*halfspin + 2*color + part
 * with part = 0 for the real and 1 for the imaginary component.
 *
 * isign  : PLUS selects the first reconstruction sign table, any other
 *          value the second one.
 * parity : EVEN restricts the work to sites [0, even_sites_on_node),
 *          ODD to [even_sites_on_node, sites_on_node); any other value
 *          covers all sites.
 */
void latutil_dslash2_32(float *dest, float *u, 
        char **pt1, char **pt2, char **pt3, char **pt4, 
        int isign, int parity)
{
    int site, col;
    int first_site, end_site;
    /* link-multiplied half vectors for the current site */
    float tx[12], ty[12], tz[12], tt[12];
    float *gx, *gy, *gz, *gt;

    first_site = (parity == ODD) ? even_sites_on_node : 0;
    end_site = (parity == EVEN) ? even_sites_on_node : sites_on_node;

    for (site = first_site; site < end_site; site++)
    {
        gx = (float *)(pt1[site]);
        gy = (float *)(pt2[site]);
        gz = (float *)(pt3[site]);
        gt = (float *)(pt4[site]);

        /* multiply by matrix: one call per half-spin component */
        mult_su3_32(u + 18*(4*site + XUP), gx,     tx);
        mult_su3_32(u + 18*(4*site + XUP), gx + 6, tx + 6);
        mult_su3_32(u + 18*(4*site + YUP), gy,     ty);
        mult_su3_32(u + 18*(4*site + YUP), gy + 6, ty + 6);
        mult_su3_32(u + 18*(4*site + ZUP), gz,     tz);
        mult_su3_32(u + 18*(4*site + ZUP), gz + 6, tz + 6);
        mult_su3_32(u + 18*(4*site + TUP), gt,     tt);
        mult_su3_32(u + 18*(4*site + TUP), gt + 6, tt + 6);

        for (col = 0; col < 3; col++)
        {
            /* d[2*spin + part] : the four spin components of this color */
            float *d = dest + 24*site + 8*col;
            /* hN[0], hN[1] : half-spin 0 (re, im);
               hN[6], hN[7] : half-spin 1 (re, im) */
            const float *hx = tx + 2*col;
            const float *hy = ty + 2*col;
            const float *hz = tz + 2*col;
            const float *ht = tt + 2*col;

            if (isign == PLUS)
            {
                /* XUP - plain stores, no accumulation */
                d[0] = +hx[0];
                d[1] = +hx[1];
                d[2] = +hx[6];
                d[3] = +hx[7];
                d[4] = +hx[7];
                d[5] = -hx[6];
                d[6] = +hx[1];
                d[7] = -hx[0];

                /* YUP */
                d[0] += hy[0];
                d[1] += hy[1];
                d[2] += hy[6];
                d[3] += hy[7];
                d[4] += hy[6];
                d[5] += hy[7];
                d[6] -= hy[0];
                d[7] -= hy[1];

                /* ZUP */
                d[0] += hz[0];
                d[1] += hz[1];
                d[2] += hz[6];
                d[3] += hz[7];
                d[4] += hz[1];
                d[5] -= hz[0];
                d[6] -= hz[7];
                d[7] += hz[6];

                /* TUP */
                d[0] += ht[0];
                d[1] += ht[1];
                d[2] += ht[6];
                d[3] += ht[7];
                d[4] += ht[0];
                d[5] += ht[1];
                d[6] += ht[6];
                d[7] += ht[7];
            }
            else
            {
                /* XDOWN - plain stores, no accumulation */
                d[0] = +hx[0];
                d[1] = +hx[1];
                d[2] = +hx[6];
                d[3] = +hx[7];
                d[4] = -hx[7];
                d[5] = +hx[6];
                d[6] = -hx[1];
                d[7] = +hx[0];

                /* YDOWN */
                d[0] += hy[0];
                d[1] += hy[1];
                d[2] += hy[6];
                d[3] += hy[7];
                d[4] -= hy[6];
                d[5] -= hy[7];
                d[6] += hy[0];
                d[7] += hy[1];

                /* ZDOWN */
                d[0] += hz[0];
                d[1] += hz[1];
                d[2] += hz[6];
                d[3] += hz[7];
                d[4] -= hz[1];
                d[5] += hz[0];
                d[6] += hz[7];
                d[7] -= hz[6];

                /* TDOWN */
                d[0] += ht[0];
                d[1] += ht[1];
                d[2] += ht[6];
                d[3] += ht[7];
                d[4] -= ht[0];
                d[5] -= ht[1];
                d[6] -= ht[6];
                d[7] -= ht[7];
            }
        }
    }
}

/*
 * For every site in the requested parity range: fetch the four half vectors
 * addressed by pt1..pt4 (X, Y, Z, T) and accumulate their expansion into the
 * full 4-spin vector at dest.  Every contribution uses += / -=, so dest must
 * already hold the value being added to.  No link multiplication is done
 * here.
 *
 * Layouts, as fixed by the indexing below:
 *   dest  : 24 floats per site, index 24*site + 8*color + 2*spin + part
 *   ptN[site] : address of 12 floats, index 6*halfspin + 2*color + part
 * with part = 0 for the real and 1 for the imaginary component.
 *
 * isign  : PLUS selects the first reconstruction sign table, any other
 *          value the second one.
 * parity : EVEN restricts the work to sites [0, even_sites_on_node),
 *          ODD to [even_sites_on_node, sites_on_node); any other value
 *          covers all sites.
 */
void latutil_dslash3_32(float *dest, 
        char **pt1, char **pt2, char **pt3, char **pt4, 
        int isign, int parity)
{
    int site, col;
    int first_site, end_site;
    float *gx, *gy, *gz, *gt;

    first_site = (parity == ODD) ? even_sites_on_node : 0;
    end_site = (parity == EVEN) ? even_sites_on_node : sites_on_node;

    for (site = first_site; site < end_site; site++)
    {
        gx = (float *)(pt1[site]);
        gy = (float *)(pt2[site]);
        gz = (float *)(pt3[site]);
        gt = (float *)(pt4[site]);

        for (col = 0; col < 3; col++)
        {
            /* d[2*spin + part] : the four spin components of this color */
            float *d = dest + 24*site + 8*col;
            /* hN[0], hN[1] : half-spin 0 (re, im);
               hN[6], hN[7] : half-spin 1 (re, im) */
            const float *hx = gx + 2*col;
            const float *hy = gy + 2*col;
            const float *hz = gz + 2*col;
            const float *ht = gt + 2*col;

            if (isign == PLUS)
            {
                /* XUP */
                d[0] += hx[0];
                d[1] += hx[1];
                d[2] += hx[6];
                d[3] += hx[7];
                d[4] += hx[7];
                d[5] -= hx[6];
                d[6] += hx[1];
                d[7] -= hx[0];

                /* YUP */
                d[0] += hy[0];
                d[1] += hy[1];
                d[2] += hy[6];
                d[3] += hy[7];
                d[4] += hy[6];
                d[5] += hy[7];
                d[6] -= hy[0];
                d[7] -= hy[1];

                /* ZUP */
                d[0] += hz[0];
                d[1] += hz[1];
                d[2] += hz[6];
                d[3] += hz[7];
                d[4] += hz[1];
                d[5] -= hz[0];
                d[6] -= hz[7];
                d[7] += hz[6];

                /* TUP */
                d[0] += ht[0];
                d[1] += ht[1];
                d[2] += ht[6];
                d[3] += ht[7];
                d[4] += ht[0];
                d[5] += ht[1];
                d[6] += ht[6];
                d[7] += ht[7];
            }
            else
            {
                /* XDOWN */
                d[0] += hx[0];
                d[1] += hx[1];
                d[2] += hx[6];
                d[3] += hx[7];
                d[4] -= hx[7];
                d[5] += hx[6];
                d[6] -= hx[1];
                d[7] += hx[0];

                /* YDOWN */
                d[0] += hy[0];
                d[1] += hy[1];
                d[2] += hy[6];
                d[3] += hy[7];
                d[4] -= hy[6];
                d[5] -= hy[7];
                d[6] += hy[0];
                d[7] += hy[1];

                /* ZDOWN */
                d[0] += hz[0];
                d[1] += hz[1];
                d[2] += hz[6];
                d[3] += hz[7];
                d[4] -= hz[1];
                d[5] += hz[0];
                d[6] += hz[7];
                d[7] -= hz[6];

                /* TDOWN */
                d[0] += ht[0];
                d[1] += ht[1];
                d[2] += ht[6];
                d[3] += ht[7];
                d[4] -= ht[0];
                d[5] -= ht[1];
                d[6] -= ht[6];
                d[7] -= ht[7];
            }
        }
    }
}

/*
 * Single-precision dslash driver: fills dest on sites of `parity` from src,
 * in four compute stages interleaved with two sets of gathers.
 *
 *   1. latutil_dslash0_32 projects src (opposite parity, sign isign) into
 *      htmp_32[XUP..TUP]; gathers of those buffers are started.
 *   2. latutil_dslash1_32 projects src (opposite parity, sign -isign),
 *      multiplies by the adjoint links in gauge_32 and stores into
 *      htmp_32[XDOWN..TDOWN]; gathers of those buffers are started.
 *   3. After the first set of gathers completes, latutil_dslash2_32
 *      multiplies the gathered half vectors by the links and overwrites dest.
 *   4. After the second set completes, latutil_dslash3_32 accumulates the
 *      remaining gathered half vectors into dest.
 *
 * Each gather moves 12 floats per site.  tag[] is indexed by direction, both
 * dir and OPP_DIR(dir).
 * NOTE(review): the exact semantics of start_gather_from_temp /
 * wait_gather_KE / cleanup_gather are defined elsewhere -- presumably the
 * usual start / wait / release gather protocol; confirm.
 */
void dslash_32( float * src, float * dest, int isign, int parity )
{
    msg_tag *tag[8];
    int mu;

    /* stage 1: projections for the first four directions */
    latutil_dslash0_32(src,
            htmp_32[XUP], htmp_32[YUP], htmp_32[ZUP], htmp_32[TUP],
            isign, OPP_PAR(parity));

    for (mu = XUP; mu <= TUP; mu++) {
        tag[mu] = start_gather_from_temp(htmp_32[mu], 12*sizeof(float),
                12*sizeof(float), mu, parity, gen_pt[mu] );
    }

    /* stage 2: projections times adjoint links for the opposite directions */
    latutil_dslash1_32(src, gauge_32,
            htmp_32[XDOWN], htmp_32[YDOWN], htmp_32[ZDOWN], htmp_32[TDOWN],
            -isign, OPP_PAR(parity));

    for (mu = XUP; mu <= TUP; mu++) {
        tag[OPP_DIR(mu)] = start_gather_from_temp(htmp_32[OPP_DIR(mu)],
                12*sizeof(float), 12*sizeof(float), OPP_DIR(mu),
                parity, gen_pt[OPP_DIR(mu)] );
    }

    /* stage 3: first set of gathers -> overwrite dest */
    for (mu = XUP; mu <= TUP; mu++) {
        wait_gather_KE(tag[mu]);
    }

    latutil_dslash2_32(dest, gauge_32,
            gen_pt[XUP], gen_pt[YUP], gen_pt[ZUP], gen_pt[TUP],
            isign, parity);

    for (mu = XUP; mu <= TUP; mu++) {
        cleanup_gather(tag[mu]);
    }

    /* stage 4: second set of gathers -> accumulate into dest */
    for (mu = XUP; mu <= TUP; mu++) {
        wait_gather_KE(tag[OPP_DIR(mu)]);
    }

    latutil_dslash3_32(dest,
            gen_pt[XDOWN], gen_pt[YDOWN], gen_pt[ZDOWN], gen_pt[TDOWN],
            -isign, parity);

    for (mu = XUP; mu <= TUP; mu++) {
        cleanup_gather(tag[OPP_DIR(mu)]);
    }
}

extern __targetConst__ int t_sites_on_node;



/*
 * Multiply a 3x3 complex matrix by a half vector (3 x 2 complex entries),
 * for a chunk of VVL consecutive sites starting at isite.
 *
 *   matstart  : matrix field, read through SU3MI(site,row,col,idirmat,part)
 *   idirmat   : direction index passed to SU3MI to select the matrix
 *   srcstart  : source field, read through HWVI(site,i,j,part), i<3, j<2
 *   deststart : output, written through HWVLI(iv,i,j,part) -- note that this
 *               index takes only the lane iv, not the site, so deststart is
 *               addressed relative to the chunk
 *   isite     : first site of the chunk
 *
 * Result: dest(i,j) = sum over k of mat(i,k) * src(k,j), with the real and
 * imaginary parts written out explicitly for all 3 x 2 entries.
 *
 * NOTE(review): __targetILP__(iv) presumably expands to a loop of iv over
 * [0, VVL) applied to the statement that follows; confirm against the
 * targetDP header.  The first index (0..2) is presumably color and the
 * second (0..1) the half-spin component -- confirm against HWVI.
 */
inline __target__ void mult_su3_mat_hwvec_tdp_inline(  const double* __restrict__ matstart, int idirmat,   const double* __restrict__ srcstart,   double* deststart, int isite )
{

  int i,j;
  int iv=0;

  // load the matrix for every lane into a local copy
  double matloc[3][3][2][VVL];
  for(i=0;i<3;i++){
    for(j=0;j<3;j++){
      __targetILP__(iv) matloc[i][j][REPART][iv]=matstart[SU3MI(isite+iv,i,j,idirmat,REPART)];
      __targetILP__(iv) matloc[i][j][IMPART][iv]=matstart[SU3MI(isite+iv,i,j,idirmat,IMPART)];
    }
  }


  // load the source half vector for every lane into a local copy
  double srcloc[3][2][2][VVL];
  for(i=0;i<3;i++){
    for(j=0;j<2;j++){
      __targetILP__(iv) srcloc[i][j][REPART][iv]=srcstart[HWVI(isite+iv,i,j,REPART)];
      __targetILP__(iv) srcloc[i][j][IMPART][iv]=srcstart[HWVI(isite+iv,i,j,IMPART)];
    }
  }


  // second index 0: rows 0, 1, 2 of the matrix applied to src(.,0)
  // real part: sum of (re*re - im*im); imaginary part: sum of (re*im + im*re)
  __targetILP__(iv) deststart[HWVLI(iv,0,0,REPART)] = 
     matloc[0][0][REPART][iv] * srcloc[0][0][REPART][iv]
       -  matloc[0][0][IMPART][iv] * srcloc[0][0][IMPART][iv]
       +  matloc[0][1][REPART][iv] * srcloc[1][0][REPART][iv]
       -   matloc[0][1][IMPART][iv] *  srcloc[1][0][IMPART][iv]
       +   matloc[0][2][REPART][iv] * srcloc[2][0][REPART][iv]
     -  matloc[0][2][IMPART][iv] * srcloc[2][0][IMPART][iv];

    __targetILP__(iv) deststart[HWVLI(iv,0,0,IMPART)] = 
      matloc[0][0][REPART][iv] * srcloc[0][0][IMPART][iv]
      +  matloc[0][0][IMPART][iv] * srcloc[0][0][REPART][iv]
      +  matloc[0][1][REPART][iv] *  srcloc[1][0][IMPART][iv]
      +   matloc[0][1][IMPART][iv] * srcloc[1][0][REPART][iv]
      +  matloc[0][2][REPART][iv] * srcloc[2][0][IMPART][iv]
      +  matloc[0][2][IMPART][iv] * srcloc[2][0][REPART][iv];



    __targetILP__(iv) deststart[HWVLI(iv,1,0,REPART)] = 
       matloc[1][0][REPART][iv] * srcloc[0][0][REPART][iv]
      -  matloc[1][0][IMPART][iv] * srcloc[0][0][IMPART][iv]
      +  matloc[1][1][REPART][iv] * srcloc[1][0][REPART][iv]
      -   matloc[1][1][IMPART][iv] *  srcloc[1][0][IMPART][iv]
      +   matloc[1][2][REPART][iv] * srcloc[2][0][REPART][iv]
      -  matloc[1][2][IMPART][iv] * srcloc[2][0][IMPART][iv];

    __targetILP__(iv) deststart[HWVLI(iv,1,0,IMPART)] = 
       matloc[1][0][REPART][iv] * srcloc[0][0][IMPART][iv]
      +  matloc[1][0][IMPART][iv] * srcloc[0][0][REPART][iv]
      +  matloc[1][1][REPART][iv] *  srcloc[1][0][IMPART][iv]
      +   matloc[1][1][IMPART][iv] * srcloc[1][0][REPART][iv]
      +  matloc[1][2][REPART][iv] * srcloc[2][0][IMPART][iv]
      +  matloc[1][2][IMPART][iv] * srcloc[2][0][REPART][iv];


    __targetILP__(iv) deststart[HWVLI(iv,2,0,REPART)] = 
       matloc[2][0][REPART][iv] * srcloc[0][0][REPART][iv]
      -  matloc[2][0][IMPART][iv] * srcloc[0][0][IMPART][iv]
      +  matloc[2][1][REPART][iv] * srcloc[1][0][REPART][iv]
      -   matloc[2][1][IMPART][iv] *  srcloc[1][0][IMPART][iv]
      +   matloc[2][2][REPART][iv] * srcloc[2][0][REPART][iv]
      -  matloc[2][2][IMPART][iv] * srcloc[2][0][IMPART][iv];

    __targetILP__(iv) deststart[HWVLI(iv,2,0,IMPART)] = 
       matloc[2][0][REPART][iv] * srcloc[0][0][IMPART][iv]
      +  matloc[2][0][IMPART][iv] * srcloc[0][0][REPART][iv]
      +  matloc[2][1][REPART][iv] *  srcloc[1][0][IMPART][iv]
      +   matloc[2][1][IMPART][iv] * srcloc[1][0][REPART][iv]
      +  matloc[2][2][REPART][iv] * srcloc[2][0][IMPART][iv]
      +  matloc[2][2][IMPART][iv] * srcloc[2][0][REPART][iv];



    // second index 1: rows 0, 1, 2 of the matrix applied to src(.,1)
    __targetILP__(iv) deststart[HWVLI(iv,0,1,REPART)] = 
       matloc[0][0][REPART][iv] * srcloc[0][1][REPART][iv]
      -  matloc[0][0][IMPART][iv] * srcloc[0][1][IMPART][iv]
      +  matloc[0][1][REPART][iv] * srcloc[1][1][REPART][iv]
      -   matloc[0][1][IMPART][iv] *  srcloc[1][1][IMPART][iv]
      +   matloc[0][2][REPART][iv]* srcloc[2][1][REPART][iv]
      -  matloc[0][2][IMPART][iv] * srcloc[2][1][IMPART][iv];

    __targetILP__(iv) deststart[HWVLI(iv,0,1,IMPART)] = 
       matloc[0][0][REPART][iv] * srcloc[0][1][IMPART][iv]
      +  matloc[0][0][IMPART][iv] * srcloc[0][1][REPART][iv]
      +  matloc[0][1][REPART][iv] *  srcloc[1][1][IMPART][iv]
      +   matloc[0][1][IMPART][iv] * srcloc[1][1][REPART][iv]
      +  matloc[0][2][REPART][iv] * srcloc[2][1][IMPART][iv]
      +  matloc[0][2][IMPART][iv] * srcloc[2][1][REPART][iv];


    __targetILP__(iv) deststart[HWVLI(iv,1,1,REPART)] = 
       matloc[1][0][REPART][iv] * srcloc[0][1][REPART][iv]
      -  matloc[1][0][IMPART][iv] * srcloc[0][1][IMPART][iv]
      +  matloc[1][1][REPART][iv] * srcloc[1][1][REPART][iv]
      -   matloc[1][1][IMPART][iv] *  srcloc[1][1][IMPART][iv]
      +   matloc[1][2][REPART][iv] * srcloc[2][1][REPART][iv]
      -  matloc[1][2][IMPART][iv] * srcloc[2][1][IMPART][iv];

    __targetILP__(iv) deststart[HWVLI(iv,1,1,IMPART)] = 
       matloc[1][0][REPART][iv] * srcloc[0][1][IMPART][iv]
      +  matloc[1][0][IMPART][iv] * srcloc[0][1][REPART][iv]
      +  matloc[1][1][REPART][iv] *  srcloc[1][1][IMPART][iv]
      +   matloc[1][1][IMPART][iv] * srcloc[1][1][REPART][iv]
      +  matloc[1][2][REPART][iv] * srcloc[2][1][IMPART][iv]
      +  matloc[1][2][IMPART][iv] * srcloc[2][1][REPART][iv];


    __targetILP__(iv) deststart[HWVLI(iv,2,1,REPART)] = 
       matloc[2][0][REPART][iv] * srcloc[0][1][REPART][iv]
      -  matloc[2][0][IMPART][iv] * srcloc[0][1][IMPART][iv]
      +  matloc[2][1][REPART][iv] * srcloc[1][1][REPART][iv]
      -   matloc[2][1][IMPART][iv] *  srcloc[1][1][IMPART][iv]
      +   matloc[2][2][REPART][iv] * srcloc[2][1][REPART][iv]
      -  matloc[2][2][IMPART][iv] * srcloc[2][1][IMPART][iv];

    __targetILP__(iv) deststart[HWVLI(iv,2,1,IMPART)] = 
       matloc[2][0][REPART][iv] * srcloc[0][1][IMPART][iv]
      +  matloc[2][0][IMPART][iv] * srcloc[0][1][REPART][iv]
      +  matloc[2][1][REPART][iv] *  srcloc[1][1][IMPART][iv]
      +   matloc[2][1][IMPART][iv] * srcloc[1][1][REPART][iv]
      + matloc[2][2][REPART][iv] * srcloc[2][1][IMPART][iv]
      +  matloc[2][2][IMPART][iv] * srcloc[2][1][REPART][iv];


}

inline __target__ void mult_adj_su3_mat_hwvec_tdp_inline(  double* matstart, int idirmat,   double* srcstart,   double* deststart, int isite )
{

  int i,j;
  int iv=0;


  // load matrix into temporary data structure
  double matloc[3][3][2][VVL];
  for(i=0;i<3;i++){
    for(j=0;j<3;j++){
      __targetILP__(iv) matloc[i][j][REPART][iv]=matstart[SU3MI(isite+iv,i,j,idirmat,REPART)];
      __targetILP__(iv) matloc[i][j][IMPART][iv]=matstart[SU3MI(isite+iv,i,j,idirmat,IMPART)];
    }
  }


  // perform computation
  __targetILP__(iv) deststart[HWVI(isite+iv,0,0,REPART)] = 
    matloc[0][0][REPART][iv] * srcstart[HWVLI(iv,0,0,REPART)]
    +  matloc[0][0][IMPART][iv] * srcstart[HWVLI(iv,0,0,IMPART)]
    +  matloc[1][0][REPART][iv] * srcstart[HWVLI(iv,1,0,REPART)]
    +   matloc[1][0][IMPART][iv] *  srcstart[HWVLI(iv,1,0,IMPART)]
    +   matloc[2][0][REPART][iv] * srcstart[HWVLI(iv,2,0,REPART)]
    +  matloc[2][0][IMPART][iv] * srcstart[HWVLI(iv,2,0,IMPART)];
  
					
  
  __targetILP__(iv) deststart[HWVI(isite+iv,0,0,IMPART)] = 
    matloc[0][0][REPART][iv] * srcstart[HWVLI(iv,0,0,IMPART)]
    -  matloc[0][0][IMPART][iv] * srcstart[HWVLI(iv,0,0,REPART)]
    +  matloc[1][0][REPART][iv] *  srcstart[HWVLI(iv,1,0,IMPART)]
    -   matloc[1][0][IMPART][iv] * srcstart[HWVLI(iv,1,0,REPART)]
    +  matloc[2][0][REPART][iv] * srcstart[HWVLI(iv,2,0,IMPART)]
    -  matloc[2][0][IMPART][iv] * srcstart[HWVLI(iv,2,0,REPART)];
  
  
  
  __targetILP__(iv) deststart[HWVI(isite+iv,1,0,REPART)] = 
    matloc[0][1][REPART][iv] * srcstart[HWVLI(iv,0,0,REPART)]
    +  matloc[0][1][IMPART][iv] * srcstart[HWVLI(iv,0,0,IMPART)]
    +  matloc[1][1][REPART][iv] * srcstart[HWVLI(iv,1,0,REPART)]
    +   matloc[1][1][IMPART][iv] *  srcstart[HWVLI(iv,1,0,IMPART)]
    +   matloc[2][1][REPART][iv] * srcstart[HWVLI(iv,2,0,REPART)]
    +  matloc[2][1][IMPART][iv] * srcstart[HWVLI(iv,2,0,IMPART)];
  
  __targetILP__(iv) deststart[HWVI(isite+iv,1,0,IMPART)] = 
    matloc[0][1][REPART][iv] * srcstart[HWVLI(iv,0,0,IMPART)]
    -  matloc[0][1][IMPART][iv] * srcstart[HWVLI(iv,0,0,REPART)]
    +  matloc[1][1][REPART][iv] *  srcstart[HWVLI(iv,1,0,IMPART)]
    -   matloc[1][1][IMPART][iv] * srcstart[HWVLI(iv,1,0,REPART)]
    +  matloc[2][1][REPART][iv] * srcstart[HWVLI(iv,2,0,IMPART)]
    -  matloc[2][1][IMPART][iv] * srcstart[HWVLI(iv,2,0,REPART)];
  
  
  __targetILP__(iv) deststart[HWVI(isite+iv,2,0,REPART)] = 
    matloc[0][2][REPART][iv] * srcstart[HWVLI(iv,0,0,REPART)]
    +  matloc[0][2][IMPART][iv] * srcstart[HWVLI(iv,0,0,IMPART)]
    +  matloc[1][2][REPART][iv] * srcstart[HWVLI(iv,1,0,REPART)]
    +   matloc[1][2][IMPART][iv] *  srcstart[HWVLI(iv,1,0,IMPART)]
    +   matloc[2][2][REPART][iv] * srcstart[HWVLI(iv,2,0,REPART)]
    +  matloc[2][2][IMPART][iv] * srcstart[HWVLI(iv,2,0,IMPART)];
  
  __targetILP__(iv) deststart[HWVI(isite+iv,2,0,IMPART)] = 
    matloc[0][2][REPART][iv] * srcstart[HWVLI(iv,0,0,IMPART)]
    -  matloc[0][2][IMPART][iv] * srcstart[HWVLI(iv,0,0,REPART)]
    +  matloc[1][2][REPART][iv] *  srcstart[HWVLI(iv,1,0,IMPART)]
    -   matloc[1][2][IMPART][iv] * srcstart[HWVLI(iv,1,0,REPART)]
    +  matloc[2][2][REPART][iv] * srcstart[HWVLI(iv,2,0,IMPART)]
    -  matloc[2][2][IMPART][iv] * srcstart[HWVLI(iv,2,0,REPART)];
  
  
  
  __targetILP__(iv) deststart[HWVI(isite+iv,0,1,REPART)] = 
    matloc[0][0][REPART][iv] * srcstart[HWVLI(iv,0,1,REPART)]
    +  matloc[0][0][IMPART][iv] * srcstart[HWVLI(iv,0,1,IMPART)]
    +  matloc[1][0][REPART][iv] * srcstart[HWVLI(iv,1,1,REPART)]
    +   matloc[1][0][IMPART][iv] *  srcstart[HWVLI(iv,1,1,IMPART)]
    +   matloc[2][0][REPART][iv]* srcstart[HWVLI(iv,2,1,REPART)]
    +  matloc[2][0][IMPART][iv] * srcstart[HWVLI(iv,2,1,IMPART)];
  
  __targetILP__(iv) deststart[HWVI(isite+iv,0,1,IMPART)] = 
    matloc[0][0][REPART][iv] * srcstart[HWVLI(iv,0,1,IMPART)]
    -  matloc[0][0][IMPART][iv] * srcstart[HWVLI(iv,0,1,REPART)]
    +  matloc[1][0][REPART][iv] *  srcstart[HWVLI(iv,1,1,IMPART)]
    -   matloc[1][0][IMPART][iv] * srcstart[HWVLI(iv,1,1,REPART)]
    +  matloc[2][0][REPART][iv] * srcstart[HWVLI(iv,2,1,IMPART)]
    -  matloc[2][0][IMPART][iv] * srcstart[HWVLI(iv,2,1,REPART)];
  
  
  __targetILP__(iv) deststart[HWVI(isite+iv,1,1,REPART)] = 
    matloc[0][1][REPART][iv] * srcstart[HWVLI(iv,0,1,REPART)]
    +  matloc[0][1][IMPART][iv] * srcstart[HWVLI(iv,0,1,IMPART)]
    +  matloc[1][1][REPART][iv] * srcstart[HWVLI(iv,1,1,REPART)]
    +   matloc[1][1][IMPART][iv] *  srcstart[HWVLI(iv,1,1,IMPART)]
    +   matloc[2][1][REPART][iv] * srcstart[HWVLI(iv,2,1,REPART)]
    +  matloc[2][1][IMPART][iv] * srcstart[HWVLI(iv,2,1,IMPART)];
  
  __targetILP__(iv) deststart[HWVI(isite+iv,1,1,IMPART)] = 
    matloc[0][1][REPART][iv] * srcstart[HWVLI(iv,0,1,IMPART)]
    -  matloc[0][1][IMPART][iv] * srcstart[HWVLI(iv,0,1,REPART)]
    +  matloc[1][1][REPART][iv] *  srcstart[HWVLI(iv,1,1,IMPART)]
    -   matloc[1][1][IMPART][iv] * srcstart[HWVLI(iv,1,1,REPART)]
    +  matloc[2][1][REPART][iv] * srcstart[HWVLI(iv,2,1,IMPART)]
    -  matloc[2][1][IMPART][iv] * srcstart[HWVLI(iv,2,1,REPART)];
  
  
  __targetILP__(iv) deststart[HWVI(isite+iv,2,1,REPART)] = 
    matloc[0][2][REPART][iv] * srcstart[HWVLI(iv,0,1,REPART)]
    +  matloc[0][2][IMPART][iv] * srcstart[HWVLI(iv,0,1,IMPART)]
    +  matloc[1][2][REPART][iv] * srcstart[HWVLI(iv,1,1,REPART)]
    +   matloc[1][2][IMPART][iv] *  srcstart[HWVLI(iv,1,1,IMPART)]
    +   matloc[2][2][REPART][iv] * srcstart[HWVLI(iv,2,1,REPART)]
    +  matloc[2][2][IMPART][iv] * srcstart[HWVLI(iv,2,1,IMPART)];
  
  __targetILP__(iv) deststart[HWVI(isite+iv,2,1,IMPART)] = 
    matloc[0][2][REPART][iv] * srcstart[HWVLI(iv,0,1,IMPART)]
    -  matloc[0][2][IMPART][iv] * srcstart[HWVLI(iv,0,1,REPART)]
    +  matloc[1][2][REPART][iv] *  srcstart[HWVLI(iv,1,1,IMPART)]
    -   matloc[1][2][IMPART][iv] * srcstart[HWVLI(iv,1,1,REPART)]
    +  matloc[2][2][REPART][iv] * srcstart[HWVLI(iv,2,1,IMPART)]
    -  matloc[2][2][IMPART][iv] * srcstart[HWVLI(iv,2,1,REPART)];
  
}


/*
 * shrink1_lattice
 *
 * For every site index isite+iv visited by the thread-level / ILP loops,
 * read the four spin components (three colours each) of the source Wilson
 * vector and build the X, Y, Z and T half-vector combinations (the step the
 * original comments label wp_shrink).  The four results are stored in
 * t_htmp[XUP], t_htmp[YUP], t_htmp[ZUP] and t_htmp[TUP].
 *
 *   src     source field, addressed as a flat double array through WVI()
 *   isign   PLUS selects the "UP" combinations, any other value the "DOWN" ones
 *   parity  part of the interface; not read by this kernel
 *   t_htmp  table of per-direction output arrays, addressed through HWVI()
 *
 * NOTE(review): there is no tail guard on isite+iv, so this presumably relies
 * on t_sites_on_node being a multiple of VVL -- confirm.
 */
__targetEntry__  void shrink1_lattice(const wilson_vector* __restrict__ src, int isign, int parity, half_wilson_vector** t_htmp){

  int isite;

  __targetTLP__(isite,t_sites_on_node)
    {
      int iv=0;
      int col,spin,part;

      /* chunk-local half vectors, one scratch buffer per direction */
      double projx[VVL*3*2*2];
      double projy[VVL*3*2*2];
      double projz[VVL*3*2*2];
      double projt[VVL*3*2*2];

      double* field = (double*) &(src[0]);

      for ( col = 0; col < 3; col++ ){

        /* real/imaginary parts of spin components 0..3 for colour `col` */
        double s0r[VVL], s0i[VVL];
        double s1r[VVL], s1i[VVL];
        double s2r[VVL], s2i[VVL];
        double s3r[VVL], s3i[VVL];

        __targetILP__(iv) {
          s0r[iv] = field[WVI(isite+iv,col,0,REPART)];
          s0i[iv] = field[WVI(isite+iv,col,0,IMPART)];
          s1r[iv] = field[WVI(isite+iv,col,1,REPART)];
          s1i[iv] = field[WVI(isite+iv,col,1,IMPART)];
          s2r[iv] = field[WVI(isite+iv,col,2,REPART)];
          s2i[iv] = field[WVI(isite+iv,col,2,IMPART)];
          s3r[iv] = field[WVI(isite+iv,col,3,REPART)];
          s3i[iv] = field[WVI(isite+iv,col,3,IMPART)];
        }

        /* isign is constant for the whole kernel, so one test covers all
           four directions */
        if( isign == PLUS )
          {
            /* XUP */
            __targetILP__(iv) {
              projx[HWVLI(iv,col,0,REPART)] = s0r[iv] - s3i[iv];
              projx[HWVLI(iv,col,0,IMPART)] = s0i[iv] + s3r[iv];
              projx[HWVLI(iv,col,1,REPART)] = s1r[iv] - s2i[iv];
              projx[HWVLI(iv,col,1,IMPART)] = s1i[iv] + s2r[iv];
            }

            /* YUP */
            __targetILP__(iv) {
              projy[HWVLI(iv,col,0,REPART)] = s0r[iv] - s3r[iv];
              projy[HWVLI(iv,col,0,IMPART)] = s0i[iv] - s3i[iv];
              projy[HWVLI(iv,col,1,REPART)] = s1r[iv] + s2r[iv];
              projy[HWVLI(iv,col,1,IMPART)] = s1i[iv] + s2i[iv];
            }

            /* ZUP */
            __targetILP__(iv) {
              projz[HWVLI(iv,col,0,REPART)] = s0r[iv] - s2i[iv];
              projz[HWVLI(iv,col,0,IMPART)] = s0i[iv] + s2r[iv];
              projz[HWVLI(iv,col,1,REPART)] = s1r[iv] + s3i[iv];
              projz[HWVLI(iv,col,1,IMPART)] = s1i[iv] - s3r[iv];
            }

            /* TUP */
            __targetILP__(iv) {
              projt[HWVLI(iv,col,0,REPART)] = s0r[iv] + s2r[iv];
              projt[HWVLI(iv,col,0,IMPART)] = s0i[iv] + s2i[iv];
              projt[HWVLI(iv,col,1,REPART)] = s1r[iv] + s3r[iv];
              projt[HWVLI(iv,col,1,IMPART)] = s1i[iv] + s3i[iv];
            }
          }
        else
          {
            /* XDOWN */
            __targetILP__(iv) {
              projx[HWVLI(iv,col,0,REPART)] = s0r[iv] + s3i[iv];
              projx[HWVLI(iv,col,0,IMPART)] = s0i[iv] - s3r[iv];
              projx[HWVLI(iv,col,1,REPART)] = s1r[iv] + s2i[iv];
              projx[HWVLI(iv,col,1,IMPART)] = s1i[iv] - s2r[iv];
            }

            /* YDOWN */
            __targetILP__(iv) {
              projy[HWVLI(iv,col,0,REPART)] = s0r[iv] + s3r[iv];
              projy[HWVLI(iv,col,0,IMPART)] = s0i[iv] + s3i[iv];
              projy[HWVLI(iv,col,1,REPART)] = s1r[iv] - s2r[iv];
              projy[HWVLI(iv,col,1,IMPART)] = s1i[iv] - s2i[iv];
            }

            /* ZDOWN */
            __targetILP__(iv) {
              projz[HWVLI(iv,col,0,REPART)] = s0r[iv] + s2i[iv];
              projz[HWVLI(iv,col,0,IMPART)] = s0i[iv] - s2r[iv];
              projz[HWVLI(iv,col,1,REPART)] = s1r[iv] - s3i[iv];
              projz[HWVLI(iv,col,1,IMPART)] = s1i[iv] + s3r[iv];
            }

            /* TDOWN */
            __targetILP__(iv) {
              projt[HWVLI(iv,col,0,REPART)] = s0r[iv] - s2r[iv];
              projt[HWVLI(iv,col,0,IMPART)] = s0i[iv] - s2i[iv];
              projt[HWVLI(iv,col,1,REPART)] = s1r[iv] - s3r[iv];
              projt[HWVLI(iv,col,1,IMPART)] = s1i[iv] - s3i[iv];
            }
          }
      }

      /* scatter the chunk-local buffers into the four lattice-wide arrays */
      {
        double* outx = (double*) &(t_htmp[XUP][0]);
        double* outy = (double*) &(t_htmp[YUP][0]);
        double* outz = (double*) &(t_htmp[ZUP][0]);
        double* outt = (double*) &(t_htmp[TUP][0]);

        for ( col = 0; col < 3; col++ ){
          for ( spin = 0; spin < 2; spin++ ){
            for ( part = 0; part < 2; part++ ){
              __targetILP__(iv) outx[HWVI(isite+iv,col,spin,part)]=projx[HWVLI(iv,col,spin,part)];
              __targetILP__(iv) outy[HWVI(isite+iv,col,spin,part)]=projy[HWVLI(iv,col,spin,part)];
              __targetILP__(iv) outz[HWVI(isite+iv,col,spin,part)]=projz[HWVLI(iv,col,spin,part)];
              __targetILP__(iv) outt[HWVI(isite+iv,col,spin,part)]=projt[HWVLI(iv,col,spin,part)];
            }
          }
        }
      }

    }

  return;

}

/*
 * shrink2_lattice
 *
 * Builds the same four half-vector combinations as shrink1_lattice, but
 * selects "UP" when -isign == PLUS.  After each direction's half vector is
 * assembled in a scratch buffer, it is passed with the gauge field and the
 * direction index (XUP..TUP) to mult_adj_su3_mat_hwvec_tdp_inline, whose
 * destination argument is t_htmp[XDOWN], t_htmp[YDOWN], t_htmp[ZDOWN] or
 * t_htmp[TDOWN] respectively.
 *
 *   src      source field, addressed as a flat double array through WVI()
 *   isign    sign selector; tested as -isign == PLUS
 *   parity   part of the interface; not read by this kernel
 *   t_htmp   table of per-direction output arrays
 *   t_gauge  gauge field handed unchanged to the multiply helper
 *
 * NOTE(review): the helper presumably applies the adjoint link matrix of the
 * given direction -- its definition is outside this block, confirm there.
 */
__targetEntry__  void shrink2_lattice(const wilson_vector* __restrict__ src, int isign, int parity, half_wilson_vector**  t_htmp, const su3_matrix* __restrict__ t_gauge){

  int isite;

  __targetTLP__(isite,t_sites_on_node)
    {
      int iv=0;
      int col;

      /* scratch half vector, refilled for every direction */
      double half[VVL*3*2*2];

      double* field = (double*) &(src[0]);
      double* links = (double*) &(t_gauge[0]);

      /* evaluated once: isign does not change inside the kernel */
      const int up = ( -isign == PLUS );

      /* real/imaginary parts of spin components 0..3, indexed [colour][lane] */
      double s0r[3][VVL], s0i[3][VVL];
      double s1r[3][VVL], s1i[3][VVL];
      double s2r[3][VVL], s2i[3][VVL];
      double s3r[3][VVL], s3i[3][VVL];

      for ( col = 0; col < 3; col++ ){
        __targetILP__(iv) {
          s0r[col][iv] = field[WVI(isite+iv,col,0,REPART)];
          s0i[col][iv] = field[WVI(isite+iv,col,0,IMPART)];
          s1r[col][iv] = field[WVI(isite+iv,col,1,REPART)];
          s1i[col][iv] = field[WVI(isite+iv,col,1,IMPART)];
          s2r[col][iv] = field[WVI(isite+iv,col,2,REPART)];
          s2i[col][iv] = field[WVI(isite+iv,col,2,IMPART)];
          s3r[col][iv] = field[WVI(isite+iv,col,3,REPART)];
          s3i[col][iv] = field[WVI(isite+iv,col,3,IMPART)];
        }
      }

      /* ---- X direction ---- */
      if ( up )
        {
          /* XUP */
          for ( col = 0; col < 3; col++ ){
            __targetILP__(iv) {
              half[HWVLI(iv,col,0,REPART)] = s0r[col][iv] - s3i[col][iv];
              half[HWVLI(iv,col,0,IMPART)] = s0i[col][iv] + s3r[col][iv];
              half[HWVLI(iv,col,1,REPART)] = s1r[col][iv] - s2i[col][iv];
              half[HWVLI(iv,col,1,IMPART)] = s1i[col][iv] + s2r[col][iv];
            }
          }
        }
      else
        {
          /* XDOWN */
          for ( col = 0; col < 3; col++ ){
            __targetILP__(iv) {
              half[HWVLI(iv,col,0,REPART)] = s0r[col][iv] + s3i[col][iv];
              half[HWVLI(iv,col,0,IMPART)] = s0i[col][iv] - s3r[col][iv];
              half[HWVLI(iv,col,1,REPART)] = s1r[col][iv] + s2i[col][iv];
              half[HWVLI(iv,col,1,IMPART)] = s1i[col][iv] - s2r[col][iv];
            }
          }
        }

      mult_adj_su3_mat_hwvec_tdp_inline( links, XUP,
                                         (double*) half,
                                         (double*) &(t_htmp[XDOWN][0]), isite );

      /* ---- Y direction ---- */
      if ( up )
        {
          /* YUP */
          for ( col = 0; col < 3; col++ ){
            __targetILP__(iv) {
              half[HWVLI(iv,col,0,REPART)] = s0r[col][iv] - s3r[col][iv];
              half[HWVLI(iv,col,0,IMPART)] = s0i[col][iv] - s3i[col][iv];
              half[HWVLI(iv,col,1,REPART)] = s1r[col][iv] + s2r[col][iv];
              half[HWVLI(iv,col,1,IMPART)] = s1i[col][iv] + s2i[col][iv];
            }
          }
        }
      else
        {
          /* YDOWN */
          for ( col = 0; col < 3; col++ ){
            __targetILP__(iv) {
              half[HWVLI(iv,col,0,REPART)] = s0r[col][iv] + s3r[col][iv];
              half[HWVLI(iv,col,0,IMPART)] = s0i[col][iv] + s3i[col][iv];
              half[HWVLI(iv,col,1,REPART)] = s1r[col][iv] - s2r[col][iv];
              half[HWVLI(iv,col,1,IMPART)] = s1i[col][iv] - s2i[col][iv];
            }
          }
        }

      mult_adj_su3_mat_hwvec_tdp_inline( links, YUP,
                                         (double*) half,
                                         (double*) &(t_htmp[YDOWN][0]), isite );

      /* ---- Z direction ---- */
      if ( up )
        {
          /* ZUP */
          for ( col = 0; col < 3; col++ ){
            __targetILP__(iv) {
              half[HWVLI(iv,col,0,REPART)] = s0r[col][iv] - s2i[col][iv];
              half[HWVLI(iv,col,0,IMPART)] = s0i[col][iv] + s2r[col][iv];
              half[HWVLI(iv,col,1,REPART)] = s1r[col][iv] + s3i[col][iv];
              half[HWVLI(iv,col,1,IMPART)] = s1i[col][iv] - s3r[col][iv];
            }
          }
        }
      else
        {
          /* ZDOWN */
          for ( col = 0; col < 3; col++ ){
            __targetILP__(iv) {
              half[HWVLI(iv,col,0,REPART)] = s0r[col][iv] + s2i[col][iv];
              half[HWVLI(iv,col,0,IMPART)] = s0i[col][iv] - s2r[col][iv];
              half[HWVLI(iv,col,1,REPART)] = s1r[col][iv] - s3i[col][iv];
              half[HWVLI(iv,col,1,IMPART)] = s1i[col][iv] + s3r[col][iv];
            }
          }
        }

      mult_adj_su3_mat_hwvec_tdp_inline( links, ZUP,
                                         (double*) half,
                                         (double*) &(t_htmp[ZDOWN][0]), isite );

      /* ---- T direction ---- */
      if ( up )
        {
          /* TUP */
          for ( col = 0; col < 3; col++ ){
            __targetILP__(iv) {
              half[HWVLI(iv,col,0,REPART)] = s0r[col][iv] + s2r[col][iv];
              half[HWVLI(iv,col,0,IMPART)] = s0i[col][iv] + s2i[col][iv];
              half[HWVLI(iv,col,1,REPART)] = s1r[col][iv] + s3r[col][iv];
              half[HWVLI(iv,col,1,IMPART)] = s1i[col][iv] + s3i[col][iv];
            }
          }
        }
      else
        {
          /* TDOWN */
          for ( col = 0; col < 3; col++ ){
            __targetILP__(iv) {
              half[HWVLI(iv,col,0,REPART)] = s0r[col][iv] - s2r[col][iv];
              half[HWVLI(iv,col,0,IMPART)] = s0i[col][iv] - s2i[col][iv];
              half[HWVLI(iv,col,1,REPART)] = s1r[col][iv] - s3r[col][iv];
              half[HWVLI(iv,col,1,IMPART)] = s1i[col][iv] - s3i[col][iv];
            }
          }
        }

      mult_adj_su3_mat_hwvec_tdp_inline( links, TUP,
                                         (double*) half,
                                         (double*) &(t_htmp[TDOWN][0]), isite );

    }

  return;

}




/*
 * grow_add1_lattice
 *
 * For each direction X, Y, Z, T in turn, calls
 * mult_su3_mat_hwvec_tdp_inline with the gauge field, the direction index
 * and t_htmp_prime[<dir>UP], receiving a chunk-local half vector.  That half
 * vector is then expanded into the four spin components of a Wilson vector:
 * the X result initialises the accumulators, Y/Z/T are added on top.  The
 * accumulated vector overwrites dest at the end.
 *
 *   dest          output field, written through WVI() indexing
 *   isign         PLUS selects the "UP" expansion, any other value "DOWN"
 *   t_htmp_prime  table of per-direction half-vector input arrays
 *   t_gauge       gauge field handed unchanged to the multiply helper
 */
__targetEntry__  void grow_add1_lattice( wilson_vector *dest, int isign, const double** __restrict__ t_htmp_prime, const su3_matrix* __restrict__ t_gauge){

  int isite;

  __targetTLP__(isite,t_sites_on_node)
    {
      int iv=0;
      int col;

      /* half vector returned by the multiply helper for the current direction */
      double half[VVL*3*2*2];

      double* links = (double*) &(t_gauge[0]);
      double* out = (double*) &(dest[0]);

      /* evaluated once: isign does not change inside the kernel */
      const int up = ( isign == PLUS );

      /* accumulators for spin components 0..3, indexed [colour][lane] */
      double d0r[3][VVL], d0i[3][VVL];
      double d1r[3][VVL], d1i[3][VVL];
      double d2r[3][VVL], d2i[3][VVL];
      double d3r[3][VVL], d3i[3][VVL];

      /* start from zero (the X step below assigns every entry again) */
      for ( col = 0; col < 3; col++ ){
        __targetILP__(iv) {
          d0r[col][iv]=0.;
          d0i[col][iv]=0.;
          d1r[col][iv]=0.;
          d1i[col][iv]=0.;
          d2r[col][iv]=0.;
          d2i[col][iv]=0.;
          d3r[col][iv]=0.;
          d3i[col][iv]=0.;
        }
      }

      /* ---- X direction: plain assignment ---- */
      mult_su3_mat_hwvec_tdp_inline( links, XUP,
                                     (double*) &(t_htmp_prime[XUP][0]),
                                     (double*) half, isite );

      /* components 0 and 1 are copied identically for both signs */
      for ( col = 0; col < 3; col++ ){
        __targetILP__(iv) {
          d0r[col][iv] = half[HWVLI(iv,col,0,REPART)];
          d0i[col][iv] = half[HWVLI(iv,col,0,IMPART)];
          d1r[col][iv] = half[HWVLI(iv,col,1,REPART)];
          d1i[col][iv] = half[HWVLI(iv,col,1,IMPART)];
        }
      }

      if ( up )
        {
          /* XUP */
          for ( col = 0; col < 3; col++ ){
            __targetILP__(iv) {
              d3r[col][iv] = half[HWVLI(iv,col,0,IMPART)];
              d3i[col][iv] = -half[HWVLI(iv,col,0,REPART)];
              d2r[col][iv] = half[HWVLI(iv,col,1,IMPART)];
              d2i[col][iv] = -half[HWVLI(iv,col,1,REPART)];
            }
          }
        }
      else
        {
          /* XDOWN */
          for ( col = 0; col < 3; col++ ){
            __targetILP__(iv) {
              d3r[col][iv] = -half[HWVLI(iv,col,0,IMPART)];
              d3i[col][iv] = half[HWVLI(iv,col,0,REPART)];
              d2r[col][iv] = -half[HWVLI(iv,col,1,IMPART)];
              d2i[col][iv] = half[HWVLI(iv,col,1,REPART)];
            }
          }
        }

      /* ---- Y direction: accumulate ---- */
      mult_su3_mat_hwvec_tdp_inline( links, YUP,
                                     (double*) &(t_htmp_prime[YUP][0]),
                                     (double*) half, isite );

      for ( col = 0; col < 3; col++ ){
        __targetILP__(iv) {
          d0r[col][iv] += half[HWVLI(iv,col,0,REPART)];
          d0i[col][iv] += half[HWVLI(iv,col,0,IMPART)];
          d1r[col][iv] += half[HWVLI(iv,col,1,REPART)];
          d1i[col][iv] += half[HWVLI(iv,col,1,IMPART)];
        }
      }

      if ( up )
        {
          /* YUP */
          for ( col = 0; col < 3; col++ ){
            __targetILP__(iv) {
              d2r[col][iv] += half[HWVLI(iv,col,1,REPART)];
              d2i[col][iv] += half[HWVLI(iv,col,1,IMPART)];
              d3r[col][iv] -= half[HWVLI(iv,col,0,REPART)];
              d3i[col][iv] -= half[HWVLI(iv,col,0,IMPART)];
            }
          }
        }
      else
        {
          /* YDOWN */
          for ( col = 0; col < 3; col++ ){
            __targetILP__(iv) {
              d2r[col][iv] -= half[HWVLI(iv,col,1,REPART)];
              d2i[col][iv] -= half[HWVLI(iv,col,1,IMPART)];
              d3r[col][iv] += half[HWVLI(iv,col,0,REPART)];
              d3i[col][iv] += half[HWVLI(iv,col,0,IMPART)];
            }
          }
        }

      /* ---- Z direction: accumulate ---- */
      mult_su3_mat_hwvec_tdp_inline( links, ZUP,
                                     (double*) &(t_htmp_prime[ZUP][0]),
                                     (double*) half, isite );

      for ( col = 0; col < 3; col++ ){
        __targetILP__(iv) {
          d0r[col][iv] += half[HWVLI(iv,col,0,REPART)];
          d0i[col][iv] += half[HWVLI(iv,col,0,IMPART)];
          d1r[col][iv] += half[HWVLI(iv,col,1,REPART)];
          d1i[col][iv] += half[HWVLI(iv,col,1,IMPART)];
        }
      }

      if ( up )
        {
          /* ZUP */
          for ( col = 0; col < 3; col++ ){
            __targetILP__(iv) {
              d2r[col][iv] += half[HWVLI(iv,col,0,IMPART)];
              d2i[col][iv] -= half[HWVLI(iv,col,0,REPART)];
              d3r[col][iv] -= half[HWVLI(iv,col,1,IMPART)];
              d3i[col][iv] += half[HWVLI(iv,col,1,REPART)];
            }
          }
        }
      else
        {
          /* ZDOWN */
          for ( col = 0; col < 3; col++ ){
            __targetILP__(iv) {
              d2r[col][iv] -= half[HWVLI(iv,col,0,IMPART)];
              d2i[col][iv] += half[HWVLI(iv,col,0,REPART)];
              d3r[col][iv] += half[HWVLI(iv,col,1,IMPART)];
              d3i[col][iv] -= half[HWVLI(iv,col,1,REPART)];
            }
          }
        }

      /* ---- T direction: accumulate ---- */
      mult_su3_mat_hwvec_tdp_inline( links, TUP,
                                     (double*) &(t_htmp_prime[TUP][0]),
                                     (double*) half, isite );

      for ( col = 0; col < 3; col++ ){
        __targetILP__(iv) {
          d0r[col][iv] += half[HWVLI(iv,col,0,REPART)];
          d0i[col][iv] += half[HWVLI(iv,col,0,IMPART)];
          d1r[col][iv] += half[HWVLI(iv,col,1,REPART)];
          d1i[col][iv] += half[HWVLI(iv,col,1,IMPART)];
        }
      }

      if ( up )
        {
          /* TUP */
          for ( col = 0; col < 3; col++ ){
            __targetILP__(iv) {
              d2r[col][iv] += half[HWVLI(iv,col,0,REPART)];
              d2i[col][iv] += half[HWVLI(iv,col,0,IMPART)];
              d3r[col][iv] += half[HWVLI(iv,col,1,REPART)];
              d3i[col][iv] += half[HWVLI(iv,col,1,IMPART)];
            }
          }
        }
      else
        {
          /* TDOWN */
          for ( col = 0; col < 3; col++ ){
            __targetILP__(iv) {
              d2r[col][iv] -= half[HWVLI(iv,col,0,REPART)];
              d2i[col][iv] -= half[HWVLI(iv,col,0,IMPART)];
              d3r[col][iv] -= half[HWVLI(iv,col,1,REPART)];
              d3i[col][iv] -= half[HWVLI(iv,col,1,IMPART)];
            }
          }
        }

      /* store the accumulated Wilson vector, overwriting dest */
      for ( col = 0; col < 3; col++ ){
        __targetILP__(iv) out[WVI(isite+iv,col,0,REPART)]=d0r[col][iv];
        __targetILP__(iv) out[WVI(isite+iv,col,0,IMPART)]=d0i[col][iv];
        __targetILP__(iv) out[WVI(isite+iv,col,1,REPART)]=d1r[col][iv];
        __targetILP__(iv) out[WVI(isite+iv,col,1,IMPART)]=d1i[col][iv];
        __targetILP__(iv) out[WVI(isite+iv,col,2,REPART)]=d2r[col][iv];
        __targetILP__(iv) out[WVI(isite+iv,col,2,IMPART)]=d2i[col][iv];
        __targetILP__(iv) out[WVI(isite+iv,col,3,REPART)]=d3r[col][iv];
        __targetILP__(iv) out[WVI(isite+iv,col,3,IMPART)]=d3i[col][iv];
      }

    }

  return;

}


/*
 * grow_add2_lattice
 *
 * Gathers the four half vectors stored in t_htmp_prime[XDOWN], [YDOWN],
 * [ZDOWN] and [TDOWN] for the current sites, expands each one into four spin
 * components (the step the original comments label wp_grow_add) and adds
 * them, in the order X, Y, Z, T, onto the Wilson vector already held in
 * dest.  "UP" expansions are selected when -isign == PLUS.
 *
 *   dest          field that is read, accumulated into and written back
 *   isign         sign selector; tested as -isign == PLUS
 *   t_htmp_prime  table of per-direction half-vector input arrays
 */
__targetEntry__  void grow_add2_lattice(wilson_vector *dest, int isign, const double** __restrict__  t_htmp_prime) {

  int isite;
  __targetTLP__(isite,t_sites_on_node)
    {
      int iv=0;
      int col,spin,part;

      double* out = (double*) &(dest[0]);

      const double* inx = &(t_htmp_prime[XDOWN][0]);
      const double* iny = &(t_htmp_prime[YDOWN][0]);
      const double* inz = &(t_htmp_prime[ZDOWN][0]);
      const double* int_ = &(t_htmp_prime[TDOWN][0]);

      /* evaluated once: isign does not change inside the kernel */
      const int up = ( -isign == PLUS );

      /* chunk-local copies of the four incoming half vectors */
      double hx[VVL*3*2*2];
      double hy[VVL*3*2*2];
      double hz[VVL*3*2*2];
      double ht[VVL*3*2*2];

      /* running Wilson vector, spin components 0..3, indexed [colour][lane] */
      double d0r[3][VVL], d0i[3][VVL];
      double d1r[3][VVL], d1i[3][VVL];
      double d2r[3][VVL], d2i[3][VVL];
      double d3r[3][VVL], d3i[3][VVL];

      /* gather */
      for ( col = 0; col < 3; col++ ){
        for ( spin = 0; spin < 2; spin++ ){
          for ( part = 0; part < 2; part++ ){
            __targetILP__(iv) hx[HWVLI(iv,col,spin,part)]=inx[HWVI(isite+iv,col,spin,part)];
            __targetILP__(iv) hy[HWVLI(iv,col,spin,part)]=iny[HWVI(isite+iv,col,spin,part)];
            __targetILP__(iv) hz[HWVLI(iv,col,spin,part)]=inz[HWVI(isite+iv,col,spin,part)];
            __targetILP__(iv) ht[HWVLI(iv,col,spin,part)]=int_[HWVI(isite+iv,col,spin,part)];
          }
        }
      }

      /* load the current contents of dest */
      for ( col = 0; col < 3; col++ ){
        __targetILP__(iv) {
          d0r[col][iv]=out[WVI(isite+iv,col,0,REPART)];
          d0i[col][iv]=out[WVI(isite+iv,col,0,IMPART)];
          d1r[col][iv]=out[WVI(isite+iv,col,1,REPART)];
          d1i[col][iv]=out[WVI(isite+iv,col,1,IMPART)];
          d2r[col][iv]=out[WVI(isite+iv,col,2,REPART)];
          d2i[col][iv]=out[WVI(isite+iv,col,2,IMPART)];
          d3r[col][iv]=out[WVI(isite+iv,col,3,REPART)];
          d3i[col][iv]=out[WVI(isite+iv,col,3,IMPART)];
        }
      }

      /* Spin components 0 and 1: every direction adds its half vector
         unchanged, whatever the sign.  The X, Y, Z, T order per element is
         kept so the floating-point sums are formed in the same sequence. */
      for ( col = 0; col < 3; col++ ){
        __targetILP__(iv) {
          d0r[col][iv] += hx[HWVLI(iv,col,0,REPART)];
          d0i[col][iv] += hx[HWVLI(iv,col,0,IMPART)];
          d1r[col][iv] += hx[HWVLI(iv,col,1,REPART)];
          d1i[col][iv] += hx[HWVLI(iv,col,1,IMPART)];
        }
        __targetILP__(iv) {
          d0r[col][iv] += hy[HWVLI(iv,col,0,REPART)];
          d0i[col][iv] += hy[HWVLI(iv,col,0,IMPART)];
          d1r[col][iv] += hy[HWVLI(iv,col,1,REPART)];
          d1i[col][iv] += hy[HWVLI(iv,col,1,IMPART)];
        }
        __targetILP__(iv) {
          d0r[col][iv] += hz[HWVLI(iv,col,0,REPART)];
          d0i[col][iv] += hz[HWVLI(iv,col,0,IMPART)];
          d1r[col][iv] += hz[HWVLI(iv,col,1,REPART)];
          d1i[col][iv] += hz[HWVLI(iv,col,1,IMPART)];
        }
        __targetILP__(iv) {
          d0r[col][iv] += ht[HWVLI(iv,col,0,REPART)];
          d0i[col][iv] += ht[HWVLI(iv,col,0,IMPART)];
          d1r[col][iv] += ht[HWVLI(iv,col,1,REPART)];
          d1i[col][iv] += ht[HWVLI(iv,col,1,IMPART)];
        }
      }

      /* Spin components 2 and 3: sign-dependent contributions. */
      if ( up )
        {
          for ( col = 0; col < 3; col++ ){
            /* XUP */
            __targetILP__(iv) {
              d2r[col][iv] += hx[HWVLI(iv,col,1,IMPART)];
              d2i[col][iv] -= hx[HWVLI(iv,col,1,REPART)];
              d3r[col][iv] += hx[HWVLI(iv,col,0,IMPART)];
              d3i[col][iv] -= hx[HWVLI(iv,col,0,REPART)];
            }
            /* YUP */
            __targetILP__(iv) {
              d2r[col][iv] += hy[HWVLI(iv,col,1,REPART)];
              d2i[col][iv] += hy[HWVLI(iv,col,1,IMPART)];
              d3r[col][iv] -= hy[HWVLI(iv,col,0,REPART)];
              d3i[col][iv] -= hy[HWVLI(iv,col,0,IMPART)];
            }
            /* ZUP */
            __targetILP__(iv) {
              d2r[col][iv] += hz[HWVLI(iv,col,0,IMPART)];
              d2i[col][iv] -= hz[HWVLI(iv,col,0,REPART)];
              d3r[col][iv] -= hz[HWVLI(iv,col,1,IMPART)];
              d3i[col][iv] += hz[HWVLI(iv,col,1,REPART)];
            }
            /* TUP */
            __targetILP__(iv) {
              d2r[col][iv] += ht[HWVLI(iv,col,0,REPART)];
              d2i[col][iv] += ht[HWVLI(iv,col,0,IMPART)];
              d3r[col][iv] += ht[HWVLI(iv,col,1,REPART)];
              d3i[col][iv] += ht[HWVLI(iv,col,1,IMPART)];
            }
          }
        }
      else
        {
          for ( col = 0; col < 3; col++ ){
            /* XDOWN */
            __targetILP__(iv) {
              d2r[col][iv] -= hx[HWVLI(iv,col,1,IMPART)];
              d2i[col][iv] += hx[HWVLI(iv,col,1,REPART)];
              d3r[col][iv] -= hx[HWVLI(iv,col,0,IMPART)];
              d3i[col][iv] += hx[HWVLI(iv,col,0,REPART)];
            }
            /* YDOWN */
            __targetILP__(iv) {
              d2r[col][iv] -= hy[HWVLI(iv,col,1,REPART)];
              d2i[col][iv] -= hy[HWVLI(iv,col,1,IMPART)];
              d3r[col][iv] += hy[HWVLI(iv,col,0,REPART)];
              d3i[col][iv] += hy[HWVLI(iv,col,0,IMPART)];
            }
            /* ZDOWN */
            __targetILP__(iv) {
              d2r[col][iv] -= hz[HWVLI(iv,col,0,IMPART)];
              d2i[col][iv] += hz[HWVLI(iv,col,0,REPART)];
              d3r[col][iv] += hz[HWVLI(iv,col,1,IMPART)];
              d3i[col][iv] -= hz[HWVLI(iv,col,1,REPART)];
            }
            /* TDOWN */
            __targetILP__(iv) {
              d2r[col][iv] -= ht[HWVLI(iv,col,0,REPART)];
              d2i[col][iv] -= ht[HWVLI(iv,col,0,IMPART)];
              d3r[col][iv] -= ht[HWVLI(iv,col,1,REPART)];
              d3i[col][iv] -= ht[HWVLI(iv,col,1,IMPART)];
            }
          }
        }

      /* write the updated Wilson vector back to dest */
      for ( col = 0; col < 3; col++ ){
        __targetILP__(iv) out[WVI(isite+iv,col,0,REPART)]=d0r[col][iv];
        __targetILP__(iv) out[WVI(isite+iv,col,0,IMPART)]=d0i[col][iv];
        __targetILP__(iv) out[WVI(isite+iv,col,1,REPART)]=d1r[col][iv];
        __targetILP__(iv) out[WVI(isite+iv,col,1,IMPART)]=d1i[col][iv];
        __targetILP__(iv) out[WVI(isite+iv,col,2,REPART)]=d2r[col][iv];
        __targetILP__(iv) out[WVI(isite+iv,col,2,IMPART)]=d2i[col][iv];
        __targetILP__(iv) out[WVI(isite+iv,col,3,REPART)]=d3r[col][iv];
        __targetILP__(iv) out[WVI(isite+iv,col,3,IMPART)]=d3i[col][iv];
      }

    }

  return;

}

/*
 * update_genpt
 *
 * For every site i, rebases the pointer t_gen_pt[id][i] from the buffer that
 * starts at htmpid onto the buffer t_htmp[id] (same byte offset), then copies
 * the 2*2*3 doubles it now points at into t_htmp_prime[id][i].
 *
 *   t_gen_pt      per-direction, per-site pointer table; entry [id][i] is
 *                 rewritten in place
 *   t_htmp        table of buffers the pointers are rebased onto
 *   t_htmp_prime  table of buffers receiving the per-site copies
 *   id            direction / table index to process
 *   htmpid        base address the incoming pointers are relative to
 */
__targetEntry__ void update_genpt(char*** t_gen_pt, half_wilson_vector** t_htmp, half_wilson_vector** t_htmp_prime, int id, half_wilson_vector* htmpid){
  
  int i;
  __targetTLPNoStride__(i,t_sites_on_node){
    
    /* Keep the byte offset at the native pointer-difference width: storing
       it in an int would truncate once the offset exceeds INT_MAX bytes. */
    t_gen_pt[id][i] = ((char*) t_htmp[id]) + ( t_gen_pt[id][i] - ((char*) htmpid) );
    
    int element;
    double* dptrin=(double*) t_gen_pt[id][i];
    double* dptrout=(double*) &(t_htmp_prime[id][i].h[0].c[0].real);
    /* 2 spins x 3 colours x (re,im) doubles per half vector */
    for (element=0;element<2*2*3;element++){
      dptrout[element] = dptrin[element];
    }
    
  }
  
  
  return;

}



/*
 * onNodeShift
 *
 * For each site i+iv, looks up neighbor[dir][i+iv] and copies all
 * 2 (spin) x 3 (colour) x 2 (re/im) entries of din at that neighbour index
 * into dout at site i+iv.  Neighbour indices >= COM_BIT are skipped, leaving
 * the corresponding dout entries untouched.
 *
 *   din       input array, addressed through HWVI()
 *   dout      output array, addressed through HWVI()
 *   neighbor  per-direction neighbour index tables
 *   nF        part of the interface; not read by this kernel
 *   dir       which neighbour table to use
 *
 * NOTE(review): indices >= COM_BIT presumably mark off-node neighbours that
 * the communication path fills in -- confirm against the caller.
 */
__targetEntry__ void onNodeShift(const double* __restrict__ din,double* dout, int* neighbor[8], int nF, int dir){

  int i;

  __targetTLP__(i,t_sites_on_node)
    {
      int iv=0;
      int from[VVL];
      int spin, colour, part;
      int any_flagged=0;

      __targetILP__(iv) from[iv] = neighbor[dir][i+iv];

      /* does any lane of this chunk carry a flagged (>= COM_BIT) index? */
      __targetILP__(iv) {
        if( from[iv]  >= COM_BIT )
          any_flagged=1;
      }

      if (!any_flagged)
        {
          /* common case: every lane is copied, no per-lane test needed */
          for(spin=0;spin<2;spin++)
            for(colour=0;colour<3;colour++)
              for(part=0;part<2;part++)

                __targetILP__(iv){
                  dout[HWVI(i+iv,colour,spin,part)]=din[HWVI(from[iv],colour,spin,part)];
                }
        }
      else
        {
          /* mixed chunk: copy only the lanes whose index is below COM_BIT */
          for(spin=0;spin<2;spin++)
            for(colour=0;colour<3;colour++)
              for(part=0;part<2;part++)

                __targetILP__(iv){
                  if( from[iv]  < COM_BIT )
                    dout[HWVI(i+iv,colour,spin,part)]=din[HWVI(from[iv],colour,spin,part)];
                }
        }

    }

  return;

}

/* File-scope state for the dslash halo exchange; filled by
   setup_dslash_comms() and released by finalise_dslash_comms(). */

/* n_off[dir] = offnode_even[dir] + offnode_odd[dir] */
static int n_off[8];
/* largest n_off[dir] over the 8 directions; also the per-direction stride
   of the index tables below */
static int noffmax;
/* host index tables, 8*noffmax ints each: for the n-th site i with
   neighbor[dir][i] >= COM_BIT, send holds neighbor[dir][i]-COM_BIT and
   recv holds i */
static int* fullindexsend_;
static int* fullindexrecv_;
/* target-side copies of the two tables (targetMalloc + copyToTarget) */
static int* t_fullindexsend_;
static int* t_fullindexrecv_;

/* neighbour index table, indexed [dir][site]; defined elsewhere */
extern int **neighbor;
extern int offnode_even[8];		/* # of even sites that have off-node neighbors in a dir */
extern int offnode_odd[8];		/* # of odd sites that have off-node neighbors in a dir */


/*
 * packSendBuffer
 *
 * Gathers the half vectors of the sites listed in
 * fullindexsend[dir*n_off_max + 0 .. n_off-1] from din into sendbuf.
 * sendbuf is laid out as 12 consecutive planes of n_off doubles; plane
 * number (spin*3 + colour)*2 + reim holds that component for every listed
 * site.
 *
 *   sendbuf        output buffer
 *   din            input array, addressed through HWVI()
 *   fullindexsend  site index table, n_off_max entries per direction
 *   nF             part of the interface; not read by this kernel
 *   n_off          number of listed sites for this direction
 *   n_off_max      per-direction stride of fullindexsend
 *   dir            direction whose table section is used
 */
__targetEntry__ void packSendBuffer(double* sendbuf,double* din, int* fullindexsend, int nF, int n_off, int n_off_max, int dir){

  int i;

  __targetTLP__(i,n_off)
    {
      int iv=0;
      int spin, colour, part;

      /* this direction's section of the index table */
      const int* sites = fullindexsend + dir*n_off_max;

      for(spin=0;spin<2;spin++){
        for(colour=0;colour<3;colour++){
          for(part=0;part<2;part++){

            /* plane number: counts 0..11 in loop order */
            const int plane = (spin*3 + colour)*2 + part;

            __targetILP__(iv){
              /* guard the tail when n_off is not a multiple of the chunk */
              if ((i+iv) < n_off)
                sendbuf[plane*n_off+i+iv]=din[HWVI(sites[i+iv],colour,spin,part)];
            }
          }
        }
      }

    }

  return;

}



/*
 * unpackRecvBuffer
 *
 * Scatters recvbuf into dout at the sites listed in
 * fullindexrecv[dir*n_off_max + 0 .. n_off-1].  recvbuf uses the same
 * layout packSendBuffer produces: 12 consecutive planes of n_off doubles,
 * plane number (spin*3 + colour)*2 + reim.
 *
 *   recvbuf        input buffer
 *   dout           output array, addressed through HWVI()
 *   fullindexrecv  site index table, n_off_max entries per direction
 *   nF             part of the interface; not read by this kernel
 *   n_off          number of listed sites for this direction
 *   n_off_max      per-direction stride of fullindexrecv
 *   dir            direction whose table section is used
 */
__targetEntry__ void unpackRecvBuffer(double* recvbuf,double* dout, int* fullindexrecv, int nF, int n_off, int n_off_max, int dir){

  int i;

  __targetTLP__(i,n_off)
    {
      int iv=0;
      int spin, colour, part;

      /* this direction's section of the index table */
      const int* sites = fullindexrecv + dir*n_off_max;

      for(spin=0;spin<2;spin++){
        for(colour=0;colour<3;colour++){
          for(part=0;part<2;part++){

            /* plane number: counts 0..11 in loop order */
            const int plane = (spin*3 + colour)*2 + part;

            __targetILP__(iv){
              /* guard the tail when n_off is not a multiple of the chunk */
              if ((i+iv) < n_off)
                dout[HWVI(sites[i+iv],colour,spin,part)]=recvbuf[plane*n_off+i+iv];
            }
          }
        }
      }

    }

  return;

}

					 
/*
 * setup_dslash_comms
 *
 * Prepares the index tables used by packSendBuffer / unpackRecvBuffer:
 *   - n_off[dir] = offnode_even[dir] + offnode_odd[dir], noffmax = their max;
 *   - if noffmax > 0, allocates the host tables fullindexsend_ /
 *     fullindexrecv_ and their target copies (8*noffmax ints each);
 *   - for every direction with n_off[dir] > 0, scans all sites and, for the
 *     n-th site i with neighbor[dir][i] >= COM_BIT, records
 *     neighbor[dir][i]-COM_BIT in the send table and i in the recv table at
 *     slot noffmax*dir + n;
 *   - copies both host tables to the target.
 *
 * The host tables are zero-initialised so that slots beyond n_off[dir] hold
 * a defined value when the whole table is copied to the target.  A failed
 * host allocation is reported and aborts the program instead of being
 * dereferenced.
 */
void setup_dslash_comms(){

    int dir;

    noffmax=0;
    for( dir=0; dir < 8; dir++){

      n_off[dir] = offnode_even[dir] + offnode_odd[dir];
      if (n_off[dir] > noffmax)
        noffmax=n_off[dir];
    }

    /* nothing to exchange: leave the tables unallocated */
    if (noffmax <= 0)
      return;

    fullindexsend_ = (int*) calloc(noffmax*8, sizeof(int));
    fullindexrecv_ = (int*) calloc(noffmax*8, sizeof(int));
    if (!fullindexsend_ || !fullindexrecv_){
      fprintf(stderr,"setup_dslash_comms: could not allocate index tables (%d entries each)\n",noffmax*8);
      abort();
    }
    targetMalloc((void**) &t_fullindexsend_, noffmax*8*sizeof(int));
    targetMalloc((void**) &t_fullindexrecv_, noffmax*8*sizeof(int));

    for( dir=0; dir < 8; dir++){
      int i;
      int n=0;

      if (n_off[dir] <= 0)
        continue;

      /* collect, in site order, every site whose neighbour index is flagged */
      for(i=0;i<sites_on_node;i++){
        int in = neighbor[dir][i];

        if( in >= COM_BIT ){
          fullindexsend_[noffmax*dir+n]=(in-COM_BIT);
          fullindexrecv_[noffmax*dir+n]=i;
          n++;
        }
      }
    }

    copyToTarget(t_fullindexsend_,fullindexsend_,noffmax*8*sizeof(int));
    copyToTarget(t_fullindexrecv_,fullindexrecv_,noffmax*8*sizeof(int));

    return;
}

/*
 * finalise_dslash_comms
 *
 * Releases the host and target index tables created by
 * setup_dslash_comms().  Every pointer is reset to NULL after it is
 * released, so calling this function again (or before any setup) is
 * harmless.
 */
void finalise_dslash_comms(){

  /* free(NULL) is a no-op, so the host pointers need no guard */
  free(fullindexsend_);
  fullindexsend_ = NULL;

  free(fullindexrecv_);
  fullindexrecv_ = NULL;

  /* targetFree's behaviour on NULL is not visible here, so keep the guard */
  if (t_fullindexsend_){
    targetFree(t_fullindexsend_);
    t_fullindexsend_ = NULL;
  }
  if (t_fullindexrecv_){
    targetFree(t_fullindexrecv_);
    t_fullindexrecv_ = NULL;
  }

  return;

}

//target copies of data structures
// (defined elsewhere; dslash() passes t_htmp and t_gauge to the shrink kernels)
extern half_wilson_vector** t_htmp;    
extern half_wilson_vector** t_htmp_prime;
extern int** t_neighbor;        
extern char*** t_gen_pt;    
extern su3_matrix *t_gauge;


// per-direction half-vector arrays (8 directions); defined elsewhere
extern half_wilson_vector* htmp_prime[8];

// host-side staging buffers; dslash() copies t_sendbuf into sendbuf
extern double* sendbuf;
extern double* recvbuf;

// target-side buffers; packSendBuffer is launched with t_sendbuf as its output
extern double* t_sendbuf;
extern double* t_recvbuf;

extern int totnodes[4];		/* number of nodes in machine directions */
// Mynode[d] is used with totnodes[d] in dslash() to compute source/destination ranks
extern int Mynode[4], node_parity;
extern MPI_Comm comm_grid, comm_subgrid[4]; /* grid communicators */

// host-only helper declared here, defined elsewhere
// NOTE(review): presumably exchanges `size` bytes between sbuf and rbuf along `dir` -- confirm at the definition
__targetHost__ void gen_send_recv( int dir, char *sbuf, char *rbuf, int size );


/*
 * dslash: run the shrink / communicate / grow sequence on the target.
 *
 * Steps, in order:
 *   1. launch shrink1_lattice and shrink2_lattice over sites_on_node sites,
 *      reading src and writing t_htmp;
 *   2. for each of the 8 directions:
 *        - if n_off[dir] > 0, pack t_sendbuf on the target, copy it to
 *          sendbuf and start a non-blocking MPI exchange over
 *          comm_subgrid[] with the two neighbouring ranks;
 *        - launch onNodeShift from t_htmp[dir] into t_htmp_prime[dir]
 *          while the messages are in flight;
 *        - if n_off[dir] > 0, wait for the exchange, copy recvbuf to the
 *          target and unpack it into t_htmp_prime[dir];
 *   3. launch grow_add1_lattice and grow_add2_lattice, writing dest.
 *
 * src    - passed to the shrink kernels
 * dest   - passed to the grow kernels
 * isign  - forwarded unchanged to the shrink and grow kernels
 * parity - forwarded unchanged to the shrink kernels
 *
 * Relies on the index tables built by setup_dslash_comms() (n_off, noffmax,
 * t_fullindexsend_, t_fullindexrecv_).  With VERBOSE_TIMINGS defined, timing
 * lines are printed through node0_printf.
 */
void dslash(wilson_vector *src, wilson_vector *dest, int isign, int parity)
{
    int dir;
    double t1, t2, time;

    t1=omp_get_wtime();

    //perform shrink operations
    shrink1_lattice __targetLaunch__(sites_on_node) (src, isign, parity, t_htmp);
    targetSynchronize();

#ifdef VERBOSE_TIMINGS
    long int totbytes=sites_on_node*(sizeof(wilson_vector)+4*sizeof(half_wilson_vector));
    t2=omp_get_wtime();
    time=t2-t1;
    node0_printf(" - - shrink1 %1.16e s %1.16e GB/s\n",time,totbytes/(time*1073741824.));
    t1=omp_get_wtime();

    totbytes=sites_on_node*(sizeof(wilson_vector)+4*sizeof(half_wilson_vector)+(3*3*4*2*8));
#endif

    shrink2_lattice __targetLaunch__(sites_on_node) (src, isign, parity, t_htmp,  t_gauge);
    targetSynchronize();

#ifdef VERBOSE_TIMINGS
    t2=omp_get_wtime();
    time=t2-t1;
    node0_printf(" - - shrink2 %1.16e s %1.16e GB/s \n",time,totbytes/(time*1073741824.));
    t1=omp_get_wtime();
#endif

    //start comms

    /* number of doubles in one half_wilson_vector */
    int nF=sizeof(half_wilson_vector)/sizeof(double);

    for( dir=0; dir < 8; dir++){

      double t3, t4;
      void* p_t_htmp;
      void* p_t_htmp_prime;
      int sourcerank, destrank;
      MPI_Request request[2];
      MPI_Status status[2];

      /* all message traffic for this direction is skipped when the count is 0 */
      const int have_offnode = (n_off[dir] > 0);

      /* fetch the per-direction buffer addresses stored in the target-side
         pointer arrays */
      copyFromTarget(&p_t_htmp,&(t_htmp[dir]),sizeof(half_wilson_vector*));
      copyFromTarget(&p_t_htmp_prime,&(t_htmp_prime[dir]),sizeof(half_wilson_vector*));

      if (have_offnode){
        packSendBuffer __targetLaunch__(n_off[dir]) (t_sendbuf,(double*) p_t_htmp,t_fullindexsend_,nF,n_off[dir],noffmax,dir);
        targetSynchronize();

        copyFromTarget(sendbuf,t_sendbuf,n_off[dir]*nF*sizeof(double));
      }

      /* dir_ indexes Mynode[], totnodes[] and comm_subgrid[]; directions
         4..7 are mapped through OPP_DIR first */
      int dir_=dir;
      if (dir_<4)
        {
          /* positive direction */
          sourcerank=(Mynode[dir_]+1)%totnodes[dir_];
          destrank=(Mynode[dir_]-1+totnodes[dir_])%totnodes[dir_];
        }
      else
        {
          /* negative direction */
          dir_=OPP_DIR(dir_);
          destrank=(Mynode[dir_]+1)%totnodes[dir_];
          sourcerank=(Mynode[dir_]-1+totnodes[dir_])%totnodes[dir_];
        }

      if (have_offnode){
        /* sendbuf/recvbuf are C double arrays, so the matching datatype is
           MPI_DOUBLE; MPI_DOUBLE_PRECISION is the Fortran handle */
        MPI_Irecv(recvbuf, nF * n_off[dir], MPI_DOUBLE,
                  sourcerank, 123, comm_subgrid[dir_], &request[0]);
        MPI_Isend(sendbuf, nF * n_off[dir], MPI_DOUBLE,
                  destrank, 123, comm_subgrid[dir_], &request[1]);
      }

      t3=omp_get_wtime();

      /* runs while the non-blocking exchange above is in flight */
      onNodeShift __targetLaunch__(sites_on_node) ((double*) p_t_htmp,(double*) p_t_htmp_prime,t_neighbor,nF,dir);
      targetSynchronize();

#ifdef VERBOSE_TIMINGS
      t4=omp_get_wtime();
      time=t4-t3;
      node0_printf(" - - - onnodeshift %1.16e s \n",time);
      t3=omp_get_wtime();
#endif

      if (have_offnode){
        /* sendbuf and recvbuf are reused by the next direction, so both
           requests must complete before moving on */
        MPI_Waitall(2, request, status);

        copyToTarget(t_recvbuf,recvbuf,n_off[dir]*nF*sizeof(double));

        unpackRecvBuffer __targetLaunch__(n_off[dir]) (t_recvbuf,(double*) p_t_htmp_prime,t_fullindexrecv_,nF,n_off[dir],noffmax,dir);
        targetSynchronize();
      }

    }

    //end comms

#ifdef VERBOSE_TIMINGS
    t2=omp_get_wtime();
    node0_printf(" - - comms %1.16e s \n",t2-t1);
    t1=omp_get_wtime();
#endif

    //perform grow operations
    grow_add1_lattice __targetLaunch__(sites_on_node) (dest, isign, (const double**) t_htmp_prime, t_gauge);
    targetSynchronize();

#ifdef VERBOSE_TIMINGS
    totbytes=sites_on_node*(sizeof(wilson_vector)+4*sizeof(half_wilson_vector)+(3*3*4*2*8));
    t2=omp_get_wtime();
    time=t2-t1;
    node0_printf(" - - growadd1 %1.16e s %1.16e GB/s \n",time,totbytes/(time*1073741824.));
    t1=omp_get_wtime();
#endif

    grow_add2_lattice __targetLaunch__(sites_on_node) (dest, isign, (const double**) t_htmp_prime);
    targetSynchronize();

#ifdef VERBOSE_TIMINGS
    totbytes=sites_on_node*(sizeof(wilson_vector)+4*sizeof(half_wilson_vector));
    t2=omp_get_wtime();
    time=t2-t1;
    node0_printf(" - - growadd2 %1.16e s %1.16e GB/s \n",time,totbytes/(time*1073741824.));
    t1=omp_get_wtime();
#endif

    return;

}

