/*
 * local variables
 */
/* Pipeline count that g5_openMC() stores in Snpipe[] for every opened cluster. */
#define NPIPE_DEFAULT (256)
#define NFOMAX (384) // number of forces calculated in parallel.
/* When DEFECT is defined, g5_get_number_of_pipelinesMC() reports two fewer
   pipelines and g5_set_xiMC() shifts its input index at positions derived
   from DEFECTPE. */
// #define DEFECT
#define DEFECTPE (15)

/* File-scope state.  Arrays sized NHIBMAX hold one entry per cluster/device. */
/* Sflag_cutoff / Sflag_nnb: cleared to 0 by g5_openMC(). */
static int    Sflag_cutoff[NHIBMAX];
static int    Sflag_nnb[NHIBMAX];
/* Pipeline count per cluster; set to NPIPE_DEFAULT by g5_openMC(). */
static int    Snpipe[NHIBMAX];
/* Number of devices selected by init_envs(); 0 means "not selected yet". */
static int    Ndevice = 0;
/* Device[i] is non-zero when device i has been selected by init_envs(). */
static int    Device[NHIBMAX];
/* Switched to SENDFUNC_DMAR by init_envs() when G5_SENDFUNC is "DMAR". */
static int    Sendfunc = SENDFUNC_PIOW;
/* Handle returned by hib_openMC() for each cluster. */
static Hib    *H[NHIBMAX];
/* Buffer handed to hib_sendMC(); g5_openMC() points it at Piowbuf[]. */
static UINT64 *Rbuf[NHIBMAX];
/* Set to H[]->dmaw_buf by g5_openMC(). */
static UINT64 *Wbuf[NHIBMAX];
/* Backing storage for Rbuf[]. */
static UINT64 Piowbuf[NHIBMAX][1024];
/* Staging arrays for the four IDP word streams (see send_vector(), em_write()). */
static UINT64 Idp00[NHIBMAX][1000];
static UINT64 Idp01[NHIBMAX][1000];
static UINT64 Idp10[NHIBMAX][1000];
static UINT64 Idp11[NHIBMAX][1000];        
/* Staging array for ISP words (see send_vector(), g5_set_xiMC()). */
static UINT64 Isp[NHIBMAX][1000];
/* "Needs resend" flags; all set to 1 by set_resend_flags(). */
static int    Resend_ip[NHIBMAX];
static int    Resend_index[NHIBMAX];
static int    Resend_run[NHIBMAX];
static int    Resend_fo[NHIBMAX];
/* Value stored by g5_set_nMC(). */
static int    Snj[NHIBMAX];
/* Seps2 is read by g5_set_jpMC() for non-460 models; the only assignments to
   Seps2/Setainv visible in this part of the file are commented out. */
static double Seps2[NHIBMAX];
static double Setainv[NHIBMAX];
/* Model[] becomes 460 when the board-info model id is 3 or 4;
   Version[] holds bits 16..23 of the board-info word (see g5_openMC()). */
static int    Model[NHIBMAX];
static int    Version[NHIBMAX];
/* NOTE(review): Nispinit/Nplus/Nisp are not referenced in the visible part of the file. */
static int Nispinit[NHIBMAX];
static int Nplus[NHIBMAX];
static int Nisp[NHIBMAX];
/* Coordinate scaling set by g5_set_rangeMC(): Xscale = 1/(xmax-xmin),
   Xscale2 = Xscale^2, Xoffset = xmin. */
static double Xscale[NHIBMAX];
static double Xscale2[NHIBMAX];
static double Xoffset[NHIBMAX];

#define DEFAULT_FORCE_CORRECTION (1.00012)
#define DEFAULT_POT_CORRECTION (1.00005)

/*
 * Print a diagnostic to stderr when lv is at or below Warn_level.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly one
 * statement: it can be used as the un-braced body of an if/else without
 * capturing the else branch or producing a stray empty statement.  The caller
 * writes the terminating semicolon, as every visible call site already does.
 */
#define WARN(lv, fmt, args...) do { if ((lv) <= Warn_level) fprintf(stderr, fmt, ## args); } while (0)
static int Warn_level = 2; /* warning message output level. the higher the more verbose.
                              0: no warning (may cause wrong result with g7pkg/scripts/check.csh)
                              1: minimum
                              2: default
                              3: for debugging purpose
                           */

// local function prototypes:
static void print_wbuf(int clusterid);
static void send_vector(int clusterid, int i0, int i1);
static void em_write(int clusterid, int nidp, int adrem);
static void run_counter(int clusterid, int nisp,int adrisp,int nrepisp,
                        int ncntidp,int nidp,int adridp, int memadr,int nrepidp);
static void send_ispdata(int clusterid, int n, int ibegin, UINT64 ispdata[]);
static void init_envs(void);
static void set_resend_flags(void);

/*
 * local functions
 */
/*
 * Debug helper for the write buffer of one cluster.
 *
 * Without PARITY this does nothing.  With PARITY it walks 64 groups of four
 * Wbuf words; each group holds two (tag, payload) pairs.  For each pair it
 * recomputes nine byte parities (the eight payload bytes, most significant
 * first, then the top byte of the tag word) and compares them with tag bits
 * 55 down to 47.  A mismatch only latches a local flag: the printout that
 * used to report it is disabled, so the function has no visible effect.
 */
static void print_wbuf(int clusterid)
{
    int mismatch = 0;

#ifdef PARITY
    int group, pair, k, bit;
    int par[9];

    for (group = 0; group < 64; group++) {
        for (pair = 0; pair < 2; pair++) {
            UINT64 tag     = Wbuf[clusterid][4*group + 2*pair];
            UINT64 payload = Wbuf[clusterid][4*group + 2*pair + 1];

            /* par[0..7]: parity of payload bytes 7..0 */
            for (k = 0; k < 8; k++) {
                par[k] = 0;
                for (bit = 56 - 8*k; bit < 64 - 8*k; bit++) {
                    par[k] ^= (int)((payload >> bit) & 0x1);
                }
            }
            /* par[8]: parity of the top byte of the tag word */
            par[8] = 0;
            for (bit = 56; bit < 64; bit++) {
                par[8] ^= (int)((tag >> bit) & 0x1);
            }
            for (k = 0; k < 9; k++) {
                if (par[k] != (int)((tag >> (55 - k)) & 0x1)) mismatch = 1;
            }
        }
        if (mismatch == 1) {
            /* reporting of the offending words is currently disabled */
        }
    }
#else
    (void)clusterid;
#endif
    (void)mismatch;
}

static void send_vector(int clusterid, int i0, int i1)
{
    int i,nn,data,n;
    int par[14],j;

    n= i1-i0 +1;
    //  printf("n; %d\n", n);

    //  for(i=0;i<n;i++) printf("0x%016llx 0x%016llx %5x 0x%016llx %5x\n",Isp[clusterid][i],Idp00[clusterid][i],Idp01[clusterid][i],Idp10[clusterid][i],Idp11[clusterid][i]);
#ifdef PARITY
    for(i=0;i<n;i+=2){
        par[0] = (Isp[clusterid][i0+i]>>56) & 0x1; for(j=57;j<64;j++) par[0] = par[0] ^ ((Isp[clusterid][i0+i]>>j) & 0x1);
        par[1] = (Isp[clusterid][i0+i]>>48) & 0x1; for(j=49;j<56;j++) par[1] = par[1] ^ ((Isp[clusterid][i0+i]>>j) & 0x1);
        par[2] = (Isp[clusterid][i0+i]>>40) & 0x1; for(j=41;j<48;j++) par[2] = par[2] ^ ((Isp[clusterid][i0+i]>>j) & 0x1);
        par[3] = (Isp[clusterid][i0+i]>>32) & 0x1; for(j=33;j<40;j++) par[3] = par[3] ^ ((Isp[clusterid][i0+i]>>j) & 0x1);
        par[4] = (Isp[clusterid][i0+i]>>24) & 0x1; for(j=25;j<32;j++) par[4] = par[4] ^ ((Isp[clusterid][i0+i]>>j) & 0x1);
        par[5] = (Isp[clusterid][i0+i]>>16) & 0x1; for(j=17;j<24;j++) par[5] = par[5] ^ ((Isp[clusterid][i0+i]>>j) & 0x1);
        par[6] = (Isp[clusterid][i0+i]>> 8) & 0x1; for(j= 9;j<16;j++) par[6] = par[6] ^ ((Isp[clusterid][i0+i]>>j) & 0x1);
        par[7] = (Isp[clusterid][i0+i]    ) & 0x1; for(j= 1;j< 8;j++) par[7] = par[7] ^ ((Isp[clusterid][i0+i]>>j) & 0x1);

        par[8] = (Isp[clusterid][i0+i+1]>>56) & 0x1; for(j=57;j<64;j++) par[8] = par[8] ^ ((Isp[clusterid][i0+i+1]>>j) & 0x1);
        par[9] = (Isp[clusterid][i0+i+1]>>48) & 0x1; for(j=49;j<56;j++) par[9] = par[9] ^ ((Isp[clusterid][i0+i+1]>>j) & 0x1);
        par[10] = (Isp[clusterid][i0+i+1]>>40) & 0x1; for(j=41;j<48;j++) par[10] = par[10] ^ ((Isp[clusterid][i0+i+1]>>j) & 0x1);
        par[11] = (Isp[clusterid][i0+i+1]>>32) & 0x1; for(j=33;j<40;j++) par[11] = par[11] ^ ((Isp[clusterid][i0+i+1]>>j) & 0x1);
        par[12] = (Isp[clusterid][i0+i+1]>>24) & 0x1; for(j=25;j<32;j++) par[12] = par[12] ^ ((Isp[clusterid][i0+i+1]>>j) & 0x1);
        par[13] = (Isp[clusterid][i0+i+1]>>16) & 0x1; for(j=17;j<24;j++) par[13] = par[13] ^ ((Isp[clusterid][i0+i+1]>>j) & 0x1);

        Isp[clusterid][i0+i+1] = Isp[clusterid][i0+i+1] | ((par[0]&0x1)<<15)|((par[1]&0x1)<<14)|((par[2]&0x1)<<13)|((par[3]&0x1)<<12)
            |((par[4]&0x1)<<11)|((par[5]&0x1)<<10)|((par[6]&0x1)<<9)|((par[7]&0x1)<<8)|((par[8]&0x1)<<7)|((par[9]&0x1)<<6)
            |((par[10]&0x1)<<5)|((par[11]&0x1)<<4)|((par[12]&0x1)<<3)|((par[13]&0x1)<<2);

    }

    for(i=0;i<n;i++){
        par[0] = (Idp00[clusterid][i0+i]>>55) & 0x1; for(j=56;j<64;j++) par[0] = par[0] ^ ((Idp00[clusterid][i0+i]>>j) & 0x1);
        par[1] = (Idp00[clusterid][i0+i]>>46) & 0x1; for(j=47;j<55;j++) par[1] = par[1] ^ ((Idp00[clusterid][i0+i]>>j) & 0x1);
        par[2] = (Idp00[clusterid][i0+i]>>37) & 0x1; for(j=38;j<46;j++) par[2] = par[2] ^ ((Idp00[clusterid][i0+i]>>j) & 0x1);
        par[3] = (Idp00[clusterid][i0+i]>>28) & 0x1; for(j=29;j<37;j++) par[3] = par[3] ^ ((Idp00[clusterid][i0+i]>>j) & 0x1);
        par[4] = (Idp00[clusterid][i0+i]>>19) & 0x1; for(j=20;j<28;j++) par[4] = par[4] ^ ((Idp00[clusterid][i0+i]>>j) & 0x1);
        par[5] = (Idp00[clusterid][i0+i]>>10) & 0x1; for(j=11;j<19;j++) par[5] = par[5] ^ ((Idp00[clusterid][i0+i]>>j) & 0x1);
        par[6] = (Idp00[clusterid][i0+i]>> 1) & 0x1; for(j= 2;j<10;j++) par[6] = par[6] ^ ((Idp00[clusterid][i0+i]>>j) & 0x1);
        par[7] = (Idp00[clusterid][i0+i]    ) & 0x1; for(j= 9;j<17;j++) par[7] = par[7] ^ ((Idp01[clusterid][i0+i]>>j) & 0x1);
        Idp01[clusterid][i0+i] = Idp01[clusterid][i0+i] | ((par[0]&0x1)<<8)|((par[1]&0x1)<<7)|((par[2]&0x1)<<6)|((par[3]&0x1)<<5)
            |((par[4]&0x1)<<4)|((par[5]&0x1)<<3)|((par[6]&0x1)<<2)|((par[7]&0x1)<<1);

        par[0] = (Idp10[clusterid][i0+i]>>55) & 0x1; for(j=56;j<64;j++) par[0] = par[0] ^ ((Idp10[clusterid][i0+i]>>j) & 0x1);
        par[1] = (Idp10[clusterid][i0+i]>>46) & 0x1; for(j=47;j<55;j++) par[1] = par[1] ^ ((Idp10[clusterid][i0+i]>>j) & 0x1);
        par[2] = (Idp10[clusterid][i0+i]>>37) & 0x1; for(j=38;j<46;j++) par[2] = par[2] ^ ((Idp10[clusterid][i0+i]>>j) & 0x1);
        par[3] = (Idp10[clusterid][i0+i]>>28) & 0x1; for(j=29;j<37;j++) par[3] = par[3] ^ ((Idp10[clusterid][i0+i]>>j) & 0x1);
        par[4] = (Idp10[clusterid][i0+i]>>19) & 0x1; for(j=20;j<28;j++) par[4] = par[4] ^ ((Idp10[clusterid][i0+i]>>j) & 0x1);
        par[5] = (Idp10[clusterid][i0+i]>>10) & 0x1; for(j=11;j<19;j++) par[5] = par[5] ^ ((Idp10[clusterid][i0+i]>>j) & 0x1);
        par[6] = (Idp10[clusterid][i0+i]>> 1) & 0x1; for(j= 2;j<10;j++) par[6] = par[6] ^ ((Idp10[clusterid][i0+i]>>j) & 0x1);
        par[7] = (Idp10[clusterid][i0+i]    ) & 0x1; for(j= 9;j<17;j++) par[7] = par[7] ^ ((Idp11[clusterid][i0+i]>>j) & 0x1);
        Idp11[clusterid][i0+i] = Idp11[clusterid][i0+i] | ((par[0]&0x1)<<8)|((par[1]&0x1)<<7)|((par[2]&0x1)<<6)|((par[3]&0x1)<<5)
            |((par[4]&0x1)<<4)|((par[5]&0x1)<<3)|((par[6]&0x1)<<2)|((par[7]&0x1)<<1);
    }
#endif

    /* ISP */
    Rbuf[clusterid][0] = 0x0000000400000000ll | n ;
    for(i=0;i<n;i++) Rbuf[clusterid][i+1] = Isp[clusterid][i0+i];
    //for(i=0;i<n+1;i++) printf("i %x %d rbuf 0x%016llx isp 0x%016llx\n",i,i,Rbuf[clusterid][i],Isp[clusterid][i]);
    hib_sendMC(clusterid, n+1, Rbuf[clusterid]);

    //     printf("isp; flags data 0x%016llx\n", data);
 
    /* IDP00 */
    Rbuf[clusterid][0] = 0x0000000000000000ll | n ;
    for(i=0;i<n;i++) Rbuf[clusterid][i+1] = Idp00[clusterid][i0+i];
    //     for(i=0;i<n+1;i++) printf("idp00 i %x %d rbuf 0x%016llx\n",i,i,Rbuf[clusterid][i]);
    hib_sendMC(clusterid, n+1, Rbuf[clusterid]);

    //     printf("idp00; flags data 0x%016llx\n", data);

    /* IDP01 */
    Rbuf[clusterid][0] = 0x0000000100000000ll | n ;
    for(i=0;i<n;i++) Rbuf[clusterid][i+1] = Idp01[clusterid][i0+i];
    //     for(i=0;i<n+1;i++) printf("idp01 i %x %d rbuf 0x%016llx\n",i,i,Rbuf[clusterid][i]);
    hib_sendMC(clusterid, n+1, Rbuf[clusterid]);

    //     printf("idp01; flags data 0x%016llx\n", data);

    /* IDP10 */
    Rbuf[clusterid][0] = 0x0000000200000000ll | n ;
    for(i=0;i<n;i++) Rbuf[clusterid][i+1] = Idp10[clusterid][i0+i];
    //     for(i=0;i<n+1;i++) printf("idp10 i %x %d rbuf 0x%016llx\n",i,i,Rbuf[clusterid][i]);
    hib_sendMC(clusterid, n+1, Rbuf[clusterid]);

    //     printf("idp10; flags data 0x%016llx\n", data);

    /* IDP11 */
    Rbuf[clusterid][0] = 0x0000000300000000ll | n ;
    for(i=0;i<n;i++) Rbuf[clusterid][i+1] = Idp11[clusterid][i0+i];
    //    for(i=0;i<n+1;i++) printf("idp11 i %x %d rbuf 0x%016llx\n",i,i,Rbuf[clusterid][i]);
    hib_sendMC(clusterid, n+1, Rbuf[clusterid]);

    //     printf("idp11; flags data 0x%016llx\n", data);

    /* RUN */
    nn=1;
    Rbuf[clusterid][0] = 0x0000000800000000ll | nn;
    for(i=1;i<nn+1;i++) Rbuf[clusterid][i] = 0x0000000000000000ll;
    //     for(i=0;i<n;i++) printf("i %x %d rbuf 0x%016llx\n",i,i,Rbuf[clusterid][i]);
    hib_sendMC(clusterid, nn+1, Rbuf[clusterid]);
}

static void em_write(int clusterid, int nidp, int adrem)
{
    int nn,ii;

    nn = 5*nidp;
    Rbuf[clusterid][0] = 0x0000000700000000ll | nn;
    for(ii=0;ii<nidp;ii++){
        Rbuf[clusterid][5*ii+1] = ((UINT64)(ii+adrem))<<32 | (0x1&Idp01[clusterid][2*ii])<<24 | (0x1&Idp11[clusterid][2*ii])<<16
            | (0x1&Idp01[clusterid][2*ii+1])<<8 | (0x1&Idp11[clusterid][2*ii+1]);
        Rbuf[clusterid][5*ii+2] = Idp00[clusterid][2*ii];
        Rbuf[clusterid][5*ii+3] = Idp10[clusterid][2*ii];
        Rbuf[clusterid][5*ii+4] = Idp00[clusterid][2*ii+1];
        Rbuf[clusterid][5*ii+5] = Idp10[clusterid][2*ii+1];
    }
    //   for(ii=0;ii<nn+1;ii++) printf("idp00 ii %x %d rbuf 0x%016llx\n",ii,ii,Rbuf[clusterid][ii]);
    hib_sendMC(clusterid, nn+1, Rbuf[clusterid]);
}


static void em_write460(int clusterid, int nidp, int adrem)
{
    int nn,ii;

    nn = 5*nidp;
    Rbuf[clusterid][0] = 0x0000000700000000ll | nn;
    for(ii=0;ii<nidp;ii++){
        Rbuf[clusterid][5*ii+1] = ((UINT64)(ii+adrem))<<32 | (0xff&Idp01[clusterid][2*ii])<<24 | (0xff&Idp11[clusterid][2*ii])<<16
            | (0xff&Idp01[clusterid][2*ii+1])<<8 | (0xff&Idp11[clusterid][2*ii+1]);
        Rbuf[clusterid][5*ii+2] = Idp00[clusterid][2*ii];
        Rbuf[clusterid][5*ii+3] = Idp10[clusterid][2*ii];
        Rbuf[clusterid][5*ii+4] = Idp00[clusterid][2*ii+1];
        Rbuf[clusterid][5*ii+5] = Idp10[clusterid][2*ii+1];
    }
    //    for(ii=0;ii<2*nidp;ii++) printf("Idp ii %x %d Idp00 0x%016llx Idp10 0x%016llx\n",ii,ii,Idp00[clusterid][ii],Idp10[clusterid][ii]);
    //  for(ii=0;ii<nn+1;ii++) printf("idp00 ii %x %d rbuf 0x%016llx\n",ii,ii,Rbuf[clusterid][ii]);    
   hib_sendMC(clusterid, nn+1, Rbuf[clusterid]);

}


static void run_counter(int clusterid, int nisp,int adrisp,int nrepisp,
                        int ncntidp,int nidp,int adridp, int memadr,int nrepidp)
{
    int i,id,size;

    Rbuf[clusterid][0] = 0x0000000e00000002ll | ((UINT64)nisp)<<48 | adrisp<<16;
    Rbuf[clusterid][1] = 0x0000000000000000ll | ((UINT64)nrepisp)<<40 | ((UINT64)ncntidp)<<24 | nrepidp;
    Rbuf[clusterid][2] = 0x0000000000000000ll | ((UINT64)adridp)<<48 | ((UINT64)nidp)<<32 | memadr ;

    hib_sendMC(clusterid, 3, Rbuf[clusterid]);

}

static void reset_odp(int clusterid)
{
  Rbuf[clusterid][0] = 0x0000000b00000007ll;
  Rbuf[clusterid][1] = 0x0000000000000000ll;
  Rbuf[clusterid][2] = 0x0000000000000000ll;
  Rbuf[clusterid][3] = 0x0000000000000007ll;
  Rbuf[clusterid][4] = 0x0000000000000007ll;
  Rbuf[clusterid][5] = 0x0000000000000007ll;            
  Rbuf[clusterid][6] = 0x0000000000000007ll;
  Rbuf[clusterid][7] = 0x0000000000000000ll;            

  hib_sendMC(clusterid, 8, Rbuf[clusterid]);
}

static void set_odp_64bitmode(int clusterid)
{
  int i;

  Rbuf[clusterid][0] = 0x0000000b00000007ll;
  for(i=1;i<8;i++) Rbuf[clusterid][i] = 0x0000000000000008ll;
  hib_sendMC(clusterid, 8, Rbuf[clusterid]);
}

static void reset_hib(int clusterid)
{
  if (H[clusterid]->type == HIB_GRAPEDRG) {
    hib_mem_writeMC(clusterid, H[clusterid]->r->dma0cmd, (1 << H[clusterid]->r->dma0cmd_abort_bit));
    hib_mem_writeMC(clusterid, H[clusterid]->r->dma1cmd, (1 << H[clusterid]->r->dma1cmd_abort_bit));
  }
  else {
    hib_mem_writeMC(clusterid, H[clusterid]->r->command, (1<<H[clusterid]->r->command_dma_reset_bit));
  }
}

/*
 * Send `n` ISP words from `ispdata` in one packet.
 *
 * The header word carries selector 0x6 in the upper half, `ibegin` shifted
 * to bit 16 and the word count `n` in the low bits.  `ibegin` is widened to
 * UINT64 before the shift so that values of 32768 and above no longer
 * overflow a signed int.
 */
static void send_ispdata(int clusterid, int n, int ibegin, UINT64 ispdata[])
{
    int i;

    Rbuf[clusterid][0] = 0x0000000600000000ll | n | ((UINT64)ibegin)<<16 ;
    for(i=0;i<n;i++) Rbuf[clusterid][i+1] = ispdata[i];
    hib_sendMC(clusterid, n+1, Rbuf[clusterid]);

}

#include "./mtemp.c"

/*
 * initialize variables used for "standard" functions.  this
 * initialization is not necessary for "primitive" functions, or even
 * harmful for use in multi-threaded application.
 *
 * Environment variables consulted:
 *   G5_WARNLEVEL  non-negative integer; becomes Warn_level (first call only)
 *   G5_SENDFUNC   "DMAR" selects SENDFUNC_DMAR (first call only)
 *   GDEVICE       space separated list of device ids to use;
 *                 G5_CARDS is accepted as a fallback for backward
 *                 compatibility.  When neither is set, all devices
 *                 reported by hib_ndevice() are selected.
 *
 * The strings returned by getenv() are only read, never modified.
 * An id listed more than once is counted once in Ndevice.
 * An out-of-range id prints a message and terminates with exit status 2.
 */
static void init_envs(void)
{
    int ic;
    int ndev;
    const char *p;
    static int firstcall = 1;

    if (firstcall) {
        firstcall = 0;
        p = getenv("G5_WARNLEVEL");
        if (p) {
            while (*p == ' ') {
                p++;
            }
            /* an empty or all-blank value leaves Warn_level untouched */
            if (*p != '\0') {
                int tmp = atoi(p);
                if (0 <= tmp) {
                    Warn_level = tmp;
                }
            }
            WARN(3, "Warn_level: %d\n", Warn_level);
        }
        hib_set_warn_level(Warn_level);

        p = getenv("G5_SENDFUNC");
        if (p) {
            if (0 == strcmp("DMAR", p)) {
                Sendfunc = SENDFUNC_DMAR;
            }
        }
    }

    if (Ndevice == 0) {
        /* cards are not allocated yet.
           try to allocate cards specified by environment variable "GDEVICE".
           try to allocate all cards, if GDEVICE is not set. */

        ndev = hib_ndevice();
        p = getenv("GDEVICE");
        if (!p) { // for backward compatibility.
            p = getenv("G5_CARDS");
        }
        if (p) { // parse GDEVICE
            for (ic = 0; ic < ndev; ic++) {
                Device[ic] = 0;
            }
            while (*p != '\0') {
                while (*p == ' ') {
                    p++;
                }
                if (*p == '\0') {
                    break;
                }
                ic = atoi(p);   /* stops at the end of the current token */
                if (ic < 0 || ic >= ndev) {
                    fprintf(stderr, "GDEVICE (or G5_CARDS) have device_id out of range: %d\n", ic);
                    exit(2);
                }
                if (!Device[ic]) {   /* count each device only once */
                    Device[ic] = 1;
                    Ndevice++;
                }
                while (*p != '\0' && *p != ' ') {
                    p++;
                }
            }
        }
        else { // GDEVICE is not set
            Ndevice = ndev;
            for (ic = 0; ic < ndev; ic++) {
                Device[ic] = 1;
            }
        }
    }
}

/*
 * Raise all four "resend" flags (ip, index, run, fo) for every device
 * reported by hib_ndevice().
 */
static void set_resend_flags(void)
{
    int dev = 0;

    while (dev < hib_ndevice()) {
        Resend_ip[dev] = 1;
        Resend_index[dev] = 1;
        Resend_run[dev] = 1;
        Resend_fo[dev] = 1;
        dev++;
    }
}

/*
 * GRAPE-5 primitive functions
 */

#include "./vsm/vsmconst0.c"
#include "./vsm/vsmconst.gravity.c"
#include "./vsm/vsmconst.gravity2.c"
#include "./vsm/vsmconst.gravity3.c"
#include "./vsm/vsmconst.gravity_cutoff.c"
#include "./vsm/vsmconst.gravity_cutoff2.c"
#include "./vsm/vsmconst.gravity2_nnb.c"
#include "./vsm/vsmconst.gravity2_nnb_cutoff.c"

void g5_openMC(int clusterid)
{    
    int i0,i1,i;
    UINT32 binfo,productid,modelid,verid;

    WARN(4, "g5_openMC(%d)\n", clusterid);

    init_envs();
    set_resend_flags();
    H[clusterid] = hib_openMC(clusterid);
    Wbuf[clusterid] = H[clusterid]->dmaw_buf;
    hib_set_sendfuncMC(clusterid, SENDFUNC_PIOW);
    Rbuf[clusterid] = Piowbuf[clusterid];
    //    hib_set_sendfuncMC(clusterid, SENDFUNC_DMAR);
    //    Rbuf[clusterid] = H[clusterid]->dmar_buf;

    //#define DMASTAT  (0x20)
    //    TBmemWrite(0, DMASTAT, 0x80000000);
    reset_hib(clusterid);

    Snpipe[clusterid] = NPIPE_DEFAULT;
    Sflag_cutoff[clusterid] = 0;
    Sflag_nnb[clusterid] = 0;    

    if(Snpipe[clusterid]==128)  gravity_send_vsm_constant(clusterid);
    if(Snpipe[clusterid]==256)  gravity2_send_vsm_constant(clusterid);    
    if(Snpipe[clusterid]==384)  gravity3_send_vsm_constant(clusterid);

    binfo = hib_mem_readMC(clusterid, H[clusterid]->r->boardinfo);
    productid = (binfo >> 28) & 0xf;
    modelid = (binfo >> 24) & 0xf;
    Version[clusterid] = (binfo >> 16) & 0xff;    
    if(modelid==3) Model[clusterid] = 460;
    if(modelid==4) Model[clusterid] = 460;    

    //    fprintf(stderr,"ver model %d %d\n",Version[clusterid],Model[clusterid]);
    
    if(Model[clusterid]==460) mtempMC(clusterid);

     reset_odp(clusterid);   // reset odpfifo and flag
     reset_hib(clusterid);
     reset_hib(clusterid);
     reset_hib(clusterid);
    if(Model[clusterid]==460)  set_odp_64bitmode(clusterid);

}

/*
 * Close cluster `clusterid` by forwarding to hib_closeMC().
 * None of the file-scope state set up by g5_openMC() is cleared here.
 */
void g5_closeMC(int clusterid)
{
    hib_closeMC(clusterid);
}

void g5_set_rangeMC(int clusterid, double xmin, double xmax, double mmin)
{
    Xscale[clusterid] = 1.0/(xmax - xmin);
    Xscale2[clusterid] = Xscale[clusterid]*Xscale[clusterid];
    Xoffset[clusterid] = xmin;      

    //    printf("xscale %g %g xoffset %g\n",Xscale[clusterid],Xscale2[clusterid],Xoffset[clusterid]);
}

int g5_get_number_of_pipelinesMC(int clusterid)
{  
#ifdef DEFECT
  return Snpipe[clusterid]-2;  
#else
  return Snpipe[clusterid];
#endif
}

/*
 * Return the j-memory size, which is the same constant (4194304 = 2^22)
 * for every cluster.
 */
int g5_get_jmemsizeMC(int clusterid)
{
    enum { JMEMSIZE = 1 << 22 };

    (void)clusterid;
    return JMEMSIZE;
}

/*
 * Remember `n` as the j-particle count of `clusterid` (stored in Snj[]).
 * Nothing is sent to the hardware here.
 */
void g5_set_nMC(int clusterid, int n)
{
    Snj[clusterid] = n;
}

/*
 * One-double record filled by g5_set_eps2_to_allMC() and handed to
 * SING_LM_write().  The single instance below has external linkage and is
 * shared by all clusters.
 */
struct eps2_consts{
   double _const_0;
};
struct eps2_consts eps2_constants;

/*
 * Set one softening value for `clusterid`.
 *
 * For board version 3 the value written is eps2 * Xscale2[clusterid]; for
 * any other version it is eps2 unchanged.  The value is placed in
 * eps2_constants and written with SING_LM_write_regconv()/SING_LM_write()
 * as a single word at address 254.
 */
void  g5_set_eps2_to_allMC(int clusterid, double eps2)
{
      const int word_count = 1;
      const int lm_address = 254;
      int conversions[1];

      conversions[0] = 1;

      eps2_constants._const_0 = (Version[clusterid] == 3) ? eps2 * Xscale2[clusterid] : eps2;

      SING_LM_write_regconv(clusterid, word_count, conversions);
      SING_LM_write(clusterid, &eps2_constants, lm_address, 0);

}

/*
 * Per-particle softening is not supported: report that on stderr and
 * terminate the program with exit status 1.  All arguments are ignored.
 */
void g5_set_eps2MC(int clusterid, int ni, double *eps2)
{
    (void)clusterid;
    (void)ni;
    (void)eps2;

    fputs("g5_set_eps2MC() not implemented yet.\n", stderr);
    exit(1);
}


/*
 * One-double record filled by g5_set_etaMC() and handed to SING_LM_write().
 * The single instance below has external linkage and is shared by all
 * clusters.
 */
struct etainv_consts{
   double _const_0;
};
struct etainv_consts etainv_constants;

/*
 * Set eta for `clusterid`; the reciprocal is what gets written.
 *
 * For board version 3 the value written is 1/(eta * Xscale[clusterid]); for
 * any other version it is 1/eta.  The value is placed in etainv_constants
 * and written with SING_LM_write_regconv()/SING_LM_write() as a single word
 * at address 252.  No check is made for eta == 0.
 */
void g5_set_etaMC(int clusterid, double eta)
{
      const int word_count = 1;
      const int lm_address = 252;
      int conversions[1];

      conversions[0] = 1;

      etainv_constants._const_0 = (Version[clusterid] == 3) ? 1.0/(eta * Xscale[clusterid]) : 1.0/eta;

      SING_LM_write_regconv(clusterid, word_count, conversions);
      SING_LM_write(clusterid, &etainv_constants, lm_address, 0);
}


void g5_set_jpMC(int clusterid, int adr, int nj, double *m, double (*x)[3])
{
    double xtmp[3];    
    int i0,i1,i,nn,ii,nidp,nii,iii;
    UINT64 id,adrem,tmp1=0,tmp2;


    if(Model[clusterid]==460){
      if(Version[clusterid]==3){  // version 3

        Idp00[clusterid][0] = 0;  Idp01[clusterid][0] = 0; Idp10[clusterid][0] = 0; Idp11[clusterid][0] = 0;
        Idp00[clusterid][1] = 0;  Idp01[clusterid][1] = 0; Idp11[clusterid][1] = 0x1;
        Idp10[clusterid][1] = 0x0400003f00000000 ; 
        for(i=0;i<16;i++){
          Idp01[clusterid][2+2*i] = 0x1;
          Idp11[clusterid][2+2*i] = 0x1;
          Idp01[clusterid][3+2*i] = 0x1;
          Idp11[clusterid][3+2*i] = 0x13;
        }
        nidp = 17;	

        for(ii=0;ii<nj;ii+=16){
          if((ii+16)>nj){
             nii = nj - ii;
          }else{
            nii = 16;
          }
          adrem = (ii/16) * nidp;

          for(i=0;i<nii;i++){
            xtmp[0] = (x[i+ii][0]-Xoffset[clusterid])*Xscale[clusterid] + 1.0;
            xtmp[1] = (x[i+ii][1]-Xoffset[clusterid])*Xscale[clusterid] + 1.0;
            xtmp[2] = (x[i+ii][2]-Xoffset[clusterid])*Xscale[clusterid] + 1.0;	    	    
            Idp00[clusterid][2+2*i] = *((UINT64 *)(&xtmp[0]));
            Idp10[clusterid][2+2*i] = *((UINT64 *)(&xtmp[1]));
            Idp00[clusterid][3+2*i] = *((UINT64 *)(&xtmp[2]));
	    tmp2 = *((UINT64 *)(&m[i+ii]));
            Idp10[clusterid][3+2*i] = (0xc000000000000000&tmp2) | (0x3fffffff00000000&(tmp2<<3));
          }

  	  for(i=nii;i<16;i++) Idp10[clusterid][3+2*i] = 0;       // set mj=0 if (nj%16)!=0

//	  em_write460(clusterid, nidp, adrem); 

          nn = 38;
          Rbuf[clusterid][0] = 0x0000000700000000ll | nn;
            iii=0;
            Rbuf[clusterid][1] = ((UINT64)(iii+adrem))<<32 | 0x00000001;
            Rbuf[clusterid][2] = Idp00[clusterid][2*iii];
            Rbuf[clusterid][3] = Idp10[clusterid][2*iii];
            Rbuf[clusterid][4] = Idp00[clusterid][2*iii+1];
            Rbuf[clusterid][5] = Idp10[clusterid][2*iii+1];

          for(iii=1;iii<nidp;iii++){
            if(iii==1){
	      Rbuf[clusterid][6] = ((UINT64)1)<<63 | ((UINT64)16)<<56 |  ((UINT64)(iii+adrem))<<32 | 0x01010113;
	    }
            Rbuf[clusterid][2*iii+5] = (0xffffffff00000000 & (Idp00[clusterid][2*iii]<<12)) | (0x00000000ffffffff &  (Idp10[clusterid][2*iii]>>20)) ;
            Rbuf[clusterid][2*iii+6] = (0xffffffff00000000 & (Idp00[clusterid][2*iii+1]<<12)) | (0x00000000ffffffff &  (Idp10[clusterid][2*iii+1]>>32)) ;
          }
	  //   for(iii=0;iii<nn+1;iii++) printf("idp00 ii %x %d rbuf 0x%016llx\n",iii,iii,Rbuf[clusterid][iii]);

          hib_sendMC(clusterid, nn+1, Rbuf[clusterid]);

        }

        for(i=0;i<(2*nidp);i++) Idp00[clusterid][i]=Idp01[clusterid][i]=Idp10[clusterid][i]=Idp11[clusterid][i]= 0;
        adrem += nidp;
        em_write460(clusterid, nidp, adrem);

      }else{  // version 2

        Idp00[clusterid][0] = 0;  Idp01[clusterid][0] = 0; Idp10[clusterid][0] = 0; Idp11[clusterid][0] = 0;
        Idp00[clusterid][1] = 0;  Idp01[clusterid][1] = 0; Idp11[clusterid][1] = 0x1;
        Idp10[clusterid][1] = 0x0400003f00000000 ; 
        for(i=0;i<16;i++){
          Idp01[clusterid][2+2*i] = 0x1;
          Idp11[clusterid][2+2*i] = 0x1;
          Idp01[clusterid][3+2*i] = 0x1;
          Idp11[clusterid][3+2*i] = 0x13;
        }
        nidp = 17;	

        for(ii=0;ii<nj;ii+=16){
          if((ii+16)>nj){
            nii = nj - ii;
          }else{
            nii = 16;
          }
          adrem = (ii/16) * nidp;

          for(i=0;i<nii;i++){
            Idp00[clusterid][2+2*i] = *((UINT64 *)(&x[i+ii][0]));
            Idp10[clusterid][2+2*i] = *((UINT64 *)(&x[i+ii][1]));
            Idp00[clusterid][3+2*i] = *((UINT64 *)(&x[i+ii][2]));
	    tmp2 = *((UINT64 *)(&m[i+ii]));
            Idp10[clusterid][3+2*i] = (0xc000000000000000&tmp2) | (0x3fffffff00000000&(tmp2<<3));
         }

	for(i=nii;i<16;i++) Idp10[clusterid][3+2*i] = 0;       // set mj=0 if (nj%16)!=0

        em_write460(clusterid, nidp, adrem); 
      }

      for(i=0;i<(2*nidp);i++) Idp00[clusterid][i]=Idp01[clusterid][i]=Idp10[clusterid][i]=Idp11[clusterid][i]= 0;
      adrem += nidp;
      em_write460(clusterid, nidp, adrem);

      }
    }else{ // if model

    tmp1 =  *((UINT64 *)(&Seps2[clusterid]));
    nidp = 21;	
    for(ii=0;ii<nj;ii+=16){
        if((ii+16)>nj){
            nii = nj - ii;
        }else{
            nii = 16;
        }
        adrem = (ii/16) * nidp;

        Idp00[clusterid][0] = 0;  Idp01[clusterid][0] = 0; Idp10[clusterid][0] = 0; Idp11[clusterid][0] = 0;
        Idp00[clusterid][1] = 0;  Idp01[clusterid][1] = 0; Idp11[clusterid][1] = 0x1;
        Idp10[clusterid][1] = 0x0500003f00000000 ; 

        for(i=0;i<8;i++){
            id = 2*i + ii + 128;
            Idp00[clusterid][2+5*i] = *((UINT64 *)(&x[2*i+ii][0]));                                  Idp01[clusterid][2+5*i] = 0x1;
            Idp10[clusterid][2+5*i] = *((UINT64 *)(&x[2*i+ii][1]));                                  Idp11[clusterid][2+5*i] = 0x1;
            Idp00[clusterid][3+5*i] = *((UINT64 *)(&x[2*i+ii][2]));                                  Idp01[clusterid][3+5*i] = 0x1;
            Idp10[clusterid][3+5*i] = (0xfffffffff0000000&(*((UINT64 *)(&m[2*i+ii])))) | (0xfffffff&(tmp1>>36));       
            Idp11[clusterid][3+5*i] = (0xff&(tmp1>>28))<<9 | 0x1;
            Idp00[clusterid][4+5*i] = id<<28;                                                        Idp01[clusterid][4+5*i] = 0x1;

            Idp10[clusterid][4+5*i] = *((UINT64 *)(&x[2*i+ii+1][0]));                                Idp11[clusterid][4+5*i] = 0x1;
            Idp00[clusterid][5+5*i] = *((UINT64 *)(&x[2*i+ii+1][1]));                                Idp01[clusterid][5+5*i] = 0x1;
            Idp10[clusterid][5+5*i] = *((UINT64 *)(&x[2*i+ii+1][2]));                                Idp11[clusterid][5+5*i] = 0x1;
            Idp00[clusterid][6+5*i] = (0xfffffffff0000000&(*((UINT64 *)(&m[2*i+ii+1])))) | (0xfffffff&(tmp1>>36)); 
            Idp01[clusterid][6+5*i] = (0xff&(tmp1>>28))<<9 | 0x1;
            Idp10[clusterid][6+5*i] = (id+1)<<28;                                                    Idp11[clusterid][6+5*i] = 0x1;
        }

        if(nii<16){                         // set mj=0 if (nj%16)!=0
            if((nii%2)==1){
                i=nii/2;
                Idp00[clusterid][6+5*i] = (0xfffffff&(tmp1>>36)); 
                Idp01[clusterid][6+5*i] = (0xff&(tmp1>>28))<<9 | 0x1;
            }
        }
        if(nii<15){
            for(i=(nii+1)/2;i<8;i++){
                Idp10[clusterid][3+5*i] = (0xfffffff&(tmp1>>36));       
                Idp11[clusterid][3+5*i] = (0xff&(tmp1>>28))<<9 | 0x1;
                Idp00[clusterid][6+5*i] = (0xfffffff&(tmp1>>36)); 
                Idp01[clusterid][6+5*i] = (0xff&(tmp1>>28))<<9 | 0x1;
            }
        }
        em_write(clusterid, nidp, adrem); 
    }

    for(i=0;i<(2*nidp);i++) Idp00[clusterid][i]=Idp01[clusterid][i]=Idp10[clusterid][i]=Idp11[clusterid][i]= 0;
    adrem += nidp;
    em_write(clusterid, nidp, adrem);
    adrem += nidp;
    em_write(clusterid, nidp, adrem);

    }
}



void g5_set_xiMC(int clusterid, int ni, double (*x)[3])
{
  int i,ii,i0,i1,iii,n,ibegin,adrem,nidp,niii,k,kk,nn;
  UINT64 id,ipe,tmp0;
  UINT64 adr_xi[3],adr_yi[3],adr_zi[3],adr_idi[3]; 
  double xitmp00,xitmp10,xitmp20,xitmp30;
  double xitmp01,xitmp11,xitmp21,xitmp31;
  double xitmp02,xitmp12,xitmp22,xitmp32;  
  double xs,xo;

  if(Snpipe[clusterid]==128){
    adr_xi[0] = 0x0;  adr_yi[0] = 0x8;  adr_zi[0] = 0x10;  adr_idi[0] = 0x18;
    niii = 1;
  }
  if(Snpipe[clusterid]==256){
    adr_xi[0] = 0x0;  adr_yi[0] = 0x8;  adr_zi[0] = 0x10;  adr_idi[0] = 0x18;
    adr_xi[1] = 0x1c;  adr_yi[1] = 0x24;  adr_zi[1] = 0x2c;  adr_idi[1] = 0x34;    
    niii = 2;
  }
  if(Snpipe[clusterid]==384){
    adr_xi[0] = 0x0;  adr_yi[0] = 0x8;  adr_zi[0] = 0x10;  adr_idi[0] = 0x18;
    adr_xi[1] = 0x1c;  adr_yi[1] = 0x24;  adr_zi[1] = 0x2c;  adr_idi[1] = 0x34;
    adr_xi[2] = 0x38;  adr_yi[2] = 0x40;  adr_zi[2] = 0x48;  adr_idi[2] = 0x50;        
    niii = 3;
  }

    if(Model[clusterid]==460){
      
      if(Resend_ip[clusterid] == 1){
        for(iii=0;iii<niii;iii++){
          Isp[clusterid][0] = 0x8024000000000000; Isp[clusterid][1] = 0x0001470000000000;
          Isp[clusterid][2] = 0x8024000000000000; Isp[clusterid][3] = 0x0001470000000000;
          i=4;
          for(ii=0;ii<32;ii++){
            ipe = ii;
            Isp[clusterid][i]    = 0x8080000000000000; Isp[clusterid][i+1]  = 0x0002470000000000;
            Isp[clusterid][i+2]  = 0x8024000000000000; Isp[clusterid][i+3]  = 0x0001420000000000;
            Isp[clusterid][i+4]  = 0x84000000006000a0 | (adr_xi[iii]<<11); 
            Isp[clusterid][i+5]  = (0x00000000001 + (24*ipe << 6)) <<20; 
            Isp[clusterid][i+6]  = 0x84000000006000a0 | (adr_yi[iii]<<11); 
            Isp[clusterid][i+7]  = (0x00000000201 + (24*ipe << 6)) <<20; 
            Isp[clusterid][i+8]  = 0x84000000006000a0 | (adr_zi[iii]<<11);  
            Isp[clusterid][i+9]  = (0x00000000401 + (24*ipe << 6)) <<20; 
            i+=10; 
          }
          n = 324; ibegin = 1000 + 324 * iii;     
          send_ispdata(clusterid,n,ibegin,Isp[clusterid]);           // ISP: BM -> LM  
        }
        Resend_ip[clusterid] = 0;
      }

      if(Version[clusterid] == 3){
        int iioffset;
	xs = Xscale[clusterid];
        xo = Xoffset[clusterid];      
	iioffset = 0;
        for(iii=0;iii<niii;iii++){
          Idp00[clusterid][0] = 0; Idp01[clusterid][0] = 0; Idp10[clusterid][0] = 0 ;                 Idp11[clusterid][0] = 0;
          Idp00[clusterid][1] = 0; Idp01[clusterid][1] = 0; Idp10[clusterid][1] = 0xc000000000000000 ;Idp11[clusterid][1] = 0x1;  
          i=2;
          for(ii=(128*iii);ii<(128*iii+128);ii+=4){
 	     xitmp00 = (x[iioffset+ii][0]  -xo)*xs+1.0 ;
      	     xitmp01 = (x[iioffset+ii][1]  -xo)*xs+1.0 ;
   	     xitmp02 = (x[iioffset+ii][2]  -xo)*xs+1.0 ;
   	     xitmp10 = (x[iioffset+ii+1][0]-xo)*xs+1.0 ;
   	     xitmp11 = (x[iioffset+ii+1][1]-xo)*xs+1.0 ;
   	     xitmp12 = (x[iioffset+ii+1][2]-xo)*xs+1.0 ;
       	     xitmp20 = (x[iioffset+ii+2][0]-xo)*xs+1.0 ;
   	     xitmp21 = (x[iioffset+ii+2][1]-xo)*xs+1.0 ;
   	     xitmp22 = (x[iioffset+ii+2][2]-xo)*xs+1.0 ;
  	     xitmp30 = (x[iioffset+ii+3][0]-xo)*xs+1.0 ;
   	     xitmp31 = (x[iioffset+ii+3][1]-xo)*xs+1.0 ;
   	     xitmp32 = (x[iioffset+ii+3][2]-xo)*xs+1.0 ;
#ifdef DEFECT
	     if(ii==(DEFECTPE-3)) iioffset = -1;
	     if(ii==(128+(DEFECTPE-3))) iioffset = -2;          // valid only for (ii%4)==3
#endif
            Idp00[clusterid][i]   = *((UINT64 *)(&xitmp00)); Idp01[clusterid][i] = 0x1;
            Idp10[clusterid][i]   = *((UINT64 *)(&xitmp10)); Idp11[clusterid][i] = 0x1;
            Idp00[clusterid][i+1] = *((UINT64 *)(&xitmp20)); Idp01[clusterid][i+1] = 0x1;
            Idp10[clusterid][i+1] = *((UINT64 *)(&xitmp30)); Idp11[clusterid][i+1] = 0x1;
            Idp00[clusterid][i+2] = *((UINT64 *)(&xitmp01)); Idp01[clusterid][i+2] = 0x1;
            Idp10[clusterid][i+2] = *((UINT64 *)(&xitmp11)); Idp11[clusterid][i+2] = 0x1;
            Idp00[clusterid][i+3] = *((UINT64 *)(&xitmp21)); Idp01[clusterid][i+3] = 0x1;
            Idp10[clusterid][i+3] = *((UINT64 *)(&xitmp31)); Idp11[clusterid][i+3] = 0x1;
            Idp00[clusterid][i+4] = *((UINT64 *)(&xitmp02)); Idp01[clusterid][i+4] = 0x1;
            Idp10[clusterid][i+4] = *((UINT64 *)(&xitmp12)); Idp11[clusterid][i+4] = 0x1;
            Idp00[clusterid][i+5] = *((UINT64 *)(&xitmp22)); Idp01[clusterid][i+5] = 0x1;
            Idp10[clusterid][i+5] = *((UINT64 *)(&xitmp32)); Idp11[clusterid][i+5] = 0x1;

            i+=6;
            if(i==98){
              Idp00[clusterid][98] = 0; Idp01[clusterid][98] = 0; Idp10[clusterid][98] = 0 ;                 Idp11[clusterid][98] = 0;
              Idp00[clusterid][99] = 0; Idp01[clusterid][99] = 0; Idp10[clusterid][99] = 0xc00c000000000000 ;Idp11[clusterid][99] = 0x1;  
  	      i+=2;
	    }
          }
          adrem = 0x7ff000;     nidp = 98;

	  //	  em_write460(clusterid, nidp, adrem);                     // host-> EM 

	  //          nn = 5*nidp;
	  nn = 204;
          Rbuf[clusterid][0] = 0x0000000700000000ll | nn;

          ii=0;
          {
            Rbuf[clusterid][1] = ((UINT64)(ii+adrem))<<32 | 0x00000001;
            Rbuf[clusterid][2] = Idp00[clusterid][2*ii];
            Rbuf[clusterid][3] = Idp10[clusterid][2*ii];
            Rbuf[clusterid][4] = Idp00[clusterid][2*ii+1];
            Rbuf[clusterid][5] = Idp10[clusterid][2*ii+1];
          }
          for(ii=1;ii<49;ii++){
	    if(ii==1){
  	      Rbuf[clusterid][6] = ((UINT64)1)<<63 | ((UINT64)48)<<56 | ((UINT64)(ii+adrem))<<32 | 0x01010101;
	    }
            Rbuf[clusterid][2*ii+5] = (0xffffffff00000000 & (Idp00[clusterid][2*ii]<<12)) | (0x00000000ffffffff &  (Idp10[clusterid][2*ii]>>20)) ;
            Rbuf[clusterid][2*ii+6] = (0xffffffff00000000 & (Idp00[clusterid][2*ii+1]<<12)) | (0x00000000ffffffff &  (Idp10[clusterid][2*ii+1]>>20)) ;
          }
          ii=49;
          {
            Rbuf[clusterid][103] = ((UINT64)(ii+adrem))<<32 | 0x00000001;
            Rbuf[clusterid][104] = Idp00[clusterid][2*ii];
            Rbuf[clusterid][105] = Idp10[clusterid][2*ii];
            Rbuf[clusterid][106] = Idp00[clusterid][2*ii+1];
            Rbuf[clusterid][107] = Idp10[clusterid][2*ii+1];
          }
          for(ii=50;ii<98;ii++){
	    if(ii==50){
	      Rbuf[clusterid][108] = ((UINT64)1)<<63 | ((UINT64)48)<<56 | ((UINT64)(ii+adrem))<<32 | 0x01010101;
	    }
            Rbuf[clusterid][2*ii+9] = (0xffffffff00000000 & (Idp00[clusterid][2*ii]<<12)) | (0x00000000ffffffff &  (Idp10[clusterid][2*ii]>>20)) ;
            Rbuf[clusterid][2*ii+10] = (0xffffffff00000000 & (Idp00[clusterid][2*ii+1]<<12)) | (0x00000000ffffffff &  (Idp10[clusterid][2*ii+1]>>20)) ;
          }
	  //for(ii=0;ii<nn+1;ii++) printf("idp00 ii %x %d rbuf 0x%016llx\n",ii,ii,Rbuf[clusterid][ii]);    
          hib_sendMC(clusterid, nn+1, Rbuf[clusterid]);

          run_counter(clusterid,0,0,0,nidp,nidp,0,adrem,1);     // EM -> IDP -> BM
          n = 324; ibegin = 1000 + 324 * iii;         

          run_counter(clusterid,n,ibegin,1,0,0,0,0,0);       // BM -> LM  
        }

      }else{   // version 2

      for(iii=0;iii<niii;iii++){
        Idp00[clusterid][0] = 0; Idp01[clusterid][0] = 0; Idp10[clusterid][0] = 0 ;                 Idp11[clusterid][0] = 0;
        Idp00[clusterid][1] = 0; Idp01[clusterid][1] = 0; Idp10[clusterid][1] = 0xc000000000000000 ;Idp11[clusterid][1] = 0x1;  
        i=2;
        for(ii=(128*iii);ii<(128*iii+128);ii+=4){
          Idp00[clusterid][i]   = *((UINT64 *)(&x[ii][0]));   Idp01[clusterid][i] = 0x1;
          Idp10[clusterid][i]   = *((UINT64 *)(&x[ii+1][0])); Idp11[clusterid][i] = 0x1;
          Idp00[clusterid][i+1] = *((UINT64 *)(&x[ii+2][0])); Idp01[clusterid][i+1] = 0x1;
          Idp10[clusterid][i+1] = *((UINT64 *)(&x[ii+3][0])); Idp11[clusterid][i+1] = 0x1;
          Idp00[clusterid][i+2] = *((UINT64 *)(&x[ii][1]));   Idp01[clusterid][i+2] = 0x1;
          Idp10[clusterid][i+2] = *((UINT64 *)(&x[ii+1][1])); Idp11[clusterid][i+2] = 0x1;
          Idp00[clusterid][i+3] = *((UINT64 *)(&x[ii+2][1])); Idp01[clusterid][i+3] = 0x1;
          Idp10[clusterid][i+3] = *((UINT64 *)(&x[ii+3][1])); Idp11[clusterid][i+3] = 0x1;
          Idp00[clusterid][i+4] = *((UINT64 *)(&x[ii][2]));   Idp01[clusterid][i+4] = 0x1;
          Idp10[clusterid][i+4] = *((UINT64 *)(&x[ii+1][2])); Idp11[clusterid][i+4] = 0x1;
          Idp00[clusterid][i+5] = *((UINT64 *)(&x[ii+2][2])); Idp01[clusterid][i+5] = 0x1;
          Idp10[clusterid][i+5] = *((UINT64 *)(&x[ii+3][2])); Idp11[clusterid][i+5] = 0x1;
          i+=6;
          if(i==98){
            Idp00[clusterid][98] = 0; Idp01[clusterid][98] = 0; Idp10[clusterid][98] = 0 ;                 Idp11[clusterid][98] = 0;
            Idp00[clusterid][99] = 0; Idp01[clusterid][99] = 0; Idp10[clusterid][99] = 0xc00c000000000000 ;Idp11[clusterid][99] = 0x1;  
	    i+=2;
	  }
        }
        adrem = 0x7ff000;     nidp = 98;
	//	em_write460_b(clusterid, nidp, adrem);                     // host-> EM 
	//        usleep(10);
	
	em_write460(clusterid, nidp, adrem);                     // host-> EM 
        run_counter(clusterid,0,0,0,nidp,nidp,0,adrem,1);     // EM -> IDP -> BM
	
        n = 324; ibegin = 1000 + 324 * iii;         
        run_counter(clusterid,n,ibegin,1,0,0,0,0,0);       // BM -> LM  
      }
      }

    }else{ // if model

    if(Resend_ip[clusterid] == 1){
      for(iii=0;iii<niii;iii++){
        Isp[clusterid][0] = 0x8024000000000000; Isp[clusterid][1] = 0x0001470000000000;
        Isp[clusterid][2] = 0x8024000000000000; Isp[clusterid][3] = 0x0001470000000000;
        i=4;
        for(ii=0;ii<32;ii++){
            ipe = ii;
            Isp[clusterid][i]    = 0x8080000000000000; Isp[clusterid][i+1]  = 0x0002470000000000;
            Isp[clusterid][i+2]  = 0x8024000000000000; Isp[clusterid][i+3]  = 0x0001420000000000;
            Isp[clusterid][i+4]  = 0x84000000006000a0 | (adr_xi[iii]<<11); 
            Isp[clusterid][i+5]  = (0x00000000001 + (28*ipe << 6)) <<20; 
            Isp[clusterid][i+6]  = 0x84000000006000a0 | (adr_yi[iii]<<11); 
            Isp[clusterid][i+7]  = (0x00000000201 + (28*ipe << 6)) <<20; 
            Isp[clusterid][i+8]  = 0x84000000006000a0 | (adr_zi[iii]<<11);  
            Isp[clusterid][i+9]  = (0x00000000401 + (28*ipe << 6)) <<20; 
            Isp[clusterid][i+10] = 0x44000000006000a0 | (adr_idi[iii]<<11);  
            Isp[clusterid][i+11] = (0x00000000601 + (28*ipe << 6)) <<20; 
            i+=12; 
        }
        n = 388; ibegin = 1000 + 388 * iii;     
        send_ispdata(clusterid,n,ibegin,Isp[clusterid]);           // ISP: BM -> LM  
      }
      Resend_ip[clusterid] = 0;
    }

    Idp00[clusterid][0] = 0;   Idp01[clusterid][0] = 0;   Idp10[clusterid][0] = 0 ;                   Idp11[clusterid][0] = 0;
    Idp00[clusterid][1] = 0;   Idp01[clusterid][1] = 0;   Idp10[clusterid][1] = 0 ;                   Idp11[clusterid][1] = 0x1;  
    Idp00[clusterid][130] = 0; Idp01[clusterid][130] = 0; Idp10[clusterid][130] = 0 ;                 Idp11[clusterid][130] = 0;
    Idp00[clusterid][131] = 0; Idp01[clusterid][131] = 0; Idp10[clusterid][131] = 0xc010000000000000 ;Idp11[clusterid][131] = 0x1;  

    for(iii=0;iii<niii;iii++){
      i=2;
      for(ii=(128*iii);ii<(128*(iii+1));ii+=4){
        id=ii ;
        Idp00[clusterid][i]   = *((UINT64 *)(&x[ii][0]));   Idp01[clusterid][i] = 0x1;
        Idp10[clusterid][i]   = *((UINT64 *)(&x[ii+1][0])); Idp11[clusterid][i] = 0x1;
        Idp00[clusterid][i+1] = *((UINT64 *)(&x[ii+2][0])); Idp01[clusterid][i+1] = 0x1;
        Idp10[clusterid][i+1] = *((UINT64 *)(&x[ii+3][0])); Idp11[clusterid][i+1] = 0x1;
        if(i==128) i+=2;
        Idp00[clusterid][i+2] = *((UINT64 *)(&x[ii][1]));   Idp01[clusterid][i+2] = 0x1;
        Idp10[clusterid][i+2] = *((UINT64 *)(&x[ii+1][1])); Idp11[clusterid][i+2] = 0x1;
        Idp00[clusterid][i+3] = *((UINT64 *)(&x[ii+2][1])); Idp01[clusterid][i+3] = 0x1;
        Idp10[clusterid][i+3] = *((UINT64 *)(&x[ii+3][1])); Idp11[clusterid][i+3] = 0x1;
        Idp00[clusterid][i+4] = *((UINT64 *)(&x[ii][2]));   Idp01[clusterid][i+4] = 0x1;
        Idp10[clusterid][i+4] = *((UINT64 *)(&x[ii+1][2])); Idp11[clusterid][i+4] = 0x1;
        Idp00[clusterid][i+5] = *((UINT64 *)(&x[ii+2][2])); Idp01[clusterid][i+5] = 0x1;
        Idp10[clusterid][i+5] = *((UINT64 *)(&x[ii+3][2])); Idp11[clusterid][i+5] = 0x1;
  	tmp0 =  *((UINT64 *)(&Setainv[clusterid]));
  	Idp00[clusterid][i+6] =  (0xfffffffff0000000&tmp0) | (0xfffffff&(tmp0>>36));
        Idp01[clusterid][i+6] =  (0xff&(tmp0>>28))<<9 | 0x1;
	Idp10[clusterid][i+6] =  (0xfffffffff0000000&tmp0) | (0xfffffff&(tmp0>>36));
        Idp11[clusterid][i+6] =  (0xff&(tmp0>>28))<<9 | 0x1;

        i+=7;
      }
      adrem = 0x7ff000;     nidp = 228/2;
      em_write(clusterid, nidp, adrem);                     // host-> EM 
      run_counter(clusterid,0,0,0,nidp,nidp,0,adrem,1);     // EM -> IDP -> BM

      n = 388; ibegin = 1000 + 388 * iii;         
      run_counter(clusterid,n,ibegin,1,0,0,0,0,0);       // BM -> LM  
    }
    }
}

/*
 * g5_runMC: start one force calculation on cluster `clusterid`.
 *
 * If Resend_run[clusterid] is set, the ISP "init" and "loop" programs are
 * first rebuilt in Isp[clusterid][] from generated ./vsm/ fragments that are
 * textually #included below, and uploaded with send_ispdata().  The fragment is
 * selected by Sflag_cutoff, Snpipe (128/256/384) and Sflag_nnb.
 * NOTE(review): the fragments are not visible here; they presumably fill
 * Isp[clusterid][] and advance `ii`, and presumably use the locals that look
 * unused in this function (i0, i1, j, ic, irep0, irep1, zero, tmpnpipe,
 * npipe) -- confirm before removing any of them.
 *
 * After that, three run_counter() calls are issued: the ISP init program,
 * a first IDP pass, and the ISP loop repeated once per group of 16
 * j-particles (Snj[clusterid]).
 */
void g5_runMC(int clusterid)
{
    int i0,i1,nn,i,j,nidp,n,ibegin,ii,ic,irep0,irep1;
    int zero,tmpnpipe,npipe; 

    //   printf("run\n");
    
    if(Resend_run[clusterid]==1){

        /* ISP init */

        // ii counts the words placed in Isp[clusterid][]; it is read back
        // as the program length right after the included fragments.
        ii=0;

	if(Sflag_cutoff[clusterid]==0){
          if(Snpipe[clusterid]==128){
#include "./vsm/ispinitloop.gravity.dat"
          }
          if(Snpipe[clusterid]==256){
	    if(Sflag_nnb[clusterid]==0){
#include "./vsm/ispinitloop.gravity2.dat"	
	    }else{
#include "./vsm/ispinitloop.gravity2_nnb.dat"	
	    }  
          }
          if(Snpipe[clusterid]==384){
#include "./vsm/ispinitloop.gravity3.dat"	
          }
	}
	// Cutoff mode: only 128- and 256-pipe variants exist here; with any
	// other Snpipe no fragment is included and ii stays 0.
	if(Sflag_cutoff[clusterid]==1){
          if(Snpipe[clusterid]==128){
#include "./vsm/ispinitloop.gravity_cutoff.dat"
          }
          if(Snpipe[clusterid]==256){
	    if(Sflag_nnb[clusterid]==0){
#include "./vsm/ispinitloop.gravity_cutoff2.dat"
	    }else{
#include "./vsm/ispinitloop.gravity2_nnb_cutoff.dat"
	    }
          }
	}

	Nispinit[clusterid] = ii; 

        // Upload the init program; it is later run from the same address (800).
        n = Nispinit[clusterid]; ibegin = 800;
        send_ispdata(clusterid,n,ibegin,Isp[clusterid]);

        /* ISP loop */

        ii=0;      

//#include "./g5isploop.dat"
	// Each variant also sets Nplus: the number of zero words appended
	// after the loop program before it is uploaded.
	if(Sflag_cutoff[clusterid]==0){
          if(Snpipe[clusterid]==128){
#include "./vsm/isploop.gravity.dat"
            Nplus[clusterid] = 40 + 54;
          }
          if(Snpipe[clusterid]==256){
	    if(Sflag_nnb[clusterid]==0){
#include "./vsm/isploop.gravity2.dat"
              Nplus[clusterid] = 48;
	    }else{
#include "./vsm/isploop.gravity2_nnb.dat"
              Nplus[clusterid] = 48;
	    }
          }
          if(Snpipe[clusterid]==384){
#include "./vsm/isploop.gravity3.dat"	
            Nplus[clusterid] = 6 ;
          }
        }
	if(Sflag_cutoff[clusterid]==1){
          if(Snpipe[clusterid]==128){
#include "./vsm/isploop.gravity_cutoff.dat"
            Nplus[clusterid] = 44;
          }
          if(Snpipe[clusterid]==256){
	    if(Sflag_nnb[clusterid]==0){
#include "./vsm/isploop.gravity_cutoff2.dat"
              Nplus[clusterid] = 0;
	    }else{
#include "./vsm/isploop.gravity2_nnb_cutoff.dat"
              Nplus[clusterid] = 0;
	    }
          }
	}
	// Model 460 needs 48 fewer padding words; never let the count go negative.
	if(Model[clusterid]==460) Nplus[clusterid] -= (42+6);
	if(Nplus[clusterid]<0) Nplus[clusterid] = 0;

        Nisp[clusterid] = ii;
        // Zero-fill the Nplus padding words that follow the loop program.
        for(i=0;i<Nplus[clusterid];i++) Isp[clusterid][Nisp[clusterid]+i] = 0;
        n = Nisp[clusterid] + Nplus[clusterid]; ibegin=100;  // 100 needs for eps2 
        send_ispdata(clusterid,n,ibegin,Isp[clusterid]);

	//         fprintf(stderr,"nisp %d/%d nplus %d/%d %g %g\n",Nisp[clusterid],Nisp[clusterid]/2,Nplus[clusterid],
	//         Nplus[clusterid]/2,19.0/(Nisp[clusterid]/2.0),19.0/((Nisp[clusterid]+Nplus[clusterid])/2.0));
         // Programs are now resident; skip the rebuild until the flag is set again.
         Resend_run[clusterid]=0;

    }

    n = Nispinit[clusterid]; ibegin = 800;    
    run_counter(clusterid,n,ibegin,1,0,0,0,0,0);             // run ISP init
 
    // IDP record length per group of j-particles: 17 on model 460, 21 otherwise.
    if(Model[clusterid]==460){
      nidp = 17;    
    }else{
      nidp = 21;
    }
    run_counter(clusterid,0,0,0,20*nidp,nidp,0,0,1);         // first IDP

    // nn = ceil(Snj/16): number of loop repetitions; 0 when there are no j-particles.
    nn = (Snj[clusterid]-1)/16 + 1;
    if(Snj[clusterid]==0) nn = 0;
    n = Nisp[clusterid] + Nplus[clusterid];	ibegin=100;
    run_counter(clusterid,n,ibegin,nn,n,nidp,0,nidp,nn);     // run ISP loop

}



void g5_get_forceMC(int clusterid, int ni, double (*a)[3], double *pot)
{
  int i0,i1,i,ii,n,ibegin,nidp,nn,memadr,iii,niii,k;
  UINT64 adr_ax[3],adr_ay[3],adr_az[3],adr_p[3],adr_nnb[3];

     if(Snpipe[clusterid]==128){
        adr_ax[0] = 0x20; adr_ay[0] = 0x28; adr_az[0] = 0x30; adr_p[0] = 0x38;
	niii = 1;
     }
     if(Snpipe[clusterid]==256){
	adr_ax[0] = 0x3c; adr_ay[0] = 0x44; adr_az[0] = 0x4c; adr_p[0] = 0x54;
        adr_ax[1] = 0x5c; adr_ay[1] = 0x64; adr_az[1] = 0x6c; adr_p[1] = 0x74;	
	adr_nnb[0] = 0x7c; adr_nnb[1] = 0x84;
	niii = 2;
     }
     if(Snpipe[clusterid]==384){
	adr_ax[0] = 0x58; adr_ay[0] = 0x60; adr_az[0] = 0x68; adr_p[0] = 0x70;
        adr_ax[1] = 0x78; adr_ay[1] = 0x80; adr_az[1] = 0x88; adr_p[1] = 0x90;
        adr_ax[2] = 0x98; adr_ay[2] = 0xa0; adr_az[2] = 0xa8; adr_p[2] = 0xb0;		
	niii = 3;
     }
 
    if(Resend_fo[clusterid]==1){
        usleep(10000);
       
        Isp[clusterid][0]  = 0x8000000000000000; Isp[clusterid][1]  = 0x0000000000000000;
        Isp[clusterid][2]  = 0x80028180000100a6; Isp[clusterid][3]  = 0x6865ea8000000000;
        Isp[clusterid][4]  = 0x8000000000000000; Isp[clusterid][5]  = 0x0000000000000000;
        Isp[clusterid][6]  = 0x8000000001800006; Isp[clusterid][7]  = 0x6864003c00100000;
        Isp[clusterid][8]  = 0x8000000001800006; Isp[clusterid][9]  = 0x6864003c20300000;
        Isp[clusterid][10] = 0x8000000001800006; Isp[clusterid][11] = 0x6864003c40500000;
        Isp[clusterid][12] = 0x8000000001800006; Isp[clusterid][13] = 0x6864003c60700000;
        Isp[clusterid][14] = 0x8000000001800006; Isp[clusterid][15] = 0x6864003c80900000;
        Isp[clusterid][16] = 0x8000000001800006; Isp[clusterid][17] = 0x6864003ca0b00000;
        Isp[clusterid][18] = 0x8000000001800006; Isp[clusterid][19] = 0x6864003cc0d00000;
        Isp[clusterid][20] = 0x8000000001800006; Isp[clusterid][21] = 0x6864003ce0f00000;
        Isp[clusterid][22] = 0x8000000001800006; Isp[clusterid][23] = 0x6864003d01100000;
        Isp[clusterid][24] = 0x8000000001800006; Isp[clusterid][25] = 0x6864003d21300000;
        Isp[clusterid][26] = 0x8000000001800006; Isp[clusterid][27] = 0x6864003d41500000;
        Isp[clusterid][28] = 0x8000000001800006; Isp[clusterid][29] = 0x6864003d61700000;
        Isp[clusterid][30] = 0x8000000001800006; Isp[clusterid][31] = 0x6864003d81900000;
        Isp[clusterid][32] = 0x8000000001800006; Isp[clusterid][33] = 0x6864003da1b00000;
        Isp[clusterid][34] = 0x8000000001800006; Isp[clusterid][35] = 0x6864003dc1d00000;
        Isp[clusterid][36] = 0x8000000001800006; Isp[clusterid][37] = 0x6864003de1f00000;
        Isp[clusterid][38] = 0x8000000001800006; Isp[clusterid][39] = 0x6864003e02100000;
        Isp[clusterid][40] = 0x8000000001800006; Isp[clusterid][41] = 0x6864003e22300000;
        Isp[clusterid][42] = 0x8000000001800006; Isp[clusterid][43] = 0x6864003e42500000;
        Isp[clusterid][44] = 0x8000000001800006; Isp[clusterid][45] = 0x6864003e62700000;
        Isp[clusterid][46] = 0x8000000001800006; Isp[clusterid][47] = 0x6864003e82900000;
        Isp[clusterid][48] = 0x8000000001800006; Isp[clusterid][49] = 0x6864003ea2b00000;
        Isp[clusterid][50] = 0x8000000001800006; Isp[clusterid][51] = 0x6864003ec2d00000;
        Isp[clusterid][52] = 0x8000000001800006; Isp[clusterid][53] = 0x6864003ee2f00000;
        Isp[clusterid][54] = 0x8000000001800006; Isp[clusterid][55] = 0x6864003f03100000;
        Isp[clusterid][56] = 0x8000000001800006; Isp[clusterid][57] = 0x6864003f23300000;
        Isp[clusterid][58] = 0x8000000001800006; Isp[clusterid][59] = 0x6864003f43500000;
        Isp[clusterid][60] = 0x8000000001800006; Isp[clusterid][61] = 0x6864003f63700000;
        Isp[clusterid][62] = 0x8000000001800006; Isp[clusterid][63] = 0x6864003f83900000;
        Isp[clusterid][64] = 0x8000000001800006; Isp[clusterid][65] = 0x6864003fa3b00000;
        Isp[clusterid][66] = 0x8000000001800006; Isp[clusterid][67] = 0x6864003fc3d00000;
        Isp[clusterid][68] = 0x8000000001800006; Isp[clusterid][69] = 0x6864003fe3f00000;
        Isp[clusterid][70] = 0x8000000000000000; Isp[clusterid][71] = 0x0000000000000000;

        Idp00[clusterid][0] = 0x0000000000000000; Idp01[clusterid][0] = 0x0; Idp10[clusterid][0] = 0x0000000000000000; Idp11[clusterid][0] = 0x0;
        Idp00[clusterid][1] = 0x0180000000000000; Idp01[clusterid][1] = 0x1; Idp10[clusterid][1] = 0x3804003e40c00000; Idp11[clusterid][1] = 0x1;       

        nidp = 1; memadr = 0x700000;
        em_write(clusterid, nidp, memadr);

	Idp00[clusterid][1] = 0x0180000000000000; Idp01[clusterid][1] = 0x1; Idp10[clusterid][1] = 0x3804003653c00000; Idp11[clusterid][1] = 0x1;

	nidp = 1; memadr = 0x700001;
	em_write(clusterid, nidp, memadr);

	for(iii=0;iii<niii;iii++){
  	  Isp[clusterid][2] = 0x80028180000000a6 | (adr_ax[iii] << 11);		  
          n = 72; ibegin = 2200 + 500*iii;
          send_ispdata(clusterid,n,ibegin,Isp[clusterid]);

          Isp[clusterid][2] = 0x80028180000000a6 | (adr_ay[iii] << 11);		  
          n = 72; ibegin = 2300 + 500*iii;
          send_ispdata(clusterid,n,ibegin,Isp[clusterid]);

          Isp[clusterid][2] = 0x80028180000000a6 | (adr_az[iii] << 11);
          n = 72; ibegin = 2400 + 500*iii;
          send_ispdata(clusterid,n,ibegin,Isp[clusterid]);

          Isp[clusterid][2] = 0x80028180000000a6 | (adr_p[iii] << 11);
          n = 72; ibegin = 2500 + 500*iii;
          send_ispdata(clusterid,n,ibegin,Isp[clusterid]);

          Isp[clusterid][2] = 0x80028180000000a6 | (adr_nnb[iii] << 11);
          n = 72; ibegin = 2600 + 500*iii;
          send_ispdata(clusterid,n,ibegin,Isp[clusterid]);
	}

        Resend_fo[clusterid] = 0;
        usleep(10000);
    }

    /***********************/ 
    if(Model[clusterid]==460){
      int iioffset[4];
      iioffset[0] = iioffset[1] = iioffset[2] = iioffset[3] =0 ;
      for(iii=0;iii<niii;iii++){

        if(((iii+1)*128)>ni){
#ifdef DEFECT
  	  nn = ni-iii*128+2;
#else
  	  nn = ni-iii*128;
#endif	  

        }else{
  	  nn = 128;
        }

	for(k=0;k<3;k++){
          n = 72; ibegin = 2200 + 100*k + 500*iii ; memadr = 0x700000;
          run_counter(clusterid,n,ibegin,1,n,1,0,memadr,1);
          hib_recvMC(clusterid, 128, Wbuf[clusterid]);
          for(ii=0;ii<nn;ii++){
	    //            printf("ii %d iii %d iioffset %d %d\n",ii,iii,iioffset[k],iioffset[k] + ii+128*iii);
            a[iioffset[k] + ii+128*iii][k] = *((double *)(Wbuf[clusterid]+ii));
            a[iioffset[k] + ii+128*iii][k] *= DEFAULT_FORCE_CORRECTION;
	    if(Version[clusterid]==3) a[iioffset[k] + ii+128*iii][k] *= Xscale2[clusterid];
#ifdef DEFECT
	    if(ii==DEFECTPE){
	      if(iii == 0) iioffset[k] = -1;
	      if(iii == 1) iioffset[k] = -2;	      
	    }
#endif
          }
	}

        n = 72; ibegin = 2500 + 500*iii ; memadr = 0x700000;   
        run_counter(clusterid,n,ibegin,1,n,1,0,memadr,1);
        hib_recvMC(clusterid, 128, Wbuf[clusterid]);
        for(ii=0;ii<nn;ii++){
          pot[iioffset[3] + ii+128*iii]   = -(*((double *)(Wbuf[clusterid]+ii)));
          if(Version[clusterid]==2){  // version 2
            pot[ii+128*iii]   *= DEFAULT_FORCE_CORRECTION;
	  }else{
            pot[iioffset[3] + ii+128*iii]   *= DEFAULT_POT_CORRECTION;
	  }
	  if(Version[clusterid]==3) pot[iioffset[3]+ ii+128*iii] *= Xscale[clusterid];
#ifdef DEFECT
	  if(ii==DEFECTPE){
	    if(iii == 0) iioffset[3] = -1;
	    if(iii == 1) iioffset[3] = -2;	      
	  }
#endif
        }
      }  
      //      printf("force %g %g %g pot %g\n",a[0][0],a[0][1],a[0][2],pot[0]);

    }else{  // if model

      for(iii=0;iii<niii;iii++){

        n = 72; ibegin = 2200 + 400*iii ; memadr = 0x700000;
        run_counter(clusterid,n,ibegin,1,n,1,0,memadr,1);
        hib_recvMC(clusterid, 256, Wbuf[clusterid]);
       //for(ii=0;ii<128;ii++) printf("%d %016lx %016lx\n",ii+128*iii,Wbuf[clusterid][2*ii+1],Wbuf[clusterid][2*ii]);
        print_wbuf(clusterid);
        for(ii=0;ii<64;ii++){
          a[2*(ii+64*iii)][0]   = *((double *)(Wbuf[clusterid]+4*ii+1));
          a[2*(ii+64*iii)+1][0] = *((double *)(Wbuf[clusterid]+4*ii+3));         
      //      printf("a[%d][0] %e %e\n",2*ii,a[2*ii][0],a[2*ii+1][0]);
          a[2*(ii+64*iii)][0]   *= DEFAULT_FORCE_CORRECTION;
          a[2*(ii+64*iii)+1][0] *= DEFAULT_FORCE_CORRECTION;
        }
     //    printf("%016lx %016lx\n",Wbuf[clusterid][1],Wbuf[clusterid][0]);

    /***********************/ 

        n = 72; ibegin = 2300 + 400*iii; memadr = 0x700000;
        run_counter(clusterid,n,ibegin,1,n,1,0,memadr,1);
        hib_recvMC(clusterid, 256, Wbuf[clusterid]);
        print_wbuf(clusterid);
        for(ii=0;ii<64;ii++){
          a[2*(ii+64*iii)][1]   = *((double *)(Wbuf[clusterid]+4*ii+1));
          a[2*(ii+64*iii)+1][1] = *((double *)(Wbuf[clusterid]+4*ii+3));
         //     printf("a[%d][1] %e %e\n",2*ii,a[2*ii][1],a[2*ii+1][1]);
          a[2*(ii+64*iii)][1]   *= DEFAULT_FORCE_CORRECTION;
          a[2*(ii+64*iii)+1][1] *= DEFAULT_FORCE_CORRECTION;
        }

    /***********************/ 

        n = 72; ibegin = 2400 + 400*iii ; memadr = 0x700000;   
        run_counter(clusterid,n,ibegin,1,n,1,0,memadr,1);
        hib_recvMC(clusterid, 256, Wbuf[clusterid]);
        print_wbuf(clusterid);
        for(ii=0;ii<64;ii++){
          a[2*(ii+64*iii)][2]   = *((double *)(Wbuf[clusterid]+4*ii+1));
          a[2*(ii+64*iii)+1][2] = *((double *)(Wbuf[clusterid]+4*ii+3));
        //       printf("a[2] %e %e\n",a[2*ii][2],a[2*ii+1][2]);
          a[2*(ii+64*iii)][2]   *= DEFAULT_FORCE_CORRECTION;
          a[2*(ii+64*iii)+1][2] *= DEFAULT_FORCE_CORRECTION;
        }

    /***********************/

        n = 72; ibegin = 2500 + 400*iii ; memadr = 0x700000;   
        run_counter(clusterid,n,ibegin,1,n,1,0,memadr,1);
        hib_recvMC(clusterid, 256, Wbuf[clusterid]);
        print_wbuf(clusterid);
        for(ii=0;ii<64;ii++){
          pot[2*(ii+64*iii)]   = -(*((double *)(Wbuf[clusterid]+4*ii+1)));
          pot[2*(ii+64*iii)+1] = -(*((double *)(Wbuf[clusterid]+4*ii+3)));         
        //       printf("pot %e %e\n",pot[2*ii],pot[2*ii+1]);
          pot[2*(ii+64*iii)]   *= DEFAULT_FORCE_CORRECTION;
          pot[2*(ii+64*iii)+1] *= DEFAULT_FORCE_CORRECTION;
        }
 
      }  

    }

}

/*
 * g_p3m: piecewise-polynomial cutoff factor as a function of the scaled
 * separation `re`.  Returns 1 - re^2 * f(re), where f is one polynomial on
 * [0,1) and another on [1,2); returns 0 for every other input (re < 0,
 * re >= 2, NaN).
 */
static double
g_p3m(double re)
{
  const double norm = 35.*4.0;
  double shape;

  /* inner region: 0 <= re < 1 */
  if((re>=0)&&(re<1)){
    shape = re*(224.+re*re*(-224.+re*(70.+re*(48.-re*21.))))/norm;
    return 1.0-re*re*shape;
  }

  /* outer region: 1 <= re < 2 */
  if((re>=1)&&(re<2)){
    shape = (12./(re*re)-224.+re*(896.+re*(-840.+re*(224.+re*(70.+re*(-48.+re*7.))))))/norm;
    return 1.0-re*re*shape;
  }

  /* outside the support of the cutoff function */
  return 0;
}

/*
 * Staging buffer for one row of the cutoff table: 16 doubles that
 * g5_set_cutoff_tableMC() fills and then passes to SING_LM_write().
 */
struct SING_grape_cutoff{
  double _const_[16];
};
/* Single file-scope instance (not static, so it has external linkage). */
struct SING_grape_cutoff cutoffdata;

/*
 * g5_set_cutoff_tableMC: load the force cutoff table into cluster
 * `clusterid` and switch that cluster to cutoff mode.
 *
 * The table has 256 entries, written as 16 rows of 16 doubles to addresses
 * 256 + 16*row; entry k holds g_p3m((k + 0.5)/128), i.e. samples of the
 * built-in cutoff function on [0,2).
 *
 * ffunc, fcut, fcor, pfunc, pcut and pcor are accepted for interface
 * compatibility but are not used: only the built-in g_p3m() table is
 * implemented (a one-time warning says so).
 *
 * Side effects: Snpipe[clusterid] is clamped to at most 256,
 * Sflag_cutoff[clusterid] is set to 1, and the matching VSM constants are
 * sent for the 128- or 256-pipe configuration.
 */
void
g5_set_cutoff_tableMC(int clusterid,
                      double (*ffunc)(double), double fcut, double fcor,
                      double (*pfunc)(double), double pcut, double pcor)
{
    static int firstcall = 1;
    int ndata = 16;
    int address,ii,i;
    int conversions[]={1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
    double re;

    SING_LM_write_regconv(clusterid,ndata,conversions);

    for(ii=0;ii<16;ii++){
      for(i=0;i<16;i++){
        /* sample at the centre of bin (i + 16*ii), bin width 1/128 */
        re = (i+16*ii)*1.0/128.0;
        cutoffdata._const_[i] = g_p3m(re+0.5/128.0);
      }
      address = 256 + ii*16;
      SING_LM_write(clusterid,&cutoffdata, address, 0);
    }

    if(firstcall == 1){
      /* The format string has no conversion, so no extra argument is passed. */
      WARN(2, "Warning: cut-off function is implemented in this revision only for PPPM cutoff force (not for potential), where fcut is fixed to 2.\n");
      firstcall = 0;
    }

    /* The cutoff selection in g5_runMC has no 384-pipe variant. */
    if(Snpipe[clusterid] > 256) Snpipe[clusterid] = 256;
    Sflag_cutoff[clusterid] = 1;

    if(Snpipe[clusterid]==128) gravity_cutoff_send_vsm_constant(clusterid);
    if(Snpipe[clusterid]==256){
      if(Sflag_nnb[clusterid] == 0){
        gravity_cutoff2_send_vsm_constant(clusterid);
      }else{
        gravity2_nnb_cutoff_send_vsm_constant(clusterid);
      }
    }
}

/*
 * g5_calculate_force_on_xMC: compute forces and potentials on `ni`
 * i-particles by feeding them to cluster `clusterid` in chunks of at most
 * g5_get_number_of_pipelines() particles.  For each chunk the positions are
 * loaded, the calculation is run, and the results are stored in the matching
 * slices of `a` and `p`.
 */
void g5_calculate_force_on_xMC(int clusterid, double (*x)[3], double (*a)[3], double *p, int ni)
{
  int first = 0;
  int count;
  int npipes = g5_get_number_of_pipelines();

  while (first < ni) {
    /* the last chunk may be shorter than a full pipeline set */
    count = (first + npipes > ni) ? ni - first : npipes;

    g5_set_xiMC(clusterid, count, &x[first]);
    g5_runMC(clusterid);
    g5_get_forceMC(clusterid, count, &a[first], p + first);

    first += npipes;
  }
}


/************************** nnb function ***************************/


void g5n_set_jpMC(int clusterid, int adr, int nj, double *m, double (*x)[3], int *index)
{
    double xtmp[3];    
    int i0,i1,i,nn,ii,nidp,nii,iii;
    UINT64 id,adrem,tmp1=0,tmp2,tmp3;

    Sflag_nnb[clusterid] = 1;    // set nnb flag

    if(Sflag_cutoff[clusterid]==0){
      gravity2_nnb_send_vsm_constant(clusterid);
    }else{
      gravity2_nnb_cutoff_send_vsm_constant(clusterid);
    }
    
    if(Model[clusterid]==460){

      Idp00[clusterid][0] = 0;  Idp01[clusterid][0] = 0; Idp10[clusterid][0] = 0; Idp11[clusterid][0] = 0;
      Idp00[clusterid][1] = 0;  Idp01[clusterid][1] = 0; Idp11[clusterid][1] = 0x1;
      Idp10[clusterid][1] = 0x0400003f00000000 ; 
      for(i=0;i<16;i++){
        Idp01[clusterid][2+2*i] = 0x1;
        Idp11[clusterid][2+2*i] = 0x1;
        Idp01[clusterid][3+2*i] = 0x1;
        Idp11[clusterid][3+2*i] = 0x15;
//      Idp11[clusterid][3+2*i] = 0x13;	  
      }
      nidp = 17;	

      for(ii=0;ii<nj;ii+=16){
        if((ii+16)>nj){
          nii = nj - ii;
        }else{
          nii = 16;
        }
        adrem = (ii/16) * nidp;

        for(i=0;i<nii;i++){
          xtmp[0] = (x[i+ii][0]-Xoffset[clusterid])*Xscale[clusterid] + 1.0;
          xtmp[1] = (x[i+ii][1]-Xoffset[clusterid])*Xscale[clusterid] + 1.0;
          xtmp[2] = (x[i+ii][2]-Xoffset[clusterid])*Xscale[clusterid] + 1.0;	    	    
          Idp00[clusterid][2+2*i] = *((UINT64 *)(&xtmp[0]));
          Idp00[clusterid][2+2*i] &= 0xfffffffffff00000;
          Idp10[clusterid][2+2*i] = *((UINT64 *)(&xtmp[1]));
          Idp10[clusterid][2+2*i] &= 0xfffffffffff00000;
          Idp00[clusterid][3+2*i] = *((UINT64 *)(&xtmp[2]));
          Idp00[clusterid][3+2*i] &= 0xfffffffffff00000;
	  tmp2 = *((UINT64 *)(&m[i+ii]));
	  tmp3 = *((UINT64 *)(&index[i+ii]));	    
          Idp10[clusterid][3+2*i] = (0xc000000000000000&tmp2) | (0x3fffffff00000000&(tmp2<<3)) | (0x00000000ffffffff & tmp3) ;
        }

  	for(i=nii;i<16;i++){
          xtmp[0] = 1.0;
          Idp00[clusterid][2+2*i] = *((UINT64 *)(&xtmp[0]));  // for nnb
          Idp00[clusterid][2+2*i] &= 0xfffffffffff00000;
	  Idp10[clusterid][3+2*i] = 0;       // set mj=0 if (nj%16)!=0
	}

	em_write460(clusterid, nidp, adrem); 

      }

      for(i=0;i<(2*nidp);i++) Idp00[clusterid][i]=Idp01[clusterid][i]=Idp10[clusterid][i]=Idp11[clusterid][i]= 0;
      adrem += nidp;
      em_write460(clusterid, nidp, adrem);

    }
}

void g5_set_indexMC(int clusterid, int ni, int *index)
{
  int i,ii,i0,i1,iii,n,ibegin,adrem,nidp,niii,k,kk,nn;
  UINT64 id,ipe,tmp0;
  UINT64 adr_xi[3],adr_yi[3],adr_zi[3],adr_idi[3]; 

  if(Snpipe[clusterid]==256){
    adr_xi[0] = 0x0;  adr_yi[0] = 0x8;  adr_zi[0] = 0x10;  adr_idi[0] = 0x18;
    adr_xi[1] = 0x1c;  adr_yi[1] = 0x24;  adr_zi[1] = 0x2c;  adr_idi[1] = 0x34;    
    niii = 2;
  }

  if(Model[clusterid]==460){
      
    if(Resend_index[clusterid] == 1){
      for(iii=0;iii<niii;iii++){
        Isp[clusterid][0] = 0x8024000000000000; Isp[clusterid][1] = 0x0001470000000000;
        Isp[clusterid][2] = 0x8024000000000000; Isp[clusterid][3] = 0x0001470000000000;
        i=4;
        for(ii=0;ii<32;ii++){
          ipe = ii;
          Isp[clusterid][i]    = 0x8080000000000000; Isp[clusterid][i+1]  = 0x0002470000000000;
          Isp[clusterid][i+2]  = 0x8024000000000000; Isp[clusterid][i+3]  = 0x0001420000000000;
          Isp[clusterid][i+4]  = 0x44000000006000a0 | (adr_idi[iii]<<11); 
          Isp[clusterid][i+5]  = (0x00000000001 + (4*ipe << 6)) <<20; 
          i+=6; 
        }
        n = 196; ibegin = 1800 + 196 * iii;     // doesn't work when niii = 3
        send_ispdata(clusterid,n,ibegin,Isp[clusterid]);           // ISP: BM -> LM  
      }
      Resend_index[clusterid] = 0;
    }

    for(iii=0;iii<niii;iii++){
      Idp00[clusterid][0] = 0; Idp01[clusterid][0] = 0; Idp10[clusterid][0] = 0 ;                 Idp11[clusterid][0] = 0;
      Idp00[clusterid][1] = 0; Idp01[clusterid][1] = 0; Idp10[clusterid][1] = 0x2000000000000000 ; 
      Idp11[clusterid][1] = 0x1;  
      i=2;
      for(ii=(128*iii);ii<(128*iii+128);ii+=4){
        Idp00[clusterid][i] = (0x00000000ffffffff&(*((UINT64 *)(&index[ii]))))<<32 | (0x00000000ffffffff&(*((UINT64 *)(&index[ii+1]))));
	Idp01[clusterid][i] = 0x25;
        Idp10[clusterid][i] = (0x00000000ffffffff&(*((UINT64 *)(&index[ii+2]))))<<32 | (0x00000000ffffffff&(*((UINT64 *)(&index[ii+3]))));
        Idp11[clusterid][i] = 0x25;
        i++;
        if(i==18){
          Idp00[clusterid][18] = 0; Idp01[clusterid][18] = 0; Idp10[clusterid][18] = 0 ;                 Idp11[clusterid][18] = 0;
          Idp00[clusterid][19] = 0; Idp01[clusterid][19] = 0; Idp10[clusterid][19] = 0x2002000000000000 ;Idp11[clusterid][19] = 0x1;  
          i+=2;
	}
      }
      adrem = 0x7ff000;     nidp = 18;
      em_write460(clusterid, nidp, adrem);                     // host-> EM 
      run_counter(clusterid,0,0,0,10*nidp,nidp,0,adrem,1);     // EM -> IDP -> BM

      n = 196; ibegin = 1800 + 196 * iii;
      run_counter(clusterid,n,ibegin,1,0,0,0,0,0);       // BM -> LM  
    }

  }

}

/*
 * g5n_get_forceMC: read back forces and potentials via g5_get_forceMC(),
 * then read the nearest-neighbour results for `ni` i-particles.
 *
 *   rnnb2  output: neighbour squared distance, divided by Xscale2[clusterid]
 *   innb   output: neighbour index, assembled from the two received words
 *
 * Two blocks of 128 pipelines are always read.  The ODP and HIB are reset
 * before the neighbour readout and the ODP is put back into 64-bit mode
 * afterwards.
 */
void g5n_get_forceMC(int clusterid, int ni, double (*a)[3], double *pot,
                     double * rnnb2, int * innb)
{
  const int nblocks = 2;
  int blk, lane, count, prog_len, prog_adr, em_adr, rep;
  UINT64 r2bits;

    g5_get_forceMC(clusterid,ni,a,pot);

    reset_odp(clusterid);
    for(rep=0;rep<3;rep++){
      reset_hib(clusterid);
    }

    for(blk=0;blk<nblocks;blk++){

      /* run the nnb readout program of this block and fetch 256 words */
      prog_len = 72;
      prog_adr = 2600 + 500*blk;
      em_adr = 0x700001;
      run_counter(clusterid,prog_len,prog_adr,1,prog_len,1,0,em_adr,1);
      hib_recvMC(clusterid, 256, Wbuf[clusterid]);

      /* results wanted from this block (may be <= 0: nothing is copied) */
      count = (((blk+1)*128)>ni) ? ni-blk*128 : 128;

      for(lane=0;lane<count;lane++){
        /* index: low 24 bits of the odd word above the top 8 bits of the even word */
        innb[lane+128*blk] = (int)((0xffffff & Wbuf[clusterid][2*lane+1])<<8 | (0xff & (Wbuf[clusterid][2*lane+0]>>56)));
        /* distance: upper 36 bits of the odd word reinterpreted as a double */
        r2bits = (0xfffffffff0000000LL & Wbuf[clusterid][2*lane+1]);
        rnnb2[lane+128*blk] = (*((double *)&r2bits))/Xscale2[clusterid];
      }

    }

    set_odp_64bitmode(clusterid);
}

/*
 * g5n_calculate_force_on_xMC: neighbour-search variant of the chunked force
 * calculation.  The `ni` i-particles are processed in chunks of at most
 * g5_get_number_of_pipelines(); for each chunk the positions and indices are
 * loaded, the calculation is run, and forces, potentials, neighbour squared
 * distances and neighbour indices are stored in the matching slices of the
 * output arrays.
 */
void g5n_calculate_force_on_xMC(int clusterid, double (*x)[3],
				  int * index,
				  double (*a)[3], double *p,
				  double * rnnb2,
				  int * innb, int ni)
{
  int first = 0;
  int count;
  int npipes = g5_get_number_of_pipelines();

  while (first < ni) {
    /* the last chunk may be shorter than a full pipeline set */
    count = (first + npipes > ni) ? ni - first : npipes;

    g5_set_xiMC(clusterid, count, &x[first]);
    g5_set_indexMC(clusterid, count, index + first);
    g5_runMC(clusterid);
    g5n_get_forceMC(clusterid, count, &a[first], p + first, rnnb2 + first, innb + first);

    first += npipes;
  }
}
