/* code for nbody : sticky8.c 
   (4th-order predictor-corrector scheme
   and Individual time step)
*/
#include <stdlib.h>
#include <math.h>
#include "g6util.h"
#include "./sticky.h"

/*
 * Direct-summation force on one particle.
 *
 * Sums, over every particle j != i, the softened inverse-square
 * contribution of j evaluated at position xi / velocity vi:
 *
 *   ai    : acceleration
 *   adoti : time derivative of the acceleration
 *   *poti : potential (accumulated as -m[j]/r, so it is non-positive
 *           for positive masses)
 *
 * i       index of the particle being evaluated (skipped in the sum)
 * xi, vi  position and velocity at which the sums are evaluated
 * x, v, m positions, velocities and masses of all n particles
 * eps     softening length; eps*eps is added to every squared distance
 * n       number of particles
 *
 * All three outputs are reset to zero before accumulation.
 */
void force_on_ith_particle(int i,
                           REAL xi[DIM],
                           REAL vi[DIM],
                           REAL x[NMAX][DIM],
                           REAL v[NMAX][DIM],
                           REAL m[NMAX],
                           REAL eps,
                           REAL ai[DIM],
                           REAL adoti[DIM],
                           REAL *poti,
                           int n)
{
    REAL sep[DIM];      /* x[src] - xi */
    REAL relv[DIM];     /* v[src] - vi */
    REAL soft2;
    int src, c;

    *poti = 0.0;
    for (c = 0; c < DIM; c++) {
        ai[c] = 0.0;
        adoti[c] = 0.0;
    }
    soft2 = eps*eps;

    for (src = 0; src < n; src++) {
        REAL dist2, sep_dot_relv;
        REAL inv_r, inv_r2, inv_r3, inv_r5, three_rv_inv_r5;

        /* a particle exerts no force on itself */
        if (src == i) {
            continue;
        }

        /* softened squared distance and (dx . dv) */
        dist2 = soft2;
        sep_dot_relv = 0.0;
        for (c = 0; c < DIM; c++) {
            sep[c] = x[src][c] - xi[c];
            relv[c] = v[src][c] - vi[c];
            dist2 += sep[c] * sep[c];
            sep_dot_relv += sep[c]*relv[c];
        }

        inv_r2 = 1.0/dist2;
        inv_r = sqrt(inv_r2);
        inv_r3 = inv_r2*inv_r;
        inv_r5 = inv_r2*inv_r2*inv_r;
        three_rv_inv_r5 = 3.0*sep_dot_relv*inv_r5;

        for (c = 0; c < DIM; c++) {
            REAL acc_term = inv_r3 * sep[c];
            REAL jerk_term = inv_r3 * relv[c] - three_rv_inv_r5 * sep[c];

            ai[c] += m[src] * acc_term;
            adoti[c] += m[src] * jerk_term;
        }
        *poti += -m[src]*inv_r;
    }
}

/*
 * Host-side direct summation for all n particles.
 *
 * Visits every unordered pair (p, q) once and applies the pair term to
 * both members with opposite sign, filling
 *
 *   a[p]    : acceleration
 *   adot[p] : time derivative of the acceleration
 *   pot[p]  : potential (summed as +m/r, then negated at the end)
 *
 * eps is the softening length; eps*eps is added to each squared
 * distance.  All outputs for indices 0..n-1 are zeroed first.
 */
void force_host(REAL x[NMAX][DIM],
                REAL v[NMAX][DIM],
                REAL m[NMAX],
                REAL eps,
                REAL a[NMAX][DIM],
                REAL adot[NMAX][DIM],
                REAL pot[NMAX],
                int n)
{
    REAL sep[DIM];      /* x[q] - x[p] */
    REAL relv[DIM];     /* v[q] - v[p] */
    REAL soft2;
    int p, q, c;

    for (p = 0; p < n; p++) {
        for (c = 0; c < DIM; c++) {
            a[p][c] = 0.0;
            adot[p][c] = 0.0;
        }
        pot[p] = 0.0;
    }
    soft2 = eps*eps;

    for (p = 0; p < n-1; p++) {
        for (q = p+1; q < n; q++) {
            REAL dist2 = soft2;
            REAL sep_dot_relv = 0.0;
            REAL inv_r, inv_r2, inv_r3, inv_r5, three_rv_inv_r5;

            for (c = 0; c < DIM; c++) {
                sep[c] = x[q][c] - x[p][c];
                relv[c] = v[q][c] - v[p][c];
                dist2 += sep[c] * sep[c];
                sep_dot_relv += sep[c]*relv[c];
            }

            inv_r2 = 1.0/dist2;
            inv_r = sqrt(inv_r2);
            inv_r3 = inv_r2*inv_r;
            inv_r5 = inv_r2*inv_r2*inv_r;
            three_rv_inv_r5 = 3.0*sep_dot_relv*inv_r5;

            /* equal and opposite pair contributions, weighted by the
               partner's mass */
            for (c = 0; c < DIM; c++) {
                REAL acc_term = inv_r3 * sep[c];
                REAL jerk_term = inv_r3 * relv[c] - three_rv_inv_r5 * sep[c];

                a[p][c] += m[q] * acc_term;
                a[q][c] += - m[p] * acc_term;
                adot[p][c] += m[q] * jerk_term;
                adot[q][c] += - m[p] * jerk_term;
            }
            pot[p] += m[q]*inv_r;
            pot[q] += m[p]*inv_r;
        }
    }

    /* the sums above are +m/r; flip the sign to obtain the potential */
    for (p = 0; p < n; p++) {
        pot[p] *= -1;
    }
}

/*
 * Host-side potential for all n particles.
 *
 * For every unordered pair (i, j) adds m/r (with eps*eps added to the
 * squared distance) to both members, then negates the sums so that
 * pot[i] = -sum_j m[j]/r_ij.
 *
 * x, m  positions and masses of the n particles
 * eps   softening length
 * pot   output; entries 0..n-1 are overwritten
 *
 * The coordinate loop runs over DIM, matching force_host() and
 * force_on_ith_particle(), instead of a hard-coded 3.
 */
void potential_host(REAL x[NMAX][DIM], REAL m[NMAX], REAL eps,
                    REAL pot[NMAX],
                    int n)
{
    int i,j,d;
    REAL r2,rinv,eps2;
    REAL dx;

    for(j=0;j<n;j++){
        pot[j] = 0.0;
    }
    eps2 = eps*eps;

    for(i=0;i<n-1;i++){
        for(j=i+1;j<n;j++){
            /* softened squared separation */
            r2 = eps2;
            for(d=0;d<DIM;d++){
                dx = x[j][d] - x[i][d];
                r2 += dx * dx;
            }
            rinv = sqrt(1.0/r2);
            pot[i] += m[j]*rinv;
            pot[j] += m[i]*rinv;
        }
    }
    /* sums were accumulated as +m/r; the potential is their negative */
    for(i=0;i<n;i++) pot[i] *= -1;
}

/*
 * Force-evaluation dispatcher selected at compile time.
 *
 * Every call logs a running call counter and n to stderr.  Then:
 *
 *   COMPARE   : runs force_grape() into a/adot/pot and force_host()
 *               into private buffers, and prints both results for each
 *               particle.  The OK/NG tag is based on the squared
 *               difference of component 0 only (threshold 1.0e-12).
 *   ONLYGRAPE : runs force_grape() only.
 *   ONLYHOST  : runs force_host() only.
 *
 * If none of these macros is defined the outputs are left untouched.
 * t is only forwarded to force_grape().
 */
void force(REAL x[NMAX][DIM],
           REAL v[NMAX][DIM],
           REAL m[NMAX],
           REAL t[NMAX],
           REAL eps,
           REAL a[NMAX][DIM],
           REAL adot[NMAX][DIM],
           REAL pot[NMAX],
           int n)
{
    static int cnt = 0;     /* number of calls made so far */

    fprintf(stderr, "force():cnt:%d    n:%d\n", cnt, n);
    cnt++;

#ifdef COMPARE
    {
        /* host reference results, kept static because of their size */
        static REAL ah[NMAX][DIM];
        static REAL adoth[NMAX][DIM];
        static REAL poth[NMAX];
        int p;

        force_grape(x,v,m,t,eps,a,adot,pot,n);
        force_host(x,v,m,eps,ah,adoth,poth,n);

        for (p = 0; p < n; p++) {
            REAL diff, err2;

            /* acceleration: tag derived from component 0 only */
            diff = a[p][0] - ah[p][0];
            err2 = diff*diff;
            fprintf(stderr,"acc   g6[%d] : %+.16e %+.16e %+.16e  %s\n",
                    p,a[p][0],a[p][1],a[p][2], err2>1.0e-12 ? "NG":"OK");
            fprintf(stderr,"acc host[%d] : %+.16e %+.16e %+.16e\n",
                    p,ah[p][0],ah[p][1],ah[p][2]);
            fprintf(stderr,"\n");

            /* derivative of acceleration: same convention */
            diff = adot[p][0] - adoth[p][0];
            err2 = diff*diff;
            fprintf(stderr,"adot   g6[%d] : %+.16e %+.16e %+.16e  %s\n",
                    p,adot[p][0],adot[p][1],adot[p][2], err2>1.0e-12 ? "NG":"OK");
            fprintf(stderr,"adot host[%d] : %+.16e %+.16e %+.16e\n",
                    p,adoth[p][0],adoth[p][1],adoth[p][2]);
            fprintf(stderr,"\n");

#if 0
            /* potential comparison, currently disabled */
            diff = pot[p] - poth[p];
            err2 = diff*diff;
            if (err2>1.0e-12) {
                fprintf(stderr,"pot   g6[%d] : %+.16e\n",p,pot[p]);
                fprintf(stderr,"pot host[%d] : %+.16e\n",p,poth[p]);
                fprintf(stderr,"\n");
            }
#endif
        }
    }
#endif

#ifdef ONLYGRAPE
    force_grape(x,v,m,t,eps,a,adot,pot,n);
#endif

#ifdef ONLYHOST
    force_host(x,v,m,eps,a,adot,pot,n);
#endif
}

/*
 * Print the energy budget at the given time.
 *
 * Recomputes pot[] on the host with potential_host(), then reports
 *   potential energy = sum 0.5 * m[i] * pot[i]
 *   kinetic energy   = 0.5 * sum m[i] * |v[i]|^2   (components 0..2)
 * together with their sum, the kin/pot ratio and the relative
 * deviation from init_ene.  Output goes to stdout.
 *
 * pot is overwritten as a side effect.
 */
void energy(REAL time,
            REAL pot[NMAX],
            REAL x[NMAX][DIM],
            REAL v[NMAX][DIM],
            REAL m[NMAX],
            int n,
            REAL eps,
            REAL init_ene)
{
    REAL total_pot = 0;
    REAL total_kin = 0;
    int p;

    /* pot[] is evaluated on the host here since g6.pg2 does not provide it */
    potential_host(x, m, eps, pot, n);

    p = 0;
    while (p < n) {
        total_pot += 0.5*m[p]*pot[p];
        p++;
    }

    p = 0;
    while (p < n) {
        total_kin += m[p]*(v[p][0]*v[p][0]+v[p][1]*v[p][1]+v[p][2]*v[p][2]);
        p++;
    }
    total_kin *= 0.5;

    printf("t = %g\n",time);
    printf("pot = %22.15e kin = %22.15e \n total= %e ratio = %e\n",
           total_pot,total_kin,total_pot+total_kin,total_kin/total_pot);
    printf("   error = %e %g\n",(init_ene-(total_pot+total_kin))/init_ene,time);
}

/*
 * Compute and print the starting energy, and return it in *init_ene.
 *
 * Recomputes pot[] on the host with potential_host(), then forms
 *   potential energy = sum 0.5 * m[i] * pot[i]
 *   kinetic energy   = 0.5 * sum m[i] * |v[i]|^2   (components 0..2)
 * prints both, their sum and the kin/pot ratio to stdout, and stores
 * the sum in *init_ene.
 *
 * pot is overwritten as a side effect.
 */
void initial_energy(REAL pot[NMAX],
                    REAL x[NMAX][DIM],
                    REAL v[NMAX][DIM],
                    REAL m[NMAX],
                    int n,
                    REAL eps,
                    REAL *init_ene)
{
    REAL total_pot = 0.0;
    REAL total_kin = 0.0;
    int p;

    /* pot[] is evaluated on the host here since g6.pg2 does not provide it */
    potential_host(x, m, eps, pot, n);

    p = 0;
    while (p < n) {
        total_pot += 0.5*m[p]*pot[p];
        p++;
    }

    p = 0;
    while (p < n) {
        total_kin += m[p]*(v[p][0]*v[p][0]+v[p][1]*v[p][1]+v[p][2]*v[p][2]);
        p++;
    }
    total_kin *= 0.5;

    printf("pot = %22.15e kin = %22.15e \n total= %22.15e ratio = %e\n",
           total_pot,total_kin,total_pot+total_kin,total_kin/total_pot);
    *init_ene = total_pot+total_kin;
}

/*
 * Predictor step: Taylor-expand one particle from its own time ti to
 * the requested time.
 *
 *   x1 = x0 + h*v0 + (h^2/2)*a0 + (h^3/6)*adot0
 *   v1 = v0 + h*a0 + (h^2/2)*adot0          with h = time - ti
 *
 * x0, v0, a0, adot0 are read only; x1 and v1 receive the prediction.
 */
void predict(REAL time,
             REAL x1[DIM],
             REAL v1[DIM],
             REAL ti,
             REAL x0[DIM],
             REAL v0[DIM],
             REAL a0[DIM],
             REAL adot0[DIM])
{
    REAL h, half_h2, sixth_h3;
    int c;

    h = time - ti;
    half_h2 = 0.5*h*h;
    sixth_h3 = 1.0/3.0*h*half_h2;

    /* positions first, for every component ... */
    c = 0;
    while (c < DIM) {
        x1[c] = x0[c] + h*v0[c] + half_h2*a0[c] + sixth_h3*adot0[c];
        c++;
    }
    /* ... then velocities */
    c = 0;
    while (c < DIM) {
        v1[c] = v0[c] + h*a0[c] + half_h2*adot0[c];
        c++;
    }
}

/*
 * Floating-point remainder of x / y.
 *
 * Returns 0 exactly when x is an integer multiple of y, which is the
 * only property the caller in correct() relies on.
 *
 * The previous implementation truncated x/y through an int cast, which
 * is undefined once the quotient exceeds INT_MAX (easily reached when
 * time is divided by a very small time step), and it divided the
 * fractional part by y instead of multiplying.  fmod() has neither
 * problem and computes the remainder without rounding error.
 */
REAL mod(REAL x, REAL y)
{
    return fmod(x, y);
}

/*
 * Corrector step and time-step update for one particle.
 *
 * From the old (a0, adot0) and new (a1, adot1) force data over the step
 * *dt, reconstructs the second and third derivatives of the
 * acceleration (a2, a3) and adds the corresponding higher-order terms
 * to the predicted x1 and v1.
 *
 * Unless SHAREDTIMESTEP is defined, *dt is then adjusted:
 *   - a candidate step nextdt is formed from eta and the magnitudes of
 *     a1, adot1, a2 + dt*a3 and a3 (components 0..2);
 *   - if nextdt is below the current step (and above 1.0e-8) the step
 *     becomes 2^power with power = (int)(log2(nextdt) - 1);
 *   - if nextdt exceeds twice the (possibly just updated) step, time is
 *     a multiple of that doubled step, and the doubled step does not
 *     exceed MAXTIMESTEP, the step is doubled.
 *
 * x0 and v0 are accepted but not used here.
 */
void correct(REAL x1[DIM], REAL v1[DIM],
             REAL x0[DIM], REAL v0[DIM], REAL a0[DIM], REAL adot0[DIM],
             REAL a1[DIM], REAL adot1[DIM],
             REAL *dt, REAL time, REAL eta)
{
    REAL a2[DIM], a3[DIM];
    REAL h3_6, h4_24, h5_120;
    REAL inv_h, inv_h2, inv_h3;
    int c;

    h3_6 = (*dt)*(*dt)*(*dt)/6.0;
    h4_24 = h3_6*(*dt)/4.0;
    h5_120 = h4_24*(*dt)/5.0;
    inv_h = 1.0/(*dt);
    inv_h2 = inv_h*inv_h;
    inv_h3 = inv_h2*inv_h;

    for (c = 0; c < DIM; c++) {
        REAL da = a0[c]-a1[c];
        REAL jerk_4_2 = 4.0*adot0[c] + 2.0*adot1[c];
        REAL jerk_sum = adot0[c] + adot1[c];

        a2[c] = -6.0*da*inv_h2 - jerk_4_2*inv_h;
        a3[c] = 12.0*da*inv_h3 + 6.0*jerk_sum*inv_h2;
        x1[c] +=  h4_24*a2[c] + h5_120*a3[c];
        v1[c] +=  h3_6*a2[c] + h4_24*a3[c];
    }

#ifndef SHAREDTIMESTEP
    {
        REAL a2_end[DIM];   /* a2 advanced to the end of the step */
        REAL mag_a, mag_adot, mag_a2, mag_a3;
        REAL numer, denom, nextdt;

        mag_a = sqrt(a1[0]*a1[0]+a1[1]*a1[1]+a1[2]*a1[2]);
        mag_adot = sqrt(adot1[0]*adot1[0]+adot1[1]*adot1[1]+adot1[2]*adot1[2]);
        for (c = 0; c < DIM; c++) {
            a2_end[c] = a2[c] + (*dt)*a3[c];
        }
        mag_a2 = sqrt(a2_end[0]*a2_end[0]+a2_end[1]*a2_end[1]+a2_end[2]*a2_end[2]);
        mag_a3 = sqrt(a3[0]*a3[0]+a3[1]*a3[1]+a3[2]*a3[2]);

        numer = mag_a*mag_a2+mag_adot*mag_adot;
        denom = mag_adot*mag_a3+mag_a2*mag_a2;
        nextdt = sqrt(eta*numer/denom);

        /* shrink: pick a power of two below the candidate step */
        if ((nextdt < (*dt)) && (nextdt > 1.0e-8)) {
            int power = (int)(log(nextdt)/log(2.0)-1);
            *dt = pow(2.0,(double)power);
        }
        /* grow: only by a factor of two, only when time is aligned to
           the doubled step, and never beyond MAXTIMESTEP */
        if ((nextdt > 2.0*(*dt)) && (mod(time,2.0*(*dt))==0)
            && ((2.0*(*dt))<=MAXTIMESTEP)) {
            *dt *= 2.0;
        }
    }
#endif
}

/* With PRIMITIVEAPI the g6 calls in this file take an explicit cluster
   id instead of using the *_all variants; this file always uses id 0. */
#if PRIMITIVEAPI
#warning PRIMITIVEAPI defined.
static int clusterid=0;
#endif

/*
 * Configure the g6 unit settings and open the device(s).
 *
 * Both the time unit and the position unit are set to 51.
 * NOTE(review): the original comments read "2^51" for the time unit
 * and "2^50" for the position unit although both values are 51 --
 * confirm which position unit is intended.
 *
 * With PRIMITIVEAPI a single cluster (clusterid) is opened; otherwise
 * g6_open_all() is used.
 */
void hold_grape(void)
{
    g6_set_tunit(51);
    g6_set_xunit(51);

#if PRIMITIVEAPI
    g6_open(clusterid);
#else
    g6_open_all();
#endif
}

/*
 * Counterpart of hold_grape(): closes what hold_grape() opened.
 * With PRIMITIVEAPI only the cluster identified by clusterid is closed;
 * otherwise g6_close_all() is called.
 */
void free_grape(void)
{
#if PRIMITIVEAPI
    g6_close(clusterid);
#else
    g6_close_all();
#endif
}

/*
 * Choose the starting time step of every particle.
 *
 * For each particle the raw step is eta_s * |a| / |adot| (or eta_s
 * itself when adot is exactly zero), using components 0..2.  It is
 * then replaced by 2^(power-1), where power is log2 of the raw step
 * truncated to int, and capped at MAXTIMESTEP.
 *
 * When SHAREDTIMESTEP is defined every step is finally overwritten
 * with 1/512.
 */
void initial_timestep(REAL a[NMAX][DIM],
                      REAL adot[NMAX][DIM],
                      REAL dt[NMAX],
                      int n,
                      REAL eta_s)
{
    int p;

    for (p = 0; p < n; p++) {
        REAL acc2 = a[p][0]*a[p][0]+a[p][1]*a[p][1]+a[p][2]*a[p][2];
        REAL jerk2 = adot[p][0]*adot[p][0]+adot[p][1]*adot[p][1]+adot[p][2]*adot[p][2];
        REAL step;
        int power;

        if (jerk2 == 0) {
            step = eta_s;
        } else {
            step = eta_s*sqrt(acc2/jerk2);
        }

        /* quantise to a power of two (truncating conversion to int) */
        power = log(step)/log(2.0);
        step = pow(2.0,(double)(power-1));
        if (step > MAXTIMESTEP) {
            step = MAXTIMESTEP;
        }
        dt[p] = step;

#ifdef SHAREDTIMESTEP
        dt[p] = 1.0/512.0;
#endif
    }
}

#ifdef ANIMATION

/*
 * Hand the current particle positions to plot_particle2D().
 *
 * *ratio is forced to 0.2 on every call; each coordinate is mapped to
 * x * (*ratio) + 0.5.  Every particle also gets an integer code chosen
 * from its mass (the highest matching rule wins):
 *
 *   m > 0.3      -> 10
 *   m > 0.2      -> 11
 *   m > 0.1      -> 12
 *   m > 2*initm  -> 13
 *   m > initm    -> 14
 *   otherwise    ->  0
 *
 * Earlier revisions carried (commented-out) code that auto-scaled the
 * view from the largest |x| and changed *ratio from mouse buttons; that
 * code was inactive and is not reproduced here.
 */
void plot_star(REAL x[NMAX][DIM], int n, REAL time, REAL *ratio, REAL m[NMAX], REAL initm)
{
    /* static: too large for the stack when NMAX is big */
    static REAL anix[NMAX][DIM];
    static int tmpc[NMAX];
    int p, c;

    *ratio = 0.2;
    for (p = 0; p < n; p++) {
        int code;

        for (c = 0; c < DIM; c++) {
            anix[p][c] = x[p][c]* *ratio + 0.5;
        }

        if (m[p]>0.3) {
            code = 10;
        } else if (m[p]>0.2) {
            code = 11;
        } else if (m[p]>0.1) {
            code = 12;
        } else if (m[p]>2*initm) {
            code = 13;
        } else if (m[p]>initm) {
            code = 14;
        } else {
            code = 0;
        }
        tmpc[p] = code;
    }

    plot_particle2D(anix,n,tmpc,time);
}
#endif

/*
 * Program entry point: individual-time-step N-body integration driven
 * through the g6 interface.
 *
 * Usage: an optional single argument names the parameter file; without
 * it "inputpara" is read.  The parameter file supplies, in order:
 *   1/eps (9999 means eps = 0), snapshot interval, end time,
 *   energy-report interval, input data file name, eta_s, eta.
 *
 * Outline:
 *   1. open the device, size the per-pipeline staging buffers;
 *   2. read parameters and the initial particle data;
 *   3. evaluate the initial force, energy and time steps and load all
 *      particles as j-particles;
 *   4. loop until endtime: pick the block of particles due at the next
 *      time, predict them, obtain new forces in chunks of npipe,
 *      correct them, write them back as j-particles, and emit energy
 *      reports / snapshots ("nemoout") when their times are reached.
 *
 * Fixes relative to the earlier revision:
 *   - tgrape and tcommj are initialised before being accumulated;
 *   - the fopen() failure message names the file actually requested and
 *     the dead duplicate NULL check is gone;
 *   - index2 is sized with the element type of index2;
 *   - realloc(), fscanf() and the snapshot fopen() results are checked,
 *     and the file-name field width is bounded to fit filename[];
 *   - a non-positive pipeline count is rejected (it would make the
 *     chunk loops spin forever);
 *   - the scans for particles sharing the current time no longer read
 *     index[] at position n.
 */
int
main(int argc, char **argv)
{
    REAL eps,epsinv;
    char filename[200],radfile[100];
    const char *parafile;
    REAL icm[DIM],init_ene,time=0.0,outtime,douttime,endtime;
    REAL deouttime,eouttime,eta_s,eta,maxm,alpha,rtidal,eadd,initm,ratio;
    FILE *fp1,*fp2;
    int step,n,i,k,nts=0,nsame,isame,ni;
    REAL lt=0,st=0,pst=0,st0=0,lt0=0;
    int nstep=0,nstepp=0;

    /* per-particle state; "0" = last corrected values, "1" = values at
       the current block time */
    static REAL x0[NMAX][DIM];
    static REAL v0[NMAX][DIM];
    static REAL a0[NMAX][DIM];
    static REAL adot0[NMAX][DIM];
    static REAL x1[NMAX][DIM];
    static REAL v1[NMAX][DIM];
    static REAL a1[NMAX][DIM];
    static REAL adot1[NMAX][DIM];
    static REAL m[NMAX];
    static REAL dti[NMAX];
    static REAL ti[NMAX];
    static REAL pot[NMAX];
    static int index[NMAX],proflag;

    int ii,idi,iii;
    int jpmax,nboards,nodd,one=1,zero=0,ip,j;
    REAL over2=1.0/2.0,over6=1.0/6.0;
    REAL aby2[DIM];
    REAL a1by6[DIM];
    REAL a2by18[DIM];
    REAL h2=0.0;
    static REAL eps2;
    static int dmaindex[NMAX];
    /* staging buffers holding one chunk (at most npipe) of i-particles */
    static int *index2 = NULL;
    static REAL (*xi)[3] = NULL;
    static REAL (*vi)[3] = NULL;
    static REAL *h2i = NULL;
    static REAL (*foldi)[3] = NULL;
    static REAL (*joldi)[3] = NULL;
    static REAL *phioldi = NULL;
    static REAL (*tmpa)[DIM] = NULL;
    static REAL (*tmpadot)[DIM] = NULL;
    static REAL *tmppot = NULL;
#ifdef COMPARE
    static REAL a1h[NMAX][DIM];
    static REAL adot1h[NMAX][DIM];
    static REAL poth[NMAX];
    static REAL x1h[NMAX][DIM];
    static REAL v1h[NMAX][DIM];
#endif
    /* timing accumulators start at zero: they are only ever updated
       with += before the first energy report resets them */
    double suma[DIM],sumadot[DIM],tgrape=0.0,tcommj=0.0;
    struct previous prev;
    static int npipe = 0;

    hold_grape();
    if (npipe < g6_npipes()) {
        npipe = g6_npipes();
        xi        = (REAL (*)[3])  realloc(xi,      sizeof(REAL) * 3 * npipe);
        vi        = (REAL (*)[3])  realloc(vi,      sizeof(REAL) * 3 * npipe);
        h2i       = (REAL *)       realloc(h2i,     sizeof(REAL) * npipe);
        foldi     = (REAL (*)[3])  realloc(foldi,   sizeof(REAL) * 3 * npipe);
        joldi     = (REAL (*)[3])  realloc(joldi,   sizeof(REAL) * 3 * npipe);
        phioldi   = (REAL *)       realloc(phioldi, sizeof(REAL) * npipe);
        index2    = (int *)        realloc(index2,  sizeof(*index2) * npipe);
        tmpa      = (REAL (*)[DIM])realloc(tmpa,    sizeof(REAL) * DIM * npipe);
        tmpadot   = (REAL (*)[DIM])realloc(tmpadot, sizeof(REAL) * DIM * npipe);
        tmppot    = (REAL *)       realloc(tmppot,  sizeof(REAL) * npipe);
        if (xi == NULL || vi == NULL || h2i == NULL || foldi == NULL ||
            joldi == NULL || phioldi == NULL || index2 == NULL ||
            tmpa == NULL || tmpadot == NULL || tmppot == NULL) {
            fprintf(stderr, "main: cannot allocate buffers for %d pipelines\n", npipe);
            exit(1);
        }
    }
    if (npipe <= 0) {
        /* the chunk loops below advance by npipe and would never end */
        fprintf(stderr, "main: g6_npipes() reported no pipelines\n");
        exit(1);
    }
    prev.flag = 0;
    prev.rc = prev.cod[0] = prev.cod[1] = prev.cod[2] = 0.0;
    prev.total_m = 1.0;

    if(argc==2){
        parafile = argv[1];
    }else{
        parafile = "inputpara";
    }
    fp2 = fopen(parafile,"r");
    if (fp2 == NULL) {
        char buf[256];
        /* precisions keep the message within buf whatever the lengths
           of __FILE__ and the requested file name */
        sprintf(buf, "In %.64s main() fopen(\"%.128s\", \"r\") failed. ",
                __FILE__, parafile);
        perror(buf);
        exit(1);
    }

    /*                      (input  para)
                            1.0/eps: softening parameter (9999=> eps=0)
                            douttime: interval of output
                            endtime: end time
                            deoutime: interval of energy output
                            filename: name of input file 
                            eta_s: parameter for initial timestep determination
                            eta: parameter for timestep determination
    */
    if (fscanf(fp2,"%lf%lf%lf%lf",&epsinv,&douttime,&endtime,&deouttime) != 4 ||
        fscanf(fp2,"%199s%lf%lf",filename,&eta_s,&eta) != 3) {
        fprintf(stderr, "main: malformed parameter file \"%s\"\n", parafile);
        fclose(fp2);
        exit(1);
    }
    fclose(fp2);

    get_cputime(&st,&lt);
        
    if(epsinv==9999.0){
        eps = 0.0;
    }else{
        eps = 1.0/epsinv;
    }

    data_input(x0,v0,m,icm,&n,filename,&time);
    printf("eta_s,eta %e %e n %d\n",eta_s,eta,n);
    initm = m[0];

    proflag=0;

    /* every particle starts at the time read from the input data */
    if(time==0.0){
        for(i=0;i<n;i++) ti[i] = 0.0;
    }else{
        for(i=0;i<n;i++) ti[i] = time;
    }    

    outtime = douttime + time;
    eouttime = deouttime + time;
    printf("eps:%g douttime:%g \ndeouttime:%g endtime:%g \n",
           eps,douttime,deouttime,endtime);
    printf("inputfile %s \n",filename);
    printf("npipe %d\n",npipe);

    /* placeholder values, overwritten by force() when a force path is
       compiled in */
    for(i=0;i<n;i++){
        for(k=0;k<DIM;k++){
            a0[i][k] = 1.0;
            adot0[i][k] = 1000.0;
        }
        pot[i] = -1.0;
    }
    force(x0,v0,m,ti,eps,a0,adot0,pot,n);
    printf("initial force\n");

    //    force(x0,v0,m,ti,eps,a0,adot0,pot,n);
    //    printf("initial force\n");

    /*    for(i=0;i<10;i++){
          printf("i a adot pot %d %f %f %f\n",i,a0[i][0],adot0[i][0],pot[i]);
          }
    */
    
#ifdef ANIMATION    
    initial_animation();
    ratio = 1.0;
    plot_star(x0,n,time,&ratio,m,initm);
#endif
    initial_energy(pot,x0,v0,m,n,eps,&init_ene);
    fflush(stdout);

    /*        radius(time,x0,v0,m,n,dmaindex,&prev,pot,"radfile");*/

    initial_timestep(a0,adot0,dti,n,eta_s);
    for(i=0;i<n;i++) index[i] = i;

#if JPSORTED
    /* j-particles are stored at their sorted position i */
    sort_timestep_m(0,n-1,dti,index);
    for(i=0;i<n;i++){
        idi = index[i];
        for(k=0;k<DIM;k++){
            aby2[k] = over2*a0[idi][k];
            a1by6[k] = over6*adot0[idi][k];
            a2by18[k] = 0.0;
        }
#if PRIMITIVEAPI
        g6_set_j_particle(clusterid,i,idi,ti[idi],dti[idi],m[idi],a2by18,a1by6,aby2,v0[idi],x0[idi]);
#else
        g6_set_j_particle_all(i,idi,ti[idi],dti[idi],m[idi],a2by18,a1by6,aby2,v0[idi],x0[idi]);
#endif
    }

#else // !JPSORTED
    set_particle_on_grape(x0,v0,a0,adot0,m,ti,dti,n);

#endif // JPSORTED

    nsame = n;
    do{
        REAL nextt;
        int ncor=0;

#if !JPSORTED
        sort_timestep_m(0,nsame-1,dti,index);
#endif
        /* the block time is the next update time of the first particle
           in index[] */
        time = ti[index[0]] + dti[index[0]];
        //  printf("time ti dti0 %22.14e %22.14e %e %d\n",time,ti[index[0]],dti[index[0]],nsame);
        nts++;

#ifdef ONLYHOST
        for(i=0;i<n;i++)predict(time,x1[i],v1[i],ti[i],x0[i],v0[i],a0[i],adot0[i]);
        i=0;
        do{               
            int ii;
            ii = index[i];
            force_on_ith_particle(ii,x1[ii],v1[ii],x1,v1,m,eps,a1[ii],adot1[ii],&pot[ii],n);    
            correct(x1[ii],v1[ii],x0[ii],v0[ii],a0[ii],adot0[ii],a1[ii],adot1[ii],&dti[ii],time,eta);
            ti[ii] = time;
            for(k=0;k<DIM;k++){ 
                x0[ii][k] = x1[ii][k];
                v0[ii][k] = v1[ii][k];
                a0[ii][k] = a1[ii][k];
                adot0[ii][k] = adot1[ii][k];
            } 
            i++;
            ncor++;
            /* stop before looking at index[] beyond the last particle */
            if(i>=n) break;
            nextt = ti[index[i]] + dti[index[i]];
        }while(time==nextt);
#endif
#ifdef COMPARE
        for(i=0;i<n;i++)predict(time,x1h[i],v1h[i],ti[i],x0[i],v0[i],a0[i],adot0[i]);
#endif
        /* count the leading particles in index[] that are due at this
           time; the bound is tested before index[] is read */
        isame = 1;
        while(isame<n){
            nextt = ti[index[isame]] + dti[index[isame]];
            if(time != nextt) break;
            isame++;
        }
        nsame = isame;

        for(i=0;i<nsame;i++){
            idi = index[i];        
            predict(time,x1[idi],v1[idi],ti[idi],x0[idi],v0[idi],a0[idi],adot0[idi]);
            ti[idi] = time;
        }
        get_cputime(&st0,&lt0);
#if PRIMITIVEAPI
        g6_set_ti(clusterid,time);
#else
        g6_set_ti_all(time);
#endif
        eps2 = eps*eps;

        /* force evaluation, npipe i-particles per call */
        for(i=0;i<nsame;i+= npipe){
            ni = npipe;
            if(i+ni>nsame) ni = nsame - i;

            /* stage predicted state and previous force data of the chunk */
            for(ii=0;ii<ni;ii++){
                idi = index[i+ii];
                index2[ii] = index[i+ii];
                for(k=0;k<3;k++){
                    xi[ii][k] = x1[idi][k];
                    vi[ii][k] = v1[idi][k];
                    foldi[ii][k] = a0[idi][k];
                    joldi[ii][k] = adot0[idi][k];
                }
                h2i[ii] = h2;
                phioldi[ii] = pot[idi];
            }

#if PRIMITIVEAPI
            g6calc_firsthalf(clusterid,n,ni,index2,xi,vi,foldi,joldi,phioldi,eps2,h2i);
            g6calc_lasthalf(clusterid,n,ni,index2,xi,vi,eps,h2i,tmpa,tmpadot,tmppot);
#else
            g6calc_firsthalf_all(n,ni,index2,xi,vi,foldi,joldi,phioldi,eps2,h2i);
            g6calc_lasthalf_all(n,ni,index2,xi,vi,eps,h2i,tmpa,tmpadot,tmppot);
#endif
            /* scatter the chunk results back to the per-particle arrays */
            for(ii=i;ii<(ni+i);ii++){
                idi = index[ii];
                for(k=0;k<DIM;k++){
                    a1[idi][k] = tmpa[ii-i][k];
                    adot1[idi][k] = tmpadot[ii-i][k];
                }    
                pot[idi] = tmppot[ii-i];

#ifdef COMPARE
                force_on_ith_particle(idi,x1h[idi],v1h[idi],x1h,v1h,m,eps,a1h[idi],adot1h[idi],&poth[idi],n);    

                if((fabs(a1[idi][0]-a1h[idi][0])>3.0e-4)||(fabs(a1[idi][1]-a1h[idi][1])>3.0e-4)
                   ||(fabs(a1[idi][2]-a1h[idi][2])>3.0e-4)){
                    printf("t %g %d g6 i %d ii %d a %g %g %g j %g %g %g p %g\n",time,nsame,idi,ii,
                           a1[idi][0],a1[idi][1],a1[idi][2],adot1[idi][0],adot1[idi][1],adot1[idi][2],pot[idi]);
                    printf("t %g %d ho i %d ii %d a %g %g %g j %g %g %g p %g\n",time,nsame,idi,ii,
                           a1h[idi][0],a1h[idi][1],a1h[idi][2],adot1h[idi][0],adot1h[idi][1],adot1h[idi][2],poth[idi]);
                }
                printf("t %g %d g6 i %d ii %d a %g %g %g j %g %g %g p %g\n",time,nsame,idi,ii,
                       a1[idi][0],a1[idi][1],a1[idi][2],adot1[idi][0],adot1[idi][1],adot1[idi][2],pot[idi]);
                printf("t %g %d ho i %d ii %d a %g %g %g j %g %g %g p %g\n",time,nsame,idi,ii,
                       a1h[idi][0],a1h[idi][1],a1h[idi][2],adot1h[idi][0],adot1h[idi][1],adot1h[idi][2],poth[idi]);
#endif
            }  
            nstepp += npipe; 
        }

        get_cputime(&st0,&lt0);
        tgrape += st0;

        /* corrector and new time step for every particle of the block */
        for(i=0;i<nsame;i+= npipe){
            ni = npipe;
            if(i+ni>nsame) ni = nsame - i;

            for(ii=i;ii<(ni+i);ii++){
                idi = index[ii];
                correct(x1[idi],v1[idi],x0[idi],v0[idi],a0[idi],adot0[idi],a1[idi],adot1[idi],&dti[idi],time,eta);
            }
        }

        get_cputime(&st0,&lt0);

#if JPSORTED
        sort_timestep_m(0,nsame-1,dti,index);
#endif
        /* commit the corrected state and refresh the j-particle data */
        for(i=0;i<nsame;i+= npipe){
            ni = npipe;
            if(i+ni>nsame) ni = nsame - i;

            for(ii=i;ii<(ni+i);ii++){
                idi = index[ii];
                for(k=0;k<DIM;k++){ 
                    x0[idi][k] = x1[idi][k];
                    v0[idi][k] = v1[idi][k];
                    a0[idi][k] = a1[idi][k];
                    adot0[idi][k] = adot1[idi][k];
                } 
  
                for(k=0;k<DIM;k++){
                    aby2[k] = over2*a0[idi][k];
                    a1by6[k] = over6*adot0[idi][k];
                    a2by18[k] = 0.0;
                }
#if JPSORTED
#warning JPSORTED defined.
#if PRIMITIVEAPI
                g6_set_j_particle(clusterid,ii,idi,ti[idi],dti[idi],m[idi],a2by18,a1by6,aby2,v0[idi],x0[idi]);        
#else
                g6_set_j_particle_all(ii,idi,ti[idi],dti[idi],m[idi],a2by18,a1by6,aby2,v0[idi],x0[idi]);        
#endif
#else
#if PRIMITIVEAPI
                g6_set_j_particle(clusterid,idi,idi,ti[idi],dti[idi],m[idi],a2by18,a1by6,aby2,v0[idi],x0[idi]);
#else
                g6_set_j_particle_all(idi,idi,ti[idi],dti[idi],m[idi],a2by18,a1by6,aby2,v0[idi],x0[idi]);
#endif
#endif
            }
        }
        get_cputime(&st0,&lt0);
        tgrape += st0;
        tcommj += st0;      
        ncor = nsame;

#ifdef ANIMATION    
        if(((time*64)-(int)(time*64))==0)plot_star(x1,n,time,&ratio,m,initm);
#endif
        nstep += ncor;
        if( time >= eouttime){
            double tg;
            energy(time,pot,x1,v1,m,n,eps,init_ene);
            eouttime += deouttime;
	    printf("   time %g %d ",time,ncor);
            printf("nts %d nstep %d %d %g\n",nts,nstep,nstepp,(double)nstep/nstepp);
            get_cputime(&st,&lt);
	    printf("   cputime %g ",st);
            printf("speed %g Gflops %g nstep/s\n",
		   57.0*((double)n)*((double)nstep)/st/1e9,
                   nstep/st);

            // tg = nstep*(n+100.0)/(380e6*512.0/55.5);
            // printf("tgrape+comm %g tgrape %g tcomm %g %g %g thost %g %g\n",
            // tgrape,tg,tgrape-tg,(tgrape-tg)/nstep,(tgrape-tg)/nstep/188.0,st-tgrape,(st-tgrape)/nstep);
            // printf("tcommj %g %g\n",tcommj,tcommj/nstep);
            /* counters and timers restart after every energy report */
            nts=0;
            nstep=0;
            nstepp=0;
            pst = lt;
            tgrape = 0;
            tcommj = 0;        
            fflush(stdout);

        }
        ncor=0;

        if( time >= outtime) {
            outtime += douttime;
            /*            radius(time,x1,v1,m,n,dmaindex,&prev,pot,"radfile");*/
            /* the snapshot file is rewritten from scratch each time */
            fp1 = fopen("nemoout","w");
            if (fp1 == NULL) {
                /* report and keep integrating rather than handing a
                   NULL stream to data_output() */
                perror("main: fopen(\"nemoout\", \"w\")");
            } else {
                data_output(time,x1,v1,m,n,fp1);
                fclose(fp1);
            }
        }

    }while(time<endtime);
    free_grape();
    exit(0);
}
