#include <malloc.h>
#include <math.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#define REAL double
#define DIM 3
#include "jlist.h"
#define NTHREAD 4

/* Mutex initialized by initialize_force_function(); no code in this file
   currently locks it. */
pthread_mutex_t force_mutex;

/*
 * Evaluate the forces on one group of i-particles against one interaction
 * list using the GRAPE board identified by *devid.
 *
 *   x       particle positions; rows are selected through ilist
 *   m       particle masses (not referenced in this routine)
 *   a       output accelerations; a[ilist[k]] is overwritten for k < nilist
 *   pot     output potentials; pot[ilist[k]] receives the negated value
 *           returned by g5_get_forceMC()
 *   ilist   indices (into x, a, pot) of the i-particles
 *   nilist  number of entries in ilist
 *   jlist   j-particle positions and masses that are loaded onto the board
 *   njlist  number of j-particles
 *   devid   pointer to the device id handed to every g5_*MC call
 *
 * The i-particles are sent in chunks of at most npipes entries, where npipes
 * is the value reported by g5_get_number_of_pipelines_per_board().  The
 * scratch buffers are sized from npipes so that a chunk always fits.
 * The process exits with a message if the pipeline count is not positive or
 * if the scratch buffers cannot be allocated.
 */
void calculate_force(x,m,a,pot,ilist,nilist,jlist,njlist,devid)
REAL x[][3];
REAL m[];
REAL a[][3];
REAL pot[];
int ilist[];
int nilist;
struct jlist_t *jlist;
int njlist;
int *devid;
{
  int i,ii,k,nn,npipes,idevid;
  double (*xi)[3];     /* chunk of i-particle positions sent to the board */
  double (*atmp)[3];   /* chunk of accelerations read back */
  double *ptmp;        /* chunk of potentials read back */
  double lt=0,st=0;

  /* NOTE(review): the values returned by the get_wcputime() calls in this
     routine are not reported anywhere.  The calls are kept because
     get_wcputime() is defined elsewhere and may update timer state that the
     caller's own timing depends on -- confirm before removing them. */
  get_wcputime(&lt,&st);

  npipes = g5_get_number_of_pipelines_per_board();
  if(npipes<=0){
    /* A non-positive chunk size would keep the chunk loop from advancing. */
    fprintf(stderr,"calculate_force: invalid pipeline count %d\n",npipes);
    exit(EXIT_FAILURE);
  }

  xi = malloc(sizeof(*xi)*(size_t)npipes);
  atmp = malloc(sizeof(*atmp)*(size_t)npipes);
  ptmp = malloc(sizeof(*ptmp)*(size_t)npipes);
  if(xi==NULL || atmp==NULL || ptmp==NULL){
    fprintf(stderr,"calculate_force: cannot allocate scratch buffers for %d pipelines\n",npipes);
    free(xi); free(atmp); free(ptmp);
    exit(EXIT_FAILURE);
  }

  idevid = *devid;

  /* Load the j-particle positions and masses in one call. */
  g5_set_xmjMC(idevid,0,njlist,(*jlist).x,(*jlist).mass);

  get_wcputime(&lt,&st);
  get_wcputime(&lt,&st);

  g5_set_nMC(idevid,njlist);

  for(i=0;i<nilist;i+=npipes){
    /* The last chunk may be shorter than npipes. */
    nn = npipes;
    if((i+npipes)>nilist) nn = nilist-i;

    /* Gather the positions of this chunk's i-particles. */
    for(ii=0;ii<nn;ii++){
      for(k=0;k<3;k++) xi[ii][k] = x[ilist[i+ii]][k];
    }

    g5_set_xiMC(idevid,nn,xi);
    g5_runMC(idevid);
    g5_get_forceMC(idevid,nn,atmp,ptmp);

    /* Scatter the results back; the potential is stored with flipped sign. */
    for(ii=0;ii<nn;ii++){
      int iii = ilist[i+ii];
      for(k=0;k<3;k++) a[iii][k] = atmp[ii][k];
      pot[iii] = -ptmp[ii];
    }
  }

  get_wcputime(&lt,&st);

  free(xi); free(atmp); free(ptmp);
}

/* Per-device flag: calculate_force_using_tree_partial() calls g5_openMC()
   for a device while its entry is 1.  Nothing in this file clears it.
   The type is spelled out because implicit int is not valid since C99. */
static int first_flag[NTHREAD]={1,1,1,1};

/*
 * Process positions [*p_iwalkfirst, *p_nwalk) of walklist on the GRAPE board
 * identified by *devid.
 *
 * For each position ii the cell clist[walklist[ii]] supplies the i-particles
 * (index[ifirst] .. index[ifirst+n-1]); make_interaction_list() fills jlist
 * for that cell and calculate_force() stores the resulting a[] and pot[]
 * entries.  The board is opened at the start of the call and closed at the
 * end, and summary statistics are printed to stdout.
 *
 *   p_iwalkfirst  first position in walklist to process
 *   p_nwalk       one past the last position in walklist to process
 *   p_n           divisor of the "ave" statistic printed at the end
 *                 (NOTE(review): presumably the total particle count)
 *   x, m          particle positions and masses
 *   a, pot        outputs, written by calculate_force()
 *   p_eps         value passed to g5_set_eps_to_allMC()
 *   clist         cell list
 *   walklist      cell numbers to walk
 *   index         particle indices referenced by clist[].ifirst
 *   p_maxx        passed to center_of_cell(); +/- 2*maxx is the board range
 *   st            only read, for the closing log line
 *   devid         pointer to the device id
 *
 * The process exits with a message if a buffer cannot be allocated.
 */
void calculate_force_using_tree_partial(p_iwalkfirst,p_nwalk,p_n,x,m,a,pot,p_eps,clist,walklist,index,p_maxx,st,devid)
int *p_iwalkfirst;
int *p_nwalk;
int *p_n;
REAL x[][3];
REAL m[];
REAL a[][3];
REAL pot[];
REAL *p_eps;
struct clist_t clist[];
int walklist[];
int index[];
double *p_maxx;
double *st;
int *devid;
{
  int ii,i,j,idevid;
  int iwalkfirst,nwalk,n;
  int ilist_cap = 0;          /* number of ints currently allocated in ilist */
  int *ilist = NULL;
  long int current_key;
  long int ninter,sumjlist,sumilist=0;
  double tlist,tgrape,lt=0;
  double stt=0;
  REAL eps,maxx;
  struct jlist_t *jlist;

  get_wcputime(&lt,&stt);
  printf("tread %d calculate force %22.14lf\n",*devid,stt);

  jlist = malloc(sizeof(struct jlist_t));
  if(jlist==NULL){
    fprintf(stderr,"calculate_force_using_tree_partial: cannot allocate interaction list\n");
    exit(EXIT_FAILURE);
  }

  iwalkfirst = *p_iwalkfirst;
  nwalk = *p_nwalk;
  n = *p_n;
  eps = *p_eps;
  maxx = *p_maxx;

  idevid = *devid;

  printf("calc_force thread %d iwf %d nw %d\n",idevid,iwalkfirst,nwalk);

  /* first_flag is never cleared, so the board is opened on every call; it is
     closed again by g5_closeMC() at the end of this function. */
  if(first_flag[idevid]==1){
    g5_openMC(idevid);
  }
  printf("npipe %d\n",g5_get_number_of_pipelines());
  g5_set_rangeMC(idevid,-maxx*2.0,maxx*2.0,m[0]);
  g5_set_eps_to_allMC(idevid,eps);

  tlist = tgrape = 0;

  ninter = sumjlist = 0;

  for(ii=iwalkfirst;ii<nwalk;ii++){
    int njlist=0,nilist,tmpif;
    double coc[3];

    i = walklist[ii];

    nilist = clist[i].n;
    tmpif = clist[i].ifirst;

    /* Grow the i-list buffer only when this cell needs more room.  The new
       pointer is checked before it replaces the old one, and realloc() is
       never called with a zero size. */
    if(nilist>ilist_cap){
      int *grown = realloc(ilist,sizeof(*ilist)*(size_t)nilist);
      if(grown==NULL){
        fprintf(stderr,"calculate_force_using_tree_partial: cannot allocate i-list of %d entries\n",nilist);
        exit(EXIT_FAILURE);
      }
      ilist = grown;
      ilist_cap = nilist;
    }

    for(j=0;j<nilist;j++){
      ilist[j] = index[j+tmpif];
    }

    center_of_cell(clist[i].key,clist[i].key_level,clist[i].length,coc,maxx);
    current_key = 1;
    make_interaction_list(clist[i].key,coc,clist[i].length,current_key,clist,&njlist,index,jlist,x,m);

    /* NOTE(review): this only reports the condition; whether jlist has
       already been overrun at this point depends on make_interaction_list(),
       which is not visible here. */
    if(njlist > NJMAX) printf("njlist %d > NJMAX\n",njlist);

    /* Multiply in long so the product is not truncated to int first. */
    ninter += (long int)njlist * (long int)nilist;
    sumjlist += njlist;
    sumilist += nilist;

    get_wcputime(&lt,&stt);
    tlist += lt;

    calculate_force(x,m,a,pot,ilist,nilist,jlist,njlist,devid);

    get_wcputime(&lt,&stt);
    tgrape += lt;
  }

  /* NOTE(review): this prints the caller-supplied *st, which this function
     never updates; the local stt may have been intended -- confirm. */
  printf("thread %d end calcforce %22.14lf\n",*devid,*st);

  g5_closeMC(idevid);

  /* Every value here is a long, so every conversion is %ld; the average is
     reported as 0 when n is 0 instead of dividing by zero. */
  printf("ninter %ld ave %ld sumjlist %ld sumilist %ld\n",
         ninter,(n!=0)?ninter/n:0L,sumjlist,sumilist);

  printf("cpu %g: making list\n",tlist);
  printf("cpu %g: calculating on GRAPE\n",tgrape);

  free(jlist);free(ilist);

}

/*
 * Argument bundle for one worker thread.  force_calculator() forwards the
 * members, in declaration order, to calculate_force_using_tree_partial().
 */
typedef struct {
  int            *iwalkf;     /* first walklist position for this thread */
  int            *nwalk;      /* one past the last walklist position */
  int            *n;          /* forwarded as p_n */
  REAL          (*x)[3];      /* particle positions */
  REAL           *m;          /* particle masses */
  REAL          (*a)[3];      /* output accelerations */
  REAL           *pot;        /* output potentials */
  REAL           *eps;        /* forwarded as p_eps */
  struct clist_t *clist;      /* cell list */
  int            *walklist;   /* cell numbers to walk */
  int            *index;      /* particle index table */
  double         *maxx;       /* forwarded as p_maxx */
  double         *st;         /* forwarded as st */
  int            *devid;      /* device id; also selects the per-thread sync objects */
} package_t;

/*
 * Hand-off state between calculate_force_using_tree() (the dispatcher) and
 * the worker threads, one slot per thread.
 *
 *   force_ok_flag[i]    set to 1 by the dispatcher under ok_mutex[i] when
 *                       work is ready; cleared by worker i when it picks the
 *                       work up
 *   force_done_flag[i]  set to 1 by worker i under done_mutex[i] when it has
 *                       finished; cleared by the dispatcher
 *
 * Both condition variables are always waited on in a loop that re-checks the
 * matching flag, so a signal that is sent before the other side starts
 * waiting is not lost and a spurious wakeup is harmless.
 */
static int force_ok_flag[NTHREAD]={0,0,0,0};
static int force_done_flag[NTHREAD]={0,0,0,0};
pthread_mutex_t ok_mutex[NTHREAD];
pthread_mutex_t done_mutex[NTHREAD];
pthread_cond_t ok_cond[NTHREAD];
pthread_cond_t done_cond[NTHREAD];

/*
 * Worker thread body.  arg is the package_t for this thread.  The thread
 * loops forever: wait until force_ok_flag is raised, run
 * calculate_force_using_tree_partial() on the package, then raise
 * force_done_flag and signal the dispatcher.
 */
void *force_calculator(void *arg)
{
  package_t *pp = (package_t *)arg;
  int idev;

  idev = *(pp->devid);
  printf("pthread created: %d\n",idev);

  while(1){
    /* Wait for work.  The flag is the predicate, so a request posted before
       this thread reached the wait is still seen. */
    pthread_mutex_lock(&ok_mutex[idev]);
    while(force_ok_flag[idev]==0){
      pthread_cond_wait(&ok_cond[idev],&ok_mutex[idev]);
    }
    force_ok_flag[idev] = 0;
    pthread_mutex_unlock(&ok_mutex[idev]);

    printf("pthread awake: %d done %d\n",idev,force_done_flag[idev]);

    calculate_force_using_tree_partial(pp->iwalkf,pp->nwalk,pp->n,pp->x,pp->m,pp->a,pp->pot,
     pp->eps,pp->clist,pp->walklist,pp->index,pp->maxx,pp->st,pp->devid);

    pthread_mutex_lock(&done_mutex[idev]);
    force_done_flag[idev] = 1;
    pthread_cond_signal(&done_cond[idev]);
    pthread_mutex_unlock(&done_mutex[idev]);

    printf("pthread done: %d\n",idev);
  }

  /* Not reached: the loop above never exits.  The package stays allocated
     because the dispatcher keeps using it through p[]. */
  return (NULL);
}

/* Per-thread packages and the storage their iwalkf/nwalk/devid members
   point at. */
static package_t *p[NTHREAD];
static int fixdevid[NTHREAD]={0,1,2,3};
static int iwalkf_b[NTHREAD],nwalk_b[NTHREAD];
pthread_t thread[NTHREAD];
pthread_attr_t custom_attr;

/*
 * Initialize the synchronization objects, allocate one package per thread
 * and start the NTHREAD worker threads.  The process exits with a message if
 * a package cannot be allocated or a thread cannot be created, since
 * calculate_force_using_tree() would otherwise wait forever for that worker.
 */
void initialize_force_function()
{
  int i,rc;

  for(i=0;i<NTHREAD;i++){
    pthread_mutex_init(&ok_mutex[i],NULL);
    pthread_mutex_init(&done_mutex[i],NULL);
    pthread_cond_init(&ok_cond[i],NULL);
    pthread_cond_init(&done_cond[i],NULL);
  }

  pthread_mutex_init(&force_mutex,NULL);

  /* Allocate every package before any thread starts. */
  for(i=0;i<NTHREAD;i++){
    p[i] = malloc(sizeof(*p[i]));
    if(p[i]==NULL){
      fprintf(stderr,"initialize_force_function: cannot allocate package %d\n",i);
      exit(EXIT_FAILURE);
    }
  }

  for(i=0;i<NTHREAD;i++){
    p[i]->iwalkf = &iwalkf_b[i];
    p[i]->nwalk = &nwalk_b[i];
    /* devid must be set before the thread starts: force_calculator() reads
       it immediately. */
    p[i]->devid = &fixdevid[i];

    rc = pthread_create(&thread[i], NULL, force_calculator, (void *)p[i]);
    if(rc!=0){
      fprintf(stderr,"initialize_force_function: pthread_create failed for thread %d (error %d)\n",i,rc);
      exit(EXIT_FAILURE);
    }
  }
}

/*
 * Compute forces for all walks by splitting walklist across the NTHREAD
 * worker threads and waiting until every worker has finished.
 *
 *   n         particle count used to balance the split; also forwarded to
 *             the workers
 *   x, m      particle positions and masses
 *   a, pot    outputs filled in by the workers
 *   eps       forwarded to the workers
 *   clist     cell list; clist[].n is the per-cell weight for the split
 *   nwalk     number of entries in walklist
 *   walklist  cell numbers to walk
 *   index     particle index table
 *   maxx      forwarded to the workers
 *   st        forwarded to the workers
 *
 * initialize_force_function() must have been called first.  The packages
 * point at this function's own parameters (n, eps, maxx), which is safe only
 * because the function does not return before all workers report done.
 */
void calculate_force_using_tree(n,x,m,a,pot,eps,clist,nwalk,walklist,index,maxx,st)
int n;
REAL x[][3];
 REAL m[];
REAL a[][3];
REAL pot[];
REAL eps;
struct clist_t clist[];
int nwalk;
int walklist[];
int index[];
double maxx;
double *st;
{
  int i,ii;
  int sumn;
  double ltt,stt0=0,stt1=0;

  get_wcputime(&ltt,&stt0);

  printf("calculate force %22.14lf\n",stt0);

  /* Give thread ii the walks up to the point where the running particle
     count reaches (ii+1) shares of n/NTHREAD. */
  i=0; sumn=0;
  for(ii=0;ii<NTHREAD;ii++){
    int nb;
    iwalkf_b[ii] = i;
    nb = (n/NTHREAD)*(ii+1);
    while((sumn<nb)&&(i<nwalk)){
      sumn += clist[walklist[i]].n;
      i++;
    }
    /* (n/NTHREAD)*NTHREAD is below n when n is not a multiple of NTHREAD, so
       the threshold alone can stop short of the end; the last thread always
       takes every remaining walk. */
    if(ii==NTHREAD-1) i = nwalk;
    nwalk_b[ii] = i;
  }

  for(i=0;i<NTHREAD;i++){
    p[i]->n = &n;
    p[i]->x = x;
    p[i]->m = m;
    p[i]->a = a;
    p[i]->pot = pot;
    p[i]->eps = &eps;
    p[i]->clist = clist;
    p[i]->walklist = walklist;
    p[i]->index = index;
    p[i]->maxx = &maxx;
    p[i]->st = st;
  }

  /* Post the work: raise the flag under the mutex, then signal. */
  for(i=0;i<NTHREAD;i++){
    pthread_mutex_lock(&ok_mutex[i]);
    force_ok_flag[i] = 1;
    pthread_cond_signal(&ok_cond[i]);
    pthread_mutex_unlock(&ok_mutex[i]);
  }

  /* Collect the results; loop on the flag to tolerate spurious wakeups. */
  for(i=0;i<NTHREAD;i++){
    pthread_mutex_lock(&done_mutex[i]);
    while(force_done_flag[i]==0){
      pthread_cond_wait(&done_cond[i],&done_mutex[i]);
    }
    force_done_flag[i]=0;
    pthread_mutex_unlock(&done_mutex[i]);
  }

  get_wcputime(&ltt,&stt1);
  printf("cpu %g : calculate force %22.14lf\n",stt1-stt0,stt1);

}



