// GRAPE-5 compatible APIs
#include <stdio.h>
#include <math.h>
#include <assert.h>
#include "sse_type.h"
#include "gp5util.h"
#ifdef CUTOFF_FORCE
#include "pg5_table.h"
#endif

#define NUM_PIPE 4
#define JMEMSIZE 65536

#ifndef MAXDEV
#define MAXDEV 4
#endif
/*
 * Per-device working storage.  ptcl_mem holds MAXDEV instances and is
 * declared with the ALIGN64 attribute macro; functions in this file select
 * an instance with "ptcl_mem + devid".
 */
static struct Ptcl_Mem{
	// i-particle batch written by g5_set_xiMC()
	Ipdata iptcl;
	// kernel output read back by g5_get_forceMC()
	Fodata fout;
	// j-particle memory filled by g5_set_xmjMC()
	Jpdata jptcl[JMEMSIZE];
	// Nbody: j-particle count set by g5_set_nMC(); pad is not referenced in this file
	int Nbody, pad[15];
} ptcl_mem[MAXDEV] ALIGN64;

// Set by g5_set_eps_to_all()/g5_set_eps2_to_all(); g5_set_xiMC() stores Eps*Eps
// into iptcl.eps2 (presumably the gravitational softening length — confirm).
static double Eps = 1./256.; 
// Recorded by g5_set_eta(); not read anywhere in this file.
static double Eta;

#ifdef CUTOFF_FORCE
// Scale factor set by pg5_set_xscale(); multiplies i-particle coordinates in g5_set_xiMC().
static double Xscale;
// {xscale, xscale, xscale, 1.0}; multiplies the packed j-particle vector in g5_set_xmjMC().
static v4sf XMscale;
// 1/xscale in all four lanes; passed to gravity_kernel() by g5_runMC().
static v4sf Ascale;
// Constant passed to gravity_kernel() as its sixth argument.  Its value depends on
// EXP_BIT, which is not defined in this file (presumably from pg5_table.h — confirm).
static v4sf R2cut_xscale2 = {
	(1<<(1+(1<<EXP_BIT))) - 3,
	(1<<(1+(1<<EXP_BIT))) - 3,
	(1<<(1+(1<<EXP_BIT))) - 3,
	(1<<(1+(1<<EXP_BIT))) - 3
};

void pg5_set_xscale(double xscale){
	Xscale = xscale;
	XMscale = (v4sf){xscale, xscale, xscale, 1.0};
	double ascale = 1./xscale;
	Ascale = (v4sf){ascale, ascale, ascale, ascale};
}
#else
// Multipliers applied to the kernel outputs in g5_get_forceMC(): the scalar
// pair is used by its generic branch, the vector pair by its __GNUC__ == 4
// branch.  g5_open() overwrites these defaults with values derived from
// rsqrt_bias() when PERIODIC_BOUNDARY is not defined.
static float Acc_correct = 1.0;
static float Pot_correct = -1.0;
static v4sf Acc_correctV = {1.0, 1.0, 1.0, 1.0};
static v4sf Pot_correctV = {-1.0, -1.0, -1.0, -1.0};
#endif

/******** GRAPE-5 APIs ********/

/* Returns NUM_PIPE, the largest i-particle batch accepted by g5_set_xiMC(). */
int g5_get_number_of_pipelines(void){
	const int npipe = NUM_PIPE;
	return npipe;
}
/* Returns JMEMSIZE, the number of j-particle slots in each device's jptcl array. */
int g5_get_jmemsize(void){
	const int capacity = JMEMSIZE;
	return capacity;
}

/*
 * Initialise the library.
 *
 * With PERIODIC_BOUNDARY defined this calls pg5_gen_s2_force_table().
 * Otherwise, on the first call only, it queries rsqrt_bias() and stores the
 * resulting correction factors (1 - 3*bias for accelerations, -(1 - bias)
 * for potentials) in Acc_correct/Pot_correct and their vector counterparts.
 * Later calls are no-ops.
 *
 * NOTE(review): the branch is chosen by PERIODIC_BOUNDARY, but the correction
 * statics written below exist only when CUTOFF_FORCE is undefined; the two
 * macros appear to be expected to be defined together — confirm in the build.
 */
void g5_open(void){
#ifdef PERIODIC_BOUNDARY
	pg5_gen_s2_force_table(SFT_FOR_PP, SFT_FOR_PM);
#else
	static int init_call = 1;
	if(init_call){
		double rsqrt_bias();
		double bias = rsqrt_bias();
		// Narrowed to float once so scalar and vector copies hold the same value.
		float acc_corr = 1.0 - 3*bias;
		float pot_corr = -(1.0 - bias);
		Acc_correct = acc_corr;
		Pot_correct = pot_corr;
		Acc_correctV = (v4sf){acc_corr, acc_corr, acc_corr, acc_corr};
		Pot_correctV = (v4sf){pot_corr, pot_corr, pot_corr, pot_corr};
		init_call = 0;
	}
#endif
}

/* Counterpart of g5_open(); this implementation has nothing to release. */
void g5_close(void){
}

/* Stores eta in the file-scope Eta; nothing else in this file reads it. */
void g5_set_eta(double eta){
	Eta = eta;
}

/*
 * Sets the file-scope Eps.  The new value takes effect on the next
 * g5_set_xiMC() call, which stores Eps*Eps for every i-particle.
 */
void g5_set_eps_to_all(double eps){
	Eps = eps;
}

void g5_set_eps2_to_all(double eps2){
    Eps = sqrt(eps2);
}

/*
 * Accepted for API compatibility only: all three arguments are ignored and
 * no state is changed.
 */
void g5_set_range(double xmin, double xmax, double mmin){
	(void)xmin;
	(void)xmax;
	(void)mmin;
}

#ifdef PERIODIC_BOUNDARY
/*
 * Placeholder: every argument is ignored and no table is built.
 * TODO: implement this by calling pg5_gen_force_table()
 * (cf. pg5_gen_plummer_force_table()).
 */
void g5_set_cutoff_table(double (*ffunc)(double), double fcut, double fcor,
					double (*pfunc)(double), double pcut, double pcor){
	(void)ffunc;
	(void)fcut;
	(void)fcor;
	(void)pfunc;
	(void)pcut;
	(void)pcor;
}
#endif

void g5_set_nMC(int devid, int n){
	struct Ptcl_Mem *pm = ptcl_mem + devid;
	pm->Nbody = n;
}

/* Single-device convenience wrapper: sets the j-particle count of device 0. */
void g5_set_n(int n){
	const int dev0 = 0;
	g5_set_nMC(dev0, n);
}

/*
 * Load up to NUM_PIPE i-particle positions into device devid.
 *
 * xi[k] holds the (x, y, z) of the k-th particle.  In the CUTOFF_FORCE build
 * each coordinate is narrowed to float and then multiplied by Xscale; in the
 * default build the coordinate is narrowed to float and Eps*Eps is stored
 * alongside it in iptcl.eps2.  Asserts that ni <= NUM_PIPE.
 */
void g5_set_xiMC(int devid, int ni, double (*xi)[3]){
	struct Ptcl_Mem *dev = &ptcl_mem[devid];
	int k;
#ifndef CUTOFF_FORCE
	// Identical for every particle in the batch, so computed once.
	const float soft2 = Eps*Eps;
#endif

	assert(ni <= NUM_PIPE);
	for(k=0; k<ni; k++){
		const double *pos = xi[k];
#ifdef CUTOFF_FORCE
		dev->iptcl.x[k] = (float)pos[0] * Xscale;
		dev->iptcl.y[k] = (float)pos[1] * Xscale;
		dev->iptcl.z[k] = (float)pos[2] * Xscale;
#else
		dev->iptcl.x[k] = (float)pos[0];
		dev->iptcl.y[k] = (float)pos[1];
		dev->iptcl.z[k] = (float)pos[2];
		dev->iptcl.eps2[k] = soft2;
#endif
	}
}

/* Single-device convenience wrapper: loads the i-particle batch into device 0. */
void g5_set_xi(int ni, double (*xi)[3]){
	const int dev0 = 0;
	g5_set_xiMC(dev0, ni, xi);
}

void g5_set_xmjMC(int devid, int adr, int nj, double (*xj)[3], double *mj){
	int j;
	struct Ptcl_Mem *pm = ptcl_mem + devid;

	for(j=adr; j<adr+nj; j++){
#if __GNUC__ ==  4
		v2df pd0 = {xj[j][0], xj[j][2]};
		v2df pd1 = {xj[j][1], mj[j]   };
#else
		v2df pd0, pd1;
		V2DF_GATHER(pd0, xj[j],   xj[j]+2);
		V2DF_GATHER(pd1, xj[j]+1, mj+j);
#endif
		v4sf ps0, ps1;
		ps0 = __builtin_ia32_cvtpd2ps(pd0);
		ps1 = __builtin_ia32_cvtpd2ps(pd1);
		ps0 = __builtin_ia32_unpcklps(ps0, ps1);
#ifdef CUTOFF_FORCE
		*(v4sf *)(pm->jptcl+j) = ps0 * XMscale;
#else
		*(v4sf *)(pm->jptcl+j) = ps0;
#endif
	}
}

/*
 * Single-device convenience wrapper: stores j-particles into device 0.
 * See g5_set_xmjMC() for the argument conventions.
 */
void g5_set_xmj(int adr, int nj, double (*xj)[3], double *mj){
	g5_set_xmjMC(0, adr, nj, xj, mj);
}

/*
 * Run the force kernel for device devid on its currently loaded i-particles
 * and the first Nbody j-particles, writing results to the device's fout.
 *
 * The CUTOFF_FORCE build calls gravity_kernel() with Force_table,
 * R2cut_xscale2 and Ascale; the default build calls GravityKernel().
 * Both kernels are defined outside this file.
 */
void g5_runMC(int devid){
	struct Ptcl_Mem *dev = &ptcl_mem[devid];
#ifdef CUTOFF_FORCE
	void gravity_kernel(pIpdata, pJpdata, pFodata, int, float (*)[2], v4sf, v4sf);
	gravity_kernel(&dev->iptcl, dev->jptcl, &dev->fout, dev->Nbody,
			Force_table, R2cut_xscale2, Ascale);
#else
	void GravityKernel(pIpdata, pFodata, pJpdata, int);
	GravityKernel(&dev->iptcl, &dev->fout, dev->jptcl, dev->Nbody);
#endif
}

/*
 * Same operation as g5_set_xmjMC(), with the mass and position arguments in
 * the opposite order (mj before xj).
 */
void g5_set_jpMC(int devid, int adr, int nj, double *mj, double (*xj)[3]){
	g5_set_xmjMC(devid, adr, nj, xj, mj);
}

/*
 * Single-device form of g5_set_jpMC(): stores j-particles into device 0,
 * taking the mass array before the position array.
 */
void g5_set_jp(int adr, int nj, double *mj, double (*xj)[3]){
	g5_set_xmjMC(0, adr, nj, xj, mj);
}

/* Single-device convenience wrapper: runs the force kernel on device 0. */
void g5_run(void){
	const int dev0 = 0;
	g5_runMC(dev0);
}

/*
 * Copy the results of the last kernel run on device devid to the caller.
 *
 * a[k] receives the acceleration components and pot[k] the potential of the
 * k-th i-particle, for k < ni.
 *
 *  - CUTOFF_FORCE build: accelerations are copied unchanged and pot[k] is
 *    set to 0.0.
 *  - Default build: accelerations are multiplied by Acc_correct and
 *    potentials by Pot_correct (in float) before widening to double.  When
 *    __GNUC__ == 4 this is done with vector operations, and results are
 *    stored only if 1 <= ni <= 4; any other ni stores nothing.
 */
void g5_get_forceMC(int devid, int ni, double (*a)[3], double *pot){
	struct Ptcl_Mem *dev = &ptcl_mem[devid];
#ifdef CUTOFF_FORCE
	int k;
	for(k=0; k<ni; k++){
		double *acc = a[k];
		acc[0] = (double)dev->fout.ax[k];
		acc[1] = (double)dev->fout.ay[k];
		acc[2] = (double)dev->fout.az[k];
		pot[k] = 0.0;
	}
#else
#if __GNUC__ == 4
	v4sf ax = *(v4sf *)(dev->fout.ax) * Acc_correctV;
	v4sf ay = *(v4sf *)(dev->fout.ay) * Acc_correctV;
	v4sf az = *(v4sf *)(dev->fout.az) * Acc_correctV;
	v4sf phi = *(v4sf *)(dev->fout.phi) * Pot_correctV;
	v4sf f0, f1, f2, f3;
	// Turn the four per-component vectors into one {ax, ay, az, phi} vector per particle.
	v4sf_transpose(&f0, &f1, &f2, &f3, ax, ay, az, phi);
	if(ni >= 1 && ni <= 4){
		{
			v4sf_store_dp(f0, &a[0][0], &a[0][1], &a[0][2], &pot[0]);
		}
		if(ni >= 2){
			v4sf_store_dp(f1, &a[1][0], &a[1][1], &a[1][2], &pot[1]);
		}
		if(ni >= 3){
			v4sf_store_dp(f2, &a[2][0], &a[2][1], &a[2][2], &pot[2]);
		}
		if(ni == 4){
			v4sf_store_dp(f3, &a[3][0], &a[3][1], &a[3][2], &pot[3]);
		}
	}
#else
	int k;
	for(k=0; k<ni; k++){
		double *acc = a[k];
		acc[0] = (double)(dev->fout.ax[k] * Acc_correct);
		acc[1] = (double)(dev->fout.ay[k] * Acc_correct);
		acc[2] = (double)(dev->fout.az[k] * Acc_correct);
		pot[k] = (double)(dev->fout.phi[k] * Pot_correct);
	}
#endif
#endif
}

/* Single-device convenience wrapper: fetches the results of device 0. */
void g5_get_force(int ni, double (*a)[3], double *pot){
	const int dev0 = 0;
	g5_get_forceMC(dev0, ni, a, pot);
}

/*
 * Compute accelerations a[] and potentials p[] for ni positions x[] on device
 * devid, against the j-particles already loaded on that device.
 *
 * The positions are processed in consecutive batches of at most
 * g5_get_number_of_pipelines() particles: load, run, read back.
 */
void g5_calculate_force_on_xMC(int devid, double (*x)[3], double (*a)[3], double *p, int ni)
{
	const int npipe = g5_get_number_of_pipelines();
	int first = 0;

	while(first < ni){
		const int left  = ni - first;
		const int batch = (left > npipe) ? npipe : left;

		g5_set_xiMC(devid, batch, x + first);
		g5_runMC(devid);
		g5_get_forceMC(devid, batch, a + first, p + first);

		first += npipe;
	}
}

#ifdef _OPENMP

#include <omp.h>
void g5_calculate_force_on_x(double (*x)[3], double (*a)[3], double *p, int nitot)
{
	int off;
	const int np = g5_get_number_of_pipelines();
#pragma omp parallel for
	for(off=0; off<nitot; off+=np) {
		int tid = omp_get_thread_num();
		int ni = np < nitot-off ? np : nitot-off;
		g5_set_xiMC(tid, ni, x+off);
		{
			void GravityKernel(pIpdata, pFodata, pJpdata, int);
			pIpdata ip = &ptcl_mem[tid].iptcl;
			pFodata fo = &ptcl_mem[tid].fout;
			pJpdata jp = ptcl_mem[0].jptcl;
			int nbody  = ptcl_mem[0].Nbody;
			GravityKernel(ip, fo, jp, nbody);
		}
		g5_get_forceMC(tid, ni, a+off, p+off);
   }
}

#else

/*
 * Serial version: computes accelerations a[] and potentials p[] for ni
 * positions x[] by delegating to g5_calculate_force_on_xMC() on device 0.
 */
void g5_calculate_force_on_x(double (*x)[3], double (*a)[3], double *p, int ni)
{
	const int dev0 = 0;
	g5_calculate_force_on_xMC(dev0, x, a, p, ni);
}

#endif
