#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "hibdrv.h"
#include "hibutil.h"
#include "g5util.h"

/* Runs one force calculation for n particles on card `devid`; defined below. */
void force_grape(int devid, double (*x)[3], double *m, double eps2,
		 double (*a)[3], double *pot, int n);

/* Word-by-word copies into rbuf and out of wbuf; defined at the end of the file. */
void copy_to_buf(unsigned *srcbuf, int nword);
void copy_from_buf(unsigned *dstbuf, int nword);


/* Outgoing buffer: filled by copy_to_buf() and handed to hib_sendMC(). */
static unsigned int *rbuf;
/* Incoming buffer: handed to hib_recvMC() and read back by copy_from_buf(). */
static unsigned int *wbuf;

/* Allocated in force_grape() with hib->r->piowbuf_bytes bytes. */
static unsigned int *piowbuf = NULL; /* PIO write buffer */

/*
 * ONEHALF is the offset added before the double -> unsigned int casts of
 * eps2 in force_grape().
 * NOTE(review): presumably 0.0 under ICC_RCD because that compiler switch
 * already rounds float-to-int conversions to nearest -- confirm.
 */
#ifdef ICC_RCD
#define ONEHALF (0.0) // Intel CC with -rcd switch
#else
#define ONEHALF (0.5) // standard C
#endif

/* Force NPIPES to 1 for this file, overriding any earlier definition. */
#ifdef NPIPES
#undef NPIPES
#endif
#define NPIPES (1)
/* Number of particle slots in the arrays declared by main(). */
#define NMAX (10000)

/*
 * Test driver.
 *
 * Usage: <program> <card ID>
 *
 * Places two unit-mass particles at (1,1,1) and (-1,-1,-1), calls
 * force_grape() on the card given on the command line with eps2 = 0 and
 * prints the resulting accelerations to stdout.
 *
 * Exits with status 1 when the card ID is missing or is not a decimal
 * integer that fits in an int.
 */
int
main(int argc, char **argv)
{
  /* The NMAX-sized work arrays are kept in static storage so that they
     do not consume stack space. */
  static double x[NMAX][3];
  static double m[NMAX];
  static double a[NMAX][3];
  static double p[NMAX];
  double eps2;
  char *endp;
  long card;
  int devid;
  int n;
  int i;

  if (argc < 2) {
      fprintf(stderr, "usage: %s <card ID>\n", argv[0]);
      exit(1);
  }

  /* strtol() instead of atoi(): atoi() returns 0 for garbage input,
     which would silently select card 0. */
  errno = 0;
  card = strtol(argv[1], &endp, 10);
  if (endp == argv[1] || *endp != '\0' || errno == ERANGE ||
      card < INT_MIN || card > INT_MAX) {
      fprintf(stderr, "%s: invalid card ID '%s'\n", argv[0], argv[1]);
      exit(1);
  }
  devid = (int)card;
  fprintf(stderr, "force_grape() uses g5[%d].\n", devid);

  n = 2;
#if 1
  x[0][0] = 1.0;
  x[0][1] = 1.0;
  x[0][2] = 1.0;

  x[1][0] = -1.0;
  x[1][1] = -1.0;
  x[1][2] = -1.0;
#else
  /* Alternative four-particle layout; n above is still 2. */
  x[0][0] = 1.0;
  x[0][1] = 0.0;
  x[0][2] = 0.0;

  x[1][0] = 0.0;
  x[1][1] = 0.0;
  x[1][2] = 0.0;

  x[2][0] = 1.0;
  x[2][1] = 0.0;
  x[2][2] = 0.0;

  x[3][0] = 1.0;
  x[3][1] = 0.0;
  x[3][2] = 0.0;
#endif

  // eps2 = pow(1.0/16.0, 2.0);
  eps2 = 0.0;

  m[0] = 1.0;
  m[1] = 1.0;
  m[2] = 1.0;
  m[3] = 1.0;

  /* Recognisable non-zero presets -- presumably so that an acceleration
     force_grape() failed to overwrite stands out in the printout. */
  a[0][0] = 111.0;
  a[0][1] = 222.0;
  a[0][2] = 333.0;
  a[1][0] = 444.0;
  a[1][1] = 555.0;
  a[1][2] = 666.0;

  force_grape(devid, x, m, eps2, a, p, n);

  printf("\n");
  for (i = 0; i < n; i++) {
    printf("a[%d]: (%5.3f %5.3f %5.3f )\n",
	   i, a[i][0], a[i][1], a[i][2]);
  }
  printf("\n");

  return 0;
}

/*
 * Pack a 32-bit two's-complement fixed-point word into the
 * sign/exponent/mantissa word assembled at the end of this function:
 *
 *   bit  16       1 when bit 31 of fixdata is set (negative value)
 *   bits 9 and up e = index of the highest set bit of |fixdata|
 *   bits 8..0     the nine bits that follow that leading one
 *                 (zero-padded when |fixdata| has fewer bits)
 *
 * Returns 0 when fixdata is 0.  Every call writes a "####" trace line
 * to stderr.
 *
 * All arithmetic is done on unsigned values, so the input 0x80000000
 * (whose magnitude does not fit in an int) is handled as well.
 */
static unsigned int
fixtofp(unsigned int fixdata)
{
    unsigned int s, e, m, rest;
    unsigned int fpdata;

    if (fixdata == 0) {
	fprintf(stderr, "#### fixdata: 0x%x\n", fixdata);
	return 0;
    }

    s = fixdata >> 31;
    /* Two's-complement magnitude without going through abs(int). */
    m = s ? (0u - fixdata) : fixdata;

    /* e = floor(log2(m)); m is non-zero here. */
    e = 0;
    for (rest = m >> 1; rest != 0; rest >>= 1) {
	e++;
    }

    /* Move the leading one to bit 9, then drop it (hidden bit). */
    if (e > 9) {
        m = m >> (e-9);
    }
    else {
        m = m << (9-e);
    }
    m &= 0x1ff;

    fpdata = (s << 16) | (e <<9) | m;

    fprintf(stderr, "#### fpdata:0x%x    e:0x%x    m:0x%x\n", fpdata, e, m);

    return fpdata;
}

/*
 * Run one force calculation for n particles on GRAPE card `devid`.
 *
 *   devid  card index passed to the hib_*MC() calls
 *   x      particle positions, n rows of 3 coordinates
 *   m      particle masses, n entries (zero, positive or negative)
 *   eps2   softening parameter squared
 *   a      output: a[i][0..2] receives the scaled 64-bit sums read back
 *          for particle i
 *   pot    not read or written by this routine
 *   n      number of particles; must fit the on-stack staging arrays
 *          (see the capacity check below), otherwise the program exits
 *
 * Sequence: open the card, select the PIO-write send function, read the
 * board-info word, send the register set-up packets, the j-particles,
 * the i-particles and the run command, receive three 64-bit words per
 * i-particle, scale them by `ascale`, and close the card.
 *
 * Positions are offset by -xoffset and scaled by xscale before being
 * truncated to 32 bits.
 * NOTE(review): this presumably requires every coordinate to lie in
 * [xoffset, xoffset + xsize) = [-32, 32) -- confirm against the hardware
 * documentation.
 *
 * Prints trace output to stderr and stdout; calls exit(1) on failure.
 */
void
force_grape(int devid, double (*x)[3], double *m, double eps2,
	    double (*a)[3], double *pot, int n)
{
    /* Capacities, in 32-bit words, of the on-stack staging arrays. */
    enum { IP_WORDS = 2048, JP_WORDS = 8192*8, FO_WORDS = 2048 };

    Hib *hib;
    int i, ii, nn, j, jj, jjj, k;
    int npipe;
    int ni,nd,nword;
    int nmax, fomax;
    unsigned int xj, yj, zj, mj;
    unsigned int xi, yi, zi;
    unsigned int ieps2;
    unsigned int ipdata[IP_WORDS];
    unsigned int jpdata[JP_WORDS];
    unsigned int fodata[FO_WORDS];
    long long int sx, sy, sz;
    double xoffset, xsize, xscale;
    double mmin, mscale;
    double ascale;
    UINT32 binfo;

    (void)pot;

    xoffset = -32.0;
    xsize = 64.0;

#if 1 // GRAPE-5 compatible scaling
    xscale = pow(2.0,32.0) / xsize;
    mmin = 1.0 / 1024.0;
    mscale = pow(2.0,96.0) / mmin;
    ascale = (-xscale * xscale)/mscale;
    ascale *= pow(2.0, 23.0);
#else // GRAPE-3 compatible scaling
    xscale = pow(2.0,20.0) / xsize;
    mmin = 1.0 / 1024.0;
    mscale = pow(2.0,60.0) / mmin;
    ascale = (-xscale * xscale)/mscale;
#endif

    npipe = ni = NPIPES;

    /*
     * Capacity check, done before the hardware is touched.
     * i- and j-particle packets need 2 header words + 4 words per
     * particle; the result needs 2*3 words per particle, with the
     * particle count rounded up to a multiple of npipe.
     */
    nmax = (IP_WORDS - 2) / 4;
    if ((JP_WORDS - 2) / 4 < nmax) {
	nmax = (JP_WORDS - 2) / 4;
    }
    fomax = (FO_WORDS / (2 * 3)) / npipe * npipe;
    if (fomax < nmax) {
	nmax = fomax;
    }
    if (n < 0 || n > nmax) {
	fprintf(stderr, "force_grape: n=%d is out of range (0..%d).\n",
		n, nmax);
	exit(1);
    }

    /* Trace the first two particles (fewer when n < 2). */
    for (k = 0; k < 2 && k < n; k++) {
	fprintf(stderr, "x[%d]: %5.3f %5.3f %5.3f\n",
		k, x[k][0], x[k][1], x[k][2]);
    }
    for (k = 0; k < 2 && k < n; k++) {
	fprintf(stderr, "m[%d]: %5.3f\n", k, m[k]);
    }
    fprintf(stderr, "eps2: %5.3f\n", eps2);
    fprintf(stderr, "n: %d\n", n);

    fprintf(stderr, "npipe: %d\n", npipe);

    hib = hib_openMC(devid);
    if (!hib) {
	/* Defensive: hib is dereferenced immediately below. */
	fprintf(stderr, "force_grape: hib_openMC(%d) failed.\n", devid);
	exit(1);
    }
#if 1 // PIO write
    hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
    free(piowbuf); /* release the buffer of a previous call, if any */
    piowbuf = malloc(hib->r->piowbuf_bytes);
    if (!piowbuf) {
	fprintf(stderr, "force_grape: malloc failed.\n");
        exit(1);
    }
    rbuf = piowbuf;
    wbuf = (unsigned int *)hib->dmaw_buf;
#else
    rbuf = (unsigned int *)hib->dmar_buf;
    wbuf = (unsigned int *)hib->dmaw_buf;
#endif
    /* NOTE(review): the sizes of rbuf and wbuf come from the hib library
       and are not checked against the packet sizes used below. */

    binfo = hib_mem_readMC(devid, hib->r->boardinfo);

    /** IP reg **/
    nd = 2;
    nword = 2;
    ipdata[0] = 0xc0000000;
    ipdata[1] = (nd<<16) | ni;

    copy_to_buf(ipdata, nword);
    fprintf(stderr, "######## will IP reg n:%d\n", nword);
    hib_sendMC(devid, nword/2, (UINT64*)rbuf);
    fprintf(stderr, "######## done IP reg\n");

    /** FO reg **/
    nd = 3;
    nword = 2;
    ipdata[0] = 0xe0000000;
    ipdata[1] = (nd<<16) | ni;

    copy_to_buf(ipdata, nword);
    fprintf(stderr, "######## will FO reg\n");
    hib_sendMC(devid, nword/2, (UINT64*)rbuf);
    fprintf(stderr, "######## done FO reg\n");

    if ((binfo>>15) & 0x1) { // G5 supports cutoff

	// set large enough eta^2
	nword = 2;
	ipdata[0] = 0xd0000000;
	ipdata[1] = (1<<15) | 0x3fff; // = 0xbfff (nz:1 exp:011 1111 frac:1111 1111)
	copy_to_buf(ipdata, nword);
	hib_sendMC(devid, nword/2, (UINT64*)rbuf);
    }

    /** j particle **/
    /* Words 0 and 1 are the packet header, filled in after the loop. */
    jj = 2;
    jjj = 0;

    for (j=0; j<n; j++) {
	xj = (unsigned int)((x[j][0] - xoffset) * xscale + 0.5) & 0xffffffff;
	yj = (unsigned int)((x[j][1] - xoffset) * xscale + 0.5) & 0xffffffff;
	zj = (unsigned int)((x[j][2] - xoffset) * xscale + 0.5) & 0xffffffff;
	/* Mass: 256*log2(|m|*mscale) in 15 bits, bit 15 set for non-zero,
	   bit 16 additionally set for a negative mass. */
	if(m[j] == 0.0){
	    mj = 0;
	}else if(m[j] > 0.0){
	    mj = ((unsigned int)(pow(2.0,8.0) * log(+m[j] * mscale) / log(2.0) + 0.5) & 0x7fff) | 0x8000;
	}else{
	    mj = ((unsigned int)(pow(2.0,8.0) * log(-m[j] * mscale) / log(2.0) + 0.5) & 0x7fff) | 0x18000;
	}
	jpdata[jj] = xj;
	jpdata[jj+1] = yj;
	jpdata[jj+2] = zj;
	jpdata[jj+3] = mj;
	jj += 4;
	jjj += 2;
    }
    jpdata[0] = 0x40000000;
    jpdata[1] = jjj;
    copy_to_buf(jpdata, jj);
    hib_sendMC(devid, jj/2, (UINT64*)rbuf);
    usleep(1000);

    /* i particle */
    nword = 2;
    for (i=0;i<n;i+=npipe) {
	if ((i+npipe)>n) {
	    nn = n - i;
	} else {
	    nn = npipe;
	}
	for (ii=0;ii<nn;ii++) {
	    xi = (unsigned int)((x[i+ii][0] - xoffset) * xscale + 0.5) & 0xffffffff;
	    yi = (unsigned int)((x[i+ii][1] - xoffset) * xscale + 0.5) & 0xffffffff;
	    zi = (unsigned int)((x[i+ii][2] - xoffset) * xscale + 0.5) & 0xffffffff;
	    if ((binfo>>14) & 0x1) { // eps2 in floating-point format
		/*
		 * NOTE(review): with the GRAPE-5 scaling xscale is 2^26,
		 * so eps2 * xscale^2 no longer fits in 32 bits once
		 * |eps2| >= 2^-20, and the cast is also undefined for
		 * negative eps2.  Left unchanged because the intended
		 * hardware format is not visible here.
		 */
		int ie;
		ie = (unsigned int)(eps2 * xscale * xscale + ONEHALF);

		if (eps2 == 0.0) {
		    ieps2 = 0;
		} else if (eps2 > 0.0) {
		    ieps2 = (fixtofp(ie) & 0xffff);
		} else {
		    ieps2 = ((fixtofp(-ie) & 0xffff) | 0x10000);
		}
	    }
	    else { // eps2 in logarithmic format
		if (eps2 == 0.0) {
		    ieps2 = 0;
		} else if(eps2 > 0.0) {
		    ieps2 = ((unsigned int)(256.0 * log(+eps2 * xscale * xscale) / log(2.0) + ONEHALF) &
			     0x7fff) | 0x8000;
		} else {
		    ieps2 = ((unsigned int)(256.0 * log(-eps2 * xscale * xscale) / log(2.0) + ONEHALF) &
			     0x7fff) | 0x18000;
		}
	    }

	    ipdata[nword++] = xi;
	    ipdata[nword++] = yi;
	    ipdata[nword++] = zi;
	    ipdata[nword++] = ieps2;

	    printf("xi    0x%08x\n", xi);
	    printf("yi    0x%08x\n", yi);
	    printf("zi    0x%08x\n", zi);
	    printf("ieps2 0x%08x\n\n", ieps2);
	}
    }
    ipdata[0] = 0;
    ipdata[1] = 2 * n;
    copy_to_buf(ipdata, nword);
    usleep(1000);
    hib_sendMC(devid, nword/2, (UINT64*)rbuf);
    usleep(1000);

    /* set N and run */
    nd = 2;
    nword = 2;
    ipdata[0] = 0x80000000;
    ipdata[1] = ((nd*ni) << 16) | n;
    copy_to_buf(ipdata, nword);
    hib_sendMC(devid, nword/2, (UINT64*)rbuf);
    usleep(1000);

    /* fodata */
    nd = 3;
    nword = ((n-1)/npipe + 1) * npipe; /* ni must be rounded up by npipe */
    nword *= 2 * nd;
    fprintf(stderr, "nword: %d (%ld byte)\n", nword, (long)(nword*sizeof(int)));

    /* Fill the head of wbuf with a known pattern so the traces before
       and after the receive can be compared. */
    for (k = 0; k < 10; k++) {
	wbuf[k] = k;
    }
    for (k = 0; k < 10; k++) {
	fprintf(stderr, "wbuf[%d] before dma: %08x\n", k, wbuf[k]);
    }
    if (hib_recvMC(devid, nword/2, (UINT64*)wbuf) < 0) {
	fprintf(stderr, "hib_recvMC: not enough data\n");
	exit(1);
    }
    for (k = 0; k < 10; k++) {
	fprintf(stderr, "wbuf[%d] after dma: %08x\n", k, wbuf[k]);
    }
    copy_from_buf(fodata, nword);

    for (i=0; i<n; i+=npipe) {
	if ((i+npipe) > n) {
	    nn = n - i;
	} else {
	    nn = npipe;
	}
	/* Trace dump of the whole result buffer, once per pass. */
	for (k = 0; k < nword; k++) {
	    printf("fodata[%d]: 0x%08x\n", k, fodata[k]);
	}
	for (ii=0; ii<nn; ii++) {
	    /* Six words per particle: (low, high) pairs for x, y, z. */
	    const unsigned int *fo = &fodata[2*nd*(i+ii)];
	    unsigned long long ux, uy, uz;

	    /* Assemble in unsigned arithmetic; shifting a signed value
	       into the sign bit is undefined. */
	    ux = ((unsigned long long)fo[1] << 32) | fo[0];
	    uy = ((unsigned long long)fo[3] << 32) | fo[2];
	    uz = ((unsigned long long)fo[5] << 32) | fo[4];

	    fprintf(stderr, "sx: 0x%016llx\nsy: 0x%016llx\nsz: 0x%016llx\n", ux, uy, uz);

	    /* Reinterpret as signed 64-bit sums before scaling. */
	    sx = (long long int)ux;
	    sy = (long long int)uy;
	    sz = (long long int)uz;
	    a[i+ii][0] = (double)sx * ascale;
	    a[i+ii][1] = (double)sy * ascale;
	    a[i+ii][2] = (double)sz * ascale;
	}

    }
    hib_closeMC(devid);

    /* Release the PIO buffer and clear the pointers that referred to it. */
    if (rbuf == piowbuf) {
	rbuf = NULL;
    }
    free(piowbuf);
    piowbuf = NULL;
}

/*
 * Copy nword words from srcbuf into the outgoing buffer rbuf, in
 * ascending order, then sleep for 100 ms.
 */
void
copy_to_buf(unsigned *srcbuf, int nword)
{
    unsigned int *out = rbuf;
    unsigned *in = srcbuf;
    int remaining = nword;

    while (remaining-- > 0) {
	*out++ = *in++;
    }

    usleep(100000);
}

/*
 * Copy the first nword words of the incoming buffer wbuf into dstbuf,
 * in ascending order.
 */
void
copy_from_buf(unsigned *dstbuf, int nword)
{
    int w = 0;

    while (w < nword) {
	dstbuf[w] = wbuf[w];
	w++;
    }
}
