#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <g5nbutil.h>
#include "direct.h"

#define NBBUFSIZE (288)
/* The maximum number of neighbor particles G5PIPE can store for a
 * single particle in question. For practical use, the number should
 * not be hardcoded like this. The number is subject to change without
 * notice, and thus should be obtained at runtime using
 * g5_get_nbmemsize(). check_buffersize() aborts the program when the
 * runtime value exceeds this constant.
 */

#define NPIPE (256)
/* The maximum number of virtual pipelines this code can handle. For
 * practical use, the number of pipelines should not be hardcoded like
 * this. The number is subject to change without notice, and thus
 * should be obtained at runtime using g5_get_number_of_pipelines().
 * check_buffersize() aborts the program when the runtime value exceeds
 * this constant.
 */

/* Diagnostics switches, evaluated by the preprocessor:
 *   WARNING_LEVEL > 1 enables the extra messages guarded by
 *                     '#if (WARNING_LEVEL > 1)'.
 *   STRICT_CHECK != 0 enables the per-neighbor distance check and the
 *                     host-side comparison for the test particle.
 */
#define WARNING_LEVEL 1   // set 1 to be quiet. set 2 to be verbose.
#define STRICT_CHECK  0   // set 1 to perform complete (but slow) check.

/* Multiplicative tolerance applied to the neighbor radius when a
 * distance computed on the host is compared against it.
 */
static const double Margin = 1.005; // 0.5% margin for distance calculation error of G5.

/* Capacity of the neighbor list built on the host for comparison with
 * the list returned by G5 (see search_neighbor_on_host()).
 */
#define NBBUFSIZE_HOST (NBBUFSIZE*10)

/* Forward declarations of the helpers defined later in this file. */

/* host-side geometry and reference neighbor search */
double calc_distance(double x0[3], double x1[3], double eps);
void   search_neighbor_on_host(double (*xj)[3], double eps, double nbrad, int n, int tid, int *ntnb, int *tnblist);
/* compile-time buffer sizes vs. runtime G5 configuration */
void   check_buffersize(int n);
/* neighbor list output */
void   fprintnblist(int nnb, int *nblist, FILE *stream);
void   fprintnblist_fingerprint(int nnb, int *nblist, FILE *stream);
/* neighbor list retrieval and consistency checks */
void   nb_handling_example(int n, int ni, int i_offset, int j_offset, double (*xj)[3], double eps, double nbrad,
                           int tid, FILE *dumpfp, int *ntnbp, int *tnblist,
                           int *nnbsum, int *noverflowsum, int *nselfsum);
void   nb_handling_example_extra(int n, double (*xj)[3], double eps, double nbrad, int tid,
                                 int nnb, int *nblist, int is_overflown);
int    check_distance_criterion(int nsuspicious, int *slist, int tid, double (*xj)[3], double eps, double rad);
int    check_list0_is_covered_by_list1(int n0, int n1, int *list0, int *list1, int *difflist);
/* qsort() comparison function for int, ascending */
int    islarge(const void *a, const void *b);

/*
 * Computes the gravitational acceleration and potential of all n
 * particles on G5 by direct summation, and gathers neighbor-list
 * statistics along the way.
 *
 * The j-particles are sent to G5 in chunks of at most
 * g5_get_jmemsize() particles. For each chunk the i-particles are
 * processed g5_get_number_of_pipelines() at a time, and the partial
 * accelerations and potentials obtained for the chunk are summed into
 * acc[] and pot[].
 *
 *   mj, xj         masses and positions of the particles.
 *   vj             velocities (not used by this function).
 *   eps            softening length handed to G5.
 *   nbrad          neighbor radius, forwarded to nb_handling_example().
 *   acc, pot       (out) acceleration and potential of each particle.
 *   n              number of particles; check_buffersize() exits the
 *                  program when it exceeds NMAX.
 *   tid            index of the test particle whose neighbor list is
 *                  handed back through ntnbp / tnblist.
 *   ntnbp, tnblist (out) filled in by nb_handling_example().
 *   dumpfp         when non-NULL, neighbor lists are dumped there.
 *   nnbsump        (out) total number of neighbors found.
 *   noverflowsump  (out) total number of neighbor lists overflown.
 *
 * Exits with status 1 when a particle shows up more than once in its
 * own neighbor list, or is missing from it although none of its lists
 * has overflown.
 */
void
calc_gravity5(double *mj, double (*xj)[3], double (*vj)[3],
	      double eps, double nbrad, double (*acc)[3], double *pot, int n,
	      int tid, int *ntnbp, int *tnblist, FILE *dumpfp, int *nnbsump, int *noverflowsump)
{
    int i, j, k;
    int npipe = g5_get_number_of_pipelines();
    int jmemsize = g5_get_jmemsize();
    static double atmp[NMAX][3]; // partial acceleration from the current j chunk.
    static double ptmp[NMAX];    // partial potential from the current j chunk.
    static int nnb[NMAX];        // total number of neighbors found.
    static int noverflow[NMAX];  // total number of lists overflown.
    static int nself[NMAX];      // total number of neighbors which has the
                                 // same index as the particle in
                                 // question. the value should be 1.

    (void)vj; // kept in the signature for the callers; not needed here.

    check_buffersize(n);

    for (i = 0; i < n; i++) {
        for (k = 0; k < 3; k++) {
            acc[i][k] = 0.0;
        }
        pot[i] = 0.0;
        nnb[i] = 0;
        noverflow[i] = 0;
        nself[i] = 0;
    }

    for (j = 0; j < n; j += jmemsize) {
        // the last chunk may be shorter than jmemsize.
        int nj = (j + jmemsize > n) ? n - j : jmemsize;

        g5_set_jp(0, nj, mj + j, xj + j);
        g5_set_eps_to_all(eps);
        g5_set_n(nj);

        for (i = 0; i < n; i += npipe) {
            // the last group may be shorter than npipe.
            int ni = (i + npipe > n) ? n - i : npipe;

            g5_set_xi(ni, xj + i);
            g5_run();
            g5_get_force(ni, atmp + i, ptmp + i);

            // here, do whatever you want to do using the neighbor lists.
            nb_handling_example(n, ni, i, j, xj, eps, nbrad, tid, dumpfp,
                                ntnbp, tnblist, nnb + i, noverflow + i, nself + i);
        } // end of i loop

        // Both the force and the potential returned above cover the
        // current j chunk only, so both have to be accumulated.
        for (i = 0; i < n; i++) {
            for (k = 0; k < 3; k++) {
                acc[i][k] += atmp[i][k];
            }
            pot[i] += ptmp[i];
        }
    } // end of j loop

    // check if a particle appears exactly one time in the NB list of itself.
    for (i = 0; i < n; i++) {
        if (nself[i] > 1) {
            fprintf(stderr, "NG: self found %d times\n", nself[i]);
            exit(1);
        }
        if (nself[i] == 0 && noverflow[i] == 0) {
            fprintf(stderr,
                    "NG: paritcle %d does not apperar in the NB list of itself,\n"
                    "though the list is not overflown.\n", i);
            fprintf(stderr, "  %d neighbors found for particle %d\n", nnb[i], i);
            exit(1);
        }
    } // end of i loop

    *nnbsump = 0;
    *noverflowsump = 0;
    for (i = 0; i < n; i++) {
        *nnbsump += nnb[i];
        *noverflowsump += noverflow[i];
    }
}

/*
 * Driver of the neighbor-list example.
 *
 * usage: <prog> <infile> <outfile> <endtime> [nb_rad]
 *
 * Reads the initial particles from <infile>, then advances them with a
 * fixed time step dt = 0.01 for step = 1 .. nstep-1, where
 * nstep = endtime / dt. Every 'interval' steps the energy, the speed
 * and the neighbor statistics are printed to stdout. During the last
 * step the neighbor lists are dumped to "<outfile>.nb". Finally the
 * particles (n < 2048) or a fingerprint of them (otherwise) are
 * written to <outfile>.
 *
 * Returns 0 on success and 1 when closing the neighbor dump fails;
 * exits with status 2 on usage or file errors.
 */
int
main(int argc, char **argv)
{
    static double mj[NMAX], xj[NMAX][3], vj[NMAX][3];
    static double a[NMAX][3], p[NMAX];
    double xmax, xmin, mmin;
    double time, dt, endt;
    double eps, nbrad;
    double e, e0, ke, pe;
    double dinterval;
    int n, i;
    int nstep, step;
    int interval;
    int len;
    int status = 0;
    double peak;
    double sustained=0.0;
    double lt = 0.0, st = 0.0;
    int nnbsum, noverflowsum;
    int tid = 0;                  // test particle id
    static int ntnb;               // number of neighbors of tid
    static int tnblist[NBBUFSIZE]; // neighbor of tid
    static int cid[NMAX];
    FILE *dump = NULL, *nboutfp;
    char nboutfile[128];

    if (argc < 4) {
        fprintf(stderr, "usage: %s <infile> <outfile> <endtime> [nb_rad]\n",  argv[0]);
        exit(2);
    }

    srand48(1234);
    xmax = 64.0;
    xmin = -64.0;

    endt = atof(argv[3]);
    eps = 0.02;
    nbrad = 0.2;
    if (argc > 4) {
        nbrad = atof(argv[4]);
    }
    fprintf(stderr, "neighbor radius: %8.5f\n", nbrad);

    dt = 0.01;
    time = 0.0;
    nstep = endt/dt;
    readnbody(&n, mj, xj, vj, argv[1]);
    if (n < 1) {
        // everything below divides by n.
        fprintf(stderr, "%s: no particle read from %s\n", argv[0], argv[1]);
        exit(2);
    }

    // the output name comes from the command line; never overrun nboutfile.
    len = snprintf(nboutfile, sizeof nboutfile, "%s.nb", argv[2]);
    if (len < 0 || (size_t)len >= sizeof nboutfile) {
        fprintf(stderr, "%s: outfile name too long: %s\n", argv[0], argv[2]);
        exit(2);
    }
    fprintf(stderr, "n: %d  outfile: %s  nboutfile: %s  endtime: %f\n", n, argv[2], nboutfile, endt);
    nboutfp = fopen(nboutfile, "w");
    if (!nboutfp) {
        perror("nboutfile:");
        exit(2);
    }

    // Reporting interval: 500 steps at n = 10000, scaled by 1/n^2, but
    // at most nstep / 10. The value is clamped while still a double,
    // because for small n it does not fit into an int, and it is kept
    // at 1 or more because it is used as a divisor below.
    dinterval = 500 * (10000.0/n) * (10000.0/n);
    interval = nstep / 10;
    if (dinterval < interval) {
        interval = (int)dinterval;
    }
    if (interval < 1) {
        interval = 1;
    }
    fprintf(stderr, "interval: %d\n", interval);

    mmin = mj[0];
    g5_open();
    g5_set_range(xmin, xmax, mmin);

#if 0 // you can reduce neighbor particle mem size like this:
    int nbmemsize = g5_set_nbmemsize(50);
    fprintf(stderr, "set nbmemsize to: %d\n", nbmemsize);
#endif

    g5_set_h_to_all(nbrad);

    get_cputime(&lt,&st);
    calc_gravity5(mj, xj, vj, eps, nbrad, a, p, n,
                  tid, &ntnb, tnblist, dump, &nnbsum, &noverflowsum);
    energy(mj, vj, p, n, &ke, &pe);
    e0 = ke+pe;
    printf("ke: %f\n", ke);
    fflush(stdout);

    for (step = 1; step < nstep; step++) {
        if (step + 1 == nstep) dump = nboutfp; // dump all neighbors to nboutfile.
        push_velocity(vj, a, 0.5*dt, n);
        push_position(xj, vj, a, dt, n);
        time = time + dt;
        calc_gravity5(mj, xj, vj, eps, nbrad, a, p, n,
                      tid, &ntnb, tnblist, dump, &nnbsum, &noverflowsum);

        push_velocity(vj, a, 0.5*dt, n);

#ifdef ANIM
        for (i = 0; i < n; i++) {// background particles are shown in yellow.
            cid[i] = 0;
        }
        for (i = 0; i < ntnb; i++) { // choose particles to be displayed in red.
            int id = tnblist[i];
            cid[id] = 1;
        }
        plot_star2(xj, n, time, 0.3, mj, cid);
#endif /* ANIM */
        if (interval > 10 && step % (interval/10) == 0) {
            fprintf(stderr, ".");
        }
        if (step % interval == 0) {
            energy(mj, vj, p, n, &ke, &pe);
            e = ke+pe;

            get_cputime(&lt,&st);
            printf("\ncputime: %e %e\n",lt,st);
            sustained = 38.0*((double)n)*((double)n)
                *interval/lt/1e9;
            peak = 38.0 * g5_get_number_of_real_pipelines()
                * g5_get_pcibus_freq() / 1000.0;
            printf("speed: %g Gflops (%4.1f %%)\n",
                   sustained, sustained / peak * 100.0);
            printf("step: %d time: %e\n", step, time);
            printf("    e: % 15.13E   de: % 15.13E\n", e, e-e0);
            printf("   ke: % 15.13E   pe: % 15.13E\n", ke, pe);
            printf("ke/pe: % 15.13E\n", ke/pe);
            printf("number of NBs per particle    total:%d  ave:%3.1f (%3.1f%%)\n",
                   nnbsum, (double)nnbsum / n, (double)nnbsum / n / n * 100.0);
            printf("number of NB lists overflown  total:%d  ave:%3.1f (%3.1f%%)\n",
                   noverflowsum, (double)noverflowsum / n , (double)noverflowsum / n / n * 100.0);
            printf("particle %d has %d neighbor.\n\n", tid, ntnb);
            fflush(stdout);
            get_cputime(&lt,&st);
        }
    }
    g5_close();
    // fclose() flushes the dump; a failure here means the file is incomplete.
    if (fclose(nboutfp) != 0) {
        perror("nboutfile:");
        status = 1;
    }

    if (n < 2048) {
        // dump all particles
        writenbody(n, mj, xj, vj, argv[2]);
    }
    else {
        // too many particles for full dump.
        // use check sum instead.
        writefingerprint(n, mj, xj, vj, argv[2]);
    }

    return status;
}

/***********************************
 * neighbor list handling example
 ***********************************/
/*
 * Retrieves the neighbor lists G5 produced for the i-particles
 * xj[i_offset .. i_offset+ni-1] against the j chunk that starts at
 * j_offset, checks them, and accumulates statistics.
 *
 * Indices stored in a list returned by G5 are relative to the current
 * j chunk; 'j_offset + index' is the index into xj[].
 *
 *   n             total number of particles.
 *   ni            number of i-particles processed by the last g5_run().
 *   i_offset      index of the first i-particle.
 *   j_offset      index of the first j-particle of the current chunk.
 *   xj, eps, nbrad  positions, softening and neighbor radius; used by
 *                 the STRICT_CHECK code only.
 *   tid           index of the test particle.
 *   dumpfp        when non-NULL, each list is sorted and dumped there
 *                 (in full for n < 2048, as a checksum otherwise).
 *   ntnbp, tnblist  (out) length and contents of the list of particle
 *                 'tid', as returned for the current j chunk.
 *   nnbsum, noverflowsum, nselfsum
 *                 (in/out) per-particle counters of the i-particles,
 *                 indexed by 0..ni-1: neighbors found, lists
 *                 overflown, and occurrences of the particle itself.
 *
 * Exits with status 1 when a consistency check fails.
 */
void
nb_handling_example(int n, int ni, int i_offset, int j_offset, double (*xj)[3], double eps, double nbrad,
		    int tid, FILE *dumpfp, int *ntnbp, int *tnblist,
		    int *nnbsum, int *noverflowsum, int *nselfsum)
{
    static int nnb[NPIPE];
    static int nblist[NPIPE][NBBUFSIZE];
    static int firstcall = 1;
    int jmemsize =  g5_get_jmemsize();
    int ii, id, gid, p, nnb_trimmed, are_overflown, is_overflown, self;
#if STRICT_CHECK
    int k;
    double x0[3], x1[3], r;
#endif

    are_overflown = g5_read_neighbor_list();
    /* Retrieves neighbors of xj[i_offset..i_offset+ni-1] from G5,
       and stores them into a library-internal buffer.
    */
    if (are_overflown && firstcall) {
        firstcall = 0;
        fprintf(stderr, "\n\n");
        fprintf(stderr, "=========================================================\n");
        fprintf(stderr, "==== Warning: one or more neighbor lists overflown.  ====\n");
        fprintf(stderr, "==== use g5_get_neighbor_list() to check the detail. ====\n");
        fprintf(stderr, "=========================================================\n");
        fprintf(stderr, "\n\n");
    }

    for (p = 0; p < ni; p++) {
        nnb[p] = g5_get_neighbor_list(p, nblist[p]);
        /* Neighbor of xj[i_offset+p] is obtained in
         * nblist[p][0..nnb[p]-1].  Its length is returned to
         * nnb[p]. nnb[p] == -len if the list is overflown, where len
         * is the number of valid neighbors stored in
         * nblist[p][0..len-1].
         */
    }

    /*
     * check if all neighbors of a particle that has index
     * 'i_offset+p' resides within the neighbor radius.
     * Only the ni lists fetched above are examined.
     */
    for (p = 0; p < ni; p++) {

        if (i_offset + p >= n) break;

        if (nnb[p] < 0) {         // 'p'-th list has overflown.
            is_overflown = 1;
            noverflowsum[p]++;
            nnb_trimmed = -nnb[p];
        }
        else {
            is_overflown = 0;
            nnb_trimmed = nnb[p];
        }
        nnbsum[p] += nnb_trimmed;

#if STRICT_CHECK
        for (k = 0; k < 3; k++) {
            x0[k] = xj[i_offset+p][k];
        }
#endif // STRICT_CHECK

        self = 0;
        for (ii = 0; ii < nnb_trimmed; ii++) { // loop over all neighbors.

            id = nblist[p][ii];           // index of a neighbor inside the j chunk,
            gid = j_offset + id;          // and its index into xj[].

#if STRICT_CHECK
            for (k = 0; k < 3; k++) {     // set its position to x1
                x1[k] = xj[gid][k];
            }

            /* check if the neighbor resides within neighbor radius
             * plus the calculation error margin 'Margin', which the
             * distance may suffer due to limitation of
             * calculation-accuracy on G5.
             */
            r = calc_distance(x0, x1, eps);
            if (r >= nbrad * Margin) {
                fprintf(stderr, "particle %d's neighbor %d is out of the neighbor radius.\n", i_offset + p, gid);
                fprintf(stderr, "neighbor radius: %f  distance: %f\n", nbrad, r);
                fprintf(stderr, "x[% 4d]: %+8.5e, %+8.5e, %+8.5e\n", i_offset + p, x0[0], x0[1], x0[2]);
                fprintf(stderr, "x[% 4d]: %+8.5e, %+8.5e, %+8.5e\n", gid,  x1[0], x1[1], x1[2]);
                fprintf(stderr, "%d neighbors found: ", nnb_trimmed);
                fprintnblist(nnb_trimmed, nblist[p], stderr);
                fprintf(stderr, "\n");
                fprintf(stderr, "i_offset: %d    j_offset: %d    p: %d\n", i_offset, j_offset, p);
                exit(1);
            }
#endif // STRICT_CHECK

            if (gid == i_offset + p) {
                self++;
            }
        } // end of ii loop

        nselfsum[p] += self;
        /* check if the particle 'i_offset+p' appears exactly one time in the NB list of itself */
        if (self > 1) {
            fprintf(stderr, "NG: self found %d times\n", self);
            exit(1);
        }
        /* With more than one j chunk the particle is legitimately
         * absent from the lists of the other chunks, so this test is
         * valid only when all particles fit into a single chunk.
         */
        if (n <= jmemsize && self == 0 && !is_overflown) {
            fprintf(stderr, "NG: self not found in the list, though the list is not overflown.\n");
            fprintf(stderr, "n:%d  jmemsize:%d\n", n, jmemsize);
            fprintf(stderr, "i_offset:%d  p:%d  i_offset+p:%d  nnb[p]:%d\n", i_offset, p, i_offset + p, nnb[p]);
            fprintf(stderr, "%d neighbors found: ", nnb_trimmed);
            fprintnblist(nnb_trimmed, nblist[p], stderr);
            exit(1);
        }

        // return the neighbor of 'tid'-th particle to the caller.
        // NOTE: the indices are relative to the current j chunk.
        if (i_offset + p == tid) {
            *ntnbp = nnb_trimmed;
            for (ii = 0; ii < nnb_trimmed; ii++) {
                tnblist[ii] = nblist[p][ii];
            }

            // more examination for 'tid'-th particle.
#if STRICT_CHECK
            /* The host searches all n particles, so the comparison is
             * meaningful only when G5 saw all of them in one chunk.
             */
            if (n <= jmemsize) {
                nb_handling_example_extra(n, xj, eps, nbrad, tid, nnb_trimmed, nblist[p], is_overflown);
            }
#endif // STRICT_CHECK
        }
        if (dumpfp) {
            fprintf(dumpfp, "nnb[%d]:%d ", i_offset + p, nnb_trimmed);
            // sorted so that the dump does not depend on the order G5 reports.
            qsort(nblist[p], nnb_trimmed, sizeof(int), islarge);

            if (n < 2048) {
                // dump all neighbor lists
                fprintf(dumpfp, "nb: ");
                fprintnblist(nnb_trimmed, nblist[p], dumpfp);
            }
            else {
                // too many neighbor lists for full dump.
                // use check sum instead.
                fprintf(dumpfp, "nbchecksum: ");
                fprintnblist_fingerprint(nnb_trimmed, nblist[p], dumpfp);
            }

        }
    } // end of p loop
}

/**************************************************************
 * complete check, but only for one particle indexed as 'tid'.
 **************************************************************/
/*
 * Cross-checks the neighbor list G5 returned for particle 'tid'
 * against a list built on the host.
 *
 *   n, xj, eps, nbrad  particle set and search parameters handed to
 *                      search_neighbor_on_host().
 *   tid                index of the particle under examination.
 *   nnb, nblist        length and contents of the list from G5.
 *   is_overflown       non-zero when the G5 list has overflown; the
 *                      host-to-G5 direction is skipped in that case.
 *
 * Entries present in one list but not in the other are passed to
 * check_distance_criterion(). When it reports any error, both lists
 * are printed to stderr and the program exits with status 1.
 */
void
nb_handling_example_extra(int n, double (*xj)[3], double eps, double nbrad,
			  int tid, int nnb, int *nblist, int is_overflown)
{
    static int host_count;                  // length of the host-side list
    static int host_list[NBBUFSIZE_HOST];   // neighbors of tid found on the host
    int suspects[NBBUFSIZE];                // entries missing from the other list
    int nsuspects;
    int nerr;

    // reference list computed on the host computer.
    search_neighbor_on_host(xj, eps, nbrad, n, tid, &host_count, host_list);

    // direction 1: everything G5 found should be known to the host, too.
    nsuspects = check_list0_is_covered_by_list1(nnb, host_count, nblist, host_list, suspects);
    nerr = check_distance_criterion(nsuspects, suspects, tid, xj, eps, nbrad);

    // direction 2: everything the host found should be known to G5,
    // provided the G5 list is complete.
    if (is_overflown == 0) {
        nsuspects = check_list0_is_covered_by_list1(host_count, nnb, host_list, nblist, suspects);
        nerr += check_distance_criterion(nsuspects, suspects, tid, xj, eps, nbrad);
    }

    if (nerr == 0) {
        return;
    }

    fprintf(stderr, "target particle: %d\n", tid);
    fprintf(stderr, "%d neighbors found on G5   : ", nnb);
    fprintnblist(nnb, nblist, stderr);
    fprintf(stderr, "%d neighbors found on host : ", host_count);
    fprintnblist(host_count, host_list, stderr);
    exit(1);
}

/*
 * Judges the particles in slist[0..nsuspicious-1], i.e. particles that
 * appear in only one of the two neighbor lists of particle 'tid', by
 * their softened distance to 'tid' as computed on the host.
 *
 *   dist <  rad * Margin   accepted; a note is printed additionally
 *                          when dist < rad / Margin.
 *   otherwise              counted as an error; the offending pair and
 *                          its distance are reported on stderr.
 *
 * Returns the number of errors found.
 */
int
check_distance_criterion(int nsuspicious, int *slist, int tid, double (*xj)[3], double eps, double rad)
{
    int i, err = 0;

    for (i = 0; i < nsuspicious; i++) {
        double dist = calc_distance(xj[tid], xj[slist[i]], eps);

        if (dist < rad * Margin) {
#if (WARNING_LEVEL > 1)
            fprintf(stderr, "distance between %d and %d is %8.5f,\n", tid, slist[i], dist);
            fprintf(stderr, "within neighbor radius %5.3f+%3.1f%% margin.\n"
                    "seems to be OK.\n", rad, (Margin - 1) * 100.0);
#endif
            if (dist < rad / Margin) {
                fprintf(stderr, "distance between %d and %d is %8.5f,\n", tid, slist[i], dist);
                fprintf(stderr, "distance seems too small.\n"
                        "probably the neighbor list on the host has overflown.\n"
                        "increase NBBUFSIZE_HOST and recompile to remove this message.\n");
            }
        }
        else {
            // name the pair here: this branch is the one that makes the
            // caller abort, and nothing else identifies the culprit.
            fprintf(stderr, "distance between %d and %d is %8.5f,\n", tid, slist[i], dist);
            fprintf(stderr, "outside neighbor radius %5.3f+-%3.1f%% margin. "
                    "seems to be NG.\n", rad, (Margin - 1) * 100.0);
            err++;
        }
#if (WARNING_LEVEL > 1)
        fprintf(stderr, "\n");
#endif
    }

    return err;
}

/*
 * Particles that reside in list0 but not in list1 are stored in difflist.
 * Returns the length of difflist.
 */
/*
 *   n0, list0   length and contents of the list to be examined.
 *   n1, list1   length and contents of the list to search in.
 *   difflist    (out) receives at most NBBUFSIZE entries; entries
 *               beyond that are silently dropped.
 */
int
check_list0_is_covered_by_list1(int n0, int n1, int *list0, int *list1,
				int *difflist)
{
    int nmissing = 0;
    int a, b;

    for (a = 0; a < n0; a++) {
        // linear search; b stops short of n1 exactly when list0[a] is found.
        b = 0;
        while (b < n1 && list1[b] != list0[a]) {
            b++;
        }
        if (b >= n1 && nmissing < NBBUFSIZE) {
            difflist[nmissing++] = list0[a];
        }
    }

#if (WARNING_LEVEL > 1)
    if (nmissing) {
        fprintf(stderr, "found list inconsistenc%s: ", nmissing > 1 ? "ies":"y");
        for (a = 0; a < nmissing; a++) {
            fprintf(stderr, "%d ", difflist[a]);
        }
        fprintf(stderr, "\n");
    }
#endif

    return nmissing;
}

double
calc_distance(double x0[3], double x1[3], double eps)
{
    double dist2 = eps * eps;
    int k;

    for (k = 0; k < 3; k++) {
	dist2 += (x0[k] - x1[k]) * (x0[k] - x1[k]);
    }

    return sqrt(dist2);
}

/*
 * Builds the neighbor list of particle 'tid' on the host.
 *
 * Every particle i in 0..n-1 (including 'tid' itself) whose softened
 * distance to 'tid' is smaller than nbrad is appended to tnblist in
 * ascending order of i. At most NBBUFSIZE_HOST entries are stored;
 * further neighbors are dropped. The number of entries stored is
 * returned through ntnb.
 */
void
search_neighbor_on_host(double (*xj)[3], double eps, double nbrad, int n, int tid,
			int *ntnb, int *tnblist)
{
    int stored = 0;
    int cand;

    for (cand = 0; cand < n; cand++) {
        double d = calc_distance(xj[cand], xj[tid], eps);

        if (!(d < nbrad)) {
            continue;   // not a neighbor.
        }
        if (stored < NBBUFSIZE_HOST) {
            tnblist[stored++] = cand;
        }
    }

    *ntnb = stored;
}

/*
 * Verifies that the compile-time buffer sizes are large enough for the
 * particle count n and for the values G5 reports at runtime. Prints a
 * message to stderr and exits with status 2 on the first violation:
 * n against NMAX, g5_get_nbmemsize() against NBBUFSIZE, and
 * g5_get_number_of_pipelines() against NPIPE, in this order.
 */
void
check_buffersize(int n)
{
    int pipes_needed = g5_get_number_of_pipelines();
    int nbmem_needed = g5_get_nbmemsize();

    if (n > NMAX) {
        fprintf(stderr, "%s line%d: too large n (%d)\n", __FILE__, __LINE__, n);
        exit(2);
    }

    if (g5_get_nbmemsize() > NBBUFSIZE) {
        fprintf(stderr, "%s line%d: increase NBBUFSIZE to %d and recompile.\n",
                __FILE__, __LINE__, nbmem_needed);
        exit(2);
    }

    if (pipes_needed > NPIPE) {
        fprintf(stderr, "%s line%d: increase NPIPE to %d and recompile.\n",
                __FILE__, __LINE__, pipes_needed);
        exit(2);
    }
}

/*
 * Writes nblist[0..nnb-1] to stream as decimal numbers, each followed
 * by a space, and terminates the line with a newline. Only the newline
 * is written when nnb <= 0.
 */
void
fprintnblist(int nnb, int *nblist, FILE *stream)
{
    const int *cur = nblist;
    int left;

    for (left = nnb; left > 0; left--) {
        fprintf(stream, "%d ", *cur);
        cur++;
    }
    fputs("\n", stream);
}

/*
 * Writes a fingerprint of nblist[0..nnb-1] to stream: the sum of all
 * entries, accumulated in an unsigned long long, printed as 16
 * zero-padded hexadecimal digits followed by a newline.
 */
void
fprintnblist_fingerprint(int nnb, int *nblist, FILE *stream)
{
    unsigned long long sum = 0ULL;
    int idx = 0;

    while (idx < nnb) {
        sum += (unsigned long long)nblist[idx];
        idx++;
    }
    fprintf(stream, "%016llx\n", sum);
}

/*
 * Comparison function for qsort() on arrays of int, ascending order.
 * Returns 1, 0 or -1 when *a is greater than, equal to or less than *b.
 */
int
islarge(const void *a, const void *b)
{
    const int lhs = *(const int *)a;
    const int rhs = *(const int *)b;

    // difference of two comparisons: no overflow, unlike 'lhs - rhs'.
    return (lhs > rhs) - (lhs < rhs);
}
