#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <errno.h>
#include <assert.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include "hibdrv.h"
#include "hibutil.h"
#include "g5util.h"

/*
 * Offset added to a double right before it is cast to an integer (see the
 * coordinate and mass conversions in this file). With 0.5 the truncating
 * cast rounds to nearest; with ICC_RCD defined the offset is 0.0.
 * NOTE(review): presumably the -rcd switch changes how the cast itself
 * rounds, which would make the offset unnecessary -- confirm.
 */
#ifdef ICC_RCD
#define ONEHALF (0.0) // Intel CC with -rcd switch
#else
#define ONEHALF (0.5) // standard C
#endif


#define DUMMYSIZE (32) /* this may need to be set to 32 or larger on some hosts. */
/*
 * Zero-fill Rbuf[devid] from index n up to DUMMYSIZE words, advancing n.
 * Relies on a variable named 'devid' being in scope at the point of use,
 * and n must be a modifiable lvalue. Expands to a bare while-loop, not a
 * single expression.
 */
#define PAD_DUMMY_DATA(n) while (n < DUMMYSIZE) { Rbuf[devid][n++] = 0; }

/*
 * Print a message to stderr when lv does not exceed warn_level.
 * Expands to an unbraced 'if' that already ends with ';', so it must not be
 * used as the body of an if/else without enclosing braces.
 */
#define WARN(lv, fmt, args...) if (lv <= warn_level) fprintf(stderr, fmt, ## args);
static int warn_level = 2; /* warning message output level. the higher the more verbose.
                              0: no warning (may cause wrong result with g7pkg/scripts/check.csh)
                              1: minimum
                              2: default
                              3: for debugging purpose
                           */
/*
 * Maximum number of JPs, IPs and FOs that can be transferred by
 * a single DMA transaction.
 */
#define IFIFOSIZE (512-1)    // input fifo size in 64-bit words.
#define OFIFOSIZE (1024-1)   // output fifo size in 64-bit words.

#define NJPWORDMAX (HIB_DMABUF_BYTES/4) // max JP DMA size in 32-bit words. 32k-word=128kB=32 pages=8k particles.
#define NIPMAX  (IFIFOSIZE)
#define NFOMAX  (OFIFOSIZE)
// per-device limits, filled in by recalculate_iobuf_attributes().
static UINT32 Njpmax[NHIBMAX];
static UINT32 Njpwordmax[NHIBMAX];
static UINT32 Nipmax[NHIBMAX];
static UINT32 Nfomax[NHIBMAX];

// default multiplicative correction applied to the force scale when the
// board uses the floating-point eps2 format (selected in g5_openMC()).
// #define DEFAULT_FORCE_CORRECTION_FP  (1.00055) // fp64tofp17tuned
#define DEFAULT_FORCE_CORRECTION_FP  (1.00070) // fp64tofp17tuned2

// default correction used when the board uses the logarithmic eps2 format.
#define DEFAULT_FORCE_CORRECTION_LOG (1.0/1.0027)
#define DEFAULT_POTENTIAL_CORRECTION (1.0/1.0006)

#define NCHIPMAX    (6)
#define FORCEFOSIZE (3)  // number of FO words for 64-bit force x 3.

// per-device HIB handles, set by g5_openMC().
Hib *H[NHIBMAX];
static UINT32 *Rbuf[NHIBMAX]; /* DMA read buffer */
static UINT32 *Wbuf[NHIBMAX]; /* DMA write buffer */

static UINT32 *Pbuf[NHIBMAX];
/* PIO write buffer. Do not define the size with HIB_PIOWBUF_BYTES,
   which denotes the size of the PIOW buffer in the HIB.
   Here we are defining a buffer in main memory,
   which needs the same size as the DMA buffers, i.e., HIB_DMABUF_BYTES. */

static UINT32 Nbodies[NHIBMAX];         // number of JPs per chip, set by g5_set_nMC().
static UINT32 Ieps2[NHIBMAX][NIPMAX];   // converted eps2 per i-particle (lower 16 bits of the 4th IP word).
static UINT32 Ih[NHIBMAX][NIPMAX];      // per i-particle value packed into the upper 16 bits of the 4th IP word.
static UINT32 Nretrieved[NHIBMAX]; // number of calculation results retrieved by the last g5_get_forceMC
static UINT32 Ni[NHIBMAX];              // ni given to the last g5_set_xiMC().
static UINT32 Jpsize[NHIBMAX]; // JP packet size in 64-bit word.
static UINT32 Ipsize[NHIBMAX]; // IP packet size in 64-bit word.
static UINT32 Fosize[NHIBMAX]; // FO packet size in 64-bit word.

// table filled by g5_set_rangeMC() with convert_double_to_grape_log() results.
#define MASS_TABLE_SIZE (1048576)
UINT32 Mass_conversion_table[NHIBMAX][MASS_TABLE_SIZE];

// arguments of the most recent g5_set_rangeMC() call.
static double Current_xmin[NHIBMAX];
static double Current_xmax[NHIBMAX];
static double Current_mmin[NHIBMAX];

// scaling factors derived in g5_set_rangeMC().
static double Xoffset[NHIBMAX];
static double Xscale[NHIBMAX];
static double Mscale[NHIBMAX];
static double Moffset[NHIBMAX];
static double Ascale[NHIBMAX];

static double Force_rounding_correction[NHIBMAX];
static double Potential_rounding_correction[NHIBMAX];

// number of pipelines reported by the board; 0 means the board has not
// been initialized by g5_openMC() yet.
static int g5_npipes[NHIBMAX];

static int g5_ncards = 0;       // number of cards selected; 0 until init_envs() has run.
static int g5_cards[NHIBMAX];   // non-zero for each device id that is selected.
static int g5_sendfunc = SENDFUNC_PIOW;

// values encoded in board_info register
static UINT32 g5_model[NHIBMAX];
static UINT32 g5_product[NHIBMAX];
static UINT32 g5_nchip[NHIBMAX];
static UINT32 g5_jmemsize[NHIBMAX];
static UINT32 g5_eps2format[NHIBMAX]; // 0:logarithmic    1:floating point
static UINT32 g5_p3mcutoff[NHIBMAX];  // 0:not available  1:available
static UINT32 g5_nbsearch[NHIBMAX];   // 0:not available  1:available
static UINT32 g5_potential[NHIBMAX];  // 0:not available  1:available

// local-space address tags (upper bits of the first packet word),
// assigned in g5_openMC().
static UINT32 Ipaddr[NHIBMAX];    //     ip packet
static UINT32 Jpaddr[NHIBMAX];    //     jp packet
static UINT32 Calcaddr[NHIBMAX];  //     calc command
static UINT32 Foregaddr[NHIBMAX]; //     fo register
static UINT32 Ipregaddr[NHIBMAX]; //     ip register
static UINT32 Cregaddr[NHIBMAX];  //     common register
/*
    pg_ctl.vhd local space address map:

    revision PGPG2 (compatible with G5PIPE version 1.3):
    ------------------------------------------------------------------
    hib_data
    31..28    63..48    47..32
    ------------------------------------------------------------------
    0000      0x0000     ndip * ni        IP packet header
    0100      jaddr      ndjp * nj        JP packet header
    1000      ndip*npipe N(16)            calc
    1100      ndip(16)   ni(16)           IP register
    1110      ndfo(16)   ni(16)           FO register
    1101      data(32)                    general-purpose registers
                                          for constants such as eta & rcut.
                                          addr:(63:56) data(55:32)
    ------------------------------------------------------------------

    original G5PIPE version 1.3 or earlier:
    -------------------------------------------------
    intf_data1(31:28) ipwon 
    -------------------------------------------------
                  00XX 1   IP
                  01XX 1   JP
                  10XX 0   calc
                  1100 0   IP reg (ni, nd)
                  111X 0   FO reg (ni, nd)
                  1101 0   eta
    -------------------------------------------------

    notes
      N     : number of j-particles to be processed in one run
      nj    : JP packet length
      ni    : IP packet length (number of virtual pipelines)
      npipe : number of real pipelines
      ndjp  : number of 64-bit words per 1 JP packet
      ndip  : number of 64-bit words per 1 IP packet
      ndfo  : number of 64-bit words per 1 FO packet
*/

// address of the eta register inside the common register space.
#define ADDR_ETA (0x00)
/* common register space address map:
    ------------------------------------------------------------------
    hib_data
    63..56    55..32
    ------------------------------------------------------------------
    00000000  eta       scale length of P3M cutoff
    ------------------------------------------------------------------
 */

// definitions specific to G5nbPIPE:
#define NBMEMSIZE   (24) // maximum number of neighbors per pipe.
#define NBPERWORD   (5)  // number of neighbors packed in one 64-bit word.
                         // e.g. five neighbors can be packed if jindex has 12-bit width.
static int Nbmemsize[NHIBMAX]; // number of neighbors per pipe. g5_openMC() sets it to 0 (no neighbor retrieval).
static UINT32 Nbmemsize_is_set_by_user[NHIBMAX]; // cleared to 0 by g5_openMC().


// local functions prototypes:

// environment / board setup
static void         init_envs(void);
static void         init_boardinfo(int devid);
static void         recalculate_iobuf_attributes(int devid);
static int          calculate_fosize(int devid);
static void         set_reg(int devid, UINT32 addr, UINT32 val);

// result retrieval and numerical format conversion
static void         get_forceMC(int devid, int ni, INT64 (*s)[3]);
static void         convert_force(int n, double scale, double (*a)[3], INT64 (*ia)[3], double *p, INT64 *ip);
static inline int   convert_mass(int devid, double mass);
static int          convert_eps(int type, double eps2);
static UINT32       fp64tofp17(double indata);
static UINT32       fp64tofp17tuned(double indata);
static UINT32       fp64tofp17tuned2(double indata);
static int          convert_double_to_grape_log(double x);

/*
 * Open every selected card.
 *
 * Runs init_envs() and then calls g5_openMC() for each device id
 * below hib_ndevice() whose g5_cards[] entry is non-zero.
 */
void
g5_open(void)
{
    int devid = 0;

    init_envs();
    while (devid < hib_ndevice()) {
        if (g5_cards[devid] != 0) {
            g5_openMC(devid);
        }
        devid++;
    }
}

/*
 * Open a single card and prepare it for calculation.
 *
 * devid: device id handed to the hib_*MC() functions.
 *
 * On the first open of a device (g5_npipes[devid] is still 0) the board is
 * reset, its board_info register is decoded by init_boardinfo(), the
 * address tags and default correction factors are set, and the PIO write
 * buffer Pbuf[devid] is allocated. Every call then selects the send method,
 * resets DMA and the backend again, and programs the IP and FO registers.
 * Exits the process if Pbuf cannot be allocated.
 */
void
g5_openMC(int devid)
{
    int nword, ni; // NOTE(review): neither variable is used in this function.

    init_envs();
    H[devid] = hib_openMC(devid);
    if (g5_npipes[devid] == 0) { // open for the first time.

	// reset DMA and the backend before the board is touched.
	hib_mem_writeMC(devid, H[devid]->r->command, (1<<H[devid]->r->command_dma_reset_bit));
	hib_mem_writeMC(devid, H[devid]->r->command, (1<<H[devid]->r->command_reset_backend_bit));

        // decodes the board_info register; fills g5_npipes[devid] among others.
        init_boardinfo(devid);

	Ipaddr[devid]    = 0x00000000; // ip packet
	Jpaddr[devid]    = 0x40000000; // jp packet
	Calcaddr[devid]  = 0x80000000; // calc command
	Foregaddr[devid] = 0xe0000000; // fo register
	Ipregaddr[devid] = 0xc0000000; // ip register
	Cregaddr[devid]  = 0xd0000000; // common register

        Nbmemsize[devid] = 0; // do not retrieve neighbor particle by default.
	Nbmemsize_is_set_by_user[devid] = 0;
	recalculate_iobuf_attributes(devid);

	// the default force correction depends on the eps2 format of the board.
	if (g5_eps2format[devid] == 0) {
	    Force_rounding_correction[devid] = DEFAULT_FORCE_CORRECTION_LOG;
	}
	else {
	    Force_rounding_correction[devid] = DEFAULT_FORCE_CORRECTION_FP;
	}
	WARN(3, "Force_rounding_correction[%d]: %e\n", devid, Force_rounding_correction[devid]);
        Potential_rounding_correction[devid] = DEFAULT_POTENTIAL_CORRECTION;
        Current_xmin[devid] = 0.0;
        Current_xmax[devid] = 0.0;
        Current_mmin[devid] = 0.0;

        // host-side staging buffer for PIO writes; same size as the DMA buffer.
        Pbuf[devid] = (UINT32 *)calloc(HIB_DMABUF_BYTES>>2, sizeof(UINT32));
        if (! Pbuf[devid]) {
            fprintf(stderr, "g5_openMC: failed to allocate a buffer Pbuf[%d]. abort.\n", devid);
            exit (1);
        }
    }

    // choose the buffer that outgoing packets are assembled in.
    WARN(3, "g5_sendfunc: %s\n", g5_sendfunc == SENDFUNC_DMAR ? "DMA read" : "PIO write");
    if (g5_sendfunc == SENDFUNC_PIOW) {
        hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
        Rbuf[devid] = Pbuf[devid];
    }
    else {
        hib_set_sendfuncMC(devid, SENDFUNC_DMAR);
	Rbuf[devid] = (UINT32 *)(H[devid]->dmar_buf);
    }
    /* using Rbuf as PIO write buffer would degrade
     * performance, since the buffer is marked up as 'non-cached'.
     */

    Wbuf[devid] = (UINT32 *)(H[devid]->dmaw_buf);

    hib_mem_writeMC(devid, H[devid]->r->command, (1<<H[devid]->r->command_dma_reset_bit)); // stop DMA
    hib_mem_writeMC(devid, H[devid]->r->command, (1<<H[devid]->r->command_reset_backend_bit)); // reset backend

    // program packet sizes and the pipeline count into the IP and FO registers.
    set_reg(devid, Ipregaddr[devid], Ipsize[devid]<<16 | g5_npipes[devid]);
    usleep(10);
    set_reg(devid, Foregaddr[devid], Fosize[devid]<<16 | g5_npipes[devid]);

    // set large enough eta so that cut off function always returns 1.0.
    // here 0xbfff is used as 'large enough eta' (nz:1 exp:011 1111 frac:1111 1111).
    if (g5_p3mcutoff[devid] == 1) {
        usleep(10);
        set_reg(devid, Cregaddr[devid], ADDR_ETA<<24 | (1<<15 | 0x3fff));
    }

    WARN(2, "g5[%d] opened.\n", devid);
}

/*
 * Close every selected card by calling g5_closeMC() on each device id
 * whose g5_cards[] entry is non-zero.
 */
void
g5_close(void)
{
    int devid = 0;

    while (devid < hib_ndevice()) {
        if (g5_cards[devid] != 0) {
            g5_closeMC(devid);
        }
        devid++;
    }
}

/*
 * Close a single card: the device id is simply handed to hib_closeMC().
 */
void
g5_closeMC(int devid)
{
    hib_closeMC(devid);
}


/*
 * Apply the same coordinate range and minimum mass to every selected card
 * (see g5_set_rangeMC()).
 */
void
g5_set_range(double xmin, double xmax, double mmin)
{
    int devid = 0;

    while (devid < hib_ndevice()) {
        if (g5_cards[devid] != 0) {
            g5_set_rangeMC(devid, xmin, xmax, mmin);
        }
        devid++;
    }
}

/*
 * Derive the scaling factors of one card from the coordinate range
 * [xmin, xmax] and the minimum mass mmin.
 *
 * Sets Xoffset, Xscale, Mscale, Moffset and Ascale for devid, rebuilds
 * Mass_conversion_table[devid] when mmin differs from the value given to
 * the previous call, and records the arguments in Current_xmin/xmax/mmin.
 *
 * Only the '#elif 1' variant below is compiled. No check is made that
 * xmax > xmin or mmin != 0; both are used as divisors.
 */
void
g5_set_rangeMC(int devid, double xmin, double xmax, double mmin)
{
    UINT32 i;
    double xsize;
    double mk;

    Xoffset[devid] = xmin;
    xsize = xmax - xmin;

#if 0 // for the first lot model300/600 only

    Xscale[devid] = pow(2.0,23.0)/xsize;
    Mscale[devid] = 1.0/mmin;
    Moffset[devid] = pow(2.0,69.0)*Mscale[devid];
    Ascale[devid] = (-Xscale[devid]*Xscale[devid])/Moffset[devid];
    Ascale[devid] *= Force_rounding_correction[devid];

    // recreate mass table only when mmin is changed.
    if (Current_mmin[devid] != mmin) {
        for (i = 0; i < MASS_TABLE_SIZE; i++) {
            mk = (i+0) * Moffset[devid] / Mscale[devid];
            Mass_conversion_table[devid][i] = convert_double_to_grape_log(mk);
        }
    }

#elif 1 // GRAPE-5 compatible 32-bit spatial resolution

    // map the range onto 2^32 integer steps.
    Xscale[devid] = pow(2.0, 32.0)/xsize;
    Mscale[devid] = 1.0/mmin;
    Moffset[devid] = pow(2.0, 96.0)*Mscale[devid];
    Ascale[devid] = (-Xscale[devid]*Xscale[devid])/Moffset[devid];
    Ascale[devid] *= pow(2.0, 23.0); // compensate G5-internal 23-bit rshift
    Ascale[devid] *= Force_rounding_correction[devid];

    /* 
     * amin = mmin * xmax/(xmax^2+ymax^2+zmax^2+eps^2)^(3/2)
     *      = mmin * 2^31/((2^31)^2 * 4)^(3/2)
     *      = mmin * 2^(31-96)
     * to avoid underflow, amin should be larger than 2^8. thus,
     * mmin * 2^(31-96) > 2^8
     * mmin > 2^(8+96-31)
     *      = 2^73
     */

    // recreate mass table only when mmin is changed.
    // Moffset/Mscale is 2^96 by construction, so entry i encodes i * 2^96
    // (up to floating-point rounding).
    if (Current_mmin[devid] != mmin) {
        for (i = 0; i < MASS_TABLE_SIZE; i++) {
            mk = (i+0) * Moffset[devid] / Mscale[devid];
            Mass_conversion_table[devid][i] = convert_double_to_grape_log(mk);
        }
    }

#else // GRAPE-3 compatible 20-bit spatial resolution

    Xscale[devid] = pow(2.0,20.0)/xsize;
    Mscale[devid] = 1.0/mmin;
    Moffset[devid] = pow(2.0,60.0)*Mscale[devid];
    Ascale[devid] = (-Xscale[devid]*Xscale[devid])/Moffset[devid];
    Ascale[devid] *= Force_rounding_correction[devid];

    // recreate mass table only when mmin is changed.
    if (Current_mmin[devid] != mmin) {
        for (i = 0; i < MASS_TABLE_SIZE; i++) {
            mk = (i+0) * pow(2.0, 60.0);
            Mass_conversion_table[devid][i] = convert_double_to_grape_log(mk);
        }
    }

#endif

    // remember the arguments so the next call can detect a change of mmin.
    Current_xmin[devid] = xmin;
    Current_xmax[devid] = xmax;
    Current_mmin[devid] = mmin;

}

/*
 * Set the cut-off scale length eta on every selected card
 * (see g5_set_etaMC()).
 */
void
g5_set_eta(double eta)
{
    int devid = 0;

    while (devid < hib_ndevice()) {
        if (g5_cards[devid] != 0) {
            g5_set_etaMC(devid, eta);
        }
        devid++;
    }
}

/*
 * Set the cut-off scale length eta on one card.
 *
 * When the board reports the cut-off function as available, eta is scaled
 * by Xscale[devid], converted with convert_double_to_grape_log() and
 * written to the eta register. Otherwise nothing is written and a warning
 * is printed once per process.
 */
void
g5_set_etaMC(int devid, double eta)
{
    static int firstcall = 1;

    if (g5_p3mcutoff[devid] != 0) {
        int ieta;

        eta *= Xscale[devid];
        ieta = convert_double_to_grape_log(eta);
        set_reg(devid, Cregaddr[devid], ADDR_ETA<<24 | ieta);
        return;
    }

    // cut-off not available: warn only the first time.
    if (firstcall == 1) {
        firstcall = 0;
        WARN(2, "Warning: cut-off function is not implemented in this revision "
             "of G5 pipeline. g5_set_etaMC() has no effect.\n");
    }
}

/*
 * Distribute nj j-particles over the selected cards.
 *
 * adr: JP memory address passed unchanged to every card.
 * nj:  total number of j-particles.
 * m:   masses, nj entries.
 * x:   positions, nj entries.
 *
 * The particles are split into contiguous slices of ceil(nj / g5_ncards)
 * entries; the last cards receive a shorter (possibly empty) slice.
 * Aborts when no card has been allocated yet, because the slice size
 * would otherwise be computed by an integer division by zero.
 */
void
g5_set_jp(int adr, int nj, double *m, double (*x)[3])
{
    int ic;
    int j0 = 0;   // index of the first particle of the current slice.
    int njj;      // number of particles in the current slice.

    if (g5_ncards <= 0) {
        fprintf(stderr, "g5_set_jp: no card is allocated. call g5_open() first. abort.\n");
        exit(1);
    }
    njj = (nj + g5_ncards - 1) / g5_ncards;

    for (ic = 0; ic < hib_ndevice(); ic++) {
        if (g5_cards[ic] == 0) {
            continue;
        }
        if (nj < j0 + njj) { // fewer particles left than a full slice.
            njj = nj - j0;
        }
        g5_set_jpMC(ic, adr, njj, m + j0, x + j0);
        j0 += njj;
    }
}

/*
 * Send j-particles to the memory units in the G5 backends. Multiple JPs are
 * packed into a packet whose size is a little smaller than
 * Njpwordmax[devid].  A single packet may or may not contain particles
 * which go to different memory units in different FPGAs.
 *
 * devid: device id.
 * adr:   JP memory address written into the JP packet header.
 * nj:    number of j-particles in m[] and x[].
 * m:     masses.
 * x:     positions.
 *
 * Particles are dealt round-robin to the chips of the board: particle
 * (jindex * nchip + ic) goes to chip ic. Each chip receives nnj entries;
 * entries beyond nj are sent as zeros.
 */
void
g5_set_jpMC(int devid, int adr, int nj, double *m, double (*x)[3])
{
    int nword;
    int nempty_cycle = 4;
    int nword_almost_overflow = Njpwordmax[devid] - (2 * 2 + nempty_cycle * 2 + Jpsize[devid] * 2);
                                                 // (packet header + empty cycle) * 2 + JP size in 32-bitword.
    int jsent, jbuffered, jindex;
    int ic, cid, nnj, i;
    int xj, yj, zj, mj;
    double xs, xo;

    xs = Xscale[devid];
    xo = Xoffset[devid];

    int nchip = g5_nchip[devid];
    nnj = (nj-1)/nchip + 1; // number of entries sent to each chip.

    // NOTE(review): jsent and jbuffered are only written, never read.
    jsent = 0;     // number of JPs sent.
    jbuffered = 0; // number of JPs packed to the DMA buffer.
    nword = 0;     // number of data words packed to the DMA buffer.

    for (ic = 0; ic < nchip; ic++) {

        jindex = 0; // index uniquely assigned to each JP in a chip.

        // IP reg packet:
        // cid is assigned only for models 2, 3 and 6, which are also the
        // only models that use it below.
        switch (g5_model[devid]) {
          case 2: // chipid of model600 & 300d starts from 1.
          case 6:
            cid = ic + 1;
            break;
          case 3: // that of model300 starts from 4.
            cid = ic + 4;
            break;
        }
        switch (g5_model[devid]) {
          case 2:
          case 3:
          case 6:
            for (i = 0; i < nempty_cycle; i++) {
                Rbuf[devid][nword++] = 0;
            }
            Rbuf[devid][nword++] = Ipregaddr[devid];
            Rbuf[devid][nword++] = cid<<28 | Ipsize[devid]<<16 | g5_npipes[devid]; // write chip id to IP reg.
            for (i = 0; i < nempty_cycle; i++) {
                Rbuf[devid][nword++] = 0;
            }
            break;
          default:
            // nothing to do for model100 & 800.
            break;
        }

        // JP packet tag:
        Rbuf[devid][nword++] = Jpaddr[devid];
        Rbuf[devid][nword++] = ((long long int)(adr) << 16) | Jpsize[devid] * nnj;

        // JP packet body:
        while (jindex < nnj) {
            // index 0,6,12,18... goes to chip0, 1,7,13,19 goes to chip1, and so on.
	    int jsrc = jindex * nchip + ic;
	    int jindextosend;
            if (jsrc < nj) {
                xj = ((UINT32) ((x[jsrc][0] - xo) * xs + ONEHALF));
                yj = ((UINT32) ((x[jsrc][1] - xo) * xs + ONEHALF));
                zj = ((UINT32) ((x[jsrc][2] - xo) * xs + ONEHALF));
                mj = convert_mass(devid, m[jsrc]);
		jindextosend = jindex;
            }
            else { // clear garbage at the end of memory.
                xj = yj = zj = mj = 0;
		jindextosend = -1; // becomes 0 after the +1 offset below.
            }

            // four 32-bit words per JP: x, y, z, then 12-bit index and 17-bit mass.
            Rbuf[devid][nword++] = (0xffffffff & xj);
            Rbuf[devid][nword++] = (0xffffffff & yj);
            Rbuf[devid][nword++] = (0xffffffff & zj);
            Rbuf[devid][nword++] = (0xfff & (jindextosend+1))<<17 | (0x1ffff & mj); // offset jindex by 1.
            jsent++;                // reset when this function begins.
            jbuffered++;            // reset when Rbuf is flushed.
            jindex++;               // reset when ic is incremented.

            // DMA buffer is full. flush to the HIB.
            if (nword >= nword_almost_overflow) {
                PAD_DUMMY_DATA(nword);
                hib_sendMC(devid, (nword+1)/2, (UINT64*)Rbuf[devid]);
                jbuffered = 0;
                nword = 0;
            }

        } // nnj loop

    } // ic loop

    // flush data remaining in the buffer.
    if (nword > 0) {
        PAD_DUMMY_DATA(nword);
        hib_sendMC(devid, (nword+1)/2, (UINT64*)Rbuf[devid]);
        jbuffered = 0;
        nword = 0;
    }

    // restore IP reg.
    // write chip id 0 (means broadcast) to IP reg.
    switch (g5_model[devid]) {
      case 2:
      case 3:
      case 6:
	set_reg(devid, Ipregaddr[devid], 0<<28 | Ipsize[devid]<<16 | g5_npipes[devid]);
        break;
      default:
        // nothing to do for model100 & 800.
        break;
    }
}

/*
 * Compute forces on ni i-particles.
 *
 * x:  positions of the i-particles.
 * a:  receives the accelerations.
 * p:  receives the potentials.
 * ni: number of i-particles.
 *
 * The particles are processed in batches of at most
 * g5_get_number_of_pipelines() entries; each batch is sent with
 * g5_set_xi(), run with g5_run() and read back with g5_get_force().
 */
void
g5_calculate_force_on_x(double (*x)[3], double (*a)[3], double *p, int ni)
{
    int first;
    int batch;
    int npipe = g5_get_number_of_pipelines();

    for (first = 0; first < ni; first += npipe) {
        // the last batch may be shorter than npipe.
        batch = (first + npipe > ni) ? (ni - first) : npipe;

        g5_set_xi(batch, x + first);
        g5_run();
        g5_get_force(batch, a + first, p + first);
    }
}

/*
 * Send the same ni i-particle positions to every selected card
 * (see g5_set_xiMC()).
 */
void
g5_set_xi(int ni, double (*x)[3])
{
    int devid = 0;

    while (devid < hib_ndevice()) {
        if (g5_cards[devid] != 0) {
            g5_set_xiMC(devid, ni, x);
        }
        devid++;
    }
}

void
g5_set_xiMC(int devid, int ni, double (*x)[3])
{
    int xi, yi, zi;
    int i, nword;
    double xs, xo;

    if (Nbodies[devid] == 0) return;

    xs = Xscale[devid];
    xo = Xoffset[devid];

    if (ni > Nipmax[devid]) {
	fprintf(stderr, "g5_set_xi: too large ni (%d). "
                "should not be larger than %d. abort.\n", ni, Nipmax[devid]);
	exit(1);
    }
    Ni[devid] = ni;
    nword = 0;
    Rbuf[devid][nword++] = Ipaddr[devid];
    Rbuf[devid][nword++] = Ipsize[devid] * ni;

    for (i = 0; i < ni; i++) {

	xi = (UINT32)((x[i][0] - xo) * xs + ONEHALF);
	yi = (UINT32)((x[i][1] - xo) * xs + ONEHALF);
	zi = (UINT32)((x[i][2] - xo) * xs + ONEHALF);

	Rbuf[devid][nword++] = 0xffffffff & xi;
	Rbuf[devid][nword++] = 0xffffffff & yi;
	Rbuf[devid][nword++] = 0xffffffff & zi;
	Rbuf[devid][nword++] = ((0xffff & Ih[devid][i])<<16) | (0xffff & Ieps2[devid][i]);
    }
    PAD_DUMMY_DATA(nword);
    hib_sendMC(devid, (nword+1)/2, (UINT64*)Rbuf[devid]);
}

/*
 * Start the calculation on every selected card (see g5_runMC()).
 */
void
g5_run(void)
{
    int devid = 0;

    while (devid < hib_ndevice()) {
        if (g5_cards[devid] != 0) {
            g5_runMC(devid);
        }
        devid++;
    }
}

/*
 * Start the calculation on one card and kick off retrieval of the results.
 *
 * Does nothing when Nbodies[devid] is 0. Otherwise sends the calc command
 * carrying Nbodies[devid] and starts a DMA write into Wbuf[devid] sized for
 * Ni[devid] rounded up to a multiple of g5_npipes[devid]. Completion of the
 * DMA write is not awaited here.
 *
 * Only the '#if 1' variant is compiled.
 */
void
g5_runMC(int devid)
{
    int nword;
    int ni = Ni[devid]; // ni of the last g5_set_xiMC().

    if (Nbodies[devid] == 0) return;

#if 1 // issue only one DMAW that retrieves all FOUTs at once.

    /* set N and run */
    nword = 0;
    Rbuf[devid][nword++] = Calcaddr[devid];
    Rbuf[devid][nword++] = ((Ipsize[devid] * g5_npipes[devid]) << 16) | Nbodies[devid];
    PAD_DUMMY_DATA(nword);
    hib_sendMC(devid, (nword+1)/2, (UINT64*)Rbuf[devid]);

    /* kick off DMA write */
    nword = ((ni-1)/g5_npipes[devid] + 1) * g5_npipes[devid]; /* ni must be rounded up by g5_npipes[devid] */
    nword = sizeof(long long)/sizeof(int) * Fosize[devid] * nword; // size in units of int.
    hib_start_dmawMC(devid, (nword+1)/2, (UINT64*)Wbuf[devid]);

#else // split DMAW so that FOUTs retrieval for previous runs are
      // parallelized with current run. this is for just in case the
      // HIB logic is not wise enough.

    int size_total, size_left, size;
    UINT64 *addr = (UINT64*)Wbuf[devid];
    int nburst = 256; // size in 64-bit word to be received in one DMAW burst.
                       // must be a multiple of 8 for unknown reason.

    /* kick off DMA write */
    size_total = ((ni-1)/g5_npipes[devid] + 1) * g5_npipes[devid]; // ni must be rounded up by g5_npipes[devid]
    size_total *= Fosize[devid]; // total size in 64-bit word to be received.
    size_left = size_total;

    while (size_left > 0) {
	if (size_left < nburst) {
	    size = size_left;
	}
	else {
	    size = nburst;
	}
	hib_start_dmawMC(devid, size, addr);

	if (size_left == size_total) { // send N and start run, after the 1st transfer kicked off.
	    nword = 0;
	    Rbuf[devid][nword++] = Calcaddr[devid];
	    Rbuf[devid][nword++] = ((Ipsize[devid] * g5_npipes[devid]) << 16) | Nbodies[devid];
	    PAD_DUMMY_DATA(nword);
	    hib_sendMC(devid, (nword+1)/2, (UINT64*)Rbuf[devid]);
	}
	
	// NOTE(review): size_left is always > 0 here, so this break looks unreachable.
	if (size_left <= 0) break; // don't need to wait for completion of the final dmaw.

	hib_finish_dmawMC(devid);
	size_left -= size;
	// NOTE(review): addr is a UINT64 pointer, so this advances by
	// size * sizeof(UINT64) elements rather than size -- verify before
	// enabling this variant.
	addr += size * sizeof(UINT64);
    }

#endif
}


/*
 * Tell the selected cards how many j-particles each of them holds.
 *
 * nj: total number of j-particles.
 *
 * Uses the same slicing as g5_set_jp(): ceil(nj / g5_ncards) particles per
 * card, with the last cards receiving a shorter (possibly empty) slice.
 * Aborts when no card has been allocated yet, because the slice size
 * would otherwise be computed by an integer division by zero.
 */
void
g5_set_n(int nj)
{
    int ic;
    int j0 = 0;   // index of the first particle of the current slice.
    int njj;      // number of particles in the current slice.

    if (g5_ncards <= 0) {
        fprintf(stderr, "g5_set_n: no card is allocated. call g5_open() first. abort.\n");
        exit(1);
    }
    njj = (nj + g5_ncards - 1) / g5_ncards;

    for (ic = 0; ic < hib_ndevice(); ic++) {
        if (g5_cards[ic] == 0) {
            continue;
        }
        if (nj < j0 + njj) { // fewer particles left than a full slice.
            njj = nj - j0;
        }
        g5_set_nMC(ic, njj);
        j0 += njj;
    }
}

void
g5_set_nMC(int devid, int n)
{
    Nbodies[devid] = (n-1)/g5_nchip[devid] + 1;
}

void
g5_get_force(int ni, double (*a)[3], double *pot)
{
    int ic, i, k;
    static INT64 iacctmp[NFOMAX][3], iacc[NFOMAX][3];
    static INT64 ipot[NFOMAX];

    for (i = 0; i < ni; i++) {
        for (k = 0; k < 3; k++) {
            iacc[i][k] = 0.0;
        }
    }
    for (ic = 0; ic < hib_ndevice(); ic++) {
	if (g5_cards[ic] == 0) continue;
	get_forceMC(ic, ni, iacctmp); // get force from DMAW buf
        for (i = 0; i < ni; i++) {
            for (k = 0; k < 3; k++) {
                iacc[i][k] += iacctmp[i][k];
            }
        }
    }
    for (ic = 0; ic < hib_ndevice(); ic++) {
	if (g5_cards[ic] != 0) break;
    }

    // convert 64-bit fixed point to 64-bit floating point
    convert_force(ni, Ascale[ic], a, iacc, pot, ipot);
}

/*
 * Retrieve the forces on ni i-particles from one card.
 *
 * devid: device id.
 * ni:    number of i-particles; the process exits when it is negative or
 *        exceeds Nfomax[devid].
 * a:     receives the accelerations.
 * pot:   receives the potentials (always 0.0, see convert_force()).
 *
 * The scratch buffers are allocated once, with one slot per possible
 * device, because they are indexed by devid below. The process exits when
 * the allocation fails. The buffers live until the process ends.
 */
void
g5_get_forceMC(int devid, int ni, double (*a)[3], double *pot)
{
    static INT64 (*iacc)[NFOMAX][3];
    static INT64 (*ipot)[NFOMAX];

    if (iacc == NULL) {
        iacc = (INT64 (*)[NFOMAX][3])calloc(NHIBMAX, sizeof *iacc);
        if (iacc == NULL) {
            perror("g5_get_forceMC");
            exit(1);
        }
    }
    if (ipot == NULL) {
        ipot = (INT64 (*)[NFOMAX])calloc(NHIBMAX, sizeof *ipot);
        if (ipot == NULL) {
            perror("g5_get_forceMC");
            exit(1);
        }
    }

    if (ni < 0 || (unsigned int)ni > Nfomax[devid]) {
        fprintf(stderr, "g5_get_forceMC: too large ni (%d). abort.\n", ni);
        exit(1);
    }

    get_forceMC(devid, ni, iacc[devid]); // get force from DMAW buf

    // convert numerical format (64-bit fixed point to 64-bit floating point)
    convert_force(ni, Ascale[devid], a, iacc[devid], pot, ipot[devid]);
}

/*
 * Set per-particle softening eps2[0..ni-1] on every selected card
 * (see g5_set_eps2MC()).
 */
void
g5_set_eps2(int ni, double *eps2)
{
    int devid = 0;

    while (devid < hib_ndevice()) {
        if (g5_cards[devid] != 0) {
            g5_set_eps2MC(devid, ni, eps2);
        }
        devid++;
    }
}

/*
 * Set one softening value for all i-particles on every selected card
 * (see g5_set_eps2_to_allMC()).
 */
void
g5_set_eps2_to_all(double eps2)
{
    int devid = 0;

    while (devid < hib_ndevice()) {
        if (g5_cards[devid] != 0) {
            g5_set_eps2_to_allMC(devid, eps2);
        }
        devid++;
    }
}

void
g5_set_eps2MC(int devid, int ni, double *eps2)
{
    int i;
    double xs = Xscale[devid];

    if (ni > Nipmax[devid]) {
	fprintf(stderr, "g5_set_eps2MC: too large ni (%d). abort.\n", ni);
	exit(1);
    }

    for (i = 0; i < ni; i++) {
        Ieps2[devid][i] = convert_eps(g5_eps2format[devid], eps2[i] * xs * xs);
    }
}

void
g5_set_eps2_to_allMC(int devid, double eps2)
{
    int i;
    double xs = Xscale[devid];
    double ieps = convert_eps(g5_eps2format[devid], eps2 * xs * xs);

    for (i = 0; i < Nipmax[devid]; i++) {
        Ieps2[devid][i] = ieps;
    }
}

/*
 * Return the smallest g5_get_number_of_pipelinesMC() value among the
 * selected cards, or 65536 when no card is selected.
 */
int
g5_get_number_of_pipelines(void)
{
    int smallest = 65536; // any large number will do.
    int devid;

    for (devid = 0; devid < hib_ndevice(); devid++) {
        if (g5_cards[devid] != 0) {
            int n = g5_get_number_of_pipelinesMC(devid);

            smallest = (smallest > n) ? n : smallest;
        }
    }

    return smallest;
}

/*
 * Return the sum of g5_get_jmemsizeMC() over all selected cards.
 */
int
g5_get_jmemsize(void)
{
    int total = 0;
    int devid = 0;

    while (devid < hib_ndevice()) {
        if (g5_cards[devid] != 0) {
            total += g5_get_jmemsizeMC(devid);
        }
        devid++;
    }
    return total;
}

/*
 * Return Nipmax[devid], the largest ni accepted by g5_set_xiMC() for
 * this card.
 */
int
g5_get_number_of_pipelinesMC(int devid)
{
    int npipe = Nipmax[devid];

    return npipe;
}

int
g5_get_jmemsizeMC(int devid)
{
    return g5_jmemsize[devid];
}

/*
 *
 * local functions
 *
 */

/*
 * Initialize variables used by the "standard" functions. This
 * initialization is not necessary for the "primitive" functions, and may
 * even be harmful in a multi-threaded application.
 *
 * On the first call only:
 *   - GWARNLEVEL (or G5_WARNLEVEL) sets warn_level when it holds a
 *     non-negative number; an empty or blank value is ignored.
 *   - GSENDFUNC (or G5_SENDFUNC) equal to "DMAR" selects DMA read as the
 *     send method.
 *   - g5_npipes[] is cleared.
 *
 * As long as no card is allocated (g5_ncards == 0):
 *   - GDEVICE (or G5_CARDS) is parsed as a space-separated list of device
 *     ids; only those cards are selected. An id listed more than once is
 *     counted once. The process exits on an id out of range.
 *   - without the variable, all hib_ndevice() cards are selected.
 *
 * Note that strtok() modifies the environment strings in place.
 */
static void
init_envs(void)
{
    int ic, i;
    int dummy[NHIBMAX];
    char *p;
    char *cardno;
    static int firstcall = 1;

    if (firstcall) {
        firstcall = 0;
        p = getenv("GWARNLEVEL");
        if (!p) { // for backward compatibility.
            p = getenv("G5_WARNLEVEL");
        }
        if (p) {
            // strtok() returns NULL for an empty or all-blank value;
            // never hand that to atoi().
            char *tok = strtok(p, " ");

            if (tok) {
                int tmp = atoi(tok);

                if (0 <= tmp) {
                    warn_level = tmp;
                }
            }
            WARN(3, "warn_level: %d\n", warn_level);
        }
        hib_set_warn_level(warn_level);

        p = getenv("GSENDFUNC");
        if (!p) { // for backward compatibility.
            p = getenv("G5_SENDFUNC");
        }
        if (p) {
            if (0 == strcmp("DMAR", p)) {
                g5_sendfunc = SENDFUNC_DMAR;
            }
        }
        for (i = 0; i < NHIBMAX; i++) {
            g5_npipes[i] = 0;
        }

    }

    if (g5_ncards == 0) {
        /* cards are not allocated yet.
           try to allocate cards specified by environment variable "G5_CARDS".
           try to allocate all cards, if G5_CARDS is not set. */

        p = getenv("GDEVICE");
        if (!p) { // for backward compatibility.
            p = getenv("G5_CARDS");
        }
        if (p) { // parse G5_CARDS
            for (ic = 0; ic < hib_ndevice(); ic++) {
                g5_cards[ic] = 0;
            }
            cardno = strtok(p, " ");
            while (cardno) {
                ic = atoi(cardno);
                if (ic < 0 || ic >= hib_ndevice()) {
                    fprintf(stderr, "G5_CARDS have device_id out of range: %d\n", ic);
                    exit(2);
                }
                // count each card once, even if it is listed repeatedly;
                // g5_ncards is used to split particles among the cards.
                if (g5_cards[ic] == 0) {
                    g5_cards[ic] = 1;
                    g5_ncards++;
                }
                cardno = strtok(NULL, " ");
            }

        }
        else { // G5_CARDS is not set
            g5_ncards = hib_ndevice();
            for (ic = 0; ic < hib_ndevice(); ic++) {
                g5_cards[ic] = 1;
            }
        }
        g5_get_cards(dummy);
    }

}

static void
init_boardinfo(int devid)
{
    int tmp;
    UINT32 binfo;

    binfo = hib_mem_readMC(devid, H[devid]->r->boardinfo);
    g5_model[devid] = (binfo >> 24) & 0xf;
    g5_product[devid] = (binfo >> 28) & 0xf;
    switch (g5_product[devid]) {
      case 1:
        WARN(1,"GRAPE-7(PCI-X) ");
	break;
      case 2:
        WARN(1,"GRAPE-7(PCIe) ");
	break;
      default:
        fprintf(stderr,"init_boardinfo: g5_product[%d]=%d  unknown product.\n",
		devid, g5_product[devid]);
	exit(2);
    }
    switch (g5_model[devid]) {
      case 1:
        g5_nchip[devid] = 1;
        WARN(1, "model100 ");
	break;
      case 3:
        g5_nchip[devid] = 3;
        WARN(1, "model300 [4-6] ");
	break;
      case 2:
        g5_nchip[devid] = 3;
        WARN(1, "model300 D[1-3] ");
	break;
      case 6:
        g5_nchip[devid] = 6;
        WARN(1, "model600 ");
	break;
      case 8:
        g5_nchip[devid] = 1;
        WARN(1, "model800 ");
	break;
      default:
        fprintf(stderr,"init_boardinfo: g5_model[%d]=%d  unknown model.\n",
		devid, g5_model[devid]);
	exit(2);
    }
    WARN(1, " g5_nchip:%d ", g5_nchip[devid]);
    g5_potential[devid]  = (binfo>>17) & 0x1;
    g5_nbsearch[devid]   = (binfo>>16) & 0x1;
    g5_p3mcutoff[devid]  = (binfo>>15) & 0x1;
    g5_eps2format[devid] = (binfo>>14) & 0x1;

    g5_jmemsize[devid] = (binfo>>8) & 0x3;
    g5_jmemsize[devid] = 2048 << g5_jmemsize[devid];
    g5_jmemsize[devid]--; // value '0' is used to indicate Nblist is empty.
    g5_jmemsize[devid] = g5_jmemsize[devid] * g5_nchip[devid];

    g5_npipes[devid] = binfo & 0xff;

    WARN(1, "g5_npipes:%d g5_jmemsize:%d g5_eps2format:%s g5_p3mcutoff:%s g5_nbsearch:%s g5_potential:%s\n",
         g5_npipes[devid],
         g5_jmemsize[devid],
         g5_eps2format[devid] == 0 ? "logarithmic"   : "floating-point",
         g5_p3mcutoff[devid]  == 0 ? "not-available" : "available",
         g5_nbsearch[devid]   == 0 ? "not-available" : "available",
	 g5_potential[devid]  == 0 ? "not-available" : "available");
}

/*
 * Recompute the packet sizes and per-transaction limits of one card.
 *
 * Jpsize and Ipsize are fixed at 2; Fosize comes from calculate_fosize().
 * Nipmax and Nfomax are the FIFO capacities divided by the packet size,
 * rounded down to a multiple of g5_npipes[devid], and then both set to the
 * smaller of the two.
 */
static void
recalculate_iobuf_attributes(int devid)
{
    Jpsize[devid] = 2;
    Ipsize[devid] = 2;
    Fosize[devid] = calculate_fosize(devid);

    Njpmax[devid] = IFIFOSIZE / Jpsize[devid]; // used only by g5_set_jpMC0 (obsolete).
    Njpwordmax[devid] = NJPWORDMAX - 100;

    // capacity in packets, rounded down to a multiple of the pipeline count.
    Nipmax[devid] = (IFIFOSIZE / Ipsize[devid]) / g5_npipes[devid] * g5_npipes[devid];
    Nfomax[devid] = (OFIFOSIZE / Fosize[devid]) / g5_npipes[devid] * g5_npipes[devid];

    // both limits take the smaller value.
    if (Nfomax[devid] < Nipmax[devid]) {
        Nipmax[devid] = Nfomax[devid];
    }
    else {
        Nfomax[devid] = Nipmax[devid];
    }
    WARN(3, "Fosize:%d Nfomax:%d\n", Fosize[devid], Nfomax[devid]);
}

/*
 * Return the FO packet size of one card in 64-bit words.
 *
 * Without neighbor retrieval (board lacks neighbor search, or
 * Nbmemsize[devid] is not positive) the size is FORCEFOSIZE. Otherwise the
 * words needed for the neighbor list are added: Nbmemsize entries for one
 * chip on models 1 and 8, for six chips on models 2, 3 and 6, packed
 * NBPERWORD per word. One more word is added when the result is below the
 * size obtained with NBMEMSIZE. Exits with status 2 on an unknown model.
 */
static int
calculate_fosize(int devid)
{
    int nchip_in_fo;
    int size, maxsize;

    if (!(g5_nbsearch[devid] && Nbmemsize[devid] > 0)) {
        return FORCEFOSIZE;
    }

    switch (g5_model[devid]) {
      case 1:
      case 8:
        nchip_in_fo = 1;
        break;

      case 2: // need to retrieve dummy data for chip 4-6, in addition to valid data for chip 1-3.
      case 3: // need to retrieve dummy data for chip 1-3, in addition to valid data for chip 4-6.
      case 6:
        nchip_in_fo = 6;
        break;

      default:
        fprintf(stderr, "unknown GRAPE-7 model: %d\n", g5_model[devid]);
        exit(2);
    }

    size    = FORCEFOSIZE + ((Nbmemsize[devid] * nchip_in_fo) - 1) / NBPERWORD + 1;
    maxsize = FORCEFOSIZE + ((NBMEMSIZE * nchip_in_fo) - 1) / NBPERWORD + 1;

    if (size < maxsize) {
        size++; // user set nbmemsize smaller than NBMEMSIZE.
                // need to retrieve (nbmemsize+1) NBs to check overflow.
    }

    return size;
}

/*
 * Write one register of a card.
 *
 * devid: device id.
 * addr:  first word of the command (register address tag).
 * val:   second word of the command (register value).
 *
 * The two words are placed at the start of Rbuf[devid], padded with zeros
 * up to DUMMYSIZE words, and sent with hib_sendMC().
 */
static void
set_reg(int devid, UINT32 addr, UINT32 val)
{
    int filled;

    Rbuf[devid][0] = addr;
    Rbuf[devid][1] = val;
    filled = 2; /* 2-word command; dummy words follow */
    PAD_DUMMY_DATA(filled);
    hib_sendMC(devid, (filled+1)/2, (UINT64*)Rbuf[devid]);
}

/*
 * Retrieve the raw 64-bit fixed-point forces on 'ni' i-particles from
 * the DMA write buffer of device 'devid' into s[0..ni-1][0..2].
 *
 * If no j-particle is registered (Nbodies[devid] == 0) the forces are
 * set to zero without touching the buffer.  Nretrieved[devid] is updated
 * to the number of forces actually read.
 */
static void
get_forceMC(int devid, int ni, INT64 (*s)[3])
{
    static int warned = 0;
    int ip, k, word;

    if (!warned) {
        warned = 1;
        WARN(2, "Warning: g5_get_forceMC() does not calculate potential.\n"
             "The value returned is just a dummy.\n");
    }

    /* no calculation done, and no valid data in Wbuf. */
    if (Nbodies[devid] == 0) {
        for (ip = 0; ip < ni; ip++) {
            for (k = 0; k < 3; k++) {
                s[ip][k] = 0;
            }
        }
        Nretrieved[devid] = 0;
        return;
    }

    /* wait DMA write completion */
    hib_finish_dmawMC(devid);

    /*
     * each particle occupies 2 * Fosize[devid] 32-bit words; the three
     * force components are the first three 64-bit words of that record.
     */
    word = 0;
    for (ip = 0; ip < ni; ip++) {
        for (k = 0; k < 3; k++) {
            // !!! correct only on little-endian architecture
            s[ip][k] = *(INT64 *)(Wbuf[devid] + word + 2 * k);
        }
        word += 2 * Fosize[devid];
    }

    Nretrieved[devid] = ni;
}

/*
 *  convert numerical format of the force
 *  from 64-bit fixed point to 64-bit floating point.
 *
 *  a[i][k] = scale * ia[i][k] for i in [0, n).
 *  the potential p[i] is always set to zero; ip is not referenced.
 */
static void
convert_force(int n, double scale,
              double (*a)[3], INT64 (*ia)[3],
              double *p, INT64 *ip)
{
    int idx;

    (void)ip; /* integer potential is not used. */

    for (idx = 0; idx < n; idx++) {
        a[idx][0] = scale * (double)ia[idx][0];
        a[idx][1] = scale * (double)ia[idx][1];
        a[idx][2] = scale * (double)ia[idx][2];
        p[idx] = 0.0;
    }
}

/*
 * Convert a particle mass to the integer mass word sent to device 'devid'.
 *
 * - mass == 0 yields 0.
 * - if the scaled mass (|mass| * Mscale[devid], rounded) is a valid index
 *   in [1, MASS_TABLE_SIZE), the word is looked up in
 *   Mass_conversion_table[devid]; bit 0x10000 is set for a negative mass.
 * - otherwise the word is computed as 256 * log2(|mass| * Moffset[devid]),
 *   masked to 15 bits, with 0x8000 set for a positive mass and 0x18000
 *   for a negative one.
 *
 * A very small mass is deliberately not truncated to zero (the same
 * behavior as GRAPE-5); a one-time warning is issued instead.
 */
static inline int
convert_mass(int devid, double mass)
{
    static double l2 = 0.0;
    double mo = Moffset[devid];
    double ms = Mscale[devid];
    double amass, scaled;
    int mj, imass;

    if (l2 == 0.0) {
        l2 = log(2.0);
    }

    if (mass == 0.0) return 0;

    amass = (mass < 0.0) ? -mass : mass;

    /*
     * Clamp before converting to int: converting a double that does not
     * fit in an int is undefined behavior, and in practice could turn a
     * very large mass into a negative index that is then misreported as
     * a very small mass. Any index >= MASS_TABLE_SIZE selects the
     * logarithmic path below, so the clamp does not change the result.
     */
    scaled = amass * ms + ONEHALF;
    if (scaled >= (double)MASS_TABLE_SIZE) {
        imass = MASS_TABLE_SIZE;
    }
    else {
        imass = scaled;
    }

    if (imass < 1) { // warn very small mass
        static int firstcall = 1;
        if (firstcall == 1) {
            firstcall = 0;
            WARN(1,
                 "Warning: g5_set_jp() detected a particle with mass %e, which is smaller\n"
                 "than the minimum value set by g5_set_range().\n"
                 "force from this particle may underflow\n", mass);
        }
    }

    if (imass >= MASS_TABLE_SIZE || imass < 1) { // do not use mass table.
        mj = ONEHALF + 256.0 * log(amass * mo) / l2;
        mj = mj & 0x7fff;
        if (mass > 0.0) {
            mj |= 0x8000;
        }
        else {
            mj |= 0x18000;
        }
    }
    else { // use mass table
        mj = Mass_conversion_table[devid][imass];
        if (mass < 0.0) {
            mj |= 0x10000;
        }
    }

    return mj;
}

/*
 * Convert the squared softening length 'eps2' to its integer word.
 *
 * type == 0 : logarithmic format, 256 * log2(eps2) masked to 15 bits
 *             with bit 0x8000 set.
 * otherwise : floating-point format, lower 16 bits of
 *             fp64tofp17tuned2(eps2).
 *
 * eps2 == 0 yields 0 in both formats. Any eps2 that is not positive
 * is reported and terminates the program with status 2.
 */
static int
convert_eps(int type, double eps2)
{
    static double l2 = 0.0;

    if (l2 == 0.0) {
        l2 = log(2.0);
    }

    if (eps2 == 0.0) {
        return 0;
    }

    if (!(eps2 > 0.0)) {
        fprintf(stderr, "negative eps2:%e\n", eps2);
        exit(2);
    }

    if (type == 0) { // eps2 in logarithmic format
        return ((int)(256.0 * log(+eps2) / l2 + ONEHALF) & 0x7fff) | 0x8000;
    }

    // eps2 in floating-point format
    return (fp64tofp17tuned2(eps2) & 0xffff);
}

/*
 * Convert a double to a 17-bit floating-point word laid out as
 * sign (bit 16) | exponent (bits 15-9, 7 bits) | mantissa (bits 8-0).
 *
 * The exponent is the integer part of log2(|indata|) and the mantissa is
 * the fractional part of the significand scaled by 512 and rounded.
 * A zero input is reported on stderr and converted to 0.
 */
static UINT32
fp64tofp17(double indata)
{
    double f;
    static double l2 = 0.0;
    UINT32 s, e, m;
    UINT32 fpdata;

    if (l2 == 0.0) {
	l2 = log(2.0);
    }

    // sign
    if (indata < 0) {
	s = 1;
    }
    else {
	s = 0;
    }

    f = fabs(indata);

    if (f == 0.0) {
	// indata is a double: it must be printed with a floating-point
	// conversion ("%x" with a double argument is undefined behavior).
	fprintf(stderr, "#### indata: %e\n", indata);
	return 0;
    }

    e = (int)(log(f) / l2) & 0x7f;
    m = (int)((pow(2.0, log(f) / l2 - (double)e) - 1.0) * 512.0 + ONEHALF) & 0x1ff;

    fpdata = (s << 16) | (e <<9) | m;

    return fpdata;
}

/*
 * convert double to G5 floating point.
 * indata need to be in the range (0.0, 2^63).
 */
static UINT32
fp64tofp17tuned(double indata)
{
    UINT64 m;
    UINT32 s, e;
    UINT32 fpdata;

    m = (UINT64)indata;
    s = m >> 63;
    if (s > 0) {
	fprintf(stderr, "#### fp64tofp17tuned too large input: %e\n", indata);
	return 0;
    }

    e = 0;
    while (1) {
        if (m < ((UINT64)1<<e)) break;
        e++;
    }
    e--;
    if (e > 9) {
        m = m >> (e-9);
    }
    else {
        m = m << (9-e);
    }
    m &= 0x1ff;

    fpdata = (e <<9) | m;

    return fpdata;
}

/*
 * fp64tofp17tuned with proper rounding,
 * with additional calculation cost.
 */
static UINT32
fp64tofp17tuned2(double indata)
{
    UINT64 m;
    UINT32 s, e;
    UINT32 fpdata;

    m = (UINT64)(indata + ONEHALF);
    s = m >> 63;
    if (s > 0) {
	fprintf(stderr, "#### fp64tofp17tuned2 too large input: %e\n", indata);
	return 0;
    }

    e = 0;
    while (1) {
        if (m < ((UINT64)1<<e)) break;
        e++;
    }
    e--;
    if (e > 9) {
	UINT64 mcut;
        m = m >> (e-9);

	// round to the nearest integer
	mcut = m << (e-9);
	if ((m - mcut) > (1 << (e-10))) {
	    m += 1;
	}
    }
    else {
        m = m << (9-e);
    }
    m &= 0x1ff;

    fpdata = (e <<9) | m;

    return fpdata;
}

/*
 * Convert a double to the logarithmic integer format.
 *
 * The result is 0 for x == 0. Otherwise it is 256 * log2(|x|) (rounded)
 * in the lower 15 bits, ORed with 0x8000 for a positive x or 0x18000
 * for a negative x.
 * A negative logarithmic part yields 0. One that does not fit in
 * 15 bits is reported on stderr and also yields 0.
 */
static int
convert_double_to_grape_log(double x)
{
    static double l2 = 0.0;
    double ax;
    int flags, logpart;

    if (l2 == 0.0) {
        l2 = log(2.0);
    }

    if (x == 0.0) {
        return 0;
    }

    if (x < 0.0) {
        flags = 0x18000;
        ax = -x;
    }
    else {
        flags = 0x8000;
        ax = x;
    }

    logpart = ONEHALF + 256.0*(log(ax)/l2);

    if (logpart < 0) {
        return 0;
    }
    if (logpart >= 0x8000) {
        fprintf(stderr,"convert_double_to_grape_log: too large x %e\n", x);
        return 0;
    }

    return flags | logpart;
}

/*
 * definitions of obsolete functions.
 * these may not be supported in the next revision.
 */
#include "g5oldapi.c"

/*
 * include the definitions of library functions specific to G5nbPIPE,
 * if the constant 'G5NBUTIL' is defined.
 */
#ifdef G5NBUTIL
#include "g5nbapi.c"
#endif
