/*
 * pg2g6nbutil.c: pg2g6nb user library.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <errno.h>
#include <assert.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <pg2g6nbutil.h>
#define PGUTIL_HEADER
#include "pgemu.h"
#undef PGUTIL_HEADER

/*
 * ONEHALF is 0.0 when ICC_RCD is defined and 0.5 otherwise.
 * NOTE(review): presumably the rounding offset added before float-to-integer
 * conversion elsewhere in this file -- confirm against its users.
 */
#ifdef ICC_RCD
#define ONEHALF (0.0) // Intel CC with -rcd switch
#else
#define ONEHALF (0.5) // standard C
#endif

/*
 * DUMMYSIZE: number of 32-bit words every send buffer is padded up to.
 * PAD_DUMMY_DATA(n): zero-fill Rbuf[devid] from word n up to DUMMYSIZE words,
 * advancing n. a variable named 'devid' must be in scope where it is expanded.
 */
#define DUMMYSIZE (2) /* this may need to be set to 32 or larger on some hosts. */
#define PAD_DUMMY_DATA(n) while (n < DUMMYSIZE) { Rbuf[devid][n++] = 0; }

/*
 * WARN(lv, fmt, ...): print a message to stderr when lv does not exceed
 * warn_level. the expansion is a bare 'if' statement that already ends
 * with ';', so take care when using it as the body of an if/else.
 */
#define WARN(lv, fmt, args...) if (lv <= warn_level) fprintf(stderr, fmt, ## args);
static int warn_level = 2; /* warning message output level. the higher the more verbose.
                              0: no warning (may cause wrong result with g7pkg/scripts/check.csh)
                              1: minimum
                              2: default
                              3: for debugging purpose
                           */

/*
 * maximum number of JPs, IPs and FOs that can be transferred by
 * a single DMA transaction.
 * NJPWORDMAX relies on HIB_DMABUF_BYTES, which may only be defined further
 * down in this file; that is fine because macros are expanded at the point of use.
 */
#define IFIFOSIZE  (512-1)    // input fifo size in 64-bit word.
#define OFIFOSIZE  (1024-1)   // output fifo size in 64-bit word.
#define NJPWORDMAX (HIB_DMABUF_BYTES/4) // max JP DMA size in 32-bit word. 32k-word= 128kB = 32 pages = 8k particles.
#define NIPMAX     (IFIFOSIZE)
#define NFOMAX     (OFIFOSIZE)

/*
 * per-device transfer limits, indexed by device id.
 * Nipmax bounds 'ni' in pg2g6nb_set_ipMC(); Njpwordmax bounds the number of
 * 32-bit words buffered in pg2g6nb_set_jpMC().
 * NOTE(review): Njpmax and Nfomax are presumably the analogous JP/FO limits -- confirm.
 */
static UINT32 Njpmax[NHIBMAX];
static UINT32 Njpwordmax[NHIBMAX];
static UINT32 Nipmax[NHIBMAX];
static UINT32 Nfomax[NHIBMAX];

static Hib *H[NHIBMAX] = { 0 }; /* handles returned by hib_openMC(), indexed by device id */
static UINT32 *Rbuf[NHIBMAX]; /* DMA read buffer */
static UINT32 *Wbuf[NHIBMAX]; /* DMA write buffer */

/* fall back to the larger of the two board-specific DMA buffer sizes. */
#ifndef HIB_DMABUF_BYTES
#define HIB_DMABUF_BYTES (GRAPE7X_DMABUF_BYTES > GRAPE7E_DMABUF_BYTES ? GRAPE7X_DMABUF_BYTES : GRAPE7E_DMABUF_BYTES)
#endif

static UINT32 Pbuf[NHIBMAX][HIB_DMABUF_BYTES/4]; /* PIO write buffer (128kB).
                                                 do not define the size with HIB_PIOWBUF_BYTES,
                                                 which denotes the size of the hardware buffer in HIB.
                                                 here we are defining a buffer in main memory,
                                                 which needs the same size as the DMA buffer. */
static UINT32 Nbodies[NHIBMAX]; // number of JPs per chip, set by pg2g6nb_set_nMC.
static UINT32 Nretrieved[NHIBMAX]; // number of calculation results retrieved by the last pg2g6nb_get_foutMC
static UINT32 Ni[NHIBMAX]; // number of IPs passed to the last pg2g6nb_set_ipMC.
static UINT32 Jpsize[NHIBMAX]; // JP packet size in 64-bit word.
static UINT32 Ipsize[NHIBMAX]; // IP packet size in 64-bit word.
static UINT32 Fosize[NHIBMAX]; // FO packet size in 64-bit word.

/* Npipes below has a fixed size of 16 entries, so NHIBMAX must fit. */
#if NHIBMAX > 16
#error NHIB must not exceed 16
#endif
/* per-device pipeline count. a value of 0 is taken by pg2g6nb_openMC
   to mean that the device has not been opened before. */
static int Npipes[16] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};

static int Ndevice = 0; /* number of valid entries in DeviceIdInUse */
static int DeviceIdInUse[NHIBMAX]; /* ids of the devices the non-MC functions operate on */
static int Sendfunc = SENDFUNC_PIOW; /* host-to-device transfer method selected in pg2g6nb_openMC */

// values encoded in board_info register
static UINT32 DeviceModelId[NHIBMAX];
static UINT32 DeviceProductId[NHIBMAX];
static UINT32 Nchip[NHIBMAX];
static UINT32 Jmemsize[NHIBMAX];

// packet address tags, assigned in pg2g6nb_openMC and written
// as the first 32-bit word of each packet.
static UINT32 Ipaddr[NHIBMAX];    //     ip packet
static UINT32 Jpaddr[NHIBMAX];    //     jp packet
static UINT32 Calcaddr[NHIBMAX];  //     calc command
static UINT32 Foregaddr[NHIBMAX]; //     fo register
static UINT32 Ipregaddr[NHIBMAX]; //     ip register
static UINT32 Coeffaddr[NHIBMAX];  //     coefficient register

/*
    pg_ctl.vhd local space address map:
    ------------------------------------------------------------------
    hib_data
    31..28    63..48    47..32
    ------------------------------------------------------------------
    0000      0x0000     ndip * ni        IP packet header
    0100      jaddr      ndjp * nj        JP packet header
    1000      ndip*npipe N(16)            calc
    1100      ndip(16)   ni(16)           IP register
    1110      ndfo(16)   ni(16)           FO register
    1101      data(32)                    general-purpose registers
                                          for constant coefficients
                                          such as eta & rcut.
                                          addr:(63:56) data(55:32)
    ------------------------------------------------------------------

    coefficient register space address map:
    ------------------------------------------------------------------
    hib_data
    63..56    55..32
    ------------------------------------------------------------------
    00000000  param0
    00000001  param1
    00000010  param2
     ...       ...
    ------------------------------------------------------------------
 */

/*
 * local functions
 */
static void   init_envs(void);
static void   init_boardinfoMC(int devid);
static void   recalculate_iobuf_attributesMC(int devid);
static void   set_regMC(int devid, UINT32 addr, UINT32 val);
static void   initialize_scale_factorMC(int devid);
static UINT64 compose_float(UINT64 sign, UINT64 exp, UINT64 man, int wexp, int wman);
static void decompose_float(UINT64 src, int wexp, int wman, UINT64 *signp, UINT64 *expp, UINT64 *manp);

// COEFF conversion
static inline UINT64 convert_ti(int devid, double src);
static inline UINT64 convert_etainv(int devid, double src);

// JP conversion
static inline UINT64 convert_mj(int devid, double src);
static inline UINT64 convert_xj(int devid, double src);
static inline UINT64 convert_vj(int devid, double src);
static inline UINT64 convert_tj(int devid, double src);
static inline UINT64 convert_acc0by2(int devid, double src);
static inline UINT64 convert_jerk0by6(int devid, double src);
static inline UINT64 convert_indexj(int devid, int src);

// IP conversion
static inline UINT64 convert_xi(int devid, double src);
static inline UINT64 convert_vi(int devid, double src);
static inline UINT64 convert_epsi2(int devid, double src);
static inline UINT64 convert_indexi(int devid, int src);
static inline UINT64 convert_hi2(int devid, double src);

// FO conversion
static inline double convert_acc(int devid, UINT64 src);
static inline double convert_pot(int devid, UINT64 src);
static inline double convert_jerk(int devid, UINT64 src);
static inline int convert_nblist_ovflw(int devid, UINT64 src);
static inline int convert_nblist(int devid, UINT64 src);
static inline int convert_nnb_ovflw(int devid, UINT64 src);
static inline int convert_nnb(int devid, UINT64 src);
static inline int convert_nnbr2_ovflw(int devid, UINT64 src);
static inline double convert_nnbr2(int devid, UINT64 src);


static void unpack_foutMC(int devid, int ni, UINT64 (*iacc)[3], UINT64 *ipot, UINT64 (*ijerk)[3], UINT64 *inblist_ovflw, UINT64 (*inblist)[32], UINT64 *innb_ovflw, UINT64 *innb, UINT64 *innbr2_ovflw, UINT64 *innbr2);

/*
 * scaling utilities for 'ti' of type float63.52:
 */

/* per-device scale factor for 'ti', indexed by device id. */
static double Ti_scale[NHIBMAX];

/*
 * Set the 'ti' scale factor to `scale` on every device in use,
 * i.e. for the first Ndevice entries of DeviceIdInUse.
 */
inline void
pg2g6nb_set_scale_ti(double scale)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_set_scale_tiMC(DeviceIdInUse[idx], scale);
        idx++;
    }
}

/*
 * Record `scale` as the 'ti' scale factor of device `devid`.
 * devid is used as an array index without a range check.
 */
inline void
pg2g6nb_set_scale_tiMC(int devid, double scale)
{
    double *slot = &Ti_scale[devid];

    *slot = scale;
}

/*
 * Return the 'ti' scale factor of the first device in use
 * (DeviceIdInUse[0]).
 */
inline double
pg2g6nb_get_scale_ti(void)
{
    const int first = DeviceIdInUse[0];

    return pg2g6nb_get_scale_tiMC(first);
}

/*
 * Return the 'ti' scale factor recorded for device `devid`.
 */
inline double
pg2g6nb_get_scale_tiMC(int devid)
{
    const double factor = Ti_scale[devid];

    return factor;
}

/*
 * scaling utilities for 'etainv' of type float24.13:
 */

/* per-device scale factor for 'etainv', indexed by device id. */
static double Etainv_scale[NHIBMAX];

/*
 * Set the 'etainv' scale factor to `scale` on every device in use,
 * i.e. for the first Ndevice entries of DeviceIdInUse.
 */
inline void
pg2g6nb_set_scale_etainv(double scale)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_set_scale_etainvMC(DeviceIdInUse[idx], scale);
        idx++;
    }
}

/*
 * Record `scale` as the 'etainv' scale factor of device `devid`.
 * devid is used as an array index without a range check.
 */
inline void
pg2g6nb_set_scale_etainvMC(int devid, double scale)
{
    double *slot = &Etainv_scale[devid];

    *slot = scale;
}

/*
 * Return the 'etainv' scale factor of the first device in use
 * (DeviceIdInUse[0]).
 */
inline double
pg2g6nb_get_scale_etainv(void)
{
    const int first = DeviceIdInUse[0];

    return pg2g6nb_get_scale_etainvMC(first);
}

/*
 * Return the 'etainv' scale factor recorded for device `devid`.
 */
inline double
pg2g6nb_get_scale_etainvMC(int devid)
{
    const double factor = Etainv_scale[devid];

    return factor;
}


/*
 * scaling utilities for 'mj' of type float34.23:
 */

/* per-device scale factor for 'mj', indexed by device id. */
static double Mj_scale[NHIBMAX];

/*
 * Set the 'mj' scale factor to `scale` on every device in use,
 * i.e. for the first Ndevice entries of DeviceIdInUse.
 */
inline void
pg2g6nb_set_scale_mj(double scale)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_set_scale_mjMC(DeviceIdInUse[idx], scale);
        idx++;
    }
}

/*
 * Record `scale` as the 'mj' scale factor of device `devid`.
 * devid is used as an array index without a range check.
 */
inline void
pg2g6nb_set_scale_mjMC(int devid, double scale)
{
    double *slot = &Mj_scale[devid];

    *slot = scale;
}

/*
 * Return the 'mj' scale factor of the first device in use
 * (DeviceIdInUse[0]).
 */
inline double
pg2g6nb_get_scale_mj(void)
{
    const int first = DeviceIdInUse[0];

    return pg2g6nb_get_scale_mjMC(first);
}

/*
 * Return the 'mj' scale factor recorded for device `devid`.
 */
inline double
pg2g6nb_get_scale_mjMC(int devid)
{
    const double factor = Mj_scale[devid];

    return factor;
}

/*
 * scaling utilities for 'xj' of type int64:
 */

/* per-device 'xj' conversion parameters, indexed by device id.
   all four are written together by pg2g6nb_set_range_xjMC. */
static double Xj_scale[NHIBMAX];
static double Xj_offset[NHIBMAX];
static double Xj_min[NHIBMAX];
static double Xj_max[NHIBMAX];

/*
 * Set the 'xj' coordinate range [min, max] on every device in use,
 * i.e. for the first Ndevice entries of DeviceIdInUse.
 */
void
pg2g6nb_set_range_xj(double min, double max)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_set_range_xjMC(DeviceIdInUse[idx], min, max);
        idx++;
    }
}

/*
 * Record the 'xj' range of device `devid` and derive its conversion
 * parameters: the offset is `min` and the scale is 2^64 / (max - min).
 * No check is made for max == min, in which case the division is by zero.
 */
void
pg2g6nb_set_range_xjMC(int devid, double min, double max)
{
    const double span = max - min;

    Xj_min[devid] = min;
    Xj_max[devid] = max;
    Xj_offset[devid] = min;
    Xj_scale[devid] = pow(2.0, (double)64) / span;
}

/*
 * Fetch the 'xj' range of the first device in use (DeviceIdInUse[0])
 * into *min and *max.
 */
void
pg2g6nb_get_range_xj(double *min, double *max)
{
    const int first = DeviceIdInUse[0];

    pg2g6nb_get_range_xjMC(first, min, max);
}

/*
 * Store the 'xj' range recorded for device `devid` into *min and *max.
 * Both pointers are written unconditionally and must not be NULL.
 */
void
pg2g6nb_get_range_xjMC(int devid, double *min, double *max)
{
    min[0] = Xj_min[devid];
    max[0] = Xj_max[devid];
}

/*
 * Return the 'xj' scale factor of the first device in use
 * (DeviceIdInUse[0]).
 */
inline double
pg2g6nb_get_scale_xj(void)
{
    const int first = DeviceIdInUse[0];

    return pg2g6nb_get_scale_xjMC(first);
}

/*
 * Return the 'xj' scale factor recorded for device `devid`.
 */
inline double
pg2g6nb_get_scale_xjMC(int devid)
{
    const double factor = Xj_scale[devid];

    return factor;
}

/*
 * Return the 'xj' offset of the first device in use
 * (DeviceIdInUse[0]).
 */
inline double
pg2g6nb_get_offset_xj(void)
{
    const int first = DeviceIdInUse[0];

    return pg2g6nb_get_offset_xjMC(first);
}

inline double
pg2g6nb_get_offset_xjMC(int devid)
{
    return Xj_offset[devid];
}

/*
 * scaling utilities for 'vj' of type float28.17:
 */

/* per-device scale factor for 'vj', indexed by device id. */
static double Vj_scale[NHIBMAX];

/*
 * Set the 'vj' scale factor to `scale` on every device in use,
 * i.e. for the first Ndevice entries of DeviceIdInUse.
 */
inline void
pg2g6nb_set_scale_vj(double scale)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_set_scale_vjMC(DeviceIdInUse[idx], scale);
        idx++;
    }
}

/*
 * Record `scale` as the 'vj' scale factor of device `devid`.
 * devid is used as an array index without a range check.
 */
inline void
pg2g6nb_set_scale_vjMC(int devid, double scale)
{
    double *slot = &Vj_scale[devid];

    *slot = scale;
}

/*
 * Return the 'vj' scale factor of the first device in use
 * (DeviceIdInUse[0]).
 */
inline double
pg2g6nb_get_scale_vj(void)
{
    const int first = DeviceIdInUse[0];

    return pg2g6nb_get_scale_vjMC(first);
}

/*
 * Return the 'vj' scale factor recorded for device `devid`.
 */
inline double
pg2g6nb_get_scale_vjMC(int devid)
{
    const double factor = Vj_scale[devid];

    return factor;
}

/*
 * scaling utilities for 'tj' of type float63.52:
 */

/* per-device scale factor for 'tj', indexed by device id. */
static double Tj_scale[NHIBMAX];

/*
 * Set the 'tj' scale factor to `scale` on every device in use,
 * i.e. for the first Ndevice entries of DeviceIdInUse.
 */
inline void
pg2g6nb_set_scale_tj(double scale)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_set_scale_tjMC(DeviceIdInUse[idx], scale);
        idx++;
    }
}

/*
 * Record `scale` as the 'tj' scale factor of device `devid`.
 * devid is used as an array index without a range check.
 */
inline void
pg2g6nb_set_scale_tjMC(int devid, double scale)
{
    double *slot = &Tj_scale[devid];

    *slot = scale;
}

/*
 * Return the 'tj' scale factor of the first device in use
 * (DeviceIdInUse[0]).
 */
inline double
pg2g6nb_get_scale_tj(void)
{
    const int first = DeviceIdInUse[0];

    return pg2g6nb_get_scale_tjMC(first);
}

/*
 * Return the 'tj' scale factor recorded for device `devid`.
 */
inline double
pg2g6nb_get_scale_tjMC(int devid)
{
    const double factor = Tj_scale[devid];

    return factor;
}

/*
 * scaling utilities for 'acc0by2' of type float28.17:
 */

/* per-device scale factor for 'acc0by2', indexed by device id. */
static double Acc0by2_scale[NHIBMAX];

/*
 * Set the 'acc0by2' scale factor to `scale` on every device in use,
 * i.e. for the first Ndevice entries of DeviceIdInUse.
 */
inline void
pg2g6nb_set_scale_acc0by2(double scale)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_set_scale_acc0by2MC(DeviceIdInUse[idx], scale);
        idx++;
    }
}

/*
 * Record `scale` as the 'acc0by2' scale factor of device `devid`.
 * devid is used as an array index without a range check.
 */
inline void
pg2g6nb_set_scale_acc0by2MC(int devid, double scale)
{
    double *slot = &Acc0by2_scale[devid];

    *slot = scale;
}

/*
 * Return the 'acc0by2' scale factor of the first device in use
 * (DeviceIdInUse[0]).
 */
inline double
pg2g6nb_get_scale_acc0by2(void)
{
    const int first = DeviceIdInUse[0];

    return pg2g6nb_get_scale_acc0by2MC(first);
}

/*
 * Return the 'acc0by2' scale factor recorded for device `devid`.
 */
inline double
pg2g6nb_get_scale_acc0by2MC(int devid)
{
    const double factor = Acc0by2_scale[devid];

    return factor;
}

/*
 * scaling utilities for 'jerk0by6' of type float28.17:
 */

/* per-device scale factor for 'jerk0by6', indexed by device id. */
static double Jerk0by6_scale[NHIBMAX];

/*
 * Set the 'jerk0by6' scale factor to `scale` on every device in use,
 * i.e. for the first Ndevice entries of DeviceIdInUse.
 */
inline void
pg2g6nb_set_scale_jerk0by6(double scale)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_set_scale_jerk0by6MC(DeviceIdInUse[idx], scale);
        idx++;
    }
}

/*
 * Record `scale` as the 'jerk0by6' scale factor of device `devid`.
 * devid is used as an array index without a range check.
 */
inline void
pg2g6nb_set_scale_jerk0by6MC(int devid, double scale)
{
    double *slot = &Jerk0by6_scale[devid];

    *slot = scale;
}

/*
 * Return the 'jerk0by6' scale factor of the first device in use
 * (DeviceIdInUse[0]).
 */
inline double
pg2g6nb_get_scale_jerk0by6(void)
{
    const int first = DeviceIdInUse[0];

    return pg2g6nb_get_scale_jerk0by6MC(first);
}

/*
 * Return the 'jerk0by6' scale factor recorded for device `devid`.
 */
inline double
pg2g6nb_get_scale_jerk0by6MC(int devid)
{
    const double factor = Jerk0by6_scale[devid];

    return factor;
}


/*
 * scaling utilities for 'xi' of type int64:
 */

/* per-device 'xi' conversion parameters, indexed by device id.
   all four are written together by pg2g6nb_set_range_xiMC. */
static double Xi_scale[NHIBMAX];
static double Xi_offset[NHIBMAX];
static double Xi_min[NHIBMAX];
static double Xi_max[NHIBMAX];

/*
 * Set the 'xi' coordinate range [min, max] on every device in use,
 * i.e. for the first Ndevice entries of DeviceIdInUse.
 */
void
pg2g6nb_set_range_xi(double min, double max)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_set_range_xiMC(DeviceIdInUse[idx], min, max);
        idx++;
    }
}

/*
 * Record the 'xi' range of device `devid` and derive its conversion
 * parameters: the offset is `min` and the scale is 2^64 / (max - min).
 * No check is made for max == min, in which case the division is by zero.
 */
void
pg2g6nb_set_range_xiMC(int devid, double min, double max)
{
    const double span = max - min;

    Xi_min[devid] = min;
    Xi_max[devid] = max;
    Xi_offset[devid] = min;
    Xi_scale[devid] = pow(2.0, (double)64) / span;
}

/*
 * Fetch the 'xi' range of the first device in use (DeviceIdInUse[0])
 * into *min and *max.
 */
void
pg2g6nb_get_range_xi(double *min, double *max)
{
    const int first = DeviceIdInUse[0];

    pg2g6nb_get_range_xiMC(first, min, max);
}

/*
 * Store the 'xi' range recorded for device `devid` into *min and *max.
 * Both pointers are written unconditionally and must not be NULL.
 */
void
pg2g6nb_get_range_xiMC(int devid, double *min, double *max)
{
    min[0] = Xi_min[devid];
    max[0] = Xi_max[devid];
}

/*
 * Return the 'xi' scale factor of the first device in use
 * (DeviceIdInUse[0]).
 */
inline double
pg2g6nb_get_scale_xi(void)
{
    const int first = DeviceIdInUse[0];

    return pg2g6nb_get_scale_xiMC(first);
}

/*
 * Return the 'xi' scale factor recorded for device `devid`.
 */
inline double
pg2g6nb_get_scale_xiMC(int devid)
{
    const double factor = Xi_scale[devid];

    return factor;
}

/*
 * Return the 'xi' offset of the first device in use
 * (DeviceIdInUse[0]).
 */
inline double
pg2g6nb_get_offset_xi(void)
{
    const int first = DeviceIdInUse[0];

    return pg2g6nb_get_offset_xiMC(first);
}

inline double
pg2g6nb_get_offset_xiMC(int devid)
{
    return Xi_offset[devid];
}

/*
 * scaling utilities for 'vi' of type float28.17:
 */

/* per-device scale factor for 'vi', indexed by device id. */
static double Vi_scale[NHIBMAX];

/*
 * Set the 'vi' scale factor to `scale` on every device in use,
 * i.e. for the first Ndevice entries of DeviceIdInUse.
 */
inline void
pg2g6nb_set_scale_vi(double scale)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_set_scale_viMC(DeviceIdInUse[idx], scale);
        idx++;
    }
}

/*
 * Record `scale` as the 'vi' scale factor of device `devid`.
 * devid is used as an array index without a range check.
 */
inline void
pg2g6nb_set_scale_viMC(int devid, double scale)
{
    double *slot = &Vi_scale[devid];

    *slot = scale;
}

/*
 * Return the 'vi' scale factor of the first device in use
 * (DeviceIdInUse[0]).
 */
inline double
pg2g6nb_get_scale_vi(void)
{
    const int first = DeviceIdInUse[0];

    return pg2g6nb_get_scale_viMC(first);
}

/*
 * Return the 'vi' scale factor recorded for device `devid`.
 */
inline double
pg2g6nb_get_scale_viMC(int devid)
{
    const double factor = Vi_scale[devid];

    return factor;
}

/*
 * scaling utilities for 'epsi2' of type float34.23:
 */

/* per-device scale factor for 'epsi2', indexed by device id. */
static double Epsi2_scale[NHIBMAX];

/*
 * Set the 'epsi2' scale factor to `scale` on every device in use,
 * i.e. for the first Ndevice entries of DeviceIdInUse.
 */
inline void
pg2g6nb_set_scale_epsi2(double scale)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_set_scale_epsi2MC(DeviceIdInUse[idx], scale);
        idx++;
    }
}

/*
 * Record `scale` as the 'epsi2' scale factor of device `devid`.
 * devid is used as an array index without a range check.
 */
inline void
pg2g6nb_set_scale_epsi2MC(int devid, double scale)
{
    double *slot = &Epsi2_scale[devid];

    *slot = scale;
}

/*
 * Return the 'epsi2' scale factor of the first device in use
 * (DeviceIdInUse[0]).
 */
inline double
pg2g6nb_get_scale_epsi2(void)
{
    const int first = DeviceIdInUse[0];

    return pg2g6nb_get_scale_epsi2MC(first);
}

/*
 * Return the 'epsi2' scale factor recorded for device `devid`.
 */
inline double
pg2g6nb_get_scale_epsi2MC(int devid)
{
    const double factor = Epsi2_scale[devid];

    return factor;
}

/*
 * scaling utilities for 'hi2' of type float34.23:
 */

/* per-device scale factor for 'hi2', indexed by device id. */
static double Hi2_scale[NHIBMAX];

/*
 * Set the 'hi2' scale factor to `scale` on every device in use,
 * i.e. for the first Ndevice entries of DeviceIdInUse.
 */
inline void
pg2g6nb_set_scale_hi2(double scale)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_set_scale_hi2MC(DeviceIdInUse[idx], scale);
        idx++;
    }
}

/*
 * Record `scale` as the 'hi2' scale factor of device `devid`.
 * devid is used as an array index without a range check.
 */
inline void
pg2g6nb_set_scale_hi2MC(int devid, double scale)
{
    double *slot = &Hi2_scale[devid];

    *slot = scale;
}

/*
 * Return the 'hi2' scale factor of the first device in use
 * (DeviceIdInUse[0]).
 */
inline double
pg2g6nb_get_scale_hi2(void)
{
    const int first = DeviceIdInUse[0];

    return pg2g6nb_get_scale_hi2MC(first);
}

/*
 * Return the 'hi2' scale factor recorded for device `devid`.
 */
inline double
pg2g6nb_get_scale_hi2MC(int devid)
{
    const double factor = Hi2_scale[devid];

    return factor;
}


/*
 * scaling utilities for 'acc' of type int64:
 */

/* per-device scale factor for 'acc', indexed by device id. */
static double Acc_scale[NHIBMAX];

/*
 * Set the 'acc' scale factor to `scale` on every device in use,
 * i.e. for the first Ndevice entries of DeviceIdInUse.
 */
inline void
pg2g6nb_set_scale_acc(double scale)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_set_scale_accMC(DeviceIdInUse[idx], scale);
        idx++;
    }
}

/*
 * Record `scale` as the 'acc' scale factor of device `devid`.
 * devid is used as an array index without a range check.
 */
inline void
pg2g6nb_set_scale_accMC(int devid, double scale)
{
    double *slot = &Acc_scale[devid];

    *slot = scale;
}

/*
 * Return the 'acc' scale factor of the first device in use
 * (DeviceIdInUse[0]).
 */
inline double
pg2g6nb_get_scale_acc(void)
{
    const int first = DeviceIdInUse[0];

    return pg2g6nb_get_scale_accMC(first);
}

/*
 * Return the 'acc' scale factor recorded for device `devid`.
 */
inline double
pg2g6nb_get_scale_accMC(int devid)
{
    const double factor = Acc_scale[devid];

    return factor;
}

/*
 * scaling utilities for 'pot' of type int64:
 */

/* per-device scale factor for 'pot', indexed by device id. */
static double Pot_scale[NHIBMAX];

/*
 * Set the 'pot' scale factor to `scale` on every device in use,
 * i.e. for the first Ndevice entries of DeviceIdInUse.
 */
inline void
pg2g6nb_set_scale_pot(double scale)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_set_scale_potMC(DeviceIdInUse[idx], scale);
        idx++;
    }
}

/*
 * Record `scale` as the 'pot' scale factor of device `devid`.
 * devid is used as an array index without a range check.
 */
inline void
pg2g6nb_set_scale_potMC(int devid, double scale)
{
    double *slot = &Pot_scale[devid];

    *slot = scale;
}

/*
 * Return the 'pot' scale factor of the first device in use
 * (DeviceIdInUse[0]).
 */
inline double
pg2g6nb_get_scale_pot(void)
{
    const int first = DeviceIdInUse[0];

    return pg2g6nb_get_scale_potMC(first);
}

/*
 * Return the 'pot' scale factor recorded for device `devid`.
 */
inline double
pg2g6nb_get_scale_potMC(int devid)
{
    const double factor = Pot_scale[devid];

    return factor;
}

/*
 * scaling utilities for 'jerk' of type int48:
 */

/* per-device scale factor for 'jerk', indexed by device id. */
static double Jerk_scale[NHIBMAX];

/*
 * Set the 'jerk' scale factor to `scale` on every device in use,
 * i.e. for the first Ndevice entries of DeviceIdInUse.
 */
inline void
pg2g6nb_set_scale_jerk(double scale)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_set_scale_jerkMC(DeviceIdInUse[idx], scale);
        idx++;
    }
}

/*
 * Record `scale` as the 'jerk' scale factor of device `devid`.
 * devid is used as an array index without a range check.
 */
inline void
pg2g6nb_set_scale_jerkMC(int devid, double scale)
{
    double *slot = &Jerk_scale[devid];

    *slot = scale;
}

/*
 * Return the 'jerk' scale factor of the first device in use
 * (DeviceIdInUse[0]).
 */
inline double
pg2g6nb_get_scale_jerk(void)
{
    const int first = DeviceIdInUse[0];

    return pg2g6nb_get_scale_jerkMC(first);
}

/*
 * Return the 'jerk' scale factor recorded for device `devid`.
 */
inline double
pg2g6nb_get_scale_jerkMC(int devid)
{
    const double factor = Jerk_scale[devid];

    return factor;
}

/*
 * scaling utilities for 'nnbr2' of type float34.23:
 */

/* per-device scale factor for 'nnbr2', indexed by device id. */
static double Nnbr2_scale[NHIBMAX];

/*
 * Set the 'nnbr2' scale factor to `scale` on every device in use,
 * i.e. for the first Ndevice entries of DeviceIdInUse.
 */
inline void
pg2g6nb_set_scale_nnbr2(double scale)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_set_scale_nnbr2MC(DeviceIdInUse[idx], scale);
        idx++;
    }
}

/*
 * Record `scale` as the 'nnbr2' scale factor of device `devid`.
 * devid is used as an array index without a range check.
 */
inline void
pg2g6nb_set_scale_nnbr2MC(int devid, double scale)
{
    double *slot = &Nnbr2_scale[devid];

    *slot = scale;
}

/*
 * Return the 'nnbr2' scale factor of the first device in use
 * (DeviceIdInUse[0]).
 */
inline double
pg2g6nb_get_scale_nnbr2(void)
{
    const int first = DeviceIdInUse[0];

    return pg2g6nb_get_scale_nnbr2MC(first);
}

/*
 * Return the 'nnbr2' scale factor recorded for device `devid`.
 */
inline double
pg2g6nb_get_scale_nnbr2MC(int devid)
{
    const double factor = Nnbr2_scale[devid];

    return factor;
}



/*
 * Return the current warning message output level.
 */
int
pg2g6nb_get_warn_level(void)
{
    const int level = warn_level;

    return level;
}

/*
 * Report the devices in use: calls init_envs(), then stores Ndevice in
 * devs->ndevice and copies the first Ndevice entries of DeviceIdInUse
 * into devs->deviceid.
 */
void
pg2g6nb_devices(pg2g6nb_devices_t *devs)
{
    int slot = 0;

    init_envs();
    devs->ndevice = Ndevice;
    while (slot < Ndevice) {
        devs->deviceid[slot] = DeviceIdInUse[slot];
        slot++;
    }
}

/*
 * Fill in the device information of every device in use.
 * `devinfos` is indexed by device id (not by position in DeviceIdInUse),
 * so devinfos[devid] must be a valid pointer for each device in use.
 */
void
pg2g6nb_device_info(pg2g6nb_device_info_t **devinfos)
{
    int idx = 0;

    while (idx < Ndevice) {
        const int devid = DeviceIdInUse[idx];

        pg2g6nb_device_infoMC(devid, devinfos[devid]);
        idx++;
    }
}


/*
 * Read the boardinfo register of device `devid` into devinfo->boardinfo.
 * This can be called even if the device is not open: a device that has no
 * handle yet, or whose handle has a negative fd, is opened for the read
 * and closed again afterwards.
 */
void
pg2g6nb_device_infoMC(int devid, pg2g6nb_device_info_t *devinfo)
{
    const int was_closed = (!H[devid] || H[devid]->fd < 0); // hib not open.

    if (was_closed) {
        H[devid] = hib_openMC(devid);
    }
    devinfo->boardinfo = hib_mem_readMC(devid, H[devid]->r->boardinfo);
    if (was_closed) {
        hib_closeMC(devid);
    }
}

/*
 * Open every device in use: calls init_envs(), then pg2g6nb_openMC()
 * for the first Ndevice entries of DeviceIdInUse.
 */
void
pg2g6nb_open(void)
{
    int idx = 0;

    init_envs();
    while (idx < Ndevice) {
        pg2g6nb_openMC(DeviceIdInUse[idx]);
        idx++;
    }
}

/*
 * Open device `devid` and prepare it for use.
 *
 * On the first open of a device (detected by Npipes[devid] == 0) the board
 * information is read, the packet address tags are assigned, and the I/O
 * buffer attributes and scale factors are initialized. On every open the
 * send method and the read/write buffers are selected, and the IP and FO
 * registers are written.
 *
 * NOTE(review): the handle returned by hib_openMC() is dereferenced without
 * a NULL check -- confirm that hib_openMC() never returns NULL.
 */
void
pg2g6nb_openMC(int devid)
{
    init_envs();
    H[devid] = hib_openMC(devid);
    if (Npipes[devid] == 0) { // open for the first time.
        init_boardinfoMC(devid);
        Ipaddr[devid]    = 0x00000000; // ip packet
        Jpaddr[devid]    = 0x40000000; // jp packet
        Calcaddr[devid]  = 0x80000000; // calc command
        Foregaddr[devid] = 0xe0000000; // fo register
        Ipregaddr[devid] = 0xc0000000; // ip register
        Coeffaddr[devid] = 0xd0000000; // coefficient register

        recalculate_iobuf_attributesMC(devid);
        initialize_scale_factorMC(devid);
    }

    WARN(3, "Sendfunc: %s\n", Sendfunc == SENDFUNC_DMAR ? "DMA read" : "PIO write");
    if (Sendfunc == SENDFUNC_PIOW) {
        /* using the DMA read buffer as PIO write buffer would degrade
         * performance, since that buffer is marked up as 'non-cached'.
         * a buffer in ordinary main memory is used instead.
         */
        hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
        Rbuf[devid] = Pbuf[devid];
    }
    else {
        hib_set_sendfuncMC(devid, SENDFUNC_DMAR);
        Rbuf[devid] = (UINT32 *)(H[devid]->dmar_buf);
    }

    Wbuf[devid] = (UINT32 *)(H[devid]->dmaw_buf);

    set_regMC(devid, Ipregaddr[devid], Ipsize[devid]<<16 | Npipes[devid]);
    set_regMC(devid, Foregaddr[devid], Fosize[devid]<<16 | Npipes[devid]);

    WARN(3, "pg2g6nb[%d] opened.\n", devid);
}

/*
 * Close every device in use, in DeviceIdInUse order.
 */
void
pg2g6nb_close(void)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_closeMC(DeviceIdInUse[idx]);
        idx++;
    }
}

/*
 * Close device `devid`. This is a thin wrapper around hib_closeMC();
 * no state kept by this file is reset.
 */
void
pg2g6nb_closeMC(int devid)
{
    const int target = devid;

    hib_closeMC(target);
}


/*
 * Send the coefficients `ti` and `etainv` to every device in use.
 */
void
pg2g6nb_set_coeff(double ti, double etainv)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_set_coeffMC(DeviceIdInUse[idx], ti, etainv);
        idx++;
    }
}

void
pg2g6nb_set_coeffMC(int devid, double ti, double etainv)
{
    int i, k, nword, nword0;
    UINT64  iti, ietainv;

    iti = convert_ti(devid, ti);
    ietainv = convert_etainv(devid, etainv);


    /*
     * pack COEFFs.
     */
    nword = 0;
    Rbuf[devid][nword++] = Coeffaddr[devid];

    // iti
    Rbuf[devid][nword]  = (0x00ffffff & (iti)); // iti[23..0]
    Rbuf[devid][nword] |= (0 << 24); // address 0
    nword++;
    PAD_DUMMY_DATA(nword);
    hib_sendMC(devid, (nword+1)/2, (UINT64*)Rbuf[devid]);

    nword = 0;
    Rbuf[devid][nword++] = Coeffaddr[devid];
    Rbuf[devid][nword]  = (0x00ffffff & (iti >> 24)); // iti[47..24]
    Rbuf[devid][nword] |= (1 << 24); // address 1
    nword++;
    PAD_DUMMY_DATA(nword);
    hib_sendMC(devid, (nword+1)/2, (UINT64*)Rbuf[devid]);

    nword = 0;
    Rbuf[devid][nword++] = Coeffaddr[devid];
    Rbuf[devid][nword]  = (0x00007fff & (iti >> 48)); // iti[62..48]

    // ietainv
    Rbuf[devid][nword] |= (0x000001ff & (ietainv))<< 15; // ietainv[8..0]
    Rbuf[devid][nword] |= (2 << 24); // address 2
    nword++;
    PAD_DUMMY_DATA(nword);
    hib_sendMC(devid, (nword+1)/2, (UINT64*)Rbuf[devid]);

    nword = 0;
    Rbuf[devid][nword++] = Coeffaddr[devid];
    Rbuf[devid][nword]  = (0x00007fff & (ietainv >> 9)); // ietainv[23..9]
    Rbuf[devid][nword] |= (3 << 24); // address 3
    nword++;
    PAD_DUMMY_DATA(nword);
    hib_sendMC(devid, (nword+1)/2, (UINT64*)Rbuf[devid]);


}


/*
 * Distribute `nj` j-particles over the devices in use and send them.
 *
 * Each device receives a contiguous slice of ceil(nj / Ndevice) particles
 * (the last non-empty slice may be shorter, and later devices may receive
 * none). Every slice is written starting at the same device-side address
 * `adr`. All particle arrays must hold at least nj entries.
 *
 * Does nothing when no device is in use; this also avoids a division by
 * zero in the slice-size computation.
 */
void
pg2g6nb_set_jp(int adr, int nj, double *mj, double (*xj)[3], double (*vj)[3], double *tj, double (*acc0by2)[3], double (*jerk0by6)[3], int *indexj)
{
    int ic;
    int j0, njj;

    if (Ndevice <= 0) {
        return;
    }

    j0 = 0;                              // index of the first particle of the current slice.
    njj = (nj + Ndevice - 1) / Ndevice;  // slice size, rounded up.

    for (ic = 0; ic < Ndevice; ic++) {
        if (nj < j0 + njj) {
            njj = nj - j0; // clip the slice to the particles that remain.
        }
        pg2g6nb_set_jpMC(DeviceIdInUse[ic], adr, njj,
                         mj + j0, xj + j0, vj + j0, tj + j0,
                         acc0by2 + j0, jerk0by6 + j0, indexj + j0);
        j0 += njj;
    }
}

void
pg2g6nb_set_jpMC(int devid, int adr, int nj, double *mj, double (*xj)[3], double (*vj)[3], double *tj, double (*acc0by2)[3], double (*jerk0by6)[3], int *indexj)
{
    int nword, nword0;
    int nempty_cycle = 4;
    int nword_almost_overflow = Njpwordmax[devid] - (2 * 2 + nempty_cycle * 2 + Jpsize[devid] * 2);
                                                 // (packet header + empty cycle) * 2 + JP size in 32-bitword.
    int jsent, jbuffered, jindex;
    int ic, cid, nnj, i, k;
    UINT64  imj, ixj[3], ivj[3], itj, iacc0by2[3], ijerk0by6[3], iindexj;

    if (nj <= 0) return;
    nnj = (nj-1)/Nchip[devid] + 1;

    jsent = 0;     // number of JPs sent.
    jbuffered = 0; // number of JPs packed to the DMA buffer.
    nword = 0;     // number of data words packed to the DMA buffer.

    for (ic = 0; ic < Nchip[devid]; ic++) {

        jindex = 0; // index uniquely assigned to each JP in a chip.

        // IP reg packet:
        switch (DeviceModelId[devid]) {
          case 2: // chipid of model600 & 300d starts from 1.
          case 6:
            cid = ic + 1;
            break;
          case 3: // that of model300 starts from 4.
            cid = ic + 4;
            break;
        }
        switch (DeviceModelId[devid]) {
          case 2:
          case 3:
          case 6:
            for (i = 0; i < nempty_cycle; i++) {
                Rbuf[devid][nword++] = 0;
            }
            Rbuf[devid][nword++] = Ipregaddr[devid];
            Rbuf[devid][nword++] = cid<<28 | Ipsize[devid]<<16 | Npipes[devid]; // write chip id to IP reg.
            for (i = 0; i < nempty_cycle; i++) {
                Rbuf[devid][nword++] = 0;
            }
            break;
          default:
            // nothing to do for model100 & 800.
            break;
        }

        // JP packet tag:
        Rbuf[devid][nword++] = Jpaddr[devid];
        Rbuf[devid][nword++] = ((long long int)(adr) << 16) | Jpsize[devid] * nnj;

        // JP packet body:
        while (jindex < nnj) {

            // convert numerical format.
            if (jsent < nj) {
                imj = convert_mj(devid, mj[jsent]);
                for (k = 0; k < 3; k++) {
                    ixj[k] = convert_xj(devid, xj[jsent][k]);
                }
                for (k = 0; k < 3; k++) {
                    ivj[k] = convert_vj(devid, vj[jsent][k]);
                }
                itj = convert_tj(devid, tj[jsent]);
                for (k = 0; k < 3; k++) {
                    iacc0by2[k] = convert_acc0by2(devid, acc0by2[jsent][k]);
                }
                for (k = 0; k < 3; k++) {
                    ijerk0by6[k] = convert_jerk0by6(devid, jerk0by6[jsent][k]);
                }
                iindexj = convert_indexj(devid, indexj[jsent]);

            }
            else { // clear garbage in the memory of the last pFPGA.
                imj = 0;
                for (k = 0; k < 3; k++) {
                    ixj[k] = 0;
                }
                for (k = 0; k < 3; k++) {
                    ivj[k] = 0;
                }
                itj = 0;
                for (k = 0; k < 3; k++) {
                    iacc0by2[k] = 0;
                }
                for (k = 0; k < 3; k++) {
                    ijerk0by6[k] = 0;
                }
                iindexj = 0;

            }

            /*
             * pack a JP.
             */
            int nword0 = nword;

            // imj
            Rbuf[devid][nword]  = (0xffffffff & (imj)); // imj[31..0]
            nword++;
            Rbuf[devid][nword]  = (0x00000003 & (imj >> 32)); // imj[33..32]

            // ixj[0]
            Rbuf[devid][nword] |= (0x3fffffff & (ixj[0]))<< 2; // ixj[0][29..0]
            nword++;
            Rbuf[devid][nword]  = (0xffffffff & (ixj[0] >> 30)); // ixj[0][61..30]
            nword++;
            Rbuf[devid][nword]  = (0x00000003 & (ixj[0] >> 62)); // ixj[0][63..62]

            // ixj[1]
            Rbuf[devid][nword] |= (0x3fffffff & (ixj[1]))<< 2; // ixj[1][29..0]
            nword++;
            Rbuf[devid][nword]  = (0xffffffff & (ixj[1] >> 30)); // ixj[1][61..30]
            nword++;
            Rbuf[devid][nword]  = (0x00000003 & (ixj[1] >> 62)); // ixj[1][63..62]

            // ixj[2]
            Rbuf[devid][nword] |= (0x3fffffff & (ixj[2]))<< 2; // ixj[2][29..0]
            nword++;
            Rbuf[devid][nword]  = (0xffffffff & (ixj[2] >> 30)); // ixj[2][61..30]
            nword++;
            Rbuf[devid][nword]  = (0x00000003 & (ixj[2] >> 62)); // ixj[2][63..62]

            // ivj[0]
            Rbuf[devid][nword] |= (0x0fffffff & (ivj[0]))<< 2; // ivj[0][27..0]

            // ivj[1]
            Rbuf[devid][nword] |= (0x00000003 & (ivj[1]))<< 30; // ivj[1][1..0]
            nword++;
            Rbuf[devid][nword]  = (0x03ffffff & (ivj[1] >> 2)); // ivj[1][27..2]

            // ivj[2]
            Rbuf[devid][nword] |= (0x0000003f & (ivj[2]))<< 26; // ivj[2][5..0]
            nword++;
            Rbuf[devid][nword]  = (0x003fffff & (ivj[2] >> 6)); // ivj[2][27..6]

            // itj
            Rbuf[devid][nword] |= (0x000003ff & (itj))<< 22; // itj[9..0]
            nword++;
            Rbuf[devid][nword]  = (0xffffffff & (itj >> 10)); // itj[41..10]
            nword++;
            Rbuf[devid][nword]  = (0x001fffff & (itj >> 42)); // itj[62..42]

            // iacc0by2[0]
            Rbuf[devid][nword] |= (0x000007ff & (iacc0by2[0]))<< 21; // iacc0by2[0][10..0]
            nword++;
            Rbuf[devid][nword]  = (0x0001ffff & (iacc0by2[0] >> 11)); // iacc0by2[0][27..11]

            // iacc0by2[1]
            Rbuf[devid][nword] |= (0x00007fff & (iacc0by2[1]))<< 17; // iacc0by2[1][14..0]
            nword++;
            Rbuf[devid][nword]  = (0x00001fff & (iacc0by2[1] >> 15)); // iacc0by2[1][27..15]

            // iacc0by2[2]
            Rbuf[devid][nword] |= (0x0007ffff & (iacc0by2[2]))<< 13; // iacc0by2[2][18..0]
            nword++;
            Rbuf[devid][nword]  = (0x000001ff & (iacc0by2[2] >> 19)); // iacc0by2[2][27..19]

            // ijerk0by6[0]
            Rbuf[devid][nword] |= (0x007fffff & (ijerk0by6[0]))<< 9; // ijerk0by6[0][22..0]
            nword++;
            Rbuf[devid][nword]  = (0x0000001f & (ijerk0by6[0] >> 23)); // ijerk0by6[0][27..23]

            // ijerk0by6[1]
            Rbuf[devid][nword] |= (0x07ffffff & (ijerk0by6[1]))<< 5; // ijerk0by6[1][26..0]
            nword++;
            Rbuf[devid][nword]  = (0x00000001 & (ijerk0by6[1] >> 27)); // ijerk0by6[1][27..27]

            // ijerk0by6[2]
            Rbuf[devid][nword] |= (0x0fffffff & (ijerk0by6[2]))<< 1; // ijerk0by6[2][27..0]

            // iindexj
            Rbuf[devid][nword] |= (0x00000007 & (iindexj))<< 29; // iindexj[2..0]
            nword++;
            Rbuf[devid][nword]  = (0x1fffffff & (iindexj >> 3)); // iindexj[31..3]
            nword = nword0 + 2 * Jpsize[devid];

#if 0
            Rbuf[devid][nword++] = (0xfff & (jindex+1))<<17 | (0x1ffff & mj); // offset jindex by 1.
#endif


            jsent++;                // reset when this function begins.
            jbuffered++;            // reset when Rbuf is flushed.
            jindex++;               // reset when ic is incremented.

            // DMA buffer is full. flush to the HIB.
            if (nword >= nword_almost_overflow) {
                PAD_DUMMY_DATA(nword);
                hib_sendMC(devid, (nword+1)/2, (UINT64*)Rbuf[devid]);
                jbuffered = 0;
                nword = 0;
            }

        } // nnj loop

    } // ic loop

    // flush data remaining in the buffer.
    if (nword > 0) {
        PAD_DUMMY_DATA(nword);
        hib_sendMC(devid, (nword+1)/2, (UINT64*)Rbuf[devid]);

#if 0
	{
	    int i;
	    fprintf(stderr, "JP\n");
	    for (i = 0; i < nword; i++) {
		fprintf(stderr, "Rbuf[%d][%2d]:0x%08x\n",
			devid, i, Rbuf[devid][i]);
	    }
	    fprintf(stderr, "\n");
	}
#endif

        jbuffered = 0;
        nword = 0;
    }

    // write chip id 0 (measns broadcast) to IP reg.
    switch (DeviceModelId[devid]) {
      case 2:
      case 3:
      case 6:
	set_regMC(devid, Ipregaddr[devid], 0<<28 | Ipsize[devid]<<16 | Npipes[devid]);
        break;
      default:
        // nothing to do for model100 & 800.
        break;
    }

}

/*
 * Send the same `ni` i-particles to every device in use.
 */
void
pg2g6nb_set_ip(int ni, double (*xi)[3], double (*vi)[3], double *epsi2, int *indexi, double *hi2)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_set_ipMC(DeviceIdInUse[idx], ni, xi, vi, epsi2, indexi, hi2);
        idx++;
    }
}

void
pg2g6nb_set_ipMC(int devid, int ni, double (*xi)[3], double (*vi)[3], double *epsi2, int *indexi, double *hi2)
{
    int i, k, nword, nword0;
    UINT64  ixi[3], ivi[3], iepsi2, iindexi, ihi2;

    if (ni <= 0) return;

    if (Nbodies[devid] == 0) return;

    if (ni > Nipmax[devid]) {
	fprintf(stderr, "pg2g6nb_set_ip: too large ni (%d). "
                "should not be larger than %d. abort.\n", ni, Nipmax[devid]);
	exit(1);
    }
    Ni[devid] = ni;

    nword = 0;
    Rbuf[devid][nword++] = Ipaddr[devid];
    Rbuf[devid][nword++] = Ipsize[devid] * ni;

    for (i = 0; i < ni; i++) {
        for (k = 0; k < 3; k++) {
            ixi[k] = convert_xi(devid, xi[i][k]);
        }
        for (k = 0; k < 3; k++) {
            ivi[k] = convert_vi(devid, vi[i][k]);
        }
        iepsi2 = convert_epsi2(devid, epsi2[i]);
        iindexi = convert_indexi(devid, indexi[i]);
        ihi2 = convert_hi2(devid, hi2[i]);

        /*
         * pack an IP.
         */
	int nword0 = nword;

        // ixi[0]
        Rbuf[devid][nword]  = (0xffffffff & (ixi[0])); // ixi[0][31..0]
        nword++;
        Rbuf[devid][nword]  = (0xffffffff & (ixi[0] >> 32)); // ixi[0][63..32]
        nword++;

        // ixi[1]
        Rbuf[devid][nword]  = (0xffffffff & (ixi[1])); // ixi[1][31..0]
        nword++;
        Rbuf[devid][nword]  = (0xffffffff & (ixi[1] >> 32)); // ixi[1][63..32]
        nword++;

        // ixi[2]
        Rbuf[devid][nword]  = (0xffffffff & (ixi[2])); // ixi[2][31..0]
        nword++;
        Rbuf[devid][nword]  = (0xffffffff & (ixi[2] >> 32)); // ixi[2][63..32]
        nword++;

        // ivi[0]
        Rbuf[devid][nword]  = (0x0fffffff & (ivi[0])); // ivi[0][27..0]

        // ivi[1]
        Rbuf[devid][nword] |= (0x0000000f & (ivi[1]))<< 28; // ivi[1][3..0]
        nword++;
        Rbuf[devid][nword]  = (0x00ffffff & (ivi[1] >> 4)); // ivi[1][27..4]

        // ivi[2]
        Rbuf[devid][nword] |= (0x000000ff & (ivi[2]))<< 24; // ivi[2][7..0]
        nword++;
        Rbuf[devid][nword]  = (0x000fffff & (ivi[2] >> 8)); // ivi[2][27..8]

        // iepsi2
        Rbuf[devid][nword] |= (0x00000fff & (iepsi2))<< 20; // iepsi2[11..0]
        nword++;
        Rbuf[devid][nword]  = (0x003fffff & (iepsi2 >> 12)); // iepsi2[33..12]

        // iindexi
        Rbuf[devid][nword] |= (0x000003ff & (iindexi))<< 22; // iindexi[9..0]
        nword++;
        Rbuf[devid][nword]  = (0x003fffff & (iindexi >> 10)); // iindexi[31..10]

        // ihi2
        Rbuf[devid][nword] |= (0x000003ff & (ihi2))<< 22; // ihi2[9..0]
        nword++;
        Rbuf[devid][nword]  = (0x00ffffff & (ihi2 >> 10)); // ihi2[33..10]
        nword = nword0 + 2 * Ipsize[devid];

    }
    PAD_DUMMY_DATA(nword);
    hib_sendMC(devid, (nword+1)/2, (UINT64*)Rbuf[devid]);

#if 0
    {
	int i;
	fprintf(stderr, "IP\n");
	for (i = 0; i < nword; i++) {
	    fprintf(stderr, "Rbuf[%d][%2d]:0x%08x\n",
		    devid, i, Rbuf[devid][i]);
	}
	fprintf(stderr, "\n");
    }
#endif
}

/*
 * start the calculation on every device listed in DeviceIdInUse[].
 * simply forwards to pg2g6nb_runMC() once per device, in list order.
 */
void
pg2g6nb_run(void)
{
    int idx = 0;

    while (idx < Ndevice) {
        pg2g6nb_runMC(DeviceIdInUse[idx]);
        idx++;
    }
}

/*
 * start the calculation on device 'devid'.
 *
 * does nothing when no IP has been set (Ni[devid] <= 0) or when
 * no JP is assigned to the device (Nbodies[devid] == 0).
 * otherwise a two-word command is sent and the DMA write that
 * brings the results back into Wbuf[devid] is started.
 * this function does not wait for the DMA write to complete.
 */
void
pg2g6nb_runMC(int devid)
{
    const int ni = Ni[devid];
    int nsend;
    int nrecv;

    if (ni <= 0 || Nbodies[devid] == 0) {
        return;
    }

    /*
     * command packet:
     *   word 0: Calcaddr[devid]
     *   word 1: (Ipsize * Npipes) in bits 31..16, Nbodies in the lower bits.
     */
    nsend = 0;
    Rbuf[devid][nsend] = Calcaddr[devid];
    nsend++;
    Rbuf[devid][nsend] = ((Ipsize[devid] * Npipes[devid]) << 16) | Nbodies[devid];
    nsend++;
    PAD_DUMMY_DATA(nsend);
    /* size is given in 64-bit words: half the 32-bit word count, rounded up. */
    hib_sendMC(devid, (nsend+1)/2, (UINT64*)Rbuf[devid]);

    /*
     * kick off the DMA write.
     * ni is rounded up to a multiple of Npipes[devid] first, and then
     * scaled to the number of 32-bit words occupied by that many FOs.
     */
    nrecv = ((ni-1)/Npipes[devid] + 1) * Npipes[devid];
    nrecv = sizeof(long long)/sizeof(int) * Fosize[devid] * nrecv;
    hib_start_dmawMC(devid, (nrecv+1)/2, (UINT64*)Wbuf[devid]);
}

/*
 * distribute 'nj' bodies over all devices in use.
 *
 * each device is given ceil(nj / Ndevice) bodies until fewer than that
 * remain; from then on the (smaller) remainder is used as the share.
 * the share of each device is registered with pg2g6nb_set_nMC().
 */
void
pg2g6nb_set_n(int nj)
{
    int idx;
    int remaining = nj;
    int share = (nj + Ndevice - 1) / Ndevice;

    for (idx = 0; idx < Ndevice; idx++) {
        if (remaining < share) {
            share = remaining;
        }
        pg2g6nb_set_nMC(DeviceIdInUse[idx], share);
        remaining -= share;
    }
}

/*
 * register the number of bodies 'n' handled by device 'devid'.
 *
 * Nbodies[devid] is set to ceil(n / Nchip[devid]).
 *
 * n <= 0 is mapped to 0 explicitly. the expression (n-1)/Nchip + 1
 * is not a valid ceiling for n == 0: integer division truncates
 * toward zero, so with Nchip > 1 it would give 1 instead of 0, and
 * the "Nbodies[devid] == 0" checks in pg2g6nb_runMC() and
 * unpack_foutMC() would no longer recognize an idle device.
 */
void
pg2g6nb_set_nMC(int devid, int n)
{
    if (n <= 0) {
        Nbodies[devid] = 0;
        return;
    }
    Nbodies[devid] = (n-1)/Nchip[devid] + 1;
}

/*
 * retrieve the results for 'ni' IPs from all devices in use.
 *
 * acc, pot, jerk:
 *     cleared first, then the converted values of every device are
 *     accumulated into entry [i].
 * nblist_ovflw, nblist, nnb_ovflw, nnb, nnbr2_ovflw, nnbr2:
 *     not accumulated. the values from the ic-th device in
 *     DeviceIdInUse[] are stored at entry [ni * ic + i], so these
 *     arrays are written up to index ni * Ndevice - 1.
 *
 * every convert_*() call is given the device id the raw data came
 * from (DeviceIdInUse[ic]), in the same way pg2g6nb_get_foutMC() does.
 * passing the list position 'ic' instead is only correct when
 * DeviceIdInUse[ic] == ic, which does not hold if a subset of the
 * devices is selected.
 */
void
pg2g6nb_get_fout(int ni, double (*acc)[3], double *pot, double (*jerk)[3], int *nblist_ovflw, int (*nblist)[32], int *nnb_ovflw, int *nnb, int *nnbr2_ovflw, double *nnbr2)
{
#if 0 // a run on multiple card returns the same result with
      // that on a single card. works only for fout of type int.

    /*
     * 1) retrieve 'fout's from all cards.
     * 2) sums them up.
     * 3) convert their numerical format. for the conversion,
     *    the scale factor of the first card is applied to all 'fout's.
     */
    static UINT64 iacc[NFOMAX][3];
    static UINT64 ipot[NFOMAX];
    static UINT64 ijerk[NFOMAX][3];
    static UINT64 inblist_ovflw[NFOMAX * NHIBMAX];
    static UINT64 inblist[NFOMAX * NHIBMAX][32];
    static UINT64 innb_ovflw[NFOMAX * NHIBMAX];
    static UINT64 innb[NFOMAX * NHIBMAX];
    static UINT64 innbr2_ovflw[NFOMAX * NHIBMAX];
    static UINT64 innbr2[NFOMAX * NHIBMAX];

    int ic, i, k, devid;

    for (i = 0; i < ni; i++) {
        for (k = 0; k < 3; k++) {
            acc[i][k] = 0;
        }
        pot[i] = 0;
        for (k = 0; k < 3; k++) {
            jerk[i][k] = 0;
        }

    }
    for (ic = 0; ic < Ndevice; ic++) {
        unpack_foutMC(DeviceIdInUse[ic], ni, iacc, ipot, ijerk, inblist_ovflw, inblist, innb_ovflw, innb, innbr2_ovflw, innbr2); // copy from DMAW buf to each array.
        for (i = 0; i < ni; i++) {
            for (k = 0; k < 3; k++) {
                iacc_sum[i][k] += iacc[i][k];
            }
            ipot_sum[i] += ipot[i];
            for (k = 0; k < 3; k++) {
                ijerk_sum[i][k] += ijerk[i][k];
            }
            inblist_ovflw_sum[i] += inblist_ovflw[i];
            for (k = 0; k < 32; k++) {
                inblist_sum[i][k] += inblist[i][k];
            }
            innb_ovflw_sum[i] += innb_ovflw[i];
            innb_sum[i] += innb[i];
            innbr2_ovflw_sum[i] += innbr2_ovflw[i];
            innbr2_sum[i] += innbr2[i];

        }
    }
    devid = DeviceIdInUse[0]; // device id of the first card in use.

    // convert numerical format.
    for (i = 0; i < ni; i++) {
        for (k = 0; k < 3; k++) {
            acc[i][k] = convert_acc(devid, iacc_sum[i][k]);
        }
        pot[i] = convert_pot(devid, ipot_sum[i]);
        for (k = 0; k < 3; k++) {
            jerk[i][k] = convert_jerk(devid, ijerk_sum[i][k]);
        }
        nblist_ovflw[i] = convert_nblist_ovflw(devid, inblist_ovflw_sum[i]);
        for (k = 0; k < 32; k++) {
            nblist[i][k] = convert_nblist(devid, inblist_sum[i][k]);
        }
        nnb_ovflw[i] = convert_nnb_ovflw(devid, innb_ovflw_sum[i]);
        nnb[i] = convert_nnb(devid, innb_sum[i]);
        nnbr2_ovflw[i] = convert_nnbr2_ovflw(devid, innbr2_ovflw_sum[i]);
        nnbr2[i] = convert_nnbr2(devid, innbr2_sum[i]);

    }

#else // result of a run on multiple cards and 
      // that on a single card is not exactly the same.
      // works fout of type float as well as int.

    int ic, i, k;
    static UINT64 iacc[NFOMAX][3];
    static UINT64 ipot[NFOMAX];
    static UINT64 ijerk[NFOMAX][3];
    static UINT64 inblist_ovflw[NFOMAX * NHIBMAX];
    static UINT64 inblist[NFOMAX * NHIBMAX][32];
    static UINT64 innb_ovflw[NFOMAX * NHIBMAX];
    static UINT64 innb[NFOMAX * NHIBMAX];
    static UINT64 innbr2_ovflw[NFOMAX * NHIBMAX];
    static UINT64 innbr2[NFOMAX * NHIBMAX];

    // clear the accumulated outputs.
    for (i = 0; i < ni; i++) {
        for (k = 0; k < 3; k++) {
            acc[i][k] = 0;
        }
        pot[i] = 0;
        for (k = 0; k < 3; k++) {
            jerk[i][k] = 0;
        }

    }

    for (ic = 0; ic < Ndevice; ic++) {
        // id of the device the raw data is read from. it must also be
        // the one handed to convert_*(); 'ic' is only a list position.
        const int devid = DeviceIdInUse[ic];

        unpack_foutMC(devid, ni, iacc, ipot, ijerk, inblist_ovflw, inblist, innb_ovflw, innb, innbr2_ovflw, innbr2); // copy from DMAW buf to each array.
        for (i = 0; i < ni; i++) {
            for (k = 0; k < 3; k++) {
                acc[i][k] += convert_acc(devid, iacc[i][k]);
            }
            pot[i] += convert_pot(devid, ipot[i]);
            for (k = 0; k < 3; k++) {
                jerk[i][k] += convert_jerk(devid, ijerk[i][k]);
            }
            // per-device outputs: one slice of 'ni' entries per list position.
            nblist_ovflw[ni * ic + i] = convert_nblist_ovflw(devid, inblist_ovflw[i]);
            for (k = 0; k < 32; k++) {
                nblist[ni * ic + i][k] = convert_nblist(devid, inblist[i][k]);
            }
            nnb_ovflw[ni * ic + i] = convert_nnb_ovflw(devid, innb_ovflw[i]);
            nnb[ni * ic + i] = convert_nnb(devid, innb[i]);
            nnbr2_ovflw[ni * ic + i] = convert_nnbr2_ovflw(devid, innbr2_ovflw[i]);
            nnbr2[ni * ic + i] = convert_nnbr2(devid, innbr2[i]);

        }
    }

#endif

}

/*
 * retrieve the results for 'ni' IPs from device 'devid'.
 *
 * returns immediately if ni <= 0. aborts the program (exit(1)) if
 * ni exceeds Nfomax[devid]. otherwise the raw values are unpacked
 * by unpack_foutMC() and each one is converted with the matching
 * convert_*() function into the caller's arrays.
 */
void
pg2g6nb_get_foutMC(int devid, int ni, double (*acc)[3], double *pot, double (*jerk)[3], int *nblist_ovflw, int (*nblist)[32], int *nnb_ovflw, int *nnb, int *nnbr2_ovflw, double *nnbr2)
{
    /* raw (unconverted) values. static, so they live outside the stack. */
    static UINT64 raw_acc[NFOMAX][3];
    static UINT64 raw_pot[NFOMAX];
    static UINT64 raw_jerk[NFOMAX][3];
    static UINT64 raw_nblist_ovflw[NFOMAX * NHIBMAX];
    static UINT64 raw_nblist[NFOMAX * NHIBMAX][32];
    static UINT64 raw_nnb_ovflw[NFOMAX * NHIBMAX];
    static UINT64 raw_nnb[NFOMAX * NHIBMAX];
    static UINT64 raw_nnbr2_ovflw[NFOMAX * NHIBMAX];
    static UINT64 raw_nnbr2[NFOMAX * NHIBMAX];
    int ip;
    int c;

    if (ni <= 0) {
        return;
    }
    if (ni > Nfomax[devid]) {
        fprintf(stderr, "pg2g6nb_get_foutMC: too large ni (%d). abort.\n", ni);
        exit(1);
    }

    /* copy from the DMA write buffer to the raw arrays. */
    unpack_foutMC(devid, ni, raw_acc, raw_pot, raw_jerk, raw_nblist_ovflw, raw_nblist, raw_nnb_ovflw, raw_nnb, raw_nnbr2_ovflw, raw_nnbr2);

    /*
     * convert numerical format, one IP at a time.
     * the order of the convert_*() calls is kept as is.
     */
    for (ip = 0; ip < ni; ip++) {
        for (c = 0; c < 3; c++) {
            acc[ip][c] = convert_acc(devid, raw_acc[ip][c]);
        }

        pot[ip] = convert_pot(devid, raw_pot[ip]);

        for (c = 0; c < 3; c++) {
            jerk[ip][c] = convert_jerk(devid, raw_jerk[ip][c]);
        }

        nblist_ovflw[ip] = convert_nblist_ovflw(devid, raw_nblist_ovflw[ip]);
        for (c = 0; c < 32; c++) {
            nblist[ip][c] = convert_nblist(devid, raw_nblist[ip][c]);
        }

        nnb_ovflw[ip] = convert_nnb_ovflw(devid, raw_nnb_ovflw[ip]);
        nnb[ip] = convert_nnb(devid, raw_nnb[ip]);

        nnbr2_ovflw[ip] = convert_nnbr2_ovflw(devid, raw_nnbr2_ovflw[ip]);
        nnbr2[ip] = convert_nnbr2(devid, raw_nnbr2[ip]);
    }
}

/*
 * sign-extend a 'width'-bit two's complement value held in the low
 * bits of 'v' to 64 bits: if bit (width-1) is set, all bits from
 * 'width' upward are filled with '1's.
 */
static inline UINT64
unpack_fout_sign_extend(UINT64 v, int width)
{
    if (v & ((UINT64)1 << (width - 1))) {
        v |= ~(((UINT64)1 << width) - 1);
    }
    return v;
}

/*
 * unpack 'ni' FOs of device 'devid' from Wbuf[devid] into raw
 * (unconverted) 64-bit values.
 *
 * if Nbodies[devid] == 0, no calculation has been done and Wbuf
 * holds no valid data: all outputs are cleared, Nretrieved[devid]
 * is set to 0 and the DMA write is not waited for.
 * otherwise the function waits for the DMA write to complete,
 * unpacks the data and sets Nretrieved[devid] to ni.
 *
 * layout of one FO, in 32-bit words relative to its first word
 * (consecutive FOs are 2 * Fosize[devid] words apart):
 *
 *   w[0..5]   iacc[0..2]     64 bits each, low word first
 *   w[6..7]   ipot           64 bits, low word first
 *   w[8]      ijerk[0][31..0]
 *   w[9]      ijerk[0][47..32] (bits 15..0), ijerk[1][15..0] (bits 31..16)
 *   w[10]     ijerk[1][47..16]
 *   w[11]     ijerk[2][31..0]
 *   w[12]     ijerk[2][47..32] (bits 15..0), inblist_ovflw (bit 16),
 *             inblist[0][14..0] (bits 31..17)
 *   w[13+k]   inblist[k][31..15] (bits 16..0),
 *             inblist[k+1][14..0] (bits 31..17)          for k = 0..30
 *   w[44]     inblist[31][31..15] (bits 16..0), innb_ovflw (bit 17),
 *             innb[13..0] (bits 31..18)
 *   w[45]     innb[31..14] (bits 17..0), innbr2_ovflw (bit 18),
 *             innbr2[12..0] (bits 31..19)
 *   w[46]     innbr2[33..13] (bits 20..0)
 *
 * every field except iacc and ipot is sign-extended to 64 bits
 * from its own width (48, 32, 34 or 1 bit).
 */
static void
unpack_foutMC(int devid, int ni, UINT64 (*iacc)[3], UINT64 *ipot, UINT64 (*ijerk)[3], UINT64 *inblist_ovflw, UINT64 (*inblist)[32], UINT64 *innb_ovflw, UINT64 *innb, UINT64 *innbr2_ovflw, UINT64 *innbr2)
{
    int ip;   /* index of the FO being handled */
    int c;    /* component index within a field */
    int base; /* offset of the current FO in Wbuf[devid], in 32-bit words */

    if (Nbodies[devid] == 0) { /* no calculation done, and no valid data in Wbuf. */
        for (ip = 0; ip < ni; ip++) {
            for (c = 0; c < 3; c++) {
                iacc[ip][c] = 0;
                ijerk[ip][c] = 0;
            }
            ipot[ip] = 0;
            inblist_ovflw[ip] = 0;
            for (c = 0; c < 32; c++) {
                inblist[ip][c] = 0;
            }
            innb_ovflw[ip] = 0;
            innb[ip] = 0;
            innbr2_ovflw[ip] = 0;
            innbr2[ip] = 0;
        }
        Nretrieved[devid] = 0;
        return;
    }

    /* wait DMA write completion */
    hib_finish_dmawMC(devid);

    for (ip = 0, base = 0; ip < ni; ip++, base += 2 * Fosize[devid]) {
        const UINT32 *w = Wbuf[devid] + base; /* first word of this FO */
        UINT64 v;

        /* iacc[ip][0..2]: two full words each. */
        for (c = 0; c < 3; c++) {
            iacc[ip][c] = (UINT64)w[2 * c] | ((UINT64)w[2 * c + 1] << 32);
        }

        /* ipot[ip]: two full words. */
        ipot[ip] = (UINT64)w[6] | ((UINT64)w[7] << 32);

        /* ijerk[ip][0..2]: 48 bits each, packed back to back. */
        ijerk[ip][0] = (UINT64)w[8] | ((UINT64)(0x0000ffff & w[9]) << 32);
        ijerk[ip][1] = (UINT64)(0x0000ffff & (w[9] >> 16)) | ((UINT64)w[10] << 16);
        ijerk[ip][2] = (UINT64)w[11] | ((UINT64)(0x0000ffff & w[12]) << 32);
        for (c = 0; c < 3; c++) {
            ijerk[ip][c] = unpack_fout_sign_extend(ijerk[ip][c], 48);
        }

        /* inblist_ovflw[ip]: a single bit. */
        v = (UINT64)(0x00000001 & (w[12] >> 16));
        inblist_ovflw[ip] = unpack_fout_sign_extend(v, 1);

        /*
         * inblist[ip][0..31]: 32 bits each. entry c takes its low 15 bits
         * from the top of word 12+c and its high 17 bits from the bottom
         * of word 13+c.
         */
        for (c = 0; c < 32; c++) {
            v  = (UINT64)(0x00007fff & (w[12 + c] >> 17));
            v |= (UINT64)(0x0001ffff & w[13 + c]) << 15;
            inblist[ip][c] = unpack_fout_sign_extend(v, 32);
        }

        /* innb_ovflw[ip]: a single bit. */
        v = (UINT64)(0x00000001 & (w[44] >> 17));
        innb_ovflw[ip] = unpack_fout_sign_extend(v, 1);

        /* innb[ip]: 32 bits. */
        v  = (UINT64)(0x00003fff & (w[44] >> 18));
        v |= (UINT64)(0x0003ffff & w[45]) << 14;
        innb[ip] = unpack_fout_sign_extend(v, 32);

        /* innbr2_ovflw[ip]: a single bit. */
        v = (UINT64)(0x00000001 & (w[45] >> 18));
        innbr2_ovflw[ip] = unpack_fout_sign_extend(v, 1);

        /* innbr2[ip]: 34 bits. */
        v  = (UINT64)(0x00001fff & (w[45] >> 19));
        v |= (UINT64)(0x001fffff & w[46]) << 13;
        innbr2[ip] = unpack_fout_sign_extend(v, 34);
    }

#if 0
    for (ip = 0; ip < base; ip++) {
        fprintf(stderr, "Wbuf[%d][%d]:0x%08x\n", devid, ip, Wbuf[devid][ip]);
    }
#endif

    Nretrieved[devid] = ni;
}


/*
 * returns the smallest value of pg2g6nb_get_number_of_pipelinesMC()
 * among the devices in use.
 * returns 65536 if no device is in use.
 */
int
pg2g6nb_get_number_of_pipelines(void)
{
    int fewest = 65536; /* any large number will do. */
    int idx;

    for (idx = 0; idx < Ndevice; idx++) {
        const int np = pg2g6nb_get_number_of_pipelinesMC(DeviceIdInUse[idx]);

        if (np < fewest) {
            fewest = np;
        }
    }

    return fewest;
}

/*
 * returns the sum of pg2g6nb_get_jmemsizeMC() over all devices in use.
 */
int
pg2g6nb_get_jmemsize(void)
{
    int total = 0;
    int idx = 0;

    while (idx < Ndevice) {
        total += pg2g6nb_get_jmemsizeMC(DeviceIdInUse[idx]);
        idx++;
    }

    return total;
}

/*
 * returns Nipmax[devid], i.e. the number of IPs device 'devid'
 * accepts per call (see recalculate_iobuf_attributesMC()).
 */
int
pg2g6nb_get_number_of_pipelinesMC(int devid)
{
    const int nip = Nipmax[devid];

    return nip;
}

/*
 * returns Jmemsize[devid], as set by init_boardinfoMC().
 */
int
pg2g6nb_get_jmemsizeMC(int devid)
{
    const int jms = Jmemsize[devid];

    return jms;
}

/*
 * calculate the results for 'ni' IPs.
 *
 * the IPs are handled in chunks of at most
 * pg2g6nb_get_number_of_pipelines() entries. for each chunk the IPs
 * are set, the calculation is run and the results are retrieved into
 * the output arrays at the same offset.
 */
void
pg2g6nb_calculate_fout_on_ip(double (*xi)[3], double (*vi)[3], double *epsi2, int *indexi, double *hi2, double (*acc)[3], double *pot, double (*jerk)[3], int *nblist_ovflw, int (*nblist)[32], int *nnb_ovflw, int *nnb, int *nnbr2_ovflw, double *nnbr2, int ni)
{
    const int chunk = pg2g6nb_get_number_of_pipelines();
    int start = 0;

    while (start < ni) {
        /* the last chunk may be shorter than the others. */
        int count = ni - start;
        if (count > chunk) {
            count = chunk;
        }

        pg2g6nb_set_ip(count, xi + start, vi + start, epsi2 + start, indexi + start, hi2 + start);
        pg2g6nb_run();
        pg2g6nb_get_fout(count, acc + start, pot + start, jerk + start, nblist_ovflw + start, nblist + start, nnb_ovflw + start, nnb + start, nnbr2_ovflw + start, nnbr2 + start);

        start += chunk;
    }
}

/*
 *
 * local functions
 *
 */

/*
 * initialize variables used for "standard" functions (i.e. non-MC functions).
 * this initialization is not necessary for "primitive" functions (MC functions).
 */
static void
init_envs(void)
{
    int ic;
    char *p;
    char *cardno;

    if (Ndevice != 0) return; // already initialized.

    p = getenv("GWARNLEVEL");
    if (!p) { // for backward compatibility.
        p = getenv("PG2G6NB_WARNLEVEL");
    }
    if (p) {
        int tmp;
        tmp = atoi(strtok(p, " "));
        if (0 <= tmp) {
            warn_level = tmp;
        }
        WARN(3, "warn_level: %d\n", warn_level);
    }
    hib_set_warn_level(warn_level);

    p = getenv("SENDFUNC");
    if (p) {
        if (0 == strcmp("DMAR", p)) {
            Sendfunc = SENDFUNC_DMAR;
        }
    }

    /* cards are not allocated yet.
       try to allocate cards specified by environment variable "PG2G6NB_CARDS".
       try to allocate all cards, if PG2G6NB_CARDS is not set. */

    p = getenv("GDEVICE");
    if (!p) { // for backward compatibility.
        p = getenv("PG2G6NB_CARDS");
    }
    if (p) {
        cardno = strtok(p, " ");
        while (cardno) {
            ic = atoi(cardno);
            if (ic < 0 || ic >= hib_ndevice()) {
                fprintf(stderr, "GDEVICE (or PG2G6NB_CARDS) have device_id out of range: %d\n", ic);
                exit(2);
            }
            DeviceIdInUse[Ndevice] = ic;
            Ndevice++;
            cardno = strtok(NULL, " ");
        }
            
    }
    else { // GDEVICE is not set
        Ndevice = hib_ndevice();
        for (ic = 0; ic < Ndevice; ic++) {
            DeviceIdInUse[ic] = ic;
        }
    }
    WARN(3, "IDs of allocated devices:");
    for (ic = 0; ic < Ndevice; ic++) {
        WARN(3, " %d", DeviceIdInUse[ic]);
    }
    WARN(3, "\n");
}

/*
 * set Nchip[devid], Npipes[devid], Jmemsize[devid],
 * DeviceModelId[devid] and DeviceProductId[devid] of device 'devid'.
 *
 * the board_info register is read once. bits 27..24 give the model id
 * and bits 31..28 the product id. Nchip is chosen from the
 * (product, model) pair. for product 4 (GRAPE-9) Npipes is taken from
 * bits 7..0 of the register; for all other products the built-in
 * default is kept.
 *
 * an unknown product or model is reported on stderr, but is not fatal:
 * the defaults set at the top of the function stay in effect.
 */
static void
init_boardinfoMC(int devid)
{
    UINT32 binfo;

    // by default, Jmemsize & Npipes are
    // not read out from board_info registers.
    // they are embedded into the source code by pgdl2lib.
    //
    // NOTE(review): Jmemsize is calculated here with Nchip == 1 and is
    // not updated when Nchip is changed below -- confirm this is intended.
    Nchip[devid] = 1;
    Npipes[devid] = 5;
    Jmemsize[devid]   = 2048 * Nchip[devid];

    binfo = hib_mem_readMC(devid, H[devid]->r->boardinfo);
    DeviceModelId[devid] = (binfo >> 24) & 0xf;
    DeviceProductId[devid] = (binfo >> 28) & 0xf;

    // report the product name.
    switch (DeviceProductId[devid]) {
      case 1:
        WARN(1,"GRAPE-7(PCI-X) ");
        break;
      case 2:
        WARN(1,"PCIe Eval Boards ");
        break;
      case 3:
        WARN(1,"GRAPE-DR ");
        break;
      case 4:
        WARN(1,"GRAPE-9 ");
        break;
      default:
        fprintf(stderr,"init_boardinfoMC: DeviceProductId[%d]=%d  unknown product.\n",
                devid, DeviceProductId[devid]);
        //	exit(2);
    }

    // report the model name and set the model dependent parameters.
    switch (DeviceProductId[devid]) {
      case 1: // GRAPE-7
        switch (DeviceModelId[devid]) {
          case 1:
            Nchip[devid] = 1;
            WARN(1, "model100 ");
            break;
          case 3:
            Nchip[devid] = 3;
            WARN(1, "model300 [4-6] ");
            break;
          case 2:
            Nchip[devid] = 3;
            WARN(1, "model300 D[1-3] ");
            break;
          case 6:
            Nchip[devid] = 6;
            WARN(1, "model600 ");
            break;
          case 8:
            Nchip[devid] = 1;
            WARN(1, "model800 ");
            break;
          default:
            fprintf(stderr,"init_boardinfoMC: DeviceModelId[%d]=%d  unknown model.\n",
                    devid, DeviceModelId[devid]);
            //            exit(2);
        }
        break;

      case 2: // PCIe prototype
        switch (DeviceModelId[devid]) {
          case 2:
            Nchip[devid] = 1;
            WARN(1, "PLDA Xpress2 GX ");
            break;
          case 4:
            Nchip[devid] = 1;
            WARN(1, "Altera Stratix IV GX ");
            break;
          default:
            fprintf(stderr,"init_boardinfoMC: DeviceModelId[%d]=%d  unknown model.\n",
                    devid, DeviceModelId[devid]);
            //            exit(2);
        }
        break;

      case 3: // GRAPE-DR
        switch (DeviceModelId[devid]) {
          case 1:
            Nchip[devid] = 1;
            WARN(1, "TB1 ");
            break;
          case 2:
            Nchip[devid] = 1;
            WARN(1, "TB2 ");
            break;
          case 3:
            Nchip[devid] = 1;
            WARN(1, "TB3 ");
            break;
          case 4:
            Nchip[devid] = 4;
            WARN(1, "TB4 ");
            break;
          default:
            fprintf(stderr,"init_boardinfoMC: DeviceModelId[%d]=%d  unknown model.\n",
                    devid, DeviceModelId[devid]);
            //            exit(2);
        }
        break;

      case 4: // GRAPE-9
        //        Jmemsize[devid]  = 2048 << ((binfo>>8) & 0x3);
        // NOTE(review): a register value of 0 here would later be used
        // as a divisor (e.g. in recalculate_iobuf_attributesMC()).
        Npipes[devid]    = (binfo >>  0) & 0xff;
        switch (DeviceModelId[devid]) {
          case 1:
            WARN(1, "G9C ");
            break;
          default:
            fprintf(stderr,"init_boardinfoMC: DeviceModelId[%d]=%d  unknown model.\n",
                    devid, DeviceModelId[devid]);
            //            exit(2);
        }
        break;

      default: // unknown product. already reported above.
        break;
    }

    WARN(1, " Nchip:%d ", Nchip[devid]);
    WARN(1, "Npipes:%d Jmemsize:%d\n", Npipes[devid], Jmemsize[devid]);
}

static void
recalculate_iobuf_attributesMC(int devid)
{
    Jpsize[devid] = 9;
    Ipsize[devid] = 6;
    Fosize[devid] = 24;

    Nfomax[devid] = OFIFOSIZE / Fosize[devid];
    Njpwordmax[devid] = NJPWORDMAX - 100;
    Nipmax[devid] = IFIFOSIZE / Ipsize[devid];
    Nfomax[devid] = OFIFOSIZE / Fosize[devid];
    Nipmax[devid] = Nipmax[devid] / Npipes[devid] * Npipes[devid];
    Nfomax[devid] = Nfomax[devid] / Npipes[devid] * Npipes[devid];

    if (Nipmax[devid] > Nfomax[devid]) {
	Nipmax[devid] = Nfomax[devid];
    }
    else {
	Nfomax[devid] = Nipmax[devid];
    }
    WARN(3, "Fosize:%d Nfomax:%d\n", Fosize[devid], Nfomax[devid]);
}

/*
 * Send one (address, value) pair to device 'devid'.
 *
 * The pair is placed at the start of Rbuf[devid], padded with zeros up to
 * DUMMYSIZE 32-bit words by PAD_DUMMY_DATA, and handed to hib_sendMC() as
 * (nword+1)/2 64-bit words.
 */
static void
set_regMC(int devid, UINT32 addr, UINT32 val)
{
    UINT32 *buf = Rbuf[devid];
    int nword = 0;

    buf[nword++] = addr;
    buf[nword++] = val;
    PAD_DUMMY_DATA(nword); /* macro writes through Rbuf[devid], i.e. the same buffer. */

    /* round the 32-bit word count up to whole 64-bit words. */
    hib_sendMC(devid, (nword + 1) / 2, (UINT64 *)buf);
}

/*
 * Placeholder for per-device scale-factor initialization.
 * Currently does nothing. The original author's note says the body
 * should eventually look like:
 *
 *     Ximin[devid] = 0.0;
 */
static void
initialize_scale_factorMC(int devid)
{
    (void)devid; /* not used until the initialization is implemented. */
}

/*
 * Pack a sign bit, an exponent field and a mantissa field into one word.
 *
 *   sign : only bit 0 is used.
 *   exp  : truncated to its low 'wexp' bits.
 *   man  : truncated to its low 'wman' bits.
 *
 * Layout of the result, from the most significant used bit:
 *   [sign (1 bit)][exp (wexp bits)][man (wman bits)]
 *
 * wexp, wman and wexp + wman are used directly as shift counts and must
 * therefore each be less than 64.
 */
static UINT64
compose_float(UINT64 sign, UINT64 exp, UINT64 man, int wexp, int wman)
{
    const UINT64 exp_mask = ((UINT64)1 << wexp) - 1;
    const UINT64 man_mask = ((UINT64)1 << wman) - 1;

    return ((sign & (UINT64)1) << (wexp + wman))
         | ((exp & exp_mask) << wman)
         | (man & man_mask);
}

/*
 * Split a packed word into its sign, exponent and mantissa fields.
 *
 * 'src' is expected in the layout
 *   [sign (1 bit)][exp (wexp bits)][man (wman bits)]
 * with the mantissa in the least significant bits. The three fields are
 * stored through signp, expp and manp, in that order.
 *
 * wexp, wman and wexp + wman are used directly as shift counts and must
 * therefore each be less than 64.
 */
static void
decompose_float(UINT64 src, int wexp, int wman, UINT64 *signp, UINT64 *expp, UINT64 *manp)
{
    const UINT64 exp_mask = ((UINT64)1 << wexp) - 1;
    const UINT64 man_mask = ((UINT64)1 << wman) - 1;

    *signp = (src >> (wexp + wman)) & (UINT64)1;
    *expp  = (src >> wman) & exp_mask;
    *manp  = src & man_mask;
}

/*
 * Encode a host-side 'ti' value for device 'devid'.
 * The value is multiplied by Ti_scale[devid] and converted by
 * pg_conv_cdouble_to_float() with format arguments (63, 52).
 * Returns the encoded word.
 */
static inline UINT64
convert_ti(int devid, double src)
{
    UINT64 encoded;

    WARN(3, "convert 'ti' from double to float63.52.\n");
    pg_conv_cdouble_to_float(src * Ti_scale[devid], &encoded, 63, 52);

    return encoded;
}

/*
 * Encode a host-side 'etainv' value for device 'devid'.
 * The value is multiplied by Etainv_scale[devid] and converted by
 * pg_conv_cdouble_to_float() with format arguments (24, 13).
 * Returns the encoded word.
 */
static inline UINT64
convert_etainv(int devid, double src)
{
    UINT64 encoded;

    WARN(3, "convert 'etainv' from double to float24.13.\n");
    pg_conv_cdouble_to_float(src * Etainv_scale[devid], &encoded, 24, 13);

    return encoded;
}


/*
 * Encode a host-side 'mj' value for device 'devid'.
 * The value is multiplied by Mj_scale[devid] and converted by
 * pg_conv_cdouble_to_float() with format arguments (34, 23).
 * Returns the encoded word.
 */
static inline UINT64
convert_mj(int devid, double src)
{
    UINT64 encoded;

    WARN(3, "convert 'mj' from double to float34.23.\n");
    pg_conv_cdouble_to_float(src * Mj_scale[devid], &encoded, 34, 23);

    return encoded;
}

/*
 * Convert a host-side 'xj' coordinate to the device's 64-bit integer form.
 * The offset and scale are fetched from pg2g6nb_get_offset_xjMC() and
 * pg2g6nb_get_scale_xjMC(); the result is
 *     (src - offset) * scale + ONEHALF
 * converted to UINT64 (ONEHALF is 0.5, or 0.0 when built with ICC_RCD).
 */
static inline UINT64
convert_xj(int devid, double src)
{
    WARN(3, "convert 'xj' from double to int64.\n");

    const double scale  = pg2g6nb_get_scale_xjMC(devid);
    const double offset = pg2g6nb_get_offset_xjMC(devid);

    return (UINT64)((src - offset) * scale + ONEHALF);
}

/*
 * Encode a host-side 'vj' value for device 'devid'.
 * The value is multiplied by Vj_scale[devid] and converted by
 * pg_conv_cdouble_to_float() with format arguments (28, 17).
 * Returns the encoded word.
 */
static inline UINT64
convert_vj(int devid, double src)
{
    UINT64 encoded;

    WARN(3, "convert 'vj' from double to float28.17.\n");
    pg_conv_cdouble_to_float(src * Vj_scale[devid], &encoded, 28, 17);

    return encoded;
}

/*
 * Encode a host-side 'tj' value for device 'devid'.
 * The value is multiplied by Tj_scale[devid] and converted by
 * pg_conv_cdouble_to_float() with format arguments (63, 52).
 * Returns the encoded word.
 */
static inline UINT64
convert_tj(int devid, double src)
{
    UINT64 encoded;

    WARN(3, "convert 'tj' from double to float63.52.\n");
    pg_conv_cdouble_to_float(src * Tj_scale[devid], &encoded, 63, 52);

    return encoded;
}

/*
 * Encode a host-side 'acc0by2' value for device 'devid'.
 * The value is multiplied by Acc0by2_scale[devid] and converted by
 * pg_conv_cdouble_to_float() with format arguments (28, 17).
 * Returns the encoded word.
 */
static inline UINT64
convert_acc0by2(int devid, double src)
{
    UINT64 encoded;

    WARN(3, "convert 'acc0by2' from double to float28.17.\n");
    pg_conv_cdouble_to_float(src * Acc0by2_scale[devid], &encoded, 28, 17);

    return encoded;
}

/*
 * Encode a host-side 'jerk0by6' value for device 'devid'.
 * The value is multiplied by Jerk0by6_scale[devid] and converted by
 * pg_conv_cdouble_to_float() with format arguments (28, 17).
 * Returns the encoded word.
 */
static inline UINT64
convert_jerk0by6(int devid, double src)
{
    UINT64 encoded;

    WARN(3, "convert 'jerk0by6' from double to float28.17.\n");
    pg_conv_cdouble_to_float(src * Jerk0by6_scale[devid], &encoded, 28, 17);

    return encoded;
}

/*
 * Convert a host-side 'indexj' value to a device word.
 * This is a plain cast from int to UINT64; 'devid' does not affect
 * the result.
 */
static inline UINT64
convert_indexj(int devid, int src)
{
    WARN(3, "convert 'indexj' from int to int32.\n");

    return (UINT64)src;
}


/*
 * Convert a host-side 'xi' coordinate to the device's 64-bit integer form.
 * The offset and scale are fetched from pg2g6nb_get_offset_xiMC() and
 * pg2g6nb_get_scale_xiMC(); the result is
 *     (src - offset) * scale + ONEHALF
 * converted to UINT64 (ONEHALF is 0.5, or 0.0 when built with ICC_RCD).
 */
static inline UINT64
convert_xi(int devid, double src)
{
    WARN(3, "convert 'xi' from double to int64.\n");

    const double scale  = pg2g6nb_get_scale_xiMC(devid);
    const double offset = pg2g6nb_get_offset_xiMC(devid);

    return (UINT64)((src - offset) * scale + ONEHALF);
}

/*
 * Encode a host-side 'vi' value for device 'devid'.
 * The value is multiplied by Vi_scale[devid] and converted by
 * pg_conv_cdouble_to_float() with format arguments (28, 17).
 * Returns the encoded word.
 */
static inline UINT64
convert_vi(int devid, double src)
{
    UINT64 encoded;

    WARN(3, "convert 'vi' from double to float28.17.\n");
    pg_conv_cdouble_to_float(src * Vi_scale[devid], &encoded, 28, 17);

    return encoded;
}

/*
 * Encode a host-side 'epsi2' value for device 'devid'.
 * The value is multiplied by Epsi2_scale[devid] and converted by
 * pg_conv_cdouble_to_float() with format arguments (34, 23).
 * Returns the encoded word.
 */
static inline UINT64
convert_epsi2(int devid, double src)
{
    UINT64 encoded;

    WARN(3, "convert 'epsi2' from double to float34.23.\n");
    pg_conv_cdouble_to_float(src * Epsi2_scale[devid], &encoded, 34, 23);

    return encoded;
}

/*
 * Convert a host-side 'indexi' value to a device word.
 * This is a plain cast from int to UINT64; 'devid' does not affect
 * the result.
 */
static inline UINT64
convert_indexi(int devid, int src)
{
    WARN(3, "convert 'indexi' from int to int32.\n");

    return (UINT64)src;
}

/*
 * Encode a host-side 'hi2' value for device 'devid'.
 * The value is multiplied by Hi2_scale[devid] and converted by
 * pg_conv_cdouble_to_float() with format arguments (34, 23).
 * Returns the encoded word.
 */
static inline UINT64
convert_hi2(int devid, double src)
{
    UINT64 encoded;

    WARN(3, "convert 'hi2' from double to float34.23.\n");
    pg_conv_cdouble_to_float(src * Hi2_scale[devid], &encoded, 34, 23);

    return encoded;
}


/*
 * Convert a device 'acc' result word to a host double.
 * The word is cast to INT64 and multiplied by Acc_scale[devid].
 */
static inline double
convert_acc(int devid, UINT64 src)
{
    const INT64 raw = (INT64)src;

    WARN(3, "convert 'acc' from int64 to double.\n");

    return Acc_scale[devid] * raw;
}

/*
 * Convert a device 'pot' result word to a host double.
 * The word is cast to INT64 and multiplied by Pot_scale[devid].
 */
static inline double
convert_pot(int devid, UINT64 src)
{
    const INT64 raw = (INT64)src;

    WARN(3, "convert 'pot' from int64 to double.\n");

    return Pot_scale[devid] * raw;
}

/*
 * Convert a device 'jerk' result word to a host double.
 * The word is cast to INT64 and multiplied by Jerk_scale[devid].
 *
 * NOTE(review): the trace message calls the source an int48, but no sign
 * extension from 48 bits happens here -- presumably the caller has already
 * sign-extended 'src' to 64 bits; confirm.
 */
static inline double
convert_jerk(int devid, UINT64 src)
{
    const INT64 raw = (INT64)src;

    WARN(3, "convert 'jerk' from int48 to double.\n");

    return Jerk_scale[devid] * raw;
}

/*
 * Convert a device 'nblist_ovflw' word to a host int.
 * The word is cast to INT64 and then narrowed to int; 'devid' does not
 * affect the result.
 */
static inline int
convert_nblist_ovflw(int devid, UINT64 src)
{
    WARN(3, "convert 'nblist_ovflw' from int1 to int.\n");

    return (int)(INT64)src;
}

/*
 * Convert a device 'nblist' word to a host int.
 * The word is cast to INT64 and then narrowed to int; 'devid' does not
 * affect the result.
 */
static inline int
convert_nblist(int devid, UINT64 src)
{
    WARN(3, "convert 'nblist' from int32 to int.\n");

    return (int)(INT64)src;
}

/*
 * Convert a device 'nnb_ovflw' word to a host int.
 * The word is cast to INT64 and then narrowed to int; 'devid' does not
 * affect the result.
 */
static inline int
convert_nnb_ovflw(int devid, UINT64 src)
{
    WARN(3, "convert 'nnb_ovflw' from int1 to int.\n");

    return (int)(INT64)src;
}

/*
 * Convert a device 'nnb' word to a host int.
 * The word is cast to INT64 and then narrowed to int; 'devid' does not
 * affect the result.
 */
static inline int
convert_nnb(int devid, UINT64 src)
{
    WARN(3, "convert 'nnb' from int32 to int.\n");

    return (int)(INT64)src;
}

/*
 * Convert a device 'nnbr2_ovflw' word to a host int.
 * The word is cast to INT64 and then narrowed to int; 'devid' does not
 * affect the result.
 */
static inline int
convert_nnbr2_ovflw(int devid, UINT64 src)
{
    WARN(3, "convert 'nnbr2_ovflw' from int1 to int.\n");

    return (int)(INT64)src;
}

/*
 * Convert a device 'nnbr2' result word to a host double.
 * The word is decoded by pg_conv_float_to_cdouble() with format
 * arguments (34, 23) and the decoded value is multiplied by
 * Nnbr2_scale[devid].
 */
static inline double
convert_nnbr2(int devid, UINT64 src)
{
    double decoded;

    WARN(3, "convert 'nnbr2' from float34.23 to double.\n");
    pg_conv_float_to_cdouble(src, 34, 23, &decoded);

    return decoded * Nnbr2_scale[devid];
}



