#if defined(_WIN32)

#include <DriverSpecs.h>
__user_code 
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "devioctl.h"
#include "strsafe.h"
#include <setupapi.h>
#include <basetyps.h>
#include "hibutil.h"
#include <math.h>
#include "hibdrv.h"
#include "hibutil.h"

#else // !_WIN32  i.e., for normal OSes.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <sys/io.h>
#include <sys/time.h>
#include <time.h>
#include <math.h>
#include "hibdrv.h"
#include "hibutil.h"

#endif // _WIN32

/*
 * Host-side source buffers used when data is sent with SENDFUNC_PIOW.
 * Both are allocated in main().
 */
static UINT64 *piowbuf;             /* used by the single-device tests */
static UINT64 *piowbuf2[NHIBMAX];   /* one buffer per device, for the multi-device tests */

/* --- status display and housekeeping --- */
static void showstatus(int argc, char **argv);
static void showdmastatus(int argc, char **argv);
static void stopdmaw(int argc, char **argv);
static void clearfifo(int argc, char **argv);
static void resetbackend(int argc, char **argv);
static void reconfdevice(int argc, char **argv);

/* --- register / memory / I/O space access --- */
static void configread(int argc, char **argv);
static void configwrite(int argc, char **argv);
static void regread(int argc, char **argv);
static void regwrite(int argc, char **argv);
static void pioread(int argc, char **argv);
static void piowrite(int argc, char **argv);
static void ioread(int argc, char **argv);
static void iowrite(int argc, char **argv);

/* --- transfer checks --- */
static void dmatest(int argc, char **argv);
static void dmatest2(int argc, char **argv);
static void backendloopback(int argc, char **argv);

/* --- performance measurements, single device --- */
static void dmaperf(int argc, char **argv);
static void dmawperf(int argc, char **argv);
static void dmarperf(int argc, char **argv);
static void rawperf(int argc, char **argv);

/* --- performance measurements, multiple devices --- */
static void dmaperf2(int argc, char **argv);
static void dmawperf2(int argc, char **argv);
static void dmarperf2(int argc, char **argv);

/* --- configuration ROM and PLL --- */
static void eraserom(int argc, char **argv);
static void writerom(int argc, char **argv);
static void readromid(int argc, char **argv);
static void writepllconf(int argc, char **argv);

/* --- helpers --- */
static void showusage(int argc, char **argv);
static void get_cputime(double *laptime, double *sprittime);

#if defined(_WIN32)
/* local replacements for functions used by this file on Win32 */
static void usleep(int t);
static void sleep(int t);
static void srand48(long int seedval);
static long int lrand48(void);
static long int time(void);
static long long int strtoull(const char *nptr, char **endptr, int base);
#endif // _WIN32

/*
 * One entry of the test dispatch table: the handler to run and the
 * usage text that showusage() prints for it.
 */
typedef struct {
    /*
     * Test handler. It is called with main()'s argc/argv unchanged, so the
     * pointer carries a full prototype; an empty parameter list "()" is an
     * obsolescent declarator and would not type-check the call.
     */
    void (*func)(int argc, char **argv);
    char *usage;    /* description and argument synopsis */
} TestMode;

static TestMode testmode[] =
    {
	showstatus, "show contents of config & HIB-local registers [devid]",
	stopdmaw, "reset DMA and FIFO [devid]",
	clearfifo, "clear HIB-internal FIFO [devid]",
	showdmastatus, "show DMA status [devid]",
	configread, "read config register <addr> [devid]",
	configwrite, "write config register <addr> <val> [devid]",
	regread, "read HIB local registers mapped to BAR0 <addr> [devid]",
	regwrite, "write HIB local registers mapped to BAR0 <addr> <val> [devid]",
	pioread, "CURRENTLY NOT SUPPORTED. read backend memory space mapped to BAR1 <addr> [devid]",
	piowrite, "CURRENTLY NOT SUPPORTED. write backend memory space mapped to BAR1 <addr> <val> [devid]",
	dmatest, "check DMA read/write function <size> <sendfunc> [devid] (host <-> HIB)",
	dmaperf, "measure DMA performance <sendfunc> [devid] (host <-> HIB)",
	dmawperf, "measure DMA write performance [devid] (host <- HIB; bypass internal FIFO)",
	dmarperf, "measure DMA read performance <sendfunc> [devid] (host -> HIB; bypass internal FIFO)",
	resetbackend, "reset backend [devid]",
	rawperf, "raw PIO r/w & DMA r/w [devid]",
	dmaperf2, "measure DMA performance with multiple HIBs <sendfunc>  <# of hibs> (host <-> HIBs internal FIFO)",
	dmawperf2, "measure DMA write performance with multiple HIBs <# of hibs> [devid offset] (host <- HIBs; bypass internal FIFO)",
	dmarperf2, "measure DMA read performance with multiple HIBs <sendfunc> <# of hibs> [devid offset] (host -> HIBs; bypass internal FIFO)",
        eraserom, "erase configuration ROM (EPCS64) [devid]",
        writerom, "write .rpd to configuration ROM (EPCS64) <rpd-file> [devid]",
        readromid, "read configuration ROM ID (0x10:EPCS1 0x12:EPCS4 0x14:EPCS16 0x16:EPCS64) [devid]",
	writepllconf, "set pipeline clock frequency to (base_freq * M / N) <M> <N> [devid]",
	dmatest2, "heavy-loaded test of loopback transfer <time (sec)> <sendfunc> [verbose(0 or 1)] [devid] (host <-> HIB)",
#if !defined(_WIN32)
	ioread, "read I/O space <b|w|l> <addr> [# of words] [devid]",
	iowrite, "write I/O space <b|w|l> <addr> <val> [# of words] [devid]",
#endif // _WIN32
        backendloopback, "write to, and then read from the backend. <sendfunc> <# of words> [devid] (host <-> HIB <-> backend)\n"
        "      backend logic need to supply enough data to be read back.",
        reconfdevice, "reconfigure the device. the host must be rebooted in order to access the reconfigured device. [devid]",
    };

/*
 * Entry point.
 *
 * argv[1] selects the test by its index in testmode[]; the remaining
 * arguments are interpreted by the selected handler. Prints the usage
 * and exits with status 1 when the test ID is missing, is not a plain
 * decimal number, or is out of range.
 */
int
main(int argc, char **argv)
{
    const long nmodes = (long)(sizeof(testmode) / sizeof(testmode[0]));
    const int maxhib = (int)(sizeof(piowbuf2) / sizeof(piowbuf2[0]));
    char *end;
    long mode;
    int ndev, i;

    if (argc < 2) {
        showusage(argc, argv);
        exit(1);
    }

    /*
     * Parse strictly: atoi() would turn any non-numeric argument into 0
     * and silently run test 0 instead of reporting the mistake.
     */
    mode = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || mode < 0 || mode >= nmodes) {
        showusage(argc, argv);
        exit(1);
    }

    piowbuf = (UINT64 *)calloc(HIB_DMABUF_BYTES >> 3, sizeof(*piowbuf));
    if (!piowbuf) {
        fprintf(stderr, "Out of memory.\n");
        exit(1);
    }

    /* never write past the end of piowbuf2[], whatever the device count is */
    ndev = hib_ndevice();
    if (ndev > maxhib) {
        ndev = maxhib;
    }
    for (i = 0; i < ndev; i++) {
        piowbuf2[i] = (UINT64 *)calloc(HIB_DMABUF_BYTES >> 3, sizeof(*piowbuf2[i]));
        if (!piowbuf2[i]) {
            fprintf(stderr, "Out of memory.\n");
            exit(1);
        }
    }

    testmode[mode].func(argc, argv);
    exit(0);
}

/*
 * Print the command synopsis followed by one line per entry of
 * testmode[] (test ID and usage text) to stderr.
 */
static void
showusage(int argc, char **argv)
{
    const int count = sizeof(testmode)/sizeof(testmode[0]);
    int id = 0;

    (void)argc;

    fprintf(stderr, "usage: %s <test_program_ID>\n", argv[0]);
    while (id < count) {
        fprintf(stderr, "  %2d) %s\n", id, testmode[id].usage);
        id++;
    }
}


/*
 * Loopback check: write data to the HIB-internal FIFO, read it back and
 * compare the two buffers word by word.
 *
 *   argv[2]  transfer size in 64-bit words
 *   argv[3]  sendfunc (SENDFUNC_DMAR or SENDFUNC_PIOW)
 *   argv[4]  devid (optional, default 0)
 *
 * Every word is printed to stdout; the verdict ("OK" or "NG <n> words")
 * goes to stderr.
 */
static void
dmatest(int argc, char **argv)
{
    Hib *h;
    UINT64 *wbuf, *rbuf;
    int devid = 0;
    int sendfunc;
    int i, nng;
    int size;      /* transfer size in 64-bit words */
    int capacity;  /* number of 64-bit words the buffers are filled with elsewhere in this file */
    int nfill, nshow;

    if (argc < 4) {
        showusage(argc, argv);
        exit(1);
    }

    if (argc > 4) {
        devid = atoi(argv[4]);
    }

    fprintf(stderr, "\n# check hib[%d] DMA read/write (host <-> HIB internal FIFO)\n\n", devid);

    size = atoi(argv[2]);
    fprintf(stderr, "size %d\n", size);

    /* reject a bad sendfunc before the device is opened or reconfigured */
    sendfunc = atoi(argv[3]);
    if (sendfunc != SENDFUNC_DMAR && sendfunc != SENDFUNC_PIOW) {
        fprintf(stderr, "invalid sendfunc: %d. abort.\n", sendfunc);
        exit(1);
    }

    h = hib_openMC(devid);

    /*
     * The buffers are indexed as 64-bit words, so the limit is
     * dmabuf_bytes/8. (The former limit of dmabuf_bytes/4 allowed
     * transfers twice as large as the buffers.)
     */
    capacity = (int)(h->r->dmabuf_bytes >> 3);
    if (size < 0 || size > capacity) {
        fprintf(stderr, "inappropriate size %d\n", size);
        hib_closeMC(devid);
        exit(1);
    }

    hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_FIFO);
    wbuf = h->dmaw_buf;
    if (sendfunc == SENDFUNC_DMAR) {
        fprintf(stderr, "\n# hib[%d] DMA read, and then DMA write (host <-> HIB internal FIFO)\n", devid);
        rbuf = h->dmar_buf;
    }
    else {
        fprintf(stderr, "\n# hib[%d] PIO write, and then DMA write (host <-> HIB internal FIFO)\n", devid);
        hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
        rbuf = piowbuf;
    }

    printf("clear DMA buf...\n");

    /*
     * Fill both buffers with distinct patterns. A few words beyond the
     * transfer are initialized as well so that the dump below shows that
     * they were left untouched; the extra words are clamped to the buffer.
     */
    nfill = size + 10;
    if (nfill > capacity) {
        nfill = capacity;
    }
    srand48(time(NULL)); /* seed kept for an optional random-pattern fill */
    for (i = 0; i < nfill; i++) {
        rbuf[i] = 0x123456789abc0000ll|i;
        wbuf[i] = 0xfedcba9876540000ll|i;
    }

    /* easily recognizable values at the head of each buffer */
    rbuf[0] = 0x1111111111111111LL;
    rbuf[1] = 0x2222222222222222LL;
    rbuf[2] = 0x3333333333333333LL;
    rbuf[3] = 0x4444444444444444LL;
    rbuf[4] = 0x5555555555555555LL;
    rbuf[5] = 0x6666666666666666LL;
    wbuf[0] = 0xaaaaaaaaaaaaaaaaLL;
    wbuf[1] = 0xbbbbbbbbbbbbbbbbLL;
    wbuf[2] = 0xccccccccccccccccLL;

    printf("DMA read size: %d words (%d bytes)\n", size, size*8);
    printf("will dmar...\n");

    hib_sendMC(devid, size, rbuf);
    hib_recvMC(devid, size, wbuf);

    fprintf(stdout, "\n");

    /* dump the transferred words plus (at most) two words past the end */
    nshow = size + 2;
    if (nshow > capacity) {
        nshow = capacity;
    }
    for (nng = 0, i = 0; i < nshow; i++) {
#if defined(_WIN32)
        // Win32 seems to have a bug in printing 64-bit integer.
        fprintf(stdout, "rbuf[%04d]: 0x%08x%08x  wbuf[%04d]: 0x%08x%08x",
                i, *(UINT32 *)((UINT8 *)rbuf+i*8), *(UINT32 *)((UINT8 *)rbuf+i*8+4),
                i, *(UINT32 *)((UINT8 *)wbuf+i*8), *(UINT32 *)((UINT8 *)wbuf+i*8+4));
#else // !_WIN32
        fprintf(stdout, "rbuf[%04d]: 0x%016llx  wbuf[%04d]: 0x%016llx",
                i, rbuf[i], i, wbuf[i]);
#endif // _WIN32

        /* only words inside the transfer count as mismatches */
        if (wbuf[i] != rbuf[i] && i < size) {
            nng++;
            fprintf(stdout, " NG\n");
        }
        else {
            fprintf(stdout, " \n");
        }
        if (i+1 == size) {
            fprintf(stdout, "---- transfer size reached ----\n");
        }
    }
    printf("done\n %d words (%d bytes).\n", size, size*8);
    if (nng) {
        fprintf(stderr, "NG %d words\n", nng);
    }
    else {
        fprintf(stderr, "OK\n");
    }
    for (i = 0; i < size; i++) {
        rbuf[i] = 0;
        wbuf[i] = 0;
    }
    hib_set_test_modeMC(devid, TESTMODE_NONE);
    hib_closeMC(devid);
}

/*
 * Measure loopback throughput through the HIB-internal FIFO: each
 * iteration sends `size` words with the selected sendfunc and receives
 * them back by DMA write.
 *
 *   argv[2]  sendfunc (SENDFUNC_DMAR or SENDFUNC_PIOW)
 *   argv[3]  devid (optional, default 0)
 */
static void
dmaperf(int argc, char **argv)
{
    Hib *h;
    UINT64 *wbuf, *rbuf;
    int devid = 0;
    int sendfunc;
    int i, j, ntry;
    int size; /* in 64-bit words (reported as size*8 bytes) */
    double lt = 0.0, st = 0.0;
    double nloop;

    if (argc < 3) {
        showusage(argc, argv);
        exit(1);
    }
    if (argc > 3) {
        devid = atoi(argv[3]);
    }

    /*
     * Validate before touching the hardware, so that a bad argument can
     * not leave the device opened and switched to test mode.
     */
    sendfunc = atoi(argv[2]);
    if (sendfunc != SENDFUNC_DMAR && sendfunc != SENDFUNC_PIOW) {
        fprintf(stderr, "invalid sendfunc: %d. abort.\n", sendfunc);
        exit(1);
    }

    h = hib_openMC(devid);
    hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_FIFO);

    wbuf = h->dmaw_buf;
    if (sendfunc == SENDFUNC_DMAR) {
        fprintf(stderr, "\n# hib[%d] DMA read, and then DMA write (host <-> HIB internal FIFO)\n", devid);
        rbuf = h->dmar_buf;
    }
    else {
        fprintf(stderr, "\n# hib[%d] PIO write, and then DMA write (host <-> HIB internal FIFO)\n", devid);
        hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
        rbuf = piowbuf;
    }

    for (i = 0; i < (h->r->dmabuf_bytes>>3); i++) {
        rbuf[i] = 0x123456789abc0000ll|i;
        wbuf[i] = 0xfedcba9876540000ll|i;
    }

    /* total number of words moved in each direction per measurement */
    nloop = 1e7;

    for (size = 32; size <= 512; size *= 2) {
        for (ntry = 0; ntry < 1; ntry++) {
            /* the two get_cputime() calls bracket the timed section; lt is used as its duration */
            get_cputime(&lt, &st);
            for (j = 0; j < nloop/size; j++) {
                hib_sendMC(devid, size, rbuf);
                hib_recvMC(devid, size, wbuf);
            }
            get_cputime(&lt, &st);
            /* factor 2: both directions are counted */
            switch (sendfunc) {
              case SENDFUNC_DMAR:
                printf("size: % 5d DMA read & DMA write: % 4.1f sec  % 7.2f MB/s\n",
                       size*8, lt, 2*sizeof(UINT64)*nloop/1e6/lt);
                break;
              case SENDFUNC_PIOW:
                printf("size: % 5d PIO write & DMA write: % 4.1f sec  % 7.2f MB/s\n",
                       size*8, lt, 2*sizeof(UINT64)*nloop/1e6/lt);
                break;
            }
        }
    }

    // wait long enough before unset test mode
    // so that PIOW burst is completed.
    usleep(10);
    hib_set_test_modeMC(devid, TESTMODE_NONE);
    hib_closeMC(devid);
}


/*
 * Measure DMA write (host <- HIB) throughput of one device in
 * TESTMODE_REFDESIGN_RAM for transfer sizes of 128..4096 words.
 *
 *   argv[2]  devid (optional, default 0)
 */
static void
dmawperf(int argc, char **argv)
{
    Hib *h;
    UINT64 *wbuf, *rbuf;
    int devid = 0;
    int idx, iter, ntry;
    int size; /* in 64-bit words (reported as size*8 bytes) */
    double lt = 0.0, st = 0.0;
    double nloop = 1e7;

    if (argc < 2) {
        showusage(argc, argv);
        exit(1);
    }
    if (2 < argc) {
        devid = atoi(argv[2]);
    }

    h = hib_openMC(devid);
    hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_RAM);
    rbuf = h->dmar_buf;
    wbuf = h->dmaw_buf;

    /* give both DMA buffers a known, distinct content */
    idx = 0;
    while (idx < (h->r->dmabuf_bytes>>3)) {
        rbuf[idx] = 0x123456789abc0000ll|idx;
        wbuf[idx] = 0xfedcba9876540000ll|idx;
        idx++;
    }

#if 1
    fprintf(stderr, "\n# hib[%d] DMA write (host <- HIB)\n", devid);
    for (size = 128; size <= 4096; size *= 2) {
        for (ntry = 0; ntry < 1; ntry++) {
            /* the two get_cputime() calls bracket the timed section */
            get_cputime(&lt, &st);
            for (iter = 0; iter < nloop/size; iter++) {
                hib_recvMC(devid, size, wbuf);
            }
            get_cputime(&lt, &st);
            printf("size: %d DMA write: %f sec  %f MB/s\n",
                   size*8, lt, sizeof(UINT64)*nloop/1e6/lt);
        }
    }
#else
    /* disabled variant: alternate a large and a 1-word DMA write */
    fprintf(stderr, "\n# hib[%d] DMA write (host <- HIB)\n", devid);
    for (size = 1024; size < 1025; size *= 2) {
        for (ntry = 0; ntry < 8192; ntry++) {
            hib_start_dmawMC(devid, size, wbuf);
            hib_mem_readMC(devid, h->dma1misc);
            hib_finish_dmawMC(devid);

            hib_start_dmawMC(devid, 1, wbuf);
            hib_mem_readMC(devid, h->dma1misc);
            hib_finish_dmawMC(devid);
        }
    }
#endif

    /* short pause before leaving test mode, so that a PIOW burst can complete */
    usleep(10);
    hib_set_test_modeMC(devid, TESTMODE_NONE);
    hib_closeMC(devid);
}


/*
 * Measure host -> HIB throughput of one device in TESTMODE_REFDESIGN_RAM
 * for transfer sizes of 128..4096 words, using either DMA read or PIO
 * write.
 *
 *   argv[2]  sendfunc (SENDFUNC_DMAR or SENDFUNC_PIOW)
 *   argv[3]  devid (optional, default 0)
 */
static void
dmarperf(int argc, char **argv)
{
    Hib *h;
    UINT64 *wbuf, *rbuf;
    int devid = 0;
    int sendfunc;
    int i, j, ntry;
    int size; /* in 64-bit words (reported as size*8 bytes) */
    double lt = 0.0, st = 0.0;
    double nloop;

    if (argc < 3) {
        showusage(argc, argv);
        exit(1);
    }

    if (argc > 3) {
        devid = atoi(argv[3]);
    }

    /*
     * Validate before touching the hardware, so that a bad argument can
     * not leave the device opened and switched to test mode.
     */
    sendfunc = atoi(argv[2]);
    if (sendfunc != SENDFUNC_DMAR && sendfunc != SENDFUNC_PIOW) {
        fprintf(stderr, "invalid sendfunc: %d. abort.\n", sendfunc);
        exit(1);
    }

    h = hib_openMC(devid);
    hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_RAM);

    wbuf = h->dmaw_buf;
    if (sendfunc == SENDFUNC_DMAR) {
        fprintf(stderr, "\n# hib[%d] DMA read (host -> HIB)\n", devid);
        rbuf = h->dmar_buf;
    }
    else {
        fprintf(stderr, "\n# hib[%d] PIO write (host -> HIB)\n", devid);
        hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
        rbuf = piowbuf;
    }

    for (i = 0; i < (h->r->dmabuf_bytes>>3); i++) {
        rbuf[i] = 0x123456789abc0000ll|i;
        wbuf[i] = 0xfedcba9876540000ll|i;
    }

    /* total number of words sent per measurement */
    nloop = 1e7;

    for (size = 128; size <= 4096; size *= 2) {
        for (ntry = 0; ntry < 1; ntry++) {
            /* the two get_cputime() calls bracket the timed section; lt is used as its duration */
            get_cputime(&lt, &st);
            for (j = 0; j < nloop/size; j++) {
                hib_sendMC(devid, size, rbuf);
            }
            get_cputime(&lt, &st);
            switch (sendfunc) {
              case SENDFUNC_DMAR:
                printf("size: %d DMA read: %f sec  %f MB/s\n",
                       size*8, lt, sizeof(UINT64)*nloop/1e6/lt);
                break;
              case SENDFUNC_PIOW:
                printf("size: %d PIO write: %f sec  %f MB/s\n",
                       size*8, lt, sizeof(UINT64)*nloop/1e6/lt);
                break;
            }
        }
    }

    // wait long enough before unset test mode
    // so that PIOW burst is completed.
    usleep(10);
    hib_set_test_modeMC(devid, TESTMODE_NONE);
    hib_closeMC(devid);
}

/*
 * Measure loopback throughput through the HIB-internal FIFO with several
 * devices: each iteration sends `size` words to every device and then
 * receives them back from every device.
 *
 *   argv[2]  sendfunc (SENDFUNC_DMAR or SENDFUNC_PIOW)
 *   argv[3]  number of devices to use (devid 0 .. n-1)
 */
static void
dmaperf2(int argc, char **argv)
{
    Hib *h[NHIBMAX];
    UINT64 *wbuf[NHIBMAX], *rbuf[NHIBMAX];
    UINT32 val;
    int devid = 0;
    int sendfunc;
    int j;
    int size; /* in 64-bit words */
    double lt = 0.0, st = 0.0;
    double nloop;
    int nhib;

    if (argc < 4) {
        showusage(argc, argv);
        exit(1);
    }
    sendfunc = atoi(argv[2]);
    nhib = atoi(argv[3]);

    /* the per-device arrays above hold NHIBMAX entries; keep devid inside them */
    if (nhib < 1) {
        fprintf(stderr, "invalid number of hibs(= %d).\n", nhib);
        exit(1);
    }
    if (hib_ndevice() < nhib || NHIBMAX < nhib) {
        fprintf(stderr,
                "too large number of hibs(= %d).\n",
                nhib);
        exit(1);
    }

    /*
     * Validate before touching the hardware, so that a bad argument can
     * not leave devices opened and switched to test mode.
     */
    if (sendfunc != SENDFUNC_DMAR && sendfunc != SENDFUNC_PIOW) {
        fprintf(stderr, "invalid sendfunc: %d. abort.\n", sendfunc);
        exit(1);
    }

    for (devid = 0; devid < nhib; devid++) {
        h[devid] = hib_openMC(devid);
        hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_FIFO);

        wbuf[devid] = h[devid]->dmaw_buf;
        if (sendfunc == SENDFUNC_DMAR) {
            fprintf(stderr, "# hib[%d] DMA read (host -> HIB)\n", devid);
            /* bits 14:12 of config register 0x88 encode the size as 128 << n */
            val = hib_config_readMC(devid, 0x88) >> 12;
            val = val & 0x7;
            val = 128 << val;
            fprintf(stderr, "max read request size : %d byte\n", val);
            rbuf[devid] = h[devid]->dmar_buf;
        }
        else {
            fprintf(stderr, "\n# hib[%d] PIO write, and then DMA write (host <-> HIB internal FIFO)\n", devid);
            hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
            rbuf[devid] = piowbuf2[devid];
        }
    }

    /* number of words moved per device and direction in one measurement */
    nloop = 1e7;

    for (size = 32; size <= 512; size *= 2) { // size in 8-byte word
        /* the two get_cputime() calls bracket the timed section; lt is used as its duration */
        get_cputime(&lt, &st);
        for (j = 0; j < nloop/size; j++) {
            for (devid = 0; devid < nhib; devid++) {
                hib_sendMC(devid, size, rbuf[devid]);
            }
            for (devid = 0; devid < nhib; devid++) {
                hib_recvMC(devid, size, wbuf[devid]);
            }
        }
        get_cputime(&lt, &st);
        /*
         * NOTE(review): only one direction is counted here, while dmaperf()
         * applies a factor 2 for send + receive. Confirm which one is
         * intended before comparing the two results.
         */
        switch (sendfunc) {
          case SENDFUNC_DMAR:
            printf("size: % 5d byte   DMA read & DMA write: % 4.1f sec  % 7.2f MB/s\n",
                   size*8, lt, nhib*sizeof(UINT64)*nloop/1e6/lt);
            break;
          case SENDFUNC_PIOW:
            printf("size: % 5d byte   PIO write & DMA write: % 4.1f sec  % 7.2f MB/s\n",
                   size*8, lt, nhib*sizeof(UINT64)*nloop/1e6/lt);
            break;
        }
    }

    for (devid = 0; devid < nhib; devid++) {
        hib_set_test_modeMC(devid, TESTMODE_NONE);
        hib_closeMC(devid);
    }
}

/*
 * Measure DMA write (host <- HIB) throughput with several devices in
 * TESTMODE_REFDESIGN_RAM. For every size the transfers of all devices
 * are started first and then finished, so that they overlap.
 *
 *   argv[2]  number of devices to use
 *   argv[3]  devid of the first device (optional, default 0)
 */
static void
dmawperf2(int argc, char **argv)
{
    Hib *h[NHIBMAX];
    UINT64 *wbuf[NHIBMAX];
    int devid = 0;
    int j, ntry;
    int size; /* in 64-bit words */
    double lt = 0.0, st = 0.0;
    double nloop;
    int nhib, devidoff = 0;

    if (argc < 3) {
        showusage(argc, argv);
        exit(1);
    }
    nhib = atoi(argv[2]);

    if (argc > 3) {
        devidoff = atoi(argv[3]);
    }

    /*
     * devid indexes the NHIBMAX-element arrays above: reject negative or
     * empty ranges as well as ranges that end beyond the arrays.
     */
    if (nhib < 1 || devidoff < 0) {
        fprintf(stderr,
                "invalid number of hibs(= %d) or devid offset(= %d).\n",
                nhib, devidoff);
        exit(1);
    }
    if (nhib > NHIBMAX - devidoff || hib_ndevice() < devidoff + nhib) {
        fprintf(stderr,
                "too large number of hibs(= %d).\n",
                devidoff + nhib);
        exit(1);
    }

    fprintf(stderr, "\n# DMA write (host <- HIB) using %d hibs simultaneously.\n", nhib);
    for (devid = devidoff; devid < nhib + devidoff; devid++) {
        h[devid] = hib_openMC(devid);
        hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_RAM);
        wbuf[devid] = h[devid]->dmaw_buf;
    }

    /* number of words received per device in one measurement */
    nloop = 1e7;

    /* sizes grow by about 10% per step (truncated to an integer) */
    for (size = 32; size <= 1024*16+1; size = (int)(size * 1.1)) { // size in 8-byte word
        for (ntry = 0; ntry < 1; ntry++) {
            /* the two get_cputime() calls bracket the timed section; lt is used as its duration */
            get_cputime(&lt, &st);
            for (j = 0; j < nloop/size; j++) {
                /* start all transfers, then complete them */
                for (devid = devidoff; devid < nhib + devidoff; devid++) {
                    hib_start_dmawMC(devid, size, wbuf[devid]);
                }
                for (devid = devidoff; devid < nhib + devidoff; devid++) {
                    hib_finish_dmawMC(devid);
                }
            }
            get_cputime(&lt, &st);
            printf("size: % 5d byte   DMA write: % 4.1f sec  % 7.2f MB/s\n",
                   size*8, lt, nhib*sizeof(UINT64)*nloop/1e6/lt);
            fflush(stdout);
        }
    }

    for (devid = devidoff; devid < nhib + devidoff; devid++) {
        hib_set_test_modeMC(devid, TESTMODE_NONE);
        hib_closeMC(devid);
    }
}

/*
 * Measure host -> HIB throughput with several devices in
 * TESTMODE_REFDESIGN_RAM, using either DMA read (transfers of all devices
 * are started first, then finished) or PIO write (one device after the
 * other).
 *
 *   argv[2]  sendfunc (SENDFUNC_DMAR or SENDFUNC_PIOW)
 *   argv[3]  number of devices to use
 *   argv[4]  devid of the first device (optional, default 0)
 */
static void
dmarperf2(int argc, char **argv)
{
    Hib *h[NHIBMAX];
    UINT64 *rbuf[NHIBMAX];
    const char *msg = "";
    int devid = 0;
    int sendfunc;
    int i, j, ntry;
    int size; /* in 64-bit words */
    double lt = 0.0, st = 0.0;
    double nloop;
    int nhib, devidoff = 0;

    if (argc < 4) {
        showusage(argc, argv);
        exit(1);
    }
    sendfunc = atoi(argv[2]);
    nhib = atoi(argv[3]);

    if (argc > 4) {
        devidoff = atoi(argv[4]);
    }

    /*
     * devid indexes the NHIBMAX-element arrays above: reject negative or
     * empty ranges as well as ranges that end beyond the arrays.
     */
    if (nhib < 1 || devidoff < 0) {
        fprintf(stderr,
                "invalid number of hibs(= %d) or devid offset(= %d).\n",
                nhib, devidoff);
        exit(1);
    }
    if (nhib > NHIBMAX - devidoff || hib_ndevice() < devidoff + nhib) {
        fprintf(stderr,
                "too large number of hibs(= %d).\n",
                devidoff + nhib);
        exit(1);
    }

    /* sendfunc is validated here, before any device is opened */
    switch (sendfunc) {
      case SENDFUNC_DMAR:
        msg = "DMA read";
        break;
      case SENDFUNC_PIOW:
        msg = "PIO write";
        break;
      default:
        fprintf(stderr, "invalid sendfunc: %d. abort.\n", sendfunc);
        exit(1);
    }
    fprintf(stderr, "\n# %s (host -> HIB) using %d hibs simultaneously.\n",
            msg, nhib);

    for (devid = devidoff; devid < nhib + devidoff; devid++) {
        h[devid] = hib_openMC(devid);
        hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_RAM);

        if (sendfunc == SENDFUNC_DMAR) {
            rbuf[devid] = h[devid]->dmar_buf;
        }
        else {
            hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
            rbuf[devid] = piowbuf2[devid];
        }
        for (i = 0; i < (h[devid]->r->dmabuf_bytes>>3); i++) {
            rbuf[devid][i] = 0x1234567812345678ll;
        }
    }

    /* number of words sent per device in one measurement */
    nloop = 1e7;

    /* sizes grow by about 10% per step (truncated to an integer) */
    for (size = 32; size <= 1024*16+1; size = (int)(size * 1.1)) { // size in 8-byte word
        for (ntry = 0; ntry < 1; ntry++) {
            /* the two get_cputime() calls bracket the timed section; lt is used as its duration */
            get_cputime(&lt, &st);
            for (j = 0; j < nloop/size; j++) {
                switch (sendfunc) {
                  case SENDFUNC_DMAR:
                    for (devid = devidoff; devid < nhib + devidoff; devid++) {
                        hib_start_dmarMC(devid, size, rbuf[devid]);
                    }
                    for (devid = devidoff; devid < nhib + devidoff; devid++) {
                        hib_finish_dmarMC(devid);
                    }
                    break;
                  case SENDFUNC_PIOW:
                    for (devid = devidoff; devid < nhib + devidoff; devid++) {
                        hib_piowMC(devid, size, rbuf[devid]);
                    }
                    break;
                }
            }
            get_cputime(&lt, &st);
            printf("size: % 5d byte   %s : % 4.1f sec  % 7.2f MB/s\n",
                   size*8, msg, lt, nhib*sizeof(UINT64)*nloop/1e6/lt);
            fflush(stdout);
        }
    }

    for (devid = devidoff; devid < nhib + devidoff; devid++) {
        hib_set_test_modeMC(devid, TESTMODE_NONE);
        hib_closeMC(devid);
    }
}

/*
 * Report the bus type and clock of a device.
 *
 * *busmode is set to 0 (PCI), 1 (PCI-X) or 2 (PCIe).
 * Returns the bus frequency in MHz. For PCIe devices this is 125 or 250
 * depending on the link speed field; any other link speed terminates
 * the program.
 */
static double
get_pcibus_freqMC(int devid, Hib *h, int *busmode)
{
    /* XHIB bus clock, indexed by bits 29:28 of the PLDA core status register */
    static const double xhib_mhz[4] = { 33.0, 66.0, 100.0, 133.33 };
    double mhz;

    if (h->type != HIB_GRAPE7X) {
        /* PCIe PIPE interface: link speed is bits 19:16 of config register 0x90 */
        UINT32 speed = (hib_config_readMC(devid, 0x90) >> 16) & 0xf;

        if (speed == 1) {
            mhz = 125.0;
        }
        else if (speed == 2) {
            mhz = 250.0;
        }
        else {
            mhz = 0.0;
            fprintf(stderr, "illegal link speed. abort.\n");
            exit(1);
        }
        *busmode = 2;
        return mhz;
    }

    /* XHIB */
    {
        int csr = hib_config_readMC(devid, 0x44); /* PLDA Core Status Register at 0x44 */

        mhz = xhib_mhz[(csr >> 28) & 0x3];
        *busmode = (csr >> 30) & 0x1;
    }
    return mhz;
}

/*
 * Print link parameters (PCIe devices only) and a dump of the
 * configuration registers to stderr.
 *
 *   argv[2]  devid (optional); without it every device is shown.
 */
static void
showstatus(int argc, char **argv)
{
    Hib *hib;
    int first, count;
    int k, addr, devid;

    first = 0;
    count = hib_ndevice();
    if (2 < argc) {
        first = atoi(argv[2]);
        count = 1;
    }

    for (k = 0; k < count; k++) {
        devid = first + k;
        fprintf(stderr, "## hib%d:\n", devid);
        hib = hib_openMC(devid);

        if (hib->type != HIB_GRAPEDRG && hib->type != HIB_GRAPEDRP) {
            fprintf(stderr, "protocol : PCI-X\n");
            fprintf(stderr, "\n");
        }
        else {
            UINT32 field;

            fprintf(stderr, "protocol : PCIe\n");

            /* link width: bits 25:20 of 0x90 (negotiated), bits 9:4 of 0x8c (supported) */
            field = (hib_config_readMC(devid, 0x90) >> 20) & 0x3f;
            fprintf(stderr, "link width negotiated : x%d\n", field);
            field = (hib_config_readMC(devid, 0x8c) >> 4) & 0x3f;
            fprintf(stderr, "            supported : x%d\n", field);

            /* link speed, printed as a multiple of 2.5 Gb/s */
            field = (hib_config_readMC(devid, 0x90) >> 16) & 0xf;
            fprintf(stderr, "link speed negotiated : %3.1f Gb/s\n", 2.5 * field);
            field = hib_config_readMC(devid, 0x8c) & 0xf;
            fprintf(stderr, "           supported  : %3.1f Gb/s\n", 2.5 * field);

            /* payload size: a 3-bit code n stands for 128 << n bytes */
            field = (hib_config_readMC(devid, 0x88) >> 5) & 0x7;
            field = 128 << field;
            fprintf(stderr, "max payload size negotiated : %d byte\n", field);
            field = hib_config_readMC(devid, 0x84) & 0x7;
            field = 128 << field;
            fprintf(stderr, "                 supported  : %d byte\n", field);

            /* read request size, encoded the same way */
            field = (hib_config_readMC(devid, 0x88) >> 12) & 0x7;
            field = 128 << field;
            fprintf(stderr, "max read request size : %d byte\n", field);
            fprintf(stderr, "\n");
        }

        /* first 16 configuration dwords */
        fprintf(stderr, "configuration register:\n");
        for (addr = 0; addr < 16*4; addr += 4) {
            if (addr < 4*4 || 7*4 < addr) {
                fprintf(stderr, "0x%08x: 0x%08x\n",
                        addr, hib_config_readMC(devid, addr));
            }
            else {
                /* registers 0x10..0x1c are shown a second time with the low 4 bits cleared */
                fprintf(stderr, "0x%08x: 0x%08x 0x%08x\n",
                        addr, hib_config_readMC(devid, addr),
                        (hib_config_readMC(devid, addr)>>4)<<4);
            }
        }

        fprintf(stderr, "PCI Express Capability Register:\n");
        for (addr = 0x80; addr <= 0x90; addr += 4) {
            fprintf(stderr, "0x%08x: 0x%08x\n",
                    addr, hib_config_readMC(devid, addr));
        }

        hib_closeMC(devid);
    }
}

/*
 * Read one configuration register and print it to stderr.
 *
 *   argv[2]  register address (hexadecimal)
 *   argv[3]  devid (optional, default 0)
 */
static void
configread(int argc, char **argv)
{
    unsigned long int addr;
    int devid;

    if (argc < 3) {
        showusage(argc, argv);
        exit(1);
    }
    devid = (argc > 3) ? atoi(argv[3]) : 0;

    hib_openMC(devid);
    addr = strtoul(argv[2], (char**)NULL, 16);

    fprintf(stderr, "hib[%d] config 0x%08lx: 0x%08x\n",
            devid, addr, hib_config_readMC(devid, addr));

    hib_closeMC(devid);
}

/*
 * Write one configuration register.
 *
 *   argv[2]  register address (hexadecimal)
 *   argv[3]  value to write (hexadecimal)
 *   argv[4]  devid (optional, default 0)
 */
static void
configwrite(int argc, char **argv)
{
    unsigned long int addr, val;
    int devid;

    if (argc < 4) {
        showusage(argc, argv);
        exit(1);
    }
    devid = (argc > 4) ? atoi(argv[4]) : 0;

    hib_openMC(devid);
    addr = strtoul(argv[2], (char**)NULL, 16);
    val = strtoul(argv[3], (char**)NULL, 16);

    /* announce the access before performing it */
    fprintf(stderr, "write to hib[%d] config 0x%08lx value 0x%08lx\n",
            devid, addr, val);
    hib_config_writeMC(devid, addr, val);

    hib_closeMC(devid);
}

/*
 * Read one HIB local register and print it to stderr. When the address
 * is a multiple of 8, the 64-bit value at that address is printed too.
 *
 *   argv[2]  register address (hexadecimal)
 *   argv[3]  devid (optional, default 0)
 */
static void
regread(int argc, char **argv)
{
    unsigned long int addr;
    int devid;

    if (argc < 3) {
        showusage(argc, argv);
        exit(1);
    }
    devid = (argc > 3) ? atoi(argv[3]) : 0;
    addr = strtoul(argv[2], (char**)NULL, 16);

    hib_openMC(devid);

    fprintf(stderr, "hib[%d] 0x%08lx: 0x%08x\n",
            devid, addr, hib_mem_readMC(devid, addr));
    if ((addr % 8) == 0) {
        fprintf(stderr, "hib[%d] 0x%08lx: 0x%016llx (64bit)\n",
                devid, addr, hib_mem_read64MC(devid, addr));
    }

    hib_closeMC(devid);
}

/*
 * Write one HIB local register.
 *
 *   argv[2]  register address (hexadecimal)
 *   argv[3]  value to write (hexadecimal)
 *   argv[4]  devid (optional, default 0)
 */
static void
regwrite(int argc, char **argv)
{
    unsigned long int addr, val;
    int devid;

    if (argc < 4) {
        showusage(argc, argv);
        exit(1);
    }
    devid = (argc > 4) ? atoi(argv[4]) : 0;

    hib_openMC(devid);
    addr = strtoul(argv[2], (char**)NULL, 16);
    val = strtoul(argv[3], (char**)NULL, 16);

    /* announce the access before performing it */
    fprintf(stderr, "write to hib[%d] 0x%08lx value 0x%08lx\n",
            devid, addr, val);
    hib_mem_writeMC(devid, addr, val);

    hib_closeMC(devid);
}

/*
 * Stop DMA on one device or on all devices: HIB_GRAPEDRG boards get the
 * abort bit written to both DMA command registers, every other board
 * type gets the DMA reset bit written to its command register.
 *
 *   argv[2]  devid (optional); without it every device is handled.
 */
static void
stopdmaw(int argc, char **argv)
{
    Hib *hib;
    int first = 0;
    int count = hib_ndevice();
    int k, devid;

    if (2 < argc) {
        first = atoi(argv[2]);
        count = 1;
    }
    fprintf(stderr, "hib0: %d    nhib: %d\n", first, count);

    fprintf(stderr, "stop DMA channel operation (host <- HIB)");
    for (k = 0; k < count; k++) {
        devid = first + k;
        hib = hib_openMC(devid);
        fprintf(stderr, "\n#\n# stop hib[%d]\n#\n", devid);

        if (hib->type != HIB_GRAPEDRG) {
            hib_mem_writeMC(devid, hib->r->command, (1 << hib->r->command_dma_reset_bit));
        }
        else {
            hib_mem_writeMC(devid, hib->r->dma0cmd, (1 << hib->r->dma0cmd_abort_bit));
            hib_mem_writeMC(devid, hib->r->dma1cmd, (1 << hib->r->dma1cmd_abort_bit));
        }

        hib_closeMC(devid);
    }
}

/*
 * Return the data count held in the dma1misc register of a device,
 * expressed in bytes. An unknown board type terminates the program
 * with exit status 2.
 */
static int
getdma1datacnt(int devid, Hib *h)
{
    switch (h->type) {
      case HIB_GRAPE7X:
        /* the PCI-X core counts in 64-bit words: convert to bytes */
        return 8 * (hib_mem_readMC(devid, h->r->dma1misc) & 0x7ff);

      case HIB_GRAPEDRP:
      case HIB_GRAPEDRG:
        /* the PCIe core already counts in bytes */
        return (hib_mem_readMC(devid, h->r->dma1misc) & 0x1fff);

      default:
        break;
    }

    fprintf(stderr, "getdma1datacnt: unknown Hib type: %d\n", h->type);
    exit(2);
    return 0; /* not reached */
}

/*
 * Drain the HIB-internal FIFO of one device or of all devices: as long
 * as getdma1datacnt() reports pending data, receive one word at a time.
 *
 *   argv[2]  devid (optional); without it every device is handled.
 */
static void
clearfifo(int argc, char **argv)
{
    Hib *hib;
    UINT64 *sink;
    int first = 0;
    int count = hib_ndevice();
    int k, devid, pending;

    if (2 < argc) {
        first = atoi(argv[2]);
        count = 1;
    }
    fprintf(stderr, "hib0: %d    nhib: %d\n", first, count);

    fprintf(stderr, "clear HIB-internal FIFO \n");
    for (k = 0; k < count; k++) {
        devid = first + k;
        hib = hib_openMC(devid);
        hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_FIFO);
        sink = hib->dmaw_buf;

        fprintf(stderr, "\n#\n# clear hib[%d] FIFO ", devid);
        for (;;) {
            pending = getdma1datacnt(devid, hib);
            if (pending == 0) {
                break;
            }
            fprintf(stderr, "dma1 datacnt: %d byte(s)\n", pending);
            /* the pause gets shorter the more data is pending */
            usleep(3000000/pending);
            hib_recvMC(devid, 1, sink);
        }

        fprintf(stderr, "... done.\n#\n");

        /* short pause before leaving test mode, so that a PIOW burst can complete */
        usleep(10);
        hib_set_test_modeMC(devid, TESTMODE_NONE);
        hib_closeMC(devid);
    }
}

/*
 * Read the two write-lock flags of a device into wlock[0] and wlock[1]
 * (each 0 or 1). Where the flags live depends on the board type; an
 * unknown type terminates the program with exit status 2.
 */
static void
getwlock(int devid, Hib *h, int wlock[2])
{
    if (h->type == HIB_GRAPE7X) {
        /* both flags are in dma0misc, bits 25 and 26 */
        wlock[0] = ((hib_mem_readMC(devid, h->r->dma0misc) >> 25)&1);
        wlock[1] = ((hib_mem_readMC(devid, h->r->dma0misc) >> 26)&1);
    }
    else if (h->type == HIB_GRAPEDRP) {
        /* bit 14 of dma0misc and of dma1misc */
        wlock[0] = ((hib_mem_readMC(devid, h->r->dma0misc) >> 14)&1);
        wlock[1] = ((hib_mem_readMC(devid, h->r->dma1misc) >> 14)&1);
    }
    else if (h->type == HIB_GRAPEDRG) {
        /* both flags come from a single read of the command register */
        int cmd = hib_mem_readMC(devid, h->r->command);

        wlock[0] = ((cmd >> h->r->command_sram0_wlock_bit ) & 1);
        wlock[1] = ((cmd >> h->r->command_sram1_wlock_bit ) & 1);
    }
    else {
        fprintf(stderr, "getwlock: unknown Hib type: %d\n", h->type);
        exit(2);
    }
}

/*
 * Print the PIO-write, DMA0 and DMA1 status of one device (argv[2]) or,
 * when no device id is given, of every device reported by hib_ndevice().
 *
 * All values are read from the device registers at the moment they are
 * printed; the output goes to stderr.
 */
static void
showdmastatus(int argc, char **argv)
{
    int first = 0;
    int count = hib_ndevice();
    int k;

    if (argc > 2) {
        first = atoi(argv[2]);
        count = 1;
    }
    fprintf(stderr, "hib0: %d    nhib: %d\n", first, count);

    fprintf(stderr, "show DMA status \n");
    for (k = 0; k < count; k++) {
        int devid = first + k;
        Hib *h = hib_openMC(devid);
        int wlock[2];

        getwlock(devid, h, wlock);
        fprintf(stderr, "\n#\n# hib[%d]\n#\n", devid);

        /* PIO write section. */
        fprintf(stderr, "PIO write (host->HIB)\n");
        fprintf(stderr, "    swap_sram: %d\n",
                (hib_mem_readMC(devid, h->r->dma0misc)>>(h->r->dma0misc_swap_sram_bit))&1);
        fprintf(stderr, "    sram0_wlock: %d\n", wlock[0]);
        fprintf(stderr, "    sram1_wlock: %d\n", wlock[1]);
        /* the write counter sits in the command register on GRAPEDRG only. */
        if (h->type != HIB_GRAPEDRG) {
            fprintf(stderr, "    sram_wcnt: %d\n",
                    (hib_mem_readMC(devid, h->r->dma0misc)>>(h->r->dma0misc_sram_wcnt_bit))&0x1ff);
        }
        else {
            fprintf(stderr, "    sram_wcnt: %d\n",
                    (hib_mem_readMC(devid, h->r->command)>>(h->r->command_sram_wcnt_bit))&0x1ff);
        }
        fprintf(stderr, "\n");

        /* DMA0 section. */
        fprintf(stderr, "DMA0 (host->HIB)\n");
        fprintf(stderr, "    dma0_done: %d\n",
                (hib_mem_readMC(devid, h->r->dma0misc) >> (h->r->dma0misc_done_bit))&1);
        fprintf(stderr, "    data to be transferred:    0x%x byte\n",
                hib_mem_readMC(devid, h->r->dma0size));
        fprintf(stderr, "    command & status register: 0x%08x\n",
                hib_mem_readMC(devid, h->r->dma0cmd));
        fprintf(stderr, "\n");

        /* DMA1 section. */
        fprintf(stderr, "DMA1 (host<-HIB)\n");
        fprintf(stderr, "    dma1_done: %d\n",
                (hib_mem_readMC(devid, h->r->dma1misc) >> (h->r->dma1misc_done_bit))&1);
        fprintf(stderr, "    m_dma1_datacnt:            0x%x byte\n",
                getdma1datacnt(devid, h));
        fprintf(stderr, "    data to be transferred:    0x%x byte\n",
                hib_mem_readMC(devid, h->r->dma1size));
        fprintf(stderr, "    command & status register: 0x%08x\n",
                hib_mem_readMC(devid, h->r->dma1cmd));
        if (h->type == HIB_GRAPEDRG) {
            /* bits 31 and 30 of the testmode register flag tx buffer trouble. */
            UINT64 testmode = hib_mem_readMC(devid, h->r->testmode);

            if (testmode>>31 & (UINT64)0x1) {
                fprintf(stderr, "tx buf overflown.\n");
            }
            if (testmode>>30 & (UINT64)0x1) {
                fprintf(stderr, "tx buf underflown.\n");
            }
        }
        fprintf(stderr, "\n");
        hib_closeMC(devid);
    }
}

/*
 * Read one 64-bit word from the backend address space by PIO and print it.
 *
 *   argv[2]  byte address in hex; it is rounded down to a multiple of 8.
 *   argv[3]  device id (optional, defaults to 0).
 *
 * The word is fetched as h->backend[address / 8]. No range check is made
 * on the address.
 */
static void
pioread(int argc, char **argv)
{
    Hib *h;
    int devid = 0;
    unsigned long int baddr, waddr;

    if (argc < 3) {
        showusage(argc, argv);
        exit(1);
    }
    if (argc > 3) {
        devid = atoi(argv[3]);
    }
    h = hib_openMC(devid);
    /* strtoul matches the type of baddr (and what piowrite does). */
    baddr = strtoul(argv[2], (char**)NULL, 16);

    /* convert to a word index, then back to the aligned byte address. */
    waddr = baddr >> 3;
    baddr = waddr << 3;
    /* cast so that the argument matches %llx whatever UINT64 is typedef'ed to. */
    fprintf(stderr, "backend[%d] 0x%08lx: 0x%016llx\n",
            devid, baddr, (unsigned long long)h->backend[waddr]);

    hib_closeMC(devid);
}

/*
 * Write one 64-bit word to the backend address space by PIO.
 *
 *   argv[2]  byte address in hex; it is rounded down to a multiple of 8.
 *   argv[3]  value to write, in hex.
 *   argv[4]  device id (optional, defaults to 0).
 *
 * The word is stored as h->backend[address / 8]. No range check is made
 * on the address.
 */
static void
piowrite(int argc, char **argv)
{
    Hib *h;
    int devid = 0;
    unsigned long int baddr, waddr;
    UINT64 val;

    if (argc < 4) {
        showusage(argc, argv);
        exit(1);
    }
    if (argc > 4) {
        devid = atoi(argv[4]);
    }
    h = hib_openMC(devid);
    baddr = strtoul(argv[2], (char**)NULL, 16);
    val = strtoull(argv[3], (char**)NULL, 16);

    /* convert to a word index, then back to the aligned byte address. */
    waddr = baddr >> 3;
    baddr = waddr << 3;
    /* cast so that the argument matches %llx whatever UINT64 is typedef'ed to. */
    fprintf(stderr, "write to backend[%d] 0x%08lx value 0x%016llx\n",
            devid, baddr, (unsigned long long)val);
    h->backend[waddr] = val;
    hib_closeMC(devid);
}

#define MEGA (1e6)
/*
 * Measure raw transfer speed between the host and device 0.
 *
 * Three series are printed to stdout: "DMA read" (hib_sendMC() with the
 * DMA read buffer), "DMA write" (hib_recvMC() with the DMA write buffer)
 * and "PIO write" (hib_sendMC() after SENDFUNC_PIOW has been selected).
 * In each series the transfer size grows geometrically (factor 'ratio')
 * from 2 up to 16k 64-bit words, and for each size about nloop/size
 * transfers are timed with get_cputime().
 *
 * The MB/s figure is nloop words divided by the elapsed time.
 */
static void
rawperf(int argc, char **argv)
{
    Hib *h;
    UINT64 *wbuf, *rbuf;
    int devid = 0;
    int i, j; /* i is used only by the disabled PIO read section below. */
    int size; /* in 64-bit words */
    double sized, ratio, nloop;
    double lt = 0.0, st = 0.0;

    if (argc < 2) {
        showusage(argc, argv);
        exit(1);
    }

    h = hib_openMC(devid);
    hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_RAM);
    rbuf = h->dmar_buf;
    wbuf = h->dmaw_buf;
    ratio = 1.01;

#if 1
    // DMA read
    nloop = 1e5;
    printf("\n#\n# DMA read\n#\n");
    for (sized = size = 2; size < 1024*16+1; sized *= ratio, size = sized) {
        get_cputime(&lt, &st);
        for (j = 0; j < nloop/size; j++) {
            hib_sendMC(devid, size, rbuf);
        }
        get_cputime(&lt, &st);
        // size*sizeof() is a size_t; cast it to what %ld expects.
        printf("%ld byte    %f sec    %f MB/s\n",
               (long)(size*sizeof(UINT64)), lt, nloop*sizeof(UINT64)/MEGA/lt);
        fflush(stdout);
    }

    // DMA write
    nloop = 1e5;
    printf("\n#\n# DMA write\n#\n");
    for (sized = size = 2; size < 1024*16+1; sized *= ratio, size = sized) {
        get_cputime(&lt, &st);
        for (j = 0; j < nloop/size; j++) {
            hib_recvMC(devid, size, wbuf);
        }
        get_cputime(&lt, &st);
        printf("%ld byte    %f sec    %f MB/s\n",
               (long)(size*sizeof(UINT64)), lt, nloop*sizeof(UINT64)/MEGA/lt);
        fflush(stdout);
    }
#endif

    // PIO write
    nloop = 1e5;

    hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_RAM);
    hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
    printf("\n#\n# PIO write\n#\n");

    for (sized = size = 2; size < 1024*16+1; sized *= ratio, size = sized) {
        get_cputime(&lt, &st);
        for (j = 0; j < nloop/size; j++) {
            hib_sendMC(devid, size, piowbuf);
        }
        get_cputime(&lt, &st);
        printf("%ld byte    %f sec    %f MB/s\n",
               (long)(size*sizeof(UINT64)), lt, nloop*sizeof(UINT64)/MEGA/lt);
        fflush(stdout);
    }

#if 0 // never turn this flag on for PCIe.
    // PIO read
    nloop = 1e4;

    printf("\n#\n# PIO read\n#\n");
    for (sized = size = 2; size < 1024+1; sized *= ratio, size = sized) {
        get_cputime(&lt, &st);
        for (j = 0; j < nloop/size; j++) {
            for (i = 0; i < size; i++) {
                piowbuf[i] = h->backend[i];
            }
            // _mm_mfence();
        }
        get_cputime(&lt, &st);
        printf("%ld byte    %f sec    %f MB/s\n",
               (long)(size*sizeof(UINT64)), lt, nloop*sizeof(UINT64)/MEGA/lt);
        fflush(stdout);
    }
#endif


    hib_set_test_modeMC(devid, TESTMODE_NONE);
    hib_closeMC(devid);
}

/*
 * Erase the configuration ROM of a device.
 *
 *   argv[2]  device id (optional when only one device is present).
 *
 * The backend-reset bit is written to the command register, the
 * asmi_cmd_eb command is written to the ASMI register, and that
 * register is then polled every 3 seconds until its busy bit clears.
 * If the error bit is set afterwards the program exits with status 1.
 */
static void
eraserom(int argc, char **argv)
{
    Hib *h;
    int devid = 0;
    volatile UINT32 status;
    UINT32 cmd;

    /* with several devices installed the target must be named explicitly. */
    if (hib_ndevice() > 1 && argc < 3) {
        showusage(argc, argv);
        exit (1);
    }
    if (argc > 2) {
        devid = atoi(argv[2]);
    }

    h = hib_openMC(devid);
    hib_mem_writeMC(devid, h->r->command, (1<<h->r->command_reset_backend_bit));
    fprintf(stderr, "\nerasing configuration ROM of hib[%d].\n"
            "this would take a few minutes. be patient.", devid);

    cmd = h->r->asmi_cmd_eb;
    hib_mem_writeMC(devid, h->r->asmi, cmd);

    /* print one dot per poll while the erase is in progress. */
    for (;;) {
        sleep(3);
        status = hib_mem_readMC(devid, h->r->asmi);
        fprintf(stderr, ".");
        if (!(status & (1 << h->r->asmi_busy_bit))) {
            break;
        }
    }

    if (status & (1 << h->r->asmi_error_bit)) {
        fprintf(stderr, "failed for unknown reason.\n");
        exit(1);
    }

    fprintf(stderr, "done.\n\n");
    hib_closeMC(devid);
}

/*
 * Return 'in' with the order of its low 8 bits reversed
 * (bit 0 <-> bit 7, bit 1 <-> bit 6, ...).
 */
static char
reverse_byte(char in)
{
    unsigned char src = (unsigned char)in;
    unsigned char dst = 0;
    int bit;

    /* shift the bits out of src LSB first and into dst from the right. */
    for (bit = 0; bit < 8; bit++) {
        dst = (unsigned char)((dst << 1) | (src & 1u));
        src >>= 1;
    }

    return (char)dst;
}

/*
 * Write a file to the configuration ROM of a device.
 *
 *   argv[2]  name of the file to write.
 *   argv[3]  device id (optional when only one device is present).
 *
 * The file is read into memory (at most 2^24 - 1 bytes; anything beyond
 * that is not read) and sent to the ROM in pages of up to 256 bytes:
 * each byte of a page is written to the ASMI register together with the
 * asmi_cmd_wp command, then the page start address and length are
 * written, and the busy bit is polled until it clears.
 * Any failure terminates the program with exit status 1.
 */
static void
writerom(int argc, char **argv)
{
    Hib *h;
    unsigned int i, addr, size, pagesize, sizemax;
    int devid = 0;
    FILE *fp;
    const char *poffile;
    char *pofdata;
    volatile UINT32 asmi;

    if (argc < 3 || (hib_ndevice() > 1 && argc < 4)) {
        showusage(argc, argv);
        exit (1);
    }
    /*
     * use the argument in place. copying it into a fixed-size buffer
     * with strncpy() would leave a long name without its terminator.
     */
    poffile = argv[2];
    if (argc > 3) {
        devid = atoi(argv[3]);
    }

    /* the image is binary data; "rb" keeps Windows from translating it. */
    fp = fopen(poffile, "rb");
    if (!fp) {
        perror("writerom");
        exit(1);
    }
    sizemax = (1<<24)-1; // just under 16Mbyte
    pofdata = (char *)malloc(sizeof(char) * sizemax);
    if (!pofdata) {
        perror("writerom");
        fclose(fp);
        exit(1);
    }
    size = fread(pofdata, sizeof(char), sizemax, fp);
    fclose(fp);
    fprintf(stderr, "read %u byte in %s\n", size, poffile);

    h = hib_openMC(devid);
    hib_mem_writeMC(devid, h->r->command, (1<<h->r->command_reset_backend_bit));
    fprintf(stderr, "\nwriting %s to configuration ROM of hib[%d]...\n\n",
            poffile, devid);

    pagesize = 256;
    for (addr = 0; addr < size; addr += pagesize) {
        UINT32 var;

        // the last page may be shorter than a full one.
        if (pagesize + addr > size) {
            pagesize = size - addr;
        }

        // stage1: fill up ASMI-megafunction-internal fifo.
        for (i = 0; i < pagesize; i++) {
            var = (h->r->asmi_cmd_wp | (0xff & pofdata[addr + i])); // don't reverse the bit order. the hardware handles it.
            hib_mem_writeMC(devid, h->r->asmi, var);

#if 1 // just to wait for a moment in order to make sure the written byte is sent to the fifo
            do {
                asmi = hib_mem_readMC(devid, h->r->asmi);
            }
            while (asmi & (1 << h->r->asmi_busy_bit));
#endif

        }

        // stage2: flush the fifo to the ROM.
        var = (addr << 8) | (0xff & pagesize);
        hib_mem_writeMC(devid, h->r->asmi, var); // writing start address will start flushing automatically.
        do {
            asmi = hib_mem_readMC(devid, h->r->asmi);
        }
        while (asmi & (1 << h->r->asmi_busy_bit));

        if (asmi & (1 << h->r->asmi_error_bit)) {
            fprintf(stderr, "failed for unknown reason.\n");
            exit(1);
        }

        // progress report every 256kbyte.
        if (addr % (1<<18) == 0) {
            fprintf(stderr, "%5.1f MB (%2u%%) done\n",
                    (double)(addr)/(1<<20), 100*addr/size);
        }
    }
    fprintf(stderr, "done.\n\n");
    hib_closeMC(devid);
    free(pofdata);
}

/*
 * Read and print the id of the configuration ROM of a device.
 *
 *   argv[2]  device id (optional, defaults to 0).
 *
 * The asmi_cmd_rsid command is written to the ASMI register, the busy
 * bit is polled until it clears, and bits 15:8 of the register are
 * taken as the id. The raw register value, the id and the matching ROM
 * name ("unknown" if the id is not in the table) are printed to stdout.
 */
static void
readromid(int argc, char **argv)
{
    static const struct {
        UINT32 id;
        const char *name;
    } known[] = {
        { 0x10, "EPCS1" },
        { 0x12, "EPCS4" },
        { 0x14, "EPCS16" },
        { 0x16, "EPCS64" },
    };
    Hib *h;
    int devid = 0;
    const char *romname = "unknown";
    volatile UINT32 asmi, id;
    UINT32 cmd, code;
    unsigned int k;

    if (argc > 2) {
        devid = atoi(argv[2]);
    }
    h = hib_openMC(devid);
    hib_mem_writeMC(devid, h->r->command, (1<<h->r->command_reset_backend_bit));
    fprintf(stderr, "\nreading configuration ROM id of hib[%d]...", devid);

    cmd = h->r->asmi_cmd_rsid;
    hib_mem_writeMC(devid, h->r->asmi, cmd);
    for (;;) {
        asmi = hib_mem_readMC(devid, h->r->asmi);
        if (!(asmi & (1 << h->r->asmi_busy_bit))) {
            break;
        }
    }
    /* the register is read once more after the busy bit has cleared. */
    asmi = hib_mem_readMC(devid, h->r->asmi);
    id = ((asmi >> 8) & 0xff);

    fprintf(stderr, "done.\n\n");
    printf("asmi: 0x%08x\n", asmi);
    hib_closeMC(devid);

    /* translate the id into a ROM name. */
    code = id;
    for (k = 0; k < sizeof(known) / sizeof(known[0]); k++) {
        if (known[k].id == code) {
            romname = known[k].name;
            break;
        }
    }
    printf("id:0x%02x  rom:%s\n", id, romname);
}

#if 0
  -- PLL Reconfiguration Control Register (40h)
  -- 
  --         write             read
  -- (32:25) not used
  -- (24:16) datain
  -- (15:12) not used
  -- (11: 8) counter_type
  -- ( 7: 6) not used
  -- ( 5: 3) counter_param
  -- (2)     reconfig
  -- (1)     read_param
  -- (0)     write_param       busy

  type   val
  N      0x0
  M      0x1
  CP/LF  0x2
  VCO    0x3
  C0     0x4
  C1     0x5
  C2     0x6
  C3     0x7
  C4     0x8
  C5     0x9
  C6     0xA
  C7     0xB
  C8     0xC
  C9     0xD

  param          val    width

  C0-9,M,N counters:
  high_count     0x0    8
  low_count      0x1    8
  //  bypass_M&N    0x2    1 // NG???
  bypass_M&N     0x4    1 // OK???
  bypass_counter 0x4    1
  odd_division   0x5    1

  CP/LF:
  CP current     0x0    3
  CP unused      0x5    5
  LF resistor    0x1    5
  LF capacitance 0x2    2
  LF unused      0x4    1

  VCO:
  vco_post_scale 0x0    1

#endif

// Write one parameter to the PLL scan cache.
//
// 'data', 'type' and 'param' are packed into bits 16.., 8.. and 3.. of
// the request word, together with the write bit. The request is written
// to the pllconf register, which is then polled until its busy bit
// clears. One '.' is printed to stderr per parameter written.
static void
writepllparam(int devid, Hib *h, int type, int param, int data)
{
    UINT32 status;
    UINT32 request;

    request = (data << 16) | (type << 8) | (param << 3) | (1 << h->r->pllconf_write_bit);
    hib_mem_writeMC(devid, h->r->pllconf, request);

    for (;;) {
        status = hib_mem_readMC(devid, h->r->pllconf);
        if ((status & (1 << h->r->pllconf_busy_bit)) == 0) {
            break;
        }
    }

    fprintf(stderr, ".");
    // one more read of the register after the wait; its value is not used.
    status = hib_mem_readMC(devid, h->r->pllconf);
}

/*
 * Reprogram the PLL of a device with the multiplier M and the divisor N.
 *
 *   argv[2]  M, in the range 1..255.
 *   argv[3]  N, in the range 1..255.
 *   argv[4]  device id (optional when only one device is present).
 *
 * M and N are written to the PLL scan cache with writepllparam() (each
 * value is used for both the high and the low count of its counter, and
 * counter C0 is bypassed), and then the reconfig bit is written to the
 * pllconf register, which is polled until its busy bit clears.
 * The resulting frequency printed beforehand is freq * M / N, where
 * freq is the value returned by get_pcibus_freqMC().
 */
static void
writepllconf(int argc, char **argv)
{
    Hib *h;
    int devid = 0, m = 1, n = 1;
    UINT32 pllconf;
    int busmode;
    double freq;
    UINT32 val;

    if (argc < 4 || (hib_ndevice() > 1 && argc < 5)) {
        showusage(argc, argv);
        exit (1);
    }
    if (argc > 4) {
        devid = atoi(argv[4]);
    }
    m = atoi(argv[2]);
    n = atoi(argv[3]);

    if (n < 1 || 255 < n) {
        fprintf(stderr, "N out of range. it should be in the range of 1..255.\n");
        exit(1);
    }
    if (m < 1 || 255 < m) {
        fprintf(stderr, "M out of range. it should be in the range of 1..255.\n");
        exit(1);
    }

    h = hib_openMC(devid);

    // the formula shown in parentheses must agree with the value
    // printed after it, i.e. freq * m / n.
    freq = get_pcibus_freqMC(devid, h, &busmode);
    switch (busmode) {
      case 0:
      case 1:
        fprintf(stderr, "\nset hib[%d] pipeline clock frequency to PCI-X_bus_freq * %d / %d "
                "(%5.2fMHz * %d / %d = %5.2fMHz).",
                devid, m, n, freq, m, n, freq * m / n);
        break;
      default:
        fprintf(stderr, "\nset hib[%d] pipeline clock frequency to PIPE_freq * %d / %d."
                "(%5.2fMHz * %d / %d = %5.2fMHz).",
                devid, m, n, freq, m, n, freq * m / n);
        break;
    }

#if 0
    // type, param, data
    writepllparam(devid, h, 0x1, 0x4, 0x0); // M bypass: 0
    writepllparam(devid, h, 0x1, 0x0, m); // M high_count: 
    writepllparam(devid, h, 0x1, 0x1, m); // M low_count: 

    writepllparam(devid, h, 0x0, 0x4, 0x0); // N bypass: 0
    writepllparam(devid, h, 0x0, 0x0, n); // N high_count: 
    writepllparam(devid, h, 0x0, 0x1, n); // N low_count: 

    writepllparam(devid, h, 0x4, 0x4, 0x1); // C0 bypass: 1

    /*
    writepllparam(devid, h, 0x4, 0x4, 0x0); // C0 bypass: 0
    writepllparam(devid, h, 0x4, 0x0, n); // C0 high_count: 
    writepllparam(devid, h, 0x4, 0x1, n); // C0 low_count: 
    */
#elif 1
    // type, param, data
    //    writepllparam(devid, h, 0x2, 0x1, 0x18); // LF resistor: 24 !!!
    //    writepllparam(devid, h, 0x2, 0x1, 0x1c); // LF resistor: 28 !!!

    writepllparam(devid, h, 0x1, 0x4, 0x0); // M bypass: 0
    writepllparam(devid, h, 0x1, 0x0, m); // M high_count: 
    writepllparam(devid, h, 0x1, 0x1, m); // M low_count: 

    writepllparam(devid, h, 0x0, 0x4, 0x0); // N bypass: 0
    writepllparam(devid, h, 0x0, 0x0, n); // N high_count: 
    writepllparam(devid, h, 0x0, 0x1, n); // N low_count: 

    writepllparam(devid, h, 0x4, 0x4, 0x1); // C0 bypass: 1
#else
    writepllparam(devid, h, 0x0, 0x2, 0x1); // M bypass: 0
    writepllparam(devid, h, 0x4, 0x4, 0x1); // C0 bypass: 0
    writepllparam(devid, h, 0x5, 0x4, 0x1); // C1 bypass: 0
#endif

    // reconfigure the pll.
    val = 0x1 << h->r->pllconf_reconfig_bit;
    hib_mem_writeMC(devid, h->r->pllconf, val);
    do {
        pllconf = hib_mem_readMC(devid, h->r->pllconf);
    }
    while (pllconf & (1 << h->r->pllconf_busy_bit));

    fprintf(stderr, "done.\n\n");
    hib_closeMC(devid);
}

/*
 * Trigger a reconfiguration of a device by writing to its rupdate
 * register.
 *
 *   argv[2]  device id (optional, defaults to 0).
 */
static void
reconfdevice(int argc, char **argv)
{
    int devid = (argc > 2) ? atoi(argv[2]) : 0;
    Hib *h;

    fprintf(stderr, "reconfigure device[%d].\n", devid);

    h = hib_openMC(devid);
    /* the value written does not matter; the write itself is the trigger. */
    hib_mem_writeMC(devid, h->r->rupdate, 1);
    hib_closeMC(devid);

    fprintf(stderr, "done.\n\n");
}

/*
 * Reset the backend of one device (argv[2]) or, when no device id is
 * given, of every device reported by hib_ndevice(), by writing the
 * reset-backend bit to the command register.
 */
static void
resetbackend(int argc, char **argv)
{
    int first = 0;
    int count = hib_ndevice();
    int devid;

    if (argc > 2) {
        first = atoi(argv[2]);
        count = 1;
    }
    fprintf(stderr, "hib0: %d    nhib: %d\n", first, count);

    fprintf(stderr, "\nreset backend\n\n");
    for (devid = first; devid < first + count; devid++) {
        Hib *h = hib_openMC(devid);

        fprintf(stderr, "\n#\n# reset hib[%d] backend ... ", devid);
        hib_mem_writeMC(devid, h->r->command, (1<<h->r->command_reset_backend_bit));
        fprintf(stderr, "done.\n\n");
        hib_closeMC(devid);
    }
}


/*
 * Read from the I/O space mapped by BAR3 of a device and print the
 * result to stderr. Does nothing when built for _WIN32.
 *
 *   argv[2]  word size: 'b' (1 byte), 'w' (2 bytes) or 'l' (4 bytes).
 *   argv[3]  offset from the BAR3 base, in hex.
 *   argv[4]  number of words to read (optional, 1..256, defaults to 1).
 *   argv[5]  device id (optional, defaults to 0).
 *
 * A single word is read with inb/inw/inl, several words with
 * insb/insw/insl. The BAR3 base is taken from config register 0x1c.
 * iopl(3) must succeed, otherwise the program exits with status 1.
 */
static void
ioread(int argc, char **argv)
{
#if !defined(_WIN32)
    Hib *h;
    const char *wordtype;
    int devid = 0, nword = 1, i;
    unsigned long int bar, addr, val;
    UINT8 bufb[256];
    UINT16 bufw[256];
    UINT32 bufl[256];
    const int nwordmax = (int)(sizeof(bufb) / sizeof(bufb[0]));

    // argv 1      2       3      4            5
    //      ioread <b|w|l> <addr> [# of words] [devid]",

    if (argc < 4) {
        showusage(argc, argv);
        exit(1);
    }
    addr = strtoull(argv[3], (char**)NULL, 16);
    if (argc > 4) {
        nword = atoi(argv[4]);
        // the receive buffers below hold nwordmax words each.
        if (nword <= 0 || nwordmax < nword) {
            fprintf(stderr, "invalid # of words : %d (should be in a range [1, %d])\n",
                    nword, nwordmax);
            exit(1);
        }
    }
    if (argc > 5) {
        devid = atoi(argv[5]);
    }

    h = hib_openMC(devid);
    bar = hib_config_readMC(devid, 0x1c); // bar3 is mapped to an I/O space.
    bar &= ~((UINT32)3);
    fprintf(stderr, "hib[%d] BAR3 I/O space : 0x%08lx\n", devid, bar);
    hib_closeMC(devid);

    addr += bar;

    if (iopl(3)) {
        perror("iopl() failed.");
        exit(1);
    }

    if (nword == 1) { // read a single word.
        switch (argv[2][0]) {
          case 'b':
            wordtype = "byte";
            val = inb(addr);
            break;
          case 'w':
            wordtype = "word";
            val = inw(addr);
            break;
          case 'l':
            wordtype = "long_word";
            val = inl(addr);
            break;
          default :
            fprintf(stderr, "word size should be one of 'b', 'w', or 'l'.\n\n");
            showusage(argc, argv);
            exit(1);
        }
        // val is an unsigned long; widen it to match %llx.
        fprintf(stderr, "I/O space read %s 0x%08lx: 0x%016llx\n",
                wordtype, addr, (unsigned long long)val);
    }
    else { // read multiple words.
        switch (argv[2][0]) {
          case 'b':
            wordtype = "byte";
            insb(addr, (void*)bufb, nword);
            break;
          case 'w':
            wordtype = "word";
            insw(addr, (void*)bufw, nword);
            break;
          case 'l':
            wordtype = "long_word";
            insl(addr, (void*)bufl, nword);
            break;
          default :
            fprintf(stderr, "word size should be one of 'b', 'w', or 'l'.\n\n");
            showusage(argc, argv);
            exit(1);
        }
        fprintf(stderr, "I/O space read %s addr : 0x%08lx\n", wordtype, addr);
        for (i = 0; i < nword; i++) {
            // widen explicitly: the buffer elements would otherwise be
            // passed as int / unsigned int to a %llx conversion.
            unsigned long long word = 0;

            switch (argv[2][0]) {
              case 'b':
                word = bufb[i];
                break;
              case 'w':
                word = bufw[i];
                break;
              case 'l':
                word = bufl[i];
                break;
            }
            fprintf(stderr, " 0x%02x: 0x%08llx\n", (unsigned int)i, word);
        }
    }
#endif // _WIN32
}

/*
 * Write one word to the I/O space mapped by BAR3 of a device.
 * Does nothing when built for _WIN32.
 *
 *   argv[2]  word size: 'b' (1 byte), 'w' (2 bytes) or 'l' (4 bytes).
 *   argv[3]  offset from the BAR3 base, in hex, in the range [0, 0xff].
 *   argv[4]  value to write, in hex.
 *   argv[5]  number of words; accepted on the command line but not
 *            used: exactly one word is written.
 *   argv[6]  device id (optional, defaults to 0).
 *
 * The BAR3 base is taken from config register 0x1c.
 * iopl(3) must succeed, otherwise the program exits with status 1.
 */
static void
iowrite(int argc, char **argv)
{
#if !defined(_WIN32)
    Hib *h;
    const char *wordtype;
    int devid = 0;
    unsigned long int bar, addr, val;

    // argv 1       2       3      4     5            6
    //      iowrite <b|w|l> <addr> <val> [# of words] [devid]",

    if (argc < 5) {
        showusage(argc, argv);
        exit(1);
    }
    addr = strtoull(argv[3], (char**)NULL, 16);
    // addr is unsigned, so only the upper bound needs to be checked.
    if (addr > 0xff) {
        fprintf(stderr, "I/O port address should be in a range [0, 0xff]\n");
        exit(1);
    }
    val = strtoull(argv[4], (char**)NULL, 16);
    if (argc > 6) {
        devid = atoi(argv[6]);
    }

    h = hib_openMC(devid);
    bar = hib_config_readMC(devid, 0x1c); // bar3 is mapped to an I/O space.
    bar &= ~((UINT32)3);
    fprintf(stderr, "hib[%d] BAR3 I/O space : 0x%08lx\n", devid, bar);
    hib_closeMC(devid);

    addr += bar;

    if (iopl(3)) {
        perror("iopl() failed.");
        exit(1);
    }
    switch (argv[2][0]) {
      case 'b':
        wordtype = "byte";
        outb(val, addr);
        break;
      case 'w':
        wordtype = "word";
        outw(val, addr);
        break;
      case 'l':
        wordtype = "long_word";
        outl(val, addr);
        break;
      default :
        fprintf(stderr, "word size should be one of 'b', 'w', or 'l'.\n\n");
        showusage(argc, argv);
        exit(1);
    }
    // val is an unsigned long; widen it to match %llx.
    fprintf(stderr, "I/O space write %s 0x%08lx: 0x%016llx\n",
            wordtype, addr, (unsigned long long)val);
#endif // _WIN32
}


/*
 * Send words typed in on stdin to the backend and print what comes back.
 *
 *   argv[2]  send function: SENDFUNC_DMAR or SENDFUNC_PIOW (as a number).
 *   argv[3]  number of 64-bit words to transfer (at least 1).
 *   argv[4]  device id (optional, defaults to 0).
 *
 * Each word is read from stdin as a hexadecimal number. The words are
 * sent with hib_sendMC(), the same number of words is then received
 * with hib_recvMC(), and both sets are printed to stderr.
 * Invalid arguments or unreadable input terminate the program with
 * exit status 1. The size is not checked against the buffer capacity.
 */
static void
backendloopback(int argc, char **argv)
{
    Hib *h;
    UINT64 *wbuf, *rbuf;
    int devid = 0;
    int sendfunc;
    int i;
    int size; /* in 64-bit words */

    if (argc < 4) {
        showusage(argc, argv);
        exit(1);
    }

    if (argc > 4) {
        devid = atoi(argv[4]);
    }

    sendfunc = atoi(argv[2]);
    size = atoi(argv[3]);
    if (size < 1) {
        fprintf(stderr, "invalid size: %d. abort.\n", size);
        exit(1);
    }

    h = hib_openMC(devid);

    fprintf(stderr, "# write %d 64-bit word(s) to the backend by ", size);
    switch (sendfunc) {
      case SENDFUNC_DMAR:
        fprintf(stderr, "hib[%d] DMA read (host -> HIB -> backend)\n", devid);
        rbuf = h->dmar_buf;
        wbuf = h->dmaw_buf;
        break;
      case SENDFUNC_PIOW:
        fprintf(stderr, "\n# hib[%d] PIO write (host -> HIB -> backend)\n", devid);
        hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
        rbuf = piowbuf;
        wbuf = h->dmaw_buf;
        break;
      default:
        fprintf(stderr, "invalid sendfunc: %d. abort.\n", sendfunc);
        exit(1);
    }

    for (i = 0; i < size; i++) {
        /* scan into a type that is known to match %llx. */
        unsigned long long word;

        fprintf(stderr, "word[%03x] : ", i);
        if (scanf("%llx", &word) != 1) {
            fprintf(stderr, "\nfailed to read word[%03x] from stdin. abort.\n", i);
            hib_closeMC(devid);
            exit(1);
        }
        rbuf[i] = word;
        /* echo the index as well: the format has two conversions. */
        fprintf(stderr, "  %03x : %016llx\n", i, (unsigned long long)rbuf[i]);
    }

    hib_sendMC(devid, size, rbuf);
    fprintf(stderr, "written:\n");
    for (i = 0; i < size; i++) {
        fprintf(stderr, "  %03x : %016llx\n", i, (unsigned long long)rbuf[i]);
    }

    hib_recvMC(devid, size, wbuf);
    fprintf(stderr, "read:\n");
    for (i = 0; i < size; i++) {
        fprintf(stderr, "  %03x : %016llx\n", i, (unsigned long long)wbuf[i]);
    }

    hib_closeMC(devid);
}

#if defined(_WIN32)

/*
 * Stopwatch helper (Win32 version), based on GetTickCount().
 *
 * On return *splittime holds the time in seconds elapsed since the
 * time stamp stored in *laptime, and *laptime holds the new time stamp.
 */
static void
get_cputime(double *splittime, double *laptime)
{
    DWORD ticks = GetTickCount();
    double now = (double)ticks / 1000.0; /* milliseconds -> seconds */

    *splittime = now - *laptime;
    *laptime = now;
}

#else // !_WIN32

/*
 * Stopwatch helper (non-Win32 version), based on gettimeofday().
 *
 * On return *splittime holds the time in seconds elapsed since the
 * time stamp stored in *laptime, and *laptime holds the new time stamp.
 */
static void
get_cputime(double *splittime, double *laptime)
{
    struct timeval tv;
    double now;

    gettimeofday(&tv, NULL);
    now = tv.tv_sec + tv.tv_usec/1000000.0;

    *splittime = now - *laptime;
    *laptime = now;
}

#endif // _WIN32

#if 1

/*
 * write random data to the HIB-internal FIFO, read it back, and then
 * compare with the written ones.
 */
static void
dmatest2(int argc, char **argv)
{
    Hib *h;
    UINT64 *wbuf, *rbuf, *wbufs, *rbufs;
    int devid = 0;
    int sendfunc;
    int i, j, nng;
    int size, sizemax; /* in 32-bit words */
    int *sizes;
    int verbose = 0;
    int nbuf = 10000;
    double lt = 0.0, st = 0.0, timetotal = 0.0, timegrandtotal = 0.0;
    long long int t, ntry, llval = 0LL;
    long long int sizetotal = 0, sizegrandtotal = 0;
    int linkwidth;

    if (argc < 4) {
	showusage(argc, argv);
	exit(1);
    }

    if (argc > 4) {
	verbose = atoi(argv[4]);
    }

    if (argc > 5) {
	devid = atoi(argv[5]);
    }

    ntry = atoi(argv[2]);
    sendfunc = atoi(argv[3]);

    h = hib_openMC(devid);

    // link width
    linkwidth = hib_config_readMC(devid, 0x90) >> 4+16;
    linkwidth = linkwidth & 0x3f;
    fprintf(stderr, "link width negotiated : x%d\n", linkwidth);
    sizemax = 1024;
    ntry *= 100000;
    fprintf(stderr, "ntry %lld\n"
            "total size will be about %3.1lfGB.\n"
            "total time will be around %3.1lfs + memory accsess time.\n",
            ntry,
            (double)ntry * sizemax * sizeof(UINT64) / 1e9 * 2 * 0.5, // full duplex, half of sizemax on average.
            (double)ntry * sizemax * sizeof(UINT64) / 1e9 * 2 * 0.5 / (linkwidth / 4 * 0.6)); // size / effective_speed.

    hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_FIFO);

    switch (sendfunc) {
      case SENDFUNC_DMAR:
	fprintf(stderr, "\n# hib[%d] DMA read, and then DMA write (host <-> HIB internal FIFO)\n", devid);
	rbuf = h->dmar_buf;
	wbuf = h->dmaw_buf;
	break;
      case SENDFUNC_PIOW:
	fprintf(stderr, "\n# hib[%d] PIO write, and then DMA write (host <-> HIB internal FIFO)\n", devid);
	hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
	rbuf = piowbuf;
	wbuf = h->dmaw_buf;
	break;
      default:
	fprintf(stderr, "invalid sendfunc: %d. abort.\n", sendfunc);
	exit(1);
    }

    sizes = (int *)malloc(sizeof(int) * nbuf);
    rbufs = (UINT64 *)malloc(sizeof(UINT64) * sizemax * nbuf);
    wbufs = (UINT64 *)malloc(sizeof(UINT64) * sizemax * nbuf);
    if (rbufs == NULL || wbufs == NULL) {
        perror("dmatest2");
        exit(1);
    }

    srand48(time(NULL));
    for (t = 0; t < ntry; t += nbuf) {

        if (ntry - t < nbuf) {
            nbuf = ntry - t;
        }
        for (i = 0; i < nbuf; i++) {
            sizes[i] = lrand48() % sizemax;
            if (sizes[i] == 0) {
                sizes[i] = 1;
            }
            for (j = 0; j < sizes[i]; j++) {
                rbufs[i * sizemax + j] = lrand48() << 32 | lrand48();;
                // rbufs[i * sizemax + j] = llval++;
#if 0
                {
                    UINT64 val = (llval & 0xffff);
                    rbufs[i * sizemax + j] = (val+3) << (16*3) | (val+2) << (16*2) | (val+1) << (16*1) | (val+0) << (16*0);
                    llval += 4;
                }
#endif
            }
        }

        get_cputime(&lt, &st);
        for (i = 0; i < nbuf; i++) {
            memcpy(rbuf, rbufs + i * sizemax, sizeof(UINT64) * sizes[i]);
            hib_start_dmawMC(devid, sizes[i], wbuf);
            hib_sendMC(devid, sizes[i], rbuf);
            hib_finish_dmawMC(devid);
            memcpy(wbufs + i * sizemax, wbuf, sizeof(UINT64) * sizes[i]);
        }

        get_cputime(&lt, &st);
        timetotal += lt;
        timegrandtotal += lt;

        nng = 0;
        for (i = 0; i < nbuf; i++) {
            sizetotal += sizes[i] * 2; // full duplex.
            sizegrandtotal += sizes[i] * 2;
            for (j = 0; j < sizes[i]; j++) {
                if (wbufs[i * sizemax + j] != rbufs[i * sizemax + j]) {
                    nng++;
                }
                if (verbose || nng) {
#if defined(_WIN32)
                    // Win32 seems to have a bug in printing 64-bit integer.
                    fprintf(stdout, "rbuf[%04d]: 0x%08x%08x  wbuf[%04d]: 0x%08x%08x",
                            j,
                            *(UINT32 *)((UINT8 *)rbufs+(i * sizemax + j)*8),
                            *(UINT32 *)((UINT8 *)rbufs+(i * sizemax + j)*8+4),
                            j,
                            *(UINT32 *)((UINT8 *)wbufs+(i * sizemax + j)*8),
                            *(UINT32 *)((UINT8 *)wbufs+(i * sizemax + j)*8+4));
#else // !_WIN32
                    fprintf(stdout, "rbuf[%04d]: 0x%016llx  wbuf[%04d]: 0x%016llx",
                            j, rbufs[i * sizemax + j], j, wbufs[i * sizemax + j]);
#endif // _WIN32
                    if (wbufs[i * sizemax + j] != rbufs[i * sizemax + j]) {
                        fprintf(stdout, " NG\n");
                    }
                    else {
                        fprintf(stdout, " \n");
                    }
                }
            } // j

            if (nng) {
                int jj;
                fprintf(stderr, "%d NG words on try %d  i=%d\n", nng, t, i);
                for (jj = -3; jj < 3; jj++) {
                    fprintf(stderr, "sizes[i+(%d)]:%d\n", jj, sizes[i+jj]);
                }
                exit(1);
            }

        } // i

        if (t % 100000 == 0 && t != 0) {
            fprintf(stderr, "try % 10d done. %3.1fGB in %3.1fs. %3.1fGB/s\n",
                    t, sizetotal * sizeof(UINT64) / 1e9,
                    timetotal, sizetotal * sizeof(UINT64) / 1e9 / timetotal);
            sizetotal = 0.0;
            timetotal = 0.0;
        }

    } // t

    fprintf(stderr, "all done. %3.1fGB in %3.1fs. %3.1fGB/s\n",
            t, sizegrandtotal * sizeof(UINT64) / 1e9,
            timegrandtotal, sizegrandtotal * sizeof(UINT64) / 1e9 / timegrandtotal);


    // good house keeping.
    for (i = 0; i < sizemax; i++) {
	rbuf[i] = 0;
	wbuf[i] = 0;
    }
    hib_set_test_modeMC(devid, TESTMODE_NONE);
    hib_closeMC(devid);
}

#else


/*
 * write random data to the HIB-internal FIFO, read it back, and then
 * compare with the written ones.
 */
static void
dmatest2(int argc, char **argv)
{
    Hib *h;
    UINT64 *wbuf, *rbuf, *wbufs, *rbufs;
    int devid = 0;
    int sendfunc;
    int i, j, nng;
    int size, sizemax; /* in 32-bit words */
    int *sizes;
    int verbose = 0;
    int nbuf = 10000;
    double lt = 0.0, st = 0.0, timetotal = 0.0, timegrandtotal = 0.0;
    long long int t, ntry;
    long long int sizetotal = 0, sizegrandtotal = 0;
    int linkwidth;

    if (argc < 4) {
	showusage(argc, argv);
	exit(1);
    }

    if (argc > 4) {
	verbose = atoi(argv[4]);
    }

    if (argc > 5) {
	devid = atoi(argv[5]);
	if (NHIB < devid+1) {
	    fprintf(stderr,
		    "too large devid(= %d).\n",
		    devid);
	    exit(1);
	}
    }

    ntry = atoi(argv[2]);
    sendfunc = atoi(argv[3]);

    h = hib_openMC(devid);

    // link width
    linkwidth = hib_config_readMC(devid, 0x90) >> 4+16;
    linkwidth = linkwidth & 0x3f;
    fprintf(stderr, "link width negotiated : x%d\n", linkwidth);
    sizemax = 1024;
    ntry *= 100000;
    fprintf(stderr, "ntry %lld\n"
            "total size will be about %3.1lfGB.\n"
            "total time will be around %3.1lfs + memory accsess time.\n",
            ntry,
            (double)ntry * sizemax * sizeof(UINT64) / 1e9 * 2 * 0.5, // full duplex, half of sizemax on average.
            (double)ntry * sizemax * sizeof(UINT64) / 1e9 * 2 * 0.5 / (linkwidth / 4 * 0.6)); // size / effective_speed.

    //    hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_FIFO);
    hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_RAM);

    switch (sendfunc) {
      case SENDFUNC_DMAR:
	fprintf(stderr, "\n# hib[%d] DMA read, and then DMA write (host <-> HIB internal FIFO)\n", devid);
	rbuf = h->r->dmar_buf;
	wbuf = h->r->dmaw_buf;
	break;
      case SENDFUNC_PIOW:
	fprintf(stderr, "\n# hib[%d] PIO write, and then DMA write (host <-> HIB internal FIFO)\n", devid);
	hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
	rbuf = piowbuf;
	wbuf = h->r->dmaw_buf;
	break;
      default:
	fprintf(stderr, "invalid sendfunc: %d. abort.\n", sendfunc);
	exit(1);
    }

    sizes = (int *)malloc(sizeof(int) * nbuf);
    rbufs = (UINT64 *)malloc(sizeof(UINT64) * sizemax * nbuf);
    wbufs = (UINT64 *)malloc(sizeof(UINT64) * sizemax * nbuf);
    if (rbufs == NULL || wbufs == NULL) {
        perror("dmatest2");
        exit(1);
    }

    srand48(time(NULL));
    for (t = 0; t < ntry; t += nbuf) {

        if (ntry - t < nbuf) {
            nbuf = ntry - t;
        }
        for (i = 0; i < nbuf; i++) {
            sizes[i] = lrand48() % sizemax;
            //            sizes[i] = 1024;
            if (sizes[i] == 0) {
                sizes[i] = 1;
            }
            for (j = 0; j < sizemax; j++) {
                rbufs[i * sizemax + j] = lrand48() << 32 | lrand48();;
            }
        }



        nbuf = 1;
#if 0
        for (i = 0; i < nbuf; i++) {
            sizes[i] = 1024;
        }
#elif 0
        for (i = 0; i < nbuf; i += 2) {
            sizes[i] = 100;
            sizes[i+1] = 10;
        }
#endif
        get_cputime(&lt, &st);
        for (i = 0; i < nbuf; i++) {
            //            memcpy(rbuf, rbufs + i * sizemax, sizeof(UINT64) * sizes[i]);

            /*
            */
            memcpy(rbuf, rbufs + i * sizemax, sizeof(UINT64) * sizemax);
            memcpy(rbuf, wbufs + i * sizemax, sizeof(UINT64) * sizemax);
            memcpy(rbuf, rbufs + i * sizemax, sizeof(UINT64) * sizemax);
            memcpy(rbuf, wbufs + i * sizemax, sizeof(UINT64) * sizemax);
            memcpy(rbuf, rbufs + i * sizemax, sizeof(UINT64) * sizemax);
            memcpy(rbuf, wbufs + i * sizemax, sizeof(UINT64) * sizemax);
            memcpy(rbuf, rbufs + i * sizemax, sizeof(UINT64) * sizemax);
            memcpy(rbuf, wbufs + i * sizemax, sizeof(UINT64) * sizemax);
            memcpy(rbuf, rbufs + i * sizemax, sizeof(UINT64) * sizemax);
            memcpy(rbuf, wbufs + i * sizemax, sizeof(UINT64) * sizemax);

            hib_start_dmawMC(devid, sizes[i], wbuf);
            hib_sendMC(devid, sizes[i], rbuf);
            hib_finish_dmawMC(devid);

            //            memcpy(wbufs + i * sizemax, wbuf, sizeof(UINT64) * sizes[i]);

            memcpy(wbufs + i * sizemax, wbuf, sizeof(UINT64) * sizemax);
            memcpy(rbufs + i * sizemax, wbuf, sizeof(UINT64) * sizemax);
            memcpy(wbufs + i * sizemax, wbuf, sizeof(UINT64) * sizemax);
            memcpy(rbufs + i * sizemax, wbuf, sizeof(UINT64) * sizemax);
            memcpy(wbufs + i * sizemax, wbuf, sizeof(UINT64) * sizemax);
            memcpy(rbufs + i * sizemax, wbuf, sizeof(UINT64) * sizemax);
            memcpy(wbufs + i * sizemax, wbuf, sizeof(UINT64) * sizemax);
            memcpy(rbufs + i * sizemax, wbuf, sizeof(UINT64) * sizemax);
            memcpy(wbufs + i * sizemax, wbuf, sizeof(UINT64) * sizemax);
            memcpy(rbufs + i * sizemax, wbuf, sizeof(UINT64) * sizemax);
            /*
            */

        }
        get_cputime(&lt, &st);
        timetotal += lt;
        timegrandtotal += lt;

        exit(1);



        nng = 0;
        for (i = 0; i < nbuf; i++) {
            sizetotal += sizes[i] * 2; // full duplex.
            sizegrandtotal += sizes[i] * 2;
            for (j = 0; j < sizes[i]; j++) {
                if (wbufs[i * sizemax + j] != rbufs[i * sizemax + j]) {
                    nng++;
                }
                if (verbose || nng) {
                    fprintf(stdout, "rbuf[%04d]: 0x%016llx  wbuf[%04d]: 0x%016llx",
                            j, rbufs[i * sizemax + j], j, wbuf[i * sizemax + j]);
                    if (wbufs[i * sizemax + j] != rbufs[i * sizemax + j]) {
                        fprintf(stdout, " NG\n");
                    }
                    else {
                        fprintf(stdout, " \n");
                    }
                }
            } // j

            if (nng) {
                fprintf(stderr, "%d NG words on try %d\n", nng, t);
                exit(1);
            }

        } // i

        if (t % 100000 == 0 && t != 0) {
            fprintf(stderr, "try % 10d done. %3.1fGB in %3.1fs. %3.1fGB/s\n",
                    t, sizetotal * sizeof(UINT64) / 1e9,
                    timetotal, sizetotal * sizeof(UINT64) / 1e9 / timetotal);
            sizetotal = 0.0;
            timetotal = 0.0;
        }

    } // t

    fprintf(stderr, "all done. %3.1fGB in %3.1fs. %3.1fGB/s\n",
            t, sizegrandtotal * sizeof(UINT64) / 1e9,
            timegrandtotal, sizegrandtotal * sizeof(UINT64) / 1e9 / timegrandtotal);


    // good house keeping.
    for (i = 0; i < sizemax; i++) {
	rbuf[i] = 0;
	wbuf[i] = 0;
    }
    hib_set_test_modeMC(devid, TESTMODE_NONE);
    hib_closeMC(devid);
}

#endif

#ifdef _WIN32

/*
 * usleep() replacement for Windows builds.
 *
 * t is a duration in microseconds. Sleep() takes milliseconds, so the
 * value is divided by 1000. Any request that would come out shorter than
 * one millisecond (including zero or negative input) is raised to 1 ms,
 * so that the call actually sleeps instead of degenerating into Sleep(0).
 */
static void
usleep(int t)
{
    int msec = t / 1000;

    // comparison, not assignment: clamp only when the delay rounds below 1 ms.
    if (msec < 1) {
        msec = 1;
    }
    Sleep(msec);
}

/*
 * sleep() replacement for Windows builds.
 *
 * t is a duration in seconds; it is scaled to the millisecond count
 * handed to Sleep().
 */
static void
sleep(int t)
{
    const int msec_per_sec = 1000;

    Sleep(t * msec_per_sec);
}

/*
 * srand48() replacement for Windows builds.
 *
 * Forwards the seed to the C library's srand(); the seed is narrowed to
 * the unsigned int that srand() accepts.
 */
static void
srand48(long int seedval)
{
    const unsigned int seed = (unsigned int)seedval;

    srand(seed);
}

/*
 * lrand48() replacement for Windows builds.
 *
 * Returns a non-negative pseudo-random value in [0, 2^31), matching the
 * range of the POSIX function. rand() is only guaranteed to deliver 15
 * bits (RAND_MAX may be as small as 32767), so a single call would leave
 * the upper bits of every result zero. Three calls are combined instead:
 * 15 bits + 15 bits + 1 bit = 31 bits.
 *
 * The sequence is still controlled by srand(), i.e. by srand48() in this
 * file.
 */
static long int
lrand48(void)
{
    unsigned long hi  = (unsigned long)rand() & 0x7fffUL;
    unsigned long mid = (unsigned long)rand() & 0x7fffUL;
    unsigned long lo  = (unsigned long)rand() & 0x1UL;

    // largest possible value is 0x7fffffff, which fits a 32-bit long.
    return (long int)((hi << 16) | (mid << 1) | lo);
}

/*
 * Stand-in for time() on Windows builds, intended only as a source for
 * the random seed. It does not read any clock: the same fixed value is
 * returned on every call, so a generator seeded from it produces the
 * same sequence on every run.
 *
 * NOTE(review): this takes no parameters, whereas the standard time()
 * takes a time_t pointer; a call written as time(NULL) does not match
 * this prototype -- confirm how the callers are compiled.
 */
static long int
time(void)
{
    const long int fixed_seed = 0x12345678L;

    return fixed_seed;
}

static long long int
strtoull(const char *nptr, char **endptr, int base)
{
    strtoull(nptr, endptr, base);
}

#endif // _WIN32
