/*
 Phantom GRAPE-5 using SSE
*/
// #include <stdio.h>
// #include <stdlib.h>
#include <assert.h>
#include "gravity.h"

/*
 * XMM register assignment used by GravityKernel.
 *
 * Each name expands to a string literal holding an AT&T-syntax register name;
 * the instruction macros in this file paste these strings into asm templates.
 * Two alternative assignments are kept here.  The first is compiled out by
 * "#if 0"; the "#else" branch is the one in effect.  Both use xmm8-xmm15,
 * which are only available in x86-64 (64-bit) mode.
 */
#if 0
// Disabled assignment (kept for reference).
// summation: running totals for the three acceleration components and the potential
#define AX	"%xmm0"
#define AY	"%xmm1"
#define AZ	"%xmm2"
#define PHI	"%xmm3"
// j particle: separation components and the broadcast lane-3 value of the j element
#define DX	"%xmm4"
#define DY	"%xmm5"
#define DZ	"%xmm6"
#define MJ	"%xmm7"
// temporary: reciprocal square root and the squared-distance partial sums
#define RINV	"%xmm8"
#define X2		"%xmm9"
#define Y2		"%xmm10"
#define Z2		"%xmm11"

// values loaded from ipdata (x, y, z, eps2) and held for the whole kernel
#define XI	"%xmm12"
#define YI	"%xmm13"
#define ZI	"%xmm14"
#define EPS2 "%xmm15"
#else
// Active assignment.
// summation: running totals for the three acceleration components and the potential
#define AX	"%xmm8"
#define AY	"%xmm9"
#define AZ	"%xmm10"
#define PHI	"%xmm11"
// j particle: separation components and the broadcast lane-3 value of the j element
#define DX	"%xmm12"
#define DY	"%xmm13"
#define DZ	"%xmm14"
#define MJ	"%xmm7"
// temporary: reciprocal square root and the squared-distance partial sums
#define RINV	"%xmm0"
#define X2		"%xmm1"
#define Y2		"%xmm2"
#define Z2		"%xmm3"

// values loaded from ipdata (x, y, z, eps2) and held for the whole kernel
#define XI	"%xmm4"
#define YI	"%xmm5"
#define ZI	"%xmm6"
#define EPS2 "%xmm15"
#endif

/*
 * One-instruction inline-asm wrappers (AT&T operand order: source, destination).
 *
 * Register arguments are string literals of the form "%xmmN".  In the macros
 * without C operands the string is emitted verbatim (basic asm).  In the
 * macros with a memory operand the template is extended asm, so the register
 * string is prefixed with an extra '%' to yield the required "%%xmmN".
 *
 * __asm__ __volatile__ is spelled out so that the macros also compile in ISO
 * modes (-std=c99/-std=c11), where the plain `asm` keyword is not available,
 * and so that no statement can be dropped or reordered relative to the other
 * asm statements.
 *
 * None of the macros declares the XMM register it modifies; callers rely on
 * the compiler not using those registers between consecutive asm statements.
 *
 * All macros except PREFETCH carry their own trailing ';' (kept for
 * compatibility with existing call sites, which add a second, empty one).
 */

/* b ^= a (bitwise); XORPS(r, r) zeroes r. */
#define XORPS(a, b) __asm__ __volatile__("xorps " a "," b);
/* Aligned 16-byte load: reg = mem.  mem must be 16-byte aligned. */
#define LOADPS(mem, reg) __asm__ __volatile__("movaps %0, %" reg : : "m"(mem));
/*
 * Aligned 16-byte store: mem = reg.  mem must be a 16-byte-aligned lvalue.
 * The destination is an output operand, and "memory" is clobbered because the
 * instruction writes a full 16 bytes even when `mem` names a narrower object
 * (e.g. the first float of a float[4]).
 */
#define STORPS(reg, mem) __asm__ __volatile__("movaps %" reg ", %0" : "=m"(mem) : : "memory");
/* Register-to-register 16-byte copy: dst = src. */
#define MOVAPS(src, dst) __asm__ __volatile__("movaps " src "," dst);
/* Copy the low 64 bits of src to dst. */
#define MOVQ(src, dst) __asm__ __volatile__("movq " src "," dst);
/* Replicate lane 0/1/2/3 of reg into all four lanes of reg. */
#define BCAST0(reg) __asm__ __volatile__("shufps $0x00, " reg "," reg);
#define BCAST1(reg) __asm__ __volatile__("shufps $0x55, " reg "," reg);
#define BCAST2(reg) __asm__ __volatile__("shufps $0xaa, " reg "," reg);
#define BCAST3(reg) __asm__ __volatile__("shufps $0xff, " reg "," reg);
/* Lane-wise single-precision arithmetic: dst = dst (op) src. */
#define MULPS(src, dst) __asm__ __volatile__("mulps " src "," dst);
#define ADDPS(src, dst) __asm__ __volatile__("addps " src "," dst);
#define SUBPS(src, dst) __asm__ __volatile__("subps " src "," dst);
/* Same, with a 16-byte-aligned memory source: reg = reg (op) mem. */
#define ADDPS_M(mem, reg) __asm__ __volatile__("addps %0, %" reg : : "m"(mem));
#define SUBPS_M(mem, reg) __asm__ __volatile__("subps %0, %" reg : : "m"(mem));
/* Lane-wise approximate reciprocal square root: dst = ~1/sqrt(src). */
#define RSQRTPS(src, dst) __asm__ __volatile__("rsqrtps " src "," dst);
/* Copy the high two lanes of src into the low two lanes of dst. */
#define MOVHLPS(src, dst) __asm__ __volatile__("movhlps " src "," dst);
/* Non-temporal prefetch hint for the cache line containing mem (no trailing ';'). */
#define PREFETCH(mem) __asm__ __volatile__("prefetchnta %0" : : "m"(mem))

// DEBUGPS(reg): optional register dump for debugging.
// The variant kept in the comment below stores `reg` to dbgbuf and prints its
// four float lanes.  Enabling it also requires <stdio.h> and the dbgbuf
// declaration, both of which are currently commented out in this file.
/*
#define DEBUGPS(reg) asm("movaps " "%" reg " , %0"::"m"(*dbgbuf)); \
				printf(#reg ": %f %f %f %f\n", dbgbuf[0], dbgbuf[1], dbgbuf[2], dbgbuf[3]);
*/
// Active definition: expands to nothing, so any DEBUGPS(...) use compiles away.
#define DEBUGPS(reg)

/*
 * GravityKernel: accumulate softened gravitational acceleration and potential
 * on one 4-lane group of i-particles from a list of j-particles.
 *
 * ipdata  16-byte-aligned; 16 bytes are loaded from each of x, y, z and eps2
 *         (four packed single-precision lanes, one per i-particle).
 * fodata  16-byte-aligned; ax, ay, az and phi each receive 16 bytes.  The
 *         outputs are overwritten, not added to.
 * jpdata  16-byte-aligned array of 16-byte elements.  Lanes 0, 1, 2 of an
 *         element are used as the j position and lane 3 as the weight MJ
 *         (presumably the mass).
 * nj      number of j-particles to accumulate.
 *
 * Per j-particle, with d = (xj - xi, yj - yi, zj - zi) and
 * rinv = rsqrtps(dx*dx + dy*dy + dz*dz + eps2)  (an approximate 1/sqrt):
 *     phi -= mj * rinv
 *     a   += d * (mj * rinv * rinv * rinv)
 *
 * The loop is software-pipelined: while particle j is being accumulated,
 * particle j+1 is already loaded and its squared distance computed.
 * Consequently element jpdata[nj] is also read (its values are never
 * accumulated), so the caller must provide nj + 1 readable elements; element 0
 * is read even when nj <= 0, in which case the outputs are all zero.
 *
 * All state lives in fixed XMM registers across separate asm statements, none
 * of which declares a register clobber; the statement order below must not be
 * changed and the compiler must not use those registers in between.
 */
void GravityKernel(pIpstruct ipdata, pFostruct fodata, pJpstruct jpdata, int nj){
	// static float dbgbuf[4] __attribute__ ((aligned(16)));
	int j;
	// NOTE(review): NODEBUG is not the macro <assert.h> tests (that is NDEBUG),
	// and it is defined after the include, so the asserts below stay active
	// unless NDEBUG is defined elsewhere.
#define NODEBUG
	// movaps faults on unaligned addresses, so check the three base pointers.
	assert(((unsigned long)jpdata & 15) == 0);
	assert(((unsigned long)ipdata & 15) == 0);
	assert(((unsigned long)fodata & 15) == 0);

	PREFETCH(jpdata[0]);

	// Clear the four accumulators.
	XORPS(AX, AX);
	XORPS(AY, AY);
	XORPS(AZ, AZ);
	XORPS(PHI, PHI);

	// Load the i-particle lanes once; they stay in registers for the whole kernel.
	LOADPS(*ipdata->x, XI);
	LOADPS(*ipdata->y, YI);
	LOADPS(*ipdata->z, ZI);
	LOADPS(*ipdata->eps2, EPS2);

	// Pipeline prologue: fetch j-particle 0 and advance jpdata to particle 1.
	LOADPS(*jpdata++, X2);
	MOVAPS(X2, Y2);
	MOVAPS(Y2, Z2);
	MOVAPS(Z2, MJ);

	// Splat lane 0/1/2/3 of the element across each register.
	BCAST0(X2);
	BCAST1(Y2);
	BCAST2(Z2);
	BCAST3(MJ);

	// Separation d = j - i (subps computes destination - source).
	SUBPS(XI, X2);
	SUBPS(YI, Y2);
	SUBPS(ZI, Z2);

	// Keep d for the force update; X2/Y2/Z2 are squared in place next.
	MOVAPS(X2, DX);
	MOVAPS(Y2, DY);
	MOVAPS(Z2, DZ);

	MULPS(X2, X2);
	MULPS(Y2, Y2);
	MULPS(Z2, Z2);

	// Chain the sum so Z2 ends up holding dx^2 + dy^2 + dz^2 + eps2.
	ADDPS(EPS2, X2);
	ADDPS(X2, Y2);
	ADDPS(Y2, Z2); // Z2 = R2

	// LOADPS(*jpdata++, MJ);

	// On entry to each iteration DX/DY/DZ, MJ and Z2 (= R2) describe the
	// current j-particle and jpdata points at the next one.  Lines indented
	// one level finish the current particle; lines indented two levels
	// prepare the next particle, interleaved to overlap instruction latency.
	for(j=0;j<nj;j++){
			LOADPS(*jpdata, X2);
			LOADPS(*jpdata, Y2);
			// MOVAPS(X2, Y2);
			BCAST0(X2);
			SUBPS(XI, X2);
			BCAST1(Y2);
			SUBPS(YI, Y2);
		RSQRTPS(Z2, RINV); // Z2 free
			LOADPS(*jpdata, Z2);
			BCAST2(Z2);
			SUBPS(ZI, Z2);
		MULPS(RINV, MJ); // MJ = mj * rinv
		SUBPS(MJ, PHI); // phi -= mj * rinv
		MULPS(RINV, RINV); // RINV = rinv^2
		MULPS(MJ, RINV); // MJ free

		// RINV now holds mj * rinv^3; scale each separation component and accumulate.
		MULPS(RINV, DX);
		ADDPS(DX, AX);
			MOVAPS(X2, DX);
			MULPS(X2, X2);
			ADDPS(EPS2, X2);
		MULPS(RINV, DY);
			// Lane 3 of the next element becomes the next MJ.
			LOADPS(*jpdata, MJ);
			BCAST3(MJ);
			// Hint the element two beyond the one just loaded, then advance.
			PREFETCH(jpdata[2]);
			jpdata++;
		ADDPS(DY, AY);
			MOVAPS(Y2, DY);
			MULPS(Y2, Y2);
			ADDPS(X2, Y2);
		MULPS(RINV, DZ);
		ADDPS(DZ, AZ);
			MOVAPS(Z2, DZ);
			MULPS(Z2, Z2);
			ADDPS(Y2, Z2); // Z2 = R2
	}
	// puts("store fodata");
	// Write the four accumulators out; the prepared-but-unused next particle is discarded.
	STORPS(AX, *fodata->ax);
	STORPS(AY, *fodata->ay);
	STORPS(AZ, *fodata->az);
	STORPS(PHI, *fodata->phi);
}

/*
int main(){
	int i;
	static struct ipstruct ipdata ALIGN16;
	static struct fostruct force_out ALIGN16;
	pJpstruct jpdata;
	float iparray[4][4] = {
		{1.0, 0.0, -1.0, 0.0},
		{0.0, 1.0, 0.0, -1.0},
		{0.0, 0.0, 0.0, 0.0},
		{1./256., 1./256., 1./256., 1./256.,}};
	float jparry[4][4] = {
		{1.0, 0.0, 0.0, 1.0},	
		{0.0, 1.0, 0.0, 1.0},	
		{-1.0, 0.0, 0.0, 1.0},	
		{0.0, -1.0, 0.0, 1.0}};

	posix_memalign((void**)&jpdata, 16, 4*sizeof(*jpdata));
	memcpy(&ipdata, iparray, 16*sizeof(float));
	memset(&force_out, 0xbb, 64);
	// jpdata = valloc(64);
	memcpy(jpdata, jparry, 4*sizeof(*jpdata));

	puts("gravity");
	GravityKernel(&ipdata, &force_out, jpdata, 4);

	for(i=0;i<4;i++){
		printf("%f %f %f %f\n", force_out.ax[i], force_out.ay[i], force_out.az[i], force_out.phi[i]);

	}

	return 0;
}
*/

