/*
 Phantom GRAPE-5 using SSE
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "gravity.h"

// Fixed SSE register assignment. Each macro below expands to its own asm
// statement and names registers literally, so a value lives in the same
// register across many statements.
// NOTE(review): no statement declares an xmm clobber (the values must survive
// between statements), so the scheme depends on the compiler not using
// xmm0-xmm15 itself in code placed between these macros.

// summation
#define AX	"%xmm0"
#define AY	"%xmm1"
#define AZ	"%xmm2"
#define PHI	"%xmm3"
// j particles
#define XJ0	"%xmm4"
#define YJ0	"%xmm5"
#define ZJ0	"%xmm6"
#define MJ0	"%xmm7"
#define XJ1	"%xmm8"
#define YJ1	"%xmm9"
#define ZJ1	"%xmm10"
#define MJ1	"%xmm11"
// temporary
#define R0		"%xmm12"
#define R1		"%xmm13"
#define TMP0	"%xmm14"
#define TMP1	"%xmm15"

// One-instruction wrappers, AT&T operand order (source first, destination
// last). None of them carries a trailing semicolon: write `XORPS(AX, AX);`.
// `__asm__` is used instead of `asm` so the file also builds with -std=c99/c11.

// Register-only operations. Basic asm: no operands, '%' is literal.
#define XORPS(a, b) __asm__("xorps "  a  ","  b )
#define MOVAPS(src, dst) __asm__("movaps " src "," dst)
#define MOVQ(src, dst) __asm__("movq " src "," dst)
// Broadcast lane 0 / lane 1 of reg to all four lanes.
#define BCAST0(reg) __asm__("shufps $0, "  reg ","  reg)
#define BCAST1(reg) __asm__("shufps $85, " reg ","  reg)
#define MULPS(src, dst) __asm__("mulps " src "," dst)
#define ADDPS(src, dst) __asm__("addps " src ","  dst)
#define SUBPS(src, dst) __asm__("subps "  src "," dst)
#define RSQRTPS(src, dst) __asm__("rsqrtps " src "," dst)
// dst.low64 = src.high64
#define MOVHLPS(src, dst) __asm__("movhlps " src "," dst)
// dst.high64 = src.low64
#define MOVLHPS(src, dst) __asm__("movlhps " src "," dst)
// {a,b,c,d} -> {a,a,b,b}
#define UNPCKLPS(reg) __asm__("unpcklps " reg "," reg)
#define NOP __asm__("nop")

// Memory reads. Extended asm: the extra "%" in front of reg yields "%%xmmN",
// which prints as "%xmmN". `mem` is an lvalue naming the FIRST element; the
// instruction reads 8 bytes (movlps/movhps) or 16 bytes (movaps/subps) from
// its address. Because that is more than the operand's own type covers, the
// "memory" clobber tells the compiler the statement may read any memory.
#define LOADPS(mem, reg) __asm__ __volatile__("movaps %0, %" reg : : "m"(mem) : "memory")
#define MOVLPS(mem, reg) __asm__ __volatile__("movlps %0, %" reg : : "m"(mem) : "memory")
#define MOVHPS(mem, reg) __asm__ __volatile__("movhps %0, %" reg : : "m"(mem) : "memory")
#define SUBPS_M(mem, reg) __asm__ __volatile__("subps %0, %" reg : : "m"(mem) : "memory")

// Memory write: 16 bytes starting at `mem`. The destination is an OUTPUT
// operand (it used to be declared as an input, which let the compiler assume
// the memory was unchanged); "memory" covers the 12 bytes beyond the first
// element, and volatile keeps the store from being dropped.
#define STORPS(reg, mem) __asm__ __volatile__("movaps %" reg " , %0" : "=m"(mem) : : "memory")

// Cache hint only; the address is not dereferenced architecturally.
#define PREFETCH(mem) __asm__ __volatile__("prefetcht0 %0" : : "m"(mem))

/*
 Debug aid, disabled by default. To dump a register, declare a 16-byte
 aligned `float dbgbuf[4]`, include <stdio.h>, and replace the empty
 definition below with:

#define DEBUGPS(reg) STORPS(reg, *dbgbuf); \
				printf(#reg ": %f %f %f %f\n", dbgbuf[0], dbgbuf[1], dbgbuf[2], dbgbuf[3]);
*/
#define DEBUGPS(reg)

/*
 * GravityKernel: accumulate softened gravitational acceleration and potential
 * for a packet of four i-particles (one per SSE lane) over nj j-particles.
 *
 *   ipdata  i-particle packet; 16 bytes are read with movaps/subps from each
 *           of ipdata->x, ->y, ->z and ->eps2. Must be 16-byte aligned.
 *   fodata  result packet; 16 bytes are written with movaps to each of
 *           fodata->ax, ->ay, ->az and ->phi. Must be 16-byte aligned.
 *   jpdata  j-particle array, read as raw bytes: four consecutive floats per
 *           particle, loaded into the X, Y, Z and M registers in that order,
 *           with the next particle 16 bytes further on. Must be 16-byte
 *           aligned. (Assumes sizeof(*jpdata) == 16 -- TODO confirm against
 *           gravity.h.)
 *   nj      number of j-particles; must be even, two are processed per
 *           loop iteration.
 *
 * Per lane and per j-particle, with d = (j position) - (i position):
 *   r2   = eps2 + dx*dx + dy*dy + dz*dz
 *   rinv = rsqrtps(r2)            (hardware approximation, not refined)
 *   phi -= m * rinv
 *   a   += d * (m * rinv^3)
 *
 * The loop is software-pipelined by hand. Indentation appears to mark the
 * three interleaved instruction streams: no extra indent = arithmetic for the
 * first particle of the pair (registers *J0, R0); two spaces = arithmetic for
 * the second (registers *J1, R1); extra tab / four spaces = loading and
 * broadcasting the NEXT pair. Statement order must not be changed: every
 * register is reloaded immediately after its last use.
 *
 * NOTE(review): the preamble loads the first pair unconditionally and each
 * iteration loads the following pair after advancing jptr, so the kernel
 * reads jptr[0..31] even when nj == 0, and on the final iteration it reads
 * 32 bytes beyond the last processed pair. Those values are never used in
 * the result, but the memory must be readable -- confirm callers pad jpdata.
 * NOTE(review): the asm statements declare no register clobbers; the code
 * relies on the compiler leaving xmm0-xmm15 alone between them.
 */
void GravityKernel(pIpstruct ipdata, pFostruct fodata, pJpstruct jpdata, int nj){
	// Byte cursor into the j-particle array; advanced 32 bytes (one pair)
	// per iteration.
	char *jptr = (char *)jpdata;
	// Preconditions: 16-byte alignment of all three buffers, even nj.
	assert(((unsigned long)jpdata & 15) == 0);
	assert(((unsigned long)ipdata & 15) == 0);
	assert(((unsigned long)fodata & 15) == 0);
	assert(nj%2 == 0);

	// Clear the four accumulators.
	XORPS(AX, AX);
	XORPS(AY, AY);
	XORPS(AZ, AZ);
	XORPS(PHI, PHI);

	// Particle 0, floats 0 and 1: load {x,y}, widen to {x,x,y,y}, then
	// split into XJ0 = {x,x,x,x} and YJ0 = {y,y,y,y}.
	MOVLPS(jptr[0], XJ0);
	UNPCKLPS(XJ0);
	MOVHLPS(XJ0, YJ0);
	MOVLHPS(XJ0, XJ0);
	MOVLHPS(YJ0, YJ0);
	// (An equivalent shufps-based broadcast, BCAST0/BCAST1 on two separate
	// loads, was tried here and left disabled.)

	// Particle 0, floats 2 and 3: load {z,m}, widen to {z,z,m,m}, then
	// split into ZJ0 = {z,z,z,z} and MJ0 = {m,m,m,m}.
	MOVLPS(jptr[8], MJ0);
	UNPCKLPS(MJ0);
	MOVLHPS(MJ0, ZJ0);
	MOVHLPS(ZJ0, ZJ0);
	MOVHLPS(MJ0, MJ0);

	// Particle 1, floats 0 and 1 -> XJ1, YJ1 (same sequence as above).
	MOVLPS(jptr[16], XJ1);
	UNPCKLPS(XJ1);
	MOVHLPS(XJ1, YJ1);
	MOVLHPS(XJ1, XJ1);
	MOVLHPS(YJ1, YJ1);

	// Particle 1, floats 2 and 3 -> ZJ1, MJ1.
	MOVLPS(jptr[24], MJ1);
	UNPCKLPS(MJ1);
	MOVLHPS(MJ1, ZJ1);
	MOVHLPS(ZJ1, ZJ1);
	MOVHLPS(MJ1, MJ1);

	// R0 starts as eps2 and collects the squared distance for particle 0.
	LOADPS(*ipdata->eps2, R0);

	// One iteration consumes the pair already held in registers and loads
	// the next pair.
	while((unsigned long)jptr < (unsigned long)(jpdata+nj)){
		// dx0 = xj0 - xi; R1 = eps2 (copy for particle 1).
		SUBPS_M(*ipdata->x, XJ0);
		  MOVAPS(R0, R1);
			// Finish the MJ1/ZJ1 broadcast begun at the end of the previous
			// iteration (no effect on the first pass: all lanes already equal).
			MOVHLPS(MJ1, MJ1);
			MOVHLPS(ZJ1, ZJ1);

		// dy0; from here on jptr addresses the NEXT pair.
		SUBPS_M(*ipdata->y, YJ0);
		jptr += 32;

		// dz0; hint the cache line 32 bytes past the next pair.
		SUBPS_M(*ipdata->z, ZJ0);
		MOVAPS(XJ0, TMP0);
		PREFETCH(jptr[32]);

		// TMP0 = dx0^2; dx1 = xj1 - xi.
		MOVAPS(YJ0, TMP1);
		MULPS(XJ0, TMP0);
		  SUBPS_M(*ipdata->x, XJ1);

		// TMP1 = dy0^2; R0 += dx0^2.
		MULPS(YJ0, TMP1);
		ADDPS(TMP0, R0);
		MOVAPS(ZJ0, TMP0);

		// TMP0 = dz0^2; dy1.
		MULPS(ZJ0, TMP0);
		  SUBPS_M(*ipdata->y, YJ1);

		ADDPS(TMP1, R0);
		  MOVAPS(XJ1, TMP1);

		// R0 = eps2 + |d0|^2 complete; TMP1 = dx1^2.
		ADDPS(TMP0, R0);
		  MULPS(XJ1, TMP1);
		  MOVAPS(YJ1, TMP0);

		// R0 = rinv0; dz1; TMP0 = dy1^2.
		RSQRTPS(R0, R0);
		  SUBPS_M(*ipdata->z, ZJ1);
		  MULPS(YJ1, TMP0);

		// MJ0 = m0 * rinv0.
		MULPS(R0, MJ0);
		  ADDPS(TMP1, R1);

		// R0 = rinv0^2.
		MULPS(R0, R0);
		  ADDPS(TMP0, R1);
		  MOVAPS(ZJ1, TMP1);

		// phi -= m0*rinv0; R0 = m0*rinv0^3; MJ0 is now free, so start
		// loading the next pair's {z,m}.
		SUBPS(MJ0, PHI);
		MULPS(MJ0, R0);
			MOVLPS(jptr[8], MJ0);
			
		  MULPS(ZJ1, TMP1);
			UNPCKLPS(MJ0);

		// R1 = eps2 + |d1|^2 -> rinv1; MJ1 = m1*rinv1.
		  ADDPS(TMP1, R1);
		  RSQRTPS(R1, R1);
		  MULPS(R1, MJ1);
		// ax += dx0 * m0*rinv0^3, likewise ay; R1 = rinv1^2.
		MULPS(R0, XJ0);
		ADDPS(XJ0, AX);
		  MULPS(R1, R1);
		MULPS(R0, YJ0);
		ADDPS(YJ0, AY);
			// XJ0 is free: load the next pair's {x,y}.
			MOVLPS(jptr[0], XJ0);
			UNPCKLPS(XJ0);
		// phi -= m1*rinv1; R1 = m1*rinv1^3; then reload MJ1.
		  SUBPS(MJ1, PHI);
		  MULPS(MJ1, R1);
			MOVLPS(jptr[24], MJ1);
			UNPCKLPS(MJ1);

			// Split the new {x,x,y,y} into XJ0 and YJ0.
			MOVHLPS(XJ0, YJ0);
			MOVLHPS(XJ0, XJ0);
			MOVLHPS(YJ0, YJ0);
		// az += dz0 * m0*rinv0^3.
		MULPS(R0, ZJ0);
		ADDPS(ZJ0, AZ);
		  MULPS(R1, XJ1);
		    // Split the new {z,z,m,m} into ZJ0 and MJ0; R0 = eps2 again.
		    MOVLHPS(MJ0, ZJ0);
			MOVHLPS(MJ0, MJ0);
			MOVHLPS(ZJ0, ZJ0);
			LOADPS(*ipdata->eps2, R0);

		// Particle 1 contributions to ax, ay, az, interleaved with the
		// reload of XJ1/YJ1 for the next pair.
		  ADDPS(XJ1, AX);
		  MULPS(R1, YJ1);
			MOVLPS(jptr[16], XJ1);
			UNPCKLPS(XJ1);
		  ADDPS(YJ1, AY);
		  MULPS(R1, ZJ1);
			MOVHLPS(XJ1, YJ1);
		  ADDPS(ZJ1, AZ);
			MOVLHPS(XJ1, XJ1);
			MOVLHPS(YJ1, YJ1);
		    // ZJ1.high = new z; the two MOVHLPS at the top of the loop
		    // complete ZJ1 and MJ1.
		    MOVLHPS(MJ1, ZJ1);
	}
	// Write the four accumulators (four lanes each) to the output packet.
	STORPS(AX, *fodata->ax);
	STORPS(AY, *fodata->ay);
	STORPS(AZ, *fodata->az);
	STORPS(PHI, *fodata->phi);
}

#ifdef DEBUG
/*
 * Debug driver (compiled only when DEBUG is defined): runs GravityKernel on
 * four i-particles and four unit-mass j-particles and prints, for each
 * i-particle, ax ay az phi.
 *
 * Returns 0 on success, EXIT_FAILURE if the j-particle buffer cannot be
 * allocated.
 */
int main(void){
	// Bytes GravityKernel reads beyond the last j-particle it processes: on
	// its final iteration it still loads jptr[0..31] for a following pair.
	enum { JP_READAHEAD = 32 };
	static struct ipstruct ipdata ALIGN16;
	static struct fostruct force_out ALIGN16;
	pJpstruct jpdata;
	void *jpmem = NULL;	// posix_memalign requires a genuine void **
	size_t jpbytes;
	int i;
	// Copied raw into ipdata; presumably the rows are x[4], y[4], z[4],
	// eps2[4] as laid out by struct ipstruct -- confirm against gravity.h.
	float iparray[4][4] = {
		{1.0, 0.0, -1.0, 0.0},
		{0.0, 1.0, 0.0, -1.0},
		{0.0, 0.0, 0.0, 0.0},
		{1./256., 1./256., 1./256., 1./256.,}};
	// One row per j-particle, in the order the kernel reads them: x, y, z, m.
	float jparray[4][4] = {
		{1.0, 0.0, 0.0, 1.0},
		{0.0, 1.0, 0.0, 1.0},
		{-1.0, 0.0, 0.0, 1.0},
		{0.0, -1.0, 0.0, 1.0}};

	jpbytes = 4*sizeof(*jpdata);
	if(posix_memalign(&jpmem, 16, jpbytes + JP_READAHEAD) != 0){
		fputs("posix_memalign failed\n", stderr);
		return EXIT_FAILURE;
	}
	// Zero everything so the kernel's read-ahead stays inside the allocation
	// and sees defined bytes.
	memset(jpmem, 0, jpbytes + JP_READAHEAD);
	jpdata = jpmem;

	memcpy(&ipdata, iparray, 16*sizeof(float));
	// Poison the output so lanes the kernel fails to write are obvious.
	memset(&force_out, 0xbb, 64);
	memcpy(jpdata, jparray, jpbytes);

	puts("gravity");
	GravityKernel(&ipdata, &force_out, jpdata, 4);

	for(i=0;i<4;i++){
		printf("%f %f %f %f\n", force_out.ax[i], force_out.ay[i], force_out.az[i], force_out.phi[i]);

	}

	free(jpmem);
	return 0;
}
#endif
