/*
 Virtual GRAPE-5 using SSE
*/
// #include <stdio.h>
// #include <stdlib.h>
#include <assert.h>
#include "gravity.h"

// XMM register assignment used by GravityKernel. Each macro expands to an
// AT&T-syntax register name as a string literal so that it can be pasted
// into the asm templates built by the instruction macros.
//
// accumulators (hold the running sums for the whole j-loop)
#define AX	"%xmm0"
#define AY	"%xmm1"
#define AZ	"%xmm2"
#define PHI	"%xmm3"
// position & mass of the current j-particle (scratch, rewritten every iteration)
#define X2	"%xmm4"
#define Y2	"%xmm5"
#define Z2	"%xmm6"
#define M	"%xmm7"
// aliases: R2 names the same register as X2 (xmm4) and MRINV the same
// register as M (xmm7). Neither alias is referenced by the code visible in
// this file; the kernel uses X2 and M directly.
#define R2	"%xmm4"
#define MRINV	"%xmm7"

/*
 One-instruction wrappers around SSE opcodes (GCC-style inline asm, AT&T
 operand order: source first, destination second).

 - Register arguments are string literals such as "%xmm0".
 - Every macro keeps its trailing ';' so existing call sites are unaffected.
 - __asm__ __volatile__ is used instead of plain asm: the bare keyword is
   not recognised under -std=c99 / -std=c11, and volatile keeps the
   statements from being dropped or reordered relative to each other.
 - Register-only forms are basic asm, where '%' is literal.
 - Memory forms are extended asm, where a literal '%' must be written "%%";
   this is why the template ends in "%" immediately before the register name.
 - Memory forms carry a "memory" clobber: the operand expression usually
   names only the first float of a 16-byte vector, so the clobber is what
   tells the compiler that the remaining lanes are read or written too.
 - STORPS declares its destination as an output operand ("=m"), because the
   instruction writes it.

 NOTE: none of these macros declares the XMM register it modifies, and
 callers rely on register contents surviving from one asm statement to the
 next. That only holds while the compiler does not use those registers
 itself between the statements.
*/
#define XORPS(a, b) __asm__ __volatile__("xorps " a "," b);
#define LOADPS(mem, reg) __asm__ __volatile__("movaps %0, %" reg : : "m"(mem) : "memory");
#define STORPS(reg, mem) __asm__ __volatile__("movaps %" reg ", %0" : "=m"(mem) : : "memory");
#define ADDPS_M(mem, reg) __asm__ __volatile__("addps %0, %" reg : : "m"(mem) : "memory");
#define SUBPS_M(mem, reg) __asm__ __volatile__("subps %0, %" reg : : "m"(mem) : "memory");
#define MULPS_M(mem, reg) __asm__ __volatile__("mulps %0, %" reg : : "m"(mem) : "memory");
#define MOVAPS(src, dst) __asm__ __volatile__("movaps " src "," dst);
#define MOVQ(src, dst) __asm__ __volatile__("movq " src "," dst);
/* BCASTn: replicate lane n of reg into all four lanes of reg. */
#define BCAST0(reg) __asm__ __volatile__("shufps $0x00, " reg "," reg);
#define BCAST1(reg) __asm__ __volatile__("shufps $0x55, " reg "," reg);
#define BCAST2(reg) __asm__ __volatile__("shufps $0xaa, " reg "," reg);
#define BCAST3(reg) __asm__ __volatile__("shufps $0xff, " reg "," reg);
#define MULPS(src, dst) __asm__ __volatile__("mulps " src "," dst);
#define ADDPS(src, dst) __asm__ __volatile__("addps " src "," dst);
#define SUBPS(src, dst) __asm__ __volatile__("subps " src "," dst);
#define RSQRTPS(src, dst) __asm__ __volatile__("rsqrtps " src "," dst);
#define MOVHLPS(src, dst) __asm__ __volatile__("movhlps " src "," dst);

/*
 Disabled debug variant: stores the four lanes of `reg` into dbgbuf and
 prints them. Re-enabling it also requires <stdio.h> and the dbgbuf
 declaration, both of which are currently commented out in this file.
#define DEBUGPS(reg) asm("movaps " "%" reg " , %0"::"m"(*dbgbuf)); \
				printf(#reg ": %f %f %f %f\n", dbgbuf[0], dbgbuf[1], dbgbuf[2], dbgbuf[3]);
*/
// Active definition: expands to nothing, so DEBUGPS(...) calls compile away.
#define DEBUGPS(reg)

/*
 GravityKernel: for four i-particles at once (one per SSE lane), sum the
 contribution of nj j-particles to the acceleration and the potential.

 Per j-particle and lane the loop computes
     d    = pos_j - pos_i
     r2   = d.x^2 + d.y^2 + d.z^2 + eps2
     rinv = rsqrtps(r2)              (hardware approximation of 1/sqrt)
     phi -= m_j * rinv
     a   += d * m_j * rinv^3

 ipdata : x, y, z and eps2 are each read as one 16-byte packed-float operand.
 fodata : ax, ay, az and phi each receive one 16-byte packed-float store.
 jpdata : array of nj entries, each loaded as four packed floats; lanes 0..2
          are used as the position and lane 3 as the mass.
 nj     : number of j-particles. If nj <= 0 the loop body never runs and
          zeros are stored to fodata.

 The three pointers must be 16-byte aligned (movaps requirement); this is
 checked only by assert, i.e. not at all when NDEBUG is defined.
 The struct layouts come from gravity.h, which is not visible here.

 NOTE(review): the accumulators and temporaries live in xmm0-xmm7 across
 separate asm statements. Nothing tells the compiler about this, so the
 code relies on the compiler not touching those registers in between.
 NOTE(review): there is no special handling for r2 == 0 (for example a
 particle interacting with itself when eps2 is 0).
*/
void GravityKernel(pIpstruct ipdata, pFostruct fodata, pJpstruct jpdata, int nj){
	// static float dbgbuf[4] __attribute__ ((aligned(16)));
	int j;
	// Scratch copy of the displacement (dx, dy, dz) for the current
	// j-particle, needed because X2/Y2/Z2 are overwritten with their squares.
	float xij[3][4] __attribute__ ((aligned(16)));

	// movaps faults on addresses that are not 16-byte aligned.
	assert(((unsigned long)xij & 15) == 0);
	assert(((unsigned long)jpdata & 15) == 0);
	assert(((unsigned long)ipdata & 15) == 0);
	assert(((unsigned long)fodata & 15) == 0);

	// Zero the four accumulators (x xor x == 0).
	XORPS(AX, AX);
	XORPS(AY, AY);
	XORPS(AZ, AZ);
	XORPS(PHI, PHI);

	/*
	Earlier, disabled load sequence. It refers to XI, YI, ZI, EPS2 and MJ,
	none of which is defined in the visible part of this file.
	// puts("load i particle");
	LOADPS(*ipdata->x, XI);
	LOADPS(*ipdata->y, YI);
	LOADPS(*ipdata->z, ZI);
	LOADPS(*ipdata->eps2, EPS2);
	// puts("force loop");
	LOADPS(*jpdata, Y2);
	MOVQ(Y2, X2);
	MOVHLPS(Y2, Z2);
	BCAST0(X2);
	BCAST1 (Y2);
	MOVQ(Z2, MJ);
	BCAST0(Z2);
	*/

	for(j=0;j<nj;j++, jpdata++){
		// Load the j-particle's four floats and splat each lane into its
		// own register: X2 <- lane 0, Y2 <- lane 1, Z2 <- lane 2, M <- lane 3.
		LOADPS(*jpdata, M);
		MOVAPS(M, X2);
		MOVAPS(M, Y2);
		MOVAPS(M, Z2);
		BCAST0(X2);
		BCAST1(Y2);
		BCAST2(Z2);
		BCAST3(M);
		// Displacement from each i-particle to the j-particle (j minus i).
		SUBPS_M(*ipdata->x, X2);
		SUBPS_M(*ipdata->y, Y2);
		SUBPS_M(*ipdata->z, Z2);

		// Save each displacement component, then square it in place.
		STORPS(X2, *xij[0]);
		MULPS(X2, X2);
		STORPS(Y2, *xij[1]);
		MULPS(Y2, Y2);
		STORPS(Z2, *xij[2]);
		MULPS(Z2, Z2);

		// X2 = dx^2 + eps2 + dy^2 + dz^2
		ADDPS_M(*ipdata->eps2, X2);
		ADDPS(Y2, X2);
		ADDPS(Z2, X2);

		RSQRTPS(X2, X2);	// X2 = rinv (approximate 1/sqrt)
		MULPS(X2, M);		// M  = m * rinv
		MULPS(X2, X2);		// X2 = rinv^2
		SUBPS(M, PHI);		// PHI -= m * rinv
		MULPS(M, X2);		// X2 = m * rinv^3
		MOVAPS(X2, Y2);
		MOVAPS(X2, Z2);
		// Scale the saved displacement by m * rinv^3 and accumulate.
		MULPS_M(*xij[0], X2);
		ADDPS(X2, AX);
		MULPS_M(*xij[1], Y2);
		ADDPS(Y2, AY);
		MULPS_M(*xij[2], Z2);
		ADDPS(Z2, AZ);
	}
	// puts("store fodata");
	// Write the four accumulated vectors to the output structure.
	STORPS(AX, *fodata->ax);
	STORPS(AY, *fodata->ay);
	STORPS(AZ, *fodata->az);
	STORPS(PHI, *fodata->phi);
}


