/*
 Virtual GRAPE-5 using SSE
*/
// #include <stdio.h>
// #include <stdlib.h>
#include <assert.h>
#include "gravity.h"

/*
 * SSE register assignment used by GravityKernel.
 * Every name expands to a string literal that the asm wrapper macros
 * paste into an instruction template.
 */

/* Accumulators: zeroed on entry, stored to fodata->ax/ay/az/phi on exit. */
#define AX     "%xmm0"
#define AY     "%xmm1"
#define AZ     "%xmm2"
#define PHI    "%xmm3"

/* Working copies for the j-particle currently being processed. */
#define XJ     "%xmm4"
#define YJ     "%xmm5"
#define ZJ     "%xmm6"
#define MJ     "%xmm7"
// #define LOADBUF "%xmm14"

/*
 * Scratch registers.
 * Two pairs share a register: R2 and X2 are both xmm8, and MRINV3 is the
 * same xmm7 as MJ. Changing one name of a pair without the other alters
 * the kernel's data flow.
 */
#define R2     "%xmm8"
#define X2     "%xmm8"
#define Y2     "%xmm14"
#define Z2     "%xmm15"
#define R2TMP  "%xmm9"
#define MRINV3 "%xmm7"
// #define MRINV	"%xmm9"

/* i-particle values, loaded once per call and kept for the whole loop. */
#define XI     "%xmm10"
#define YI     "%xmm11"
#define ZI     "%xmm12"
#define EPS2   "%xmm13"

/*
 * One-instruction SSE wrappers (AT&T operand order: source, destination).
 *
 * Register arguments are string literals such as "%xmm0"; they are pasted
 * into the instruction text by string-literal concatenation.
 *
 * - __asm__ is used instead of asm so the macros also work when the
 *   compiler runs in a strict ISO mode (-std=c99 / -std=c11), where the
 *   plain asm keyword is not available.
 * - LOADPS/STORPS are extended asm, so the register's leading '%' has to be
 *   doubled ("%" reg -> "%%xmmN"). The other macros are basic asm and take
 *   the register text verbatim.
 * - STORPS declares its memory operand as an OUTPUT ("=m"); it is written,
 *   not read. Both memory macros also carry a "memory" clobber because
 *   movaps always moves 16 bytes while the operand expression handed in may
 *   name a narrower object.
 * - Every macro keeps its trailing ';' so existing call sites are unaffected.
 *
 * NOTE(review): none of these statements declares the xmm registers it
 * changes, and values are carried in registers from one statement to the
 * next. This only works as long as the compiler emits no SSE code of its
 * own between them.
 */
#define XORPS(a, b) __asm__ volatile("xorps " a "," b);
/* Aligned 16-byte load: mem -> reg. */
#define LOADPS(mem, reg) __asm__ volatile("movaps %0, %" reg : : "m"(mem) : "memory");
/* Aligned 16-byte store: reg -> mem. */
#define STORPS(reg, mem) __asm__ volatile("movaps %" reg ", %0" : "=m"(mem) : : "memory");
#define MOVAPS(src, dst) __asm__ volatile("movaps " src "," dst);
/* Copies the low 64 bits of src; the upper 64 bits of dst become zero. */
#define MOVQ(src, dst) __asm__ volatile("movq " src "," dst);
/* Broadcast lane 0 of reg to all four lanes. */
#define BCAST0(reg) __asm__ volatile("shufps $0, " reg "," reg);
/* Broadcast lane 1 of reg to all four lanes (85 == 0b01010101). */
#define BCAST1(reg) __asm__ volatile("shufps $85, " reg "," reg);
#define MULPS(src, dst) __asm__ volatile("mulps " src "," dst);
#define ADDPS(src, dst) __asm__ volatile("addps " src "," dst);
#define SUBPS(src, dst) __asm__ volatile("subps " src "," dst);
/* Approximate 1/sqrt(x) per lane. */
#define RSQRTPS(src, dst) __asm__ volatile("rsqrtps " src "," dst);
/* High 64 bits of src -> low 64 bits of dst; high half of dst is kept. */
#define MOVHLPS(src, dst) __asm__ volatile("movhlps " src "," dst);

/*
 Debug helper, currently disabled. The definition kept here copies an xmm
 register into dbgbuf and prints its four float lanes. Re-enabling it also
 requires the dbgbuf declaration inside GravityKernel and <stdio.h>, both of
 which are commented out in this file.
#define DEBUGPS(reg) asm("movaps " "%" reg " , %0"::"m"(*dbgbuf)); \
				printf(#reg ": %f %f %f %f\n", dbgbuf[0], dbgbuf[1], dbgbuf[2], dbgbuf[3]);
*/
/* Active definition: expands to nothing, so any DEBUGPS(...) use compiles away. */
#define DEBUGPS(reg)

/*
 * Sum the contributions of nj j-particles onto four i-particles (one per
 * SSE lane) and store the totals to *fodata.
 *
 * For every j-particle, per lane:
 *     d    = (xj, yj, zj) - (xi, yi, zi)
 *     rinv = rsqrtps(d.d + eps2)        (hardware approximation, not refined)
 *     phi -= mj * rinv
 *     a   += mj * rinv^3 * d
 *
 * ipdata  x, y, z, eps2: 16 bytes each are loaded with movaps.
 * fodata  ax, ay, az, phi: 16 bytes each are stored with movaps.
 * jpdata  array of nj elements; each element is read as one 16-byte vector
 *         whose float lanes are used as {x, y, z, m}.
 *         NOTE(review): jpdata++ must advance by exactly 16 bytes for this
 *         to hold - confirm sizeof(*jpdata) in gravity.h.
 * nj      number of j-particles; for nj <= 0 all outputs are zero and
 *         jpdata is not dereferenced.
 *
 * All three pointers must be 16-byte aligned (asserted). The code uses
 * xmm8-xmm15 and is therefore x86-64 only.
 *
 * The loop is software pipelined: while particle j is being evaluated, the
 * statements indented by one extra space fetch and unpack particle j+1.
 * X2/R2 share a register, as do MJ/MRINV3, so the statement order below
 * matters and must not be rearranged casually.
 */
void GravityKernel(pIpstruct ipdata, pFostruct fodata, pJpstruct jpdata, int nj){
	// static float dbgbuf[4] __attribute__ ((aligned(16)));
	int j;
	// movaps requires 16-byte aligned memory operands.
	assert(((unsigned long)jpdata & 15) == 0);
	assert(((unsigned long)ipdata & 15) == 0);
	assert(((unsigned long)fodata & 15) == 0);

	// Clear the accumulators.
	XORPS(AX, AX);
	XORPS(AY, AY);
	XORPS(AZ, AZ);
	XORPS(PHI, PHI);

	// puts("load i particle");
	LOADPS(*ipdata->x, XI);
	LOADPS(*ipdata->y, YI);
	LOADPS(*ipdata->z, ZI);
	LOADPS(*ipdata->eps2, EPS2);
	// puts("force loop");
	// Prime the pipeline with particle 0. Skipped for an empty list so that
	// jpdata[0] is never read when it does not exist.
	if(nj > 0){
		LOADPS(*jpdata, Y2);	// Y2 = {x, y, z, m}
		MOVQ(Y2, X2);			// X2 = {x, y, 0, 0}
		MOVHLPS(Y2, Z2);		// low half of Z2 = {z, m}
		BCAST0(X2);				// X2 = x in every lane
		BCAST1 (Y2);			// Y2 = y in every lane
		MOVQ(Z2, MJ);			// MJ = {z, m, 0, 0}; broadcast happens in the loop
		BCAST0(Z2);				// Z2 = z in every lane
	}

	for(j=0;j<nj;j++){
		SUBPS (XI, X2);			// dx = xj - xi
		BCAST1(MJ);				// MJ = m in every lane
		SUBPS  (YI, Y2);		// dy = yj - yi
		jpdata++;				// now points at particle j+1

		MOVAPS(X2, XJ);			// keep dx
		MULPS(X2, X2);			// R2 (== X2) = dx^2

		MOVAPS(Y2, YJ);			// keep dy
		MULPS(Y2, Y2);
		ADDPS(Y2, R2);			// R2 += dy^2

		SUBPS (ZI, Z2);			// dz = zj - zi
		// Fetch particle j+1 only if it exists; the original read one
		// element past the end of the array on the final iteration.
		if(j + 1 < nj){
			 LOADPS(*jpdata, Y2);
		}
		MOVAPS(Z2, ZJ);			// keep dz
		MULPS(Z2, Z2);
		ADDPS(Z2, R2);			// R2 += dz^2

		ADDPS (EPS2, R2);		// softening

		RSQRTPS(R2, R2TMP);		// R2TMP ~ 1/r
		 MOVQ(Y2, X2);			// R2 has been consumed, xmm8 is free for next x
		MULPS(R2TMP, MRINV3);	// m/r

		 MOVHLPS(Y2, Z2);
		SUBPS(MRINV3, PHI);		// phi -= m/r
		MULPS(R2TMP, R2TMP);	// 1/r^2

		 BCAST0(X2);
		MULPS(R2TMP, MRINV3);	// m/r^3

		MULPS(MRINV3, XJ);
		ADDPS(XJ, AX);

		MULPS(MRINV3, YJ);
		ADDPS(YJ, AY);
		 BCAST1(Y2);

		MULPS(MRINV3, ZJ);
		 MOVQ(Z2, MJ);			// MRINV3 no longer needed, xmm7 takes the next mass
		ADDPS(ZJ, AZ);
		 BCAST0(Z2);
	}
	// puts("store fodata");
	STORPS(AX, *fodata->ax);
	STORPS(AY, *fodata->ay);
	STORPS(AZ, *fodata->az);
	STORPS(PHI, *fodata->phi);
}

/*
int main(){
	int i;
	float iparray[4][4] = {
		{1.0, 0.0, -1.0, 0.0},
		{0.0, 1.0, 0.0, -1.0},
		{0.0, 0.0, 0.0, 0.0},
		{1./256., 1./256., 1./256., 1./256.,}};
	float jparry[4][4] = {
		{1.0, 0.0, 0.0, 1.0},	
		{0.0, 1.0, 0.0, 1.0},	
		{-1.0, 0.0, 0.0, 1.0},	
		{0.0, -1.0, 0.0, 1.0}};

	posix_memalign(&jpdata, 16, 4*sizeof(*jpdata));
	memcpy(&ipdata, iparray, 16*sizeof(float));
	memset(&force_out, 0xbb, 64);
	// jpdata = valloc(64);
	memcpy(jpdata, jparry, 4*sizeof(*jpdata));

	puts("gravity");
	GravityKernel(4);

	for(i=0;i<4;i++){
		printf("%f %f %f %f\n", force_out.ax[i], force_out.ay[i], force_out.az[i], force_out.phi[i]);

	}

	return 0;
}
*/

