/*
 Virtual GRAPE-5 using SSE
*/
// #include <stdio.h>
// #include <stdlib.h>
#include <assert.h>
#include "gravity.h"

/*
 * v4sf: a 16-byte vector of four floats.
 * Compilers reporting GNUC 4 or later take the vector_size spelling; the
 * mode(V4SF) spelling is kept only as a fallback for older compilers, since
 * newer GCC releases flag it as deprecated.
 */
#if __GNUC__ >= 4
typedef float v4sf __attribute__ ((vector_size(16)));
#else
typedef float v4sf __attribute__ ((mode(V4SF)));
#endif

/*
 * Accumulate the softened gravitational acceleration and potential that nj
 * j-particles exert on four i-particles, one i-particle per vector lane.
 *
 * ipdata: ipdata->x, ->y, ->z and ->eps2 are each loaded as one 4-float
 *         vector (i-particle coordinates and the softening term added to r^2).
 * fodata: fodata->ax, ->ay, ->az and ->phi are each stored as one 4-float
 *         vector; previous contents are overwritten, not accumulated into.
 * jpdata: elements jpdata[0] .. jpdata[nj-1] are each read as one 4-float
 *         vector whose lanes 0..3 are used as x, y, z and mass.
 *         NOTE(review): assumes each element is at least 16 bytes and keeps
 *         16-byte alignment across the array -- confirm against gravity.h.
 * nj:     number of j-elements to process. With nj <= 0 nothing is read
 *         from jpdata and all four outputs are stored as zero.
 *
 * For every j the per-lane update is
 *     d    = (xj, yj, zj) - (xi, yi, zi)
 *     rinv ~ 1 / sqrt(|d|^2 + eps2)
 *     phi -= mj * rinv
 *     a   += d * mj * rinv^3
 * The three pointers must be 16-byte aligned; this is checked with assert().
 */
void GravityKernel(pIpstruct ipdata, pFostruct fodata, pJpstruct jpdata, int nj){
	const v4sf zero = {0.f, 0.f, 0.f, 0.f};
	v4sf ax = zero, ay = zero, az = zero, phi = zero;
	v4sf xi, yi, zi, eps2;
	int j;

	// The whole-vector loads and stores below go through v4sf pointers.
	assert(((unsigned long)jpdata & 15) == 0);
	assert(((unsigned long)ipdata & 15) == 0);
	assert(((unsigned long)fodata & 15) == 0);

	xi = *(v4sf *)(ipdata->x);
	yi = *(v4sf *)(ipdata->y);
	zi = *(v4sf *)(ipdata->z);
	eps2 = *(v4sf *)(ipdata->eps2);

	for(j=0;j<nj;j++){
		// Load element j at the top of the iteration (instead of fetching
		// element j+1 at the bottom) so that nothing beyond jpdata[nj-1]
		// is ever dereferenced.
		const v4sf jp = *(v4sf *)(jpdata+j);

		// Broadcast lane 0/1/2/3 of the j-element to all four lanes,
		// then form the separation from each i-particle.
		const v4sf dx = __builtin_ia32_shufps (jp, jp, 0x00) - xi;
		const v4sf dy = __builtin_ia32_shufps (jp, jp, 0x55) - yi;
		const v4sf dz = __builtin_ia32_shufps (jp, jp, 0xaa) - zi;
		const v4sf m  = __builtin_ia32_shufps (jp, jp, 0xff);

		const v4sf r2 = dx*dx + dy*dy + dz*dz + eps2;
		// rsqrtps is the SSE approximate reciprocal square root, so
		// results carry that instruction's reduced precision.
		const v4sf rinv = __builtin_ia32_rsqrtps(r2);
		const v4sf mrinv = m * rinv;
		const v4sf mrinv3 = rinv*rinv*mrinv;

		phi -= mrinv;
		ax += dx * mrinv3;
		ay += dy * mrinv3;
		az += dz * mrinv3;
	}

	*(v4sf *)(fodata->ax) = ax;
	*(v4sf *)(fodata->ay) = ay;
	*(v4sf *)(fodata->az) = az;
	*(v4sf *)(fodata->phi) = phi;
}

/*
int main(){
	int i;
	float iparray[4][4] = {
		{1.0, 0.0, -1.0, 0.0},
		{0.0, 1.0, 0.0, -1.0},
		{0.0, 0.0, 0.0, 0.0},
		{1./256., 1./256., 1./256., 1./256.,}};
	float jparry[4][4] = {
		{1.0, 0.0, 0.0, 1.0},	
		{0.0, 1.0, 0.0, 1.0},	
		{-1.0, 0.0, 0.0, 1.0},	
		{0.0, -1.0, 0.0, 1.0}};

	posix_memalign(&jpdata, 16, 4*sizeof(*jpdata));
	memcpy(&ipdata, iparray, 16*sizeof(float));
	memset(&force_out, 0xbb, 64);
	// jpdata = valloc(64);
	memcpy(jpdata, jparry, 4*sizeof(*jpdata));

	puts("gravity");
	GravityKernel(4);

	for(i=0;i<4;i++){
		printf("%f %f %f %f\n", force_out.ax[i], force_out.ay[i], force_out.az[i], force_out.phi[i]);

	}

	return 0;
}
*/
