/*
 * Alignment qualifiers for declarations.
 * ALIGN16 requests 16-byte alignment, matching the 16-byte vector types
 * declared in this file; ALIGN64 requests 64-byte alignment (presumably
 * one cache line -- confirm for the target CPU).
 * The attribute name is spelled __aligned__ so that a user macro called
 * `aligned` cannot interfere with it.
 */
#define ALIGN16 __attribute__((__aligned__(16)))
#define ALIGN64 __attribute__((__aligned__(64)))

/*
 * 16-byte SIMD vector types:
 *   v2df - 2 x double      v4sf - 4 x float
 *   v4si - 4 x int         v8hi - 8 x short
 *
 * GCC 4 and every later release declare them with vector_size(16).
 * Testing for "== 4" here would send GCC 5+ down the legacy branch,
 * whose mode(...) spelling is deprecated for vector types.
 */
#if __GNUC__ >= 4
typedef double v2df __attribute__ ((vector_size(16)));
typedef float  v4sf __attribute__ ((vector_size(16)));
typedef int    v4si __attribute__ ((vector_size(16)));
typedef short  v8hi __attribute__ ((vector_size(16)));
#else
/* Pre-4.x compilers: vector types are selected through the machine mode. */
typedef double v2df __attribute__ ((mode(V2DF)));
typedef float  v4sf __attribute__ ((mode(V4SF)));
typedef int    v4si __attribute__ ((mode(V4SI)));
typedef short  v8hi __attribute__ ((mode(V8HI)));
#endif

/*
 * V2DF_GATHER(reg, p0, p1): fill the v2df lvalue `reg` from two separate
 * addresses -- the double at p0 goes to the low element (loadlpd), the
 * double at p1 to the high element (loadhpd).  `reg` is both read and
 * assigned, and is evaluated more than once, so pass a plain variable.
 * The expansion is a single parenthesised comma expression.
 *
 * The macro is provided for every compiler that does not report
 * __GNUC__ == 4, so code that tests for its presence sees no change.
 */
#if __GNUC__ != 4
#define V2DF_GATHER(reg, p0, p1) \
	((reg) = __builtin_ia32_loadlpd((reg), (void *)(p0)), \
	 (reg) = __builtin_ia32_loadhpd((reg), (void *)(p1)))
#endif

/*
 * Input record holding four floats per field (64 bytes in total).
 * Each array is exactly as wide as one v4sf vector.
 * NOTE(review): presumably x/y/z are the coordinates of four particles
 * processed together, one per SIMD lane -- confirm against the kernel
 * that consumes this structure.
 */
struct iptdata {
	float x[4];
	float y[4];
	float z[4];
	float eps2[4]; /* not used in this implementation */
};
typedef struct iptdata Ipdata;
typedef struct iptdata *pIpdata;

/*
 * Output record holding four floats per field (64 bytes in total).
 * Each array is exactly as wide as one v4sf vector.
 * NOTE(review): presumably ax/ay/az are acceleration components and phi
 * the potential for four particles, one per SIMD lane -- confirm against
 * the kernel that fills this structure.
 */
struct fodata {
	float ax[4];
	float ay[4];
	float az[4];
	float phi[4];
};
typedef struct fodata Fodata;
typedef struct fodata *pFodata;

/*
 * One record of four packed floats (16 bytes, the size of one v4sf).
 * NOTE(review): presumably a particle position (x, y, z) followed by its
 * mass m -- confirm against the code that loads these records.
 */
struct jpdata {
	float x;
	float y;
	float z;
	float m;
};
typedef struct jpdata Jpdata;
typedef struct jpdata *pJpdata;

#if __GNUC__ == 4
/*
 * Print the four elements of v to stdout on one line, element 0 first,
 * separated by single spaces and formatted with "%f".
 */
static void v4sf_print(v4sf v){
	/* Pull each lane out into a scalar before formatting. */
	const float e0 = __builtin_ia32_vec_ext_v4sf(v, 0);
	const float e1 = __builtin_ia32_vec_ext_v4sf(v, 1);
	const float e2 = __builtin_ia32_vec_ext_v4sf(v, 2);
	const float e3 = __builtin_ia32_vec_ext_v4sf(v, 3);

	printf("%f %f %f %f\n", e0, e1, e2, e3);
}

/*
 * Transpose a 4x4 block of floats held in four vectors.
 *
 * Treating s0..s3 as the rows of a matrix, *dN receives column N:
 *   *dN = { s0[N], s1[N], s2[N], s3[N] }
 *
 * The sources are passed by value, so the destinations may refer to the
 * same objects the caller copied the sources from.
 */
static inline void v4sf_transpose(
		v4sf *d0, v4sf *d1, v4sf *d2, v4sf *d3, 
		v4sf  s0, v4sf  s1, v4sf  s2, v4sf  s3)
{
	/* Stage 1: interleave rows 0/2 and rows 1/3, low and high halves. */
	const v4sf lo02 = __builtin_ia32_unpcklps(s0, s2); /* s0[0] s2[0] s0[1] s2[1] */
	const v4sf lo13 = __builtin_ia32_unpcklps(s1, s3); /* s1[0] s3[0] s1[1] s3[1] */
	const v4sf hi02 = __builtin_ia32_unpckhps(s0, s2); /* s0[2] s2[2] s0[3] s2[3] */
	const v4sf hi13 = __builtin_ia32_unpckhps(s1, s3); /* s1[2] s3[2] s1[3] s3[3] */

	/* Stage 2: interleave the stage-1 results to complete each column. */
	*d0 = __builtin_ia32_unpcklps(lo02, lo13);
	*d1 = __builtin_ia32_unpckhps(lo02, lo13);
	*d2 = __builtin_ia32_unpcklps(hi02, hi13);
	*d3 = __builtin_ia32_unpckhps(hi02, hi13);
}

/*
 * Widen the four floats of vec to double and store them separately:
 * element 0 -> *d0, element 1 -> *d1, element 2 -> *d2, element 3 -> *d3.
 */
static inline void v4sf_store_dp(v4sf vec, double *d0, double *d1, double *d2, double *d3){
	const v4sf zero = {0,0,0,0};
	/* cvtps2pd converts only the two low floats of its operand ... */
	const v2df lo = __builtin_ia32_cvtps2pd(vec);
	/* ... so movhlps first brings elements 2 and 3 down into the low half. */
	const v2df hi = __builtin_ia32_cvtps2pd(__builtin_ia32_movhlps(zero, vec));

	/* Elements are extracted one at a time; an earlier variant using
	   storelpd/storehpd on the destinations was disabled. */
	*d0 = __builtin_ia32_vec_ext_v2df(lo, 0);
	*d1 = __builtin_ia32_vec_ext_v2df(lo, 1);
	*d2 = __builtin_ia32_vec_ext_v2df(hi, 0);
	*d3 = __builtin_ia32_vec_ext_v2df(hi, 1);
}
#endif
