// Overlays a float and an unsigned integer in the same storage so that the
// bit pattern of a float can be read or masked through m_int.
// NOTE(review): in the code visible here it is only referenced by a
// commented-out variant of getSGCurrent - confirm whether it is still needed.
union floatUnion
{
	float m_float; // value viewed as a floating point number
	uint m_int;    // the same storage viewed as an unsigned integer
};

#define LIM 1e-5

float getSGCurrent(float v, float D, float delta, float n1, float n2)
{
	float vDelta = v*delta;
	float x = vDelta/D;
	float j = 0;

	if (x < -LIM)
	{
		float factor = exp(-x);
		float n2factor = n2*factor;
		float oneMinusFactor = 1.0f-factor;
		float n1MinusN2Factor = n1-n2factor;
		float n1MinusN2FactorOverOneMinusFactor = n1MinusN2Factor/oneMinusFactor;

		j = v*n1MinusN2FactorOverOneMinusFactor;
	}
	else if (x > LIM)
	{
		float factor = exp(x);
		float factorMinusOne = factor - 1.0f;
		float n1factor = n1*factor;
		float n1factorMinusN2 = n1factor - n2;
		float n1factorMinusN2OverFactorMinusOne = n1factorMinusN2/factorMinusOne;

		j = v*n1factorMinusN2OverFactorMinusOne;
	}
	else
	{
		x /= 2.0f;

		float twoD = 2.0f*D;
		float factor = twoD/delta;
		float n1MinusN2 = n1-n2;
		float n1PlusN2 = n1+n2;
		float term1 = 0.5f*n1MinusN2;
		float term2Part = 0.5f*n1PlusN2;
		float term2 = term2Part*x;
		float xSquared = x*x;
		float term3Part = n1MinusN2*xSquared;
		float term3 = term3Part/6.0f;

		float seriesPart = term1 + term2;
		float series = seriesPart/* + term3*/;

		j = factor*series;
	}

	return j;
}

/*
float getSGCurrent(float v, float D, float delta, float n1, float n2)
{
	float x = v*delta/D;
	float j = 0;


	if (x < -LIM)
	{
		float factor = exp(x);

		j = v*(n1*factor-n2)/(factor-1.0f);
	}
	else if (x > LIM)
	{
		float factor = exp(-x);

		j = v*(n1-factor*n2)/(1.0f-factor);
	}
	else
	{
		x /= 2.0f;

		float factor = 2.0f*D/delta;

		float series = 0.5f*(n1-n2) + 0.5f*(n1+n2)*x + (n1-n2)*x*x/6.0f;

		j = factor*series;
	}

	//if (isnan(j))
	//	j = 0;

	return j;
}
*/
/*
union floatUnion
{
	float m_float;
	uint m_int;
};

float getSGCurrent(float v, float D, float delta, float n1, float n2)
{
	float x = v*delta/D;
	union floatUnion j1;
	union floatUnion j2;
	union floatUnion j3;
	union floatUnion j;

	j1.m_int = 0;
	j2.m_int = 0;
	j3.m_int = 0;
	j.m_int = 0;

#define LIM 1e-5

	uint xSmaller = ~((x < -LIM)-1);
	uint xLarger = ~((x > LIM)-1);
	uint xMiddle = ~(xSmaller|xLarger);

	{
		float factor = exp(x);

		j1.m_float = v*(n1*factor-n2)/(factor-1.0f);
		j1.m_int &= xSmaller;
	}
	{
		float factor = exp(-x);

		j2.m_float = v*(n1-factor*n2)/(1.0f-factor);
		j2.m_int &= xLarger;
	}
	{
		x /= 2.0f;

		float factor = 2.0f*D/delta;

		float series = 0.5f*(n1-n2) + 0.5f*(n1+n2)*x + (n1-n2)*x*x/6.0f;

		j3.m_float = factor*series;
		j3.m_int &= xMiddle;
	}
	
	j.m_int = (j1.m_int|j2.m_int|j3.m_int);

	return j.m_float;
}
*/

// Computes the two currents on the link between cell idx and its neighbour
// in the +x direction. The column index wraps around (column width-1 is
// linked to column 0 of the same row).
//
// Diffusion constants and mobilities are averaged over the two cells, and
// the field is the negated potential difference. The result holds the
// electron current in .x and the hole current in .y.
// The height parameter is not used here.
float2 getXCurrents(int idx,
                    __global const float *pV,
                    __global const float *pDe, __global const float *pDh,
                    __global const float *pEMob, __global const float *pHMob,
                    __global const float *pN, __global const float *pP,
                    int width, int height)
{
	int col = idx % width;
	int row = idx / width;
	int rightCol = (col+1) % width;
	int rightIdx = rightCol + row*width;

	// Field on the link
	float potentialStep = pV[rightIdx] - pV[idx];
	float Ex = -( potentialStep );

	// Material parameters on the link: mean of the two cells
	float linkDe = (pDe[idx]+pDe[rightIdx])/2.0f;
	float linkDh = (pDh[idx]+pDh[rightIdx])/2.0f;
	float linkEMob = (pEMob[idx]+pEMob[rightIdx])/2.0f;
	float linkHMob = (pHMob[idx]+pHMob[rightIdx])/2.0f;

	// Electrons and holes get opposite signs
	float electronV = -linkEMob*Ex;
	float holeV = +linkHMob*Ex;

	float electronCurrent = getSGCurrent(electronV, linkDe, 1.0f, pN[idx], pN[rightIdx]);
	float holeCurrent = getSGCurrent(holeV, linkDh, 1.0f, pP[idx], pP[rightIdx]);

	return (float2)(electronCurrent, holeCurrent);
}

// Computes the two currents on the link between cell idx and its neighbour
// in the +y direction. There is no wrap-around in y: for the last row the
// "neighbour" is the cell itself (see the comment at nextRow).
//
// Diffusion constants and mobilities are averaged over the two cells, and
// the field is the negated potential difference. The result holds the
// electron current in .x and the hole current in .y.
float2 getYCurrents(int idx,
                    __global const float *pV,
                    __global const float *pDe, __global const float *pDh,
                    __global const float *pEMob, __global const float *pHMob,
                    __global const float *pN, __global const float *pP,
                    int width, int height)
{
	int col = idx % width;
	int row = idx / width;
	// row/(height-1) is 1 for the last row and 0 for the rows before it, so
	// the last row maps onto itself; that value is not meant to be used.
	// NOTE(review): height == 1 would divide by zero here.
	int nextRow = row + 1 - row/(height-1);
	int upIdx = col + nextRow*width;

	// Field on the link
	float potentialStep = pV[upIdx]-pV[idx];
	float Ey = -( potentialStep );

	// Material parameters on the link: mean of the two cells
	float linkDe = (pDe[idx]+pDe[upIdx])/2.0f;
	float linkDh = (pDh[idx]+pDh[upIdx])/2.0f;
	float linkEMob = (pEMob[idx]+pEMob[upIdx])/2.0f;
	float linkHMob = (pHMob[idx]+pHMob[upIdx])/2.0f;

	// Electrons and holes get opposite signs
	float electronV = -linkEMob*Ey;
	float holeV = +linkHMob*Ey;

	float electronCurrent = getSGCurrent(electronV, linkDe, 1.0f, pN[idx], pN[upIdx]);
	float holeCurrent = getSGCurrent(holeV, linkDh, 1.0f, pP[idx], pP[upIdx]);

	return (float2)(electronCurrent, holeCurrent);
}

// One work-item per cell: evaluates the electron and hole currents on the
// links towards the +x and +y neighbours of the cell and stores them in the
// four output arrays at the cell's own index.
__kernel void currentsKernel(__global const float *pV,
                             __global const float *pDe, __global const float *pDh,
                             __global const float *pEMob, __global const float *pHMob,
                             __global const float *pN, __global const float *pP,
                             __global float *pECurX, __global float *pECurY,
                             __global float *pHCurX, __global float *pHCurY,
                             int width, int height)
{
	int cell = get_global_id(0);

	float2 alongX = getXCurrents(cell, pV, pDe, pDh, pEMob, pHMob, pN, pP, width, height);
	float2 alongY = getYCurrents(cell, pV, pDe, pDh, pEMob, pHMob, pN, pP, width, height);

	// Component .x carries the electron current, .y the hole current
	float electronX = alongX.x;
	float holeX = alongX.y;
	float electronY = alongY.x;
	float holeY = alongY.y;

	pECurX[cell] = electronX;
	pHCurX[cell] = holeX;
	pECurY[cell] = electronY;
	pHCurY[cell] = holeY;
}

/*
__kernel void currentsKernel(__global const float *pV, 
                          __global const float *pDe, __global const float *pDh,
			  __global const float *pEMob, __global const float *pHMob,
			  __global const float *pN, __global const float *pP,
			  __global float *pECurX, __global float *pECurY,
			  __global float *pHCurX, __global float *pHCurY,
			  int width, int height)
{
	int idx = get_global_id(0);
	int x = idx % width;
	int y = idx / width;
	int nextY = y + 1 - y/(height-1); // if no next y is available, this just yields the same y, we won't be using this calculation, so it doesn't really matter
	int nextIdx = x + nextY*width;

	float Ey = -(pV[nextIdx]-pV[idx]);
	float De = (pDe[idx]+pDe[nextIdx])/2.0f;
	float Dh = (pDh[idx]+pDh[nextIdx])/2.0f;
	float eMob = (pEMob[idx]+pEMob[nextIdx])/2.0f;
	float hMob = (pHMob[idx]+pHMob[nextIdx])/2.0f;

	float ve = -eMob*Ey;
	float vh = +hMob*Ey;

	float xe = ve/De;

	float n1 = pN[idx];
	float n2 = pN[nextIdx];
		float factor = exp(xe);
		float factorMinusOne = factor - 1.0f;
		float n1factor = n1*factor;
		float n1factorMinusN2 = n1factor - n2;
		float n1factorMinusN2OverFactorMinusOne = n1factorMinusN2/factorMinusOne;

		float j = ve*n1factorMinusN2OverFactorMinusOne;

	pECurX[idx] = 0;
	pECurY[idx] = 0;
	pHCurX[idx] = factor;
	pHCurY[idx] = j;

}
*/
// Shared implementation of the two density update kernels below.
//
// For the cell handled by this work-item it evaluates the currents on the
// links to the +x and +y neighbours and on the links coming from the -x
// (wrapping around) and -y neighbours, takes the differences as the current
// divergence, adds generation minus recombination (pRecFactor*p*n) and
// advances both densities by one explicit step of size scaledDt:
//
//     pNdst[idx] = n + (netGen - divJe)*scaledDt
//     pPdst[idx] = p + (netGen - divJh)*scaledDt
//
// pN/pP are only read, pNdst/pPdst are only written.
//
// NOTE(review): for a work-item in row 0 the index of the cell below is
// negative, and getYCurrents would then read outside the arrays. Presumably
// the host only launches this for rows >= 1 - confirm against the caller.
void updateDensitiesImpl(__global const float *pV,
                         __global const float *pDe, __global const float *pDh,
                         __global const float *pEMob, __global const float *pHMob,
                         __global const float *pGenRate, __global const float *pRecFactor,
                         __global const float *pN, __global const float *pP,
                         __global float *pNdst, __global float *pPdst,
                         int width, int height,
                         float scaledDt)
{
	int idx = get_global_id(0);
	int x = idx % width;
	int y = idx / width;
	int prevX = (x-1+width)%width; // periodic in x
	int leftIdx = prevX + y*width;
	int belowIdx = x + (y-1)*width;

	float2 xCurrentsIdx = getXCurrents(idx, pV, pDe, pDh, pEMob, pHMob, pN, pP, width, height);
	float2 yCurrentsIdx = getYCurrents(idx, pV, pDe, pDh, pEMob, pHMob, pN, pP, width, height);
	float2 xCurrentsLeftIdx = getXCurrents(leftIdx, pV, pDe, pDh, pEMob, pHMob, pN, pP, width, height);
	float2 yCurrentsBelowIdx = getYCurrents(belowIdx, pV, pDe, pDh, pEMob, pHMob, pN, pP, width, height);

	// Current differences across the cell; .x specifies the electron
	// current, .y the hole current
	float Jexx = (xCurrentsIdx.x - xCurrentsLeftIdx.x);
	float Jeyy = (yCurrentsIdx.x - yCurrentsBelowIdx.x);
	float Jhxx = (xCurrentsIdx.y - xCurrentsLeftIdx.y);
	float Jhyy = (yCurrentsIdx.y - yCurrentsBelowIdx.y);

	// Calculate the rates according to the continuity equation
	float n = pN[idx];
	float p = pP[idx];
	float recombination = pRecFactor[idx]*p*n;
	float netGen = pGenRate[idx] - recombination;

	float dndt = netGen - ( Jexx+Jeyy );
	float dpdt = netGen - ( Jhxx+Jhyy );

	pNdst[idx] = n + dndt*scaledDt;
	pPdst[idx] = p + dpdt*scaledDt;
}

// Advances the electron and hole densities by one step of size scaledDt,
// reading from pN/pP and writing to pNdst/pPdst. One work-item per cell;
// see updateDensitiesImpl for the calculation.
__kernel void updateDensitiesKernel(__global const float *pV,
                                    __global const float *pDe, __global const float *pDh,
                                    __global const float *pEMob, __global const float *pHMob,
                                    __global const float *pGenRate, __global const float *pRecFactor,
                                    __global const float *pN, __global const float *pP,
                                    __global float *pNdst, __global float *pPdst,
                                    int width, int height,
                                    float scaledDt)
{
	updateDensitiesImpl(pV, pDe, pDh, pEMob, pHMob, pGenRate, pRecFactor,
	                    pN, pP, pNdst, pPdst, width, height, scaledDt);
}

// Same update as updateDensitiesKernel; only the argument order differs:
// the destination buffers (pNdst, pPdst) come before the source buffers
// (pN, pP) in the kernel's argument list.
__kernel void updateDensitiesKernelSwapped(__global const float *pV,
                                           __global const float *pDe, __global const float *pDh,
                                           __global const float *pEMob, __global const float *pHMob,
                                           __global const float *pGenRate, __global const float *pRecFactor,
                                           __global float *pNdst, __global float *pPdst,
                                           __global const float *pN, __global const float *pP,
                                           int width, int height,
                                           float scaledDt)
{
	updateDensitiesImpl(pV, pDe, pDh, pEMob, pHMob, pGenRate, pRecFactor,
	                    pN, pP, pNdst, pPdst, width, height, scaledDt);
}

// One relaxation pass over the potential pV on a checkerboard pattern.
//
// For each cell a predicted potential is formed from its four neighbours
// (periodic in x) and the local charge:
//
//     prediction = (A1*V[right] + A2*V[left] + A3*V[up] + A4*V[down]
//                   + (p - n + bg)*chargeMultiplier) / A0
//
// Cells for which (x + y + blackOrRed + 1) is odd are moved from their
// current value towards the prediction by the fraction w; for all other
// cells the difference is multiplied by zero before being added.
//
// NOTE(review): the rows above and below are addressed without any bounds
// handling; presumably the host launches this only for interior rows -
// confirm against the caller.
__kernel void blackRedKernel(float chargeMultiplier,
                             __global const float *pN, __global const float *pP, __global const float *pBg,
                             __global const float *pA0, __global const float *pA1, __global const float *pA2,
                             __global const float *pA3, __global const float *pA4,
                             __global float *pV, float w, int blackOrRed, int width, int height)
{
	int idx = get_global_id(0);
	int col = idx % width;
	int row = idx / width;

	// Neighbour indices; the column wraps around, the row does not
	int leftIndex = (col-1+width)%width + row*width;
	int rightIndex = (col+1)%width + row*width;
	int upIndex = col + (row+1)*width;
	int downIndex = col + (row-1)*width;

	float charge = (pP[idx]-pN[idx]+pBg[idx])*chargeMultiplier;

	float neighbourSum = pA1[idx]*pV[rightIndex] + pA2[idx]*pV[leftIndex]
	                   + pA3[idx]*pV[upIndex] + pA4[idx]*pV[downIndex];
	float prediction = ( neighbourSum + charge ) / pA0[idx];

	float curValue = pV[idx];
	float diff = prediction-curValue;

	// 1 for the cells that are updated in this pass, 0 for the others
	int update = (col+row+blackOrRed+1)%2;
	float step = (diff*w)*((float)update);

	pV[idx] = curValue + step;
}

