#include "simulation2ddoublegpurel.h"
#include "simulationstate.h"
#include <serut/fileserializer.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <iostream>
#include <cmath>

using namespace serut;

// Stop-condition policy that never requests an early stop: isDone() always
// reports false.
class GPURelNoCheck
{
public:
	// Always false.
	bool isDone() { return false; }
};

// Stop-condition policy based on wall-clock time: a deadline is fixed at
// construction and isDone() reports whether it has passed.
class GPURelTimeCheck
{
public:
	// maxSec: number of seconds, counted from the moment of construction,
	// after which isDone() starts returning true.
	GPURelTimeCheck(int maxSec) : m_endTime(time(0) + maxSec)
	{
	}

	// True once the current time is strictly later than the stored deadline.
	bool isDone()
	{
		return time(0) > m_endTime;
	}
private:
	time_t m_endTime; // absolute deadline as returned by time()
};

#define LIM 1e-10

// Evaluates j = v*(n1*exp(x) - n2)/(exp(x) - 1) with x = v*delta/D.
//
// Three branches are used, selected by the value of x:
//  - x < -LIM : the expression is evaluated directly;
//  - x >  LIM : numerator and denominator are both multiplied by exp(-x), so
//               that only exp(-x) has to be evaluated;
//  - otherwise: the denominator is too close to zero, and a second order
//               series expansion in x/2 is used instead.
//
// v     : velocity term
// D     : diffusion term
// delta : spacing between the two grid points
// n1,n2 : values at the first and second grid point
inline double Simulation2DDoubleGPURel::getSGCurrent(double v, double D, double delta, double n1, double n2)
{
	const double x = (v*delta)/D;

	if (x < -LIM)
	{
		const double expX = std::exp(x);

		return v*((n1*expX - n2)/(expX - 1.0));
	}

	if (x > LIM)
	{
		const double expMinusX = std::exp(-x);

		return v*((n1 - n2*expMinusX)/(1.0 - expMinusX));
	}

	// Series expansion around x = 0, written in terms of x/2
	const double halfX = x/2.0;
	const double difference = n1 - n2;
	const double total = n1 + n2;

	const double series = (0.5*difference + (0.5*total)*halfX) + (difference*(halfX*halfX))/6.0;

	return ((2.0*D)/delta)*series;
}

// TODO: to use external file, mainly for debugging
//
// Reads the complete contents of the file 'fileName' and returns them as a
// string. An empty string is returned if the file cannot be opened, if its
// size cannot be determined, or if it is empty. If fewer bytes than expected
// can be read, only the bytes that were actually read are returned.
//
// The file handle is always closed before returning.
std::string GPURelLoadCLFile(const std::string &fileName)
{
	FILE *pFile = fopen(fileName.c_str(), "rb");
	if (pFile == 0)
		return std::string("");

	std::string prog;

	if (fseek(pFile, 0, SEEK_END) == 0)
	{
		// ftell returns a long, and -1 on failure
		long length = ftell(pFile);

		if (length > 0 && fseek(pFile, 0, SEEK_SET) == 0)
		{
			prog.resize((size_t)length);

			size_t numRead = fread(&prog[0], 1, (size_t)length, pFile);

			// Keep only what was really read (short read or read error)
			prog.resize(numRead);
		}
	}

	fclose(pFile);

	return prog;
}

// Creates an uninitialized simulation: the grid is empty, the OpenCL buffer
// handles are zeroed and the parameters steering the choice of potential
// solver get their default values. init() or read() must succeed before the
// object holds a usable grid.
Simulation2DDoubleGPURel::Simulation2DDoubleGPURel()
{
	// No OpenCL buffers are held yet
	zeroCLMemory();

	// Empty grid
	m_width = m_height = m_totalPixels = 0;
	m_pixelWidth = m_pixelHeight = 0;
	m_pixelFrac = m_pixelFracInv = 1;
	m_deltaPhi = 0;

	// Settings used by potentialFinder() to choose between its two code paths
	m_phiBackupBetterCounter = 0;
	m_phiMethodThreshold = 100;
	m_phiMethodAdditionalCheckCount = 1000;

	m_init = false;
}

// Releases the OpenCL memory objects that are still held by this instance.
Simulation2DDoubleGPURel::~Simulation2DDoubleGPURel()
{
	clearCLMemory();
}

// Initializes the simulation grid.
//
// lowestWidth, lowestHeight : size of the coarsest multigrid level (at least 2 x 3)
// realWidth, realHeight     : physical dimensions covered by the grid
// scaleSteps                : number of multigrid levels (1..30); the finest
//                             grid is 2^(scaleSteps-1) times larger than the
//                             coarsest one in each direction
// pNumX, pNumY              : if not null, receive the size of the finest grid
//
// Returns false (with the error string set for the argument checks) if the
// instance was already initialized, if an argument is invalid, if the GPU
// could not be initialized, or if the arrays could not be allocated. In the
// last two cases any error description is left to initGPU()/resizeArrays().
bool Simulation2DDoubleGPURel::init(int lowestWidth, int lowestHeight, double realWidth, double realHeight, int scaleSteps, int *pNumX, int *pNumY)
{
	if (m_init)
	{
		setErrorString("Already initialized");
		return false;
	}

	if (lowestWidth < 2 || lowestHeight < 3)
	{
		setErrorString("Invalid grid dimensions");
		return false;
	}

	// The upper bound keeps the bit shift below well defined for a 32 bit int
	if (scaleSteps < 1 || scaleSteps > 30)
	{
		setErrorString("Invalid number of multigrid steps");
		return false;
	}

	int multiplier = 1 << (scaleSteps-1); // bitshift to accomplish 2^(scaleSteps-1)

	// Make sure neither the grid dimensions nor the total pixel count can overflow
	const int maxInt = 0x7fffffff;

	if (lowestWidth > maxInt/multiplier || lowestHeight > maxInt/multiplier ||
	    lowestWidth*multiplier > maxInt/(lowestHeight*multiplier))
	{
		setErrorString("Invalid grid dimensions");
		return false;
	}

	if (!initGPU())
		return false;

	// Calculate the number of pixels on the highest resolution grid.

	int xPixels = lowestWidth*multiplier;
	int yPixels = lowestHeight*multiplier;

	if (pNumX != 0)
		*pNumX = xPixels;
	if (pNumY != 0)
		*pNumY = yPixels;

	m_multiScaleSteps = scaleSteps;

	m_width = xPixels;
	m_height = yPixels;
	m_totalPixels = m_width*m_height;

	// In the Y direction the first and last row are both part of the grid,
	// so there are yPixels-1 intervals.
	m_pixelWidth = realWidth/(double)xPixels;
	m_pixelHeight = realHeight/(double)(yPixels-1);
	m_pixelFrac = m_pixelWidth/m_pixelHeight;
	m_pixelFracInv = m_pixelHeight/m_pixelWidth;

	// Allocate enough memory for various arrays; this also allocates the
	// OpenCL buffers, which can fail.
	if (!resizeArrays())
		return false;

	m_deltaPhi = 0;

	m_init = true;
	m_initPotential = true;
	m_epsChanged = true;
	m_extraPotentialChanged = true;

	// Internal units
	m_npDensFactor = 1.0*CONST_EPSILON0/(CHARGE_ELECTRON*m_pixelWidth*m_pixelWidth);
	m_chargeMultiplier = (CHARGE_ELECTRON*m_pixelWidth*m_pixelWidth/CONST_EPSILON0) * m_npDensFactor;
	m_chargeMultiplierFloat = (float)m_chargeMultiplier;
	m_timeFactor = 1.0e-10;

	//std::cout << "m_npDensFactor = " << m_npDensFactor << std::endl;
	//std::cout << "m_chargeMultiplier = " << m_chargeMultiplier << std::endl;
	//std::cout << "m_timeFactor = " << m_timeFactor << std::endl;
	//std::cout << "m_pixelWidth = " << m_pixelWidth << std::endl;

	return true;
}

#define SIMULATION2DDOUBLEGPUREL_STARTID	0x474d4953
#define SIMULATION2DDOUBLEGPUREL_VERSION	1
#define SIMULATION2DDOUBLEGPUREL_ENDID		0x444e4f43

// Serializes the state of an initialized simulation to 's'.
//
// The layout is: start identifier, format version, grid dimensions and number
// of multigrid steps, pixel size, internal unit factors, the per-pixel arrays,
// the 'initialize potential' flag and finally an end identifier. read()
// expects exactly this order.
//
// Only the result of the last write is inspected. Returns false (error string
// set) if the simulation is not initialized or if that last write failed.
bool Simulation2DDoubleGPURel::write(SerializationInterface &s) const
{
	if (!m_init)
	{
		setErrorString("Can't save an uninitialized simulation");
		return false;
	}

	// Header
	s.writeInt32(SIMULATION2DDOUBLEGPUREL_STARTID);
	s.writeInt32(SIMULATION2DDOUBLEGPUREL_VERSION);

	// Grid layout
	s.writeInt32(m_width);
	s.writeInt32(m_height);
	s.writeInt32(m_multiScaleSteps);
	s.writeDouble(m_pixelWidth);
	s.writeDouble(m_pixelHeight);

	// Internal units
	s.writeDouble(m_npDensFactor);
	s.writeDouble(m_chargeMultiplier);
	s.writeDouble(m_timeFactor);

	// Densities
	s.writeDoubles(m_n);
	s.writeDoubles(m_p);
	s.writeDoubles(m_background);

	// Per-pixel parameters
	s.writeDoubles(m_generationRate);
	s.writeDoubles(m_recombinationFactor);
	s.writeFloats(m_De);
	s.writeFloats(m_Dh);
	s.writeFloats(m_eMob);
	s.writeFloats(m_hMob);

	// Finest level permittivity and the potential difference
	s.writeDoubles(m_epsRels[0]);
	s.writeDouble(m_deltaPhi);

	// Finest level potential and the number currents
	s.writeDoubles(m_Vbase[0]);
	s.writeDoubles(m_numCurTotEx);
	s.writeDoubles(m_numCurTotEy);
	s.writeDoubles(m_numCurTotHx);
	s.writeDoubles(m_numCurTotHy);

	s.writeDoubles(m_extraPotentialN);
	s.writeDoubles(m_extraPotentialP);

	int32_t initPotentialFlag = 0;
	if (m_initPotential)
		initPotentialFlag = 1;

	s.writeInt32(initPotentialFlag);

	// Trailer
	if (s.writeInt32(SIMULATION2DDOUBLEGPUREL_ENDID))
		return true;

	setErrorString(std::string("Error writing to stream: ") + s.getErrorString());
	return false;
}

// Loads the state of a simulation from 's', in the layout produced by write().
//
// The instance must not be initialized yet. The header values (dimensions and
// number of multigrid steps) are validated before any member is modified, so
// that a corrupt stream cannot cause an undefined bit shift, an empty
// multigrid array or an overflowing pixel count.
//
// The results of the individual array reads are not inspected; the read of
// the end identifier is checked afterwards.
//
// Returns false if the instance is already initialized, if the stream does
// not contain valid data, if the GPU could not be initialized or if the
// arrays could not be allocated. In the last two cases any error description
// is left to initGPU()/resizeArrays().
bool Simulation2DDoubleGPURel::read(SerializationInterface &s)
{
	if (m_init)
	{
		setErrorString("The simulation has already been initialized, clear it before loading other data in it");
		return false;
	}

	int32_t tmp;

	if (!s.readInt32(&tmp))
	{
		setErrorString(std::string("Error reading data file start identifier: ") + s.getErrorString());
		return false;
	}
	if (tmp != SIMULATION2DDOUBLEGPUREL_STARTID)
	{
		setErrorString("Read an invalid data file start identifier");
		return false;
	}
	if (!s.readInt32(&tmp))
	{
		setErrorString("Unable to read file format version: " + s.getErrorString());
		return false;
	}
	if (tmp != SIMULATION2DDOUBLEGPUREL_VERSION)
	{
		setErrorString("Incompatible file format version");
		return false;
	}

	int32_t w, h, steps;
	double pixelWidth, pixelHeight;
	double npDens, chargeMult, timeFact;

	if (!(s.readInt32(&w) && s.readInt32(&h) && s.readInt32(&steps) && s.readDouble(&pixelWidth) && s.readDouble(&pixelHeight)
	    && s.readDouble(&npDens) && s.readDouble(&chargeMult) && s.readDouble(&timeFact) ) )
	{
		setErrorString("Error reading grid dimensions and base units");
		return false;
	}

	// Validate everything that came from the stream before using it.
	// The bounds on 'steps' keep the bit shift below well defined.

	if (steps < 1 || steps > 30)
	{
		setErrorString("Read an invalid number of multigrid steps");
		return false;
	}

	int multiplier = 1 << (steps-1);
	int lowestWidth = w/multiplier;
	int lowestHeight = h/multiplier;

	if (!(lowestWidth*multiplier == w && lowestHeight*multiplier == h))
	{
		setErrorString("Detected inconsistent dimension settings");
		return false;
	}

	// Same limits as the ones enforced by init()
	if (lowestWidth < 2 || lowestHeight < 3)
	{
		setErrorString("Read invalid grid dimensions");
		return false;
	}

	// w and h are known to be positive here; avoid overflow in w*h
	if (w > 0x7fffffff/h)
	{
		setErrorString("Read grid dimensions that are too large");
		return false;
	}

	m_width = w;
	m_height = h;
	m_multiScaleSteps = steps;

	m_pixelWidth = pixelWidth;
	m_pixelHeight = pixelHeight;
	m_pixelFrac = m_pixelWidth/m_pixelHeight;
	m_pixelFracInv = m_pixelHeight/m_pixelWidth;

	m_npDensFactor = npDens;
	m_chargeMultiplier = chargeMult;
	m_chargeMultiplierFloat = (float)m_chargeMultiplier;
	m_timeFactor = timeFact;

	m_totalPixels = m_width*m_height;

	if (!initGPU())
		return false;

	// This also allocates the OpenCL buffers, which can fail
	if (!resizeArrays())
		return false;

	s.readDoubles(m_n);
	s.readDoubles(m_p);
	s.readDoubles(m_background);

	s.readDoubles(m_generationRate);
	s.readDoubles(m_recombinationFactor);

	// The single precision copy is not stored in the file
	for (size_t i = 0 ; i < m_recombinationFactorFloat.size() ; i++)
		m_recombinationFactorFloat[i] = (float)m_recombinationFactor[i];

	s.readFloats(m_De);
	s.readFloats(m_Dh);
	s.readFloats(m_eMob);
	s.readFloats(m_hMob);

	s.readDoubles(m_epsRels[0]);
	s.readDouble(&m_deltaPhi);

	s.readDoubles(m_Vbase[0]);
	s.readDoubles(m_numCurTotEx);
	s.readDoubles(m_numCurTotEy);
	s.readDoubles(m_numCurTotHx);
	s.readDoubles(m_numCurTotHy);

	s.readDoubles(m_extraPotentialN);
	s.readDoubles(m_extraPotentialP);

	int32_t flag = 0;

	s.readInt32(&flag);
	if (flag == 1)
		m_initPotential = true;
	else
		m_initPotential = false;

	if (!s.readInt32(&tmp))
	{
		setErrorString(std::string("Error reading data file end identifier: ") + s.getErrorString());
		return false;
	}

	if (tmp != SIMULATION2DDOUBLEGPUREL_ENDID)
	{
		setErrorString("Read an invalid data file end identifier");
		return false;
	}

	m_init = true;
	m_epsChanged = true;
	m_extraPotentialChanged = true;

	m_phiBackupBetterCounter = 0;

	return true;
}

bool Simulation2DDoubleGPURel::save(const std::string &fileName) const
{
	if (!m_init)
	{
		setErrorString("Can't save an uninialized simulation");
		return false;
	}

	FileSerializer fSer;

	if (!fSer.open(fileName, FileSerializer::WriteOnly))
	{
		setErrorString(std::string("Couldn't save to file ") + fileName + std::string(": ") + fSer.getErrorString());
		return false;
	}

	return write(fSer);
}

bool Simulation2DDoubleGPURel::load(const std::string &fileName)
{
	if (m_init)
	{
		setErrorString("The simulation has already been initialized, clear it before loading other data in it");
		return false;
	}

	FileSerializer fSer;

	if (!fSer.open(fileName, FileSerializer::ReadOnly))
	{
		setErrorString(std::string("Couldn't load file ") + fileName + std::string(": ") + fSer.getErrorString());
		return false;
	}

	return read(fSer);
}

bool Simulation2DDoubleGPURel::start(int steps, double dt, bool inverseMatrixSolver)
{
	if (inverseMatrixSolver)
	{
		setErrorString("Inverse matrix Poisson solver is not available for this type of simulation");
		return false;
	}

	GPURelNoCheck extraCheck;

	return commonStart<GPURelNoCheck>(extraCheck, steps, dt);
}

bool Simulation2DDoubleGPURel::start(int seconds, double dt, int &steps, bool inverseMatrixSolver)
{
	if (inverseMatrixSolver)
	{
		setErrorString("Inverse matrix Poisson solver is not available for this type of simulation");
		return false;
	}

	GPURelTimeCheck extraCheck(seconds);

	return commonStart<GPURelTimeCheck>(extraCheck, steps, dt);
}

// Helper function to calculate the current in the X-direction (hole number
// current minus electron number current), averaged over the rows
// 1 .. m_height-2:
//   leftAvg    : average over column 0
//   rightAvg   : average over column m_width-1
//   overallAvg : average over all columns
//   center     : average over column m_width/2
// Each average is multiplied by m_npDensFactor*m_pixelWidth/m_timeFactor.
void Simulation2DDoubleGPURel::calculateXCurrent(double &leftAvg, double &rightAvg, double &overallAvg, double &center) const
{
	const int lastColumn = m_width-1;
	const int middleColumn = m_width/2;

	double totalAll = 0;
	double totalLeft = 0;
	double totalRight = 0;
	double totalMiddle = 0;

	for (int y = 1 ; y < m_height-1 ; y++)
	{
		const int rowStart = y*m_width;

		for (int x = 0 ; x < m_width ; x++)
			totalAll += -m_numCurTotEx[rowStart+x] + m_numCurTotHx[rowStart+x];

		totalLeft += -m_numCurTotEx[rowStart] + m_numCurTotHx[rowStart];
		totalRight += -m_numCurTotEx[rowStart+lastColumn] + m_numCurTotHx[rowStart+lastColumn];
		totalMiddle += -m_numCurTotEx[rowStart+middleColumn] + m_numCurTotHx[rowStart+middleColumn];
	}

	const int numRows = m_height-2;
	const double unitFactor = m_npDensFactor*m_pixelWidth/m_timeFactor;

	overallAvg = (totalAll/(m_width*numRows))*unitFactor;
	leftAvg = (totalLeft/numRows)*unitFactor;
	rightAvg = (totalRight/numRows)*unitFactor;
	center = (totalMiddle/numRows)*unitFactor;
}

// Helper function to calculate the current in the Y-direction (hole number
// current minus electron number current), averaged over all columns:
//   bottomAvg  : average over row 0
//   topAvg     : average over row m_height-2
//   overallAvg : average over the rows 0 .. m_height-2
//   center     : average over row m_height/2
// Each average is multiplied by m_npDensFactor*m_pixelWidth/m_timeFactor.
void Simulation2DDoubleGPURel::calculateYCurrent(double &bottomAvg, double &topAvg, double &overallAvg, double &center) const
{
	const int topRowStart = (m_height-2)*m_width;
	const int middleRowStart = (m_height/2)*m_width;

	double totalAll = 0;
	double totalBottom = 0;
	double totalTop = 0;
	double totalMiddle = 0;

	for (int x = 0 ; x < m_width ; x++)
	{
		for (int y = 0 ; y < m_height-1 ; y++)
		{
			const int pos = x+y*m_width;

			totalAll += -m_numCurTotEy[pos] + m_numCurTotHy[pos];
		}

		totalBottom += -m_numCurTotEy[x] + m_numCurTotHy[x];
		totalTop += -m_numCurTotEy[x+topRowStart] + m_numCurTotHy[x+topRowStart];
		totalMiddle += -m_numCurTotEy[x+middleRowStart] + m_numCurTotHy[x+middleRowStart];
	}

	const double unitFactor = m_npDensFactor*m_pixelWidth/m_timeFactor;

	overallAvg = (totalAll/(m_width*(m_height-1)))*unitFactor;
	bottomAvg = (totalBottom/m_width)*unitFactor;
	topAvg = (totalTop/m_width)*unitFactor;
	center = (totalMiddle/m_width)*unitFactor;
}

// Brings the data the potential solver depends on up to date. Each of the
// three parts below only runs when its 'changed' flag is set, and clears it:
//  1. m_initPotential         : fill the finest level base potential with a
//                               linear ramp from 0 (row 0) to m_deltaPhi
//                               (last row);
//  2. m_epsChanged            : rebuild the coarser permittivity grids and
//                               the solver coefficients of every level;
//  3. m_extraPotentialChanged : rebuild the fields of the extra potentials.
// Parts 1 and 2 also reset m_phiBackupBetterCounter.
void Simulation2DDoubleGPURel::initializePotentialFinder()
{
	// 1. Linear initial potential

	if (m_initPotential)
	{
		for (int y = 0 ; y < m_height ; y++)
		{
			const double rowPotential = ((double)y/(double)(m_height-1))*m_deltaPhi;
			const int rowStart = y*m_width;

			for (int x = 0 ; x < m_width ; x++)
				m_Vbase[0][rowStart+x] = rowPotential;
		}

		m_phiBackupBetterCounter = 0;
		m_initPotential = false;
	}

	// 2. Permittivity at every level and the coefficients derived from it

	if (m_epsChanged)
	{
		int levelW = m_width;
		int levelH = m_height;

		// Each coarser pixel is the average of a 2x2 block of the finer level
		for (int level = 0 ; level < m_multiScaleSteps-1 ; level++)
		{
			const int coarseW = levelW/2;

			for (int y = 0 ; y < levelH ; y += 2)
			{
				for (int x = 0 ; x < levelW ; x += 2)
				{
					double average = m_epsRels[level][x+y*levelW] + m_epsRels[level][x+1+y*levelW] + m_epsRels[level][x+(y+1)*levelW] + m_epsRels[level][x+1+(y+1)*levelW];

					average /= 4.0;

					m_epsRels[level+1][x/2+(y/2)*coarseW] = average;
				}
			}

			levelW /= 2;
			levelH /= 2;
		}

		const double fracSquared = m_pixelFrac*m_pixelFrac;

		levelW = m_width;
		levelH = m_height;

		for (int level = 0 ; level < m_multiScaleSteps ; level++)
		{
			for (int y = 0 ; y < levelH ; y++)
			{
				// Neighbour rows, using periodic boundary conditions
				const int rowStart = y*levelW;
				const int rowAbove = ((y+1)%levelH)*levelW;
				const int rowBelow = ((y-1+levelH)%levelH)*levelW;

				for (int x = 0 ; x < levelW ; x++)
				{
					// Neighbour columns, using periodic boundary conditions
					const int here = x+rowStart;
					const int right = (x+1)%levelW+rowStart;
					const int left = (x-1+levelW)%levelW+rowStart;
					const int up = x+rowAbove;
					const int down = x+rowBelow;

					// a1..a4: mean permittivity towards the right, left, upper
					// and lower neighbour; a0: their sum
					m_a1s[level][here] = 0.5*(m_epsRels[level][here]+m_epsRels[level][right]);
					m_a2s[level][here] = 0.5*(m_epsRels[level][here]+m_epsRels[level][left]);
					m_a3s[level][here] = 0.5*(m_epsRels[level][here]+m_epsRels[level][up])*fracSquared;
					m_a4s[level][here] = 0.5*(m_epsRels[level][here]+m_epsRels[level][down])*fracSquared;
					m_a0s[level][here] = m_a1s[level][here] + m_a2s[level][here] + m_a3s[level][here] + m_a4s[level][here];

					// Single precision copies
					m_a1sFloat[level][here] = (float)m_a1s[level][here];
					m_a2sFloat[level][here] = (float)m_a2s[level][here];
					m_a3sFloat[level][here] = (float)m_a3s[level][here];
					m_a4sFloat[level][here] = (float)m_a4s[level][here];
					m_a0sFloat[level][here] = (float)m_a0s[level][here];
				}
			}

			levelW /= 2;
			levelH /= 2;
		}

		m_phiBackupBetterCounter = 0;
		m_epsChanged = false;
	}

	// 3. Fields corresponding to the extra potentials

	if (m_extraPotentialChanged)
	{
		// X direction: periodic, so the last column uses column 0 as neighbour
		for (int y = 0 ; y < m_height ; y++)
		{
			for (int x = 0 ; x < m_width ; x++)
			{
				const int here = getIndex(x, y);
				const int right = getIndex((x+1)%m_width, y);

				m_extraElecticFieldNx[here] = -(m_extraPotentialN[right] - m_extraPotentialN[here]);
				m_extraElecticFieldPx[here] = -(m_extraPotentialP[right] - m_extraPotentialP[here]);
			}
		}

		// Y direction: the last row has no upper neighbour and is skipped
		for (int x = 0 ; x < m_width ; x++)
		{
			for (int y = 0 ; y < m_height-1 ; y++)
			{
				const int here = getIndex(x, y);
				const int above = getIndex(x, y+1);

				m_extraElecticFieldNy[here] = -(m_extraPotentialN[above]-m_extraPotentialN[here])*m_pixelFrac;
				m_extraElecticFieldPy[here] = -(m_extraPotentialP[above]-m_extraPotentialP[here])*m_pixelFrac;
			}
		}

		m_extraPotentialChanged = false;
	}
}

// Updates m_potential[0] from the current values in m_pRel and m_nRel.
//
// Two approaches are available:
//  - a multigrid approach: the charge is summed onto every coarser level, the
//    coarsest level is relaxed with blackRed(), and the result is repeatedly
//    interpolated to the next finer level and relaxed again;
//  - a 'backup' approach: a few blackRed() iterations directly on the finest
//    level, continuing from the previous result in m_backupPotential.
//
// While m_phiBackupBetterCounter is below m_phiMethodThreshold, and also each
// time it is a multiple of m_phiMethodAdditionalCheckCount, both approaches
// are run and the one with the lowest error is kept (copied over the other).
// The counter is incremented each time the backup result is used and reset
// when the multigrid result wins. Otherwise only the backup approach is run.
void Simulation2DDoubleGPURel::potentialFinder()
{
	if (m_phiBackupBetterCounter < m_phiMethodThreshold || m_phiBackupBetterCounter%m_phiMethodAdditionalCheckCount == 0)
	{
		// First create scaled grids of the charge

		//std::cout << m_phiBackupBetterCounter << " Trying both " << std::endl;

		int w = m_width;
		int h = m_height;

		// Finest level: charge from the relative hole and electron densities
		for (int x = 0 ; x < w ; x++)
		{
			for (int y = 0 ; y < h ; y++)
			{
				int idx = x+y*w;

				m_chargeSums[0][idx] = (m_pRel[idx]-m_nRel[idx])*m_chargeMultiplierFloat;
			}
		}

		// Coarser levels: each pixel holds the sum (not the average) of the
		// corresponding 2x2 block of the finer level
		for (int i = 0 ; i < m_multiScaleSteps-1 ; i++)
		{
			int w2 = w/2;

			for (int x = 0 ; x < w ; x += 2)
			{
				for (int y = 0 ; y < h ; y += 2)
				{
					float chargeSum = m_chargeSums[i][x+y*w] + m_chargeSums[i][x+1+y*w] + m_chargeSums[i][x+(y+1)*w] + m_chargeSums[i][x+1+(y+1)*w];

					m_chargeSums[i+1][x/2+(y/2)*w2] = chargeSum; // sum it
				}
			}

			w /= 2;
			h /= 2;
		}

		// Width and height should now contain the dimensions of the coarsest grid
		
		// Calculate the potential for the lowest resolution grid
		float error, error2;
		
		error = blackRed(&(m_potential[m_multiScaleSteps-1][0]), m_multiScaleSteps-1, w, h, 100, 1.8f);

		// Work back up from the coarsest to the finest level
		for (int i = m_multiScaleSteps-2 ; i >= 0 ; i--)
		{
			int w2 = w;
			int h2 = h;

			w *= 2;
			h *= 2;

			// Interpolate the result for a first estimate of the higher resolution result

			for (int y = 1 ; y < h-1 ; y++) // leave the boundaries alone
			{
				// Position of this row in the coarser grid (w2 x h2)
				float yFrac = (float)y/(float)(h-1);
				float y1d = yFrac*(h2-1);
				int y1 = (int)y1d;
				int y2 = y1+1;

				// Clamp at the last row
				if (y2 == h2)
					y2 = h2-1;

				float t = y1d-(float)y1;

				for (int x = 0 ; x < w ; x++)
				{
					float xFrac = (float)x/(float)(w-1);
					float x1d = xFrac*(w2-1);
					int x1 = (int)x1d;
					int x2 = (x1+1)%w2; // wraps around in the X direction

					float s = x1d-(float)x1;

					float v1 = m_potential[i+1][x1+y1*w2];
					float v2 = m_potential[i+1][x2+y1*w2];
					float v3 = m_potential[i+1][x1+y2*w2];
					float v4 = m_potential[i+1][x2+y2*w2];

					// Bilinear interpolation
					m_potential[i][x+y*w] = v1*(1.0f-t)*(1.0f-s) + v2*(1.0f-t)*s + v3*t*(1.0f-s) + v4*t*s;
				}
			}

			// Optimize at new scale

			error = blackRed(&(m_potential[i][0]), i, w, h, 10, 1.8f);
		}

		// We shall also continuously update another potential field using another
		// method

		error2 = blackRed(&(m_backupPotential[0]), 0, m_width, m_height, 4, 1.0f);

		//std::cerr << "error = " << error << " error2 = " << error2 << std::endl;

		// We'll use the best result of the two approaches
		if (error2 >= error)
		{
			// Multigrid result is at least as good: it also becomes the new
			// starting point of the backup approach
			m_phiBackupBetterCounter = 0;
			memcpy(&(m_backupPotential[0]),&(m_potential[0][0]), sizeof(float)*m_totalPixels);
		}
		else
		{
			m_phiBackupBetterCounter++;
			memcpy(&(m_potential[0][0]), &(m_backupPotential[0]), sizeof(float)*m_totalPixels);
		}
	}
	else
	{
		// First grid of total charge

		int w = m_width;
		int h = m_height;

		for (int x = 0 ; x < w ; x++)
		{
			for (int y = 0 ; y < h ; y++)
			{
				int idx = x+y*w;

				m_chargeSums[0][idx] = (m_pRel[idx]-m_nRel[idx])*m_chargeMultiplierFloat;
			}
		}

		// Only the backup approach: a few iterations on the finest level
		blackRed(&(m_backupPotential[0]), 0, m_width, m_height, 4, 1.0f);
		m_phiBackupBetterCounter++;
		memcpy(&(m_potential[0][0]), &(m_backupPotential[0]), sizeof(float)*m_totalPixels);
	}
}

// Performs 'steps' in-place relaxation sweeps on the potential grid pSrc
// (width x height pixels), using the coefficients, charge sums and base
// residual of multigrid level 'aIndex'.
//
// Each sweep consists of two passes, each of which updates every other pixel
// in a checkerboard pattern. The first and last row are never modified; in
// the X direction the grid wraps around. 'w' is the relaxation factor by
// which each correction is multiplied.
//
// Returns sqrt(S/(width*height)), where S is the sum of the squared
// (unrelaxed) corrections of the last sweep. If steps is zero no sweep is
// done and S is -1.
float Simulation2DDoubleGPURel::blackRed(float *pSrc, int aIndex, int width, int height, int steps, float w)
{
	float squaredSum = -1;

	for (int sweep = 0 ; sweep < steps ; sweep++)
	{
		squaredSum = 0;

		// Alternating updates in the same array seems to be much more stable!

		for (int pass = 0 ; pass < 2 ; pass++)
		{
			for (int y = 1 ; y < height-1 ; y++)
			{
				const int rowStart = y*width;
				const int rowAbove = (y+1)*width;
				const int rowBelow = (y-1)*width;

				for (int x = (y+pass)%2 ; x < width ; x += 2)
				{
					const int here = x + rowStart;
					const int right = (x+1)%width + rowStart;
					const int left = (x-1+width)%width + rowStart;
					const int up = x + rowAbove;
					const int down = x + rowBelow;

					float prediction = (m_a1sFloat[aIndex][here]*pSrc[right] + m_a2sFloat[aIndex][here]*pSrc[left]
							  +  m_a3sFloat[aIndex][here]*pSrc[up]  + m_a4sFloat[aIndex][here]*pSrc[down]
							  +  m_chargeSums[aIndex][here])/m_a0sFloat[aIndex][here];

					float correction = m_VpredSub[aIndex][here] + prediction - pSrc[here];

					pSrc[here] += w*correction;
					squaredSum += correction*correction;
				}
			}
		}
	}

	return std::sqrt(squaredSum/(float)(width*height));
}

bool Simulation2DDoubleGPURel::resizeArrays()
{
	m_n.resize(m_totalPixels);
	m_p.resize(m_totalPixels);
	m_nFloat.resize(m_totalPixels);
	m_pFloat.resize(m_totalPixels);
	m_background.resize(m_totalPixels);
	m_generationRate.resize(m_totalPixels);
	m_recombinationFactor.resize(m_totalPixels);
	m_recombinationFactorFloat.resize(m_totalPixels);
	m_De.resize(m_totalPixels);
	m_Dh.resize(m_totalPixels);
	m_eMob.resize(m_totalPixels);
	m_hMob.resize(m_totalPixels);

	int tmpSize = m_totalPixels;

	// More arrays need to be allocated for the multigrid method

	m_epsRels.resize(m_multiScaleSteps);
	m_chargeSums.resize(m_multiScaleSteps);
	m_potential.resize(m_multiScaleSteps);
	m_a0s.resize(m_multiScaleSteps);
	m_a1s.resize(m_multiScaleSteps);
	m_a2s.resize(m_multiScaleSteps);
	m_a3s.resize(m_multiScaleSteps);
	m_a4s.resize(m_multiScaleSteps);
	m_a0sFloat.resize(m_multiScaleSteps);
	m_a1sFloat.resize(m_multiScaleSteps);
	m_a2sFloat.resize(m_multiScaleSteps);
	m_a3sFloat.resize(m_multiScaleSteps);
	m_a4sFloat.resize(m_multiScaleSteps);
	m_Vbase.resize(m_multiScaleSteps);
	m_VpredSub.resize(m_multiScaleSteps);
	m_chargeSumBase.resize(m_multiScaleSteps);

	m_backupPotential.resize(m_totalPixels);

	for (int i = 0 ; i < m_multiScaleSteps ; i++)
	{
		m_epsRels[i].resize(tmpSize);
		m_chargeSums[i].resize(tmpSize);
		m_potential[i].resize(tmpSize);
		m_a0s[i].resize(tmpSize);
		m_a1s[i].resize(tmpSize);
		m_a2s[i].resize(tmpSize);
		m_a3s[i].resize(tmpSize);
		m_a4s[i].resize(tmpSize);
		m_a0sFloat[i].resize(tmpSize);
		m_a1sFloat[i].resize(tmpSize);
		m_a2sFloat[i].resize(tmpSize);
		m_a3sFloat[i].resize(tmpSize);
		m_a4sFloat[i].resize(tmpSize);
		m_Vbase[i].resize(tmpSize);
		m_VpredSub[i].resize(tmpSize);
		m_chargeSumBase[i].resize(tmpSize);
		tmpSize /= 4;
	}

	m_electicFieldx.resize(m_totalPixels);
	m_electicFieldy.resize(m_totalPixels);

	// Initialize the values of some arrays

	setValues(m_n, 0);
	setValues(m_p, 0);
	setValues(m_background, 0);
	setValues(m_generationRate, 0);
	setValues(m_recombinationFactor, 0);
	setValues(m_recombinationFactorFloat, 0);
	setValues(m_De, 0);
	setValues(m_Dh, 0);
	setValues(m_eMob, 0);
	setValues(m_hMob, 0);
	setValues(m_epsRels[0], 1);
	setValues(m_background, 0);
	setValues(m_Vbase[0], 0);

	// Allocate memory for the arrays which will contain the number currents
	// due to diffusion and electric field respectively

	m_numCurTotEx.resize(m_totalPixels);
	m_numCurTotEy.resize(m_totalPixels);
	m_numCurTotHx.resize(m_totalPixels);
	m_numCurTotHy.resize(m_totalPixels);
	m_numCurTotExRel.resize(m_totalPixels);
	m_numCurTotEyRel.resize(m_totalPixels);
	m_numCurTotHxRel.resize(m_totalPixels);
	m_numCurTotHyRel.resize(m_totalPixels);

	m_dndt.resize(m_totalPixels);
	m_dpdt.resize(m_totalPixels);

	m_nRel.resize(m_totalPixels);
	m_pRel.resize(m_totalPixels);
	m_GRminJDivE.resize(m_totalPixels);
	m_GRminJDivH.resize(m_totalPixels);
	m_rp.resize(m_totalPixels);
	m_rn.resize(m_totalPixels);
	
	m_fieldBaseXN.resize(m_totalPixels);
	m_fieldBaseYN.resize(m_totalPixels);
	m_fieldBaseXP.resize(m_totalPixels);
	m_fieldBaseYP.resize(m_totalPixels);

	m_extraPotentialN.resize(m_totalPixels);
	m_extraPotentialP.resize(m_totalPixels);
	m_extraElecticFieldNx.resize(m_totalPixels);
	m_extraElecticFieldNy.resize(m_totalPixels);
	m_extraElecticFieldPx.resize(m_totalPixels);
	m_extraElecticFieldPy.resize(m_totalPixels);

	setValues(m_extraPotentialN, 0);
	setValues(m_extraPotentialP, 0);

	clearCLMemory();

	if (!allocateCLMemory())
		return false;
	return true;
}

void Simulation2DDoubleGPURel::writePlotData(FILE *pFile)
{
	for (int y = 0 ; y < m_height ; y++)
	{
		for (int x = 0 ; x < m_width ; x++)
		{
			int idx = x + y*m_width;

			double j1 = 0;
			double j2 = 0;

			if (y == m_height-1)
			{
				j1 = m_numCurTotHx[idx-m_width]-m_numCurTotEx[idx-m_width];
				j2 = m_numCurTotHy[idx-m_width]-m_numCurTotEy[idx-m_width];
			}
			else
			{
				j1 = m_numCurTotHx[idx]-m_numCurTotEx[idx];
				j2 = m_numCurTotHy[idx]-m_numCurTotEy[idx];
			}

			fprintf(pFile, "%d %d %g %g %g %g %g %g  %g %g %g %g %g %g\n", x, y, 
					         (double)m_npDensFactor*(double)m_n[idx], 
						 (double)m_npDensFactor*(double)m_p[idx], 
						 (double)m_Vbase[0][idx], 
					         (double)(1.0/(m_npDensFactor*m_timeFactor))*(double)m_recombinationFactor[idx]*(double)m_n[idx]*(double)m_p[idx]*m_npDensFactor*m_npDensFactor,
						 (double)j1*(double)(m_npDensFactor*m_pixelWidth/m_timeFactor), 
						 (double)j2*(double)(m_npDensFactor*m_pixelWidth/m_timeFactor),
						 (double)m_generationRate[idx]*(double)(m_npDensFactor/m_timeFactor), 
						 (double)m_recombinationFactor[idx]*(double)(1.0/(m_npDensFactor*m_timeFactor)),
						 (double)m_De[idx]*(double)(m_pixelWidth*m_pixelWidth/m_timeFactor), 
						 (double)m_Dh[idx]*(double)(m_pixelWidth*m_pixelWidth/m_timeFactor), 
						 (double)m_eMob[idx]*(double)(m_pixelWidth*m_pixelWidth/m_timeFactor), 
						 (double)m_hMob[idx]*(double)(m_pixelWidth*m_pixelWidth/m_timeFactor));
		}

		fprintf(pFile, "\n");
	}
	fprintf(pFile, "\n");
}

void Simulation2DDoubleGPURel::prepareRelativeCalculation()
{
	memset(&(m_nRel[0]), 0, sizeof(float)*m_totalPixels);
	memset(&(m_pRel[0]), 0, sizeof(float)*m_totalPixels);

	for (int y = 0 ; y < m_height ; y++)
	{
		for (int x = 0 ; x < m_width ; x++)
		{
			int idx = x + y*m_width;

			m_pFloat[idx] = (float)m_p[idx];
			m_nFloat[idx] = (float)m_n[idx];
		}
	}

	int w = m_width;
	int h = m_height;

	// set boundary conditions (just to be safe)
	{
		int yOff = (h-1)*w;
		for (int x = 0 ; x < w ; x++)
		{
			m_Vbase[0][x] = 0;
			m_Vbase[0][x+yOff] = m_deltaPhi;
			m_potential[0][x] = 0;
			m_potential[0][x+yOff] = 0;
		}
	}

	for (int i = 1 ; i < m_multiScaleSteps ; i++) // TODO: improve this very simple interpolation?
	{
		int w2 = w/2;
		int h2 = h/2;

		for (int y = 0 ; y < h2 ; y++)
		{
			double ySrc = ((double)y/(double)(h2-1))*(double)(h-1);
			int y0 = (int)ySrc;
			int y1 = y0+1;
			if (y0 == h-1)
				y1 = h-1;

			double yFrac = ySrc - (double)y0;

			for (int x = 0 ; x < w2 ; x++)
			{
				double xSrc = ((double)x/(double)(w2-1))*(double)(w-1);
				int x0 = (int)xSrc;
				int x1 = (x0+1)%w;

				double xFrac = xSrc - (double)x0;
				double v00 = m_Vbase[i-1][x0+y0*w];
				double v01 = m_Vbase[i-1][x0+y1*w];
				double v10 = m_Vbase[i-1][x1+y0*w];
				double v11 = m_Vbase[i-1][x1+y1*w];

				double val = (1.0-xFrac)*(1.0-yFrac)*v00
					   + xFrac*(1.0-yFrac)*v10
					   + (1.0-xFrac)*yFrac*v01
					   + xFrac*yFrac*v11;

				m_Vbase[i][x+y*w2] = val;
			}
		}
		w = w2;
		h = h2;
	
		// set boundary conditions (just to be safe)
		{
			int yOff = (h-1)*w;
			for (int x = 0 ; x < w ; x++)
			{
				m_Vbase[i][x] = 0;
				m_Vbase[i][x+yOff] = m_deltaPhi;
				m_potential[i][x] = 0;
				m_potential[i][x+yOff] = 0;
			}
		}
	}

	// base charge sums

	w = m_width;
	h = m_height;

	for (int x = 0 ; x < w ; x++)
	{
		for (int y = 0 ; y < h ; y++)
		{
			int idx = x+y*w;

			m_chargeSumBase[0][idx] = (m_p[idx]-m_n[idx]+m_background[idx])*m_chargeMultiplier;
		}
	}

	for (int i = 0 ; i < m_multiScaleSteps-1 ; i++)
	{
		int w2 = w/2;

		for (int x = 0 ; x < w ; x += 2)
		{
			for (int y = 0 ; y < h ; y += 2)
			{
				double chargeSum = m_chargeSumBase[i][x+y*w] + m_chargeSumBase[i][x+1+y*w] + m_chargeSumBase[i][x+(y+1)*w] + m_chargeSumBase[i][x+1+(y+1)*w];

				m_chargeSumBase[i+1][x/2+(y/2)*w2] = chargeSum; // sum it
			}
		}

		w /= 2;
		h /= 2;
	}

	// Calculations that can already be done 

	w = m_width;
	h = m_height;

	for (int i = 0 ; i < m_multiScaleSteps ; i++)
	{
		for (int y = 1 ; y < h-1 ; y++)
		{
			for (int x = 0 ; x < w ; x++)
			{
				int xNext = (x+1)%w;
				int xPrev = (x-1+w)%w;
				int yNext = y+1;
				int yPrev = y-1;

				int index = x + y*w;
				int leftIndex = xPrev + y*w;
				int rightIndex = xNext + y*w;
				int upIndex = x + yNext*w;
				int downIndex = x+ yPrev*w;

				double D = (m_a1s[i][index]*m_Vbase[i][leftIndex] + m_a2s[i][index]*m_Vbase[i][rightIndex]
					   + m_a3s[i][index]*m_Vbase[i][upIndex] + m_a4s[i][index]*m_Vbase[i][downIndex] + m_chargeSumBase[i][index] )/m_a0s[i][index]
					 - m_Vbase[i][index];

				m_VpredSub[i][index] = (float)D;
			}
		}

		w /= 2;
		h /= 2;
	}

	int multiplier = 1 << (m_multiScaleSteps-1);
	int lowestWidth = m_width/multiplier;
	int lowestHeight = m_height/multiplier;

	// m_Vbase[0] already contains the new estimate of the next potential
	memset(&(m_backupPotential[0]), 0, sizeof(float)*m_totalPixels);
	memset(&(m_potential[0][0]), 0, sizeof(float)*m_totalPixels);
	memset(&(m_potential[m_multiScaleSteps-1][0]), 0, sizeof(float)*lowestHeight*lowestWidth);

	// Calculate base field corresponding to base potential

	for (int y = 0 ; y < m_height ; y++)
	{
		for (int x = 0 ; x < m_width ; x++)
		{
			int idx = getIndex(x, y);
			int nextX = (x+1)%m_width;
			int nextIdx = getIndex(nextX, y);

			double diff = -(m_Vbase[0][nextIdx] - m_Vbase[0][idx]);

			m_fieldBaseXN[idx] = (float)(diff + m_extraElecticFieldNx[idx]);
			m_fieldBaseXP[idx] = (float)(diff + m_extraElecticFieldPx[idx]);
		}
	}

	for (int x = 0 ; x < m_width ; x++)
	{
		for (int y = 0 ; y < m_height-1 ; y++)
		{	
			int idx = getIndex(x, y);
			int nextIdx = getIndex(x, y+1);

			double diff = -(m_Vbase[0][nextIdx]-m_Vbase[0][idx])*m_pixelFrac;

			m_fieldBaseYN[idx] = (float)(diff + m_extraElecticFieldNy[idx]);
			m_fieldBaseYP[idx] = (float)(diff + m_extraElecticFieldPy[idx]);
		}

		// NOTE: the last row is not needed (and cannot be calculated by the way)
	}

	for (int y = 0 ; y < m_height ; y++)
	{
		for (int x = 0 ; x < m_width ; x++)
		{
			int idx = getIndex(x, y);
			int nextX = (x+1)%m_width; // for periodic boundary conditions
			int nextIdx = getIndex(nextX, y);

			double De = (m_De[idx]+m_De[nextIdx])/2.0f;
			double Dh = (m_Dh[idx]+m_Dh[nextIdx])/2.0f;
			double eMob = (m_eMob[idx]+m_eMob[nextIdx])/2.0f;
			double hMob = (m_hMob[idx]+m_hMob[nextIdx])/2.0f;

			double ve = -eMob*m_fieldBaseXN[idx];
			double vh = +hMob*m_fieldBaseXP[idx];

			m_numCurTotEx[idx] = getSGCurrent(ve, De, 1.0, m_n[idx], m_n[nextIdx]);
			m_numCurTotHx[idx] = getSGCurrent(vh, Dh, 1.0, m_p[idx], m_p[nextIdx]);
		}
	}

	for (int x = 0 ; x < m_width ; x++)
	{
		for (int y = 0 ; y < m_height-1 ; y++)
		{	
			int idx = getIndex(x, y);
			int nextIdx = getIndex(x, y+1);

			double De = (m_De[idx]+m_De[nextIdx])/2.0f;
			double Dh = (m_Dh[idx]+m_Dh[nextIdx])/2.0f;
			double eMob = (m_eMob[idx]+m_eMob[nextIdx])/2.0f;
			double hMob = (m_hMob[idx]+m_hMob[nextIdx])/2.0f;

			double ve = -eMob*m_fieldBaseYN[idx];
			double vh = +hMob*m_fieldBaseYP[idx];

			m_numCurTotEy[idx] = getSGCurrent(ve, De, m_pixelFracInv, m_n[idx], m_n[nextIdx]);
			m_numCurTotHy[idx] = getSGCurrent(vh, Dh, m_pixelFracInv, m_p[idx], m_p[nextIdx]);
		}
	}

	for (int x = 0 ; x < m_width ; x++)
	{
		int prevX = (x-1+m_width)%m_width; // For periodic boundary conditions

		for (int y = 1 ; y < m_height-1 ; y++)
		{
			int prevY = y-1;
			int idx = getIndex(x, y);
			int leftIdx = getIndex(prevX, y);
			int belowIdx = getIndex(x, prevY);

			double JexCur = m_numCurTotEx[idx];
			double JexPrev = m_numCurTotEx[leftIdx];
			double JeyCur = m_numCurTotEy[idx];
			double JeyPrev = m_numCurTotEy[belowIdx];

			double JhxCur = m_numCurTotHx[idx];
			double JhxPrev = m_numCurTotHx[leftIdx];
			double JhyCur = m_numCurTotHy[idx];
			double JhyPrev = m_numCurTotHy[belowIdx];

			double Jexx = (JexCur - JexPrev);
			double Jeyy = (JeyCur - JeyPrev)*m_pixelFrac;
			double Jhxx = (JhxCur - JhxPrev);
			double Jhyy = (JhyCur - JhyPrev)*m_pixelFrac;

			double rf = m_recombinationFactor[idx];
			double p = m_p[idx];
			double n = m_n[idx];
			double g1 = m_generationRate[idx] - rf*p*n;

			m_GRminJDivE[idx] = (float)(g1 - (Jexx+Jeyy));
			m_GRminJDivH[idx] = (float)(g1 - (Jhxx+Jhyy));

			m_rn[idx] = (float)(rf*n);
			m_rp[idx] = (float)(rf*p);
		}
	}
}

// Adds the relative results of a run (m_nRel, m_pRel, m_potential[0] and the
// m_numCurTot*Rel arrays) onto the corresponding accumulated arrays, casting
// each relative value to double before adding.
void Simulation2DDoubleGPURel::mergeRelativeResults()
{
	// Densities and potential: only rows 1 .. m_height-2 are updated, the
	// first and last row are left untouched.
	for (int row = 1 ; row < m_height-1 ; row++)
	{
		const int rowStart = row*m_width;

		for (int col = 0 ; col < m_width ; col++)
		{
			const int pos = rowStart + col;

			m_n[pos] += static_cast<double>(m_nRel[pos]);
			m_p[pos] += static_cast<double>(m_pRel[pos]);
			m_Vbase[0][pos] += static_cast<double>(m_potential[0][pos]);
		}
	}

	// Currents: rows 0 .. m_height-2, i.e. every row except the last one.
	for (int row = 0 ; row < m_height-1 ; row++)
	{
		const int rowStart = row*m_width;

		for (int col = 0 ; col < m_width ; col++)
		{
			const int pos = rowStart + col;

			m_numCurTotEx[pos] += static_cast<double>(m_numCurTotExRel[pos]);
			m_numCurTotEy[pos] += static_cast<double>(m_numCurTotEyRel[pos]);
			m_numCurTotHx[pos] += static_cast<double>(m_numCurTotHxRel[pos]);
			m_numCurTotHy[pos] += static_cast<double>(m_numCurTotHyRel[pos]);
		}
	}
}

// Resets every OpenCL buffer handle of this object to null without releasing
// anything. clearCLMemory() treats a null handle as "not allocated".
void Simulation2DDoubleGPURel::zeroCLMemory()
{
	m_clPotential = m_clVSubPred = 0;
	m_clExBaseN = m_clEyBaseN = m_clExBaseP = m_clEyBaseP = 0;
	m_clDe = m_clDh = m_clEMob = m_clHMob = 0;
	m_clN = m_clP = m_clN2 = m_clP2 = m_clNBase = m_clPBase = 0;
	m_clECurX = m_clECurY = m_clHCurX = m_clHCurY = 0;
	m_clGRminJDivE = m_clGRminJDivH = m_clRN = m_clRP = m_clRecFactor = 0;
	m_clA0 = m_clA1 = m_clA2 = m_clA3 = m_clA4 = 0;
}

void Simulation2DDoubleGPURel::clearCLMemory()
{
	if (m_clPotential != 0)
		clReleaseMemObject(m_clPotential); 
	if (m_clVSubPred != 0)
		clReleaseMemObject(m_clVSubPred); 
	if (m_clExBaseP != 0)
		clReleaseMemObject(m_clExBaseP); 
	if (m_clEyBaseP != 0)
		clReleaseMemObject(m_clEyBaseP);
	if (m_clExBaseN != 0)
		clReleaseMemObject(m_clExBaseN); 
	if (m_clEyBaseN != 0)
		clReleaseMemObject(m_clEyBaseN);
	if (m_clDe != 0)
		clReleaseMemObject(m_clDe); 
	if (m_clDh != 0)
		clReleaseMemObject(m_clDh); 
	if (m_clEMob != 0)
		clReleaseMemObject(m_clEMob); 
	if (m_clHMob != 0)
		clReleaseMemObject(m_clHMob);
	if (m_clN != 0)
		clReleaseMemObject(m_clN); 
	if (m_clP != 0)
		clReleaseMemObject(m_clP); 
	if (m_clNBase != 0)
		clReleaseMemObject(m_clNBase); 
	if (m_clPBase != 0)
		clReleaseMemObject(m_clPBase);
	if (m_clECurX != 0)
		clReleaseMemObject(m_clECurX); 
	if (m_clECurY != 0)
		clReleaseMemObject(m_clECurY); 
	if (m_clHCurX != 0)
		clReleaseMemObject(m_clHCurX); 
	if (m_clHCurY != 0)
		clReleaseMemObject(m_clHCurY);
	if (m_clGRminJDivE != 0)
		clReleaseMemObject(m_clGRminJDivE); 
	if (m_clGRminJDivH != 0)
		clReleaseMemObject(m_clGRminJDivH); 
	if (m_clRN != 0)
		clReleaseMemObject(m_clRN); 
	if (m_clRP != 0)
		clReleaseMemObject(m_clRP); 
	if (m_clRecFactor != 0)
		clReleaseMemObject(m_clRecFactor); 
	if (m_clN2 != 0)
		clReleaseMemObject(m_clN2); 
	if (m_clP2 != 0)
		clReleaseMemObject(m_clP2); 
	if (m_clA0 != 0)
		clReleaseMemObject(m_clA0); 
	if (m_clA1 != 0)
		clReleaseMemObject(m_clA1); 
	if (m_clA2 != 0)
		clReleaseMemObject(m_clA2); 
	if (m_clA3 != 0)
		clReleaseMemObject(m_clA3); 
	if (m_clA4 != 0)
		clReleaseMemObject(m_clA4);
	
	zeroCLMemory();
}

// Checks the status code of an OpenCL buffer allocation. If it differs from
// CL_SUCCESS, an error message containing the numeric code is stored through
// setErrorString() and the enclosing function returns false.
// The macro expands to a braced block, so call sites need no trailing
// semicolon. The argument is parenthesized so that an expression can be
// passed safely.
#define CHECKCLALLOCATEERROR(error) \
{ \
	if ((error) != CL_SUCCESS) \
	{ \
		char str[1024]; \
 \
		sprintf(str, "Error allocating OpenCL memory (error code %d)", (int)(error)); \
		setErrorString(str); \
		return false; \
	} \
}

bool Simulation2DDoubleGPURel::allocateCLMemory()
{
	cl_int err = 0;
	cl_context ctx = m_gpu.getContext();
	cl_mem tmpBuffer = 0;
	size_t bufSize = sizeof(float)*m_totalPixels;

	m_clPotential = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bufSize, 0, &err); CHECKCLALLOCATEERROR(err)
	m_clVSubPred = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clExBaseN = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clEyBaseN = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err); CHECKCLALLOCATEERROR(err)
	m_clExBaseP = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clEyBaseP = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err); CHECKCLALLOCATEERROR(err)
	m_clDe = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clDh = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clEMob = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clHMob = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err); CHECKCLALLOCATEERROR(err)
	m_clN = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clP = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clNBase = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clPBase = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err); CHECKCLALLOCATEERROR(err)
	m_clECurX = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clECurY = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clHCurX = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clHCurY = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, bufSize, 0, &err); CHECKCLALLOCATEERROR(err)
	m_clGRminJDivE = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clGRminJDivH = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clRN = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clRP = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clRecFactor = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clN2 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clP2 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clA0 = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clA1 = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clA2 = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clA3 = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err);  CHECKCLALLOCATEERROR(err)
	m_clA4 = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bufSize, 0, &err); CHECKCLALLOCATEERROR(err)

	return true;
}

// Error checking helpers for the OpenCL calls below.
//
// Each macro compares the given status code with CL_SUCCESS. On a mismatch it
// formats a message containing the numeric code into a caller-provided buffer
// named 'str' (a char array that must be in scope at the point of use), stores
// it through setErrorString() and makes the enclosing function return false.
// The macros expand to a braced block, so call sites need no trailing
// semicolon. Arguments are parenthesized so that expressions can be passed.

// Status check for clSetKernelArg.
#define CHECKKERNELARGERROR(error) \
{ \
	if ((error) != CL_SUCCESS) \
	{ \
		sprintf(str, "Error setting OpenCL kernel argument (error code %d)", (int)(error)); \
		setErrorString(str); \
		return false; \
	} \
}

// Status check for clEnqueueWriteBuffer.
#define CHECKWRITEERROR(error) \
{ \
	if ((error) != CL_SUCCESS) \
	{ \
		sprintf(str, "Error writing to OpenCL buffer (error code %d)", (int)(error)); \
		setErrorString(str); \
		return false; \
	} \
}

// Status check for clEnqueueReadBuffer.
#define CHECKREADERROR(error) \
{ \
	if ((error) != CL_SUCCESS) \
	{ \
		sprintf(str, "Error reading from OpenCL buffer (error code %d)", (int)(error)); \
		setErrorString(str); \
		return false; \
	} \
}

// Status check for clEnqueueNDRangeKernel.
#define CHECKENQUEUEKERNELERROR(error) \
{ \
	if ((error) != CL_SUCCESS) \
	{ \
		sprintf(str, "Error enqueuing OpenCL kernel (error code %d)", (int)(error)); \
		setErrorString(str); \
		return false; \
	} \
}

// Status check for clFinish.
#define CHECKFINISHERROR(error) \
{ \
	if ((error) != CL_SUCCESS) \
	{ \
		sprintf(str, "Error calling OpenCL clFinish command (error code %d)", (int)(error)); \
		setErrorString(str); \
		return false; \
	} \
}

template <class T> bool Simulation2DDoubleGPURel::commonStart(T &extraCheck, int &steps, double dt)
{
	if (!m_init)
	{
		setErrorString("Not initialized");
		return false;
	}
	if (steps < 1)
	{
		setErrorString("At least one step should be executed");
		return false;
	}
	
	initializePotentialFinder();

	prepareRelativeCalculation();

	float scaledDt = (float)(dt/m_timeFactor);
	char str[1024];

	cl_int err, width, height;
	cl_kernel currentsKernel = m_gpu.getKernel(0);
	cl_kernel updateDensitiesKernel = m_gpu.getKernel(1);
	cl_kernel updateDensitiesKernelSwapped = m_gpu.getKernel(2);
	cl_kernel blackRedKernel = m_gpu.getKernel(3);
	cl_float clScaledDt = scaledDt;
	cl_float pixelFrac = (cl_float)m_pixelFrac;
	cl_float pixelFracInv = (cl_float)m_pixelFracInv;

	width = m_width;
	height = m_height;

	// TODO: perhaps this doesn't need to be done each time?

	int argNr = 0;

	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_mem), (void *)&m_clPotential); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_mem), (void *)&m_clExBaseN); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_mem), (void *)&m_clEyBaseN); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_mem), (void *)&m_clExBaseP); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_mem), (void *)&m_clEyBaseP); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_mem), (void *)&m_clDe); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_mem), (void *)&m_clDh); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_mem), (void *)&m_clEMob); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_mem), (void *)&m_clHMob); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_mem), (void *)&m_clN); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_mem), (void *)&m_clP); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_mem), (void *)&m_clNBase); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_mem), (void *)&m_clPBase); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_mem), (void *)&m_clECurX); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_mem), (void *)&m_clECurY); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_mem), (void *)&m_clHCurX); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_mem), (void *)&m_clHCurY); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_int), (void *)&width); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_int), (void *)&height); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_float), (void *)&pixelFrac); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(currentsKernel, argNr++, sizeof(cl_float), (void *)&pixelFracInv); CHECKKERNELARGERROR(err)

	argNr = 0;

	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clPotential); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clExBaseN); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clEyBaseN); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clExBaseP); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clEyBaseP); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clDe); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clDh); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clEMob); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clHMob); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clGRminJDivE); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clGRminJDivH); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clRN); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clRP); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clRecFactor); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clN); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clP); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clNBase); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clPBase); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clN2); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_mem), (void *)&m_clP2); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_int), (void *)&width); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_int), (void *)&height); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_float), (void *)&clScaledDt); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_float), (void *)&pixelFrac); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernel, argNr++, sizeof(cl_float), (void *)&pixelFracInv); CHECKKERNELARGERROR(err)

	argNr = 0;

	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clPotential); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clExBaseN); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clEyBaseN); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clExBaseP); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clEyBaseP); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clDe); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clDh); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clEMob); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clHMob); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clGRminJDivE); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clGRminJDivH); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clRN); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clRP); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clRecFactor); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clN); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clP); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clNBase); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clPBase); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clN2); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_mem), (void *)&m_clP2); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_int), (void *)&width); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_int), (void *)&height); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_float), (void *)&clScaledDt); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_float), (void *)&pixelFrac); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(updateDensitiesKernelSwapped, argNr++, sizeof(cl_float), (void *)&pixelFracInv); CHECKKERNELARGERROR(err)

	cl_float chargeMult = m_chargeMultiplierFloat;
	cl_float w = 1.0f;
	cl_int blackOrRed = 0;

	err = clSetKernelArg(blackRedKernel, 0, sizeof(cl_float), (void * )&chargeMult); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(blackRedKernel, 1, sizeof(cl_mem), (void *)&m_clN); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(blackRedKernel, 2, sizeof(cl_mem), (void *)&m_clP); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(blackRedKernel, 3, sizeof(cl_mem), (void *)&m_clA0); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(blackRedKernel, 4, sizeof(cl_mem), (void *)&m_clA1); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(blackRedKernel, 5, sizeof(cl_mem), (void *)&m_clA2); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(blackRedKernel, 6, sizeof(cl_mem), (void *)&m_clA3); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(blackRedKernel, 7, sizeof(cl_mem), (void *)&m_clA4); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(blackRedKernel, 8, sizeof(cl_mem), (void *)&m_clVSubPred); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(blackRedKernel, 9, sizeof(cl_mem), (void *)&m_clPotential); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(blackRedKernel, 10, sizeof(cl_float), (void * )&w); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(blackRedKernel, 11, sizeof(cl_int), (void * )&blackOrRed); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(blackRedKernel, 12, sizeof(cl_int), (void * )&width); CHECKKERNELARGERROR(err)
	err = clSetKernelArg(blackRedKernel, 13, sizeof(cl_int), (void * )&height); CHECKKERNELARGERROR(err)

	size_t currentsGlobalOffset[] = { 0 };
	size_t currentsGlobalSize[] = { m_totalPixels-width };
	size_t globalOffset[] = { width };
	size_t globalSize[] = { m_totalPixels-width-width };

	// TODO: check errors!
	// Sync values
	// m_potential[0] has been set to zero in prepareRelativeCalculation()
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clPotential, false, 0, m_totalPixels*sizeof(float), &(m_potential[0][0]), 0, 0, 0); CHECKWRITEERROR(err)

	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clVSubPred, false, 0, m_totalPixels*sizeof(float), &(m_VpredSub[0][0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clExBaseN, false, 0, m_totalPixels*sizeof(float), &(m_fieldBaseXN[0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clEyBaseN, false, 0, m_totalPixels*sizeof(float), &(m_fieldBaseYN[0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clExBaseP, false, 0, m_totalPixels*sizeof(float), &(m_fieldBaseXP[0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clEyBaseP, false, 0, m_totalPixels*sizeof(float), &(m_fieldBaseYP[0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clDe, false, 0, m_totalPixels*sizeof(float), &(m_De[0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clDh, false, 0, m_totalPixels*sizeof(float), &(m_Dh[0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clEMob, false, 0, m_totalPixels*sizeof(float), &(m_eMob[0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clHMob, false, 0, m_totalPixels*sizeof(float), &(m_hMob[0]), 0, 0, 0); CHECKWRITEERROR(err)
	
	// m_nRel and m_pRel have been set to zero
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clN, false, 0, m_totalPixels*sizeof(float), &(m_nRel[0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clP, false, 0, m_totalPixels*sizeof(float), &(m_pRel[0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clN2, false, 0, m_totalPixels*sizeof(float), &(m_nRel[0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clP2, false, 0, m_totalPixels*sizeof(float), &(m_pRel[0]), 0, 0, 0); CHECKWRITEERROR(err)

	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clNBase, false, 0, m_totalPixels*sizeof(float), &(m_nFloat[0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clPBase, false, 0, m_totalPixels*sizeof(float), &(m_pFloat[0]), 0, 0, 0); CHECKWRITEERROR(err)

	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clGRminJDivE, false, 0, m_totalPixels*sizeof(float), &(m_GRminJDivE[0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clGRminJDivH, false, 0, m_totalPixels*sizeof(float), &(m_GRminJDivH[0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clRN, false, 0, m_totalPixels*sizeof(float), &(m_rn[0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clRP, false, 0, m_totalPixels*sizeof(float), &(m_rp[0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clRecFactor, false, 0, m_totalPixels*sizeof(float), &(m_recombinationFactorFloat[0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clA0, false, 0, m_totalPixels*sizeof(float), &(m_a0sFloat[0][0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clA1, false, 0, m_totalPixels*sizeof(float), &(m_a1sFloat[0][0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clA2, false, 0, m_totalPixels*sizeof(float), &(m_a2sFloat[0][0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clA3, false, 0, m_totalPixels*sizeof(float), &(m_a3sFloat[0][0]), 0, 0, 0); CHECKWRITEERROR(err)
	err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clA4, false, 0, m_totalPixels*sizeof(float), &(m_a4sFloat[0][0]), 0, 0, 0); CHECKWRITEERROR(err)

	int maxSteps = steps;
	int count = 0;
	bool updatedClPotential = false;

	do
	{
		if (m_phiBackupBetterCounter < m_phiMethodThreshold || m_phiBackupBetterCounter%m_phiMethodAdditionalCheckCount == 0)
		{
			if (updatedClPotential)
			{
				err = clEnqueueReadBuffer(m_gpu.getCommandQueue(), m_clPotential, false, 0, sizeof(float)*m_totalPixels, &(m_backupPotential[0]), 0, 0, 0); CHECKREADERROR(err)
			}

			int num = 0;
			int count2 = 0;

			if (m_phiBackupBetterCounter < m_phiMethodThreshold)
				num = m_phiMethodThreshold-m_phiBackupBetterCounter;
			else
				num = 1;

			//std::cout << "HERE1 " << num << std::endl;

			do
			{
				if (count%2 == 0)
				{
					err = clEnqueueReadBuffer(m_gpu.getCommandQueue(), m_clN, false, 0, sizeof(float)*m_totalPixels, &(m_nRel[0]), 0, 0, 0); CHECKREADERROR(err)
					err = clEnqueueReadBuffer(m_gpu.getCommandQueue(), m_clP, false, 0, sizeof(float)*m_totalPixels, &(m_pRel[0]), 0, 0, 0); CHECKREADERROR(err)
				}
				else
				{
					err = clEnqueueReadBuffer(m_gpu.getCommandQueue(), m_clN2, false, 0, sizeof(float)*m_totalPixels, &(m_nRel[0]), 0, 0, 0); CHECKREADERROR(err)
					err = clEnqueueReadBuffer(m_gpu.getCommandQueue(), m_clP2, false, 0, sizeof(float)*m_totalPixels, &(m_pRel[0]), 0, 0, 0); CHECKREADERROR(err)
				}

				err = clFinish(m_gpu.getCommandQueue()); CHECKFINISHERROR(err)

				potentialFinder();
				updatedClPotential =  false;

				err = clEnqueueWriteBuffer(m_gpu.getCommandQueue(), m_clPotential, false, 0, m_totalPixels*sizeof(float), &(m_potential[0][0]), 0, 0, 0); CHECKWRITEERROR(err)

				if (count%2 == 0)
				{
					err = clEnqueueNDRangeKernel(m_gpu.getCommandQueue(), updateDensitiesKernel, 1, globalOffset, globalSize, 0, 0, 0, 0); 
					CHECKENQUEUEKERNELERROR(err)
				}
				else
				{
					err = clEnqueueNDRangeKernel(m_gpu.getCommandQueue(), updateDensitiesKernelSwapped, 1, globalOffset, globalSize, 0, 0, 0, 0);
					CHECKENQUEUEKERNELERROR(err)
				}

				count++;
				count2++;

			} while ((!extraCheck.isDone()) && count < maxSteps && count2 < num);
		}
		else
		{
			int num = m_phiMethodAdditionalCheckCount- (m_phiBackupBetterCounter%m_phiMethodAdditionalCheckCount);
			int count2 = 0;
			//std::cout << "HERE2 " << num << std::endl;

			do
			{
				if (count%2 == 0)
				{
					err = clSetKernelArg(blackRedKernel, 1, sizeof(cl_mem), (void *)&m_clN); CHECKKERNELARGERROR(err)
					err = clSetKernelArg(blackRedKernel, 2, sizeof(cl_mem), (void *)&m_clP); CHECKKERNELARGERROR(err)
				}
				else
				{
					err = clSetKernelArg(blackRedKernel, 1, sizeof(cl_mem), (void *)&m_clN2); CHECKKERNELARGERROR(err)
					err = clSetKernelArg(blackRedKernel, 2, sizeof(cl_mem), (void *)&m_clP2); CHECKKERNELARGERROR(err)
				}

				for (blackOrRed = 0; blackOrRed < 8 ; blackOrRed++)
				{
					err = clSetKernelArg(blackRedKernel, 11, sizeof(cl_int), (void * )&blackOrRed); CHECKKERNELARGERROR(err)
					err = clEnqueueNDRangeKernel(m_gpu.getCommandQueue(), blackRedKernel, 1, globalOffset, globalSize, 0, 0, 0, 0); CHECKENQUEUEKERNELERROR(err)
					updatedClPotential = true;
				}
				m_phiBackupBetterCounter++;

				if (count%2 == 0)
				{
					err = clEnqueueNDRangeKernel(m_gpu.getCommandQueue(), updateDensitiesKernel, 1, globalOffset, globalSize, 0, 0, 0, 0);
					CHECKENQUEUEKERNELERROR(err)
				}
				else
				{
					err = clEnqueueNDRangeKernel(m_gpu.getCommandQueue(), updateDensitiesKernelSwapped, 1, globalOffset, globalSize, 0, 0, 0, 0);
					CHECKENQUEUEKERNELERROR(err)
				}
				count++;
				count2++;

			} while ((!extraCheck.isDone()) && count < maxSteps && count2 < num);
		}

	} while ((!extraCheck.isDone()) && count < maxSteps);

	steps = count;

	// Calculate currents

	if (count%2 == 1)
	{
		err = clSetKernelArg(currentsKernel, 9, sizeof(cl_mem), (void *)&m_clN); CHECKKERNELARGERROR(err)
		err = clSetKernelArg(currentsKernel, 10, sizeof(cl_mem), (void *)&m_clP); CHECKKERNELARGERROR(err)
	}
	else
	{
		err = clSetKernelArg(currentsKernel, 9, sizeof(cl_mem), (void *)&m_clN2); CHECKKERNELARGERROR(err)
		err = clSetKernelArg(currentsKernel, 10, sizeof(cl_mem), (void *)&m_clP2); CHECKKERNELARGERROR(err)
	}
	err = clEnqueueNDRangeKernel(m_gpu.getCommandQueue(), currentsKernel, 1, currentsGlobalOffset, currentsGlobalSize, 0, 0, 0, 0);
	CHECKENQUEUEKERNELERROR(err)

	// Fetch results

	err = clEnqueueReadBuffer(m_gpu.getCommandQueue(), m_clECurX, false, 0, sizeof(float)*m_totalPixels, &(m_numCurTotExRel[0]), 0, 0, 0); CHECKREADERROR(err)
	err = clEnqueueReadBuffer(m_gpu.getCommandQueue(), m_clECurY, false, 0, sizeof(float)*m_totalPixels, &(m_numCurTotEyRel[0]), 0, 0, 0); CHECKREADERROR(err)
	err = clEnqueueReadBuffer(m_gpu.getCommandQueue(), m_clHCurX, false, 0, sizeof(float)*m_totalPixels, &(m_numCurTotHxRel[0]), 0, 0, 0); CHECKREADERROR(err)
	err = clEnqueueReadBuffer(m_gpu.getCommandQueue(), m_clHCurY, false, 0, sizeof(float)*m_totalPixels, &(m_numCurTotHyRel[0]), 0, 0, 0); CHECKREADERROR(err)

	if (count%2 == 1)
	{
		err = clEnqueueReadBuffer(m_gpu.getCommandQueue(), m_clN2, false, 0, sizeof(float)*m_totalPixels, &(m_nRel[0]), 0, 0, 0); CHECKREADERROR(err)
		err = clEnqueueReadBuffer(m_gpu.getCommandQueue(), m_clP2, false, 0, sizeof(float)*m_totalPixels, &(m_pRel[0]), 0, 0, 0); CHECKREADERROR(err)
	}
	else
	{
		err = clEnqueueReadBuffer(m_gpu.getCommandQueue(), m_clN, false, 0, sizeof(float)*m_totalPixels, &(m_nRel[0]), 0, 0, 0); CHECKREADERROR(err)
		err = clEnqueueReadBuffer(m_gpu.getCommandQueue(), m_clP, false, 0, sizeof(float)*m_totalPixels, &(m_pRel[0]), 0, 0, 0); CHECKREADERROR(err)
	}

	if (updatedClPotential)
	{
		err = clEnqueueReadBuffer(m_gpu.getCommandQueue(), m_clPotential, false, 0, sizeof(float)*m_totalPixels, &(m_backupPotential[0]), 0, 0, 0); 
		CHECKREADERROR(err)
	}

	err = clFinish(m_gpu.getCommandQueue()); CHECKFINISHERROR(err)

	if (updatedClPotential)
		memcpy(&(m_potential[0][0]), &(m_backupPotential[0]), sizeof(float)*m_totalPixels);

	mergeRelativeResults();

	return true;
}

std::string Simulation2DDoubleGPURel::getOpenCLProgram()
{
	// TODO: for debugging
	//std::string program = GPURelLoadCLFile("/home/jori/projects/simiconductor/src/proggpurel.cl");

	std::string program = 
		"union floatUnion\n"
		"{\n"
		"	float m_float;\n"
		"	uint m_int;\n"
		"};\n"
		"\n"
		"#define LIM 1e-5\n"
		"\n"
		"float getSGCurrent(float v, float D, float delta, float n1, float n2)\n"
		"{\n"
		"	float vDelta = v*delta;\n"
		"	float x = vDelta/D;\n"
		"	float j = 0;\n"
		"\n"
		"	if (x < -LIM)\n"
		"	{\n"
		"		float factor = exp(-x);\n"
		"		float n2factor = n2*factor;\n"
		"		float oneMinusFactor = 1.0f-factor;\n"
		"		float n1MinusN2Factor = n1-n2factor;\n"
		"		float n1MinusN2FactorOverOneMinusFactor = n1MinusN2Factor/oneMinusFactor;\n"
		"\n"
		"		j = v*n1MinusN2FactorOverOneMinusFactor;\n"
		"	}\n"
		"	else if (x > LIM)\n"
		"	{\n"
		"		float factor = exp(x);\n"
		"		float factorMinusOne = factor - 1.0f;\n"
		"		float n1factor = n1*factor;\n"
		"		float n1factorMinusN2 = n1factor - n2;\n"
		"		float n1factorMinusN2OverFactorMinusOne = n1factorMinusN2/factorMinusOne;\n"
		"\n"
		"		j = v*n1factorMinusN2OverFactorMinusOne;\n"
		"	}\n"
		"	else\n"
		"	{\n"
		"		x /= 2.0f;\n"
		"\n"
		"		float twoD = 2.0f*D;\n"
		"		float factor = twoD/delta;\n"
		"		float n1MinusN2 = n1-n2;\n"
		"		float n1PlusN2 = n1+n2;\n"
		"		float term1 = 0.5f*n1MinusN2;\n"
		"		float term2Part = 0.5f*n1PlusN2;\n"
		"		float term2 = term2Part*x;\n"
		"		float xSquared = x*x;\n"
		"		float term3Part = n1MinusN2*xSquared;\n"
		"		float term3 = term3Part/6.0f;\n"
		"\n"
		"		float seriesPart = term1 + term2;\n"
		"		float series = seriesPart/* + term3*/;\n"
		"\n"
		"		j = factor*series;\n"
		"	}\n"
		"\n"
		"	return j;\n"
		"}\n"
		"\n"
		"float2 getXRelCurrents(int idx,\n"
		"			  __global const float *pV, \n"
		"			  __global const float *pExBaseN, __global const float *pExBaseP,\n"
		"                          __global const float *pDe, __global const float *pDh,\n"
		"			  __global const float *pEMob, __global const float *pHMob,\n"
		"			  __global const float *pN, __global const float *pP,\n"
		"			  __global const float *pNBase, __global const float *pPBase,\n"
		"			  int width, int height)\n"
		"{\n"
		"	int x = idx % width;\n"
		"	int y = idx / width;\n"
		"	int nextX = (x+1) % width;\n"
		"	int nextIdx = nextX + y*width;\n"
		"\n"
		"	float Ex1 = - ( pV[nextIdx] - pV[idx] );\n"
		"	float De = (pDe[idx]+pDe[nextIdx])/2.0f;\n"
		"	float Dh = (pDh[idx]+pDh[nextIdx])/2.0f;\n"
		"	float eMob = (pEMob[idx]+pEMob[nextIdx])/2.0f;\n"
		"	float hMob = (pHMob[idx]+pHMob[nextIdx])/2.0f;\n"
		"\n"
		"	float ve0 = -eMob*pExBaseN[idx];\n"
		"	float vh0 = +hMob*pExBaseP[idx];\n"
		"	float ve1 = -eMob*Ex1;\n"
		"	float vh1 = +hMob*Ex1;\n"
		"\n"
		"	float2 currents;\n"
		"\n"
		"	currents.x = getSGCurrent(ve0+ve1, De, 1.0f, pN[idx], pN[nextIdx]) + 0.5f*ve1*(pNBase[idx]+pNBase[nextIdx]);\n"
		"	currents.y = getSGCurrent(vh0+vh1, Dh, 1.0f, pP[idx], pP[nextIdx]) + 0.5f*vh1*(pPBase[idx]+pPBase[nextIdx]);\n"
		"\n"
		"	return currents;\n"
		"}\n"
		"\n"
		"float2 getYRelCurrents(int idx,\n"
		"			  __global const float *pV, \n"
		"			  __global const float *pEyBaseN, __global const float *pEyBaseP,\n"
		"                          __global const float *pDe, __global const float *pDh,\n"
		"			  __global const float *pEMob, __global const float *pHMob,\n"
		"			  __global const float *pN, __global const float *pP,\n"
		"			  __global const float *pNBase, __global const float *pPBase,\n"
		"			  int width, int height, float pixelFrac, float pixelFracInv)\n"
		"{\n"
		"	int x = idx % width;\n"
		"	int y = idx / width;\n"
		"	int nextY = y + 1 - y/(height-1); // if no next y is available, this just yields the same y, we won't be using this calculation, so it doesn't really matter\n"
		"	int nextIdx = x + nextY*width;\n"
		"\n"
		"	float Ey1 = -(pV[nextIdx]-pV[idx])*pixelFrac;\n"
		"	float De = (pDe[idx]+pDe[nextIdx])/2.0f;\n"
		"	float Dh = (pDh[idx]+pDh[nextIdx])/2.0f;\n"
		"	float eMob = (pEMob[idx]+pEMob[nextIdx])/2.0f;\n"
		"	float hMob = (pHMob[idx]+pHMob[nextIdx])/2.0f;\n"
		"\n"
		"	float ve0 = -eMob*pEyBaseN[idx];\n"
		"	float vh0 = +hMob*pEyBaseP[idx];\n"
		"	float ve1 = -eMob*Ey1;\n"
		"	float vh1 = +hMob*Ey1;\n"
		"\n"
		"	float2 currents;\n"
		"\n"
		"	currents.x = getSGCurrent(ve0+ve1, De, pixelFracInv, pN[idx], pN[nextIdx]) + 0.5f*ve1*(pNBase[idx]+pNBase[nextIdx]);\n"
		"	currents.y = getSGCurrent(vh0+vh1, Dh, pixelFracInv, pP[idx], pP[nextIdx]) + 0.5f*vh1*(pPBase[idx]+pPBase[nextIdx]);\n"
		"\n"
		"	return currents;\n"
		"}\n"
		"\n"
		"__kernel void currentsKernel(__global const float *pV, \n"
		"			  __global const float *pExBaseN, __global const float *pEyBaseN,\n"
		"			  __global const float *pExBaseP, __global const float *pEyBaseP,\n"
		"                          __global const float *pDe, __global const float *pDh,\n"
		"			  __global const float *pEMob, __global const float *pHMob,\n"
		"			  __global const float *pN, __global const float *pP,\n"
		"			  __global const float *pNBase, __global const float *pPBase,\n"
		"			  __global float *pECurX, __global float *pECurY,\n"
		"			  __global float *pHCurX, __global float *pHCurY,\n"
		"			  int width, int height, float pixelFrac, float pixelFracInv)\n"
		"{\n"
		"	int idx = get_global_id(0);\n"
		"\n"
		"	float2 xCurrents = getXRelCurrents(idx, pV, pExBaseN, pExBaseP, pDe, pDh, pEMob, pHMob, pN, pP, pNBase, pPBase, width, height);\n"
		"	float2 yCurrents = getYRelCurrents(idx, pV, pEyBaseN, pEyBaseP, pDe, pDh, pEMob, pHMob, pN, pP, pNBase, pPBase, width, height, pixelFrac, pixelFracInv);\n"
		"\n"
		"	pECurX[idx] = xCurrents.x;\n"
		"	pHCurX[idx] = xCurrents.y;\n"
		"	pECurY[idx] = yCurrents.x;\n"
		"	pHCurY[idx] = yCurrents.y;\n"
		"}\n"
		"\n"
		"__kernel void updateDensitiesKernel(__global const float *pV, \n"
		"			  __global const float *pExBaseN, __global const float *pEyBaseN,\n"
		"			  __global const float *pExBaseP, __global const float *pEyBaseP,\n"
		"                          __global const float *pDe, __global const float *pDh,\n"
		"			  __global const float *pEMob, __global const float *pHMob,\n"
		"			  __global const float *pGRminJDivE,\n"
		"			  __global const float *pGRminJDivH,\n"
		"			  __global const float *pRN,\n"
		"			  __global const float *pRP, __global const float *pRecFactor,\n"
		"			  __global const float *pN, __global const float *pP,\n"
		"			  __global const float *pNBase, __global const float *pPBase,\n"
		"			  __global float *pNdst, __global float *pPdst,\n"
		"			  int width, int height,\n"
		"			  float scaledDt, float pixelFrac, float pixelFracInv)\n"
		"\n"
		"{\n"
		"	int idx = get_global_id(0);\n"
		"	int x = idx % width;\n"
		"	int y = idx / width;\n"
		"	int prevX = (x-1+width)%width;\n"
		"	int prevY = y-1;\n"
		"	int leftIdx = prevX + y*width;\n"
		"	int belowIdx = x + prevY*width;\n"
		"\n"
		"	float2 xCurrentsIdx = getXRelCurrents(idx, pV, pExBaseN, pExBaseP, pDe, pDh, pEMob, pHMob, pN, pP, pNBase, pPBase, width, height);\n"
		"	float2 yCurrentsIdx = getYRelCurrents(idx, pV, pEyBaseN, pEyBaseP, pDe, pDh, pEMob, pHMob, pN, pP, pNBase, pPBase, width, height, pixelFrac, pixelFracInv);\n"
		"	float2 xCurrentsLeftIdx = getXRelCurrents(leftIdx, pV, pExBaseN, pExBaseP, pDe, pDh, pEMob, pHMob, pN, pP, pNBase, pPBase, width, height);\n"
		"	float2 yCurrentsBelowIdx = getYRelCurrents(belowIdx, pV, pEyBaseN, pEyBaseP, pDe, pDh, pEMob, pHMob, pN, pP, pNBase, pPBase, width, height, pixelFrac, pixelFracInv);\n"
		"\n"
		"	float JexCur = xCurrentsIdx.x;  // .x specifies electron current\n"
		"	float JexPrev = xCurrentsLeftIdx.x;\n"
		"	float JeyCur = yCurrentsIdx.x;\n"
		"	float JeyPrev = yCurrentsBelowIdx.x;\n"
		"\n"
		"	float JhxCur = xCurrentsIdx.y; // .y specifies hole current\n"
		"	float JhxPrev = xCurrentsLeftIdx.y;\n"
		"	float JhyCur = yCurrentsIdx.y;\n"
		"	float JhyPrev = yCurrentsBelowIdx.y;\n"
		"\n"
		"	// Calculate the gradients\n"
		"\n"
		"	float Jexx = (JexCur - JexPrev);\n"
		"	float Jeyy = (JeyCur - JeyPrev)*pixelFrac;\n"
		"	float Jhxx = (JhxCur - JhxPrev);\n"
		"	float Jhyy = (JhyCur - JhyPrev)*pixelFrac;\n"
		"			\n"
		"	// Calculate the rates according to the continuity equation\n"
		"	float n = pN[idx];\n"
		"	float p = pP[idx];\n"
		"	float recPart = pRN[idx]*p + pRP[idx]*n + pRecFactor[idx]*p*n;\n"
		"	float netGenE = pGRminJDivE[idx] - recPart;\n"
		"	float netGenH = pGRminJDivH[idx] - recPart;\n"
		"\n"
		"	float dndt = netGenE - ( Jexx+Jeyy );\n"
		"	float dpdt = netGenH - ( Jhxx+Jhyy );\n"
		"\n"
		"	pNdst[idx] = n + dndt*scaledDt;\n"
		"	pPdst[idx] = p + dpdt*scaledDt;\n"
		"}\n"
		"\n"
		"__kernel void updateDensitiesKernelSwapped(__global const float *pV, \n"
		"			  __global const float *pExBaseN, __global const float *pEyBaseN,\n"
		"			  __global const float *pExBaseP, __global const float *pEyBaseP,\n"
		"                          __global const float *pDe, __global const float *pDh,\n"
		"			  __global const float *pEMob, __global const float *pHMob,\n"
		"			  __global const float *pGRminJDivE,\n"
		"			  __global const float *pGRminJDivH,\n"
		"			  __global const float *pRN,\n"
		"			  __global const float *pRP, __global const float *pRecFactor,\n"
		"			  __global float *pNdst, __global float *pPdst,\n"
		"			  __global const float *pNBase, __global const float *pPBase,\n"
		"			  __global const float *pN, __global const float *pP,\n"
		"			  int width, int height,\n"
		"			  float scaledDt, float pixelFrac, float pixelFracInv)\n"
		"\n"
		"{\n"
		"	int idx = get_global_id(0);\n"
		"	int x = idx % width;\n"
		"	int y = idx / width;\n"
		"	int prevX = (x-1+width)%width;\n"
		"	int prevY = y-1;\n"
		"	int leftIdx = prevX + y*width;\n"
		"	int belowIdx = x + prevY*width;\n"
		"\n"
		"	float2 xCurrentsIdx = getXRelCurrents(idx, pV, pExBaseN, pExBaseP, pDe, pDh, pEMob, pHMob, pN, pP, pNBase, pPBase, width, height);\n"
		"	float2 yCurrentsIdx = getYRelCurrents(idx, pV, pEyBaseN, pEyBaseP, pDe, pDh, pEMob, pHMob, pN, pP, pNBase, pPBase, width, height, pixelFrac, pixelFracInv);\n"
		"	float2 xCurrentsLeftIdx = getXRelCurrents(leftIdx, pV, pExBaseN, pExBaseP, pDe, pDh, pEMob, pHMob, pN, pP, pNBase, pPBase, width, height);\n"
		"	float2 yCurrentsBelowIdx = getYRelCurrents(belowIdx, pV, pEyBaseN, pEyBaseP, pDe, pDh, pEMob, pHMob, pN, pP, pNBase, pPBase, width, height, pixelFrac, pixelFracInv);\n"
		"\n"
		"	float JexCur = xCurrentsIdx.x;  // .x specifies electron current\n"
		"	float JexPrev = xCurrentsLeftIdx.x;\n"
		"	float JeyCur = yCurrentsIdx.x;\n"
		"	float JeyPrev = yCurrentsBelowIdx.x;\n"
		"\n"
		"	float JhxCur = xCurrentsIdx.y; // .y specifies hole current\n"
		"	float JhxPrev = xCurrentsLeftIdx.y;\n"
		"	float JhyCur = yCurrentsIdx.y;\n"
		"	float JhyPrev = yCurrentsBelowIdx.y;\n"
		"\n"
		"	// Calculate the gradients\n"
		"\n"
		"	float Jexx = (JexCur - JexPrev);\n"
		"	float Jeyy = (JeyCur - JeyPrev)*pixelFrac;\n"
		"	float Jhxx = (JhxCur - JhxPrev);\n"
		"	float Jhyy = (JhyCur - JhyPrev)*pixelFrac;\n"
		"			\n"
		"	// Calculate the rates according to the continuity equation\n"
		"	float n = pN[idx];\n"
		"	float p = pP[idx];\n"
		"	float recPart = pRN[idx]*p + pRP[idx]*n + pRecFactor[idx]*p*n;\n"
		"	float netGenE = pGRminJDivE[idx] - recPart;\n"
		"	float netGenH = pGRminJDivH[idx] - recPart;\n"
		"\n"
		"	float dndt = netGenE - ( Jexx+Jeyy );\n"
		"	float dpdt = netGenH - ( Jhxx+Jhyy );\n"
		"\n"
		"	pNdst[idx] = n + dndt*scaledDt;\n"
		"	pPdst[idx] = p + dpdt*scaledDt;\n"
		"}\n"
		"\n"
		"__kernel void blackRedKernel(float chargeMultiplier, \n"
		"                          __global const float *pN, __global const float *pP,\n"
		"			  __global const float *pA0, __global const float *pA1, __global const float *pA2,\n"
		"			  __global const float *pA3, __global const float *pA4, \n"
		"			  __global const float *pVPredSub,\n"
		"			  __global float *pV, \n"
		"			  float w, int blackOrRed, int width, int height)\n"
		"{\n"
		"	int idx = get_global_id(0);\n"
		"	int x = idx % width;\n"
		"	int y = idx / width;\n"
		"	int xPrev = (x-1+width)%width;\n"
		"	int xNext = (x+1)%width;\n"
		"	int yNext = y+1;\n"
		"	int yPrev = y-1;\n"
		"\n"
		"	int leftIndex = xPrev + y*width;\n"
		"	int rightIndex = xNext + y*width;\n"
		"	int upIndex = x + yNext*width;\n"
		"	int downIndex = x + yPrev*width;\n"
		"\n"
		"	float charge = (pP[idx]-pN[idx])*chargeMultiplier;\n"
		"\n"
		"	float prediction = ( pA1[idx]*pV[rightIndex] + pA2[idx]*pV[leftIndex] + pA3[idx]*pV[upIndex] + pA4[idx]*pV[downIndex] \n"
		"                             + charge ) / pA0[idx];\n"
		"\n"
		"	float curValue = pV[idx];\n"
		"	float diff = pVPredSub[idx] + prediction-curValue;\n"
		"\n"
		"	int update = (x+y+blackOrRed+1)%2;\n"
		"	\n"
		"	float newValue = curValue + (diff*w)*((float)update);\n"
		"\n"
		"	pV[idx] = newValue;\n"
		"}\n"
		"\n";
	return program;
}

// Prepares the GPU back-end: obtains the OpenCL program text, initializes the
// GPU helper with room for four kernels, loads the program and then loads the
// four kernels into slots 0..3.
//
// Returns true on success. On failure the error string is set and false is
// returned; once the GPU helper has been initialized, any later failure also
// destroys it again before returning.
bool Simulation2DDoubleGPURel::initGPU()
{
	// Kernel slot, kernel entry point and the error prefix reported when
	// loading that kernel fails.
	struct KernelEntry
	{
		int slot;
		const char *pName;
		const char *pErrorPrefix;
	};

	static const KernelEntry kernels[] =
	{
		{ 0, "currentsKernel", "Internal error: couldn't load GPU kernel: " },
		{ 1, "updateDensitiesKernel", "Internal error: couldn't load second GPU kernel: " },
		{ 2, "updateDensitiesKernelSwapped", "Internal error: couldn't load third GPU kernel: " },
		{ 3, "blackRedKernel", "Internal error: couldn't load fourth GPU kernel: " }
	};
	static const int numKernels = sizeof(kernels)/sizeof(kernels[0]);

	std::string source = getOpenCLProgram();
	if (source.empty())
	{
		setErrorString("Internal error: couldn't load GPU program");
		return false;
	}

	if (!m_gpu.init(4))
	{
		setErrorString(std::string("Unable to initialize GPU usage: ") + m_gpu.getErrorString());
		return false;
	}

	std::string buildLog;
	if (!m_gpu.loadProgram(source, buildLog))
	{
		setErrorString(std::string("Internal error: couldn't load GPU program: ") + m_gpu.getErrorString());
		// The log filled in by loadProgram is only written to stderr, it is not part of the error string
		std::cerr << buildLog << std::endl;
		m_gpu.destroy();
		return false;
	}

	for (int k = 0 ; k < numKernels ; k++)
	{
		const KernelEntry &entry = kernels[k];

		if (m_gpu.loadKernel(entry.slot, entry.pName))
			continue;

		setErrorString(std::string(entry.pErrorPrefix) + m_gpu.getErrorString());
		m_gpu.destroy();
		return false;
	}

	return true;
}

// Loads the contents of 'state' into this simulation.
//
// The call fails (returns false and sets the error string) when:
//  - the simulation or the state is not initialized,
//  - the state is not two-dimensional or its pixel counts differ from the
//    simulation's,
//  - a required grid property (n, p, Dn, Dp, electron/hole mobility, relative
//    permittivity) is not set,
//  - the potential difference is not set,
//  - the Braun recombination model is selected.
//
// Grid properties that are set in the state but are neither required, nor
// produced by storeState, nor optional, are reported through 'warnings', as
// are Braun-only parameters that are set.
//
// Note that the per-pixel values are copied before the potential difference
// and the recombination model are checked, so a failure in those last checks
// leaves the simulation with the new grid values already applied.
bool Simulation2DDoubleGPURel::setState(const SimulationState &state, std::vector<std::string> &warnings)
{
	if (!m_init)
	{
		setErrorString("Simulation not initialized");
		return false;
	}
	if (!state.isInitialized())
	{
		setErrorString("State to be used is not initialized");
		return false;
	}
	if (state.getDimensions() != 2)
	{
		// This is a 2D simulation, so the message must say so
		setErrorString("State is not two-dimensional");
		return false;
	}

	if (state.getNumberOfXPixels() != getNumXPixels() || state.getNumberOfYPixels() != getNumYPixels())
	{
		setErrorString("State does not have same dimensions as simulation");
		return false;
	}

	// Classification of every grid property; all entries start out as false.
	std::vector<bool> isRequired(SIMSTATE_GRIDPROP_MAX, false);
	std::vector<bool> isProduced(SIMSTATE_GRIDPROP_MAX, false);
	std::vector<bool> isOptional(SIMSTATE_GRIDPROP_MAX, false);

	isRequired[SIMSTATE_GRIDPROP_N] = true;
	isRequired[SIMSTATE_GRIDPROP_P] = true;
	isRequired[SIMSTATE_GRIDPROP_DN] = true;
	isRequired[SIMSTATE_GRIDPROP_DP] = true;
	isRequired[SIMSTATE_GRIDPROP_NMOB] = true;
	isRequired[SIMSTATE_GRIDPROP_PMOB] = true;
	isRequired[SIMSTATE_GRIDPROP_EPSREL] = true;

	isProduced[SIMSTATE_GRIDPROP_N] = true;
	isProduced[SIMSTATE_GRIDPROP_P] = true;
	isProduced[SIMSTATE_GRIDPROP_V] = true;
	isProduced[SIMSTATE_GRIDPROP_JNX] = true;
	isProduced[SIMSTATE_GRIDPROP_JPX] = true;
	isProduced[SIMSTATE_GRIDPROP_JNY] = true;
	isProduced[SIMSTATE_GRIDPROP_JPY] = true;
	isProduced[SIMSTATE_GRIDPROP_R] = true;

	isOptional[SIMSTATE_GRIDPROP_BG] = true;
	isOptional[SIMSTATE_GRIDPROP_G] = true;
	isOptional[SIMSTATE_GRIDPROP_RF] = true;
	isOptional[SIMSTATE_GRIDPROP_VNEXTRA] = true;
	isOptional[SIMSTATE_GRIDPROP_VPEXTRA] = true;

	for (int i = 0 ; i < SIMSTATE_GRIDPROP_MAX ; i++)
	{
		const bool isSet = state.isGridPropertySet(i);

		if (isRequired[i])
		{
			if (!isSet)
			{
				setErrorString("Grid property '" + state.getGridPropertyName(i) + "' is required, but has not been set");
				return false;
			}
		}
		else if (isSet && !isProduced[i] && !isOptional[i])
		{
			warnings.push_back("Property '" + state.getGridPropertyName(i) + "' is set but not used");
		}
	}

	int numX = getNumXPixels();
	int numY = getNumYPixels();

	// Whether a non-required property is present does not depend on the pixel,
	// so look this up once instead of for every pixel.
	const bool hasBackground = state.isGridPropertySet(SIMSTATE_GRIDPROP_BG);
	const bool hasGeneration = state.isGridPropertySet(SIMSTATE_GRIDPROP_G);
	const bool hasRecombination = state.isGridPropertySet(SIMSTATE_GRIDPROP_RF);
	const bool hasPotential = state.isGridPropertySet(SIMSTATE_GRIDPROP_V);
	const bool hasExtraPotentialN = state.isGridPropertySet(SIMSTATE_GRIDPROP_VNEXTRA);
	const bool hasExtraPotentialP = state.isGridPropertySet(SIMSTATE_GRIDPROP_VPEXTRA);

	for (int y = 0 ; y < numY ; y++)
	{
		for (int x = 0 ; x < numX ; x++)
		{
			setElectronNumberDensity(x, y, state.getGridProperty(SIMSTATE_GRIDPROP_N, x, y));
			setHoleNumberDensity(x, y, state.getGridProperty(SIMSTATE_GRIDPROP_P, x, y));

			// The hole mobility is set from SIMSTATE_GRIDPROP_PMOB below; it
			// must not be fed the hole density.

			if (hasBackground)
				setBackgroundNumberDensity(x, y, state.getGridProperty(SIMSTATE_GRIDPROP_BG, x, y));

			if (hasGeneration)
				setGenerationRate(x, y, state.getGridProperty(SIMSTATE_GRIDPROP_G, x, y));
			if (hasRecombination)
				setRecombinationFactor(x, y, state.getGridProperty(SIMSTATE_GRIDPROP_RF, x, y));

			setElectronDiffusionConstant(x, y, state.getGridProperty(SIMSTATE_GRIDPROP_DN, x, y));
			setHoleDiffusionConstant(x, y, state.getGridProperty(SIMSTATE_GRIDPROP_DP, x, y));
			setElectronMobility(x, y, state.getGridProperty(SIMSTATE_GRIDPROP_NMOB, x, y));
			setHoleMobility(x, y, state.getGridProperty(SIMSTATE_GRIDPROP_PMOB, x, y));
			setRelativePermittivity(x, y, state.getGridProperty(SIMSTATE_GRIDPROP_EPSREL, x, y));

			if (hasPotential)
				setPotential(x, y, state.getGridProperty(SIMSTATE_GRIDPROP_V, x, y));

			if (hasExtraPotentialN)
				setExtraElectronPotential(x, y, state.getGridProperty(SIMSTATE_GRIDPROP_VNEXTRA, x, y));

			if (hasExtraPotentialP)
				setExtraHolePotential(x, y, state.getGridProperty(SIMSTATE_GRIDPROP_VPEXTRA, x, y));
		}
	}

	double vDiff;

	if (!state.getDoubleProperty(SIMSTATE_PROP_VDIFF, vDiff))
	{
		setErrorString("Potential difference has not been set");
		return false;
	}

	setPotentialDifference(vDiff);

	if (state.getRecombinationModel() == SimulationState::Braun)
	{
		setErrorString("Braun recombination model selected, but not supported in this simulation type");
		return false;
	}
	else
	{
		// These parameters are only read to find out whether they are set.
		double a, kf;

		if (state.getDoubleProperty(SIMSTATE_PROP_PAIRDIST, a))
			warnings.push_back("'Pair distance' parameter is set, but is only used in the braun model");
		if (state.getDoubleProperty(SIMSTATE_PROP_KF, kf))
			warnings.push_back("'Dissociation rate' parameter 'kf' is set, but is only used in the braun model");
	}

	//FILE *f = fopen("log.txt","wt");
	//writePlotData(f);
	//fclose(f);

	return true;
}

// Writes the current contents of the simulation into 'state'.
//
// All double and grid properties of 'state' are cleared first. Then the
// densities, potential, fixed material grids, number currents and the
// recombination rate are stored, each multiplied by the scale factor used
// below, followed by the 'Simple' recombination model and the potential
// difference.
//
// Returns false and sets the error string when the simulation is not
// initialized, when the pixel counts of 'state' differ from the simulation's,
// or when storing any item fails; in the last case 'state' may be partially
// filled.
bool Simulation2DDoubleGPURel::storeState(SimulationState &state) const
{
	if (!m_init)
	{
		setErrorString("Not initialized");
		return false;
	}

	const bool sameSize = (state.getNumberOfXPixels() == getNumXPixels() &&
	                       state.getNumberOfYPixels() == getNumYPixels());
	if (!sameSize)
	{
		setErrorString("Number of pixels is not same as in simulation state");
		return false;
	}

	// Start from an empty state
	for (int prop = 0 ; prop < SIMSTATE_PROP_MAX ; prop++)
		state.clearDoubleProperty(prop);

	for (int prop = 0 ; prop < SIMSTATE_GRIDPROP_MAX ; prop++)
		state.clearGridProperty(prop);

	// Scratch buffer shared by all copyProperty calls
	std::vector<double> scratch(getNumXPixels() * getNumYPixels());

	const bool storedDynamic =
		copyProperty(state, SIMSTATE_GRIDPROP_N, scratch, m_n, m_npDensFactor) &&
		copyProperty(state, SIMSTATE_GRIDPROP_P, scratch, m_p, m_npDensFactor) &&
		state.setGridProperty(SIMSTATE_GRIDPROP_V, m_Vbase[0]);

	if (!storedDynamic)
	{
		setErrorString("Unable to store n,p or V");
		return false;
	}

	// The diffusion constants and the mobilities are stored with the same multiplier
	const double generationScale = m_npDensFactor/m_timeFactor;
	const double recombinationScale = 1.0/(m_npDensFactor*m_timeFactor);
	const double transportScale = m_pixelWidth*m_pixelWidth/m_timeFactor;

	const bool storedFixed =
		copyProperty(state, SIMSTATE_GRIDPROP_BG, scratch, m_background, m_npDensFactor) &&
		copyProperty(state, SIMSTATE_GRIDPROP_G, scratch, m_generationRate, generationScale) &&
		copyProperty(state, SIMSTATE_GRIDPROP_RF, scratch, m_recombinationFactor, recombinationScale) &&
		copyProperty(state, SIMSTATE_GRIDPROP_DN, scratch, m_De, transportScale) &&
		copyProperty(state, SIMSTATE_GRIDPROP_DP, scratch, m_Dh, transportScale) &&
		copyProperty(state, SIMSTATE_GRIDPROP_NMOB, scratch, m_eMob, transportScale) &&
		copyProperty(state, SIMSTATE_GRIDPROP_PMOB, scratch, m_hMob, transportScale) &&
		state.setGridProperty(SIMSTATE_GRIDPROP_EPSREL, m_epsRels[0]) &&
		state.setGridProperty(SIMSTATE_GRIDPROP_VNEXTRA, m_extraPotentialN) &&
		state.setGridProperty(SIMSTATE_GRIDPROP_VPEXTRA, m_extraPotentialP);

	if (!storedFixed)
	{
		setErrorString("Unable to store fixed simulation items");
		return false;
	}

	// Number currents, stored pixel by pixel
	const double currentScale = m_npDensFactor*m_pixelWidth/m_timeFactor;

	for (int col = 0 ; col < m_width ; col++)
	{
		for (int row = 0 ; row < m_height ; row++)
		{
			const int pixel = col+row*m_width;

			const double electronX = m_numCurTotEx[pixel]*currentScale;
			const double holeX = m_numCurTotHx[pixel]*currentScale;
			const double electronY = m_numCurTotEy[pixel]*currentScale;
			const double holeY = m_numCurTotHy[pixel]*currentScale;

			const bool storedCurrents =
				state.setGridProperty(SIMSTATE_GRIDPROP_JNX, col, row, electronX) &&
				state.setGridProperty(SIMSTATE_GRIDPROP_JPX, col, row, holeX) &&
				state.setGridProperty(SIMSTATE_GRIDPROP_JNY, col, row, electronY) &&
				state.setGridProperty(SIMSTATE_GRIDPROP_JPY, col, row, holeY);

			if (!storedCurrents)
			{
				setErrorString("Unable to store the number current");
				return false;
			}
		}
	}

	// Recombination rate: recombination factor times n*p, scaled
	for (int row = 0 ; row < m_height ; row++)
	{
		for (int col = 0 ; col < m_width ; col++)
		{
			const int pixel = col+row*m_width;
			const double rate = m_recombinationFactor[pixel] * (m_n[pixel] * m_p[pixel]) * m_npDensFactor/m_timeFactor;

			if (state.setGridProperty(SIMSTATE_GRIDPROP_R, col, row, rate))
				continue;

			setErrorString("Unable to store the recombination rate");
			return false;
		}
	}

	state.setRecombinationModel(SimulationState::Simple);

	if (!state.setDoubleProperty(SIMSTATE_PROP_VDIFF, m_deltaPhi))
	{
		setErrorString("Couldn't store potential difference");
		return false;
	}

	return true;
}

// Multiplies every element of 'grid' by 'multiplier', writes the results to
// the front of the scratch buffer 'tmp' and stores 'tmp' in 'dst' as grid
// property 'propID'.
//
// 'tmp' is grown when it is smaller than 'grid', so the scaling loop can never
// write past its end; it is never shrunk, so a correctly pre-sized buffer is
// passed on unchanged in size.
//
// Returns the success of SimulationState::setGridProperty as true/false.
bool Simulation2DDoubleGPURel::copyProperty(SimulationState &dst, int propID, std::vector<double> &tmp, const std::vector<double> &grid, double multiplier)
{
	const size_t num = grid.size();

	// Guard against an undersized scratch buffer (would be an out-of-bounds write)
	if (tmp.size() < num)
		tmp.resize(num);

	for (size_t i = 0 ; i < num ; i++)
		tmp[i] = grid[i]*multiplier;

	if (!dst.setGridProperty(propID, tmp))
		return false;

	return true;
}

// Single-precision variant: converts every element of 'grid' to double,
// multiplies it by 'multiplier', writes the results to the front of the
// scratch buffer 'tmp' and stores 'tmp' in 'dst' as grid property 'propID'.
//
// 'tmp' is grown when it is smaller than 'grid', so the scaling loop can never
// write past its end; it is never shrunk, so a correctly pre-sized buffer is
// passed on unchanged in size.
//
// Returns the success of SimulationState::setGridProperty as true/false.
bool Simulation2DDoubleGPURel::copyProperty(SimulationState &dst, int propID, std::vector<double> &tmp, const std::vector<float> &grid, double multiplier)
{
	const size_t num = grid.size();

	// Guard against an undersized scratch buffer (would be an out-of-bounds write)
	if (tmp.size() < num)
		tmp.resize(num);

	for (size_t i = 0 ; i < num ; i++)
		tmp[i] = ((double)grid[i])*multiplier;

	if (!dst.setGridProperty(propID, tmp))
		return false;

	return true;
}



