Kaydet (Commit) e791fbfc authored tarafından Haidong Lian's avatar Haidong Lian Kaydeden (comit) Kohei Yoshida

Patch for milestone1-0829-v4.

1. Add the parser based on RPN;
2. For test sample1 named "ground-water-daily.xls", using the compound formula
   to do calculation;
Add the compound kernels:
Formulae include "AVERAGE,MAX and MIN".Compound formulae include "AVERAGE
-(+,*,/)","MAX -(+,*,/)" and "MIN -(+,*,/)";
3. For formulae which do not work in GPU, they'll work in CPU;
4. For compound operators(-,+,*,/), they'll be calculated one by one in GPU as
the sequence of RPN;
5. Add the start and end position to fit for the sliding window;
6. Modify kernels by using vector for AMD GPU.

Conflicts:
	sc/source/core/opencl/formulagroupcl.cxx
	sc/source/core/opencl/openclwrapper.cxx

Change-Id: I6157008575ce89ddd3e7bf552a87812474af4125
üst ccf7b15c
...@@ -19,6 +19,10 @@ ...@@ -19,6 +19,10 @@
#include "openclwrapper.hxx" #include "openclwrapper.hxx"
#define SRCDATASIZE 100
#define SINGLEARRAYLEN 100
#define DOUBLEARRAYLEN 100
#define SVDOUBLELEN 100
namespace sc { namespace sc {
// A single public entry point for a factory function: // A single public entry point for a factory function:
...@@ -38,447 +42,918 @@ double getTimeDiff(const TimeValue& t1, const TimeValue& t2) ...@@ -38,447 +42,918 @@ double getTimeDiff(const TimeValue& t1, const TimeValue& t2)
}//dbg-t }//dbg-t
TimeValue aTimeBefore, aTimeAfter; TimeValue aTimeBefore, aTimeAfter;
/////////////////////////////////////// ///////////////////////////////////////
class SourceData
{
const double *mdpSrcPtr;
unsigned int mnDataSize;
const char *mcpFormulaName;
unsigned int mnCol;
int eOp;
public:
SourceData( const double *dpData, unsigned int nSize, uint nCol = 1,const char *cpFormulaName = NULL):mdpSrcPtr(dpData),mnDataSize(nSize),mcpFormulaName(cpFormulaName),mnCol(nCol)
{
}
SourceData():mdpSrcPtr(NULL),mnDataSize(0)
{
}
void setSrcPtr( const double *dpTmpDataPtr)
{
mdpSrcPtr = dpTmpDataPtr;
}
void setSrcSize( int nSize )
{
mnDataSize = nSize;
}
const double * getDouleData()
{
return mdpSrcPtr;
}
unsigned int getDataSize()
{
return mnDataSize;
}
void print()
{
for( uint i=0; i<mnDataSize; i++ )
printf( " The SourceData is %f and data size is %d\n",mdpSrcPtr[i],mnDataSize );
}
void printFormula()
{
printf("--------The formulaname is %s and the eOp is %d---------\n",mcpFormulaName,eOp);
}
void setFormulaName(const char *cpFormulaName)
{
this->mcpFormulaName = cpFormulaName;
}
const char *getFormulaName()
{
return mcpFormulaName;
}
void seteOp(int op)
{
this->eOp = op;
}
int geteOp()
{
return eOp;
}
int getColNum()
{
return mnCol;
}
};
class FormulaGroupInterpreterOpenCL : public FormulaGroupInterpreterSoftware class FormulaGroupInterpreterOpenCL : public FormulaGroupInterpreterSoftware
{ {
SourceData *mSrcDataStack[SRCDATASIZE];
unsigned int mnStackPointer,mnDoublePtrCount;
uint * mnpOclStartPos;
uint * mnpOclEndPos;
SingleVectorFormula *mSingleArray[SINGLEARRAYLEN];
DoubleVectorFormula *mDoubleArray[DOUBLEARRAYLEN];
double mdpSvdouble[SVDOUBLELEN];
double *mdpSrcDoublePtr[SVDOUBLELEN];
uint mnSingleCount;
uint mnDoubleCount;
uint mnSvDoubleCount;
uint mnOperatorGroup[100];
uint mnOperatorCount;
char mcHostName[100];
uint mnPositonLen;
size_t mnRowSize;
public: public:
FormulaGroupInterpreterOpenCL() : FormulaGroupInterpreterOpenCL() :
FormulaGroupInterpreterSoftware() FormulaGroupInterpreterSoftware()
{ {
OclCalc::InitEnv(); mnStackPointer = 0;
mnpOclEndPos = NULL;
mnpOclStartPos = NULL;
mnSingleCount = 0;
mnDoubleCount = 0;
mnSvDoubleCount = 0;
mnOperatorCount = 0;
mnPositonLen = 0;
mnDoublePtrCount = 0;
OclCalc::initEnv();
} }
virtual ~FormulaGroupInterpreterOpenCL() virtual ~FormulaGroupInterpreterOpenCL()
{ {
OclCalc::ReleaseOpenclRunEnv(); OclCalc::releaseOpenclRunEnv();
} }
virtual ScMatrixRef inverseMatrix(const ScMatrix& rMat); virtual ScMatrixRef inverseMatrix( const ScMatrix& rMat );
virtual bool interpret(ScDocument& rDoc, const ScAddress& rTopPos, virtual bool interpret( ScDocument& rDoc, const ScAddress& rTopPos,
const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode); const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode );
}; void collectDoublePointers( double *temp )
ScMatrixRef FormulaGroupInterpreterOpenCL::inverseMatrix(const ScMatrix& rMat)
{
SCSIZE nC, nR;
rMat.GetDimensions(nC, nR);
if (nC != nR || nC == 0)
// Input matrix must be square. Return an empty matrix on failure and
// the caller will calculate it via CPU.
return ScMatrixRef();
// This vector will contain a series of doubles from the first column to
// the last, chained together in a single array.
std::vector<double> aDoubles;
rMat.GetDoubleArray(aDoubles);
float * fpOclMatrixSrc = NULL;
float * fpOclMatrixDst = NULL;
double * dpOclMatrixSrc = NULL;
double * dpOclMatrixDst = NULL;
uint nMatrixSize = nC * nR;
static OclCalc aOclCalc;
if ( aOclCalc.GetOpenclState() )
{ {
if ( aOclCalc.gpuEnv.mnKhrFp64Flag == 1 || aOclCalc.gpuEnv.mnAmdFp64Flag == 1 ) if( mnDoublePtrCount < SRCDATASIZE )
{ {
aOclCalc.CreateBuffer64Bits( dpOclMatrixSrc, dpOclMatrixDst, nMatrixSize ); mdpSrcDoublePtr[mnDoublePtrCount++] = temp;
for ( uint i = 0; i < nC; i++ )
for ( uint j = 0; j < nR; j++ )
dpOclMatrixSrc[i*nC+j] = aDoubles[j*nR+i];
aOclCalc.OclHostMatrixInverse64Bits( "oclFormulaMtxInv", dpOclMatrixSrc, dpOclMatrixDst,aDoubles, nR );
} }
else else
{ {
aOclCalc.CreateBuffer32Bits( fpOclMatrixSrc, fpOclMatrixDst, nMatrixSize ); printf( "The mdpSrcDoublePtr is full now.\n" );
for ( uint i = 0; i < nC; i++ ) double *dtmp = NULL;
for ( uint j = 0; j < nR; j++ ) if ( (dtmp = mdpSrcDoublePtr[--mnDoublePtrCount]) != NULL )
fpOclMatrixSrc[i*nC+j] = (float) aDoubles[j*nR+i]; {
aOclCalc.OclHostMatrixInverse32Bits( "oclFormulaMtxInv", fpOclMatrixSrc, fpOclMatrixDst, aDoubles, nR ); free( dtmp );
dtmp = NULL;
}
} }
} }
// TODO: Inverse this matrix and put the result back into xInv. Right now, void freeDoublePointers()
// I'll just put the original, non-inversed matrix values back, just to
// demonstrate how to put the values back after inversion. There are two
// ways to put the values back (depending on what the GPU output is).
ScMatrixRef xInv(new ScMatrix(nR, nR, 0.0));
#if 0
// One way is to put the whole value as one array. This method assumes
// that the array size equals column x row, and is oriented column-wise.
// This method is slightly more efficient than the second, but I wouldn't
// expect too much of a difference.
xInv->PutDouble(&aDoubles[0], aDoubles.size(), 0, 0);
#else
// Another way is to put the values one column at a time.
const double* p = &aDoubles[0];
for (SCSIZE i = 0; i < nC; ++i)
{ {
xInv->PutDouble(p, nR, i, 0); while( mnDoublePtrCount > 0 )
p += nR; {
double *dtmp = NULL;
if ( (dtmp = mdpSrcDoublePtr[--mnDoublePtrCount]) != NULL )
{
free( dtmp );
dtmp = NULL;
}
}
} }
#endif
return xInv;
}
bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress& rTopPos,
const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode)
{
generateRPNCode(rDoc, rTopPos, rCode);
size_t rowSize = xGroup->mnLength; void srdDataPush( SourceData *temp )
fprintf(stderr,"rowSize at begin is ...%ld.\n",(long)rowSize);
// The row quantity can be gotten from p2->GetArrayLength()
uint nCount1 = 0, nCount2 = 0, nCount3 = 0;
int nOclOp = 0;
double *rResult = NULL; // Point to the output data from GPU
rResult = (double *)malloc(sizeof(double) * rowSize*2);// For 2 columns(B,C)
if(NULL==rResult)
{ {
printf("malloc err\n"); if( mnStackPointer < SRCDATASIZE )
return false; {
mSrcDataStack[mnStackPointer++] = temp;
} }
memset(rResult,0,rowSize); else
float * fpOclSrcData = NULL; // Point to the input data from CPU printf( "The stack is full now.\n" );
double * dpOclSrcData = NULL; }
uint * npOclStartPos = NULL; // The first position for calculation,for example,the A1 in (=MAX(A1:A100)) SourceData *srdDataPop( void )
uint * npOclEndPos = NULL; // The last position for calculation,for example, the A100 in (=MAX(A1:A100))
float * fpLeftData = NULL; // Left input for binary operator(+,-,*,/),for example,(=leftData+rightData)
float * fpRightData = NULL; // Right input for binary operator(+,-,*,/),for example,(=leftData/rightData)
// The rightData can't be zero for "/"
double * dpLeftData = NULL;
double * dpRightData = NULL;
float * fpSaveData=NULL; //It is a temp pointer point the preparing memory;
float * fpSumProMergeLfData = NULL; //It merge the more col to one col is the left operator
float * fpSumProMergeRtData = NULL; //It merge the more col to one col is the right operator
double * dpSaveData=NULL;
double * dpSumProMergeLfData = NULL;
double * dpSumProMergeRtData = NULL;
uint * npSumSize=NULL; //It is a array to save the matix sizt(col *row)
int nSumproductSize=0; //It is the merge array size
bool aIsAlloc=false; //It is a flag to judge the fpSumProMergeLfData existed
unsigned int nCountMatix=0; //It is a count to save the calculate times
static OclCalc ocl_calc;
bool isSumProduct=false;
if(ocl_calc.GetOpenclState())
{ {
// Don't know how large the size will be applied previously, so create them as the rowSize or 65536 if( mnStackPointer <= 0 )
// Don't know which formulae will be used previously, so create buffers for different formulae used probably
if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
{ {
ocl_calc.CreateBuffer64Bits(dpOclSrcData,npOclStartPos,npOclEndPos,rowSize); printf( "The stack was empty\n" );
ocl_calc.CreateBuffer64Bits(dpLeftData,dpRightData,rowSize); return NULL;
} }
else return mSrcDataStack[--mnStackPointer];
}
unsigned int getDataSize()
{ {
ocl_calc.CreateBuffer32Bits(fpOclSrcData,npOclStartPos,npOclEndPos,rowSize); return mnStackPointer;
ocl_calc.CreateBuffer32Bits(fpLeftData,fpRightData,rowSize);
} }
//printf("pptrr is %d,%d,%d\n",fpOclSrcData,npOclStartPos,npOclEndPos); void printStackInfo()
{
printf( "/********The stack size is %d*********\\\n",mnStackPointer );
for ( int i = mnStackPointer - 1; i >= 0; i-- )
mSrcDataStack[i]->print();
} }
/////////////////////////////////////////////////////////////////////////////////////////// bool getPosition(const ScTokenArray& rCode,const ScFormulaCellGroupRef& xGroup,uint nRowSize,uint *&npOclStartPos,uint *&npOclEndPos,uint *nPositonLen);
bool chooseFunction(OclCalc &ocl_calc,double *&dpResult);
// Until we implement group calculation for real, decompose the group into bool isStockHistory();
// individual formula token arrays for individual calculation. bool isGroundWater();
ScAddress aTmpPos = rTopPos; };
for (sal_Int32 i = 0; i < xGroup->mnLength; ++i) bool FormulaGroupInterpreterOpenCL::getPosition(const ScTokenArray& rCode,const ScFormulaCellGroupRef& xGroup,uint nRowSize,uint *&npOclStartPos,uint *&npOclEndPos,uint *nPositonLen)
{
uint nColPosition = 0;
ScTokenArray * rCodePos = rCode.Clone();
static int nCountPosSize = nRowSize;
bool isAllocFormulaOclBuf = true;
for ( const formula::FormulaToken* p = rCodePos->First(); p; p = rCodePos->Next() )
{ {
aTmpPos.SetRow(xGroup->mnStart + i); switch ( p->GetType() )
ScTokenArray aCode2;
for (const formula::FormulaToken* p = rCode.First(); p; p = rCode.Next())
{ {
switch (p->GetType()) case formula::svDoubleVectorRef:
{ {
case formula::svSingleVectorRef: nColPosition++;
break;
}
}
}
int nPositionSize = nColPosition * nRowSize;
npOclStartPos = (unsigned int*) malloc( nPositionSize * sizeof(unsigned int) );
npOclEndPos = (unsigned int*) malloc( nPositionSize * sizeof(unsigned int) );
if ( nCountPosSize < nPositionSize )
{ {
const formula::SingleVectorRefToken* p2 = static_cast<const formula::SingleVectorRefToken*>(p); nCountPosSize = nPositionSize;
const double* pArray = p2->GetArray(); isAllocFormulaOclBuf = false;
aCode2.AddDouble(static_cast<size_t>(i) < p2->GetArrayLength() ? pArray[i] : 0.0);
} }
break; for ( sal_Int32 i = 0; i < xGroup->mnLength; ++i )
{
ScTokenArray * rCodeTemp = rCode.Clone();
int j = 0;
for ( const formula::FormulaToken* p = rCodeTemp->First(); p; p = rCodeTemp->Next() )
{
switch (p->GetType())
{
case formula::svDoubleVectorRef: case formula::svDoubleVectorRef:
{ {
const formula::DoubleVectorRefToken* p2 = static_cast<const formula::DoubleVectorRefToken*>(p); const formula::DoubleVectorRefToken* p2 = static_cast<const formula::DoubleVectorRefToken*>(p);
const std::vector<const double*>& rArrays = p2->GetArrays();
size_t nColSize = rArrays.size();
size_t nRowStart = p2->IsStartFixed() ? 0 : i; size_t nRowStart = p2->IsStartFixed() ? 0 : i;
size_t nRowEnd = p2->GetRefRowSize() - 1; size_t nRowEnd = p2->GetRefRowSize() - 1;
if (!p2->IsEndFixed()) if (!p2->IsEndFixed())
nRowEnd += i; nRowEnd += i;
size_t nRowSize = nRowEnd - nRowStart + 1; npOclStartPos[j*nRowSize+i] = nRowStart;//record the start position
//store the a matix`s rowsize and colsize,use it to calculate the matix`s size npOclEndPos[j*nRowSize+i] = nRowEnd;//record the end position
ocl_calc.nFormulaRowSize = nRowSize; j++;
ocl_calc.nFormulaColSize = nColSize; }
ScMatrixRef pMat(new ScMatrix(nColSize, nRowSize, 0.0)); }
if(ocl_calc.GetOpenclState())
{
npOclStartPos[i] = nRowStart; // record the start position
npOclEndPos[i] = nRowEnd; // record the end position
}
int nTempOpcode;
const formula::FormulaToken* pTemp = p;
pTemp=aCode2.Next();
nTempOpcode=pTemp->GetOpCode();
while(1)
{
nTempOpcode=pTemp->GetOpCode();
if(nTempOpcode!=ocOpen && nTempOpcode!=ocPush)
break;
pTemp=aCode2.Next();
} }
if((!aIsAlloc) && (ocl_calc.GetOpenclState())&& (nTempOpcode == ocSumProduct)) }
*nPositonLen = nPositionSize;
//Now the pos array is 0 1 2 3 4 5 0 1 2 3 4 5;
return isAllocFormulaOclBuf;
}
bool FormulaGroupInterpreterOpenCL::isStockHistory()
{
bool isHistory = false;
if( (mnOperatorGroup[0]== 224) && (mnOperatorGroup[1]== 227) && (mnOperatorGroup[2]== 41) && (mnOperatorGroup[3]== 43) && (mnOperatorGroup[4]== 41) )
{ {
//nColSize * rowSize is the data size , but except the the head of data will use less the nRowSize strcpy(mcHostName,"OclOperationColumnN");
//the other all use nRowSize times . and it must aligen so add nRowSize-1. isHistory = true;
nSumproductSize = nRowSize+nColSize * rowSize*nRowSize-1;
if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
ocl_calc.CreateBuffer64Bits(dpSumProMergeLfData,dpSumProMergeRtData,npSumSize,nSumproductSize,rowSize);
else
ocl_calc.CreateBuffer32Bits(fpSumProMergeLfData,fpSumProMergeRtData,npSumSize,nSumproductSize,rowSize);
aIsAlloc = true;
isSumProduct=true;
} }
if(isSumProduct) else if( (mnOperatorGroup[0] == 226) && (mnOperatorGroup[1] == 42) )
{ {
if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1) strcpy(mcHostName,"OclOperationColumnH");
isHistory = true;
}
else if((mnOperatorGroup[0] == 213) && (mnOperatorGroup[1] == 43) && (mnOperatorGroup[2] == 42) )
{ {
if(nCountMatix%2==0) strcpy(mcHostName,"OclOperationColumnJ");
dpSaveData = dpSumProMergeLfData; isHistory = true;
else
dpSaveData = dpSumProMergeRtData;
} }
else return isHistory;
}
bool FormulaGroupInterpreterOpenCL::isGroundWater()
{
bool GroundWater=false;
if((mnOperatorGroup[0] == ocAverage && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocMax && 1 == mnSingleCount )||
(mnOperatorGroup[0] == ocMin && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocSub && mnSvDoubleCount==1))
{ {
if(nCountMatix%2==0) GroundWater = true;
fpSaveData = fpSumProMergeLfData;
else
fpSaveData = fpSumProMergeRtData;
} }
return GroundWater;
}
bool FormulaGroupInterpreterOpenCL::chooseFunction( OclCalc &ocl_calc, double *&dpResult )
{
const double * dpOclSrcData = NULL;
unsigned int nSrcDataSize = 0;
const double *dpLeftData = NULL;
const double *dpRightData = NULL;
if((mnOperatorGroup[0] == ocAverage && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocMax && 1 == mnSingleCount )||
(mnOperatorGroup[0] == ocMin && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocSub && mnSvDoubleCount==1))
{
double delta = 0.0;
const double *pArrayToSubtractOneElementFrom;
const double *pGroundWaterDataArray;
uint nSrcData = 0;
if( mnSvDoubleCount!=1 )
{
pArrayToSubtractOneElementFrom= mSingleArray[0]->mdpInputLeftData;
pGroundWaterDataArray= mDoubleArray[0]->mdpInputData;
nSrcData = mDoubleArray[0]->mnInputDataSize;
} }
for (size_t nCol = 0; nCol < nColSize; ++nCol) else
{ {
const double* pArray = rArrays[nCol]; pArrayToSubtractOneElementFrom= mSingleArray[0]->mdpInputLeftData;
if( NULL==pArray ) pGroundWaterDataArray=NULL;
delta = mdpSvdouble[0];
}
ocl_calc.oclGroundWaterGroup( mnOperatorGroup,mnOperatorCount,pGroundWaterDataArray,pArrayToSubtractOneElementFrom,nSrcData,mnRowSize,delta,mnpOclStartPos,mnpOclEndPos,dpResult);
}
else if( isStockHistory() )
{ {
fprintf(stderr,"Error: pArray is NULL!\n");
free(rResult);
return false; return false;
} }
if(ocl_calc.GetOpenclState()) else if(((mnSvDoubleCount==0)&&(mnSingleCount==0)&&(mnDoubleCount==1)) &&
((mnOperatorGroup[0] == ocAverage)||(mnOperatorGroup[0] == ocMax)||(mnOperatorGroup[0] == ocMin)))
{ {
for( size_t u=nRowStart; u<=nRowEnd; u++ ) if(mnOperatorGroup[0] == ocAverage)
strcpy(mcHostName,"oclFormulaAverage");
if(mnOperatorGroup[0] == ocMax)
strcpy(mcHostName,"oclFormulaMax");
if(mnOperatorGroup[0] == ocMin)
strcpy(mcHostName,"oclFormulaMin");
DoubleVectorFormula * doubleTemp = mDoubleArray[--mnDoubleCount];
nSrcDataSize = doubleTemp->mnInputDataSize;
dpOclSrcData = doubleTemp->mdpInputData;
if ( ocl_calc.getOpenclState())
{ {
if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1) if ( ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 )
{ {
dpOclSrcData[u] = pArray[u]; ocl_calc.createFormulaBuf64Bits( nSrcDataSize, mnRowSize );
//fprintf(stderr,"dpOclSrcData[%d] is %f.\n",u,dpOclSrcData[u]); ocl_calc.mapAndCopy64Bits( dpOclSrcData,mnpOclStartPos,mnpOclEndPos,nSrcDataSize,mnRowSize );
if(isSumProduct) ocl_calc.oclHostFormulaStatistics64Bits( mcHostName, dpResult, mnRowSize );
dpSaveData[u+nRowSize*nCol + nRowStart* nColSize * nRowSize-nRowStart] = pArray[u];
} }
else else
{ {
// Many video cards can't support double type in kernel, so need transfer the double to float ocl_calc.createFormulaBuf32Bits( nSrcDataSize, mnPositonLen );
fpOclSrcData[u] = (float)pArray[u]; ocl_calc.mapAndCopy32Bits( dpOclSrcData, mnpOclStartPos, mnpOclEndPos, nSrcDataSize, mnRowSize);
//fprintf(stderr,"fpOclSrcData[%d] is %f.\n",u,fpOclSrcData[u]); ocl_calc.oclHostFormulaStatistics32Bits( mcHostName, dpResult, mnRowSize );
if(isSumProduct)
fpSaveData[u+nRowSize*nCol + nRowStart* nColSize * nRowSize-nRowStart] = (float)pArray[u];
} }
} }
} }
else if((mnSvDoubleCount==0)&&(mnSingleCount==1)&&(mnDoubleCount==0))
for (size_t nRow = 0; nRow < nRowSize; ++nRow)
{ {
if (nRowStart + nRow < p2->GetArrayLength()) dpLeftData = mSingleArray[0]->mdpInputLeftData;
dpRightData = mSingleArray[0]->mdpInputRightData;
if(mnOperatorGroup[0] == ocAdd)
strcpy(mcHostName,"oclSignedAdd");
if(mnOperatorGroup[0] == ocSub)
strcpy(mcHostName,"oclSignedSub");
if(mnOperatorGroup[0] == ocMul)
strcpy(mcHostName,"oclSignedMul");
if(mnOperatorGroup[0] == ocDiv)
strcpy(mcHostName,"oclSignedDiv");
if ( ocl_calc.getOpenclState())
{ {
double fVal = pArray[nRowStart+nRow]; if ( ocl_calc.gpuEnv.mnKhrFp64Flag == 1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 )
pMat->PutDouble(fVal, nCol, nRow); {
} ocl_calc.createArithmeticOptBuf64Bits( mnRowSize );
} ocl_calc.mapAndCopy64Bits(dpLeftData,dpRightData,mnRowSize);
ocl_calc.oclHostArithmeticOperator64Bits( mcHostName,dpResult,mnRowSize );
} }
else
ScMatrixToken aTok(pMat);
aCode2.AddToken(aTok);
if(isSumProduct)
{ {
npSumSize[nCountMatix/2] =nRowSize*nColSize; ocl_calc.createArithmeticOptBuf32Bits( mnRowSize );
nCountMatix++; ocl_calc.mapAndCopy32Bits(dpLeftData,dpRightData,mnRowSize);
ocl_calc.oclHostArithmeticOperator32Bits( mcHostName,dpResult,mnRowSize );
} }
} }
break;
default:
aCode2.AddToken(*p);
} }
else if( (mnSingleCount>1) && (mnSvDoubleCount==0) && (mnDoubleCount==0) )
{
const double* dpArray[100] = {};
int j=0;
for( uint i = 0; i < mnSingleCount; i++ )
{
dpArray[j++] = mSingleArray[i]->mdpInputLeftData;
if( NULL != mSingleArray[i]->mdpInputRightData )
dpArray[j++] = mSingleArray[i]->mdpInputRightData;
} }
double *dpMoreColArithmetic = (double *)malloc( sizeof(double) * j * mnRowSize );
ScFormulaCell* pDest = rDoc.GetFormulaCell(aTmpPos); if( NULL == dpMoreColArithmetic )
if (!pDest)
{ {
free(rResult); printf("Memory alloc error!\n");
return false; return false;
} }
if(ocl_calc.GetOpenclState()) for( uint i = 0; i < j*mnRowSize; i++ )
{
const formula::FormulaToken *pCur = aCode2.First();
aCode2.Reset();
while( ( pCur = aCode2.Next() ) != NULL )
{ {
OpCode eOp = pCur->GetOpCode(); dpMoreColArithmetic[i] = dpArray[i/mnRowSize][i%mnRowSize];
if(eOp==0) }
if ( ocl_calc.getOpenclState())
{ {
if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1) if ( ocl_calc.gpuEnv.mnKhrFp64Flag == 1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 )
{ {
if(nCount3%2==0) ocl_calc.createMoreColArithmeticBuf64Bits( j * mnRowSize, mnOperatorCount );
dpLeftData[nCount1++] = pCur->GetDouble(); ocl_calc.mapAndCopyMoreColArithmetic64Bits( dpMoreColArithmetic, mnRowSize * j, mnOperatorGroup, mnOperatorCount );
else ocl_calc.oclMoreColHostArithmeticOperator64Bits( mnRowSize, mnOperatorCount, dpResult,mnRowSize );
dpRightData[nCount2++] = pCur->GetDouble();
nCount3++;
} }
else else
{ {
if(nCount3%2==0) ocl_calc.createMoreColArithmeticBuf32Bits( j* mnRowSize, mnOperatorCount );
fpLeftData[nCount1++] = (float)pCur->GetDouble(); ocl_calc.mapAndCopyMoreColArithmetic32Bits(dpMoreColArithmetic, mnRowSize * j, mnOperatorGroup, mnOperatorCount);
else ocl_calc.oclMoreColHostArithmeticOperator32Bits( mnRowSize, mnOperatorCount, dpResult, mnRowSize );
fpRightData[nCount2++] = (float)pCur->GetDouble();
nCount3++;
}
} }
else if( eOp!=ocOpen && eOp!=ocClose &&eOp != ocSep)
nOclOp = eOp;
// if(count1>0){//dbg
// fprintf(stderr,"leftData is %f.\n",fpLeftData[count1-1]);
// count1--;
// }
// if(count2>0){//dbg
// fprintf(stderr,"rightData is %f.\n",fpRightData[count2-1]);
// count2--;
// }
} }
} }
else
if(!getenv("SC_GPU")||!ocl_calc.GetOpenclState())
{ {
//fprintf(stderr,"ccCPU flow...\n\n"); return false;
generateRPNCode(rDoc, aTmpPos, aCode2);
ScInterpreter aInterpreter(pDest, &rDoc, aTmpPos, aCode2);
aInterpreter.Interpret();
pDest->SetResultToken(aInterpreter.GetResultToken().get());
pDest->ResetDirty();
pDest->SetChanged(true);
} }
} // for loop end (xGroup->mnLength) return true;
}
// For GPU calculation class agency
if(getenv("SC_GPU")&&ocl_calc.GetOpenclState()) {
{ public:
fprintf(stderr,"ggGPU flow...\n\n"); double *calculate(int nOclOp,int rowSize,OclCalc &ocl_calc,uint *npOclStartPos,uint *npOclEndPos,FormulaGroupInterpreterOpenCL *formulaInterprt);
printf(" oclOp is... %d\n",nOclOp); };
osl_getSystemTime(&aTimeBefore); //timer
if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1) double * agency::calculate( int nOclOp,int rowSize,OclCalc &ocl_calc,uint *npOclStartPos,uint *npOclEndPos,FormulaGroupInterpreterOpenCL *formulaInterprt)
{
const double *dpLeftData = NULL;
const double *dpRightData = NULL;
const double *dpOclSrcData=NULL;
if ( ocl_calc.gpuEnv.mnKhrFp64Flag == 1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 )
{ {
fprintf(stderr,"ggGPU double precision flow...\n\n"); switch( nOclOp )
//double precision
switch(nOclOp)
{ {
case ocAdd: case ocAdd:
ocl_calc.OclHostArithmeticOperator64Bits("oclSignedAdd",dpLeftData,dpRightData,rResult,nCount1); {
unsigned int nDataSize = 0;
SourceData *temp = formulaInterprt->srdDataPop();
SourceData *temp2 = formulaInterprt->srdDataPop();
nDataSize = temp2->getDataSize();
dpLeftData = temp2->getDouleData();
dpRightData = temp->getDouleData();
nDataSize = temp2->getDataSize();
double *rResult = NULL; // Point to the output data from GPU
rResult = (double *)malloc( sizeof(double) * nDataSize );
memset(rResult,0,rowSize);
ocl_calc.oclHostArithmeticStash64Bits( "oclSignedAdd",dpLeftData,dpRightData,rResult,temp->getDataSize() );
formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) );
break; break;
}
case ocSub: case ocSub:
ocl_calc.OclHostArithmeticOperator64Bits("oclSignedSub",dpLeftData,dpRightData,rResult,nCount1); {
unsigned int nDataSize = 0;
SourceData *temp = formulaInterprt->srdDataPop();
SourceData *temp2 = formulaInterprt->srdDataPop();
nDataSize = temp2->getDataSize();
dpLeftData = temp2->getDouleData();
dpRightData = temp->getDouleData();
nDataSize = temp2->getDataSize();
double *rResult = NULL; // Point to the output data from GPU
rResult = ( double * )malloc( sizeof(double) * nDataSize );
memset( rResult,0,rowSize );
ocl_calc.oclHostArithmeticStash64Bits( "oclSignedSub",dpLeftData,dpRightData,rResult,temp->getDataSize() );
formulaInterprt->srdDataPush( new SourceData(rResult,nDataSize) );
break; break;
}
case ocMul: case ocMul:
ocl_calc.OclHostArithmeticOperator64Bits("oclSignedMul",dpLeftData,dpRightData,rResult,nCount1); {
unsigned int nDataSize = 0;
SourceData *temp = formulaInterprt->srdDataPop();
SourceData *temp2 = formulaInterprt->srdDataPop();
nDataSize = temp2->getDataSize();
dpLeftData = temp2->getDouleData();
dpRightData = temp->getDouleData();
nDataSize = temp2->getDataSize();
double *rResult = NULL; // Point to the output data from GPU
rResult = (double *)malloc( sizeof(double) * nDataSize );
memset( rResult,0,rowSize );
ocl_calc.oclHostArithmeticStash64Bits( "oclSignedMul",dpLeftData,dpRightData,rResult,temp->getDataSize() );
formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) );
break; break;
}
case ocDiv: case ocDiv:
ocl_calc.OclHostArithmeticOperator64Bits("oclSignedDiv",dpLeftData,dpRightData,rResult,nCount1); {
unsigned int nDataSize = 0;
SourceData *temp = formulaInterprt->srdDataPop();
SourceData *temp2 = formulaInterprt->srdDataPop();
nDataSize = temp2->getDataSize();
dpLeftData = temp2->getDouleData();
dpRightData = temp->getDouleData();
nDataSize = temp2->getDataSize();
double *rResult = NULL; // Point to the output data from GPU
rResult = ( double * )malloc( sizeof(double) * nDataSize );
memset( rResult,0,rowSize );
ocl_calc.oclHostArithmeticStash64Bits( "oclSignedDiv",dpLeftData,dpRightData,rResult,temp->getDataSize() );
formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) );
break; break;
}
case ocMax: case ocMax:
ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaMax",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize); {
unsigned int nDataSize = 0;
SourceData *temp = formulaInterprt->srdDataPop();
nDataSize = temp->getDataSize();
dpOclSrcData = temp->getDouleData();
double *rResult = NULL; // Point to the output data from GPU
rResult = (double *)malloc( sizeof(double) * rowSize );
memset( rResult,0,rowSize );
ocl_calc.oclHostFormulaStash64Bits( "oclFormulaMax",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,nDataSize,rowSize );
formulaInterprt->srdDataPush( new SourceData( rResult,rowSize ) );
break; break;
}
case ocMin: case ocMin:
ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaMin",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize); {
unsigned int nDataSize = 0;
SourceData *temp = formulaInterprt->srdDataPop();
nDataSize = temp->getDataSize();
dpOclSrcData = temp->getDouleData();
double *rResult = NULL; // Point to the output data from GPU
rResult = (double *)malloc( sizeof(double) * rowSize );
memset( rResult,0,rowSize );
ocl_calc.oclHostFormulaStash64Bits( "oclFormulaMin",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,nDataSize,rowSize );
formulaInterprt->srdDataPush( new SourceData( rResult,rowSize ) );
break; break;
}
case ocAverage: case ocAverage:
ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaAverage",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize); {
break; unsigned int nDataSize = 0;
case ocSum: SourceData *temp = formulaInterprt->srdDataPop();
ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaSum",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize); nDataSize = temp->getDataSize();
break; dpOclSrcData = temp->getDouleData();
case ocCount: double *rResult = NULL; // Point to the output data from GPU
ocl_calc.OclHostFormulaCount64Bits(npOclStartPos,npOclEndPos,rResult,rowSize); rResult = (double *)malloc( sizeof(double) * rowSize );
break; memset( rResult,0,rowSize );
case ocSumProduct: ocl_calc.oclHostFormulaStash64Bits( "oclFormulaAverage",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,nDataSize,rowSize );
ocl_calc.OclHostFormulaSumProduct64Bits(dpSumProMergeLfData,dpSumProMergeRtData,npSumSize,rResult,rowSize); formulaInterprt->srdDataPush( new SourceData( rResult,rowSize ) );
break; break;
}
default: default:
fprintf(stderr,"No OpenCL function for this calculation.\n"); fprintf( stderr,"No OpenCL function for this calculation.\n" );
break; break;
} }
} }
else else
{ {
fprintf(stderr,"ggGPU float precision flow...\n\n"); switch( nOclOp )
//float precision
switch(nOclOp)
{ {
case ocAdd: case ocAdd:
ocl_calc.OclHostArithmeticOperator32Bits("oclSignedAdd",fpLeftData,fpRightData,rResult,nCount1); {
unsigned int nDataSize = 0;
SourceData *temp = formulaInterprt->srdDataPop();
SourceData *temp2 = formulaInterprt->srdDataPop();
nDataSize = temp2->getDataSize();
dpLeftData = temp2->getDouleData();
dpRightData = temp->getDouleData();
nDataSize = temp2->getDataSize();
double *rResult = NULL; // Point to the output data from GPU
rResult = (double *)malloc( sizeof(double) * nDataSize );
memset(rResult,0,rowSize);
ocl_calc.oclHostArithmeticStash32Bits( "oclSignedAdd", dpLeftData, dpRightData, rResult, temp->getDataSize() );
formulaInterprt->srdDataPush( new SourceData(rResult, nDataSize) );
break; break;
}
case ocSub: case ocSub:
ocl_calc.OclHostArithmeticOperator32Bits("oclSignedSub",fpLeftData,fpRightData,rResult,nCount1); {
unsigned int nDataSize = 0;
SourceData *temp = formulaInterprt->srdDataPop();
SourceData *temp2 = formulaInterprt->srdDataPop();
nDataSize = temp2->getDataSize();
dpLeftData = temp2->getDouleData();
dpRightData = temp->getDouleData();
nDataSize = temp2->getDataSize();
double *rResult = NULL; // Point to the output data from GPU
rResult = (double *)malloc( sizeof(double) * nDataSize );
memset( rResult, 0, rowSize );
ocl_calc.oclHostArithmeticStash32Bits( "oclSignedSub", dpLeftData, dpRightData, rResult, temp->getDataSize() );
formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) );
break; break;
}
case ocMul: case ocMul:
ocl_calc.OclHostArithmeticOperator32Bits("oclSignedMul",fpLeftData,fpRightData,rResult,nCount1); {
unsigned int nDataSize = 0;
SourceData *temp = formulaInterprt->srdDataPop();
SourceData *temp2 = formulaInterprt->srdDataPop();
nDataSize = temp2->getDataSize();
dpLeftData = temp2->getDouleData();
dpRightData = temp->getDouleData();
nDataSize = temp2->getDataSize();
double *rResult = NULL; // Point to the output data from GPU
rResult = (double *)malloc(sizeof(double) * nDataSize );
memset( rResult, 0, rowSize );
ocl_calc.oclHostArithmeticStash32Bits( "oclSignedMul", dpLeftData, dpRightData, rResult, temp->getDataSize() );
formulaInterprt->srdDataPush( new SourceData( rResult, nDataSize ) );
break; break;
}
case ocDiv: case ocDiv:
ocl_calc.OclHostArithmeticOperator32Bits("oclSignedDiv",fpLeftData,fpRightData,rResult,nCount1); {
unsigned int nDataSize = 0;
SourceData *temp = formulaInterprt->srdDataPop();
SourceData *temp2 = formulaInterprt->srdDataPop();
nDataSize = temp2->getDataSize();
dpLeftData = temp2->getDouleData();
dpRightData = temp->getDouleData();
nDataSize = temp2->getDataSize();
double *rResult = NULL; // Point to the output data from GPU
rResult = (double *)malloc( sizeof(double) * nDataSize );
memset( rResult, 0, rowSize );
ocl_calc.oclHostArithmeticStash32Bits( "oclSignedDiv", dpLeftData, dpRightData, rResult, temp->getDataSize() );
formulaInterprt->srdDataPush( new SourceData(rResult, nDataSize) );
break; break;
}
case ocMax: case ocMax:
ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaMax",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize); {
unsigned int nDataSize = 0;
SourceData *temp = formulaInterprt->srdDataPop();
nDataSize = temp->getDataSize();
dpOclSrcData = temp->getDouleData();
double *rResult = NULL; // Point to the output data from GPU
rResult = (double *)malloc(sizeof(double) * nDataSize );
memset(rResult,0,rowSize);
ocl_calc.oclHostFormulaStash32Bits( "oclFormulaMax", dpOclSrcData, npOclStartPos, npOclEndPos, rResult,nDataSize, rowSize );
formulaInterprt->srdDataPush( new SourceData( rResult, rowSize ) );
break; break;
}
case ocMin: case ocMin:
ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaMin",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize); {
unsigned int nDataSize = 0;
SourceData *temp = formulaInterprt->srdDataPop();
nDataSize = temp->getDataSize();
dpOclSrcData = temp->getDouleData();
double *rResult = NULL; // Point to the output data from GPU
rResult = (double *)malloc( sizeof(double) * nDataSize );
memset( rResult, 0, rowSize );
ocl_calc.oclHostFormulaStash32Bits( "oclFormulaMin", dpOclSrcData, npOclStartPos, npOclEndPos, rResult, nDataSize, rowSize );
formulaInterprt->srdDataPush( new SourceData( rResult, rowSize) );
break; break;
}
case ocAverage: case ocAverage:
ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaAverage",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize); {
break; unsigned int nDataSize = 0;
case ocSum: SourceData *temp = formulaInterprt->srdDataPop();
ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaSum",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize); nDataSize = temp->getDataSize();
break; dpOclSrcData = temp->getDouleData();
case ocCount: double *rResult = NULL; // Point to the output data from GPU
ocl_calc.OclHostFormulaCount32Bits(npOclStartPos,npOclEndPos,rResult,rowSize); rResult = (double *)malloc( sizeof(double) * nDataSize );
break; memset( rResult, 0, rowSize);
case ocSumProduct: ocl_calc.oclHostFormulaStash32Bits( "oclFormulaAverage", dpOclSrcData, npOclStartPos, npOclEndPos, rResult, nDataSize, rowSize );
ocl_calc.OclHostFormulaSumProduct32Bits(fpSumProMergeLfData,fpSumProMergeRtData,npSumSize,rResult,rowSize); formulaInterprt->srdDataPush( new SourceData( rResult, rowSize) );
break; break;
}
default: default:
fprintf(stderr,"No OpenCL function for this calculation.\n"); fprintf(stderr,"No OpenCL function for this calculation.\n");
break; break;
} }
} }
return NULL;
}
ScMatrixRef FormulaGroupInterpreterOpenCL::inverseMatrix( const ScMatrix& rMat )
{
SCSIZE nC, nR;
rMat.GetDimensions( nC, nR );
if ( nC != nR || nC == 0 )
// Input matrix must be square. Return an empty matrix on failure and
// the caller will calculate it via CPU.
return ScMatrixRef();
// This vector will contain a series of doubles from the first column to
// the last, chained together in a single array.
std::vector<double> aDoubles;
rMat.GetDoubleArray(aDoubles);
///////////////////////////////////////////////////// float * fpOclMatrixSrc = NULL;
osl_getSystemTime(&aTimeAfter); float * fpOclMatrixDst = NULL;
double diff = getTimeDiff(aTimeAfter, aTimeBefore); double * dpOclMatrixSrc = NULL;
//if (diff >= 1.0) double * dpOclMatrixDst = NULL;
uint nMatrixSize = nC * nR;
static OclCalc aOclCalc;
if ( aOclCalc.getOpenclState() )
{
if ( aOclCalc.gpuEnv.mnKhrFp64Flag == 1 || aOclCalc.gpuEnv.mnAmdFp64Flag == 1 )
{ {
fprintf(stderr,"OpenCL,diff...%f.\n",diff); aOclCalc.createBuffer64Bits( dpOclMatrixSrc, dpOclMatrixDst, nMatrixSize );
for ( uint i = 0; i < nC; i++ )
for ( uint j = 0; j < nR; j++ )
dpOclMatrixSrc[i*nC+j] = aDoubles[j*nR+i];
aOclCalc.oclHostMatrixInverse64Bits( "oclFormulaMtxInv", dpOclMatrixSrc, dpOclMatrixDst,aDoubles, nR );
}
else
{
aOclCalc.createBuffer32Bits( fpOclMatrixSrc, fpOclMatrixDst, nMatrixSize );
for ( uint i = 0; i < nC; i++ )
for ( uint j = 0; j < nR; j++ )
fpOclMatrixSrc[i*nC+j] = (float) aDoubles[j*nR+i];
aOclCalc.oclHostMatrixInverse32Bits( "oclFormulaMtxInv", fpOclMatrixSrc, fpOclMatrixDst, aDoubles, nR );
}
} }
/////////////////////////////////////////////////////
//rResult[i]; // TODO: Inverse this matrix and put the result back into xInv. Right now,
// for(sal_Int32 i = 0; i < rowSize; ++i){//dbg output results // I'll just put the original, non-inversed matrix values back, just to
// fprintf(stderr,"After GPU,rRsults[%d] is ...%f\n",i,rResult[i]); // demonstrate how to put the values back after inversion. There are two
// } // ways to put the values back (depending on what the GPU output is).
ScMatrixRef xInv(new ScMatrix(nR, nR, 0.0));
// Insert the double data, in rResult[i] back into the document #if 0
rDoc.SetFormulaResults(rTopPos, rResult, xGroup->mnLength); // One way is to put the whole value as one array. This method assumes
// that the array size equals column x row, and is oriented column-wise.
// This method is slightly more efficient than the second, but I wouldn't
// expect too much of a difference.
xInv->PutDouble(&aDoubles[0], aDoubles.size(), 0, 0);
#else
// Another way is to put the values one column at a time.
const double* p = &aDoubles[0];
for( SCSIZE i = 0; i < nC; ++i )
{
xInv->PutDouble( p, nR, i, 0 );
p += nR;
} }
#endif
return xInv;
}
bool FormulaGroupInterpreterOpenCL::interpret( ScDocument& rDoc, const ScAddress& rTopPos,
const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode )
{
generateRPNCode( rDoc, rTopPos, rCode );
mnRowSize = xGroup->mnLength;
fprintf( stderr,"mnRowSize at begin is ...%ld.\n",(long)mnRowSize );
// The row quantity can be gotten from p2->GetArrayLength()
int nOclOp = 0;
const double * dpOclSrcData = NULL;
const double * dpBinaryData = NULL;
static OclCalc ocl_calc;
unsigned int nSrcDataSize = 0;
free(rResult); const double *dpResult = NULL;
double *pResult = (double *)malloc(sizeof(double) * mnRowSize);
double *dpSvDouble = NULL;
bool isSample = false;
mnSingleCount = 0;
mnDoubleCount = 0;
mnSvDoubleCount = 0;
mnOperatorCount = 0;
mnPositonLen = 0;
if ( ocl_calc.getOpenclState() )
{
getPosition(rCode,xGroup,mnRowSize,mnpOclStartPos,mnpOclEndPos,&mnPositonLen);
const formula::FormulaToken* p = rCode.FirstRPN();
bool isSingle = false;
int nCountNum=0;
do
{
if ( ocPush != p->GetOpCode())
{
nOclOp = p->GetOpCode();
mnOperatorGroup[mnOperatorCount++] = nOclOp;
}
else if( ocPush == p->GetOpCode() && formula::svSingleVectorRef == p->GetType() )
{
mnSingleCount++;
}
if ( ocPush == p->GetOpCode() && formula::svDouble == p->GetType() )
{
mnSvDoubleCount++;
}
} while ( NULL != ( p = rCode.NextRPN() ) );
if( isGroundWater() )
{
isSample = true;
}
mnOperatorCount = 0;
mnSingleCount = 0;
mnSvDoubleCount = 0;
p = rCode.FirstRPN();
if(isSample)
{
do
{
if ( ocPush == p->GetOpCode() && formula::svDouble == p->GetType() )
{
mdpSvdouble[mnSvDoubleCount++] = p->GetDouble();
}
else if( ocPush == p->GetOpCode() && formula::svDoubleVectorRef == p->GetType())
{
const formula::DoubleVectorRefToken* pDvr = static_cast< const formula::DoubleVectorRefToken* >( p );
const std::vector< const double* >& rArrays = pDvr->GetArrays();
uint rArraysSize = rArrays.size();
int nMoreColSize = 0;
DoubleVectorFormula *SvDoubleTemp = new DoubleVectorFormula();
if( rArraysSize > 1 )
{
double *dpMoreColData = NULL;
for ( uint loop=0; loop < rArraysSize; loop++ )
{
dpOclSrcData = rArrays[loop];
nSrcDataSize = pDvr->GetArrayLength();
nMoreColSize += nSrcDataSize;
dpMoreColData = (double *) realloc(dpMoreColData,nMoreColSize * sizeof(double));
for ( uint j = nMoreColSize - nSrcDataSize, i = 0; i < nSrcDataSize; i++, j++ )
{
dpMoreColData[j] = dpOclSrcData[i];
}
}
dpOclSrcData = dpMoreColData;
nSrcDataSize = nMoreColSize;
}
else
{
dpOclSrcData = rArrays[0];
nSrcDataSize = pDvr->GetArrayLength();
SvDoubleTemp->mdpInputData = dpOclSrcData;
SvDoubleTemp->mnInputDataSize = nSrcDataSize;
SvDoubleTemp->mnInputStartPosition = mnpOclStartPos[nCountNum*mnRowSize];
SvDoubleTemp->mnInputEndPosition = mnpOclEndPos[nCountNum*mnRowSize];
SvDoubleTemp->mnInputStartOffset = mnpOclStartPos[nCountNum*mnRowSize+1]-mnpOclStartPos[nCountNum*mnRowSize];
SvDoubleTemp->mnInputEndOffset = mnpOclEndPos[nCountNum*mnRowSize+1]-mnpOclEndPos[nCountNum*mnRowSize];
mDoubleArray[mnDoubleCount++] = SvDoubleTemp;
nCountNum++;
}
}
else if( ocPush == p->GetOpCode() && formula::svSingleVectorRef == p->GetType() )
{
const formula::SingleVectorRefToken* pSvr = static_cast<const formula::SingleVectorRefToken*>( p );
dpBinaryData = pSvr->GetArray();
uint nArrayLen = pSvr->GetArrayLength();
SingleVectorFormula *SignleTemp = new SingleVectorFormula() ;
if(isSingle)
{
SignleTemp = mSingleArray[--mnSingleCount];
SignleTemp->mdpInputRightData = dpBinaryData;
SignleTemp->mnInputRightDataSize = nArrayLen;
SignleTemp->mnInputRightStartPosition = 0;
SignleTemp->mnInputRightOffset = 0;
isSingle = false;
}
else
{
SignleTemp = new SingleVectorFormula();
SignleTemp->mdpInputLeftData = dpBinaryData;
SignleTemp->mnInputLeftDataSize = nArrayLen;
SignleTemp->mdpInputRightData = NULL;
SignleTemp->mnInputRightDataSize = 0;
SignleTemp->mnInputLeftStartPosition = 0;
SignleTemp->mnInputLeftOffset = 0;
isSingle = true;
}
mSingleArray[mnSingleCount++] = SignleTemp;
}
else
{
nOclOp = p->GetOpCode();
mnOperatorGroup[mnOperatorCount++] = nOclOp;
}
} while ( NULL != ( p = rCode.NextRPN() ) );
if ( !chooseFunction( ocl_calc, pResult ) )
return false;
else
dpResult = pResult;
}
else
{
agency aChooseAction;
do
{
if ( ocPush == p->GetOpCode() && formula::svDouble == p->GetType() )
{
dpSvDouble = (double *) malloc( sizeof(double ) * mnRowSize );
double dTempValue = p->GetDouble();
for ( uint i = 0; i < mnRowSize; i++ )
dpSvDouble[i] = dTempValue;
srdDataPush( new SourceData( dpSvDouble, mnRowSize ) );
collectDoublePointers( dpSvDouble );
}
else if( ocPush == p->GetOpCode() && formula::svDoubleVectorRef == p->GetType())
{
const formula::DoubleVectorRefToken* pDvr = static_cast< const formula::DoubleVectorRefToken* >( p );
const std::vector< const double* >& rArrays = pDvr->GetArrays();
unsigned int rArraysSize = rArrays.size();
int nMoreColSize = 0;
if(rArraysSize > 1)
{
double *dpMoreColData = NULL;
for( uint loop=0; loop < rArraysSize; loop++ )
{
dpOclSrcData = rArrays[loop];
nSrcDataSize = pDvr->GetArrayLength();
nMoreColSize += nSrcDataSize;
dpMoreColData = (double *) realloc(dpMoreColData,nMoreColSize * sizeof(double));
for(uint j=nMoreColSize-nSrcDataSize,i=0;i<nSrcDataSize;i++,j++)
{
dpMoreColData[j] = dpOclSrcData[i];
}
}
dpOclSrcData = dpMoreColData;
nSrcDataSize = nMoreColSize;
collectDoublePointers( dpMoreColData );
}
else
{
dpOclSrcData = rArrays[0];
nSrcDataSize = pDvr->GetArrayLength();
}
srdDataPush( new SourceData( dpOclSrcData,nSrcDataSize,rArraysSize ) );
}
else if( ocPush == p->GetOpCode() && formula::svSingleVectorRef == p->GetType() )
{
const formula::SingleVectorRefToken* pSvr = static_cast<const formula::SingleVectorRefToken*>( p );
dpBinaryData = pSvr->GetArray();
nSrcDataSize = pSvr->GetArrayLength();
srdDataPush( new SourceData( dpBinaryData, nSrcDataSize ) );
}
else
{
nOclOp = p->GetOpCode();
aChooseAction.calculate(nOclOp,mnRowSize,ocl_calc,mnpOclStartPos,mnpOclEndPos,this);
mnSingleCount = 0;
mnDoubleCount = 0;
mnSvDoubleCount = 0;
mnOperatorCount = 0;
mnPositonLen = 0;
}
} while ( NULL != ( p = rCode.NextRPN() ) );
SourceData * sResult = srdDataPop();
dpResult = sResult->getDouleData();
}
rDoc.SetFormulaResults( rTopPos, dpResult, mnRowSize );
freeDoublePointers();
if ( pResult )
{
free( pResult );
pResult = NULL;
}
if ( mnpOclStartPos )
{
free( mnpOclStartPos );
mnpOclStartPos = NULL;
}
if ( mnpOclEndPos )
{
free( mnpOclEndPos );
mnpOclEndPos = NULL;
}
return true; return true;
} // getOpenclState() End
else
return false;
} }
/// Special case of formula compiler for groundwatering /// Special case of formula compiler for groundwatering
...@@ -489,11 +964,11 @@ public: ...@@ -489,11 +964,11 @@ public:
FormulaGroupInterpreterSoftware() FormulaGroupInterpreterSoftware()
{ {
fprintf(stderr,"\n\n ***** Groundwater Backend *****\n\n\n"); fprintf(stderr,"\n\n ***** Groundwater Backend *****\n\n\n");
OclCalc::InitEnv(); OclCalc::initEnv();
} }
virtual ~FormulaGroupInterpreterGroundwater() virtual ~FormulaGroupInterpreterGroundwater()
{ {
OclCalc::ReleaseOpenclRunEnv(); OclCalc::releaseOpenclRunEnv();
} }
virtual ScMatrixRef inverseMatrix(const ScMatrix& /* rMat */) { return ScMatrixRef(); } virtual ScMatrixRef inverseMatrix(const ScMatrix& /* rMat */) { return ScMatrixRef(); }
...@@ -569,7 +1044,7 @@ bool FormulaGroupInterpreterGroundwater::interpretCL(ScDocument& rDoc, const ScA ...@@ -569,7 +1044,7 @@ bool FormulaGroupInterpreterGroundwater::interpretCL(ScDocument& rDoc, const ScA
fprintf (stderr, "Calculate !"); fprintf (stderr, "Calculate !");
double *pResult = ocl_calc.OclSimpleDeltaOperation( eOp, pGroundWaterDataArray, double *pResult = ocl_calc.oclSimpleDeltaOperation( eOp, pGroundWaterDataArray,
pArrayToSubtractOneElementFrom, pArrayToSubtractOneElementFrom,
(size_t) xGroup->mnLength, delta ); (size_t) xGroup->mnLength, delta );
RETURN_IF_FAIL(pResult != NULL, "buffer alloc / calculaton failed"); RETURN_IF_FAIL(pResult != NULL, "buffer alloc / calculaton failed");
......
...@@ -7,8 +7,8 @@ ...@@ -7,8 +7,8 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ */
#ifndef _OCL_KERNEL_H_ #ifndef SC_OCLKERNELS_HXX
#define _OCL_KERNEL_H_ #define SC_OCLKERNELS_HXX
#ifndef USE_EXTERNAL_KERNEL #ifndef USE_EXTERNAL_KERNEL
#define KERNEL( ... )# __VA_ARGS__ #define KERNEL( ... )# __VA_ARGS__
...@@ -24,6 +24,97 @@ const char *kernel_src = KERNEL( ...@@ -24,6 +24,97 @@ const char *kernel_src = KERNEL(
\n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n \n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n
\n#else\n \n#else\n
\n#endif\n \n#endif\n
inline fp_t oclAverage( const uint id,__global fp_t *values,__global uint *startArray,__global uint *endArray)
{
uint start = startArray[id];
uint end = endArray[id];
fp_t fSum = 0.0;
fp_t zero[16] = {0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f};
fp_t16 vSum=vload16(0,zero);
fp_t16 ptr;
__global fp_t *p = values;
p+= start;
for(int i = 0; i < (end - start + 1)/16; ++i)
{
ptr=vload16(0,p);
vSum += ptr;
p+=16;
}
int lastData = (end-start+1)%16;
for(int i = 0; i <lastData; i++)
{
fSum += *p;
p+=1;
}
vSum.s01234567 = vSum.s01234567+vSum.s89abcdef;
vSum.s0123 = vSum.s0123+vSum.s4567;
vSum.s01 = vSum.s01+vSum.s23;
vSum.s0 = vSum.s0+vSum.s1;
fSum = vSum.s0+fSum;
fp_t fVal = fSum/(end-start+1);
return fVal;
}
inline fp_t oclMax( const uint id,__global fp_t *values,__global uint *startArray,__global uint *endArray)
{
uint start = startArray[id];
uint end = endArray[id];
fp_t fMax = values[start];
fp_t zero[16] = {fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax};
fp_t16 vMax=vload16(0,zero);
//Max
fp_t16 ptr;
__global fp_t *p = values;
p+= start;
for(int i = 0; i < (end - start + 1)/16; ++i)
{
ptr=vload16(0,p);
vMax = fmax(vMax,ptr);
p+=16;
}
int lastData = (end-start+1)%16;
for(int i = 0; i <lastData; i++)
{
fMax = fmax(fMax,*p);
p+=1;
}
vMax.s01234567 = fmax(vMax.s01234567, vMax.s89abcdef);
vMax.s0123 = fmax(vMax.s0123, vMax.s4567);
vMax.s01 = fmax(vMax.s01, vMax.s23);
vMax.s0 = fmax(vMax.s0, vMax.s1);
fMax = fmax(vMax.s0, fMax);
return fMax;
}
inline fp_t oclMin( const uint id,__global fp_t *values,__global uint *startArray,__global uint *endArray)
{
uint start = startArray[id];
uint end = endArray[id];
fp_t fMin = values[start];
fp_t zero[16] = {fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin};
fp_t16 vMin=vload16(0,zero);
//Min
fp_t16 ptr;
__global fp_t *p = values;
p+= start;
for(int i = 0; i < (end - start + 1)/16; ++i)
{
ptr=vload16(0,p);
vMin = fmin(vMin,ptr);
p+=16;
}
int lastData = (end-start+1)%16;
for(int i = 0; i <lastData; i++)
{
fMin = fmin(fMin,*p);
p+=1;
}
vMin.s01234567 = fmin(vMin.s01234567, vMin.s89abcdef);
vMin.s0123 = fmin(vMin.s0123, vMin.s4567);
vMin.s01 = fmin(vMin.s01, vMin.s23);
vMin.s0 = fmin(vMin.s0, vMin.s1);
fMin = fmin(vMin.s0, fMin);
return fMin;
}
__kernel void oclSignedAdd(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData) __kernel void oclSignedAdd(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
{ {
...@@ -31,7 +122,6 @@ __kernel void oclSignedAdd(__global fp_t *ltData,__global fp_t *rtData,__global ...@@ -31,7 +122,6 @@ __kernel void oclSignedAdd(__global fp_t *ltData,__global fp_t *rtData,__global
otData[id] = ltData[id] + rtData[id]; otData[id] = ltData[id] + rtData[id];
} }
__kernel void oclSignedSub(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData) __kernel void oclSignedSub(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
{ {
const unsigned int id = get_global_id(0); const unsigned int id = get_global_id(0);
...@@ -41,39 +131,31 @@ __kernel void oclSignedSub(__global fp_t *ltData,__global fp_t *rtData,__global ...@@ -41,39 +131,31 @@ __kernel void oclSignedSub(__global fp_t *ltData,__global fp_t *rtData,__global
__kernel void oclSignedMul(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData) __kernel void oclSignedMul(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
{ {
int id = get_global_id(0); int id = get_global_id(0);
otData[id] =ltData[id] * rtData[id]; otData[id] = ltData[id] * rtData[id];
} }
__kernel void oclSignedDiv(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData) __kernel void oclSignedDiv(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
{ {
const unsigned int id = get_global_id(0); const unsigned int id = get_global_id(0);
otData[id] = ltData[id] / rtData[id]; fp_t divisor = rtData[id];
if ( divisor != 0 )
otData[id] = ltData[id] / divisor;
else
otData[id] = 0.0;
} }
__kernel void oclFormulaMin(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output) __kernel void oclFormulaMin(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output)
{ {
const unsigned int id = get_global_id(0); const unsigned int id = get_global_id(0);
unsigned int startFlag = start[id]; fp_t fVal = oclMin(id,input,start,end);
unsigned int endFlag = end[id]; output[id] = fVal ;
fp_t fMinVal = input[startFlag];
for(int i=startFlag;i<=endFlag;i++)
{
fMinVal = fmin( fMinVal, input[i] );
}
output[id] = fMinVal;
} }
__kernel void oclFormulaMax(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output) __kernel void oclFormulaMax(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output)
{ {
const unsigned int id = get_global_id(0); const unsigned int id = get_global_id(0);
unsigned int startFlag = start[id]; fp_t fVal = oclMax(id,input,start,end);
unsigned int endFlag = end[id]; output[id] = fVal ;
fp_t fMaxVal = input[startFlag];
for ( int i = startFlag; i <= endFlag; i++ )
{
fMaxVal = fmax( fMaxVal, input[i] );
}
output[id] = fMaxVal;
} }
//Sum //Sum
__kernel void oclFormulaSum(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output) __kernel void oclFormulaSum(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output)
...@@ -94,12 +176,10 @@ __kernel void oclFormulaCount(__global uint *start,__global uint *end,__global f ...@@ -94,12 +176,10 @@ __kernel void oclFormulaCount(__global uint *start,__global uint *end,__global f
__kernel void oclFormulaAverage(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output) __kernel void oclFormulaAverage(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output)
{ {
const unsigned int id = get_global_id(0); const unsigned int id = get_global_id(0);
fp_t sum=0.0; fp_t fVal = oclAverage(id,input,start,end);
for(int i = start[id];i<=end[id];i++) output[id] = fVal ;
sum += input[i];
output[id] = sum / (end[id]-start[id]+1);
}
}
//Sumproduct //Sumproduct
__kernel void oclFormulaSumproduct(__global fp_t *firstCol,__global uint* npSumSize,__global fp_t *output,uint nMatixSize) __kernel void oclFormulaSumproduct(__global fp_t *firstCol,__global uint* npSumSize,__global fp_t *output,uint nMatixSize)
{ {
...@@ -147,7 +227,7 @@ __kernel void oclMinDelta(__global fp_t *values, __global fp_t *subtract, uint s ...@@ -147,7 +227,7 @@ __kernel void oclMinDelta(__global fp_t *values, __global fp_t *subtract, uint s
// Min // Min
fp_t fMinVal = values[start]; fp_t fMinVal = values[start];
for(int i=start+1;i < end;i++) for ( int i = start + 1; i < end; i++ )
{ {
if(values[i]<fMinVal) if(values[i]<fMinVal)
fMinVal = values[i]; fMinVal = values[i];
...@@ -177,14 +257,14 @@ __kernel void oclFormulaMtxInv(__global fp_t * fpMatrixInput, __global fp_t * fp ...@@ -177,14 +257,14 @@ __kernel void oclFormulaMtxInv(__global fp_t * fpMatrixInput, __global fp_t * fp
fpP[nOffset*nDimension+nId] = fpP[nMax*nDimension+nId]; fpP[nOffset*nDimension+nId] = fpP[nMax*nDimension+nId];
fpP[nMax*nDimension+nId] = dMovebuffer; fpP[nMax*nDimension+nId] = dMovebuffer;
} }
__kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMatrixOutput,__global fp_t * fpP,__global fp_t * fpY) __kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMatrixOutput,__global fp_t * fpP,__global fp_t * fpY,__global uint* npDim)
{ {
int nId = get_global_id(0); int nId = get_global_id(0);
int nDimension = get_global_size(0); int nDimension = npDim[nId];
fp_t fsum = 0.0;
for ( int yi=0; yi < nDimension; yi++ ) for ( int yi=0; yi < nDimension; yi++ )
{ {
fp_t fsum = 0.0; fsum = 0.0;
for ( int yj=0; yj < nDimension; yj++ ) for ( int yj=0; yj < nDimension; yj++ )
{ {
fsum += fpMatrixInput[yi*nDimension+yj] * fpY[nId+yj*nDimension]; fsum += fpMatrixInput[yi*nDimension+yj] * fpY[nId+yj*nDimension];
...@@ -194,7 +274,7 @@ __kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMat ...@@ -194,7 +274,7 @@ __kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMat
} }
for ( int xi = nDimension - 1; xi >= 0; xi-- ) for ( int xi = nDimension - 1; xi >= 0; xi-- )
{ {
fp_t fsum = 0.0; fsum = 0.0;
for ( int xj = 0; xj < nDimension; xj++ ) for ( int xj = 0; xj < nDimension; xj++ )
{ {
fsum += fpMatrixInput[xi*nDimension+xj] * fpMatrixOutput[nId+nDimension*xj]; fsum += fpMatrixInput[xi*nDimension+xj] * fpMatrixOutput[nId+nDimension*xj];
...@@ -203,6 +283,101 @@ __kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMat ...@@ -203,6 +283,101 @@ __kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMat
} }
} }
__kernel void oclAverageAdd(__global fp_t *values,__global fp_t *addend, __global uint *startArray, __global uint *endArray, __global fp_t *output)
{
const unsigned int id = get_global_id(0);
fp_t fVal = oclAverage(id,values,startArray,endArray);
output[id] = fVal + addend[id];
}
__kernel void oclAverageSub(__global fp_t *values,__global fp_t *subtract, __global uint *startArray, __global uint *endArray, __global fp_t *output)
{
const unsigned int id = get_global_id(0);
fp_t fVal = oclAverage(id,values,startArray,endArray);
output[id] = fVal - subtract[id];
}
__kernel void oclAverageMul(__global fp_t *values,__global fp_t *multiplier, __global uint *startArray, __global uint *endArray, __global fp_t *output)
{
const unsigned int id = get_global_id(0);
fp_t fVal = oclAverage(id,values,startArray,endArray);
output[id] = fVal * multiplier[id];
}
__kernel void oclAverageDiv(__global fp_t *values,__global fp_t *div, __global uint *startArray, __global uint *endArray, __global fp_t *output)
{
const unsigned int id = get_global_id(0);
fp_t fVal = oclAverage(id,values,startArray,endArray);
fp_t divisor = div[id];
if ( divisor != 0 )
output[id] = fVal / divisor;
else
output[id] = 0.0;
}
__kernel void oclMinAdd(__global fp_t *values, __global fp_t *addend, __global uint *startArray, __global uint *endArray, __global fp_t *output)
{
const unsigned int id = get_global_id(0);
fp_t fMin = oclMin(id,values,startArray,endArray);
output[id] = fMin + addend[id];
}
__kernel void oclMinSub(__global fp_t *values, __global fp_t *subtract, __global uint *startArray, __global uint *endArray, __global fp_t *output)
{
const unsigned int id = get_global_id(0);
fp_t fMin = oclMin(id,values,startArray,endArray);
output[id] = fMin - subtract[id];
}
__kernel void oclMinMul(__global fp_t *values, __global fp_t *multiplier, __global uint *startArray, __global uint *endArray, __global fp_t *output)
{
const unsigned int id = get_global_id(0);
fp_t fMin = oclMin(id,values,startArray,endArray);
output[id] = fMin * multiplier[id];
}
__kernel void oclMinDiv(__global fp_t *values, __global fp_t *div, __global uint *startArray, __global uint *endArray, __global fp_t *output)
{
const unsigned int id = get_global_id(0);
fp_t fMin = oclMin(id,values,startArray,endArray);
fp_t divisor = div[id];
if ( divisor != 0 )
output[id] = fMin / divisor;
else
output[id] = 0.0;
}
__kernel void oclMaxAdd(__global fp_t *values, __global fp_t *addend, __global uint *startArray, __global uint *endArray, __global fp_t *output)
{
const unsigned int id = get_global_id(0);
fp_t fMax = oclMax(id,values,startArray,endArray);
output[id] = fMax + addend[id];
}
__kernel void oclMaxSub(__global fp_t *values, __global fp_t *subtract, __global uint *startArray, __global uint *endArray, __global fp_t *output)
{
const unsigned int id = get_global_id(0);
fp_t fMax = oclMax(id,values,startArray,endArray);
output[id] = fMax - subtract[id];
}
__kernel void oclMaxMul(__global fp_t *values, __global fp_t *multiplier, __global uint *startArray, __global uint *endArray, __global fp_t *output)
{
const unsigned int id = get_global_id(0);
fp_t fMax = oclMax(id,values,startArray,endArray);
output[id] = fMax * multiplier[id];
}
__kernel void oclMaxDiv(__global fp_t *values, __global fp_t *div, __global uint *startArray, __global uint *endArray, __global fp_t *output)
{
const unsigned int id = get_global_id(0);
fp_t fMax = oclMax(id,values,startArray,endArray);
fp_t divisor = div[id];
if ( divisor != 0 )
output[id] = fMax / divisor;
else
output[id] = 0.0;
}
__kernel void oclSub( fp_t ltData, __global fp_t *rtData, __global fp_t *outData )
{
const unsigned int id = get_global_id(0);
outData[id] = ltData - rtData[id];
}
); );
#endif // USE_EXTERNAL_KERNEL #endif // USE_EXTERNAL_KERNEL
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -7,8 +7,8 @@ ...@@ -7,8 +7,8 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ */
#ifndef SC_OPENCL_WRAPPER_H #ifndef SC_OPENCLWRAPPER_HXX
#define SC_OPENCL_WRAPPER_H #define SC_OPENCLWRAPPER_HXX
#include <config_features.h> #include <config_features.h>
#include <formula/opcode.hxx> #include <formula/opcode.hxx>
...@@ -87,6 +87,19 @@ if( status != CL_SUCCESS ) \ ...@@ -87,6 +87,19 @@ if( status != CL_SUCCESS ) \
return 0; \ return 0; \
} }
#define CHECK_OPENCL_VOID(status,name) \
if( status != CL_SUCCESS ) \
{ \
printf ("OpenCL error code is %d at " SAL_DETAIL_WHERE " when %s .\n", status, name); \
}
#define CHECK_OPENCL_RELEASE(status,name) \
if ( name != NULL ) \
clReleaseMemObject( name ); \
if( status != CL_SUCCESS ) \
{ \
printf ("OpenCL error code is %d at " SAL_DETAIL_WHERE " when clReleaseMemObject( %s ).\n", status, #name); \
}
#define MAX_KERNEL_STRING_LEN 64 #define MAX_KERNEL_STRING_LEN 64
#define MAX_CLFILE_NUM 50 #define MAX_CLFILE_NUM 50
...@@ -119,25 +132,48 @@ typedef struct ...@@ -119,25 +132,48 @@ typedef struct
char kernelName[MAX_KERNEL_NAME_LEN + 1]; char kernelName[MAX_KERNEL_NAME_LEN + 1];
char *kernelStr; char *kernelStr;
} kernel_node; } kernel_node;
typedef struct _SingleVectorFormula
{
const double *mdpInputLeftData;
const double *mdpInputRightData;
size_t mnInputLeftDataSize;
size_t mnInputRightDataSize;
uint mnInputLeftStartPosition;
uint mnInputRightStartPosition;
int mnInputLeftOffset;
int mnInputRightOffset;
} SingleVectorFormula;
typedef struct _DoubleVectorFormula
{
const double *mdpInputData;
size_t mnInputDataSize;
uint mnInputStartPosition;
uint mnInputEndPosition;
int mnInputStartOffset;
int mnInputEndOffset;
} DoubleVectorFormula;
class OpenclCalcBase class OpenclCalcBase
{ {
public: public:
OpenclCalcBase(){}; OpenclCalcBase(){};
virtual ~OpenclCalcBase(){}; virtual ~OpenclCalcBase(){};
virtual int OclHostArithmeticOperator64Bits( const char* aKernelName, double *fpLeftData, double *fpRightData, double *&rResult, int nRowSize )=0; virtual int oclHostArithmeticOperator64Bits( const char* aKernelName, double *&rResult, int nRowSize )=0;
virtual int OclHostFormulaStatistics64Bits( const char* aKernelName, double *fpSrcData, uint *npStartPos, uint *npEndPos, double *&output, int outputSize )=0; virtual int oclMoreColHostArithmeticOperator64Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize )=0;
virtual int OclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize)=0; virtual int oclHostFormulaStatistics64Bits( const char* aKernelName,double *&output, int outputSize )=0;
virtual int OclHostFormulaSumProduct64Bits( double *fpSumProMergeLfData, double *fpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize )=0; virtual int oclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize)=0;
virtual int OclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst, std::vector<double>& dpResult, uint nDim)=0; virtual int oclHostFormulaSumProduct64Bits( double *fpSumProMergeLfData, double *fpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize )=0;
virtual int oclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst,std::vector<double>&dpResult, uint nDim)=0;
virtual int oclMoreColHostArithmeticOperator32Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize )=0;
virtual int OclHostArithmeticOperator32Bits( const char* aKernelName, float *fpLeftData, float *fpRightData, double *rResult, int nRowSize )=0; virtual int oclHostArithmeticOperator32Bits( const char* aKernelName, double *rResult, int nRowSize )=0;
virtual int OclHostFormulaStatistics32Bits( const char* aKernelName, float *fpSrcData, uint *npStartPos, uint *npEndPos, double *output, int outputSize )=0; virtual int oclHostFormulaStatistics32Bits( const char* aKernelName,double *output, int outputSize )=0;
virtual int OclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize)=0; virtual int oclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize)=0;
virtual int OclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize )=0; virtual int oclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize )=0;
virtual int OclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim )=0; virtual int oclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim )=0;
virtual double *OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double delta )=0; virtual int oclGroundWaterGroup( uint *eOp, uint eOpNum, const double *pOpArray, const double *pSubtractSingle, size_t nSrcDataSize,size_t nElements, double delta,uint *nStartPos,uint *nEndPos ,double *deResult)=0;
virtual double *oclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double delta )=0;
}; };
...@@ -151,40 +187,40 @@ public: ...@@ -151,40 +187,40 @@ public:
static int isInited; static int isInited;
OpenclDevice(); OpenclDevice();
~OpenclDevice(); ~OpenclDevice();
static int InitEnv(); static int initEnv();
static int RegistOpenclKernel(); static int registOpenclKernel();
static int ReleaseOpenclRunEnv(); static int releaseOpenclRunEnv();
static int InitOpenclRunEnv( GPUEnv *gpu ); static int initOpenclRunEnv( GPUEnv *gpu );
static int ReleaseOpenclEnv( GPUEnv *gpuInfo ); static int releaseOpenclEnv( GPUEnv *gpuInfo );
static int CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ); static int compileKernelFile( GPUEnv *gpuInfo, const char *buildOption );
static int InitOpenclRunEnv( int argc ); static int initOpenclRunEnv( int argc );
static int CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName ); static int cachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName );
static int GeneratBinFromKernelSource( cl_program program, const char * clFileName ); static int generatBinFromKernelSource( cl_program program, const char * clFileName );
static int WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes ); static int writeBinaryToFile( const char* fileName, const char* birary, size_t numBytes );
static int BinaryGenerated( const char * clFileName, FILE ** fhandle ); static int binaryGenerated( const char * clFileName, FILE ** fhandle );
static int CompileKernelFile( const char *filename, GPUEnv *gpuInfo, const char *buildOption ); static int compileKernelFile( const char *filename, GPUEnv *gpuInfo, const char *buildOption );
int InitOpenclAttr( OpenCLEnv * env ); int initOpenclAttr( OpenCLEnv * env );
int ReleaseKernel( KernelEnv * env ); int releaseKernel( KernelEnv * env );
int SetKernelEnv( KernelEnv *envInfo ); int setKernelEnv( KernelEnv *envInfo );
int CreateKernel( char * kernelname, KernelEnv * env ); int createKernel( char * kernelname, KernelEnv * env );
int RunKernel( const char *kernelName, void **userdata ); int runKernel( const char *kernelName, void **userdata );
int ConvertToString( const char *filename, char **source ); int convertToString( const char *filename, char **source );
int CheckKernelName( KernelEnv *envInfo, const char *kernelName ); int checkKernelName( KernelEnv *envInfo, const char *kernelName );
int RegisterKernelWrapper( const char *kernelName, cl_kernel_function function ); int registerKernelWrapper( const char *kernelName, cl_kernel_function function );
int RunKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata ); int runKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata );
int GetKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function ); int getKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function );
#ifdef WIN32 #ifdef WIN32
static int LoadOpencl(); static int loadOpencl();
static int OpenclInite(); static int openclInite();
static void FreeOpenclDll(); static void freeOpenclDll();
#endif #endif
int GetOpenclState(); int getOpenclState();
void SetOpenclState( int state ); void setOpenclState( int state );
inline static int AddKernelConfig( int kCount, const char *kName ); inline static int addKernelConfig( int kCount, const char *kName );
}; };
...@@ -201,6 +237,10 @@ public: ...@@ -201,6 +237,10 @@ public:
cl_mem mpClmemMergeLfData; cl_mem mpClmemMergeLfData;
cl_mem mpClmemMergeRtData; cl_mem mpClmemMergeRtData;
cl_mem mpClmemMatixSumSize; cl_mem mpClmemMatixSumSize;
cl_mem mpClmemeOp;
unsigned int nArithmeticLen;
unsigned int nFormulaLen;
unsigned int nClmemLen;
unsigned int nFormulaColSize; unsigned int nFormulaColSize;
unsigned int nFormulaRowSize; unsigned int nFormulaRowSize;
...@@ -208,27 +248,49 @@ public: ...@@ -208,27 +248,49 @@ public:
~OclCalc(); ~OclCalc();
// for 64bits double // for 64bits double
int OclHostArithmeticOperator64Bits( const char* aKernelName, double *fpLeftData, double *fpRightData, double *&rResult, int nRowSize ); int oclHostArithmeticOperator64Bits( const char* aKernelName, double *&rResult, int nRowSize );
int OclHostFormulaStatistics64Bits( const char* aKernelName, double *fpSrcData, uint *npStartPos, uint *npEndPos, double *&output, int outputSize); int oclMoreColHostArithmeticOperator64Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize );
int OclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize ); int oclHostFormulaStatistics64Bits( const char* aKernelName, double *&output, int outputSize);
int OclHostFormulaSumProduct64Bits( double *fpSumProMergeLfData, double *fpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize); int oclHostFormulaStash64Bits( const char* aKernelName, const double* dpSrcData, uint *nStartPos, uint *nEndPos, double *output, int nBufferSize, int size);
int OclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst, std::vector<double>&dpResult, uint nDim ); int oclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize );
int oclHostFormulaSumProduct64Bits( double *fpSumProMergeLfData, double *fpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize);
int oclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst, std::vector<double>&dpResult, uint nDim );
// for 32bits float // for 32bits float
int OclHostArithmeticOperator32Bits( const char* aKernelName, float *fpLeftData, float *fpRightData, double *rResult, int nRowSize ); int oclHostArithmeticOperator32Bits( const char* aKernelName, double *rResult, int nRowSize );
int OclHostFormulaStatistics32Bits( const char* aKernelName, float *fpSrcData, uint *npStartPos, uint *npEndPos, double *output, int outputSize); int oclMoreColHostArithmeticOperator32Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize );
int OclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize ); int oclHostFormulaStatistics32Bits( const char* aKernelName, double *output, int outputSize);
int OclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize ); int oclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize );
int OclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim ); int oclHostArithmeticStash64Bits( const char* aKernelName, const double *dpLeftData, const double *dpRightData, double *rResult,int nRowSize );
int oclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize );
int oclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim );
// for groundwater // for groundwater
double *OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double delta ); int oclGroundWaterGroup( uint *eOp, uint eOpNum, const double *pOpArray, const double *pSubtractSingle,size_t nSrcDataSize, size_t nElements, double delta ,uint *nStartPos,uint *nEndPos,double *deResult);
double *oclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double delta );
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
int CreateBuffer64Bits( double *&dpSrcData, uint *&npStartPos, uint *&npEndPos, int nBufferSize ); int createBuffer64Bits( double *&dpLeftData, double *&dpRightData, int nBufferSize );
int CreateBuffer64Bits( double *&dpLeftData, double *&dpRightData, int nBufferSize ); int mapAndCopy64Bits(const double *dpTempLeftData,const double *dpTempRightData,int nBufferSize );
int CreateBuffer64Bits( double *&dpSumProMergeLfData, double *&dpSumProMergeRtData, uint *&npSumSize, int nMatixSize, int nBufferSize ); int mapAndCopy64Bits(const double *dpTempSrcData,unsigned int *unStartPos,unsigned int *unEndPos,int nBufferSize ,int nRowsize);
int CreateBuffer32Bits( float *&fpSrcData, uint *&npStartPos, uint *&npEndPos, int nBufferSize ); int mapAndCopyArithmetic64Bits( const double *dpMoreArithmetic,int nBufferSize );
int CreateBuffer32Bits( float *&fpLeftData, float *&fpRightData, int nBufferSize ); int mapAndCopyMoreColArithmetic64Bits( const double *dpMoreColArithmetic,int nBufferSize ,uint *npeOp,uint neOpSize );
int CreateBuffer32Bits( float *&fpSumProMergeLfData, float *&fpSumProMergeRtData, uint *&npSumSize, int nMatixSize, int nBufferSize ); int createMoreColArithmeticBuf64Bits( int nBufferSize, int neOpSize );
int createFormulaBuf64Bits( int nBufferSize, int rowSize );
int createArithmeticOptBuf64Bits( int nBufferSize );
int createBuffer32Bits( float *&fpLeftData, float *&fpRightData, int nBufferSize );
int mapAndCopy32Bits(const double *dpTempLeftData,const double *dpTempRightData,int nBufferSize );
int mapAndCopy32Bits(const double *dpTempSrcData,unsigned int *unStartPos,unsigned int *unEndPos,int nBufferSize ,int nRowsize);
int mapAndCopyArithmetic32Bits( const double *dpMoreColArithmetic, int nBufferSize );
int mapAndCopyMoreColArithmetic32Bits( const double *dpMoreColArithmetic,int nBufferSize ,uint *npeOp,uint neOpSize );
int createMoreColArithmeticBuf32Bits( int nBufferSize, int neOpSize );
int createFormulaBuf32Bits( int nBufferSize, int rowSize );
int createArithmeticOptBuf32Bits( int nBufferSize );
int oclHostFormulaStash32Bits( const char* aKernelName, const double* dpSrcData, uint *nStartPos, uint *nEndPos, double *output, int nBufferSize, int size );
int oclHostArithmeticStash32Bits( const char* aKernelName, const double *dpLeftData, const double *dpRightData, double *rResult,int nRowSize );
int releaseOclBuffer(void);
friend class agency;
}; };
#endif #endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment