Patch for milestone1-0829-v4.

1. Add a parser based on RPN; 2. For test sample 1, named "ground-water-daily.xls", use compound formulae to do the calculation; add the compound kernels: formulae include "AVERAGE", "MAX" and "MIN". Compound formulae include "AVERAGE -(+,*,/)", "MAX -(+,*,/)" and "MIN -(+,*,/)"; 3. Formulae that do not work on the GPU are calculated on the CPU; 4. Compound operators (-,+,*,/) are calculated one by one on the GPU, in RPN order; 5. Add the start and end positions to fit the sliding window; 6. Modify kernels to use vectors for AMD GPU. Conflicts: sc/source/core/opencl/formulagroupcl.cxx sc/source/core/opencl/openclwrapper.cxx Change-Id: I6157008575ce89ddd3e7bf552a87812474af4125

Patch for milestone1-0829-v4.
1. Add a parser based on RPN; 2. For test sample 1, named "ground-water-daily.xls", use compound formulae to do the calculation; add the compound kernels: formulae include "AVERAGE", "MAX" and "MIN". Compound formulae include "AVERAGE -(+,*,/)", "MAX -(+,*,/)" and "MIN -(+,*,/)"; 3. Formulae that do not work on the GPU are calculated on the CPU; 4. Compound operators (-,+,*,/) are calculated one by one on the GPU, in RPN order; 5. Add the start and end positions to fit the sliding window; 6. Modify kernels to use vectors for AMD GPU. Conflicts: sc/source/core/opencl/formulagroupcl.cxx sc/source/core/opencl/openclwrapper.cxx Change-Id: I6157008575ce89ddd3e7bf552a87812474af4125
e791fbfc · Haidong Lian · Kohei Yoshida · ccf7b15c · e791fbfc · e791fbfc
Kaydet (Commit) e791fbfc authored Agu 30, 2013 tarafından Haidong Lian Kaydeden (comit) Kohei Yoshida Agu 30, 2013
4 changed files
--- a/sc/source/core/opencl/formulagroupcl.cxx
+++ b/sc/source/core/opencl/formulagroupcl.cxx
@@ -19,6 +19,10 @@

 #include "openclwrapper.hxx"

+#define SRCDATASIZE 100 // capacity of mSrcDataStack; also the bound collectDoublePointers() checks before storing into mdpSrcDoublePtr
+#define SINGLEARRAYLEN 100 // capacity of mSingleArray
+#define DOUBLEARRAYLEN 100 // capacity of mDoubleArray
+#define SVDOUBLELEN 100 // capacity of mdpSvdouble and mdpSrcDoublePtr
 namespace sc {

 // A single public entry point for a factory function:
@@ -38,447 +42,918 @@ double getTimeDiff(const TimeValue& t1, const TimeValue& t2)
 }//dbg-t
 TimeValue aTimeBefore, aTimeAfter;
 ///////////////////////////////////////
+// Lightweight descriptor for one block of source values: a pointer to an
+// array of doubles, its element count, a column count and optional formula
+// metadata (a name and an operator code).
+// The class has no destructor and never frees mdpSrcPtr.
+class SourceData
+{
+    const double *mdpSrcPtr;
+    unsigned int mnDataSize;
+    const char *mcpFormulaName;
+    unsigned int mnCol;
+    int eOp;
+public:
+    // Wraps nSize doubles starting at dpData. nCol defaults to 1 and the
+    // formula name to NULL. eOp is set to 0 so that geteOp()/printFormula()
+    // never read an indeterminate value before seteOp() is called.
+    SourceData( const double *dpData, unsigned int nSize, uint nCol = 1,const char *cpFormulaName = NULL):mdpSrcPtr(dpData),mnDataSize(nSize),mcpFormulaName(cpFormulaName),mnCol(nCol),eOp(0)
+    {
+    }
+    // Creates an empty descriptor. Every member gets a defined value; the
+    // column count uses the same default (1) as the constructor above.
+    SourceData():mdpSrcPtr(NULL),mnDataSize(0),mcpFormulaName(NULL),mnCol(1),eOp(0)
+    {
+    }
+    // Rebinds the descriptor to another array; the element count is not touched.
+    void setSrcPtr( const double *dpTmpDataPtr)
+    {
+        mdpSrcPtr = dpTmpDataPtr;
+    }
+    // Sets the element count. The parameter stays int for existing callers;
+    // it is stored as unsigned.
+    void setSrcSize( int nSize )
+    {
+        mnDataSize = static_cast<unsigned int>(nSize);
+    }
+    // Returns the wrapped array (may be NULL).
+    const double * getDouleData() const
+    {
+        return mdpSrcPtr;
+    }
+    // Returns the element count.
+    unsigned int getDataSize() const
+    {
+        return mnDataSize;
+    }
+    // Debug helper: prints every element together with the element count.
+    // Does nothing when no array is attached, instead of dereferencing NULL.
+    void print() const
+    {
+        if( mdpSrcPtr == NULL )
+            return;
+        for( unsigned int i=0; i<mnDataSize; i++ )
+            printf( " The SourceData is %f and data size is %u\n",mdpSrcPtr[i],mnDataSize ); // %u: mnDataSize is unsigned
+    }
+    // Debug helper: prints the formula name and operator code. A NULL name
+    // is printed as "(null)" because passing NULL to %s is undefined.
+    void printFormula() const
+    {
+        printf("--------The formulaname is %s and the eOp is %d---------\n",mcpFormulaName ? mcpFormulaName : "(null)",eOp);
+    }
+    // Stores the pointer only; the string is not copied.
+    void setFormulaName(const char *cpFormulaName)
+    {
+        this->mcpFormulaName = cpFormulaName;
+    }
+    // Returns the stored formula name pointer (may be NULL).
+    const char *getFormulaName() const
+    {
+        return mcpFormulaName;
+    }
+    // Stores the operator code (presumably an OpCode value - confirm with callers).
+    void seteOp(int op)
+    {
+        this->eOp = op;
+    }
+    // Returns the stored operator code (0 until seteOp() is called).
+    int geteOp() const
+    {
+        return eOp;
+    }
+    // Returns the column count as int, as existing callers expect.
+    int getColNum() const
+    {
+        return static_cast<int>(mnCol);
+    }
+
+};

 class FormulaGroupInterpreterOpenCL : public FormulaGroupInterpreterSoftware // Formula-group interpreter that calls OclCalc::initEnv() on construction and OclCalc::releaseOpenclRunEnv() on destruction.
 {
+    SourceData *mSrcDataStack[SRCDATASIZE]; // operand stack, managed by srdDataPush()/srdDataPop()
+    unsigned int mnStackPointer,mnDoublePtrCount; // entries in use in mSrcDataStack / in mdpSrcDoublePtr
+    uint * mnpOclStartPos; // start positions handed to the OclCalc calls in chooseFunction()
+    uint * mnpOclEndPos; // end positions handed to the OclCalc calls in chooseFunction()
+    SingleVectorFormula *mSingleArray[SINGLEARRAYLEN]; // single-vector operands; mnSingleCount entries are read by chooseFunction()
+    DoubleVectorFormula *mDoubleArray[DOUBLEARRAYLEN]; // double-vector operands; counted by mnDoubleCount
+    double mdpSvdouble[SVDOUBLELEN]; // plain double operands; element 0 is used as "delta" in chooseFunction()
+    double *mdpSrcDoublePtr[SVDOUBLELEN]; // pointers recorded by collectDoublePointers() and released with free() in freeDoublePointers()
+    uint mnSingleCount;
+    uint mnDoubleCount;
+    uint mnSvDoubleCount;
+    uint mnOperatorGroup[100]; // operator codes, compared against ocAverage/ocMax/... and raw numbers below
+    uint mnOperatorCount;
+    char mcHostName[100]; // name filled in with strcpy() and passed to the OclCalc host calls
+    uint mnPositonLen;
+    size_t mnRowSize; // NOTE(review): not initialised by the constructor below - confirm it is assigned before chooseFunction() reads it
 public:
    FormulaGroupInterpreterOpenCL() :
        FormulaGroupInterpreterSoftware()
    {
-        OclCalc::InitEnv();
+        mnStackPointer = 0; // start with an empty stack, NULL position arrays and zeroed counters
+        mnpOclEndPos = NULL;
+        mnpOclStartPos = NULL;
+        mnSingleCount = 0;
+        mnDoubleCount = 0;
+        mnSvDoubleCount = 0;
+        mnOperatorCount = 0;
+        mnPositonLen = 0;
+        mnDoublePtrCount = 0;
+        OclCalc::initEnv();
    }
    virtual ~FormulaGroupInterpreterOpenCL()
    {
-        OclCalc::ReleaseOpenclRunEnv();
+        OclCalc::releaseOpenclRunEnv(); // NOTE(review): freeDoublePointers() is not called here - confirm recorded buffers are released elsewhere
    }

-    virtual ScMatrixRef inverseMatrix(const ScMatrix& rMat);
-    virtual bool interpret(ScDocument& rDoc, const ScAddress& rTopPos,
-                           const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode);
-};
-
-ScMatrixRef FormulaGroupInterpreterOpenCL::inverseMatrix(const ScMatrix& rMat)
-{
-    SCSIZE nC, nR;
-    rMat.GetDimensions(nC, nR);
-    if (nC != nR || nC == 0)
-        // Input matrix must be square. Return an empty matrix on failure and
-        // the caller will calculate it via CPU.
-        return ScMatrixRef();
-
-    // This vector will contain a series of doubles from the first column to
-    // the last, chained together in a single array.
-    std::vector<double> aDoubles;
-    rMat.GetDoubleArray(aDoubles);
-    float * fpOclMatrixSrc = NULL;
-    float * fpOclMatrixDst = NULL;
-    double * dpOclMatrixSrc = NULL;
-    double * dpOclMatrixDst = NULL;
-    uint nMatrixSize = nC * nR;
-    static OclCalc aOclCalc;
-    if ( aOclCalc.GetOpenclState() )
+    virtual ScMatrixRef inverseMatrix( const ScMatrix& rMat );
+    virtual bool interpret( ScDocument& rDoc, const ScAddress& rTopPos,
+                           const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode );
+    void collectDoublePointers( double *temp ) // Records temp in mdpSrcDoublePtr so that freeDoublePointers() can free() it later.
    {
-        if ( aOclCalc.gpuEnv.mnKhrFp64Flag == 1 || aOclCalc.gpuEnv.mnAmdFp64Flag == 1 )
+        if( mnDoublePtrCount < SRCDATASIZE )
        {
-            aOclCalc.CreateBuffer64Bits( dpOclMatrixSrc, dpOclMatrixDst, nMatrixSize );
-            for ( uint i = 0; i < nC; i++ )
-                for ( uint j = 0; j < nR; j++ )
-                    dpOclMatrixSrc[i*nC+j] = aDoubles[j*nR+i];
-            aOclCalc.OclHostMatrixInverse64Bits( "oclFormulaMtxInv", dpOclMatrixSrc, dpOclMatrixDst,aDoubles, nR );
+            mdpSrcDoublePtr[mnDoublePtrCount++] = temp;
        }
        else
        {
-            aOclCalc.CreateBuffer32Bits( fpOclMatrixSrc, fpOclMatrixDst, nMatrixSize );
-            for ( uint i = 0; i < nC; i++ )
-                for ( uint j = 0; j < nR; j++ )
-                    fpOclMatrixSrc[i*nC+j] = (float) aDoubles[j*nR+i];
-            aOclCalc.OclHostMatrixInverse32Bits( "oclFormulaMtxInv", fpOclMatrixSrc, fpOclMatrixDst, aDoubles, nR );
+            printf( "The mdpSrcDoublePtr is full now.\n" ); // NOTE(review): when full, temp is not stored; the most recently recorded pointer is freed instead - confirm this is intended
+            double *dtmp = NULL;
+            if ( (dtmp = mdpSrcDoublePtr[--mnDoublePtrCount]) != NULL )
+            {
+                free( dtmp );
+                dtmp = NULL;
+            }
        }
    }

-    // TODO: Inverse this matrix and put the result back into xInv. Right now,
-    // I'll just put the original, non-inversed matrix values back, just to
-    // demonstrate how to put the values back after inversion.  There are two
-    // ways to put the values back (depending on what the GPU output is).
-    ScMatrixRef xInv(new ScMatrix(nR, nR, 0.0));
-
-#if 0
-    // One way is to put the whole value as one array. This method assumes
-    // that the array size equals column x row, and is oriented column-wise.
-    // This method is slightly more efficient than the second, but I wouldn't
-    // expect too much of a difference.
-    xInv->PutDouble(&aDoubles[0], aDoubles.size(), 0, 0);
-#else
-    // Another way is to put the values one column at a time.
-    const double* p = &aDoubles[0];
-    for (SCSIZE i = 0; i < nC; ++i)
+    void freeDoublePointers() // free()s every recorded non-NULL pointer, newest first, leaving mnDoublePtrCount at 0.
    {
-        xInv->PutDouble(p, nR, i, 0);
-        p += nR;
+        while( mnDoublePtrCount > 0 )
+        {
+            double *dtmp = NULL;
+            if ( (dtmp = mdpSrcDoublePtr[--mnDoublePtrCount]) != NULL )
+            {
+                free( dtmp );
+                dtmp = NULL;
+            }
+        }
    }
-#endif
-
-    return xInv;
-}

-bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress& rTopPos,
-                                              const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode)
-{
-    generateRPNCode(rDoc, rTopPos, rCode);

-    size_t rowSize = xGroup->mnLength;
-    fprintf(stderr,"rowSize at begin is ...%ld.\n",(long)rowSize);
-    // The row quantity can be gotten from p2->GetArrayLength()
-    uint nCount1 = 0, nCount2 = 0, nCount3 = 0;
-    int nOclOp = 0;
-    double *rResult = NULL; // Point to the output data from GPU
-    rResult = (double *)malloc(sizeof(double) * rowSize*2);// For 2 columns(B,C)
-    if(NULL==rResult)
+    void srdDataPush( SourceData *temp ) // Pushes temp onto mSrcDataStack; when the stack is full only a message is printed and temp is not stored.
    {
-        printf("malloc err\n");
-        return false;
+        if( mnStackPointer < SRCDATASIZE )
+        {
+            mSrcDataStack[mnStackPointer++] = temp;
        }
-    memset(rResult,0,rowSize);
-    float * fpOclSrcData = NULL; // Point to the input data from CPU
-    double * dpOclSrcData = NULL;
-    uint * npOclStartPos = NULL; // The first position for calculation,for example,the A1 in (=MAX(A1:A100))
-    uint * npOclEndPos   = NULL; // The last position for calculation,for example, the A100 in (=MAX(A1:A100))
-    float * fpLeftData   = NULL; // Left input for binary operator(+,-,*,/),for example,(=leftData+rightData)
-    float * fpRightData  = NULL; // Right input for binary operator(+,-,*,/),for example,(=leftData/rightData)
-                                 // The rightData can't be zero for "/"
-    double * dpLeftData = NULL;
-    double * dpRightData = NULL;
-
-    float * fpSaveData=NULL;            //It is a temp pointer point the preparing memory;
-    float * fpSumProMergeLfData = NULL; //It merge the more col to one col is the left operator
-    float * fpSumProMergeRtData = NULL; //It merge the more col to one col is the right operator
-    double * dpSaveData=NULL;
-    double * dpSumProMergeLfData = NULL;
-    double * dpSumProMergeRtData = NULL;
-    uint * npSumSize=NULL;      //It is a array to save the matix sizt(col *row)
-    int nSumproductSize=0;      //It is the merge array size
-    bool aIsAlloc=false;        //It is a flag to judge the fpSumProMergeLfData existed
-    unsigned int nCountMatix=0; //It is a count to save the calculate times
-    static OclCalc ocl_calc;
-    bool isSumProduct=false;
-    if(ocl_calc.GetOpenclState())
+        else
+            printf( "The stack is full now.\n" );
+    }
+    SourceData *srdDataPop( void ) // Pops and returns the top entry; prints a message and returns NULL when the stack is empty.
    {
-        // Don't know how large the size will be applied previously, so create them as the rowSize or 65536
-        // Don't know which formulae will be used previously, so create buffers for different formulae used probably
-        if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
+        if( mnStackPointer <= 0 ) // mnStackPointer is unsigned, so this is true only when it equals 0
        {
-            ocl_calc.CreateBuffer64Bits(dpOclSrcData,npOclStartPos,npOclEndPos,rowSize);
-            ocl_calc.CreateBuffer64Bits(dpLeftData,dpRightData,rowSize);
+            printf( "The stack was empty\n" );
+            return NULL;
        }
-        else
+        return mSrcDataStack[--mnStackPointer];
+    }
+    unsigned int getDataSize() // Returns the number of entries currently on the operand stack.
    {
-            ocl_calc.CreateBuffer32Bits(fpOclSrcData,npOclStartPos,npOclEndPos,rowSize);
-            ocl_calc.CreateBuffer32Bits(fpLeftData,fpRightData,rowSize);
+        return mnStackPointer;
    }
-        //printf("pptrr is %d,%d,%d\n",fpOclSrcData,npOclStartPos,npOclEndPos);
+    void printStackInfo() // Debug helper: prints the stack size, then calls print() on each entry from top to bottom.
+    {
+        printf( "/********The stack size is %d*********\\\n",mnStackPointer );
+        for ( int i = mnStackPointer - 1; i >= 0; i-- )
+            mSrcDataStack[i]->print();
    }
-///////////////////////////////////////////////////////////////////////////////////////////
-
-    // Until we implement group calculation for real, decompose the group into
-    // individual formula token arrays for individual calculation.
-    ScAddress aTmpPos = rTopPos;
-    for (sal_Int32 i = 0; i < xGroup->mnLength; ++i)
+    bool getPosition(const ScTokenArray& rCode,const ScFormulaCellGroupRef& xGroup,uint nRowSize,uint *&npOclStartPos,uint *&npOclEndPos,uint *nPositonLen); // allocates and fills the start/end position arrays; defined below
+    bool chooseFunction(OclCalc &ocl_calc,double *&dpResult); // selects and runs an OclCalc routine for the collected operands; defined below
+    bool isStockHistory(); // matches mnOperatorGroup against hard-coded opcode sequences; defined below
+    bool isGroundWater(); // tests the first operator and operand counts; defined below
+};
+bool FormulaGroupInterpreterOpenCL::getPosition(const ScTokenArray& rCode,const ScFormulaCellGroupRef& xGroup,uint nRowSize,uint *&npOclStartPos,uint *&npOclEndPos,uint *nPositonLen) // Allocates npOclStartPos/npOclEndPos and fills them with the start/end row of every svDoubleVectorRef token for each row of the group; writes the entry count to *nPositonLen and returns false when that count exceeds the largest one remembered in the static nCountPosSize.
+{
+        uint nColPosition = 0;
+        ScTokenArray * rCodePos = rCode.Clone(); // NOTE(review): this clone is never deleted in this function - confirm who owns the result of Clone()
+        static int nCountPosSize = nRowSize; // function-local static: initialised from nRowSize on the first call only and kept across calls
+        bool isAllocFormulaOclBuf = true;
+        for ( const formula::FormulaToken* p = rCodePos->First(); p; p = rCodePos->Next() )
        {
-        aTmpPos.SetRow(xGroup->mnStart + i);
-        ScTokenArray aCode2;
-        for (const formula::FormulaToken* p = rCode.First(); p; p = rCode.Next())
+            switch ( p->GetType() )
            {
-            switch (p->GetType())
+                case formula::svDoubleVectorRef:
                {
-                case formula::svSingleVectorRef:
+                    nColPosition++; // first pass: count the svDoubleVectorRef tokens
+                    break;
+                }
+            }
+        }
+        int nPositionSize = nColPosition * nRowSize; // one slot per (svDoubleVectorRef token, row) pair
+        npOclStartPos = (unsigned int*) malloc( nPositionSize * sizeof(unsigned int) ); // NOTE(review): neither malloc result is checked, and the size is 0 when no svDoubleVectorRef token exists
+        npOclEndPos = (unsigned int*) malloc( nPositionSize * sizeof(unsigned int) );
+        if ( nCountPosSize < nPositionSize )
        {
-                    const formula::SingleVectorRefToken* p2 = static_cast<const formula::SingleVectorRefToken*>(p);
-                    const double* pArray = p2->GetArray();
-                    aCode2.AddDouble(static_cast<size_t>(i) < p2->GetArrayLength() ? pArray[i] : 0.0);
+            nCountPosSize = nPositionSize; // remember the new maximum
+            isAllocFormulaOclBuf = false;
        }
-                break;
+        for ( sal_Int32 i = 0; i < xGroup->mnLength; ++i )
+        {
+            ScTokenArray * rCodeTemp = rCode.Clone(); // NOTE(review): cloned once per row and never deleted here - confirm
+            int j = 0; // index of the current svDoubleVectorRef token within this row
+            for ( const formula::FormulaToken* p = rCodeTemp->First(); p; p = rCodeTemp->Next() )
+            {
+                switch (p->GetType())
+                {
                    case formula::svDoubleVectorRef:
                    {
                        const formula::DoubleVectorRefToken* p2 = static_cast<const formula::DoubleVectorRefToken*>(p);
-                    const std::vector<const double*>& rArrays = p2->GetArrays();
-                    size_t nColSize = rArrays.size();
                        size_t nRowStart = p2->IsStartFixed() ? 0 : i;
                        size_t nRowEnd = p2->GetRefRowSize() - 1;
                        if (!p2->IsEndFixed())
                            nRowEnd += i;
-                    size_t nRowSize = nRowEnd - nRowStart + 1;
-                    //store the a matix`s rowsize and colsize,use it to calculate the matix`s size
-                    ocl_calc.nFormulaRowSize = nRowSize;
-                    ocl_calc.nFormulaColSize = nColSize;
-                    ScMatrixRef pMat(new ScMatrix(nColSize, nRowSize, 0.0));
-                    if(ocl_calc.GetOpenclState())
-                    {
-                        npOclStartPos[i] = nRowStart; // record the start position
-                        npOclEndPos[i]   = nRowEnd;   // record the end position
-                    }
-                    int nTempOpcode;
-                    const formula::FormulaToken* pTemp = p;
-                    pTemp=aCode2.Next();
-                    nTempOpcode=pTemp->GetOpCode();
-                    while(1)
-                    {
-                        nTempOpcode=pTemp->GetOpCode();
-                        if(nTempOpcode!=ocOpen && nTempOpcode!=ocPush)
-                            break;
-                         pTemp=aCode2.Next();
+                        npOclStartPos[j*nRowSize+i] = nRowStart;//record the start position
+                        npOclEndPos[j*nRowSize+i] = nRowEnd;//record the end position
+                        j++; // the next token writes into the next nRowSize-long slice
+                    }
+                }
            }
-                    if((!aIsAlloc) && (ocl_calc.GetOpenclState())&& (nTempOpcode == ocSumProduct))
+        }
+        *nPositonLen = nPositionSize; // report how many entries each position array holds
+        //Now the pos array is 0 1 2 3 4 5  0 1 2 3 4 5;
+        return isAllocFormulaOclBuf;
+}
+
+bool FormulaGroupInterpreterOpenCL::isStockHistory() // Compares the leading entries of mnOperatorGroup with three hard-coded opcode sequences; on a match copies the corresponding name into mcHostName and returns true, otherwise returns false.
+{
+    bool isHistory = false;
+    if( (mnOperatorGroup[0]== 224) && (mnOperatorGroup[1]== 227) && (mnOperatorGroup[2]== 41) && (mnOperatorGroup[3]== 43) && (mnOperatorGroup[4]== 41) ) // NOTE(review): raw numbers, presumably OpCode values - confirm against the opcode definitions
    {
-                        //nColSize * rowSize is the data size , but except the the head of data will use less the nRowSize
-                        //the other all use nRowSize times . and it must aligen so add nRowSize-1.
-                        nSumproductSize = nRowSize+nColSize * rowSize*nRowSize-1;
-                        if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
-                            ocl_calc.CreateBuffer64Bits(dpSumProMergeLfData,dpSumProMergeRtData,npSumSize,nSumproductSize,rowSize);
-                        else
-                            ocl_calc.CreateBuffer32Bits(fpSumProMergeLfData,fpSumProMergeRtData,npSumSize,nSumproductSize,rowSize);
-                        aIsAlloc = true;
-                        isSumProduct=true;
+        strcpy(mcHostName,"OclOperationColumnN");
+        isHistory = true;
    }
-                    if(isSumProduct)
+    else if( (mnOperatorGroup[0] == 226) && (mnOperatorGroup[1] == 42) ) // second pattern: two operators
    {
-                        if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
+        strcpy(mcHostName,"OclOperationColumnH");
+        isHistory = true;
+    }
+    else if((mnOperatorGroup[0] == 213) && (mnOperatorGroup[1] == 43) && (mnOperatorGroup[2] == 42) ) // third pattern: three operators
    {
-                            if(nCountMatix%2==0)
-                                dpSaveData = dpSumProMergeLfData;
-                            else
-                                dpSaveData = dpSumProMergeRtData;
+        strcpy(mcHostName,"OclOperationColumnJ");
+        isHistory = true;
    }
-                        else
+    return isHistory;
+}
+
+bool FormulaGroupInterpreterOpenCL::isGroundWater() // Returns true when the first operator is ocAverage, ocMax or ocMin with mnSingleCount == 1, or is ocSub with mnSvDoubleCount == 1.
+{
+    bool GroundWater=false;
+
+    if((mnOperatorGroup[0] == ocAverage && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocMax && 1 == mnSingleCount )||
+        (mnOperatorGroup[0] == ocMin && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocSub && mnSvDoubleCount==1)) // the same condition is repeated at the top of chooseFunction()
    {
-                            if(nCountMatix%2==0)
-                                fpSaveData = fpSumProMergeLfData;
-                            else
-                                fpSaveData = fpSumProMergeRtData;
+        GroundWater = true;
    }
+    return GroundWater;
+}
+
+bool FormulaGroupInterpreterOpenCL::chooseFunction( OclCalc &ocl_calc, double *&dpResult ) // Picks an OclCalc routine from the operator list and operand counts and runs it with dpResult as output argument; returns true when a supported pattern was handled, false for the isStockHistory() case and for unsupported combinations.
+{
+    const double * dpOclSrcData = NULL;
+    unsigned int nSrcDataSize = 0;
+    const double *dpLeftData = NULL;
+    const double *dpRightData = NULL;
+    if((mnOperatorGroup[0] == ocAverage && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocMax && 1 == mnSingleCount )||
+        (mnOperatorGroup[0] == ocMin && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocSub && mnSvDoubleCount==1)) // same condition as isGroundWater()
+    {
+        double delta = 0.0;
+        const double *pArrayToSubtractOneElementFrom;
+        const double *pGroundWaterDataArray;
+        uint nSrcData = 0;
+        if( mnSvDoubleCount!=1 ) // data array comes from mDoubleArray[0]; otherwise no data array is passed and mdpSvdouble[0] is used as delta
+        {
+            pArrayToSubtractOneElementFrom= mSingleArray[0]->mdpInputLeftData;
+            pGroundWaterDataArray= mDoubleArray[0]->mdpInputData;
+            nSrcData = mDoubleArray[0]->mnInputDataSize;
        }
-                    for (size_t nCol = 0; nCol < nColSize; ++nCol)
+        else
        {
-                        const double* pArray = rArrays[nCol];
-                        if( NULL==pArray )
+            pArrayToSubtractOneElementFrom= mSingleArray[0]->mdpInputLeftData;
+            pGroundWaterDataArray=NULL;
+            delta = mdpSvdouble[0];
+        }
+        ocl_calc.oclGroundWaterGroup( mnOperatorGroup,mnOperatorCount,pGroundWaterDataArray,pArrayToSubtractOneElementFrom,nSrcData,mnRowSize,delta,mnpOclStartPos,mnpOclEndPos,dpResult);
+    }
+    else if( isStockHistory() ) // stock-history pattern: no OclCalc call is made here and the function returns false
    {
-                            fprintf(stderr,"Error: pArray is NULL!\n");
-                            free(rResult);
        return false;
    }
-                        if(ocl_calc.GetOpenclState())
+    else if(((mnSvDoubleCount==0)&&(mnSingleCount==0)&&(mnDoubleCount==1)) &&
+            ((mnOperatorGroup[0] == ocAverage)||(mnOperatorGroup[0] == ocMax)||(mnOperatorGroup[0] == ocMin))) // exactly one double-vector operand with AVERAGE, MAX or MIN
    {
-                            for( size_t u=nRowStart; u<=nRowEnd; u++ )
+        if(mnOperatorGroup[0] == ocAverage)
+            strcpy(mcHostName,"oclFormulaAverage");
+        if(mnOperatorGroup[0] == ocMax)
+            strcpy(mcHostName,"oclFormulaMax");
+        if(mnOperatorGroup[0] == ocMin)
+            strcpy(mcHostName,"oclFormulaMin");
+        DoubleVectorFormula * doubleTemp = mDoubleArray[--mnDoubleCount]; // takes the last entry and decrements mnDoubleCount
+        nSrcDataSize = doubleTemp->mnInputDataSize;
+        dpOclSrcData = doubleTemp->mdpInputData;
+        if ( ocl_calc.getOpenclState())
        {
-                                if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
+            if ( ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 ) // one of the fp64 flags is set: use the 64-bit routines
            {
-                                    dpOclSrcData[u] = pArray[u];
-                                    //fprintf(stderr,"dpOclSrcData[%d] is %f.\n",u,dpOclSrcData[u]);
-                                    if(isSumProduct)
-                                        dpSaveData[u+nRowSize*nCol + nRowStart* nColSize * nRowSize-nRowStart] = pArray[u];
+                ocl_calc.createFormulaBuf64Bits( nSrcDataSize, mnRowSize );
+                ocl_calc.mapAndCopy64Bits( dpOclSrcData,mnpOclStartPos,mnpOclEndPos,nSrcDataSize,mnRowSize );
+                ocl_calc.oclHostFormulaStatistics64Bits( mcHostName, dpResult, mnRowSize );
            }
            else
            {
-                                    // Many video cards can't support double type in kernel, so need transfer the double to float
-                                    fpOclSrcData[u] = (float)pArray[u];
-                                    //fprintf(stderr,"fpOclSrcData[%d] is %f.\n",u,fpOclSrcData[u]);
-                                    if(isSumProduct)
-                                        fpSaveData[u+nRowSize*nCol + nRowStart* nColSize * nRowSize-nRowStart] = (float)pArray[u];
+                ocl_calc.createFormulaBuf32Bits( nSrcDataSize, mnPositonLen ); // NOTE(review): the 64-bit branch passes mnRowSize as second argument - confirm which one is intended
+                ocl_calc.mapAndCopy32Bits( dpOclSrcData, mnpOclStartPos, mnpOclEndPos, nSrcDataSize, mnRowSize);
+                ocl_calc.oclHostFormulaStatistics32Bits( mcHostName, dpResult, mnRowSize );
            }
        }
    }
-
-                        for (size_t nRow = 0; nRow < nRowSize; ++nRow)
+    else if((mnSvDoubleCount==0)&&(mnSingleCount==1)&&(mnDoubleCount==0)) // exactly one single-vector entry: binary operator on its left and right arrays
    {
-                            if (nRowStart + nRow < p2->GetArrayLength())
+        dpLeftData = mSingleArray[0]->mdpInputLeftData;
+        dpRightData =  mSingleArray[0]->mdpInputRightData; // NOTE(review): mcHostName keeps its previous contents when the operator is none of the four tested below
+        if(mnOperatorGroup[0] == ocAdd)
+            strcpy(mcHostName,"oclSignedAdd");
+        if(mnOperatorGroup[0] == ocSub)
+            strcpy(mcHostName,"oclSignedSub");
+        if(mnOperatorGroup[0] == ocMul)
+            strcpy(mcHostName,"oclSignedMul");
+        if(mnOperatorGroup[0] == ocDiv)
+            strcpy(mcHostName,"oclSignedDiv");
+        if ( ocl_calc.getOpenclState())
        {
-                                double fVal = pArray[nRowStart+nRow];
-                                pMat->PutDouble(fVal, nCol, nRow);
-                            }
-                        }
+            if ( ocl_calc.gpuEnv.mnKhrFp64Flag == 1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 )
+            {
+                ocl_calc.createArithmeticOptBuf64Bits( mnRowSize );
+                ocl_calc.mapAndCopy64Bits(dpLeftData,dpRightData,mnRowSize);
+                ocl_calc.oclHostArithmeticOperator64Bits( mcHostName,dpResult,mnRowSize );
            }
-
-                    ScMatrixToken aTok(pMat);
-                    aCode2.AddToken(aTok);
-                    if(isSumProduct)
+            else
            {
-                        npSumSize[nCountMatix/2] =nRowSize*nColSize;
-                        nCountMatix++;
+                ocl_calc.createArithmeticOptBuf32Bits( mnRowSize );
+                ocl_calc.mapAndCopy32Bits(dpLeftData,dpRightData,mnRowSize);
+                ocl_calc.oclHostArithmeticOperator32Bits( mcHostName,dpResult,mnRowSize );
            }
        }
-                break;
-                default:
-                    aCode2.AddToken(*p);
    }
+    else if( (mnSingleCount>1) && (mnSvDoubleCount==0) && (mnDoubleCount==0) ) // several single-vector entries: all their arrays are concatenated and passed on with the whole operator list
+    {
+        const double* dpArray[100] = {}; // NOTE(review): the loop below stores up to two pointers per entry with no bound check against 100
+        int j=0;
+        for( uint i = 0; i < mnSingleCount; i++ )
+        {
+            dpArray[j++] = mSingleArray[i]->mdpInputLeftData;
+            if( NULL != mSingleArray[i]->mdpInputRightData )
+                dpArray[j++] = mSingleArray[i]->mdpInputRightData;
        }
-
-        ScFormulaCell* pDest = rDoc.GetFormulaCell(aTmpPos);
-        if (!pDest)
+        double *dpMoreColArithmetic = (double *)malloc( sizeof(double) * j * mnRowSize ); // NOTE(review): not freed in this function - confirm whether a callee takes ownership
+        if( NULL == dpMoreColArithmetic )
        {
-            free(rResult);
+            printf("Memory alloc error!\n");
            return false;
        }
-        if(ocl_calc.GetOpenclState())
-        {
-            const formula::FormulaToken *pCur = aCode2.First();
-            aCode2.Reset();
-            while( ( pCur = aCode2.Next() ) != NULL )
+        for( uint i = 0; i < j*mnRowSize; i++ )
        {
-                OpCode eOp = pCur->GetOpCode();
-                if(eOp==0)
+            dpMoreColArithmetic[i] = dpArray[i/mnRowSize][i%mnRowSize]; // arrays laid out back to back: array index i/mnRowSize, element index i%mnRowSize
+        }
+        if ( ocl_calc.getOpenclState())
        {
-                    if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
+            if ( ocl_calc.gpuEnv.mnKhrFp64Flag == 1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 )
            {
-                        if(nCount3%2==0)
-                            dpLeftData[nCount1++] = pCur->GetDouble();
-                        else
-                            dpRightData[nCount2++] = pCur->GetDouble();
-                        nCount3++;
+                ocl_calc.createMoreColArithmeticBuf64Bits( j * mnRowSize, mnOperatorCount );
+                ocl_calc.mapAndCopyMoreColArithmetic64Bits( dpMoreColArithmetic, mnRowSize * j, mnOperatorGroup, mnOperatorCount );
+                ocl_calc.oclMoreColHostArithmeticOperator64Bits( mnRowSize, mnOperatorCount, dpResult,mnRowSize );
            }
            else
            {
-                        if(nCount3%2==0)
-                            fpLeftData[nCount1++] = (float)pCur->GetDouble();
-                        else
-                            fpRightData[nCount2++] = (float)pCur->GetDouble();
-                        nCount3++;
-                    }
+                ocl_calc.createMoreColArithmeticBuf32Bits( j* mnRowSize, mnOperatorCount );
+                ocl_calc.mapAndCopyMoreColArithmetic32Bits(dpMoreColArithmetic, mnRowSize * j, mnOperatorGroup, mnOperatorCount);
+                ocl_calc.oclMoreColHostArithmeticOperator32Bits( mnRowSize, mnOperatorCount, dpResult, mnRowSize );
            }
-                else if( eOp!=ocOpen && eOp!=ocClose &&eOp != ocSep)
-                    nOclOp = eOp;
-
-//              if(count1>0){//dbg
-//                  fprintf(stderr,"leftData is %f.\n",fpLeftData[count1-1]);
-//                  count1--;
-//              }
-//              if(count2>0){//dbg
-//                  fprintf(stderr,"rightData is %f.\n",fpRightData[count2-1]);
-//                  count2--;
-//              }
        }
    }
-
-        if(!getenv("SC_GPU")||!ocl_calc.GetOpenclState())
+    else // no supported pattern matched
    {
-            //fprintf(stderr,"ccCPU flow...\n\n");
-            generateRPNCode(rDoc, aTmpPos, aCode2);
-            ScInterpreter aInterpreter(pDest, &rDoc, aTmpPos, aCode2);
-            aInterpreter.Interpret();
-            pDest->SetResultToken(aInterpreter.GetResultToken().get());
-            pDest->ResetDirty();
-            pDest->SetChanged(true);
+        return false;
    }
-    } // for loop end (xGroup->mnLength)
+    return true;
+}

-    // For GPU calculation
-    if(getenv("SC_GPU")&&ocl_calc.GetOpenclState())
-    {
-        fprintf(stderr,"ggGPU flow...\n\n");
-        printf(" oclOp is... %d\n",nOclOp);
-        osl_getSystemTime(&aTimeBefore); //timer
-        if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
+class agency // Helper class whose only member, calculate(), is defined right after this declaration.
+{
+public:
+    double *calculate(int nOclOp,int rowSize,OclCalc &ocl_calc,uint *npOclStartPos,uint *npOclEndPos,FormulaGroupInterpreterOpenCL *formulaInterprt); // switches on nOclOp; operands are popped from formulaInterprt's SourceData stack
+};
+
+double * agency::calculate( int nOclOp,int rowSize,OclCalc &ocl_calc,uint *npOclStartPos,uint *npOclEndPos,FormulaGroupInterpreterOpenCL *formulaInterprt)
+{
+    const double *dpLeftData = NULL;
+    const double *dpRightData = NULL;
+    const double *dpOclSrcData=NULL;
+    if ( ocl_calc.gpuEnv.mnKhrFp64Flag == 1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 )
    {
-            fprintf(stderr,"ggGPU double precision flow...\n\n");
-            //double precision
-            switch(nOclOp)
+        switch( nOclOp )
        {
            case ocAdd:
-                    ocl_calc.OclHostArithmeticOperator64Bits("oclSignedAdd",dpLeftData,dpRightData,rResult,nCount1);
+            {
+                unsigned int nDataSize = 0;
+                SourceData *temp = formulaInterprt->srdDataPop();
+                SourceData *temp2 = formulaInterprt->srdDataPop();
+                nDataSize = temp2->getDataSize();
+                dpLeftData = temp2->getDouleData();
+                dpRightData = temp->getDouleData();
+                nDataSize = temp2->getDataSize();
+                double *rResult = NULL; // Point to the output data from GPU
+                rResult = (double *)malloc( sizeof(double) * nDataSize );
+                memset(rResult,0,rowSize);
+                ocl_calc.oclHostArithmeticStash64Bits( "oclSignedAdd",dpLeftData,dpRightData,rResult,temp->getDataSize() );
+                formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) );
                break;
+            }
            case ocSub:
-                    ocl_calc.OclHostArithmeticOperator64Bits("oclSignedSub",dpLeftData,dpRightData,rResult,nCount1);
+            {
+                unsigned int nDataSize = 0;
+                SourceData *temp = formulaInterprt->srdDataPop();
+                SourceData *temp2 = formulaInterprt->srdDataPop();
+                nDataSize = temp2->getDataSize();
+                dpLeftData = temp2->getDouleData();
+                dpRightData = temp->getDouleData();
+                nDataSize = temp2->getDataSize();
+                double *rResult = NULL; // Point to the output data from GPU
+                rResult = ( double * )malloc( sizeof(double) * nDataSize );
+                memset( rResult,0,rowSize );
+                ocl_calc.oclHostArithmeticStash64Bits( "oclSignedSub",dpLeftData,dpRightData,rResult,temp->getDataSize() );
+                formulaInterprt->srdDataPush( new SourceData(rResult,nDataSize) );
                break;
+            }
            case ocMul:
-                    ocl_calc.OclHostArithmeticOperator64Bits("oclSignedMul",dpLeftData,dpRightData,rResult,nCount1);
+            {
+                unsigned int nDataSize = 0;
+                SourceData *temp = formulaInterprt->srdDataPop();
+                SourceData *temp2 = formulaInterprt->srdDataPop();
+                nDataSize = temp2->getDataSize();
+                dpLeftData = temp2->getDouleData();
+                dpRightData = temp->getDouleData();
+                nDataSize = temp2->getDataSize();
+                double *rResult = NULL; // Point to the output data from GPU
+                rResult = (double *)malloc( sizeof(double) * nDataSize );
+                memset( rResult,0,rowSize );
+                ocl_calc.oclHostArithmeticStash64Bits( "oclSignedMul",dpLeftData,dpRightData,rResult,temp->getDataSize() );
+                formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) );
                break;
+            }
            case ocDiv:
-                    ocl_calc.OclHostArithmeticOperator64Bits("oclSignedDiv",dpLeftData,dpRightData,rResult,nCount1);
+            {
+                unsigned int nDataSize = 0;
+                SourceData *temp = formulaInterprt->srdDataPop();
+                SourceData *temp2 = formulaInterprt->srdDataPop();
+                nDataSize = temp2->getDataSize();
+                dpLeftData = temp2->getDouleData();
+                dpRightData = temp->getDouleData();
+                nDataSize = temp2->getDataSize();
+                double *rResult = NULL; // Point to the output data from GPU
+                rResult = ( double * )malloc( sizeof(double) * nDataSize );
+                memset( rResult,0,rowSize );
+                ocl_calc.oclHostArithmeticStash64Bits( "oclSignedDiv",dpLeftData,dpRightData,rResult,temp->getDataSize() );
+                formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) );
                break;
+            }
            case ocMax:
-                    ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaMax",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
+            {
+                unsigned int nDataSize = 0;
+                SourceData *temp = formulaInterprt->srdDataPop();
+                nDataSize = temp->getDataSize();
+                dpOclSrcData = temp->getDouleData();
+                double *rResult = NULL; // Point to the output data from GPU
+                rResult = (double *)malloc( sizeof(double) * rowSize );
+                memset( rResult,0,rowSize );
+                ocl_calc.oclHostFormulaStash64Bits( "oclFormulaMax",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,nDataSize,rowSize );
+                formulaInterprt->srdDataPush( new SourceData( rResult,rowSize ) );
                break;
+            }
            case ocMin:
-                    ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaMin",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
+            {
+                unsigned int nDataSize = 0;
+                SourceData *temp = formulaInterprt->srdDataPop();
+                nDataSize = temp->getDataSize();
+                dpOclSrcData = temp->getDouleData();
+                double *rResult = NULL; // Point to the output data from GPU
+                rResult = (double *)malloc( sizeof(double) * rowSize );
+                memset( rResult,0,rowSize );
+                ocl_calc.oclHostFormulaStash64Bits( "oclFormulaMin",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,nDataSize,rowSize );
+                formulaInterprt->srdDataPush( new SourceData( rResult,rowSize ) );
                break;
+            }
            case ocAverage:
-                    ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaAverage",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
-                    break;
-                case ocSum:
-                    ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaSum",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
-                    break;
-                case ocCount:
-                    ocl_calc.OclHostFormulaCount64Bits(npOclStartPos,npOclEndPos,rResult,rowSize);
-                    break;
-                case ocSumProduct:
-                    ocl_calc.OclHostFormulaSumProduct64Bits(dpSumProMergeLfData,dpSumProMergeRtData,npSumSize,rResult,rowSize);
+            {
+                unsigned int nDataSize = 0;
+                SourceData *temp = formulaInterprt->srdDataPop();
+                nDataSize = temp->getDataSize();
+                dpOclSrcData = temp->getDouleData();
+                double *rResult = NULL; // Point to the output data from GPU
+                rResult = (double *)malloc( sizeof(double) * rowSize );
+                memset( rResult,0,rowSize );
+                ocl_calc.oclHostFormulaStash64Bits( "oclFormulaAverage",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,nDataSize,rowSize );
+                formulaInterprt->srdDataPush( new SourceData( rResult,rowSize ) );
                break;
+            }
            default:
-                    fprintf(stderr,"No OpenCL function for this calculation.\n");
+                fprintf( stderr,"No OpenCL function for this calculation.\n" );
                break;
        }
    }
    else
    {
-            fprintf(stderr,"ggGPU float precision flow...\n\n");
-            //float precision
-            switch(nOclOp)
+        switch( nOclOp )
        {
            case ocAdd:
-                    ocl_calc.OclHostArithmeticOperator32Bits("oclSignedAdd",fpLeftData,fpRightData,rResult,nCount1);
+            {
+                unsigned int nDataSize = 0;
+                SourceData *temp = formulaInterprt->srdDataPop();
+                SourceData *temp2 = formulaInterprt->srdDataPop();
+                nDataSize = temp2->getDataSize();
+                dpLeftData = temp2->getDouleData();
+                dpRightData = temp->getDouleData();
+                nDataSize = temp2->getDataSize();
+                double *rResult = NULL; // Point to the output data from GPU
+                rResult = (double *)malloc( sizeof(double) * nDataSize );
+                memset(rResult,0,rowSize);
+                ocl_calc.oclHostArithmeticStash32Bits( "oclSignedAdd", dpLeftData, dpRightData, rResult, temp->getDataSize() );
+                formulaInterprt->srdDataPush( new SourceData(rResult, nDataSize) );
                break;
+            }
            case ocSub:
-                    ocl_calc.OclHostArithmeticOperator32Bits("oclSignedSub",fpLeftData,fpRightData,rResult,nCount1);
+            {
+                unsigned int nDataSize = 0;
+                SourceData *temp = formulaInterprt->srdDataPop();
+                SourceData *temp2 = formulaInterprt->srdDataPop();
+                nDataSize = temp2->getDataSize();
+                dpLeftData = temp2->getDouleData();
+                dpRightData = temp->getDouleData();
+                nDataSize = temp2->getDataSize();
+                double *rResult = NULL; // Point to the output data from GPU
+                rResult = (double *)malloc( sizeof(double) * nDataSize );
+                memset( rResult, 0, rowSize );
+                ocl_calc.oclHostArithmeticStash32Bits( "oclSignedSub", dpLeftData, dpRightData, rResult, temp->getDataSize() );
+                formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) );
                break;
+            }
            case ocMul:
-                    ocl_calc.OclHostArithmeticOperator32Bits("oclSignedMul",fpLeftData,fpRightData,rResult,nCount1);
+            {
+                unsigned int nDataSize = 0;
+                SourceData *temp = formulaInterprt->srdDataPop();
+                SourceData *temp2 = formulaInterprt->srdDataPop();
+                nDataSize = temp2->getDataSize();
+                dpLeftData = temp2->getDouleData();
+                dpRightData = temp->getDouleData();
+                nDataSize = temp2->getDataSize();
+                double *rResult = NULL; // Point to the output data from GPU
+                rResult = (double *)malloc(sizeof(double) * nDataSize );
+                memset( rResult, 0, rowSize );
+                ocl_calc.oclHostArithmeticStash32Bits( "oclSignedMul", dpLeftData, dpRightData, rResult, temp->getDataSize() );
+                formulaInterprt->srdDataPush( new SourceData( rResult, nDataSize ) );
                break;
+            }
            case ocDiv:
-                    ocl_calc.OclHostArithmeticOperator32Bits("oclSignedDiv",fpLeftData,fpRightData,rResult,nCount1);
+            {
+                unsigned int nDataSize = 0;
+                SourceData *temp = formulaInterprt->srdDataPop();
+                SourceData *temp2 = formulaInterprt->srdDataPop();
+                nDataSize = temp2->getDataSize();
+                dpLeftData = temp2->getDouleData();
+                dpRightData = temp->getDouleData();
+                nDataSize = temp2->getDataSize();
+                double *rResult = NULL; // Point to the output data from GPU
+                rResult = (double *)malloc( sizeof(double) * nDataSize );
+                memset( rResult, 0, rowSize );
+                ocl_calc.oclHostArithmeticStash32Bits( "oclSignedDiv", dpLeftData, dpRightData, rResult, temp->getDataSize() );
+                formulaInterprt->srdDataPush( new SourceData(rResult, nDataSize) );
                break;
+            }
            case ocMax:
-                    ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaMax",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
+            {
+                unsigned int nDataSize = 0;
+                SourceData *temp = formulaInterprt->srdDataPop();
+                nDataSize = temp->getDataSize();
+                dpOclSrcData = temp->getDouleData();
+                double *rResult = NULL; // Point to the output data from GPU
+                rResult = (double *)malloc(sizeof(double) * nDataSize );
+                memset(rResult,0,rowSize);
+                ocl_calc.oclHostFormulaStash32Bits( "oclFormulaMax", dpOclSrcData, npOclStartPos, npOclEndPos, rResult,nDataSize, rowSize );
+                formulaInterprt->srdDataPush( new SourceData( rResult, rowSize ) );
                break;
+            }
            case ocMin:
-                    ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaMin",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
+            {
+                unsigned int nDataSize = 0;
+                SourceData *temp = formulaInterprt->srdDataPop();
+                nDataSize = temp->getDataSize();
+                dpOclSrcData = temp->getDouleData();
+                double *rResult = NULL; // Point to the output data from GPU
+                rResult = (double *)malloc( sizeof(double) * nDataSize );
+                memset( rResult, 0, rowSize );
+                ocl_calc.oclHostFormulaStash32Bits( "oclFormulaMin", dpOclSrcData, npOclStartPos, npOclEndPos, rResult, nDataSize, rowSize );
+                formulaInterprt->srdDataPush( new SourceData( rResult, rowSize) );
                break;
+            }
            case ocAverage:
-                    ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaAverage",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
-                    break;
-                case ocSum:
-                    ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaSum",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
-                    break;
-                case ocCount:
-                    ocl_calc.OclHostFormulaCount32Bits(npOclStartPos,npOclEndPos,rResult,rowSize);
-                    break;
-                case ocSumProduct:
-                    ocl_calc.OclHostFormulaSumProduct32Bits(fpSumProMergeLfData,fpSumProMergeRtData,npSumSize,rResult,rowSize);
+            {
+                unsigned int nDataSize = 0;
+                SourceData *temp = formulaInterprt->srdDataPop();
+                nDataSize = temp->getDataSize();
+                dpOclSrcData = temp->getDouleData();
+                double *rResult = NULL; // Point to the output data from GPU
+                rResult = (double *)malloc( sizeof(double) * nDataSize );
+                memset( rResult, 0, rowSize);
+                ocl_calc.oclHostFormulaStash32Bits( "oclFormulaAverage", dpOclSrcData, npOclStartPos, npOclEndPos, rResult, nDataSize, rowSize );
+                formulaInterprt->srdDataPush( new SourceData( rResult, rowSize) );
                break;
+            }
            default:
                fprintf(stderr,"No OpenCL function for this calculation.\n");
                break;
        }
    }
+    return NULL;
+}
+
+/**
+ * Inverts a square matrix on the OpenCL device.
+ *
+ * @param rMat matrix to invert; it must be square and non-empty.
+ * @return the matrix built from the values left in aDoubles after the OpenCL
+ *         call, or an empty ScMatrixRef when the input is not square, is
+ *         empty, or no OpenCL environment is available. An empty reference
+ *         tells the caller to calculate the inverse on the CPU.
+ */
+ScMatrixRef FormulaGroupInterpreterOpenCL::inverseMatrix( const ScMatrix& rMat )
+{
+    SCSIZE nC, nR;
+    rMat.GetDimensions( nC, nR );
+    if ( nC != nR || nC == 0 )
+        // Input matrix must be square. Return an empty matrix on failure and
+        // the caller will calculate it via CPU.
+        return ScMatrixRef();
+
+    // This vector will contain a series of doubles from the first column to
+    // the last, chained together in a single array.
+    std::vector<double> aDoubles;
+    rMat.GetDoubleArray(aDoubles);

-        /////////////////////////////////////////////////////
-        osl_getSystemTime(&aTimeAfter);
-        double diff = getTimeDiff(aTimeAfter, aTimeBefore);
-        //if (diff >= 1.0)
+    float * fpOclMatrixSrc = NULL;
+    float * fpOclMatrixDst = NULL;
+    double * dpOclMatrixSrc = NULL;
+    double * dpOclMatrixDst = NULL;
+    uint nMatrixSize = nC * nR;
+    static OclCalc aOclCalc;
+    if ( aOclCalc.getOpenclState() )
+    {
+        // Use double precision when the device advertises an fp64 extension,
+        // otherwise fall back to single precision.
+        if ( aOclCalc.gpuEnv.mnKhrFp64Flag == 1 || aOclCalc.gpuEnv.mnAmdFp64Flag == 1 )
        {
-            fprintf(stderr,"OpenCL,diff...%f.\n",diff);
+            aOclCalc.createBuffer64Bits( dpOclMatrixSrc, dpOclMatrixDst, nMatrixSize );
+            // Transpose the column-wise source array while copying it into
+            // the device source buffer.
+            for ( uint i = 0; i < nC; i++ )
+                for ( uint j = 0; j < nR; j++ )
+                    dpOclMatrixSrc[i*nC+j] = aDoubles[j*nR+i];
+            aOclCalc.oclHostMatrixInverse64Bits( "oclFormulaMtxInv", dpOclMatrixSrc, dpOclMatrixDst,aDoubles, nR );
+        }
+        else
+        {
+            aOclCalc.createBuffer32Bits( fpOclMatrixSrc, fpOclMatrixDst, nMatrixSize );
+            for ( uint i = 0; i < nC; i++ )
+                for ( uint j = 0; j < nR; j++ )
+                    fpOclMatrixSrc[i*nC+j] = (float) aDoubles[j*nR+i];
+            aOclCalc.oclHostMatrixInverse32Bits( "oclFormulaMtxInv", fpOclMatrixSrc, fpOclMatrixDst, aDoubles, nR );
+        }
    }
+    else
+    {
+        // Without a usable OpenCL environment nothing has been inverted, so
+        // aDoubles still holds the input values. Report failure with an
+        // empty matrix instead of handing the original back as its inverse.
+        return ScMatrixRef();
+    }
-/////////////////////////////////////////////////////

-//rResult[i];
-//           for(sal_Int32 i = 0; i < rowSize; ++i){//dbg output results
-//               fprintf(stderr,"After GPU,rRsults[%d] is ...%f\n",i,rResult[i]);
-//           }
+    // Copy the contents of aDoubles into the result matrix.
+    // NOTE(review): aDoubles is passed to oclHostMatrixInverse*Bits above,
+    // which presumably overwrites it with the inverted values - confirm.
+    // There are two ways to put the values back (depending on what the GPU
+    // output is).
+    ScMatrixRef xInv(new ScMatrix(nR, nR, 0.0));

-        // Insert the double data, in rResult[i] back into the document
-        rDoc.SetFormulaResults(rTopPos, rResult, xGroup->mnLength);
+#if 0
+    // One way is to put the whole value as one array. This method assumes
+    // that the array size equals column x row, and is oriented column-wise.
+    // This method is slightly more efficient than the second, but I wouldn't
+    // expect too much of a difference.
+    xInv->PutDouble(&aDoubles[0], aDoubles.size(), 0, 0);
+#else
+    // Another way is to put the values one column at a time.
+    const double* p = &aDoubles[0];
+    for( SCSIZE i = 0; i < nC; ++i )
+    {
+        xInv->PutDouble( p, nR, i, 0 );
+        p += nR;
    }
+#endif
+
+    return xInv;
+}
+/**
+ * Evaluates a formula group with OpenCL and writes the results into the
+ * document.
+ *
+ * The RPN tokens of rCode are walked twice: a first pass records the
+ * operators, then either the compound "sample" path (chosen when
+ * isGroundWater() is true, evaluated by chooseFunction()) or the generic
+ * path (each operator is handed to agency::calculate() in RPN order, with
+ * operands kept on the SourceData stack) computes the result.
+ *
+ * @param rDoc    document that receives the results via SetFormulaResults().
+ * @param rTopPos position passed to SetFormulaResults() and generateRPNCode().
+ * @param xGroup  formula group; its mnLength is used as the row count.
+ * @param rCode   token array whose RPN sequence is interpreted.
+ * @return true when the results were stored; false when OpenCL is not
+ *         available or chooseFunction() fails.
+ */
+bool FormulaGroupInterpreterOpenCL::interpret( ScDocument& rDoc, const ScAddress& rTopPos,
+                                        const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode )
+{
+    generateRPNCode( rDoc, rTopPos, rCode );
+    mnRowSize = xGroup->mnLength;
+    fprintf( stderr,"mnRowSize at begin is ...%ld.\n",(long)mnRowSize );
+    // The row quantity can be gotten from p2->GetArrayLength()
+    int nOclOp = 0;
+    const double * dpOclSrcData = NULL;
+    const double * dpBinaryData = NULL;
+    static OclCalc ocl_calc;
+    unsigned int nSrcDataSize = 0;

-    free(rResult);
+    const double *dpResult = NULL;
+    // Output buffer for the sample path; owned and released by this function.
+    double *pResult = (double *)malloc(sizeof(double) * mnRowSize);
+    double *dpSvDouble = NULL;
+    bool isSample = false;
+
+    mnSingleCount = 0;
+    mnDoubleCount = 0;
+    mnSvDoubleCount = 0;
+    mnOperatorCount = 0;
+    mnPositonLen = 0;
+    if ( ocl_calc.getOpenclState() )
+    {
+        getPosition(rCode,xGroup,mnRowSize,mnpOclStartPos,mnpOclEndPos,&mnPositonLen);
+        const formula::FormulaToken* p = rCode.FirstRPN();

+        bool isSingle = false;
+        int nCountNum=0;
+        // First pass: record the operators and count the operand kinds.
+        // NOTE(review): the counters are reset right after this pass, so only
+        // the contents of mnOperatorGroup survive; presumably isGroundWater()
+        // inspects them - confirm.
+        do
+        {
+            if ( ocPush != p->GetOpCode())
+            {
+                nOclOp = p->GetOpCode();
+                mnOperatorGroup[mnOperatorCount++] = nOclOp;
+            }
+            else if( ocPush == p->GetOpCode() && formula::svSingleVectorRef == p->GetType() )
+            {
+                mnSingleCount++;
+            }
+            if ( ocPush == p->GetOpCode() && formula::svDouble == p->GetType() )
+            {
+                mnSvDoubleCount++;
+            }
+        } while ( NULL != ( p = rCode.NextRPN() ) );
+        if( isGroundWater() )
+        {
+            isSample = true;
+        }
+        mnOperatorCount = 0;
+        mnSingleCount = 0;
+        mnSvDoubleCount = 0;
+        p = rCode.FirstRPN();
+        if(isSample)
+        {
+            // Sample path: collect all operands and operators first, then let
+            // chooseFunction() evaluate the compound formula in one go.
+            do
+            {
+                if ( ocPush == p->GetOpCode() && formula::svDouble == p->GetType() )
+                {
+                    mdpSvdouble[mnSvDoubleCount++] = p->GetDouble();
+                }
+                else if( ocPush == p->GetOpCode() && formula::svDoubleVectorRef == p->GetType())
+                {
+                    const formula::DoubleVectorRefToken* pDvr = static_cast< const formula::DoubleVectorRefToken* >( p );
+                    const std::vector< const double* >& rArrays = pDvr->GetArrays();
+                    uint rArraysSize = rArrays.size();
+                    int nMoreColSize = 0;
+                    if( rArraysSize > 1 )
+                    {
+                        // Concatenate all columns into one contiguous buffer.
+                        // NOTE(review): on this path the merged buffer is never
+                        // stored in mDoubleArray nor released - looks like a
+                        // leak and an unfinished feature; confirm.
+                        double *dpMoreColData = NULL;
+                        for ( uint loop=0; loop < rArraysSize; loop++ )
+                        {
+                            dpOclSrcData = rArrays[loop];
+                            nSrcDataSize = pDvr->GetArrayLength();
+                            nMoreColSize += nSrcDataSize;
+                            dpMoreColData = (double *) realloc(dpMoreColData,nMoreColSize * sizeof(double));
+                            for ( uint j = nMoreColSize - nSrcDataSize, i = 0; i < nSrcDataSize; i++, j++ )
+                            {
+                                dpMoreColData[j] = dpOclSrcData[i];
+                            }
+                        }
+                        dpOclSrcData = dpMoreColData;
+                        nSrcDataSize = nMoreColSize;
+                    }
+                    else
+                    {
+                        dpOclSrcData = rArrays[0];
+                        nSrcDataSize = pDvr->GetArrayLength();
+                        // Allocate the descriptor only in the branch that
+                        // stores it, so the multi-column branch does not leak it.
+                        DoubleVectorFormula *SvDoubleTemp = new DoubleVectorFormula();
+                        SvDoubleTemp->mdpInputData = dpOclSrcData;
+                        SvDoubleTemp->mnInputDataSize = nSrcDataSize;
+                        SvDoubleTemp->mnInputStartPosition = mnpOclStartPos[nCountNum*mnRowSize];
+                        SvDoubleTemp->mnInputEndPosition = mnpOclEndPos[nCountNum*mnRowSize];
+                        // Offsets are the distance between the windows of the
+                        // first two rows.
+                        SvDoubleTemp->mnInputStartOffset = mnpOclStartPos[nCountNum*mnRowSize+1]-mnpOclStartPos[nCountNum*mnRowSize];
+                        SvDoubleTemp->mnInputEndOffset = mnpOclEndPos[nCountNum*mnRowSize+1]-mnpOclEndPos[nCountNum*mnRowSize];
+                        mDoubleArray[mnDoubleCount++] = SvDoubleTemp;
+                        nCountNum++;
+                    }
+                }
+                else if( ocPush == p->GetOpCode() && formula::svSingleVectorRef == p->GetType() )
+                {
+                    const formula::SingleVectorRefToken* pSvr = static_cast<const formula::SingleVectorRefToken*>( p );
+                    dpBinaryData = pSvr->GetArray();
+                    uint nArrayLen = pSvr->GetArrayLength();
+                    // Both branches below assign the pointer, so no object is
+                    // allocated up front (that allocation used to be leaked).
+                    SingleVectorFormula *SignleTemp = NULL;
+                    if(isSingle)
+                    {
+                        // Second operand of a pair: complete the entry that
+                        // the previous single-vector token created.
+                        SignleTemp = mSingleArray[--mnSingleCount];
+                        SignleTemp->mdpInputRightData = dpBinaryData;
+                        SignleTemp->mnInputRightDataSize = nArrayLen;
+                        SignleTemp->mnInputRightStartPosition = 0;
+                        SignleTemp->mnInputRightOffset = 0;
+                        isSingle = false;
+                    }
+                    else
+                    {
+                        // First operand of a pair: start a new entry.
+                        SignleTemp = new SingleVectorFormula();
+                        SignleTemp->mdpInputLeftData = dpBinaryData;
+                        SignleTemp->mnInputLeftDataSize = nArrayLen;
+                        SignleTemp->mdpInputRightData = NULL;
+                        SignleTemp->mnInputRightDataSize = 0;
+                        SignleTemp->mnInputLeftStartPosition = 0;
+                        SignleTemp->mnInputLeftOffset = 0;
+                        isSingle = true;
+                    }
+                    mSingleArray[mnSingleCount++] = SignleTemp;
+                }
+                else
+                {
+                    nOclOp = p->GetOpCode();
+                    mnOperatorGroup[mnOperatorCount++] = nOclOp;
+                }
+            } while ( NULL != ( p = rCode.NextRPN() ) );
+            if ( !chooseFunction( ocl_calc, pResult ) )
+            {
+                // The output buffer belongs to this function; release it
+                // before reporting failure.
+                free( pResult );
+                return false;
+            }
+            else
+                dpResult = pResult;
+        }
+        else
+        {
+            // Generic path: operands are pushed onto the SourceData stack and
+            // every operator is calculated as soon as it is met.
+            agency aChooseAction;
+
+            do
+            {
+                if ( ocPush == p->GetOpCode() && formula::svDouble == p->GetType() )
+                {
+                    // Expand the scalar into a full column so it can be used
+                    // like any other vector operand.
+                    dpSvDouble = (double *) malloc( sizeof(double ) * mnRowSize );
+                    double dTempValue = p->GetDouble();
+                    for ( uint i = 0; i < mnRowSize; i++ )
+                        dpSvDouble[i] = dTempValue;
+                    srdDataPush( new SourceData( dpSvDouble, mnRowSize ) );
+                    collectDoublePointers( dpSvDouble );
+                }
+                else if( ocPush == p->GetOpCode() && formula::svDoubleVectorRef == p->GetType())
+                {
+                    const formula::DoubleVectorRefToken* pDvr = static_cast< const formula::DoubleVectorRefToken* >( p );
+                    const std::vector< const double* >& rArrays = pDvr->GetArrays();
+                    unsigned int rArraysSize = rArrays.size();
+                    int nMoreColSize = 0;
+                    if(rArraysSize > 1)
+                    {
+                        // Concatenate all columns into one contiguous buffer.
+                        double *dpMoreColData = NULL;
+                        for( uint loop=0; loop < rArraysSize; loop++ )
+                        {
+                            dpOclSrcData = rArrays[loop];
+                            nSrcDataSize = pDvr->GetArrayLength();
+                            nMoreColSize += nSrcDataSize;
+                            dpMoreColData = (double *) realloc(dpMoreColData,nMoreColSize * sizeof(double));
+                            for(uint j=nMoreColSize-nSrcDataSize,i=0;i<nSrcDataSize;i++,j++)
+                            {
+                                dpMoreColData[j] = dpOclSrcData[i];
+                            }
+                        }
+                        dpOclSrcData = dpMoreColData;
+                        nSrcDataSize = nMoreColSize;
+                        collectDoublePointers( dpMoreColData );
+                    }
+                    else
+                    {
+                        dpOclSrcData = rArrays[0];
+                        nSrcDataSize = pDvr->GetArrayLength();
+                    }
+                    srdDataPush( new SourceData( dpOclSrcData,nSrcDataSize,rArraysSize ) );
+                }
+                else if( ocPush == p->GetOpCode() && formula::svSingleVectorRef == p->GetType() )
+                {
+                    const formula::SingleVectorRefToken* pSvr = static_cast<const formula::SingleVectorRefToken*>( p );
+                    dpBinaryData = pSvr->GetArray();
+                    nSrcDataSize = pSvr->GetArrayLength();
+                    srdDataPush( new SourceData( dpBinaryData, nSrcDataSize ) );
+                }
+                else
+                {
+                    nOclOp = p->GetOpCode();
+                    aChooseAction.calculate(nOclOp,mnRowSize,ocl_calc,mnpOclStartPos,mnpOclEndPos,this);
+                    mnSingleCount = 0;
+                    mnDoubleCount = 0;
+                    mnSvDoubleCount = 0;
+                    mnOperatorCount = 0;
+                    mnPositonLen = 0;
+                }
+            } while ( NULL != ( p = rCode.NextRPN() ) );
+            // The value left on the stack is the result of the whole formula.
+            // NOTE(review): the popped SourceData object is not released
+            // here; ownership is not visible from this function - confirm.
+            SourceData * sResult = srdDataPop();
+            dpResult = sResult->getDouleData();
+        }
+        rDoc.SetFormulaResults( rTopPos, dpResult, mnRowSize );
+        freeDoublePointers();
+        if ( pResult )
+        {
+            free( pResult );
+            pResult = NULL;
+        }
+        if ( mnpOclStartPos )
+        {
+            free( mnpOclStartPos );
+            mnpOclStartPos = NULL;
+        }
+        if ( mnpOclEndPos )
+        {
+            free( mnpOclEndPos );
+            mnpOclEndPos = NULL;
+        }
        return true;
+    } // getOpenclState() End
+    else
+    {
+        // OpenCL is not available: release the unused output buffer and let
+        // the caller fall back to another interpreter.
+        free( pResult );
+        return false;
+    }
 }

 /// Special case of formula compiler for groundwatering
@@ -489,11 +964,11 @@ public:
        FormulaGroupInterpreterSoftware()
    {
        fprintf(stderr,"\n\n ***** Groundwater Backend *****\n\n\n");
-        OclCalc::InitEnv();
+        OclCalc::initEnv();
    }
    virtual ~FormulaGroupInterpreterGroundwater()
    {
-        OclCalc::ReleaseOpenclRunEnv();
+        OclCalc::releaseOpenclRunEnv();
    }

    virtual ScMatrixRef inverseMatrix(const ScMatrix& /* rMat */) { return ScMatrixRef(); }
@@ -569,7 +1044,7 @@ bool FormulaGroupInterpreterGroundwater::interpretCL(ScDocument& rDoc, const ScA

    fprintf (stderr, "Calculate !");

-    double *pResult = ocl_calc.OclSimpleDeltaOperation( eOp, pGroundWaterDataArray,
+    double *pResult = ocl_calc.oclSimpleDeltaOperation( eOp, pGroundWaterDataArray,
                                                        pArrayToSubtractOneElementFrom,
                                                        (size_t) xGroup->mnLength, delta );
    RETURN_IF_FAIL(pResult != NULL, "buffer alloc / calculaton failed");

--- a/sc/source/core/opencl/oclkernels.hxx
+++ b/sc/source/core/opencl/oclkernels.hxx
@@ -7,8 +7,8 @@
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

-#ifndef _OCL_KERNEL_H_
-#define _OCL_KERNEL_H_
+#ifndef SC_OCLKERNELS_HXX
+#define SC_OCLKERNELS_HXX

 #ifndef USE_EXTERNAL_KERNEL
 #define KERNEL( ... )# __VA_ARGS__
@@ -24,6 +24,97 @@ const char *kernel_src = KERNEL(
 \n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n
 \n#else\n
 \n#endif\n
+// Arithmetic mean of values[startArray[id]] .. values[endArray[id]], both
+// bounds inclusive. Complete groups of 16 elements are accumulated lane by
+// lane in a 16-wide vector, the leftover elements in a scalar, and the two
+// partial sums are combined at the end.
+inline fp_t oclAverage( const uint id,__global fp_t *values,__global uint *startArray,__global uint *endArray)
+{
+    const uint first = startArray[id];
+    const uint count = endArray[id] - first + 1;
+    const uint blocks = count / 16;
+    __global fp_t *base = values + first;
+
+    fp_t tailSum = 0.0;
+    // Every lane starts at zero.
+    fp_t16 laneSum = (fp_t16)(tailSum);
+    for ( uint b = 0; b < blocks; ++b )
+        laneSum += vload16( b, base );
+
+    // Elements that do not fill a complete group of 16.
+    __global fp_t *rest = base + blocks * 16;
+    const int nRest = count % 16;
+    for ( int k = 0; k < nRest; ++k )
+        tailSum += rest[k];
+
+    // Fold the 16 lanes down to lane 0: 16 -> 8 -> 4 -> 2 -> 1.
+    laneSum.s01234567 = laneSum.s01234567 + laneSum.s89abcdef;
+    laneSum.s0123 = laneSum.s0123 + laneSum.s4567;
+    laneSum.s01 = laneSum.s01 + laneSum.s23;
+    laneSum.s0 = laneSum.s0 + laneSum.s1;
+
+    const fp_t total = laneSum.s0 + tailSum;
+    return total / count;
+}
+// Largest element of values[startArray[id]] .. values[endArray[id]], both
+// bounds inclusive. Complete groups of 16 elements are compared lane by lane
+// in a 16-wide vector, the leftover elements in a scalar.
+inline fp_t oclMax( const uint id,__global fp_t *values,__global uint *startArray,__global uint *endArray)
+{
+    const uint first = startArray[id];
+    const uint count = endArray[id] - first + 1;
+    const uint blocks = count / 16;
+    __global fp_t *base = values + first;
+
+    // Seed the scalar and every vector lane with the first element.
+    fp_t tailMax = values[first];
+    fp_t16 laneMax = (fp_t16)(tailMax);
+    for ( uint b = 0; b < blocks; ++b )
+        laneMax = fmax( laneMax, vload16( b, base ) );
+
+    // Elements that do not fill a complete group of 16.
+    __global fp_t *rest = base + blocks * 16;
+    const int nRest = count % 16;
+    for ( int k = 0; k < nRest; ++k )
+        tailMax = fmax( tailMax, rest[k] );
+
+    // Fold the 16 lanes down to lane 0: 16 -> 8 -> 4 -> 2 -> 1.
+    laneMax.s01234567 = fmax( laneMax.s01234567, laneMax.s89abcdef );
+    laneMax.s0123 = fmax( laneMax.s0123, laneMax.s4567 );
+    laneMax.s01 = fmax( laneMax.s01, laneMax.s23 );
+    laneMax.s0 = fmax( laneMax.s0, laneMax.s1 );
+
+    return fmax( laneMax.s0, tailMax );
+}
+// Smallest element of values[startArray[id]] .. values[endArray[id]], both
+// bounds inclusive. Complete groups of 16 elements are compared lane by lane
+// in a 16-wide vector, the leftover elements in a scalar.
+inline fp_t oclMin( const uint id,__global fp_t *values,__global uint *startArray,__global uint *endArray)
+{
+    const uint first = startArray[id];
+    const uint count = endArray[id] - first + 1;
+    const uint blocks = count / 16;
+    __global fp_t *base = values + first;
+
+    // Seed the scalar and every vector lane with the first element.
+    fp_t tailMin = values[first];
+    fp_t16 laneMin = (fp_t16)(tailMin);
+    for ( uint b = 0; b < blocks; ++b )
+        laneMin = fmin( laneMin, vload16( b, base ) );
+
+    // Elements that do not fill a complete group of 16.
+    __global fp_t *rest = base + blocks * 16;
+    const int nRest = count % 16;
+    for ( int k = 0; k < nRest; ++k )
+        tailMin = fmin( tailMin, rest[k] );
+
+    // Fold the 16 lanes down to lane 0: 16 -> 8 -> 4 -> 2 -> 1.
+    laneMin.s01234567 = fmin( laneMin.s01234567, laneMin.s89abcdef );
+    laneMin.s0123 = fmin( laneMin.s0123, laneMin.s4567 );
+    laneMin.s01 = fmin( laneMin.s01, laneMin.s23 );
+    laneMin.s0 = fmin( laneMin.s0, laneMin.s1 );
+
+    return fmin( laneMin.s0, tailMin );
+}

 __kernel void oclSignedAdd(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
 {
@@ -31,7 +122,6 @@ __kernel void oclSignedAdd(__global fp_t *ltData,__global fp_t *rtData,__global
    otData[id] = ltData[id] + rtData[id];
 }

-
 __kernel void oclSignedSub(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
 {
    const unsigned int id = get_global_id(0);
@@ -41,39 +131,31 @@ __kernel void oclSignedSub(__global fp_t *ltData,__global fp_t *rtData,__global
 __kernel void oclSignedMul(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
 {
    int id = get_global_id(0);
-    otData[id] =ltData[id] * rtData[id];
+    otData[id] = ltData[id] * rtData[id]; // element-wise product at this work-item's global id
 }

 __kernel void oclSignedDiv(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
 {
    const unsigned int id = get_global_id(0);
-    otData[id] = ltData[id] / rtData[id];
+    fp_t divisor = rtData[id];
+    if ( divisor != 0 ) // guard introduced here: divide only by a non-zero divisor
+        otData[id] = ltData[id] / divisor;
+    else
+        otData[id] = 0.0; // a zero divisor stores 0.0 instead of performing the division
 }

 __kernel void oclFormulaMin(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output)
 {
    const unsigned int id = get_global_id(0);
-    unsigned int startFlag = start[id];
-    unsigned int endFlag = end[id];
-    fp_t fMinVal = input[startFlag];
-    for(int i=startFlag;i<=endFlag;i++)
-    {
-        fMinVal = fmin( fMinVal, input[i] );
-    }
-    output[id] = fMinVal;
+    fp_t fVal = oclMin(id,input,start,end); // minimum over input[start[id] .. end[id]], inclusive; replaces the scalar loop
+    output[id] = fVal ; // one result per work-item
 }

 __kernel void oclFormulaMax(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output)
 {
    const unsigned int id = get_global_id(0);
-    unsigned int startFlag = start[id];
-    unsigned int endFlag = end[id];
-    fp_t fMaxVal = input[startFlag];
-    for ( int i = startFlag; i <= endFlag; i++ )
-    {
-        fMaxVal = fmax( fMaxVal, input[i] );
-    }
-    output[id] = fMaxVal;
+    fp_t fVal = oclMax(id,input,start,end); // fmax-based helper replaces the scalar loop removed above
+    output[id] = fVal ; // one result per work-item
 }
 //Sum
 __kernel void oclFormulaSum(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output)
@@ -94,12 +176,10 @@ __kernel void oclFormulaCount(__global uint *start,__global uint *end,__global f
 __kernel void oclFormulaAverage(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output)
 {
    const unsigned int id = get_global_id(0);
-    fp_t sum=0.0;
-    for(int i = start[id];i<=end[id];i++)
-        sum += input[i];
-    output[id] = sum / (end[id]-start[id]+1);
-}
+    fp_t fVal = oclAverage(id,input,start,end); // shared helper replaces the inline sum / element-count computation removed above
+    output[id] = fVal ; // one result per work-item

+}
 //Sumproduct
 __kernel void oclFormulaSumproduct(__global fp_t *firstCol,__global uint* npSumSize,__global fp_t *output,uint nMatixSize)
 {
@@ -147,7 +227,7 @@ __kernel void oclMinDelta(__global fp_t *values, __global fp_t *subtract, uint s

    // Min
    fp_t fMinVal = values[start];
-    for(int i=start+1;i < end;i++)
+    for ( int i = start + 1; i < end; i++ )
    {
        if(values[i]<fMinVal)
            fMinVal = values[i];
@@ -177,14 +257,14 @@ __kernel void oclFormulaMtxInv(__global fp_t * fpMatrixInput, __global fp_t * fp
    fpP[nOffset*nDimension+nId] = fpP[nMax*nDimension+nId];
    fpP[nMax*nDimension+nId] = dMovebuffer;
 }
-__kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMatrixOutput,__global fp_t * fpP,__global fp_t * fpY)
+__kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMatrixOutput,__global fp_t * fpP,__global fp_t * fpY,__global uint* npDim)
 {
    int nId = get_global_id(0);
-    int nDimension = get_global_size(0);
-
+    int nDimension = npDim[nId];
+    fp_t fsum = 0.0;
    for ( int yi=0; yi < nDimension; yi++ )
    {
-        fp_t fsum = 0.0;
+        fsum = 0.0;
        for ( int yj=0; yj < nDimension; yj++ )
        {
            fsum += fpMatrixInput[yi*nDimension+yj] * fpY[nId+yj*nDimension];
@@ -194,7 +274,7 @@ __kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMat
    }
    for ( int xi = nDimension - 1; xi >= 0; xi-- )
    {
-        fp_t fsum = 0.0;
+        fsum = 0.0;
        for ( int xj = 0; xj < nDimension; xj++ )
        {
            fsum += fpMatrixInput[xi*nDimension+xj] * fpMatrixOutput[nId+nDimension*xj];
@@ -203,6 +283,101 @@ __kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMat
    }
 }

+// Per work-item: the oclAverage() result for this item plus addend[gid].
+__kernel void oclAverageAdd(__global fp_t *values,__global fp_t *addend, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+    const uint gid = get_global_id(0);
+    output[gid] = oclAverage(gid,values,startArray,endArray) + addend[gid];
+}
+
+// Per work-item: the oclAverage() result for this item minus subtract[gid].
+__kernel void oclAverageSub(__global fp_t *values,__global fp_t *subtract, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+    const uint gid = get_global_id(0);
+    output[gid] = oclAverage(gid,values,startArray,endArray) - subtract[gid];
+}
+
+// Per work-item: the oclAverage() result for this item times multiplier[gid].
+__kernel void oclAverageMul(__global fp_t *values,__global fp_t *multiplier, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+    const uint gid = get_global_id(0);
+    output[gid] = oclAverage(gid,values,startArray,endArray) * multiplier[gid];
+}
+// Per work-item: the oclAverage() result for this item divided by div[gid].
+// A divisor equal to zero stores 0.0 instead of dividing.
+__kernel void oclAverageDiv(__global fp_t *values,__global fp_t *div, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+    const uint gid = get_global_id(0);
+    const fp_t mean = oclAverage(gid,values,startArray,endArray);
+    const fp_t denom = div[gid];
+    // Same zero-divisor convention as oclSignedDiv.
+    output[gid] = ( denom != 0 ) ? mean / denom : (fp_t)0.0;
+}
+
+// Per work-item: minimum of this item's [start, end] window plus addend[gid].
+__kernel void oclMinAdd(__global fp_t *values, __global fp_t *addend, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+    const uint gid = get_global_id(0);
+    output[gid] = oclMin(gid,values,startArray,endArray) + addend[gid];
+}
+
+// Per work-item: minimum of this item's [start, end] window minus subtract[gid].
+__kernel void oclMinSub(__global fp_t *values, __global fp_t *subtract, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+    const uint gid = get_global_id(0);
+    output[gid] = oclMin(gid,values,startArray,endArray) - subtract[gid];
+}
+// Per work-item: minimum of this item's [start, end] window times multiplier[gid].
+__kernel void oclMinMul(__global fp_t *values, __global fp_t *multiplier, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+    const uint gid = get_global_id(0);
+    output[gid] = oclMin(gid,values,startArray,endArray) * multiplier[gid];
+}
+// Per work-item: minimum of this item's [start, end] window divided by div[gid].
+// A divisor equal to zero stores 0.0 instead of dividing.
+__kernel void oclMinDiv(__global fp_t *values, __global fp_t *div, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+    const uint gid = get_global_id(0);
+    const fp_t lowest = oclMin(gid,values,startArray,endArray);
+    const fp_t denom = div[gid];
+    // Same zero-divisor convention as oclSignedDiv.
+    output[gid] = ( denom != 0 ) ? lowest / denom : (fp_t)0.0;
+}
+// Per work-item: the oclMax() result for this item plus addend[gid].
+__kernel void oclMaxAdd(__global fp_t *values, __global fp_t *addend, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+    const uint gid = get_global_id(0);
+    output[gid] = oclMax(gid,values,startArray,endArray) + addend[gid];
+}
+
+// Per work-item: the oclMax() result for this item minus subtract[gid].
+__kernel void oclMaxSub(__global fp_t *values, __global fp_t *subtract, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+    const uint gid = get_global_id(0);
+    output[gid] = oclMax(gid,values,startArray,endArray) - subtract[gid];
+}
+// Per work-item: the oclMax() result for this item times multiplier[gid].
+__kernel void oclMaxMul(__global fp_t *values, __global fp_t *multiplier, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+    const uint gid = get_global_id(0);
+    output[gid] = oclMax(gid,values,startArray,endArray) * multiplier[gid];
+}
+// Per work-item: the oclMax() result for this item divided by div[gid].
+// A divisor equal to zero stores 0.0 instead of dividing.
+__kernel void oclMaxDiv(__global fp_t *values, __global fp_t *div, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+    const uint gid = get_global_id(0);
+    const fp_t highest = oclMax(gid,values,startArray,endArray);
+    const fp_t denom = div[gid];
+    // Same zero-divisor convention as oclSignedDiv.
+    output[gid] = ( denom != 0 ) ? highest / denom : (fp_t)0.0;
+}
+
+__kernel void oclSub( fp_t ltData, __global fp_t *rtData, __global fp_t *outData )
+{   // Scalar-minus-vector: the single value ltData minus this work-item's rtData element.
+    const uint gid = get_global_id(0);
+    outData[gid] = ltData - rtData[gid];
+}
 );

 #endif // USE_EXTERNAL_KERNEL

--- a/sc/source/core/opencl/openclwrapper.cxx
+++ b/sc/source/core/opencl/openclwrapper.cxx
--- a/sc/source/core/opencl/openclwrapper.hxx
+++ b/sc/source/core/opencl/openclwrapper.hxx
@@ -7,8 +7,8 @@
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

-#ifndef SC_OPENCL_WRAPPER_H
-#define SC_OPENCL_WRAPPER_H
+#ifndef SC_OPENCLWRAPPER_HXX
+#define SC_OPENCLWRAPPER_HXX

 #include <config_features.h>
 #include <formula/opcode.hxx>
@@ -87,6 +87,19 @@ if( status != CL_SUCCESS )    \
    return 0;    \
 }

+#define CHECK_OPENCL_VOID(status,name)    /* log-only check: prints the status code and the operation name, never returns */ \
+if( status != CL_SUCCESS )    \
+{    \
+    printf ("OpenCL error code is %d at " SAL_DETAIL_WHERE " when %s .\n", status, name);    \
+}
+/* NOTE(review): CHECK_OPENCL_VOID expands to a bare if-statement; brace the call site when it sits directly before an else. */
+
+#define CHECK_OPENCL_RELEASE(status,name)    /* releases cl_mem `name` if non-NULL, stores the result in `status` (must be an assignable cl_int) and reports failure */ \
+{    /* one braced statement, so an unbraced `if` at the call site governs the whole expansion */ \
+    if ( name != NULL )    \
+        status = clReleaseMemObject( name );    /* result was previously discarded, leaving the check below on a stale status */ \
+    if( status != CL_SUCCESS )    \
+        printf ("OpenCL error code is %d at " SAL_DETAIL_WHERE " when clReleaseMemObject( %s ).\n", status, #name);    \
+}

 #define MAX_KERNEL_STRING_LEN 64
 #define MAX_CLFILE_NUM 50
@@ -119,25 +132,48 @@ typedef struct
    char kernelName[MAX_KERNEL_NAME_LEN + 1];
    char *kernelStr;
 } kernel_node;
-
+typedef struct _SingleVectorFormula // NOTE(review): a tag starting with underscore + capital letter is a reserved identifier in C++
+{
+    const double *mdpInputLeftData; // read-only pointer to the "left" input values
+    const double *mdpInputRightData; // read-only pointer to the "right" input values
+    size_t mnInputLeftDataSize; // presumably the element count of the left input -- TODO confirm (elements vs. bytes)
+    size_t mnInputRightDataSize; // presumably the element count of the right input -- TODO confirm
+    uint mnInputLeftStartPosition; // start position for the left input; exact meaning not visible here -- verify against users
+    uint mnInputRightStartPosition; // start position for the right input -- verify against users
+    int mnInputLeftOffset; // signed offset for the left input; semantics not visible here -- TODO confirm
+    int mnInputRightOffset; // signed offset for the right input -- TODO confirm
+} SingleVectorFormula;
+
+typedef struct _DoubleVectorFormula // NOTE(review): a tag starting with underscore + capital letter is a reserved identifier in C++
+{
+    const double *mdpInputData; // read-only pointer to the input values
+    size_t mnInputDataSize; // presumably the element count of mdpInputData -- TODO confirm (elements vs. bytes)
+    uint mnInputStartPosition; // start position of the range; presumably pairs with the kernels' start arrays -- verify
+    uint mnInputEndPosition; // end position of the range; presumably pairs with the kernels' end arrays -- verify
+    int mnInputStartOffset; // signed offset for the start position; semantics not visible here -- TODO confirm
+    int mnInputEndOffset; // signed offset for the end position -- TODO confirm
+} DoubleVectorFormula;
 class OpenclCalcBase
 {
 public:
    OpenclCalcBase(){};
    virtual ~OpenclCalcBase(){};
-    virtual int OclHostArithmeticOperator64Bits( const char* aKernelName, double *fpLeftData, double *fpRightData, double *&rResult, int nRowSize )=0;
-    virtual int OclHostFormulaStatistics64Bits( const char* aKernelName, double *fpSrcData, uint *npStartPos, uint *npEndPos, double *&output, int outputSize )=0;
-    virtual int OclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize)=0;
-    virtual int OclHostFormulaSumProduct64Bits( double *fpSumProMergeLfData, double *fpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize )=0;
-    virtual int OclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst, std::vector<double>& dpResult, uint nDim)=0;
+    virtual int oclHostArithmeticOperator64Bits( const char* aKernelName, double *&rResult, int nRowSize )=0; // renamed to lowerCamelCase; the left/right operand pointers are no longer parameters
+    virtual int oclMoreColHostArithmeticOperator64Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize )=0; // new pure virtual in this patch
+    virtual int oclHostFormulaStatistics64Bits( const char* aKernelName,double *&output, int outputSize )=0; // source data and start/end position pointers are no longer parameters
+    virtual int oclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize)=0;
+    virtual int oclHostFormulaSumProduct64Bits( double *fpSumProMergeLfData, double *fpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize )=0;
+    virtual int oclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst,std::vector<double>&dpResult, uint nDim)=0;
+    virtual int oclMoreColHostArithmeticOperator32Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize )=0; // 32-bit counterpart, declared here next to the 64-bit group

-    virtual int OclHostArithmeticOperator32Bits( const char* aKernelName, float *fpLeftData, float *fpRightData, double *rResult, int nRowSize )=0;
-    virtual int OclHostFormulaStatistics32Bits( const char* aKernelName, float *fpSrcData, uint *npStartPos, uint *npEndPos, double *output, int outputSize )=0;
-    virtual int OclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize)=0;
-    virtual int OclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize )=0;
-    virtual int OclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim )=0;
+    virtual int oclHostArithmeticOperator32Bits( const char* aKernelName, double *rResult, int nRowSize )=0; // 32-bit variants take the result as a plain pointer, not a pointer reference
+    virtual int oclHostFormulaStatistics32Bits( const char* aKernelName,double *output, int outputSize )=0;
+    virtual int oclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize)=0;
+    virtual int oclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize )=0;
+    virtual int oclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim )=0;

-    virtual double *OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double delta )=0;
+    virtual int oclGroundWaterGroup( uint *eOp, uint eOpNum, const double *pOpArray, const double *pSubtractSingle, size_t nSrcDataSize,size_t nElements, double delta,uint *nStartPos,uint *nEndPos ,double *deResult)=0; // new pure virtual; presumably eOp holds eOpNum op codes and the result goes to deResult -- verify against the implementation
+    virtual double *oclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double delta )=0; // rename only; signature unchanged


 };
@@ -151,40 +187,40 @@ public:
    static int isInited;
    OpenclDevice();
    ~OpenclDevice();
-    static int InitEnv();
-    static int RegistOpenclKernel();
-    static int ReleaseOpenclRunEnv();
-    static int InitOpenclRunEnv( GPUEnv *gpu );
-    static int ReleaseOpenclEnv( GPUEnv *gpuInfo );
-    static int CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption );
-    static int InitOpenclRunEnv( int argc );
-    static int CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName );
-    static int GeneratBinFromKernelSource( cl_program program, const char * clFileName );
-    static int WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes );
-    static int BinaryGenerated( const char * clFileName, FILE ** fhandle );
-    static int CompileKernelFile( const char *filename, GPUEnv *gpuInfo, const char *buildOption );
-
-    int InitOpenclAttr( OpenCLEnv * env );
-    int ReleaseKernel( KernelEnv * env );
-    int SetKernelEnv( KernelEnv *envInfo );
-    int CreateKernel( char * kernelname, KernelEnv * env );
-    int RunKernel( const char *kernelName, void **userdata );
-    int ConvertToString( const char *filename, char **source );
-    int CheckKernelName( KernelEnv *envInfo, const char *kernelName );
-    int RegisterKernelWrapper( const char *kernelName, cl_kernel_function function );
-    int RunKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata );
-    int GetKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function );
+    static int initEnv();
+    static int registOpenclKernel();
+    static int releaseOpenclRunEnv();
+    static int initOpenclRunEnv( GPUEnv *gpu );
+    static int releaseOpenclEnv( GPUEnv *gpuInfo );
+    static int compileKernelFile( GPUEnv *gpuInfo, const char *buildOption );
+    static int initOpenclRunEnv( int argc );
+    static int cachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName );
+    static int generatBinFromKernelSource( cl_program program, const char * clFileName );
+    static int writeBinaryToFile( const char* fileName, const char* birary, size_t numBytes );
+    static int binaryGenerated( const char * clFileName, FILE ** fhandle );
+    static int compileKernelFile( const char *filename, GPUEnv *gpuInfo, const char *buildOption );
+
+    int initOpenclAttr( OpenCLEnv * env );
+    int releaseKernel( KernelEnv * env );
+    int setKernelEnv( KernelEnv *envInfo );
+    int createKernel( char * kernelname, KernelEnv * env );
+    int runKernel( const char *kernelName, void **userdata );
+    int convertToString( const char *filename, char **source );
+    int checkKernelName( KernelEnv *envInfo, const char *kernelName );
+    int registerKernelWrapper( const char *kernelName, cl_kernel_function function );
+    int runKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata );
+    int getKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function );


 #ifdef WIN32
-    static int LoadOpencl();
-    static int OpenclInite();
-    static void FreeOpenclDll();
+    static int loadOpencl();
+    static int openclInite();
+    static void freeOpenclDll();
 #endif

-    int GetOpenclState();
-    void SetOpenclState( int state );
-    inline static int AddKernelConfig( int kCount, const char *kName );
+    int getOpenclState();
+    void setOpenclState( int state );
+    inline static int addKernelConfig( int kCount, const char *kName );

 };

@@ -201,6 +237,10 @@ public:
    cl_mem mpClmemMergeLfData;
    cl_mem mpClmemMergeRtData;
    cl_mem mpClmemMatixSumSize;
+    cl_mem mpClmemeOp;
+    unsigned int nArithmeticLen;
+    unsigned int nFormulaLen;
+    unsigned int nClmemLen;
    unsigned int nFormulaColSize;
    unsigned int nFormulaRowSize;

@@ -208,27 +248,49 @@ public:
    ~OclCalc();

 // for 64bits double
-    int OclHostArithmeticOperator64Bits( const char* aKernelName, double *fpLeftData, double *fpRightData, double *&rResult, int nRowSize );
-    int OclHostFormulaStatistics64Bits( const char* aKernelName, double *fpSrcData, uint *npStartPos, uint *npEndPos, double *&output, int outputSize);
-    int OclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize );
-    int OclHostFormulaSumProduct64Bits( double *fpSumProMergeLfData, double *fpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize);
-    int OclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst, std::vector<double>&dpResult, uint nDim );
+    int oclHostArithmeticOperator64Bits( const char* aKernelName,  double *&rResult, int nRowSize );
+    int oclMoreColHostArithmeticOperator64Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize );
+    int oclHostFormulaStatistics64Bits( const char* aKernelName, double *&output, int outputSize);
+    int oclHostFormulaStash64Bits( const char* aKernelName, const double* dpSrcData, uint *nStartPos, uint *nEndPos, double *output, int nBufferSize, int size);
+    int oclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize );
+    int oclHostFormulaSumProduct64Bits( double *fpSumProMergeLfData, double *fpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize);
+    int oclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst, std::vector<double>&dpResult, uint nDim );
 // for 32bits float
-    int OclHostArithmeticOperator32Bits( const char* aKernelName, float *fpLeftData, float *fpRightData, double *rResult, int nRowSize );
-    int OclHostFormulaStatistics32Bits( const char* aKernelName, float *fpSrcData, uint *npStartPos, uint *npEndPos, double *output, int outputSize);
-    int OclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize );
-    int OclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize );
-    int OclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim );
+    int oclHostArithmeticOperator32Bits( const char* aKernelName, double *rResult, int nRowSize );
+    int oclMoreColHostArithmeticOperator32Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize );
+    int oclHostFormulaStatistics32Bits( const char* aKernelName, double *output, int outputSize);
+    int oclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize );
+    int oclHostArithmeticStash64Bits( const char* aKernelName, const double *dpLeftData, const double *dpRightData, double *rResult,int nRowSize );
+    int oclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize );
+    int oclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim );
 // for groundwater
-    double *OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double delta );
+    int oclGroundWaterGroup( uint *eOp, uint eOpNum, const double *pOpArray, const double *pSubtractSingle,size_t nSrcDataSize, size_t nElements, double delta ,uint *nStartPos,uint *nEndPos,double *deResult);
+    double *oclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double delta );

    ///////////////////////////////////////////////////////////////
-    int CreateBuffer64Bits( double *&dpSrcData, uint *&npStartPos, uint *&npEndPos, int nBufferSize );
-    int CreateBuffer64Bits( double *&dpLeftData, double *&dpRightData, int nBufferSize );
-    int CreateBuffer64Bits( double *&dpSumProMergeLfData, double *&dpSumProMergeRtData, uint *&npSumSize, int nMatixSize, int nBufferSize );
-    int CreateBuffer32Bits( float *&fpSrcData, uint *&npStartPos, uint *&npEndPos, int nBufferSize );
-    int CreateBuffer32Bits( float *&fpLeftData, float *&fpRightData, int nBufferSize );
-    int CreateBuffer32Bits( float *&fpSumProMergeLfData, float *&fpSumProMergeRtData, uint *&npSumSize, int nMatixSize, int nBufferSize );
+    int createBuffer64Bits( double *&dpLeftData, double *&dpRightData, int nBufferSize );
+    int mapAndCopy64Bits(const double *dpTempLeftData,const double *dpTempRightData,int nBufferSize );
+    int mapAndCopy64Bits(const double *dpTempSrcData,unsigned int *unStartPos,unsigned int *unEndPos,int nBufferSize ,int nRowsize);
+    int mapAndCopyArithmetic64Bits( const double *dpMoreArithmetic,int nBufferSize );
+    int mapAndCopyMoreColArithmetic64Bits( const double *dpMoreColArithmetic,int nBufferSize ,uint *npeOp,uint neOpSize );
+    int createMoreColArithmeticBuf64Bits( int nBufferSize, int neOpSize );
+
+    int createFormulaBuf64Bits( int nBufferSize, int rowSize );
+    int createArithmeticOptBuf64Bits( int nBufferSize );
+
+    int createBuffer32Bits( float *&fpLeftData, float *&fpRightData, int nBufferSize );
+    int mapAndCopy32Bits(const double *dpTempLeftData,const double *dpTempRightData,int nBufferSize );
+    int mapAndCopy32Bits(const double *dpTempSrcData,unsigned int *unStartPos,unsigned int *unEndPos,int nBufferSize ,int nRowsize);
+    int mapAndCopyArithmetic32Bits( const double *dpMoreColArithmetic, int nBufferSize );
+    int mapAndCopyMoreColArithmetic32Bits( const double *dpMoreColArithmetic,int nBufferSize ,uint *npeOp,uint neOpSize );
+    int createMoreColArithmeticBuf32Bits( int nBufferSize, int neOpSize );
+    int createFormulaBuf32Bits( int nBufferSize, int rowSize  );
+    int createArithmeticOptBuf32Bits( int nBufferSize );
+    int oclHostFormulaStash32Bits( const char* aKernelName, const double* dpSrcData, uint *nStartPos, uint *nEndPos, double *output, int nBufferSize, int size );
+    int oclHostArithmeticStash32Bits( const char* aKernelName, const double *dpLeftData, const double *dpRightData, double *rResult,int nRowSize );
+
+    int releaseOclBuffer(void);
+    friend class agency;
 };

 #endif