Kaydet (Commit) 1e3bc292 authored tarafından Wei Wei's avatar Wei Wei Kaydeden (comit) I-Jui (Ray) Sung

GPU Calc: implemented parallel reduction for SUMIFS

For now only works for fixed and sliding fixed-sized windows.

Change-Id: I25e3f893a86d0e1723ae1e1633ffeeee93926b8d
Signed-off-by: 's avatarI-Jui (Ray) Sung <ray@multicorewareinc.com>
üst 501bc66c
...@@ -419,42 +419,51 @@ public: ...@@ -419,42 +419,51 @@ public:
bIsStartFixed = mpDVR->IsStartFixed(); bIsStartFixed = mpDVR->IsStartFixed();
bIsEndFixed = mpDVR->IsEndFixed(); bIsEndFixed = mpDVR->IsEndFixed();
} }
virtual bool NeedParallelReduction(void) const
{
return GetWindowSize()> 100 &&
( (GetStartFixed() && GetEndFixed()) ||
(!GetStartFixed() && !GetEndFixed()) ) ;
}
virtual void GenSlidingWindowFunction(std::stringstream &ss) { virtual void GenSlidingWindowFunction(std::stringstream &ss) {
std::string name = Base::GetName(); if (dynamic_cast<OpSum*>(mpCodeGen.get()))
ss << "__kernel void "<<name; {
ss << "_reduction(__global double* A, " std::string name = Base::GetName();
"__global double *result,int arrayLength,int windowSize){\n"; ss << "__kernel void "<<name;
ss << " double tmp, current_result = 0.0;\n"; ss << "_reduction(__global double* A, "
ss << " int writePos = get_group_id(1);\n"; "__global double *result,int arrayLength,int windowSize){\n";
ss << " int offset = get_group_id(1);\n"; ss << " double tmp, current_result = 0.0;\n";
ss << " int lidx = get_local_id(0);\n"; ss << " int writePos = get_group_id(1);\n";
ss << " __local double shm_buf[256];\n"; ss << " int offset = get_group_id(1);\n";
ss << " if (arrayLength == windowSize)\n"; ss << " int lidx = get_local_id(0);\n";
ss << " offset = 0;\n"; ss << " __local double shm_buf[256];\n";
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; ss << " if (arrayLength == windowSize)\n";
ss << " int loop = arrayLength/512 + 1;\n"; ss << " offset = 0;\n";
ss << " for (int l=0; l<loop; l++){\n"; ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
ss << " tmp = 0.0;\n"; ss << " int loop = arrayLength/512 + 1;\n";
ss << " int loopOffset = l*512;\n"; ss << " for (int l=0; l<loop; l++){\n";
ss << " if((loopOffset + lidx + offset + 256) < ( offset + windowSize))\n"; ss << " tmp = 0.0;\n";
ss << " tmp = A[loopOffset + lidx + offset] + " ss << " int loopOffset = l*512;\n";
"A[loopOffset + lidx + offset + 256];\n"; ss << " if((loopOffset + lidx + offset + 256) < ( offset + windowSize))\n";
ss << " else if ((loopOffset + lidx + offset) < ( offset + windowSize))\n"; ss << " tmp = A[loopOffset + lidx + offset] + "
ss << " tmp = A[loopOffset + lidx + offset];\n"; "A[loopOffset + lidx + offset + 256];\n";
ss << " shm_buf[lidx] = tmp;\n"; ss << " else if ((loopOffset + lidx + offset) < ( offset + windowSize))\n";
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; ss << " tmp = A[loopOffset + lidx + offset];\n";
ss << " for (int i = 128; i >0; i/=2) {\n"; ss << " shm_buf[lidx] = tmp;\n";
ss << " if (lidx < i)\n"; ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
ss << " shm_buf[lidx] += shm_buf[lidx + i];\n"; ss << " for (int i = 128; i >0; i/=2) {\n";
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; ss << " if (lidx < i)\n";
ss << " }\n"; ss << " shm_buf[lidx] += shm_buf[lidx + i];\n";
ss << " if (lidx == 0)\n"; ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
ss << " current_result += shm_buf[0];\n"; ss << " }\n";
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; ss << " if (lidx == 0)\n";
ss << " }\n"; ss << " current_result += shm_buf[0];\n";
ss << " if (lidx == 0)\n"; ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
ss << " result[writePos] = current_result;\n"; ss << " }\n";
ss << "}\n"; ss << " if (lidx == 0)\n";
ss << " result[writePos] = current_result;\n";
ss << "}\n";
}
} }
...@@ -573,11 +582,16 @@ public: ...@@ -573,11 +582,16 @@ public:
if (CL_SUCCESS != err) if (CL_SUCCESS != err)
throw OpenCLError(err); throw OpenCLError(err);
// reproduce the reduction function name // reproduce the reduction function name
std::string kernelName = Base::GetName() + "_reduction"; std::string kernelName;
if (dynamic_cast<OpSum*>(mpCodeGen.get()))
kernelName = Base::GetName() + "_reduction";
else throw Unhandled();
cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err); cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err);
if (err != CL_SUCCESS) if (err != CL_SUCCESS)
throw OpenCLError(err); throw OpenCLError(err);
// set kernel arg of reduction kernel // set kernel arg of reduction kernel
// TODO(Wei Wei): use unique name for kernel
err = clSetKernelArg(redKernel, 0, sizeof(cl_mem), err = clSetKernelArg(redKernel, 0, sizeof(cl_mem),
(void *)&(Base::mpClmem)); (void *)&(Base::mpClmem));
if (CL_SUCCESS != err) if (CL_SUCCESS != err)
...@@ -621,6 +635,14 @@ public: ...@@ -621,6 +635,14 @@ public:
} }
} }
size_t GetArrayLength(void) const {return mpDVR->GetArrayLength(); }
size_t GetWindowSize(void) const {return mpDVR->GetRefRowSize(); }
size_t GetStartFixed(void) const {return bIsStartFixed; }
size_t GetEndFixed(void) const {return bIsEndFixed; }
protected: protected:
bool bIsStartFixed, bIsEndFixed; bool bIsStartFixed, bIsEndFixed;
const formula::DoubleVectorRefToken *mpDVR; const formula::DoubleVectorRefToken *mpDVR;
...@@ -1001,6 +1023,75 @@ public: ...@@ -1001,6 +1023,75 @@ public:
{ {
i += (*it)->Marshal(k, argno + i, nVectorWidth, pProgram); i += (*it)->Marshal(k, argno + i, nVectorWidth, pProgram);
} }
if (OpSumIfs *OpSumCodeGen = dynamic_cast<OpSumIfs*>(mpCodeGen.get()))
{
assert(mpClmem == NULL);
// Obtain cl context
KernelEnv kEnv;
OpenclDevice::setKernelEnv(&kEnv);
cl_int err;
DynamicKernelSlidingArgument<DynamicKernelArgument> *slidingArgPtr =
dynamic_cast< DynamicKernelSlidingArgument<DynamicKernelArgument> *>
(mvSubArguments[0].get());
cl_mem mpClmem2;
if (OpSumCodeGen->NeedReductionKernel())
{
assert(slidingArgPtr);
size_t nInput = slidingArgPtr -> GetArrayLength();
size_t nCurWindowSize = slidingArgPtr -> GetWindowSize();
std::vector<cl_mem> vclmem;
for (SubArgumentsType::iterator it = mvSubArguments.begin(), e= mvSubArguments.end(); it!=e;
++it)
{
vclmem.push_back((*it)->GetCLBuffer());
}
mpClmem2 = clCreateBuffer(kEnv.mpkContext, CL_MEM_READ_WRITE,
sizeof(double)*nVectorWidth, NULL, &err);
if (CL_SUCCESS != err)
throw OpenCLError(err);
std::string kernelName = "SumIfs_reduction";
cl_kernel redKernel = clCreateKernel(pProgram, kernelName.c_str(), &err);
if (err != CL_SUCCESS)
throw OpenCLError(err);
// set kernel arg of reduction kernel
for (size_t j=0; j< vclmem.size(); j++){
err = clSetKernelArg(redKernel, j, sizeof(cl_mem),
(void *)&vclmem[j]);
if (CL_SUCCESS != err)
throw OpenCLError(err);
}
err = clSetKernelArg(redKernel, vclmem.size(), sizeof(cl_mem), (void *)&mpClmem2);
if (CL_SUCCESS != err)
throw OpenCLError(err);
err = clSetKernelArg(redKernel, vclmem.size()+1, sizeof(cl_int), (void*)&nInput);
if (CL_SUCCESS != err)
throw OpenCLError(err);
err = clSetKernelArg(redKernel, vclmem.size()+2, sizeof(cl_int), (void*)&nCurWindowSize);
if (CL_SUCCESS != err)
throw OpenCLError(err);
// set work group size and execute
size_t global_work_size[] = {256, (size_t)nVectorWidth };
size_t local_work_size[] = {256, 1};
err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, NULL,
global_work_size, local_work_size, 0, NULL, NULL);
if (CL_SUCCESS != err)
throw OpenCLError(err);
err = clFinish(kEnv.mpkCmdQueue);
if (CL_SUCCESS != err)
throw OpenCLError(err);
// Pass mpClmem2 to the "real" kernel
err = clSetKernelArg(k, argno, sizeof(cl_mem), (void *)&mpClmem2);
if (CL_SUCCESS != err)
throw OpenCLError(err);
}
}
return i; return i;
} }
......
...@@ -448,6 +448,142 @@ void OpSumIfs::GenSlidingWindowFunction(std::stringstream &ss, ...@@ -448,6 +448,142 @@ void OpSumIfs::GenSlidingWindowFunction(std::stringstream &ss,
size_t nCurWindowSize = pCurDVR->GetArrayLength() < size_t nCurWindowSize = pCurDVR->GetArrayLength() <
pCurDVR->GetRefRowSize() ? pCurDVR->GetArrayLength(): pCurDVR->GetRefRowSize() ? pCurDVR->GetArrayLength():
pCurDVR->GetRefRowSize() ; pCurDVR->GetRefRowSize() ;
mNeedReductionKernel = vSubArguments[0]->NeedParallelReduction();
if (mNeedReductionKernel)
{
// generate reduction functions
ss << "__kernel void ";
ss << "SumIfs_reduction( ";
for (unsigned i = 0; i < vSubArguments.size(); i++)
{
if (i)
ss << ",";
vSubArguments[i]->GenSlidingWindowDecl(ss);
}
ss << ", __global double *result,int arrayLength,int windowSize";
ss << ")\n{\n";
ss << " double tmp =0;\n";
ss << " int i ;\n";
GenTmpVariables(ss,vSubArguments);
ss << " double current_result = 0.0;\n";
ss << " int writePos = get_group_id(1);\n";
if (pCurDVR->IsStartFixed() && pCurDVR->IsEndFixed())
ss << " int offset = 0;\n";
else if (!pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed())
ss << " int offset = get_group_id(1);\n";
else
throw Unhandled();
// actually unreachable
ss << " int lidx = get_local_id(0);\n";
ss << " __local double shm_buf[256];\n";
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
ss << " int loop = arrayLength/512 + 1;\n";
ss << " for (int l=0; l<loop; l++){\n";
ss << " tmp = 0.0;\n";
ss << " int loopOffset = l*512;\n";
ss << " int p1 = loopOffset + lidx + offset, p2 = p1 + 256;\n";
ss << " if (p2 < min(offset + windowSize, arrayLength)) {\n";
ss << " tmp0 = 0.0;\n";
int mm=0;
std::string p1 = "p1";
std::string p2 = "p2";
for(unsigned j=1;j<vSubArguments.size();j+=2,mm++)
{
CheckSubArgumentIsNan2(ss,vSubArguments,j,p1);
CheckSubArgumentIsNan2(ss,vSubArguments,j+1,p1);
ss << "";
ss <<" if(isequal(";
ss <<"tmp";
ss <<j;
ss <<" , ";
ss << "tmp";
ss << j+1;
ss << "))";
ss << "{\n";
}
CheckSubArgumentIsNan2(ss,vSubArguments,0,p1);
ss << " tmp += tmp0;\n";
for(unsigned j=1;j<vSubArguments.size();j+=2,mm--)
{
for(int n = 0;n<mm+1;n++)
{
ss << " ";
}
ss<< "}\n\n";
}
mm=0;
for(unsigned j=1;j<vSubArguments.size();j+=2,mm++)
{
CheckSubArgumentIsNan2(ss,vSubArguments,j,p2);
CheckSubArgumentIsNan2(ss,vSubArguments,j+1,p2);
ss <<" if(isequal(";
ss <<"tmp";
ss <<j;
ss <<" , ";
ss << "tmp";
ss << j+1;
ss << ")){\n";
}
CheckSubArgumentIsNan2(ss,vSubArguments,0,p2);
ss << " tmp += tmp0;\n";
for(unsigned j=1;j< vSubArguments.size();j+=2,mm--)
{
for(int n = 0;n<mm+1;n++)
{
ss << " ";
}
ss<< "}\n";
}
ss << " }\n";
ss << " else if (p1 < min(arrayLength, offset + windowSize)) {\n";
mm=0;
for(unsigned j=1;j<vSubArguments.size();j+=2,mm++)
{
CheckSubArgumentIsNan2(ss,vSubArguments,j,p1);
CheckSubArgumentIsNan2(ss,vSubArguments,j+1,p1);
ss <<" if(isequal(";
ss <<"tmp";
ss <<j;
ss <<" , ";
ss << "tmp";
ss << j+1;
ss << ")){\n";
}
CheckSubArgumentIsNan2(ss,vSubArguments,0,p1);
ss << " tmp += tmp0;\n";
for(unsigned j=1;j<vSubArguments.size();j+=2,mm--)
{
for(int n = 0;n<mm+1;n++)
{
ss << " ";
}
ss<< "}\n\n";
}
ss << " }\n";
ss << " shm_buf[lidx] = tmp;\n";
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
ss << " for (int i = 128; i >0; i/=2) {\n";
ss << " if (lidx < i)\n";
ss << " shm_buf[lidx] += shm_buf[lidx + i];\n";
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
ss << " }\n";
ss << " if (lidx == 0)\n";
ss << " current_result += shm_buf[0];\n";
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
ss << " }\n";
ss << " if (lidx == 0)\n";
ss << " result[writePos] = current_result;\n";
ss << "}\n";
}// finish generate reduction code
// generate functions as usual
ss << "\ndouble " << sSymName; ss << "\ndouble " << sSymName;
ss << "_"<< BinFuncName() <<"("; ss << "_"<< BinFuncName() <<"(";
for (unsigned i = 0; i < vSubArguments.size(); i++) for (unsigned i = 0; i < vSubArguments.size(); i++)
...@@ -459,48 +595,57 @@ void OpSumIfs::GenSlidingWindowFunction(std::stringstream &ss, ...@@ -459,48 +595,57 @@ void OpSumIfs::GenSlidingWindowFunction(std::stringstream &ss,
ss << ")\n {\n"; ss << ")\n {\n";
ss <<" int gid0=get_global_id(0);\n"; ss <<" int gid0=get_global_id(0);\n";
ss << " double tmp =0;\n"; ss << " double tmp =0;\n";
ss << " int i ;\n"; if (!mNeedReductionKernel)
GenTmpVariables(ss,vSubArguments); {
ss << " for (i = "; ss << " int i ;\n";
if (!pCurDVR->IsStartFixed() && pCurDVR->IsEndFixed()) { GenTmpVariables(ss,vSubArguments);
ss << "gid0; i < "<< nCurWindowSize <<"; i++)\n"; ss << " for (i = ";
} else if (pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed()) { if (!pCurDVR->IsStartFixed() && pCurDVR->IsEndFixed()) {
ss << "0; i < gid0+"<< nCurWindowSize <<"; i++)\n"; ss << "gid0; i < "<< nCurWindowSize <<"; i++)\n";
} else { } else if (pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed()) {
ss << "0; i < "<< nCurWindowSize <<"; i++)\n"; ss << "0; i < gid0+"<< nCurWindowSize <<"; i++)\n";
} } else {
ss << " {\n"; ss << "0; i < "<< nCurWindowSize <<"; i++)\n";
if(!pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed()) }
{ ss << " {\n";
ss<< " int doubleIndex =i+gid0;\n"; if(!pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed())
}else
{
ss<< " int doubleIndex =i;\n";
}
ss<< " int singleIndex =gid0;\n";
int m=0;
for(unsigned j=1;j<vSubArguments.size();j+=2,m++)
{
CheckSubArgumentIsNan(ss,vSubArguments,j);
CheckSubArgumentIsNan(ss,vSubArguments,j+1);
ss <<" if(isequal(";
ss <<"tmp";
ss <<j;
ss <<" , ";
ss << "tmp";
ss << j+1;
ss << ")){\n";
}
CheckSubArgumentIsNan(ss,vSubArguments,0);
ss << " tmp += tmp0;\n";
for(unsigned j=1;j<=vSubArguments.size();j+=2,m--)
{
for(int n = 0;n<m+1;n++)
{ {
ss << " "; ss<< " int doubleIndex =i+gid0;\n";
}else
{
ss<< " int doubleIndex =i;\n";
} }
ss<< "}\n"; ss<< " int singleIndex =gid0;\n";
} int m=0;
for(unsigned j=1;j<vSubArguments.size();j+=2,m++)
{
CheckSubArgumentIsNan(ss,vSubArguments,j);
CheckSubArgumentIsNan(ss,vSubArguments,j+1);
ss <<" if(isequal(";
ss <<"tmp";
ss <<j;
ss <<" , ";
ss << "tmp";
ss << j+1;
ss << ")){\n";
}
CheckSubArgumentIsNan(ss,vSubArguments,0);
ss << " tmp += tmp0;\n";
for(unsigned j=1;j<=vSubArguments.size();j+=2,m--)
{
for(int n = 0;n<m+1;n++)
{
ss << " ";
}
ss<< "}\n";
}
}
if (mNeedReductionKernel)
{
ss << "tmp =";
vSubArguments[0]->GenDeclRef(ss);
ss << "[gid0];\n";
}
ss << "return tmp;\n"; ss << "return tmp;\n";
ss << "}"; ss << "}";
} }
......
...@@ -33,9 +33,13 @@ public: ...@@ -33,9 +33,13 @@ public:
class OpSumIfs: public CheckVariables class OpSumIfs: public CheckVariables
{ {
public: public:
OpSumIfs(void): CheckVariables(), mNeedReductionKernel(false) {}
virtual void GenSlidingWindowFunction(std::stringstream &ss, virtual void GenSlidingWindowFunction(std::stringstream &ss,
const std::string sSymName, SubArguments &vSubArguments); const std::string sSymName, SubArguments &vSubArguments);
virtual std::string BinFuncName(void) const { return "SumIfs"; } virtual std::string BinFuncName(void) const { return "SumIfs"; }
bool NeedReductionKernel(void) const { return mNeedReductionKernel; }
protected:
bool mNeedReductionKernel;
}; };
class OpCosh: public Normal class OpCosh: public Normal
{ {
......
...@@ -156,6 +156,49 @@ void CheckVariables::CheckSubArgumentIsNan( std::stringstream & ss, ...@@ -156,6 +156,49 @@ void CheckVariables::CheckSubArgumentIsNan( std::stringstream & ss,
ss << vSubArguments[i]->GenSlidingWindowDeclRef(); ss << vSubArguments[i]->GenSlidingWindowDeclRef();
ss<<";\n"; ss<<";\n";
} }
void CheckVariables::CheckSubArgumentIsNan2( std::stringstream & ss,
SubArguments &vSubArguments, int argumentNum, std::string p)
{
int i = argumentNum;
if(vSubArguments[i]->GetFormulaToken()->GetType() == formula::svDouble)
{
ss <<" tmp";
ss <<i;
ss << "=";
vSubArguments[i]->GenDeclRef(ss);
ss<<";\n";
return;
}
#ifdef ISNAN
ss<< " tmp";
ss<< i;
ss<< "= fsum(";
vSubArguments[i]->GenDeclRef(ss);
if(vSubArguments[i]->GetFormulaToken()->GetType() ==
formula::svDoubleVectorRef)
ss<<"["<< p.c_str()<< "]";
else if(vSubArguments[i]->GetFormulaToken()->GetType() ==
formula::svSingleVectorRef)
ss<<"[get_group_id(1)]";
ss<<", 0);\n";
return;
#endif
ss <<" tmp";
ss <<i;
ss << "=";
vSubArguments[i]->GenDeclRef(ss);
if(vSubArguments[i]->GetFormulaToken()->GetType() ==
formula::svDoubleVectorRef)
ss<<"["<< p.c_str()<< "]";
else if(vSubArguments[i]->GetFormulaToken()->GetType() ==
formula::svSingleVectorRef)
ss<<"[get_group_id(1)]";
ss<<";\n";
}
void CheckVariables::CheckAllSubArgumentIsNan( void CheckVariables::CheckAllSubArgumentIsNan(
std::stringstream & ss, SubArguments & vSubArguments) std::stringstream & ss, SubArguments & vSubArguments)
{ {
......
...@@ -104,6 +104,9 @@ public: ...@@ -104,6 +104,9 @@ public:
virtual void DumpInlineFun(std::set<std::string>& , virtual void DumpInlineFun(std::set<std::string>& ,
std::set<std::string>& ) const {} std::set<std::string>& ) const {}
const std::string& GetName(void) const { return mSymName; } const std::string& GetName(void) const { return mSymName; }
cl_mem GetCLBuffer(void) const {return mpClmem; }
virtual bool NeedParallelReduction(void) const { return false; }
protected: protected:
const std::string mSymName; const std::string mSymName;
FormulaTreeNodeRef mFormulaTree; FormulaTreeNodeRef mFormulaTree;
...@@ -157,6 +160,9 @@ public: ...@@ -157,6 +160,9 @@ public:
SubArguments &vSubArguments, int argumentNum); SubArguments &vSubArguments, int argumentNum);
void CheckAllSubArgumentIsNan(std::stringstream &ss, void CheckAllSubArgumentIsNan(std::stringstream &ss,
SubArguments &vSubArguments); SubArguments &vSubArguments);
// only check isNan
void CheckSubArgumentIsNan2(std::stringstream &ss,
SubArguments &vSubArguments, int argumentNum, std::string p);
}; };
}} }}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment