Kaydet (Commit) 501bc66c authored tarafından I-Jui (Ray) Sung's avatar I-Jui (Ray) Sung

GPU Calc: use parallel reduction to implement sum

Use reduction kernel when given a large DoubleVectorRef

Change-Id: Ifd4977b81be64274733909e43f0e5ef161bb455e
Signed-off-by: 's avatarI-Jui (Ray) Sung <ray@multicorewareinc.com>
üst 43cab408
...@@ -44,7 +44,7 @@ namespace sc { namespace opencl { ...@@ -44,7 +44,7 @@ namespace sc { namespace opencl {
/// Map the buffer used by an argument and do necessary argument setting /// Map the buffer used by an argument and do necessary argument setting
size_t DynamicKernelArgument::Marshal(cl_kernel k, int argno, int) size_t DynamicKernelArgument::Marshal(cl_kernel k, int argno, int, cl_program)
{ {
FormulaToken *ref = mFormulaTree->GetFormulaToken(); FormulaToken *ref = mFormulaTree->GetFormulaToken();
assert(mpClmem == NULL); assert(mpClmem == NULL);
...@@ -125,7 +125,7 @@ public: ...@@ -125,7 +125,7 @@ public:
return 1; return 1;
} }
/// Pass the 32-bit hash of the string to the kernel /// Pass the 32-bit hash of the string to the kernel
virtual size_t Marshal(cl_kernel k, int argno, int) virtual size_t Marshal(cl_kernel k, int argno, int, cl_program)
{ {
FormulaToken *ref = mFormulaTree->GetFormulaToken(); FormulaToken *ref = mFormulaTree->GetFormulaToken();
assert(mpClmem == NULL); assert(mpClmem == NULL);
...@@ -183,7 +183,7 @@ public: ...@@ -183,7 +183,7 @@ public:
return 1; return 1;
} }
/// Create buffer and pass the buffer to a given kernel /// Create buffer and pass the buffer to a given kernel
virtual size_t Marshal(cl_kernel k, int argno, int) virtual size_t Marshal(cl_kernel k, int argno, int, cl_program)
{ {
double tmp = 0.0; double tmp = 0.0;
// Pass the scalar result back to the rest of the formula kernel // Pass the scalar result back to the rest of the formula kernel
...@@ -222,7 +222,7 @@ public: ...@@ -222,7 +222,7 @@ public:
return 1; return 1;
} }
/// Create buffer and pass the buffer to a given kernel /// Create buffer and pass the buffer to a given kernel
virtual size_t Marshal(cl_kernel k, int argno, int) virtual size_t Marshal(cl_kernel k, int argno, int, cl_program)
{ {
double tmp = 0.0; double tmp = 0.0;
// Pass the scalar result back to the rest of the formula kernel // Pass the scalar result back to the rest of the formula kernel
...@@ -264,7 +264,7 @@ public: ...@@ -264,7 +264,7 @@ public:
return 1; return 1;
} }
/// Create buffer and pass the buffer to a given kernel /// Create buffer and pass the buffer to a given kernel
virtual size_t Marshal(cl_kernel k, int argno, int) virtual size_t Marshal(cl_kernel k, int argno, int, cl_program)
{ {
double tmp = 0.0; double tmp = 0.0;
// Pass the scalar result back to the rest of the formula kernel // Pass the scalar result back to the rest of the formula kernel
...@@ -292,11 +292,11 @@ public: ...@@ -292,11 +292,11 @@ public:
{ {
DynamicKernelStringArgument::GenDecl(ss); DynamicKernelStringArgument::GenDecl(ss);
} }
virtual size_t Marshal(cl_kernel, int, int); virtual size_t Marshal(cl_kernel, int, int, cl_program);
}; };
/// Marshal a string vector reference /// Marshal a string vector reference
size_t DynamicKernelStringArgument::Marshal(cl_kernel k, int argno, int) size_t DynamicKernelStringArgument::Marshal(cl_kernel k, int argno, int, cl_program)
{ {
FormulaToken *ref = mFormulaTree->GetFormulaToken(); FormulaToken *ref = mFormulaTree->GetFormulaToken();
assert(mpClmem == NULL); assert(mpClmem == NULL);
...@@ -390,10 +390,10 @@ public: ...@@ -390,10 +390,10 @@ public:
ss << ")"; ss << ")";
return ss.str(); return ss.str();
} }
virtual size_t Marshal(cl_kernel k, int argno, int vw) virtual size_t Marshal(cl_kernel k, int argno, int vw, cl_program p)
{ {
int i = DynamicKernelArgument::Marshal(k, argno, vw); int i = DynamicKernelArgument::Marshal(k, argno, vw, p);
i += mStringArgument.Marshal(k, argno+i, vw); i += mStringArgument.Marshal(k, argno+i, vw, p);
return i; return i;
} }
protected: protected:
...@@ -402,24 +402,61 @@ protected: ...@@ -402,24 +402,61 @@ protected:
/// Handling a Double Vector that is used as a sliding window input /// Handling a Double Vector that is used as a sliding window input
/// to either a sliding window average or sum-of-products /// to either a sliding window average or sum-of-products
class OpSum; // Forward Declaration
template<class Base> template<class Base>
class DynamicKernelSlidingArgument: public Base class DynamicKernelSlidingArgument: public Base
{ {
public: public:
DynamicKernelSlidingArgument(const std::string &s, DynamicKernelSlidingArgument(const std::string &s,
FormulaTreeNodeRef ft): FormulaTreeNodeRef ft, boost::shared_ptr<SlidingFunctionBase> &CodeGen):
Base(s, ft) Base(s, ft), mpCodeGen(CodeGen), needReductionKernel(true), mpClmem2(NULL)
{ {
FormulaToken *t = ft->GetFormulaToken(); FormulaToken *t = ft->GetFormulaToken();
if (t->GetType() != formula::svDoubleVectorRef) if (t->GetType() != formula::svDoubleVectorRef)
throw Unhandled(); throw Unhandled();
const formula::DoubleVectorRefToken* pDVR = mpDVR = dynamic_cast<const formula::DoubleVectorRefToken *>(t);
dynamic_cast<const formula::DoubleVectorRefToken *>(t); assert(mpDVR);
assert(pDVR); bIsStartFixed = mpDVR->IsStartFixed();
bIsStartFixed = pDVR->IsStartFixed(); bIsEndFixed = mpDVR->IsEndFixed();
bIsEndFixed = pDVR->IsEndFixed();
} }
virtual void GenSlidingWindowFunction(std::stringstream &) {} virtual void GenSlidingWindowFunction(std::stringstream &ss) {
std::string name = Base::GetName();
ss << "__kernel void "<<name;
ss << "_reduction(__global double* A, "
"__global double *result,int arrayLength,int windowSize){\n";
ss << " double tmp, current_result = 0.0;\n";
ss << " int writePos = get_group_id(1);\n";
ss << " int offset = get_group_id(1);\n";
ss << " int lidx = get_local_id(0);\n";
ss << " __local double shm_buf[256];\n";
ss << " if (arrayLength == windowSize)\n";
ss << " offset = 0;\n";
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
ss << " int loop = arrayLength/512 + 1;\n";
ss << " for (int l=0; l<loop; l++){\n";
ss << " tmp = 0.0;\n";
ss << " int loopOffset = l*512;\n";
ss << " if((loopOffset + lidx + offset + 256) < ( offset + windowSize))\n";
ss << " tmp = A[loopOffset + lidx + offset] + "
"A[loopOffset + lidx + offset + 256];\n";
ss << " else if ((loopOffset + lidx + offset) < ( offset + windowSize))\n";
ss << " tmp = A[loopOffset + lidx + offset];\n";
ss << " shm_buf[lidx] = tmp;\n";
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
ss << " for (int i = 128; i >0; i/=2) {\n";
ss << " if (lidx < i)\n";
ss << " shm_buf[lidx] += shm_buf[lidx + i];\n";
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
ss << " }\n";
ss << " if (lidx == 0)\n";
ss << " current_result += shm_buf[0];\n";
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
ss << " }\n";
ss << " if (lidx == 0)\n";
ss << " result[writePos] = current_result;\n";
ss << "}\n";
}
virtual std::string GenSlidingWindowDeclRef(bool=false) const virtual std::string GenSlidingWindowDeclRef(bool=false) const
{ {
...@@ -430,8 +467,168 @@ public: ...@@ -430,8 +467,168 @@ public:
ss << Base::GetName() << "[i]"; ss << Base::GetName() << "[i]";
return ss.str(); return ss.str();
} }
/// Controls how the elements in the DoubleVectorRef are traversed
virtual size_t GenLoop(std::stringstream &ss, bool &needBody)
{
assert(mpDVR);
size_t nCurWindowSize = mpDVR->GetRefRowSize();
if (dynamic_cast<OpSum*>(mpCodeGen.get()))
{
if (!bIsStartFixed && !bIsEndFixed)
{
// set 100 as a threshold for invoking reduction kernel
if (nCurWindowSize > 100 )
{
std::string temp = Base::GetName() + "[gid0]";
ss << "tmp = ";
ss << mpCodeGen->Gen2(temp, "tmp");
ss << ";\n\t";
needBody = false;
needReductionKernel = false;
return nCurWindowSize;
}
}
if (bIsStartFixed && bIsEndFixed)
{
// set 100 as a threshold for invoking reduction kernel
if (nCurWindowSize > 100 )
{
std::string temp = Base::GetName() + "[0]";
ss << "tmp = ";
ss << mpCodeGen->Gen2(temp, "tmp");
ss << ";\n\t";
needBody = false;
needReductionKernel = false;
return nCurWindowSize;
}
}
}
needBody = true;
needReductionKernel = true;
ss << "for (int i = ";
if (!bIsStartFixed && bIsEndFixed)
{
#ifdef ISNAN
ss << "gid0; i < " << mpDVR->GetArrayLength();
ss << " && i < " << nCurWindowSize << "; i++){\n\t\t";
#else
ss << "gid0; i < "<< nCurWindowSize << "; i++)\n\t\t";
#endif
}
else if (bIsStartFixed && !bIsEndFixed)
{
#ifdef ISNAN
ss << "0; i < " << mpDVR->GetArrayLength();
ss << " && i < gid0+"<< nCurWindowSize << "; i++){\n\t\t";
#else
ss << "0; i < gid0+"<< nCurWindowSize << "; i++)\n\t\t";
#endif
}
else if (!bIsStartFixed && !bIsEndFixed)
{
#ifdef ISNAN
ss << "0; i + gid0 < " << mpDVR->GetArrayLength();
ss << " && i < "<< nCurWindowSize << "; i++){\n\t\t";
#else
ss << "0; i < "<< nCurWindowSize << "; i++)\n\t\t";
#endif
}
else
{
#ifdef ISNAN
ss << "0; i < "<< nCurWindowSize << "; i++){\n\t\t";
#else
ss << "0; i < "<< nCurWindowSize << "; i++)\n\t\t";
#endif
}
return nCurWindowSize;
}
virtual size_t Marshal(cl_kernel k, int argno, int w, cl_program mpProgram)
{
if (needReductionKernel)
return Base::Marshal(k, argno, w, mpProgram);
assert(Base::mpClmem == NULL);
// Obtain cl context
KernelEnv kEnv;
OpenclDevice::setKernelEnv(&kEnv);
cl_int err;
size_t nInput = mpDVR->GetArrayLength();
size_t nCurWindowSize = mpDVR->GetRefRowSize();
// create clmem buffer
if (mpDVR->GetArrays()[0].mpNumericArray == NULL)
throw Unhandled();
double *pHostBuffer = const_cast<double*>(
mpDVR->GetArrays()[0].mpNumericArray);
size_t szHostBuffer = nInput * sizeof(double);
Base::mpClmem = clCreateBuffer(kEnv.mpkContext,
(cl_mem_flags) CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR,
szHostBuffer,
pHostBuffer, &err);
mpClmem2 = clCreateBuffer(kEnv.mpkContext, CL_MEM_WRITE_ONLY,
sizeof(double)*w, NULL, NULL);
if (CL_SUCCESS != err)
throw OpenCLError(err);
// reproduce the reduction function name
std::string kernelName = Base::GetName() + "_reduction";
cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err);
if (err != CL_SUCCESS)
throw OpenCLError(err);
// set kernel arg of reduction kernel
err = clSetKernelArg(redKernel, 0, sizeof(cl_mem),
(void *)&(Base::mpClmem));
if (CL_SUCCESS != err)
throw OpenCLError(err);
err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), (void *)&mpClmem2);
if (CL_SUCCESS != err)
throw OpenCLError(err);
err = clSetKernelArg(redKernel, 2, sizeof(cl_int), (void*)&nInput);
if (CL_SUCCESS != err)
throw OpenCLError(err);
err = clSetKernelArg(redKernel, 3, sizeof(cl_int), (void*)&nCurWindowSize);
if (CL_SUCCESS != err)
throw OpenCLError(err);
// set work group size and execute
size_t global_work_size[] = {256, (size_t)w };
size_t local_work_size[] = {256, 1};
err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, NULL,
global_work_size, local_work_size, 0, NULL, NULL);
if (CL_SUCCESS != err)
throw OpenCLError(err);
err = clFinish(kEnv.mpkCmdQueue);
if (CL_SUCCESS != err)
throw OpenCLError(err);
// set kernel arg
err = clSetKernelArg(k, argno, sizeof(cl_mem), (void*)&(mpClmem2));
if (CL_SUCCESS != err)
throw OpenCLError(err);
return 1;
}
~DynamicKernelSlidingArgument()
{
if (mpClmem2)
{
clReleaseMemObject(mpClmem2);
mpClmem2 = NULL;
}
}
protected: protected:
bool bIsStartFixed, bIsEndFixed; bool bIsStartFixed, bIsEndFixed;
const formula::DoubleVectorRefToken *mpDVR;
// from parent nodes
boost::shared_ptr<SlidingFunctionBase> mpCodeGen;
// controls whether to invoke the reduction kernel during marshaling or not
bool needReductionKernel;
cl_mem mpClmem2;
}; };
/// Abstract class for code generation /// Abstract class for code generation
...@@ -439,6 +636,9 @@ protected: ...@@ -439,6 +636,9 @@ protected:
class Reduction: public SlidingFunctionBase class Reduction: public SlidingFunctionBase
{ {
public: public:
typedef DynamicKernelSlidingArgument<DynamicKernelArgument> NumericRange;
typedef DynamicKernelSlidingArgument<DynamicKernelStringArgument> StringRange;
virtual void GenSlidingWindowFunction(std::stringstream &ss, virtual void GenSlidingWindowFunction(std::stringstream &ss,
const std::string sSymName, SubArguments &vSubArguments) const std::string sSymName, SubArguments &vSubArguments)
{ {
...@@ -459,43 +659,25 @@ public: ...@@ -459,43 +659,25 @@ public:
size_t nItems = 0; size_t nItems = 0;
while (i--) while (i--)
{ {
FormulaToken *pCur = vSubArguments[i]->GetFormulaToken(); if (NumericRange *NR = dynamic_cast<NumericRange *> (vSubArguments[i].get()))
assert(pCur);
if (pCur->GetType() == formula::svDoubleVectorRef)
{ {
const formula::DoubleVectorRefToken* pDVR = bool needBody;
dynamic_cast<const formula::DoubleVectorRefToken *>(pCur); nItems += NR->GenLoop(ss, needBody);
size_t nCurWindowSize = pDVR->GetRefRowSize(); if (needBody == false) continue;
ss << "for (int i = ";
if (!pDVR->IsStartFixed() && pDVR->IsEndFixed()) {
#ifdef ISNAN
ss << "gid0; i < " << pDVR->GetArrayLength();
ss << " && i < " << nCurWindowSize << "; i++){\n\t\t";
#else
ss << "gid0; i < "<< nCurWindowSize << "; i++)\n\t\t";
#endif
} else if (pDVR->IsStartFixed() && !pDVR->IsEndFixed()) {
#ifdef ISNAN
ss << "0; i < " << pDVR->GetArrayLength();
ss << " && i < gid0+"<< nCurWindowSize << "; i++){\n\t\t";
#else
ss << "0; i < gid0+"<< nCurWindowSize << "; i++)\n\t\t";
#endif
} else if (!pDVR->IsStartFixed() && !pDVR->IsEndFixed()){
#ifdef ISNAN
ss << "0; i + gid0 < " << pDVR->GetArrayLength();
ss << " && i < "<< nCurWindowSize << "; i++){\n\t\t";
#else
ss << "0; i < "<< nCurWindowSize << "; i++)\n\t\t";
#endif
} }
else else if (StringRange *SR = dynamic_cast<StringRange *> (vSubArguments[i].get()))
{ {
ss << "0; i < "<< pDVR->GetArrayLength() << "; i++){\n\t\t"; bool needBody;
} nItems += SR->GenLoop(ss, needBody); //did not handle yet
nItems += nCurWindowSize; if (needBody == false) continue;
} }
else if (pCur->GetType() == formula::svSingleVectorRef) else
{
FormulaToken *pCur = vSubArguments[i]->GetFormulaToken();
assert(pCur);
assert(pCur->GetType() != formula::svDoubleVectorRef);
if (pCur->GetType() == formula::svSingleVectorRef)
{ {
#ifdef ISNAN #ifdef ISNAN
const formula::SingleVectorRefToken* pSVR = const formula::SingleVectorRefToken* pSVR =
...@@ -515,10 +697,11 @@ public: ...@@ -515,10 +697,11 @@ public:
else else
{ {
#ifdef ISNAN #ifdef ISNAN
ss << "nCount += 1;\n\t"; ss << "nCount += 1;\n\t\t";
#endif #endif
nItems += 1; nItems += 1;
} }
}
#ifdef ISNAN #ifdef ISNAN
if(ocPush==vSubArguments[i]->GetFormulaToken()->GetOpCode()) if(ocPush==vSubArguments[i]->GetFormulaToken()->GetOpCode())
{ {
...@@ -810,13 +993,13 @@ public: ...@@ -810,13 +993,13 @@ public:
const std::string &s, const FormulaTreeNodeRef& ft, SlidingFunctionBase* pCodeGen); const std::string &s, const FormulaTreeNodeRef& ft, SlidingFunctionBase* pCodeGen);
/// Create buffer and pass the buffer to a given kernel /// Create buffer and pass the buffer to a given kernel
virtual size_t Marshal(cl_kernel k, int argno, int nVectorWidth) virtual size_t Marshal(cl_kernel k, int argno, int nVectorWidth, cl_program pProgram)
{ {
unsigned i = 0; unsigned i = 0;
for (SubArgumentsType::iterator it = mvSubArguments.begin(), e= mvSubArguments.end(); it!=e; for (SubArgumentsType::iterator it = mvSubArguments.begin(), e= mvSubArguments.end(); it!=e;
++it) ++it)
{ {
i += (*it)->Marshal(k, argno + i, nVectorWidth); i += (*it)->Marshal(k, argno + i, nVectorWidth, pProgram);
} }
return i; return i;
} }
...@@ -910,7 +1093,7 @@ public: ...@@ -910,7 +1093,7 @@ public:
} }
private: private:
SubArgumentsType mvSubArguments; SubArgumentsType mvSubArguments;
boost::scoped_ptr<SlidingFunctionBase> mpCodeGen; boost::shared_ptr<SlidingFunctionBase> mpCodeGen;
}; };
boost::shared_ptr<DynamicKernelArgument> SoPHelper( boost::shared_ptr<DynamicKernelArgument> SoPHelper(
...@@ -944,12 +1127,12 @@ DynamicKernelSoPArguments::DynamicKernelSoPArguments( ...@@ -944,12 +1127,12 @@ DynamicKernelSoPArguments::DynamicKernelSoPArguments(
if (pDVR->GetArrays()[0].mpNumericArray) if (pDVR->GetArrays()[0].mpNumericArray)
mvSubArguments.push_back( mvSubArguments.push_back(
SubArgument(new DynamicKernelSlidingArgument SubArgument(new DynamicKernelSlidingArgument
<DynamicKernelArgument>(ts, ft->Children[i]))); <DynamicKernelArgument>(ts, ft->Children[i], mpCodeGen)));
else else
mvSubArguments.push_back( mvSubArguments.push_back(
SubArgument(new DynamicKernelSlidingArgument SubArgument(new DynamicKernelSlidingArgument
<DynamicKernelStringArgument>( <DynamicKernelStringArgument>(
ts, ft->Children[i]))); ts, ft->Children[i], mpCodeGen)));
} else if (pChild->GetType() == formula::svSingleVectorRef) { } else if (pChild->GetType() == formula::svSingleVectorRef) {
const formula::SingleVectorRefToken* pSVR = const formula::SingleVectorRefToken* pSVR =
dynamic_cast< const formula::SingleVectorRefToken* >(pChild); dynamic_cast< const formula::SingleVectorRefToken* >(pChild);
...@@ -1713,19 +1896,19 @@ public: ...@@ -1713,19 +1896,19 @@ public:
} }
/// Memory mapping from host to device and pass buffers to the given kernel as /// Memory mapping from host to device and pass buffers to the given kernel as
/// arguments /// arguments
void Marshal(cl_kernel, int); void Marshal(cl_kernel, int, cl_program);
private: private:
unsigned int mCurId; unsigned int mCurId;
ArgumentMap mSymbols; ArgumentMap mSymbols;
ArgumentList mParams; ArgumentList mParams;
}; };
void SymbolTable::Marshal(cl_kernel k, int nVectorWidth) void SymbolTable::Marshal(cl_kernel k, int nVectorWidth, cl_program pProgram)
{ {
int i = 1; //The first argument is reserved for results int i = 1; //The first argument is reserved for results
for(ArgumentList::iterator it = mParams.begin(), e= mParams.end(); it!=e; for(ArgumentList::iterator it = mParams.begin(), e= mParams.end(); it!=e;
++it) { ++it) {
i+=(*it)->Marshal(k, i, nVectorWidth); i+=(*it)->Marshal(k, i, nVectorWidth, pProgram);
} }
} }
...@@ -1816,7 +1999,7 @@ public: ...@@ -1816,7 +1999,7 @@ public:
if (CL_SUCCESS != err) if (CL_SUCCESS != err)
throw OpenCLError(err); throw OpenCLError(err);
// The rest of buffers // The rest of buffers
mSyms.Marshal(mpKernel, nr); mSyms.Marshal(mpKernel, nr, mpProgram);
size_t global_work_size[] = {nr}; size_t global_work_size[] = {nr};
err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, mpKernel, 1, NULL, err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, mpKernel, 1, NULL,
global_work_size, NULL, 0, NULL, NULL); global_work_size, NULL, 0, NULL, NULL);
......
...@@ -92,7 +92,7 @@ public: ...@@ -92,7 +92,7 @@ public:
virtual void GenDeclRef(std::stringstream &ss) const; virtual void GenDeclRef(std::stringstream &ss) const;
/// Create buffer and pass the buffer to a given kernel /// Create buffer and pass the buffer to a given kernel
virtual size_t Marshal(cl_kernel, int, int); virtual size_t Marshal(cl_kernel, int, int, cl_program);
virtual ~DynamicKernelArgument(); virtual ~DynamicKernelArgument();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment