I wrote a simple program to compare the performance of OpenCV with and without GPU acceleration. The GPU code path is implemented using OpenCV's OpenCL Transparent API (T-API).
Overall, the GPU-enabled path runs faster than the CPU path, except for matrix multiplication, which is surprising considering that multiplication is the bread and butter of good GPU performance.
Here is the code used to run my simple test:
// Runs a fixed sequence of OpenCV operations, executing each one kPasses
// times in a row and calling m_util.perfMeasure_end(<label>) after every
// stage. The same code path serves both runs of the benchmark: the caller
// decides whether the arrays are cv::Mat or cv::UMat.
//
// matSrc  Input image. It is fed to cvtColor with COLOR_BGR2GRAY, so it is
//         presumably a BGR image -- TODO confirm against callers.
// matDst  Output array. It is reused as the working buffer: several stages
//         read from it and write back into it.
//
// NOTE(review): when the arguments are UMat, the OpenCL work may only be
// queued when a call returns; confirm whether a cv::ocl::finish() is needed
// before each perfMeasure_end for the per-stage numbers to be meaningful.
void OpenCLProc(cv::InputArray matSrc, cv::OutputArray matDst)
{
    const int kPasses = 50;  // repetitions per measured stage

    m_util.perfMeasure_start();

    // Colour conversion: matSrc -> matDst.
    for (int pass = 0; pass != kPasses; ++pass) {
        cv::cvtColor(matSrc, matDst, cv::COLOR_BGR2GRAY);
    }
    m_util.perfMeasure_end("cvtColor");

    // 9x9 Gaussian blur with sigma 1.5, applied in place on matDst.
    for (int pass = 0; pass != kPasses; ++pass) {
        cv::GaussianBlur(matDst, matDst, cv::Size(9, 9), 1.5);
    }
    m_util.perfMeasure_end("GaussianBlur");

    // Canny edge detection (thresholds 0 and 50), in place on matDst.
    for (int pass = 0; pass != kPasses; ++pass) {
        cv::Canny(matDst, matDst, 0, 50);
    }
    m_util.perfMeasure_end("Canny");

    // Dilation with an empty kernel argument, in place on matDst.
    for (int pass = 0; pass != kPasses; ++pass) {
        cv::dilate(matDst, matDst, cv::noArray());
    }
    m_util.perfMeasure_end("Dilate");

    // From here on matDst is rebuilt from matSrc: matDst = matSrc + matSrc.
    for (int pass = 0; pass != kPasses; ++pass) {
        cv::add(matSrc, matSrc, matDst);
    }
    m_util.perfMeasure_end("Add");

    // Per-element product; matDst is both an operand and the destination,
    // so every pass consumes the result of the previous one.
    for (int pass = 0; pass != kPasses; ++pass) {
        cv::multiply(matSrc, matDst, matDst);
    }
    m_util.perfMeasure_end("multiply");

    // Product of matSrc and the constant 2.5.
    for (int pass = 0; pass != kPasses; ++pass) {
        cv::multiply(matSrc, 2.5, matDst);
    }
    m_util.perfMeasure_end("multiply_scalar");

    // Per-element quotient; matDst is again operand and destination.
    for (int pass = 0; pass != kPasses; ++pass) {
        cv::divide(matSrc, matDst, matDst);
    }
    m_util.perfMeasure_end("divide");

    // Quotient of matSrc and the constant 2.5.
    for (int pass = 0; pass != kPasses; ++pass) {
        cv::divide(matSrc, 2.5, matDst);
    }
    m_util.perfMeasure_end("divide_Scalar");

    // Weighted sum: matDst = 2.5 * matSrc + 0.6 * matDst + 0.
    for (int pass = 0; pass != kPasses; ++pass) {
        cv::addWeighted(matSrc, 2.5, matDst, 0.6, 0, matDst);
    }
    m_util.perfMeasure_end("addWeighted");
}
// Drives the benchmark twice on the same input: first with cv::Mat buffers
// (reported as the CPU run), then with cv::UMat buffers (reported as the GPU
// run). Each run is read -> copy into a working buffer -> OpenCLProc -> copy
// the result back into a cv::Mat, with m_util.perfMeasure_end called after
// every step.
//
// matSrc  Image to benchmark with; its column and row counts are printed.
//
// NOTE(review): whether the UMat run actually executes on the GPU depends on
// OpenCL being available and enabled at run time -- confirm on the target.
void TestOpenCL(cv::InputArray matSrc)
{
    m_util.printConsole("[PERF] -= Performance Check =-\n");
    m_util.printConsole("[PERF] ImageSize = %d x %d\n", matSrc.cols(), matSrc.rows());

    // ---------------- CPU run: cv::Mat buffers ----------------
    m_util.printConsole("[PERF] CPU Process\n");
    m_util.perfMeasure_start();

    // Declarations stay after perfMeasure_start so they are timed as before.
    cv::Mat cpuWork;
    cv::Mat cpuGray;
    cv::Mat cpuInput = matSrc.getMat();
    cv::Mat cpuResult;
    m_util.perfMeasure_end("Read");

    // Copy the input into the working buffer.
    cpuInput.copyTo(cpuWork);
    m_util.perfMeasure_end("Transfer CPU->CPU");

    // Run the measured operations.
    OpenCLProc(cpuWork, cpuGray);

    // Copy the result out of the working buffer.
    cpuGray.copyTo(cpuResult);
    m_util.perfMeasure_end("Transfer CPU->CPU");
    std::cout << "\n";

    // ---------------- GPU run: cv::UMat buffers ----------------
    m_util.printConsole("[PERF] GPU Process (copyTo)\n");

    cv::UMat gpuWork;
    cv::UMat gpuGray;
    cv::Mat gpuInput = matSrc.getMat();
    cv::Mat gpuResult;
    m_util.perfMeasure_end("Read");

    // Copy the Mat into a UMat.
    gpuInput.copyTo(gpuWork);
    m_util.perfMeasure_end("Transfer CPU->GPU");

    // Same operations as the CPU run, this time on UMat arguments.
    OpenCLProc(gpuWork, gpuGray);

    // Copy the UMat result back into a Mat.
    gpuGray.copyTo(gpuResult);
    m_util.perfMeasure_end("Transfer GPU->CPU");
    std::cout << "\n";
}
And here are the results, which I plotted in a bar chart. Notice that the GPU-enabled processes always outperformed the CPU implementation, except for the multiplication and addWeighted functions.
Is there a problem with the OpenCV function? How can the multiplication function take longer than division?
This test was run on: - Library: OpenCV 3.3.1, built with OpenCL and TBB - CPU: Intel i7-6500U - GPU: Intel HD Graphics 520