1 | initial version |
That's not an answer, but it is easier to insert code and results here:
only to compile kernels *******
[PERF] -= Performance Check =-
[PERF] ImageSize = 2048 x 2048
[PERF] CPU Process
[TIME] Read : 0.001 ms
[TIME] Transfer CPU->CPU : 4.584 ms
[TIME] cvtColor : 59.819 ms
[TIME] GaussianBlur : 584.149 ms
[TIME] Canny : 492.338 ms
[TIME] Dilate : 43.844 ms
[TIME] Add : 110.266 ms
[TIME] multiply : 88.820 ms
[TIME] multiply_scalar : 767.707 ms
[TIME] divide : 354.996 ms
[TIME] divide_Scalar : 1784.366 ms
[TIME] addWeighted : 318.085 ms
[TIME] Transfer CPU->CPU : 3.922 ms
[PERF] GPU Process (copyTo)
[TIME] Read : 0.052 ms
[ INFO:0] Initialize OpenCL runtime...
[TIME] Transfer CPU->GPU : 711.447 ms
[ INFO:0] Successfully initialized OpenCL cache directory: C:\Users\LAUREN~1.PC-\AppData\Local\Temp\opencv\3.4.0-dev\opencl_cache\
[ INFO:0] Preparing OpenCL cache configuration for context: NVIDIA_Corporation--GeForce_GTX_970--376_53
[TIME] cvtColor : 72.888 ms
[TIME] GaussianBlur : 343.475 ms
[TIME] Canny : 195.293 ms
[TIME] Dilate : 85.200 ms
[TIME] Add : 9.628 ms
[TIME] multiply : 9.469 ms
[TIME] multiply_scalar : 13.250 ms
[TIME] divide : 29.945 ms
[TIME] divide_Scalar : 32.137 ms
[TIME] addWeighted : 94.774 ms
[TIME] Transfer GPU->CPU : 19.806 ms
Real Test *************************
[PERF] -= Performance Check =-
[PERF] ImageSize = 2048 x 2048
[PERF] CPU Process
[TIME] Read : 0.001 ms
[TIME] Transfer CPU->CPU : 5.736 ms
[TIME] cvtColor : 65.285 ms
[TIME] GaussianBlur : 582.992 ms
[TIME] Canny : 428.836 ms
[TIME] Dilate : 44.551 ms
[TIME] Add : 93.305 ms
[TIME] multiply : 95.427 ms
[TIME] multiply_scalar : 781.954 ms
[TIME] divide : 366.936 ms
[TIME] divide_Scalar : 1782.807 ms
[TIME] addWeighted : 336.931 ms
[TIME] Transfer CPU->CPU : 8.400 ms
[PERF] GPU Process (copyTo)
[TIME] Read : 0.063 ms
[TIME] Transfer CPU->GPU : 11.910 ms
[TIME] cvtColor : 1.609 ms
[TIME] GaussianBlur : 328.819 ms
[TIME] Canny : 168.397 ms
[TIME] Dilate : 77.406 ms
[TIME] Add : 3.617 ms
[TIME] multiply : 1.209 ms
[TIME] multiply_scalar : 1.350 ms
[TIME] divide : 1.892 ms
[TIME] divide_Scalar : 2.366 ms
[TIME] addWeighted : 2.807 ms
[TIME] Transfer GPU->CPU : 168.320 ms
#include <opencv2/opencv.hpp>
#include <iostream>
using namespace cv;
using namespace std;
// Global state for the timing helpers perfMeasure_start() / perfMeasure_end().
// m_dTime is initialised to 0 and is not referenced by any code visible in this listing.
double m_dTime = 0;
// Shared stopwatch: restarted by perfMeasure_start(), read and restarted by perfMeasure_end().
TickMeter aChrono;
// Begins a fresh measurement: clears whatever the shared stopwatch has
// accumulated so far and sets it running again.
void perfMeasure_start()
{
    auto& stopwatch = aChrono;
    stopwatch.reset();
    stopwatch.start();
}
// Closes the current measurement and immediately opens the next one.
//
// strLabel: name printed next to the elapsed time (right-aligned in a
//           25-character column).
//
// The stopwatch is stopped, the elapsed time is printed in milliseconds as a
// "[TIME]" line, and the stopwatch is then cleared and restarted, so successive
// calls time consecutive stages without another perfMeasure_start().
void perfMeasure_end(std::string strLabel)
{
    auto& stopwatch = aChrono;
    stopwatch.stop();

    const double elapsedMs = stopwatch.getTimeMilli();
    const char* label = strLabel.c_str();
    printf("[TIME]%25s : %.3lf ms\n", label, elapsedMs);

    stopwatch.reset();
    stopwatch.start();
}
void OpenCLProc(cv::InputArray matSrc, cv::OutputArray matDst)
{
perfMeasure_start();
for (int i = 0; i < 50; i++)
{
cv::cvtColor(matSrc, matDst, cv::COLOR_BGR2GRAY);
}
perfMeasure_end("cvtColor");
for (int i = 0; i < 50; i++)
{
cv::GaussianBlur(matDst, matDst, cv::Size(9, 9), 1.5);
}
perfMeasure_end("GaussianBlur");
for (int i = 0; i < 50; i++)
{
cv::Canny(matDst, matDst, 0, 50);
}
perfMeasure_end("Canny");
for (int i = 0; i < 50; i++)
{
cv::dilate(matDst, matDst, cv::noArray());
}
perfMeasure_end("Dilate");
for (int i = 0; i < 50; i++)
{
cv::add(matSrc, matSrc, matDst);
}
perfMeasure_end("Add");
for (int i = 0; i < 50; i++)
{
cv::multiply(matSrc, matDst, matDst);
}
perfMeasure_end("multiply");
for (int i = 0; i < 50; i++)
{
cv::multiply(matSrc, 2.5, matDst);
}
perfMeasure_end("multiply_scalar");
for (int i = 0; i < 50; i++)
{
cv::divide(matSrc, matDst, matDst);
}
perfMeasure_end("divide");
for (int i = 0; i < 50; i++)
{
cv::divide(matSrc, 2.5, matDst);
}
perfMeasure_end("divide_Scalar");
for (int i = 0; i < 50; i++)
{
cv::addWeighted(matSrc, 2.5, matDst, 0.6, 0, matDst);
}
perfMeasure_end("addWeighted");
}
// Runs the OpenCLProc() benchmark twice on the same input and prints the
// timings: first with cv::Mat buffers (reported as "CPU Process"), then with
// cv::UMat working buffers (reported as "GPU Process (copyTo)").
//
// matSrc: source image; its size is printed and it is copied into the working
//         buffer of each pass.
//
// The shared stopwatch keeps running between the two passes, so the second
// "Read" figure also covers the console output and buffer declarations that
// precede it.
void TestOpenCL(cv::InputArray matSrc)
{
    printf("[PERF] -= Performance Check =-\n");
    printf("[PERF] ImageSize = %d x %d\n", matSrc.cols(), matSrc.rows());

    // ---------------- Pass 1: cv::Mat buffers ----------------
    printf("[PERF] CPU Process\n");
    perfMeasure_start();
    cv::Mat cpuInput, cpuResult;
    cv::Mat cpuSource, cpuOutput;

    // Obtain a Mat header for the input.
    cpuSource = matSrc.getMat();
    perfMeasure_end("Read");

    // Copy the input into the working buffer.
    cpuSource.copyTo(cpuInput);
    perfMeasure_end("Transfer CPU->CPU");

    // Timed pipeline.
    OpenCLProc(cpuInput, cpuResult);

    // Copy the result back out.
    cpuResult.copyTo(cpuOutput);
    perfMeasure_end("Transfer CPU->CPU");
    std::cout << "\n";

    // ---------------- Pass 2: cv::UMat buffers ----------------
    printf("[PERF] GPU Process (copyTo)\n");
    cv::UMat gpuInput, gpuResult;
    cv::Mat gpuSource, gpuOutput;

    // Obtain a Mat header for the input.
    gpuSource = matSrc.getMat();
    perfMeasure_end("Read");

    // Upload: Mat -> UMat.
    gpuSource.copyTo(gpuInput);
    perfMeasure_end("Transfer CPU->GPU");

    // Timed pipeline on the UMat buffers.
    OpenCLProc(gpuInput, gpuResult);

    // Download: UMat -> Mat.
    gpuResult.copyTo(gpuOutput);
    perfMeasure_end("Transfer GPU->CPU");
    std::cout << "\n";
}
// Entry point of the benchmark.
//
// argv[1] (optional): path of the image to load. When omitted, the original
//                     hard-coded sample path is used.
//
// Returns 0 on success, 1 when the image cannot be loaded.
int main(int argc, const char** argv)
{
    const char* imagePath = (argc > 1) ? argv[1] : "g:/lib/opencv/samples/data/lena.jpg";

    cv::Mat matSrc = cv::imread(imagePath);
    if (matSrc.empty())
    {
        // imread yields an empty matrix when the file is missing or unreadable;
        // stop here instead of handing an empty image to resize().
        std::cerr << "Could not read image: " << imagePath << "\n";
        return 1;
    }

    // Enlarge the image by a factor of 4 in both directions.
    cv::resize(matSrc, matSrc, cv::Size(), 4, 4, cv::INTER_LINEAR);

    // First round of process to let the GPU initialize
    std::cout << "only to compile kernels *************************\n";
    TestOpenCL(matSrc);

    // The code performance is made based on result from this function
    std::cout << "Real Test *************************\n";
    TestOpenCL(matSrc);
    return 0;
}
2 | No.2 Revision |
That's not an answer, but it is easier to insert code and results
without or
only to compile kernels *******
[PERF] -= Performance Check =-
[PERF] ImageSize = 2048 x 2048
[PERF] CPU Process
[TIME] Read : 0.001 ms
[TIME] Transfer CPU->CPU : 4.584 ms
[TIME] cvtColor : 59.819 ms
[TIME] GaussianBlur : 584.149 ms
[TIME] Canny : 492.338 ms
[TIME] Dilate : 43.844 ms
[TIME] Add : 110.266 ms
[TIME] multiply : 88.820 ms
[TIME] multiply_scalar : 767.707 ms
[TIME] divide : 354.996 ms
[TIME] divide_Scalar : 1784.366 ms
[TIME] addWeighted : 318.085 ms
[TIME] Transfer CPU->CPU : 3.922 ms
[PERF] GPU Process (copyTo)
[TIME] Read : 0.052 ms
[ INFO:0] Initialize OpenCL runtime...
[TIME] Transfer CPU->GPU : 711.447 ms
[ INFO:0] Successfully initialized OpenCL cache directory: C:\Users\LAUREN~1.PC-\AppData\Local\Temp\opencv\3.4.0-dev\opencl_cache\
[ INFO:0] Preparing OpenCL cache configuration for context: NVIDIA_Corporation--GeForce_GTX_970--376_53
[TIME] cvtColor : 72.888 ms
[TIME] GaussianBlur : 343.475 ms
[TIME] Canny : 195.293 ms
[TIME] Dilate : 85.200 ms
[TIME] Add : 9.628 ms
[TIME] multiply : 9.469 ms
[TIME] multiply_scalar : 13.250 ms
[TIME] divide : 29.945 ms
[TIME] divide_Scalar : 32.137 ms
[TIME] addWeighted : 94.774 ms
[TIME] Transfer GPU->CPU : 19.806 ms
Real Test *************************
[PERF] -= Performance Check =-
[PERF] ImageSize = 2048 x 2048
[PERF] CPU Process
[TIME] Read : 0.001 ms
[TIME] Transfer CPU->CPU : 5.736 ms
[TIME] cvtColor : 65.285 ms
[TIME] GaussianBlur : 582.992 ms
[TIME] Canny : 428.836 ms
[TIME] Dilate : 44.551 ms
[TIME] Add : 93.305 ms
[TIME] multiply : 95.427 ms
[TIME] multiply_scalar : 781.954 ms
[TIME] divide : 366.936 ms
[TIME] divide_Scalar : 1782.807 ms
[TIME] addWeighted : 336.931 ms
[TIME] Transfer CPU->CPU : 8.400 ms
[PERF] GPU Process (copyTo)
[TIME] Read : 0.063 ms
[TIME] Transfer CPU->GPU : 11.910 ms
[TIME] cvtColor : 1.609 ms
[TIME] GaussianBlur : 328.819 ms
[TIME] Canny : 168.397 ms
[TIME] Dilate : 77.406 ms
[TIME] Add : 3.617 ms
[TIME] multiply : 1.209 ms
[TIME] multiply_scalar : 1.350 ms
[TIME] divide : 1.892 ms
[TIME] divide_Scalar : 2.366 ms
[TIME] addWeighted : 2.807 ms
[TIME] Transfer GPU->CPU : 168.320 ms
#include <opencv2/opencv.hpp>
#include <iostream>
using namespace cv;
using namespace std;
// Global state for the timing helpers perfMeasure_start() / perfMeasure_end().
// m_dTime is initialised to 0 and is not referenced by any code visible in this listing.
double m_dTime = 0;
// Shared stopwatch: restarted by perfMeasure_start(), read and restarted by perfMeasure_end().
TickMeter aChrono;
// Begins a fresh measurement: clears whatever the shared stopwatch has
// accumulated so far and sets it running again.
void perfMeasure_start()
{
    auto& stopwatch = aChrono;
    stopwatch.reset();
    stopwatch.start();
}
// Closes the current measurement and immediately opens the next one.
//
// strLabel: name printed next to the elapsed time (right-aligned in a
//           25-character column).
//
// The stopwatch is stopped, the elapsed time is printed in milliseconds as a
// "[TIME]" line, and the stopwatch is then cleared and restarted, so successive
// calls time consecutive stages without another perfMeasure_start().
void perfMeasure_end(std::string strLabel)
{
    auto& stopwatch = aChrono;
    stopwatch.stop();

    const double elapsedMs = stopwatch.getTimeMilli();
    const char* label = strLabel.c_str();
    printf("[TIME]%25s : %.3lf ms\n", label, elapsedMs);

    stopwatch.reset();
    stopwatch.start();
}
void OpenCLProc(cv::InputArray matSrc, cv::OutputArray matDst)
{
perfMeasure_start();
for (int i = 0; i < 50; i++)
{
cv::cvtColor(matSrc, matDst, cv::COLOR_BGR2GRAY);
}
perfMeasure_end("cvtColor");
for (int i = 0; i < 50; i++)
{
cv::GaussianBlur(matDst, matDst, cv::Size(9, 9), 1.5);
}
perfMeasure_end("GaussianBlur");
for (int i = 0; i < 50; i++)
{
cv::Canny(matDst, matDst, 0, 50);
}
perfMeasure_end("Canny");
for (int i = 0; i < 50; i++)
{
cv::dilate(matDst, matDst, cv::noArray());
}
perfMeasure_end("Dilate");
for (int i = 0; i < 50; i++)
{
cv::add(matSrc, matSrc, matDst);
}
perfMeasure_end("Add");
for (int i = 0; i < 50; i++)
{
cv::multiply(matSrc, matDst, matDst);
}
perfMeasure_end("multiply");
for (int i = 0; i < 50; i++)
{
cv::multiply(matSrc, 2.5, matDst);
}
perfMeasure_end("multiply_scalar");
for (int i = 0; i < 50; i++)
{
cv::divide(matSrc, matDst, matDst);
}
perfMeasure_end("divide");
for (int i = 0; i < 50; i++)
{
cv::divide(matSrc, 2.5, matDst);
}
perfMeasure_end("divide_Scalar");
for (int i = 0; i < 50; i++)
{
cv::addWeighted(matSrc, 2.5, matDst, 0.6, 0, matDst);
}
perfMeasure_end("addWeighted");
}
// Runs the OpenCLProc() benchmark twice on the same input and prints the
// timings: first with cv::Mat buffers (reported as "CPU Process"), then with
// cv::UMat working buffers (reported as "GPU Process (copyTo)").
//
// matSrc: source image; its size is printed and it is copied into the working
//         buffer of each pass.
//
// The shared stopwatch keeps running between the two passes, so the second
// "Read" figure also covers the console output and buffer declarations that
// precede it.
void TestOpenCL(cv::InputArray matSrc)
{
    printf("[PERF] -= Performance Check =-\n");
    printf("[PERF] ImageSize = %d x %d\n", matSrc.cols(), matSrc.rows());

    // ---------------- Pass 1: cv::Mat buffers ----------------
    printf("[PERF] CPU Process\n");
    perfMeasure_start();
    cv::Mat cpuInput, cpuResult;
    cv::Mat cpuSource, cpuOutput;

    // Obtain a Mat header for the input.
    cpuSource = matSrc.getMat();
    perfMeasure_end("Read");

    // Copy the input into the working buffer.
    cpuSource.copyTo(cpuInput);
    perfMeasure_end("Transfer CPU->CPU");

    // Timed pipeline.
    OpenCLProc(cpuInput, cpuResult);

    // Copy the result back out.
    cpuResult.copyTo(cpuOutput);
    perfMeasure_end("Transfer CPU->CPU");
    std::cout << "\n";

    // ---------------- Pass 2: cv::UMat buffers ----------------
    printf("[PERF] GPU Process (copyTo)\n");
    cv::UMat gpuInput, gpuResult;
    cv::Mat gpuSource, gpuOutput;

    // Obtain a Mat header for the input.
    gpuSource = matSrc.getMat();
    perfMeasure_end("Read");

    // Upload: Mat -> UMat.
    gpuSource.copyTo(gpuInput);
    perfMeasure_end("Transfer CPU->GPU");

    // Timed pipeline on the UMat buffers.
    OpenCLProc(gpuInput, gpuResult);

    // Download: UMat -> Mat.
    gpuResult.copyTo(gpuOutput);
    perfMeasure_end("Transfer GPU->CPU");
    std::cout << "\n";
}
// Entry point of the benchmark.
//
// argv[1] (optional): path of the image to load. When omitted, the original
//                     hard-coded sample path is used.
//
// Returns 0 on success, 1 when the image cannot be loaded.
int main(int argc, const char** argv)
{
    const char* imagePath = (argc > 1) ? argv[1] : "g:/lib/opencv/samples/data/lena.jpg";

    cv::Mat matSrc = cv::imread(imagePath);
    if (matSrc.empty())
    {
        // imread yields an empty matrix when the file is missing or unreadable;
        // stop here instead of handing an empty image to resize().
        std::cerr << "Could not read image: " << imagePath << "\n";
        return 1;
    }

    // Enlarge the image by a factor of 4 in both directions.
    cv::resize(matSrc, matSrc, cv::Size(), 4, 4, cv::INTER_LINEAR);

    // First round of process to let the GPU initialize
    std::cout << "only to compile kernels *************************\n";
    TestOpenCL(matSrc);

    // The code performance is made based on result from this function
    std::cout << "Real Test *************************\n";
    TestOpenCL(matSrc);
    return 0;
}
3 | No.3 Revision |
That's not an answer, but it is easier to insert code and results
without patch for or for
only to compile kernels *******
[PERF] -= Performance Check =-
[PERF] ImageSize = 2048 x 2048
[PERF] CPU Process
[TIME] Read : 0.001 ms
[TIME] Transfer CPU->CPU : 4.584 ms
[TIME] cvtColor : 59.819 ms
[TIME] GaussianBlur : 584.149 ms
[TIME] Canny : 492.338 ms
[TIME] Dilate : 43.844 ms
[TIME] Add : 110.266 ms
[TIME] multiply : 88.820 ms
[TIME] multiply_scalar : 767.707 ms
[TIME] divide : 354.996 ms
[TIME] divide_Scalar : 1784.366 ms
[TIME] addWeighted : 318.085 ms
[TIME] Transfer CPU->CPU : 3.922 ms
[PERF] GPU Process (copyTo)
[TIME] Read : 0.052 ms
[ INFO:0] Initialize OpenCL runtime...
[TIME] Transfer CPU->GPU : 711.447 ms
[ INFO:0] Successfully initialized OpenCL cache directory: C:\Users\LAUREN~1.PC-\AppData\Local\Temp\opencv\3.4.0-dev\opencl_cache\
[ INFO:0] Preparing OpenCL cache configuration for context: NVIDIA_Corporation--GeForce_GTX_970--376_53
[TIME] cvtColor : 72.888 ms
[TIME] GaussianBlur : 343.475 ms
[TIME] Canny : 195.293 ms
[TIME] Dilate : 85.200 ms
[TIME] Add : 9.628 ms
[TIME] multiply : 9.469 ms
[TIME] multiply_scalar : 13.250 ms
[TIME] divide : 29.945 ms
[TIME] divide_Scalar : 32.137 ms
[TIME] addWeighted : 94.774 ms
[TIME] Transfer GPU->CPU : 19.806 ms
Real Test *************************
[PERF] -= Performance Check =-
[PERF] ImageSize = 2048 x 2048
[PERF] CPU Process
[TIME] Read : 0.001 ms
[TIME] Transfer CPU->CPU : 5.736 ms
[TIME] cvtColor : 65.285 ms
[TIME] GaussianBlur : 582.992 ms
[TIME] Canny : 428.836 ms
[TIME] Dilate : 44.551 ms
[TIME] Add : 93.305 ms
[TIME] multiply : 95.427 ms
[TIME] multiply_scalar : 781.954 ms
[TIME] divide : 366.936 ms
[TIME] divide_Scalar : 1782.807 ms
[TIME] addWeighted : 336.931 ms
[TIME] Transfer CPU->CPU : 8.400 ms
[PERF] GPU Process (copyTo)
[TIME] Read : 0.063 ms
[TIME] Transfer CPU->GPU : 11.910 ms
[TIME] cvtColor : 1.609 ms
[TIME] GaussianBlur : 328.819 ms
[TIME] Canny : 168.397 ms
[TIME] Dilate : 77.406 ms
[TIME] Add : 3.617 ms
[TIME] multiply : 1.209 ms
[TIME] multiply_scalar : 1.350 ms
[TIME] divide : 1.892 ms
[TIME] divide_Scalar : 2.366 ms
[TIME] addWeighted : 2.807 ms
[TIME] Transfer GPU->CPU : 168.320 ms
#include <opencv2/opencv.hpp>
#include <iostream>
using namespace cv;
using namespace std;
// Global state for the timing helpers perfMeasure_start() / perfMeasure_end().
// m_dTime is initialised to 0 and is not referenced by any code visible in this listing.
double m_dTime = 0;
// Shared stopwatch: restarted by perfMeasure_start(), read and restarted by perfMeasure_end().
TickMeter aChrono;
// Begins a fresh measurement: clears whatever the shared stopwatch has
// accumulated so far and sets it running again.
void perfMeasure_start()
{
    auto& stopwatch = aChrono;
    stopwatch.reset();
    stopwatch.start();
}
// Closes the current measurement and immediately opens the next one.
//
// strLabel: name printed next to the elapsed time (right-aligned in a
//           25-character column).
//
// The stopwatch is stopped, the elapsed time is printed in milliseconds as a
// "[TIME]" line, and the stopwatch is then cleared and restarted, so successive
// calls time consecutive stages without another perfMeasure_start().
void perfMeasure_end(std::string strLabel)
{
    auto& stopwatch = aChrono;
    stopwatch.stop();

    const double elapsedMs = stopwatch.getTimeMilli();
    const char* label = strLabel.c_str();
    printf("[TIME]%25s : %.3lf ms\n", label, elapsedMs);

    stopwatch.reset();
    stopwatch.start();
}
void OpenCLProc(cv::InputArray matSrc, cv::OutputArray matDst)
{
perfMeasure_start();
for (int i = 0; i < 50; i++)
{
cv::cvtColor(matSrc, matDst, cv::COLOR_BGR2GRAY);
}
perfMeasure_end("cvtColor");
for (int i = 0; i < 50; i++)
{
cv::GaussianBlur(matDst, matDst, cv::Size(9, 9), 1.5);
}
perfMeasure_end("GaussianBlur");
for (int i = 0; i < 50; i++)
{
cv::Canny(matDst, matDst, 0, 50);
}
perfMeasure_end("Canny");
for (int i = 0; i < 50; i++)
{
cv::dilate(matDst, matDst, cv::noArray());
}
perfMeasure_end("Dilate");
for (int i = 0; i < 50; i++)
{
cv::add(matSrc, matSrc, matDst);
}
perfMeasure_end("Add");
for (int i = 0; i < 50; i++)
{
cv::multiply(matSrc, matDst, matDst);
}
perfMeasure_end("multiply");
for (int i = 0; i < 50; i++)
{
cv::multiply(matSrc, 2.5, matDst);
}
perfMeasure_end("multiply_scalar");
for (int i = 0; i < 50; i++)
{
cv::divide(matSrc, matDst, matDst);
}
perfMeasure_end("divide");
for (int i = 0; i < 50; i++)
{
cv::divide(matSrc, 2.5, matDst);
}
perfMeasure_end("divide_Scalar");
for (int i = 0; i < 50; i++)
{
cv::addWeighted(matSrc, 2.5, matDst, 0.6, 0, matDst);
}
perfMeasure_end("addWeighted");
}
// Runs the OpenCLProc() benchmark twice on the same input and prints the
// timings: first with cv::Mat buffers (reported as "CPU Process"), then with
// cv::UMat working buffers (reported as "GPU Process (copyTo)").
//
// matSrc: source image; its size is printed and it is copied into the working
//         buffer of each pass.
//
// The shared stopwatch keeps running between the two passes, so the second
// "Read" figure also covers the console output and buffer declarations that
// precede it.
void TestOpenCL(cv::InputArray matSrc)
{
    printf("[PERF] -= Performance Check =-\n");
    printf("[PERF] ImageSize = %d x %d\n", matSrc.cols(), matSrc.rows());

    // ---------------- Pass 1: cv::Mat buffers ----------------
    printf("[PERF] CPU Process\n");
    perfMeasure_start();
    cv::Mat cpuInput, cpuResult;
    cv::Mat cpuSource, cpuOutput;

    // Obtain a Mat header for the input.
    cpuSource = matSrc.getMat();
    perfMeasure_end("Read");

    // Copy the input into the working buffer.
    cpuSource.copyTo(cpuInput);
    perfMeasure_end("Transfer CPU->CPU");

    // Timed pipeline.
    OpenCLProc(cpuInput, cpuResult);

    // Copy the result back out.
    cpuResult.copyTo(cpuOutput);
    perfMeasure_end("Transfer CPU->CPU");
    std::cout << "\n";

    // ---------------- Pass 2: cv::UMat buffers ----------------
    printf("[PERF] GPU Process (copyTo)\n");
    cv::UMat gpuInput, gpuResult;
    cv::Mat gpuSource, gpuOutput;

    // Obtain a Mat header for the input.
    gpuSource = matSrc.getMat();
    perfMeasure_end("Read");

    // Upload: Mat -> UMat.
    gpuSource.copyTo(gpuInput);
    perfMeasure_end("Transfer CPU->GPU");

    // Timed pipeline on the UMat buffers.
    OpenCLProc(gpuInput, gpuResult);

    // Download: UMat -> Mat.
    gpuResult.copyTo(gpuOutput);
    perfMeasure_end("Transfer GPU->CPU");
    std::cout << "\n";
}
// Entry point of the benchmark.
//
// argv[1] (optional): path of the image to load. When omitted, the original
//                     hard-coded sample path is used.
//
// Returns 0 on success, 1 when the image cannot be loaded.
int main(int argc, const char** argv)
{
    const char* imagePath = (argc > 1) ? argv[1] : "g:/lib/opencv/samples/data/lena.jpg";

    cv::Mat matSrc = cv::imread(imagePath);
    if (matSrc.empty())
    {
        // imread yields an empty matrix when the file is missing or unreadable;
        // stop here instead of handing an empty image to resize().
        std::cerr << "Could not read image: " << imagePath << "\n";
        return 1;
    }

    // Enlarge the image by a factor of 4 in both directions.
    cv::resize(matSrc, matSrc, cv::Size(), 4, 4, cv::INTER_LINEAR);

    // First round of process to let the GPU initialize
    std::cout << "only to compile kernels *************************\n";
    TestOpenCL(matSrc);

    // The code performance is made based on result from this function
    std::cout << "Real Test *************************\n";
    TestOpenCL(matSrc);
    return 0;
}