Ask Your Question
4

Slow matrix multiplication when using OpenCL enabled OpenCV

asked 2018-01-05 02:32:08 -0600

yapws87 gravatar image

updated 2018-01-05 03:47:51 -0600

I made a simple program to test the performance of OpenCV with and without the help of GPU. GPU code is implemented using OpenCV/OpenCL Transparent API method.

Overall, GPU enabled process always runs faster than CPU except for matrix multiplication which is surprising considering multiplication process is bread and butter for a good GPU performance.

Here is the code used to run my simple test:

    #include <opencv2\opencv.hpp>
    #include <opencv2\opencv_modules.hpp>
    #include <opencv2\core\ocl.hpp>
    #include <iostream>
    #include <fstream>

    // Here are the functions used to measure time
    // m_dTime holds the tick count (cv::getTickCount, stored as double) at which
    // the current timing interval began; it is written by perfMeasure_start()
    // and read/updated by perfMeasure_end().
    double m_dTime = 0;

    // Opens a timing interval: stores the current tick count in m_dTime so the
    // next perfMeasure_end() call can compute the elapsed time from it.
    void perfMeasure_start()
    {
        m_dTime = static_cast<double>(cv::getTickCount());
    }

    // Closes the current timing interval, prints it, and opens the next one.
    //
    // strLabel: text printed next to the elapsed time (right-aligned, width 25).
    //
    // Output format: "[TIME]<label> : <milliseconds> ms".
    //
    // Operations on cv::UMat are enqueued on the OpenCL device, so the call that
    // issued them can return before the device has finished. The queue is
    // drained first so that the interval covers the work itself; otherwise the
    // cost is attributed to whichever later stage happens to block.
    void perfMeasure_end(std::string strLabel)
    {
        if (cv::ocl::useOpenCL())
            cv::ocl::finish();

        const double currentTime = static_cast<double>(cv::getTickCount());
        const double dTimeTaken = (currentTime - m_dTime) / cv::getTickFrequency();
        printf("[TIME]%25s : %.3lf ms\n", strLabel.c_str(), dTimeTaken * 1000);

        // The next interval starts exactly where this one ended.
        m_dTime = currentTime;
    }

    // Runs a fixed sequence of OpenCV operations, each repeated 50 times, and
    // reports the time of every stage through perfMeasure_end().
    //
    // matSrc: input image; converted with COLOR_BGR2GRAY in the first stage and
    //         used as an operand of the arithmetic stages.
    // matDst: output array; also reused as the in-place buffer of later stages.
    void OpenCLProc(cv::InputArray matSrc, cv::OutputArray matDst)
    {
        const int kRepeat = 50;  // iterations per timed stage

        perfMeasure_start();

        for (int n = 0; n < kRepeat; ++n)
            cv::cvtColor(matSrc, matDst, cv::COLOR_BGR2GRAY);
        perfMeasure_end("cvtColor");

        for (int n = 0; n < kRepeat; ++n)
            cv::GaussianBlur(matDst, matDst, cv::Size(9, 9), 1.5);
        perfMeasure_end("GaussianBlur");

        for (int n = 0; n < kRepeat; ++n)
            cv::Canny(matDst, matDst, 0, 50);
        perfMeasure_end("Canny");

        for (int n = 0; n < kRepeat; ++n)
            cv::dilate(matDst, matDst, cv::noArray());
        perfMeasure_end("Dilate");

        for (int n = 0; n < kRepeat; ++n)
            cv::add(matSrc, matSrc, matDst);
        perfMeasure_end("Add");

        for (int n = 0; n < kRepeat; ++n)
            cv::multiply(matSrc, matDst, matDst);
        perfMeasure_end("multiply");

        for (int n = 0; n < kRepeat; ++n)
            cv::multiply(matSrc, 2.5, matDst);
        perfMeasure_end("multiply_scalar");

        for (int n = 0; n < kRepeat; ++n)
            cv::divide(matSrc, matDst, matDst);
        perfMeasure_end("divide");

        for (int n = 0; n < kRepeat; ++n)
            cv::divide(matSrc, 2.5, matDst);
        perfMeasure_end("divide_Scalar");

        for (int n = 0; n < kRepeat; ++n)
            cv::addWeighted(matSrc, 2.5, matDst, 0.6, 0, matDst);
        perfMeasure_end("addWeighted");
    }

    // Runs the OpenCLProc() pipeline twice on the same input - once with cv::Mat
    // buffers and once with cv::UMat buffers - printing the time of each step.
    //
    // matSrc: image to benchmark; its size is printed in the header.
    void TestOpenCL(cv::InputArray matSrc)
    {
        printf("[PERF] -= Performance Check =-\n");
        printf("[PERF] ImageSize = %d x %d\n", matSrc.cols(), matSrc.rows());

        // ---- Pass 1: cv::Mat buffers --------------------------------------
        printf("[PERF] CPU Process\n");
        perfMeasure_start();
        cv::Mat cpuWork, cpuResult;
        cv::Mat cpuInput, cpuOutput;

        // Fetch the input as a Mat header
        cpuInput = matSrc.getMat();
        perfMeasure_end("Read");

        // Copy into the working buffer
        cpuInput.copyTo(cpuWork);
        perfMeasure_end("Transfer CPU->CPU");

        // Timed pipeline
        OpenCLProc(cpuWork, cpuResult);

        // Copy the result back out
        cpuResult.copyTo(cpuOutput);
        perfMeasure_end("Transfer CPU->CPU");

        std::cout << "\n";

        // ---- Pass 2: cv::UMat buffers -------------------------------------
        printf("[PERF] GPU Process (copyTo)\n");
        cv::UMat gpuWork, gpuResult;
        cv::Mat gpuInput, gpuOutput;

        // Fetch the input as a Mat header
        gpuInput = matSrc.getMat();
        perfMeasure_end("Read");

        // Mat -> UMat copy
        gpuInput.copyTo(gpuWork);
        perfMeasure_end("Transfer CPU->GPU");

        // Timed pipeline
        OpenCLProc(gpuWork, gpuResult);

        // UMat -> Mat copy
        gpuResult.copyTo(gpuOutput);
        perfMeasure_end("Transfer GPU->CPU");
        std::cout << "\n";
    }


int main(int argc, const char** argv)
{
    if (argv[1] == "")
        std::cout << "Please insert image path" << std::endl;
    else {

        cv::Mat matSrc = cv::imread(argv[1]);

            // First round of process to let the GPU initialize
        TestOpenCL(matSrc);

            // The code performance is made based on result from this function
        TestOpenCL(matSrc);

    }


    return 0;
}

And here are the results, which I plotted in a bar chart. Notice that the GPU-enabled processes always outperformed the CPU implementation, except for the multiplication and addWeighted functions.

Is there a problem with the OpenCV ... (more)

edit retag flag offensive close merge delete

Comments

What is the image size? What is m_util? To test your example we need the full code...

LBerger gravatar imageLBerger ( 2018-01-05 03:01:28 -0600 )edit

Image size is 4252x2835

yapws87 gravatar imageyapws87 ( 2018-01-05 03:10:36 -0600 )edit
1

m_util is just a process time measurement class. I will modify the code to be compile-friendly.

yapws87 gravatar imageyapws87 ( 2018-01-05 03:11:37 -0600 )edit
1

you can use Tickmeter class

LBerger gravatar imageLBerger ( 2018-01-05 03:12:56 -0600 )edit

Done, i think i have provided all the necessary code for testing.

yapws87 gravatar imageyapws87 ( 2018-01-05 03:24:25 -0600 )edit

I don't think my results are the same as yours.

LBerger gravatar imageLBerger ( 2018-01-05 06:37:56 -0600 )edit

1 answer

Sort by » oldest newest most voted
1

answered 2018-01-05 06:28:58 -0600

LBerger gravatar image

updated 2018-01-05 06:36:40 -0600

This is not an answer, but it is easier to insert code and results here.

without patch for image description or for image description

only to compile kernels *******

[PERF] -= Performance Check =-
[PERF] ImageSize = 2048 x 2048
[PERF] CPU Process
[TIME]                     Read : 0.001 ms
[TIME]        Transfer CPU->CPU : 4.584 ms
[TIME]                 cvtColor : 59.819 ms
[TIME]             GaussianBlur : 584.149 ms
[TIME]                    Canny : 492.338 ms
[TIME]                   Dilate : 43.844 ms
[TIME]                      Add : 110.266 ms
[TIME]                 multiply : 88.820 ms
[TIME]          multiply_scalar : 767.707 ms
[TIME]                   divide : 354.996 ms
[TIME]            divide_Scalar : 1784.366 ms
[TIME]              addWeighted : 318.085 ms
[TIME]        Transfer CPU->CPU : 3.922 ms

[PERF] GPU Process (copyTo)
[TIME]                     Read : 0.052 ms
[ INFO:0] Initialize OpenCL runtime...
[TIME]        Transfer CPU->GPU : 711.447 ms
[ INFO:0] Successfully initialized OpenCL cache directory: C:\Users\LAUREN~1.PC-\AppData\Local\Temp\opencv\3.4.0-dev\opencl_cache\
[ INFO:0] Preparing OpenCL cache configuration for context: NVIDIA_Corporation--GeForce_GTX_970--376_53
[TIME]                 cvtColor : 72.888 ms
[TIME]             GaussianBlur : 343.475 ms
[TIME]                    Canny : 195.293 ms
[TIME]                   Dilate : 85.200 ms
[TIME]                      Add : 9.628 ms
[TIME]                 multiply : 9.469 ms
[TIME]          multiply_scalar : 13.250 ms
[TIME]                   divide : 29.945 ms
[TIME]            divide_Scalar : 32.137 ms
[TIME]              addWeighted : 94.774 ms
[TIME]        Transfer GPU->CPU : 19.806 ms

Real Test *************************
[PERF] -= Performance Check =-
[PERF] ImageSize = 2048 x 2048
[PERF] CPU Process
[TIME]                     Read : 0.001 ms
[TIME]        Transfer CPU->CPU : 5.736 ms
[TIME]                 cvtColor : 65.285 ms
[TIME]             GaussianBlur : 582.992 ms
[TIME]                    Canny : 428.836 ms
[TIME]                   Dilate : 44.551 ms
[TIME]                      Add : 93.305 ms
[TIME]                 multiply : 95.427 ms
[TIME]          multiply_scalar : 781.954 ms
[TIME]                   divide : 366.936 ms
[TIME]            divide_Scalar : 1782.807 ms
[TIME]              addWeighted : 336.931 ms
[TIME]        Transfer CPU->CPU : 8.400 ms

[PERF] GPU Process (copyTo)
[TIME]                     Read : 0.063 ms
[TIME]        Transfer CPU->GPU : 11.910 ms
[TIME]                 cvtColor : 1.609 ms
[TIME]             GaussianBlur : 328.819 ms
[TIME]                    Canny : 168.397 ms
[TIME]                   Dilate : 77.406 ms
[TIME]                      Add : 3.617 ms
[TIME]                 multiply : 1.209 ms
[TIME]          multiply_scalar : 1.350 ms
[TIME]                   divide : 1.892 ms
[TIME]            divide_Scalar : 2.366 ms
[TIME]              addWeighted : 2.807 ms
[TIME]        Transfer GPU->CPU : 168.320 ms


#include <opencv2/opencv.hpp>
#include <opencv2/core/ocl.hpp>
#include <iostream>

using namespace cv;
using namespace std;

// Here are the functions used to measure time
// NOTE(review): m_dTime is not referenced by the timing helpers visible below;
// confirm whether it is still needed.
double m_dTime = 0;
// Stopwatch shared by perfMeasure_start() and perfMeasure_end().
TickMeter aChrono;

void perfMeasure_start()
{
    aChrono.reset();
    aChrono.start();
}

void perfMeasure_end(std::string strLabel)
{
    aChrono.stop();
    printf("[TIME]%25s : %.3lf ms\n", strLabel.c_str(), aChrono.getTimeMilli());
    aChrono.reset();
    aChrono.start();
}

void OpenCLProc(cv::InputArray matSrc, cv::OutputArray matDst)
{
    perfMeasure_start();
    for (int i = 0; i < 50; i++)
    {
        cv::cvtColor(matSrc, matDst, cv::COLOR_BGR2GRAY);
    }
    perfMeasure_end("cvtColor");

    for (int i = 0; i < 50; i++)
    {
        cv::GaussianBlur(matDst, matDst, cv::Size(9, 9), 1.5);
    }
    perfMeasure_end("GaussianBlur");

    for (int i = 0; i < 50; i++)
    {
        cv::Canny(matDst, matDst, 0, 50);
    }
    perfMeasure_end("Canny");

    for (int i = 0; i < 50; i++)
    {
        cv::dilate(matDst, matDst, cv::noArray());
    }
    perfMeasure_end("Dilate");

    for (int i = 0; i < 50; i++)
    {
        cv::add(matSrc, matSrc, matDst);
    }
    perfMeasure_end ...
(more)
edit flag offensive delete link more

Question Tools

1 follower

Stats

Asked: 2018-01-05 02:29:29 -0600

Seen: 1,167 times

Last updated: Jan 05 '18