Revision history [back]

Use the following pattern to run your own CUDA kernel on each captured frame:
using namespace cv;
using namespace cv::gpu;

// Host-side wrapper: unpacks the source GpuMat into raw arguments and
// forwards them to func.
//
//   src - input matrix; its data is accessed as uchar3 elements
//   dst - output matrix
//
// NOTE(review): dst is not forwarded to func in this snippet — confirm
// whether func is expected to receive the destination buffer as well.
void callKernel(const GpuMat& src, GpuMat& dst)
{
    // Fields of a GpuMat that are available to pass to a kernel launcher:
    //   ptr<T>() - device pointer to the data (T is the element type)
    //   step     - step in bytes between image rows
    //   cols     - image width
    //   rows     - image height
    const auto srcData   = src.ptr<uchar3>();
    const auto srcStep   = src.step;
    const auto srcWidth  = src.cols;
    const auto srcHeight = src.rows;

    func(srcData, srcStep, srcWidth, srcHeight);
}

// Per-frame processing loop: read a frame from cap, upload it to the
// device, run callKernel on it, download the result and display it.
// The "..." lines stand for setup/teardown code omitted from the snippet.
int main()
{
    ...
    // Host-side images: the captured frame and the result to display.
    Mat frame;
    Mat output;
    // Device-side counterparts of the two host images.
    GpuMat d_frame;
    GpuMat d_output;
    while (true)
    {
        // Read the next frame; stop as soon as an empty one comes back.
        cap >> frame;
        if (frame.empty())
        {
            break;
        }

        // Host -> device: copy the captured frame into d_frame.
        d_frame.upload(frame);

        // Prepare the device output and run the kernel wrapper.
        // NOTE(review): size and type are not declared in the visible
        // code — presumably defined in the omitted part; confirm.
        d_output.create(size, type);
        callKernel(d_frame, d_output);

        // Device -> host: copy the processed result into output.
        d_output.download(output);

        imshow("output", output);

        // Leave the loop when waitKey(30) returns a non-negative value.
        const int pressedKey = waitKey(30);
        if (pressedKey >= 0)
        {
            break;
        }
    }
    ...
}