Hi. I'm working on a Jetson TX1 with CUDA 8.0. I compiled OpenCV 3.1 with CUDA support, and the build appears to have succeeded.
However, when I tried some of the samples, I found the CUDA computation to be pretty slow, mostly because of the time spent uploading images.
For example, I tried SURF feature extraction and matching using the OpenCV sample code. The result is shown below:
upLoad = 39.9022
Device 0: "NVIDIA Tegra X1" 3995Mb, sm_53, Driver/Runtime ver.8.0/8.0
FOUND 158 keypoints on first image
FOUND 137 keypoints on second image
Findcuda = 0.000123487 Extraction = 0.0952315
Matching = 0.00152424 Download = 0.00137919
This means that uploading the two images took about 40 seconds! Is there any solution? Thank you.
My code is below.
#include <iostream>
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_XFEATURES2D
#include "opencv2/core.hpp"
#include "opencv2/features2d.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/cudafeatures2d.hpp"
#include "opencv2/xfeatures2d/cuda.hpp"
using namespace std;
using namespace cv;
using namespace cv::cuda;
// Prints a one-line description of the demo followed by the expected
// command-line usage. Both messages go to standard output.
static void help()
{
    static const char* const kDescription =
        "\nThis program demonstrates using SURF_CUDA features detector, descriptor extractor and BruteForceMatcher_CUDA";
    static const char* const kUsage =
        "\nUsage:\n\tsurf_keypoint_matcher --left <image1> --right <image2>";

    // Each message is terminated by a newline and followed by a flush.
    std::cout << kDescription << std::endl;
    std::cout << kUsage << std::endl;
}
int main(int argc, char* argv[])
{
if (argc != 5)
{
help();
return -1;
}
GpuMat img1, img2;
cv::Mat raw1,raw2;
raw1 = imread(argv[2], IMREAD_GRAYSCALE);
raw2= imread(argv[4], IMREAD_GRAYSCALE);
int64 t0 = cv::getTickCount();
for (int i = 1; i < argc; ++i)
{
if (string(argv[i]) == "--left")
{
img1.upload(raw1);
CV_Assert(!img1.empty());
}
else if (string(argv[i]) == "--right")
{
img2.upload(raw2);
CV_Assert(!img2.empty());
}
else if (string(argv[i]) == "--help")
{
help();
return -1;
}
}
int64 t1 = cv::getTickCount();
cout << " upLoad = " << (t1-t0)/cv::getTickFrequency() << endl;
cv::cuda::printShortCudaDeviceInfo(cv::cuda::getDevice());
int64 t2 = cv::getTickCount();
SURF_CUDA surf;
// detecting keypoints & computing descriptors
GpuMat keypoints1GPU, keypoints2GPU;
GpuMat descriptors1GPU, descriptors2GPU;
surf(img1, GpuMat(), keypoints1GPU, descriptors1GPU);
surf(img2, GpuMat(), keypoints2GPU, descriptors2GPU);
int64 t3 = cv::getTickCount();
cout << "FOUND " << keypoints1GPU.cols << " keypoints on first image" << endl;
cout << "FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl;
cout << " Findcuda = " << (t2-t1)/cv::getTickFrequency() << " Extraction = " << (t3-t2)/cv::getTickFrequency() << endl;
// matching descriptors
Ptr<cv::cuda::DescriptorMatcher> matcher = cv::cuda::DescriptorMatcher::createBFMatcher(surf.defaultNorm());
vector<DMatch> matches;
matcher->match(descriptors1GPU, descriptors2GPU, matches);
int64 t4 = cv::getTickCount();
// downloading results
vector<KeyPoint> keypoints1, keypoints2;
vector<float> descriptors1, descriptors2;
surf.downloadKeypoints(keypoints1GPU, keypoints1);
surf.downloadKeypoints(keypoints2GPU, keypoints2);
surf.downloadDescriptors(descriptors1GPU, descriptors1);
surf.downloadDescriptors(descriptors2GPU, descriptors2);
int64 t5 = cv::getTickCount();
cout << " Matching = " << (t4-t3)/cv::getTickFrequency() << " Download = " << (t5-t4)/cv::getTickFrequency() << endl;
// drawing the results
Mat img_matches;
drawMatches(Mat(img1), keypoints1, Mat(img2), keypoints2, matches, img_matches);
namedWindow("matches", 0);
imshow("matches", img_matches);
waitKey(0);
return 0;
}
#else
// Fallback entry point used when HAVE_OPENCV_XFEATURES2D is not defined:
// writes a notice to standard error and exits with status 0.
int main()
{
    const char* const notice = "OpenCV was built without xfeatures2d module";
    std::cerr << notice << std::endl;
    return 0;
}
#endif