Hi there! I recently built a simple OpenCV application in Python that uses ORB and a brute-force matcher to compare image pairs that live in two separate datasets. The code was as follows:
def compute_descriptors(path):
    """Compute ORB descriptors for every regular file directly inside ``path``.

    Args:
        path: Directory to scan; must provide ``iterdir()`` (e.g. a
            ``pathlib.Path``). Sub-directories are ignored.

    Returns:
        A ``np.uint8`` array of shape ``(nfeatures, 32, len(files))`` whose
        slice ``[:, :, i]`` holds the descriptors of the i-th file, in the
        order produced by ``path.iterdir()``.
    """
    nfeatures = 500
    files = [x for x in path.iterdir() if x.is_file()]
    orb = cv2.ORB_create(nfeatures)
    descs = np.empty((nfeatures, 32, len(files)), dtype=np.uint8)
    for i, file in enumerate(files):
        img = cv2.imread(str(file), 0)
        img = cv2.equalizeHist(img)  # Histogram equalization to bring out more detail
        _, desc = orb.detectAndCompute(img, None)
        # NOTE(review): this assignment only works when ORB yields exactly
        # `nfeatures` descriptors of 32 bytes for the image; fewer (or none)
        # makes it raise -- confirm that holds for every image in the dataset.
        descs[:, :, i] = desc
    # The original fell off the end here and therefore returned None.
    return descs
# Describe both datasets, then score every (query image, train image) pair.
query_path = pathlib.Path("/PATH/TO/QUERY-DATASET")
train_path = pathlib.Path("/PATH/TO/TRAIN-DATASET")
query_descriptors = compute_descriptors(query_path)
train_descriptors = compute_descriptors(train_path)
nfeatures = query_descriptors.shape[0]
# Ratio-test threshold; it was never defined in this snippet. 0.6 is the value
# the C++ port of this script uses.
threshold = 0.6
# scores[i, j] = share of query image i's features with a good match in train image j.
scores = np.zeros((query_descriptors.shape[-1], train_descriptors.shape[-1]))
matcher = cv2.BFMatcher_create(cv2.NORM_HAMMING)
for i in range(scores.shape[0]):
    for j in range(scores.shape[1]):
        # Use the arrays computed above (the original referenced the
        # undefined names `query_descs` / `train_descs`).
        matches = matcher.knnMatch(
            query_descriptors[:, :, i], train_descriptors[:, :, j], k=2
        )
        # Apply ratio-test; a pair is only testable if both neighbours exist.
        good = sum(
            1
            for pair in matches
            if len(pair) == 2 and pair[0].distance < threshold * pair[1].distance
        )
        scores[i, j] = good / nfeatures
The result is a 2D array of shape (n, m), where n is the number of images in the query dataset and m is the number of images in the train dataset. The match for each query image is the train image with the highest score, i.e. the largest proportion of good matches relative to the total nfeatures.
When I run this code I see two active threads. Looking at the source code for BFMatcher, there is batch_distance, which calculates the actual distances between two sets of descriptors. Furthermore, there is parallel_for_ (https://github.com/opencv/opencv/blob/1196eb33fcbdc87241d426ccd428718fc81affe9/modules/core/src/batch_distance.cpp#L384), which (to my understanding) is able to run the function in parallel, i.e. in multiple threads, which would explain the two active threads. When I ran cv2.getNumThreads(), however, I got back 4, even though only 2 threads were running.
The issue got even weirder when I ported the above code to C++. The code was roughly the following:
#include <vector>
#include <string>
#include <filesystem>
#include <opencv2/features2d.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/imgcodecs.hpp>
using namespace cv;
std::vector<Mat> computeDescriptors(const std::string &path, const int _nfeatures=500) {
// Creating ORB-object
Ptr<Feature2D> orb {ORB::create(_nfeatures)};
// Declaring needed variables
Mat tmp, descriptors;
std::vector<KeyPoint> kpts;
std::vector<Mat> descriptors_vec;
// Computation
for (const auto & entry : std::filesystem::directory_iterator(path)) {
Mat img {imread(entry.path(), IMREAD_GRAYSCALE)};
equalizeHist(img, tmp);
orb->detectAndCompute(tmp, noArray(), kpts, descriptors);
descriptors_vec.push_back(descriptors);
}
return descriptors_vec;
}
/// Scores every query/train image pair with a brute-force Hamming matcher.
///
/// @param queryDescriptors_vec Descriptor matrices of the query images.
/// @param trainDescriptors_vec Descriptor matrices of the train images.
/// @return A float matrix with one row per query image and one column per
///         train image; element (i, j) is the number of matches that pass the
///         ratio test divided by the row count of the first query matrix.
Mat computeScores(const std::vector<Mat> &queryDescriptors_vec, const std::vector<Mat> &trainDescriptors_vec) {
    Mat_<float> scores(static_cast<int>(queryDescriptors_vec.size()),
                       static_cast<int>(trainDescriptors_vec.size()), 0.0f);
    // Nothing to compare; also keeps the [0] access below in bounds.
    if (queryDescriptors_vec.empty() || trainDescriptors_vec.empty())
        return scores;

    // NOTE(review): every score is normalised by the first query image's
    // descriptor count; a zero here makes the division below non-finite.
    const auto nfeatures {queryDescriptors_vec[0].rows};
    const Ptr<DescriptorMatcher> matcher {BFMatcher::create(NORM_HAMMING)};
    constexpr float ratio_threshold {0.6f};

    std::vector<std::vector<DMatch>> knn_matches;
    for (size_t i = 0; i < queryDescriptors_vec.size(); ++i) {
        for (size_t j = 0; j < trainDescriptors_vec.size(); ++j) {
            const Mat &query {queryDescriptors_vec[i]};
            const Mat &train {trainDescriptors_vec[j]};
            // An image without descriptors cannot match anything: score stays 0.
            if (query.empty() || train.empty())
                continue;
            matcher->knnMatch(query, train, knn_matches, 2);
            // Filter matches using Lowe's ratio test. Fewer than two
            // neighbours can come back (e.g. a single train descriptor), so
            // check the size before touching the second one.
            unsigned int good_matches {0};
            for (const auto &matches : knn_matches)
                if (matches.size() >= 2 && matches[0].distance < ratio_threshold * matches[1].distance)
                    ++good_matches;
            // Write score to scores-Mat
            scores(static_cast<int>(i), static_cast<int>(j)) =
                static_cast<float>(good_matches) / static_cast<float>(nfeatures);
        }
    }
    return scores;
}
std::vector<Mat> queryDescriptorsVec {computeDescriptors("/PATH/TO/QUERY-IMAGES")};
std::vector<Mat> trainDescriptorsVec {computeDescriptors("/PATH/TO/TRAIN-IMAGES")};
Mat scores = computeScores(queryDescriptorsVec, trainDescriptorsVec);
When I run the above code I see only one active thread, even though I would consider both implementations identical! cv::getNumThreads() yielded 4, as was the case in Python.
The Python version was installed through conda (v4.4.0), while I compiled the C++ version myself (v4.5.0 from the GitHub master branch). I compiled it with WITH_PTHREADS_PF enabled. My OS is Arch Linux and I compile my C++ projects with g++.
~ λ uname -r
5.8.12-arch1-1
~ λ g++ --version
g++ (GCC) 10.2.0
Copyright (C) 2020 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
Is there some configuration required to get threading to work in C++ as well? As a bonus, is there some way to utilize all 4 available threads when running a BFMatcher?