I have been writing some optimizations for the OpenCV's threshold function, for ARM devices (mobile phones). It should be working on both Android and iPhone.
However, I do not have a device to test it on, so I am looking for volunteers to give me a little help. If that motivates you more, I am planning to push it to the main OpenCV repo - and hope you guys will accept it
I am interested in code correctness, and if it happens to work as intended, some statistics for original/optimized performance. Do not forget to look at all scenarios.
If this patch format is a nonsense for you, the post here may help you more
Patch 1. Update checkHardwareSupport()
From 5129091430a7423e5c07a4f3c845033adb8ccefe Mon Sep 17 00:00:00 2001
From: sammy <[email protected]>
Date: Tue, 31 Jul 2012 09:59:36 +0300
Subject: [PATCH 1/2] Update checkHardwareSupport and HWFeatures to support
ARM NEON
---
modules/core/include/opencv2/core/core_c.h | 3 +++
modules/core/src/system.cpp | 17 ++++++++++++++++-
2 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/modules/core/include/opencv2/core/core_c.h b/modules/core/include/opencv2/core/core_c.h
index df763ab..bdfb879 100644
--- a/modules/core/include/opencv2/core/core_c.h
+++ b/modules/core/include/opencv2/core/core_c.h
@@ -1706,6 +1706,9 @@ CVAPI(double) cvGetTickFrequency( void );
#define CV_CPU_SSE4_2 7
#define CV_CPU_POPCNT 8
#define CV_CPU_AVX 10
+
+#define CV_CPU_ARM_NEON 100
+
#define CV_HARDWARE_MAX_FEATURE 255
CVAPI(int) cvCheckHardwareSupport(int feature);
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index b8a4661..eeb2a58 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -176,7 +176,22 @@ struct HWFeatures
f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
f.have[CV_CPU_AVX] = (cpuid_data[2] & (1<<28)) != 0;
}
-
+
+ // Android check
+ #if defined ANDROID
+ if (android_getCpuFamily() == ANDROID_CPU_FAMILY_ARM &&
+ (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) != 0)
+ {
+ f.have[CV_CPU_ARM_NEON] = 1;
+ }
+ #endif
+ // iOS check. Automatically activated by GCC/LLVM compiler option
+ #if define TARGET_OS_IPHONE
+ #if define __ARM_NEON__
+ f.have[CV_CPU_ARM_NEON] = 1;
+ #endif
+ #endif
+
return f;
}
--
1.7.11
Patch 2. threshold() optimizations
From 9f9e6e0a382b0ec2b5ddb3eedd27bcdf95af9763 Mon Sep 17 00:00:00 2001
From: sammy <[email protected]>
Date: Tue, 31 Jul 2012 10:02:35 +0300
Subject: [PATCH 2/2] NEON-accelerated threshold()
---
modules/imgproc/src/thresh.cpp | 140 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 140 insertions(+)
diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp
index 1fb4847..fb01852 100644
--- a/modules/imgproc/src/thresh.cpp
+++ b/modules/imgproc/src/thresh.cpp
@@ -42,6 +42,10 @@
#include "precomp.hpp"
+#if defined CV_USE_NEON
+#include <arm_neon.h>
+#endif
+
namespace cv
{
@@ -226,6 +230,142 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
}
#endif
+#if CV_USE_NEON
+ if( checkHardwareSupport(CV_CPU_ARM_NEON) )
+ {
+ uint8x16_t thresh_u = vdupq_n_u8(thresh);
+ uint8x16_t maxval_ = vdupq_n_u8(maxval);
+
+ j_scalar = roi.width & -8;
+
+ for( i = 0; i < roi.height; i++ )
+ {
+ const uchar* src = (const uchar*)(_src.data + _src.step*i);
+ uchar* dst = (uchar*)(_dst.data + _dst.step*i);
+
+ switch( type )
+ {
+ case THRESH_BINARY:
+ for( j = 0; j <= roi.width - 32; j += 32 )
+ {
+ uint8x16_t v0, v1;
+ v0 = vld1q_u8 ( src + j );
+ v1 = vld1q_u8 ( src + j + 16 );
+ v0 = vcgtq_u8 ( v0, thresh_u );
+ v1 = vcgtq_u8 ( v1, thresh_u );
+ v0 = vandq_u8 ( v0, maxval_ );
+ v1 = vandq_u8 ( v1, maxval_ );
+ vst1q_u8 ( dst + j, v0 );
+ vst1q_u8 ( dst + j + 16, v1 );
+ }
+
+
+ for( ; j <= roi.width - 8; j += 8 )
+ {
+ uint8x8_t v2;
+ v2 = vld1_u8( src + j );
+ v2 = vcgt_u8 ( v2, vget_low_s8 ( thresh_u ) );
+ v2 = vand_u8 ( v2, vget_low_s8 ( maxval_ ) );
+ vst1_u8 ( dst + j, v2 );
+ }
+ break;
+
+ case THRESH_BINARY_INV:
+ for( j = 0; j <= roi.width - 32; j += 32 )
+ {
+ uint8x16_t v0, v1;
+ v0 = vld1q_u8 ( src + j );
+ v1 = vld1q_u8 ( src + j + 16 );
+ v0 = vcleq_u8 ( v0, thresh_u );
+ v1 = vcleq_u8 ( v1, thresh_u );
+ v0 = vandq_u8 ( v0, maxval_ );
+ v1 = vandq_u8 ( v1, maxval_ );
+ vst1q_u8 ( dst + j, v0 );
+ vst1q_u8 ( dst + j + 16, v1 );
+ }
+
+
+ for( ; j <= roi.width - 8; j += 8 )
+ {
+ uint8x8_t v2;
+ v2 = vld1_u8( src + j );
+ v2 = vcle_u8 ( v2, vget_low_s8 ( thresh_u ) );
+ v2 = vand_u8 ( v2, vget_low_s8 ( maxval_ ) );
+ vst1_u8 ( dst + j, v2 );
+ }
+ break;
+
+ case THRESH_TRUNC:
+ for( j = 0; j <= roi.width - 32; j += 32 )
+ {
+ uint8x16_t v0, v1;
+ v0 = vld1q_u8 ( src + j );
+ v1 = vld1q_u8 ( src + j + 16 );
+ v0 = vminq_u8 ( v0, thresh_u );
+ v1 = vminq_u8 ( v1, thresh_u );
+ vst1q_u8 ( dst + j, v0 );
+ vst1q_u8 ( dst + j + 16, v1 );
+ }
+
+
+ for( ; j <= roi.width - 8; j += 8 )
+ {
+ uint8x8_t v2;
+ v2 = vld1_u8( src + j );
+ v2 = vmin_u8 ( v2, vget_low_s8 ( thresh_u ) );
+ vst1_u8 ( dst + j, v2 );
+ }
+ break;
+
+ case THRESH_TOZERO:
+ for( j = 0; j <= roi.width - 32; j += 32 )
+ {
+ uint8x16_t v0, v1;
+ v0 = vld1q_u8 ( src + j );
+ v1 = vld1q_u8 ( src + j + 16 );
+ v0 = vandq_u8 ( vcgtq_u8 ( v0, thresh_u ), vmaxq_u8 ( v0, thresh_u ) );
+ v1 = vandq_u8 ( vcgtq_u8 ( v1, thresh_u ), vmaxq_u8 ( v1, thresh_u ) );
+ vst1q_u8 ( dst + j, v0 );
+ vst1q_u8 ( dst + j + 16, v1 );
+ }
+
+
+ for( ; j <= roi.width - 8; j += 8 )
+ {
+ uint8x8_t v2;
+ v2 = vld1_u8 ( src + j );
+ v2 = vand_u8 ( vcgt_u8 ( v2, vget_low_s8(thresh_u) ), vmax_u8 ( v2, vget_low_s8(thresh_u) ) );
+ vst1_u8 ( dst + j, v2 );
+ }
+ break;
+
+ case THRESH_TOZERO_INV:
+ for( j = 0; j <= roi.width - 32; j += 32 )
+ {
+ uint8x16_t v0, v1;
+ v0 = vld1q_u8 ( src + j );
+ v1 = vld1q_u8 ( src + j + 16 );
+ v0 = vandq_u8 ( vcleq_u8 ( v0, thresh_u ), vminq_u8 ( v0, thresh_u ) );
+ v1 = vandq_u8 ( vcleq_u8 ( v1, thresh_u ), vminq_u8 ( v1, thresh_u ) );
+ vst1q_u8 ( dst + j, v0 );
+ vst1q_u8 ( dst + j + 16, v1 );
+ }
+
+
+ for( ; j <= roi.width - 8; j += 8 )
+ {
+ uint8x8_t v2;
+ v2 = vld1_u8 ( src + j );
+ v2 = vand_u8 ( vcle_u8 ( v2, vget_low_s8(thresh_u) ), vmin_u8 ( v2, vget_low_s8(thresh_u) ) );
+ vst1_u8 ( dst + j, v2 );
+ }
+ break;
+ }
+ }
+ }
+#endif
+
+
if( j_scalar < roi.width )
{
for( i = 0; i < roi.height; i++ )
--
1.7.11