在移动设备上测试 NEON 优化的 cv::threshold()

针对 ARM 设备(手机),我一直在为 OpenCV 的阈值函数 cv::threshold() 编写一些优化。它应该可以在 Android 和 iPhone 上运行。

但是,我手头没有可以用来测试的设备,所以正在寻找志愿者帮我一点忙。如果这能让你更有动力的话:我打算把它提交给 OpenCV,以便集成到主代码库中。

我首先关心的是代码的正确性;如果它确实按预期工作,还希望得到一些原始版本与优化版本的性能对比数据。请不要忘记检查所有情况(代码中的每一种阈值类型)。

下面是代码。要运行它,请将其粘贴到 opencv/modules/imgproc/src/thresh.cpp 的第 228 行(以 2.4.2 版本为准),也就是紧接在 SSE 代码块的下面,然后重新编译 OpenCV。

另外,请在文件顶部添加这一行:

 #include <arm_neon.h> 

主代码体:

 // NEON fast path for 8-bit thresholding.
 //
 // For every row, columns [0, roi.width & -8) are processed with NEON:
 // first in 32-byte steps (two 128-bit registers), then in 8-byte steps
 // (one 64-bit register). j_scalar is set to the first column this path does
 // not touch.
 // NOTE(review): presumably the scalar code that follows this block resumes
 // at j_scalar - confirm against the enclosing function.
 #define CV_USE_NEON 1

 #if CV_USE_NEON
 // TODO: replace the unconditional branch with a runtime capability check,
 // e.g. if( checkHardwareSupport(CV_CPU_ARM_NEON) )
 if( true )
 {
     // Threshold and max value broadcast to all 16 lanes.
     const uint8x16_t thresh_u = vdupq_n_u8(thresh);
     const uint8x16_t maxval_ = vdupq_n_u8(maxval);

     // 8-lane copies for the 64-bit tail loops. These must be taken with the
     // unsigned accessor: vget_low_s8 expects/returns signed vectors and does
     // not type-check against the *_u8 intrinsics used below.
     const uint8x8_t thresh_lo = vget_low_u8(thresh_u);
     const uint8x8_t maxval_lo = vget_low_u8(maxval_);

     // Width rounded down to a multiple of 8.
     j_scalar = roi.width & -8;

     for( i = 0; i < roi.height; i++ )
     {
         const uchar* src = (const uchar*)(_src.data + _src.step*i);
         uchar* dst = (uchar*)(_dst.data + _dst.step*i);

         switch( type )
         {
         case THRESH_BINARY:
             // dst = src > thresh ? maxval : 0
             for( j = 0; j <= roi.width - 32; j += 32 )
             {
                 uint8x16_t v0 = vld1q_u8( src + j );
                 uint8x16_t v1 = vld1q_u8( src + j + 16 );
                 // The comparison yields 0xFF / 0x00 per lane; AND selects maxval.
                 v0 = vandq_u8( vcgtq_u8( v0, thresh_u ), maxval_ );
                 v1 = vandq_u8( vcgtq_u8( v1, thresh_u ), maxval_ );
                 vst1q_u8( dst + j, v0 );
                 vst1q_u8( dst + j + 16, v1 );
             }
             for( ; j <= roi.width - 8; j += 8 )
             {
                 uint8x8_t v2 = vld1_u8( src + j );
                 v2 = vand_u8( vcgt_u8( v2, thresh_lo ), maxval_lo );
                 vst1_u8( dst + j, v2 );
             }
             break;

         case THRESH_BINARY_INV:
             // dst = src <= thresh ? maxval : 0
             for( j = 0; j <= roi.width - 32; j += 32 )
             {
                 uint8x16_t v0 = vld1q_u8( src + j );
                 uint8x16_t v1 = vld1q_u8( src + j + 16 );
                 v0 = vandq_u8( vcleq_u8( v0, thresh_u ), maxval_ );
                 v1 = vandq_u8( vcleq_u8( v1, thresh_u ), maxval_ );
                 vst1q_u8( dst + j, v0 );
                 vst1q_u8( dst + j + 16, v1 );
             }
             for( ; j <= roi.width - 8; j += 8 )
             {
                 uint8x8_t v2 = vld1_u8( src + j );
                 v2 = vand_u8( vcle_u8( v2, thresh_lo ), maxval_lo );
                 vst1_u8( dst + j, v2 );
             }
             break;

         case THRESH_TRUNC:
             // dst = min(src, thresh)
             for( j = 0; j <= roi.width - 32; j += 32 )
             {
                 uint8x16_t v0 = vld1q_u8( src + j );
                 uint8x16_t v1 = vld1q_u8( src + j + 16 );
                 v0 = vminq_u8( v0, thresh_u );
                 v1 = vminq_u8( v1, thresh_u );
                 vst1q_u8( dst + j, v0 );
                 vst1q_u8( dst + j + 16, v1 );
             }
             for( ; j <= roi.width - 8; j += 8 )
             {
                 uint8x8_t v2 = vld1_u8( src + j );
                 v2 = vmin_u8( v2, thresh_lo );
                 vst1_u8( dst + j, v2 );
             }
             break;

         case THRESH_TOZERO:
             // dst = src > thresh ? src : 0
             // Masking src directly is enough: where the mask is set,
             // max(src, thresh) == src, so the extra vmax is not needed.
             for( j = 0; j <= roi.width - 32; j += 32 )
             {
                 uint8x16_t v0 = vld1q_u8( src + j );
                 uint8x16_t v1 = vld1q_u8( src + j + 16 );
                 v0 = vandq_u8( v0, vcgtq_u8( v0, thresh_u ) );
                 v1 = vandq_u8( v1, vcgtq_u8( v1, thresh_u ) );
                 vst1q_u8( dst + j, v0 );
                 vst1q_u8( dst + j + 16, v1 );
             }
             for( ; j <= roi.width - 8; j += 8 )
             {
                 uint8x8_t v2 = vld1_u8( src + j );
                 v2 = vand_u8( v2, vcgt_u8( v2, thresh_lo ) );
                 vst1_u8( dst + j, v2 );
             }
             break;

         case THRESH_TOZERO_INV:
             // dst = src <= thresh ? src : 0
             // Where the mask is set, min(src, thresh) == src, so src is
             // masked directly.
             for( j = 0; j <= roi.width - 32; j += 32 )
             {
                 uint8x16_t v0 = vld1q_u8( src + j );
                 uint8x16_t v1 = vld1q_u8( src + j + 16 );
                 v0 = vandq_u8( v0, vcleq_u8( v0, thresh_u ) );
                 v1 = vandq_u8( v1, vcleq_u8( v1, thresh_u ) );
                 vst1q_u8( dst + j, v0 );
                 vst1q_u8( dst + j + 16, v1 );
             }
             for( ; j <= roi.width - 8; j += 8 )
             {
                 uint8x8_t v2 = vld1_u8( src + j );
                 v2 = vand_u8( v2, vcle_u8( v2, thresh_lo ) );
                 vst1_u8( dst + j, v2 );
             }
             break;

         default:
             // NOTE(review): any other type writes nothing here even though
             // j_scalar has already been advanced - confirm that `type` is
             // validated before this block is reached.
             break;
         }
     }
 }
 #endif