DeltaCV之CPU算法优化[YCrCb空间阈值分割]

这篇文章我们先来认识一下DeltaCV中CPU上做的相关优化工作,使用SSE指令集优化RGB转YCrCb颜色空间过程,并进行阈值分割。

OpenCV中的函数

在OpenCV中完成RGB图像在YCrCb空间的阈值分割,需要用到两个函数,cv::cvtColor(颜色空间转换)及cv::inRange(阈值处理).

函数原型

1
2
3
4

void cvtColor( InputArray src, OutputArray dst, int code, int dstCn = 0 );

void inRange(InputArray src, InputArray lowerb,InputArray upperb, OutputArray dst);

DeltaCV中的优化

函数原型

1
2
3
4
5
6
7
8
9
10
    /**  @brief: more details - > 
     *
     * src: input(BGR image)
     * dst: output
     * width:
     * height:
     * lower: lower boundary
     * upper: upper boundary
     */
    bool ycrcbWithSeg(unsigned char *src, unsigned char *dst,const int width,const int height,scalar lower,scalar upper);

加载变量

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
assert(lower.channels()==3 && upper.channels()==3);
int blockSize = 48; //16*3
int block = (height * width * 3) / blockSize;

// 加载阈值
__m128i ch0_min_sse = _mm_setr_epi8(lower[0],lower[0],lower[0],lower[0],lower[0],lower[0],lower[0],lower[0],lower[0],lower[0],lower[0],lower[0],lower[0],lower[0],lower[0],lower[0]);
__m128i ch1_min_sse = _mm_setr_epi8(lower[1],lower[1],lower[1],lower[1],lower[1],lower[1],lower[1],lower[1],lower[1],lower[1],lower[1],lower[1],lower[1],lower[1],lower[1],lower[1]);
__m128i ch2_min_sse = _mm_setr_epi8(lower[2],lower[2],lower[2],lower[2],lower[2],lower[2],lower[2],lower[2],lower[2],lower[2],lower[2],lower[2],lower[2],lower[2],lower[2],lower[2]);

__m128i ch0_max_sse = _mm_setr_epi8(upper[0],upper[0],upper[0],upper[0],upper[0],upper[0],upper[0],upper[0],upper[0],upper[0],upper[0],upper[0],upper[0],upper[0],upper[0],upper[0]);
__m128i ch1_max_sse = _mm_setr_epi8(upper[1],upper[1],upper[1],upper[1],upper[1],upper[1],upper[1],upper[1],upper[1],upper[1],upper[1],upper[1],upper[1],upper[1],upper[1],upper[1]);
__m128i ch2_max_sse = _mm_setr_epi8(upper[2],upper[2],upper[2],upper[2],upper[2],upper[2],upper[2],upper[2],upper[2],upper[2],upper[2],upper[2],upper[2],upper[2],upper[2],upper[2]);

const int Shift = 15;
const int HalfV = 1<<(Shift-1);

//这里不能用网上给的转换公式,因为bgr转ycrcb有很多版本,我是直接看opencv源码算出的系数,
// static const float coeffs_crb[] = { R2YF, G2YF, B2YF, YCRF, YCBF }; in /opencv-4.0.1/modules/imgproc/src/color_yuv.cpp
//为什么要和opencv一样的系数?从结果来看,这套系数得到的cr cb通道对比度更大,也就是说红色和蓝色与其他背景区分度更高.
const int Y_B_WT = 0.114f * (1 << Shift), Y_G_WT = 0.587f * (1 << Shift), Y_R_WT = 0.299f * (1 << Shift), Y_C_WT = 1;
const int Cr_B_WT = 0.499f * (1 << Shift), Cr_G_WT = -0.419f * (1 << Shift), Cr_R_WT = -0.081f * (1 << Shift), Cr_C_WT = 257;
const int Cb_B_WT = 0.395f * (1 << Shift), Cb_G_WT = -0.331f * (1 << Shift), Cb_R_WT = -0.064f * (1 << Shift), Cb_C_WT = 257;

// load 上面的权值
__m128i Weight_YBG = _mm_setr_epi16(Y_B_WT, Y_G_WT, Y_B_WT, Y_G_WT, Y_B_WT, Y_G_WT, Y_B_WT, Y_G_WT);
__m128i Weight_YRC = _mm_setr_epi16(Y_R_WT, Y_C_WT, Y_R_WT, Y_C_WT, Y_R_WT, Y_C_WT, Y_R_WT, Y_C_WT);
__m128i Weight_UBG = _mm_setr_epi16(Cr_B_WT, Cr_G_WT, Cr_B_WT, Cr_G_WT, Cr_B_WT, Cr_G_WT, Cr_B_WT, Cr_G_WT);
__m128i Weight_URC = _mm_setr_epi16(Cr_R_WT, Cr_C_WT, Cr_R_WT, Cr_C_WT, Cr_R_WT, Cr_C_WT, Cr_R_WT, Cr_C_WT);
__m128i Weight_VBG = _mm_setr_epi16(Cb_B_WT, Cb_G_WT, Cb_B_WT, Cb_G_WT, Cb_B_WT, Cb_G_WT, Cb_B_WT, Cb_G_WT);
__m128i Weight_VRC = _mm_setr_epi16(Cb_R_WT, Cb_C_WT, Cb_R_WT, Cb_C_WT, Cb_R_WT, Cb_C_WT, Cb_R_WT, Cb_C_WT);
__m128i Half = _mm_setr_epi16(0, HalfV, 0, HalfV, 0, HalfV, 0, HalfV);
  • 刚开始我们进行了阈值的通道检查(因为这个函数输入的数据必须是RGB图像).
  • 紧接着,再将高低两个阈值加载进SSE变量中.
  • 下面是重点,RGB转ycrcb的公式里面是每个系数都是0-1之间的浮点型,而我们前面取到的像素是以8位无符号整形保存在SSE变量中,所以我们需要将系数左移15位,既保证了精度,又满足了计算要求(没有超过16位,正好两个8位).

计算主体

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
for (int i = 0; i < block; ++i, src += blockSize, dst += 16)
{
	__m128i src1, src2, src3;

    src1 = _mm_loadu_si128((__m128i *) (src + 0)); //一次性读取个字节
    src2 = _mm_loadu_si128((__m128i *) (src + 16));
    src3 = _mm_loadu_si128((__m128i *) (src + 32));

// 在_mm_shuffle_epi8中构造出16位的数据
	__m128i BGLL = _mm_shuffle_epi8(src1,_mm_setr_epi8(0,-1,1,-1,3,-1,4,-1,6,-1,7,-1,9,-1,10,-1));
	__m128i BGLH = _mm_shuffle_epi8(src1,_mm_setr_epi8(12,-1,13,-1,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1));
	BGLH = _mm_or_si128(BGLH,_mm_shuffle_epi8(src2,_mm_setr_epi8(-1,-1,-1,-1,-1,-1,0,-1,2,-1,3,-1,5,-1,6,-1)));

	__m128i BGHL = _mm_shuffle_epi8(src2, _mm_setr_epi8(8, -1, 9, -1, 11, -1, 12, -1, 14, -1, 15, -1, -1, -1, -1, -1));
	BGHL = _mm_or_si128(BGHL,_mm_shuffle_epi8(src3,_mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 2, -1)));
	__m128i BGHH = _mm_shuffle_epi8(src3, _mm_setr_epi8(4, -1, 5, -1, 7, -1, 8, -1, 10, -1, 11, -1, 13, -1, 14, -1));

	__m128i RCLL = _mm_shuffle_epi8(src1,_mm_setr_epi8(2,-1,-1,-1,5,-1,-1,-1,8,-1,-1,-1,11,-1,-1,-1));
	RCLL = _mm_or_si128(RCLL,Half);
	__m128i RCLH = _mm_shuffle_epi8(src1,_mm_setr_epi8(14,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1));
	RCLH = _mm_or_si128(_mm_or_si128(RCLH,_mm_shuffle_epi8(src2,_mm_setr_epi8(-1,-1,-1,-1,1,-1,-1,-1,4,-1,-1,-1,7,-1,-1,-1))),Half);

	__m128i RCHL = _mm_shuffle_epi8(src2,_mm_setr_epi8(10,-1,-1,-1,13,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1));
	RCHL = _mm_or_si128(_mm_or_si128(RCHL,_mm_shuffle_epi8(src3,_mm_setr_epi8(-1,-1,-1,-1,-1,-1,-1,-1,0,-1,-1,-1,3,-1,-1,-1))),Half);
	__m128i RCHH = _mm_shuffle_epi8(src3,_mm_setr_epi8(6,-1,-1,-1,9,-1,-1,-1,12,-1,-1,-1,15,-1,-1,-1));
	RCHH = _mm_or_si128(RCHH,Half);

// 下面是进行计算
	__m128i Result;

	__m128i Y_LL = _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(BGLL, Weight_YBG), _mm_madd_epi16(RCLL, Weight_YRC)), Shift);
	__m128i Y_LH = _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(BGLH, Weight_YBG), _mm_madd_epi16(RCLH, Weight_YRC)), Shift);
	__m128i Y_HL = _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(BGHL, Weight_YBG), _mm_madd_epi16(RCHL, Weight_YRC)), Shift);
	__m128i Y_HH = _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(BGHH, Weight_YBG), _mm_madd_epi16(RCHH, Weight_YRC)), Shift);

	__m128i Ch0 = _mm_packus_epi16(_mm_packus_epi32(Y_LL, Y_LH), _mm_packus_epi32(Y_HL, Y_HH));
	Result = _mm_cmpge_up_epu8(Ch0, ch0_min_sse);
	Result = _mm_and_si128(Result, _mm_cmpge_down_epu8(Ch0, ch0_max_sse));

	__m128i U_LL = _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(BGLL, Weight_UBG), _mm_madd_epi16(RCLL, Weight_URC)), Shift);
	__m128i U_LH = _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(BGLH, Weight_UBG), _mm_madd_epi16(RCLH, Weight_URC)), Shift);
	__m128i U_HL = _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(BGHL, Weight_UBG), _mm_madd_epi16(RCHL, Weight_URC)), Shift);
	__m128i U_HH = _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(BGHH, Weight_UBG), _mm_madd_epi16(RCHH, Weight_URC)), Shift);

	__m128i Ch1 = _mm_packus_epi16(_mm_packus_epi32(U_LL, U_LH), _mm_packus_epi32(U_HL, U_HH));
	Result = _mm_and_si128(Result, _mm_cmpge_up_epu8(Ch1, ch1_min_sse));
	Result = _mm_and_si128(Result, _mm_cmpge_down_epu8(Ch1, ch1_max_sse));

	__m128i V_LL = _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(BGLL, Weight_VBG), _mm_madd_epi16(RCLL, Weight_VRC)), Shift);
	__m128i V_LH = _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(BGLH, Weight_VBG), _mm_madd_epi16(RCLH, Weight_VRC)), Shift);
	__m128i V_HL = _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(BGHL, Weight_VBG), _mm_madd_epi16(RCHL, Weight_VRC)), Shift);
	__m128i V_HH = _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(BGHH, Weight_VBG), _mm_madd_epi16(RCHH, Weight_VRC)), Shift);

	__m128i Ch2 = _mm_packus_epi16(_mm_packus_epi32(V_LL, V_LH), _mm_packus_epi32(V_HL, V_HH));
	Result = _mm_and_si128(Result, _mm_cmpge_up_epu8(Ch2, ch2_min_sse));
	Result = _mm_and_si128(Result, _mm_cmpge_down_epu8(Ch2, ch2_max_sse));

	_mm_storeu_si128((__m128i *)dst, Result);
}

//剩余不足一个block的单独处理
for (int j = blockSize * block; j < height * width; ++j, src += 3, dst++) {
	uint8_t _ch0 = src[0], _ch1 = src[1], _ch2 = src[2];

	uint8_t YUV_Y = (Y_B_WT * _ch0 + Y_G_WT * _ch1 + Y_R_WT * _ch2 + Y_C_WT * HalfV) >> Shift;
	uint8_t YUV_U = (Cr_B_WT * _ch0 + Cr_G_WT * _ch1 + Cr_R_WT * _ch2 + Cr_C_WT * HalfV) >> Shift;
	uint8_t YUV_V = (Cb_B_WT * _ch0 + Cb_G_WT * _ch1 + Cb_R_WT * _ch2 + Cb_C_WT * HalfV) >> Shift;
	if ((YUV_Y >= lower[0] && YUV_Y <= upper[0] && YUV_U >= lower[1] && YUV_U <= upper[1] && YUV_V >= lower[2] &&
YUV_V <= upper[2])) {
	dst[0] = 255;
	} else {
		dst[0] = 0;
	}
}
  • 这里我们同样先从输入图像中加载待处理的数据.
  • 接着我们将B G R B G R B G R这样的数据拆分成B 0 G 0 B 0 G 0 和R 0 half 0 R 0 half 0这样的形式,为了后面使用_mm_madd_epi16将像素与各自的权重相乘再相加.
  • Y_C_WT, Cr_C_WT, Cb_C_WT这三个变量是干什么? 我们来看一下rgb转ycrcb的公式:[参考Imageshop])(https://www.cnblogs.com/Imageshop/p/8405517.html)
    1
    2
    3
    Y = (Y_B_WT * Blue + Y_G_WT * Green + Y_R_WT * Red + HalfV) >> Shift;
    Cr = ((Cr_B_WT * Blue + Cr_G_WT * Green + Cr_R_WT * Red + HalfV) >> Shift) + 128;
    Cb = ((Cb_B_WT * Blue + Cb_G_WT * Green + Cb_R_WT * Red + HalfV) >> Shift) + 128;
    

    经过变形成为:

    1
    2
    3
    4
    Y = (Y_B_WT * Blue + Y_G_WT * Green + Y_R_WT * Red + 1 * HalfV) >> Shift;
    Cr = (U_B_WT * Blue + U_G_WT * Green + U_R_WT * Red + 257 * HalfV) >> Shift;
    Cb = (V_B_WT * Blue + V_G_WT * Green + V_R_WT * Red + 257 * HalfV) >> Shift;
    这就是Y_C_WT=1,Cr_C_WT=257,Cb_C_WT=257的由来
    
  • 虽然上面代码量看着较多,但是都是基本运算,而且比OpenCV的函数快了7-8倍.

性能对比

处理1000次取平均值

Image Size: 1024 x 1280(H x W)

opencv函数耗时: 6.68 - 6.75 ms

DeltaCV函数耗时: 0.88 - 0.90 ms

加速比: 7.4 - 7.6