opencv 积分图源码(自己看,学习)

#if CV_SSE2

template <>
struct Integral_SIMD<uchar, int, double>
{
    Integral_SIMD()
    {
        haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
    }

    bool operator()(const uchar * src, size_t _srcstep,
                    int * sum, size_t _sumstep,
                    double * sqsum, size_t,
                    int * tilted, size_t,
                    int width, int height, int cn) const
    {
        if (sqsum || tilted || cn != 1 || !haveSSE2)
            return false;

        // the first iteration
        memset(sum, 0, (width + 1) * sizeof(int));

        __m128i v_zero = _mm_setzero_si128(), prev = v_zero;
        int j = 0;

        // the others
        for (int i = 0; i < height; ++i)
        {
            const uchar * src_row = src + _srcstep * i;
            int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + 1;
            int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + 1;

            sum_row[-1] = 0;

            prev = v_zero;
            j = 0;

            for ( ; j + 7 < width; j += 8)
            {
                __m128i vsuml = _mm_loadu_si128((const __m128i *)(prev_sum_row + j));
                __m128i vsumh = _mm_loadu_si128((const __m128i *)(prev_sum_row + j + 4));

                __m128i el8shr0 = _mm_loadl_epi64((const __m128i *)(src_row + j));
                __m128i el8shr1 = _mm_slli_si128(el8shr0, 1);
                __m128i el8shr2 = _mm_slli_si128(el8shr0, 2);
                __m128i el8shr3 = _mm_slli_si128(el8shr0, 3);

                vsuml = _mm_add_epi32(vsuml, prev);
                vsumh = _mm_add_epi32(vsumh, prev);

                __m128i el8shr12 = _mm_add_epi16(_mm_unpacklo_epi8(el8shr1, v_zero),
                                                 _mm_unpacklo_epi8(el8shr2, v_zero));
                __m128i el8shr03 = _mm_add_epi16(_mm_unpacklo_epi8(el8shr0, v_zero),
                                                 _mm_unpacklo_epi8(el8shr3, v_zero));
                __m128i el8 = _mm_add_epi16(el8shr12, el8shr03);

                __m128i el4h = _mm_add_epi16(_mm_unpackhi_epi16(el8, v_zero),
                                             _mm_unpacklo_epi16(el8, v_zero));

                vsuml = _mm_add_epi32(vsuml, _mm_unpacklo_epi16(el8, v_zero));
                vsumh = _mm_add_epi32(vsumh, el4h);

                _mm_storeu_si128((__m128i *)(sum_row + j), vsuml);
                _mm_storeu_si128((__m128i *)(sum_row + j + 4), vsumh);

                prev = _mm_add_epi32(prev, _mm_shuffle_epi32(el4h, _MM_SHUFFLE(3, 3, 3, 3)));
            }

            for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
                sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
        }

        return true;
    }

    bool haveSSE2;
};

#endif

猜你喜欢

转载自blog.csdn.net/myzhouwang/article/details/84641833
今日推荐