///
// ARM NEON
// FIR filter: y(n) = h(0)x(n) + h(1)x(n-1) + h(2)x(n-2) + ... + h(N-1)x(n-(N-1))
void TaskManger::fir_filter_c(short * y, const short *x, const short *h, int n_out, int n_coefs)
{int n;for (n = 0; n < n_out; n++){int k, sum = 0;for(k = 0; k < n_coefs; k++){sum += h[k] * x[n - n_coefs + 1 + k];}y[n] = ((sum>>15) + 1) >> 1;}
}void fir_filter_neon(short * y, const short *x, const short *h, int n_out, int n_coefs)
{int n, k;int sum;int16x4_t h_vec;int16x4_t x_vec;int32x4_t result_vec;for (n = 0; n < n_out; n++){/* Clear the scalar and vector sums */sum = 0;result_vec = vdupq_n_s32(0); /* vdup -> duplicates a scalar into every element of the destination vector */for(k = 0; k < n_coefs / 4; k++){/* Four vector multiply-accumulate operations in parallel */h_vec = vld1_s16(&h[k*4]);x_vec = vld1_s16(&x[n - n_coefs + 1 + k*4]);result_vec = vmlal_s16(result_vec, h_vec, x_vec);}/* Reduction operation - add each vector lane result to the sum */sum += vgetq_lane_s32(result_vec, 0);sum += vgetq_lane_s32(result_vec, 1);sum += vgetq_lane_s32(result_vec, 2);sum += vgetq_lane_s32(result_vec, 3);/* consume the last few data using scalar operations */if(n_coefs % 4){for(k = n_coefs - (n_coefs % 4); k < n_coefs; k++)sum += h[k] * x[n - n_coefs + 1 + k];}/* Store the adjusted result */y[n] = ((sum>>15) + 1) >> 1;}
}/** RGB转灰度图像*/void reference_convert (uint8_t * __restrict dest, uint8_t * __restrict src, int n)
{int i;for (i=0; i<n; i++){int r = *src++; // load redint g = *src++; // load greenint b = *src++; // load blue// build weighted average:int y = (r*77)+(g*151)+(b*28);// undo the scale by 256 and write to memory:*dest++ = (y>>8);}
}void neon_convert (uint8_t * __restrict dest, uint8_t * __restrict src, int n)
{int i;uint8x8_t rfac = vdup_n_u8 (77);uint8x8_t gfac = vdup_n_u8 (151);uint8x8_t bfac = vdup_n_u8 (28);n /= 8;for (i=0; i<n; i++){uint16x8_t temp;uint8x8x3_t rgb = vld3_u8 (src); // vld3 ->loads 3 vectors from memory. 就是 uint8x8_t val[3]uint8x8_t result;temp = vmull_u8 (rgb.val[0], rfac); // 长指令相乘temp = vmlal_u8 (temp,rgb.val[1], gfac);temp = vmlal_u8 (temp,rgb.val[2], bfac);result = vshrn_n_u16 (temp, 8); // vshrn -> ri = ai >> b; 移位指针vst1_u8 (dest, result); // vst1 -> stores a vector into memorysrc += 8*3;dest += 8;}
}/** Vector add */uint32_t TaskManger::vector_add_c(uint32_t *s, uint32_t n)
{// assert n % 4 == 0uint32_t sum = 0;for(uint32_t i = 0; i < n; i++)sum += *s++;return sum;
}uint32_t TaskManger::vector_add_neon(uint32_t *s, uint32_t n)
{// assert n % 4 == 0uint32_t sum, *i;uint32x2_t vec64a, vec64b;uint32x4_t vec128 = vdupq_n_u32(0);for (i = s; i < (s + n); i += 4){uint32x4_t temp128 = vld1q_u32(i); // 从内存加载一个向量vector, vld1 -> loads a vector from memoryvec128 = vaddq_u32(vec128, temp128); // 向量相加uint32x4_t相加,一次加4个uint32_t, vector add, vadd -> ri = ai + bi}vec64a = vget_low_u32(vec128); // returns the lower half of the 128-bit input vector.vec64b = vget_high_u32(vec128); // returns the higher half of the 128-bit input vector.vec64a = vadd_u32 (vec64a, vec64b); // 再将 uint32x2_t 向量相加sum = vget_lane_u32(vec64a, 0); // returns the value from the specified lane of a vector.sum += vget_lane_u32(vec64a, 1);return sum;
}void TaskManger::neon_demo()
{//1, demo 1
#define ELE 65536uint32_t arry[ELE];struct timeval tpstart,tpend;float timeuse;printf("run neon demo 1, vector add\n");srand((unsigned int) time(NULL));for(int i = 0; i < ELE; i++){arry[i] = rand() % 500;}// c/c++gettimeofday(&tpstart,NULL);uint32_t result2 = vector_add_c(arry, ARRAY_SIZE(arry));gettimeofday(&tpend,NULL);timeuse = 1000000*(tpend.tv_sec-tpstart.tv_sec)+ tpend.tv_usec-tpstart.tv_usec;printf("generator loop c add result:%d, used time:%f ms\n\n\n", result2, timeuse);// neongettimeofday(&tpstart,NULL);uint32_t result1 = vector_add_neon(arry, ARRAY_SIZE(arry));gettimeofday(&tpend,NULL);timeuse = 1000000*(tpend.tv_sec-tpstart.tv_sec)+ tpend.tv_usec-tpstart.tv_usec;printf("neon vector add result:%d, used time:%f ms\n\n\n", result1, timeuse);
}