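/*
 * memcpy made H.264 decoding on the TDA4VM use too much CPU, so it was
 * dropped in favor of alternatives. I gathered various optimized memcpy
 * implementations from the web and benchmarked their run time side by side;
 * the code and the measured results follow.
 */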
#include <stdio.h>
#include <stdlib.h> /* rand, srand */
#include <string.h>
#include <assert.h>
#include <sys/time.h>
#include <time.h> /* time() */

/* Nonzero if either X or Y is not aligned on a "long" boundary. */
#define UNALIGNED(X, Y) \
    (((size_t)(X) & (sizeof(long) - 1)) | ((size_t)(Y) & (sizeof(long) - 1)))

/* How many bytes are copied each iteration of the 4X unrolled loop. */
#define BIGBLOCKSIZE (sizeof(long) << 2)

/* How many bytes are copied each iteration of the word copy loop. */
#define LITTLEBLOCKSIZE (sizeof(long))

/* Threshold for punting to the byte copier. */
#define TOO_SMALL(LEN) ((LEN) < BIGBLOCKSIZE)

/**
 * Copy len0 bytes from src0 to dest0 using word-sized moves when possible.
 *
 * When the length is at least BIGBLOCKSIZE and both pointers are aligned on
 * a "long" boundary, the bulk is copied four longs per iteration, then one
 * long per iteration, and any remainder byte by byte. Otherwise the whole
 * range is copied byte by byte.
 *
 * The copy always runs forward from the lowest address, so it is only
 * suitable for regions that do not overlap in a way a forward copy would
 * corrupt.
 *
 * NOTE(review): the word loops access the buffers through "long" pointers;
 * callers passing objects with a different declared type may conflict with
 * strict-aliasing rules -- confirm for the compiler flags in use.
 *
 * @param dest0 destination buffer; must be non-NULL when len0 > 0
 * @param src0  source buffer; must be non-NULL when len0 > 0
 * @param len0  number of bytes to copy; 0 is a no-op
 * @return dest0
 */
char *memcopy_super(char *dest0, const char *src0, size_t len0)
{
    char *dest = dest0;
    const char *src = src0;

    /* A zero-length copy is legal for memcpy, so accept it here as well. */
    if (len0 == 0) {
        return dest0;
    }
    assert(dest0 != NULL && src0 != NULL);

    /* Fast path: large enough and both pointers aligned on a long. */
    if (!TOO_SMALL(len0) && !UNALIGNED(src, dest)) {
        long *aligned_dest = (long *)dest;
        const long *aligned_src = (const long *)src;

        /* Copy 4X long words at a time if possible. */
        while (len0 >= BIGBLOCKSIZE) {
            *aligned_dest++ = *aligned_src++;
            *aligned_dest++ = *aligned_src++;
            *aligned_dest++ = *aligned_src++;
            *aligned_dest++ = *aligned_src++;
            len0 -= BIGBLOCKSIZE;
        }

        /* Copy one long word at a time if possible. */
        while (len0 >= LITTLEBLOCKSIZE) {
            *aligned_dest++ = *aligned_src++;
            len0 -= LITTLEBLOCKSIZE;
        }

        /* Pick up any residual with the byte copier below. */
        dest = (char *)aligned_dest;
        src = (const char *)aligned_src;
    }

    while (len0--) {
        *dest++ = *src++;
    }

    return dest0;
}

/*
 * Benchmark driver. Define MEMCOPY_BENCH_NO_MAIN to compile only
 * memcopy_super (e.g. to link it into a test harness).
 */
#ifndef MEMCOPY_BENCH_NO_MAIN

/* Number of elements in a true array (not a pointer). */
#define ARRAY_SIZE(n) (sizeof(n) / sizeof((n)[0]))

/**
 * Fill data[0..len-1] with pseudo-random byte values.
 *
 * The generator is re-seeded from time(NULL) on every call, so two calls
 * within the same second produce the same sequence.
 */
static void get_rand_bytes(unsigned char *data, int len)
{
    srand((unsigned)time(NULL)); /* seed the generator */

    for (int i = 0; i < len; i++) {
        /* % 256 so that every byte value 0..255 can occur. */
        data[i] = (unsigned char)(rand() % 256);
    }
}

/**
 * Return the current system time from gettimeofday() in microseconds.
 *
 * The result is a long long: seconds * 1000000 does not fit in a 32-bit int.
 */
static long long get_cur_time_us(void)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);
    return (long long)tv.tv_sec * 1000000LL + (long long)tv.tv_usec;
}

/**
 * Time memcpy against memcopy_super for a list of buffer sizes and print the
 * elapsed microseconds for each.
 *
 * @return 0 on success, 1 if the buffers cannot be allocated
 */
int main(void)
{
    /* The first entry must be the largest: it sizes both buffers. */
    int size_list[] = {
        1024 * 1024 * 10, /* 10 MB */
        1024 * 1024 * 1,  /* 1 MB */
        1024 * 100,       /* 100 KB */
        1024 * 10,        /* 10 KB */
        1024 * 1,         /* 1 KB */
    };
    char *data1 = malloc((size_t)size_list[0]);
    char *data2 = malloc((size_t)size_list[0]);

    if (data1 == NULL || data2 == NULL) {
        fprintf(stderr, "failed to allocate %d-byte buffers\n", size_list[0]);
        free(data1);
        free(data2);
        return 1;
    }

    get_rand_bytes((unsigned char *)data1, size_list[0]);

    /*
     * Write the destination once before timing so that any first-write cost
     * of fresh memory is not charged to whichever copy runs first.
     */
    memset(data2, 0, (size_t)size_list[0]);

    for (size_t i = 0; i < ARRAY_SIZE(size_list); i++) {
        size_t size = (size_t)size_list[i];
        long long t1;
        long long t2;

        t1 = get_cur_time_us();
        memcpy(data2, data1, size);
        t2 = get_cur_time_us();
        printf("copy %d bytes, memcpy waste time %lldus\n", size_list[i], t2 - t1);

        t1 = get_cur_time_us();
        memcopy_super(data2, data1, size);
        t2 = get_cur_time_us();
        printf("copy %d bytes, memcopy_super waste time %lldus\n\n", size_list[i], t2 - t1);
    }

    free(data1);
    free(data2);
    return 0;
}

#endif /* MEMCOPY_BENCH_NO_MAIN */
#if 0
copy 10485760 bytes, memcpy waste time 6502us
copy 10485760 bytes, memcopy_super waste time 12689us
copy 1048576 bytes, memcpy waste time 659us
copy 1048576 bytes, memcopy_super waste time 999us
copy 102400 bytes, memcpy waste time 76us
copy 102400 bytes, memcopy_super waste time 73us
copy 10240 bytes, memcpy waste time 1us
copy 10240 bytes, memcopy_super waste time 5us
copy 1024 bytes, memcpy waste time 1us
copy 1024 bytes, memcopy_super waste time 1us
#endif