缩放图片算法优化 sse

前情提要
这里实现了打印文件（prt、prn）的缩放算法。
核心功能如下：

void CZoomPrtFile::zoomPrtFile(BYTE* pTargetData)
{float xRatio = static_cast<float>(m_perWidth - 1) / m_zoomWidth;float yRatio = static_cast<float>(m_perHeight - 1) / m_zoomHeight;int srcX=0, srcY=0;int nTargetLineSize = (m_zoomWidth * m_header.nImageColorDeep + 7) / 8;BYTE* pSourceLineData = new BYTE[m_header.nBytePerLine];for (size_t row = 0; row < m_zoomHeight; row++) {srcY = static_cast<int>(yRatio * row);for (int ColorNum = 0; ColorNum < m_header.nImageColorNum; ColorNum++){memset(pSourceLineData, 0, m_header.nBytePerLine);ReadLine(pSourceLineData, srcY * m_header.nImageColorNum + ColorNum, 0, m_header.nBytePerLine);for (size_t column = 0; column < m_zoomWidth; column++) {srcX = static_cast<int>(xRatio * column);// 获取源图像的 bit像素值unsigned char srcValue = getPixel(pSourceLineData, srcX, m_header.nImageColorDeep);int y = (row * m_header.nImageColorNum + ColorNum) * nTargetLineSize + sizeof(m_header);// 设置目标图像的2bit像素值setPixel(pTargetData+y,column, srcValue, m_header.nImageColorDeep);}}}delete[] pSourceLineData;
}unsigned char CZoomPrtFile::getPixel(unsigned char* data, int x, int nImageColorDeep)
{//unsigned char存储 8/int nImageColorDeep 个 int nImageColorDeep bit像素 ,其中 nImageColorDeep ：1,2,4,8 RIP image output bit per colorint byteIndex = x * nImageColorDeep / 8; // 字节索引int bitIndex = (x * nImageColorDeep) % 8; // 位索引// 创建掩码unsigned char mask = (1 << nImageColorDeep) - 1;//std::lock_guard<std::mutex> locker(fileMutex);// 位运算提取像素unsigned char pixel;{ std::lock_guard<std::mutex> locker(fileMutex); pixel = data[byteIndex];}pixel = (pixel >> bitIndex) & mask;return pixel;
}unsigned char CZoomPrtFile::getPixel(unsigned char data, int x, int nImageColorDeep)
{//unsigned char存储 8/int nImageColorDeep 个 int nImageColorDeep bit像素 ,其中 nImageColorDeep ：1,2,4,8 RIP image output bit per colorint byteIndex = x * nImageColorDeep / 8; // 字节索引int bitIndex = (x * nImageColorDeep) % 8; // 位索引// 创建掩码unsigned char mask = (1 << nImageColorDeep) - 1;//std::lock_guard<std::mutex> locker(fileMutex);// 位运算提取像素unsigned char pixel = (data >> bitIndex) & mask;return pixel;
}

使用 SSE（128 位）指令优化

/// Variant of the print-image scaler that computes the source/target byte and
/// bit indices for four consecutive target pixels at a time with 128-bit SIMD
/// integer/float instructions.
///
/// For every target row and colour plane one source line is fetched via
/// ReadLine; each target pixel takes the value of the source pixel at the
/// truncated coordinate (xRatio * column, yRatio * row). Pixels are packed
/// with pixel 0 in the least significant bits of byte 0.
///
/// Note: _mm_mullo_epi32 is an SSE4.1 instruction, so this routine needs an
/// SSE4.1-capable CPU and build flags despite its name.
///
/// @param pTargetData Destination buffer. Pixel data is written starting
///        sizeof(m_header) bytes into it; each target line occupies
///        (m_zoomWidth * nImageColorDeep + 7) / 8 bytes. The caller must
///        provide a buffer large enough for all rows and colour planes.
void CZoomPrtFile::zoomPrtFileSSE128(BYTE* pTargetData)
{
    const float xRatio = static_cast<float>(m_perWidth - 1) / m_zoomWidth;
    const float yRatio = static_cast<float>(m_perHeight - 1) / m_zoomHeight;
    const int nColorDeep = m_header.nImageColorDeep;

    const __m128 xmmRatio = _mm_set1_ps(xRatio);
    const __m128i colorDeep = _mm_set1_epi32(nColorDeep);

    // The bit index is computed in 32-bit lanes, so the "mod 8" mask must be 7
    // per 32-bit lane. A per-byte 0x07 would yield 0x07070707 and leave bits
    // 8-10, 16-18 and 24-26 of the bit position in the result.
    const __m128i bitMask = _mm_set1_epi32(0x07);

    // Pixel mask: the lowest nImageColorDeep bits (the bit depth, not the
    // number of colour planes).
    const unsigned char mask = static_cast<unsigned char>((1 << nColorDeep) - 1);

    const int nTargetLineSize = (m_zoomWidth * nColorDeep + 7) / 8;
    BYTE* pSourceLineData = new BYTE[m_header.nBytePerLine];

    for (int row = 0; row < m_zoomHeight; row++)
    {
        const int srcY = static_cast<int>(yRatio * row);

        for (int ColorNum = 0; ColorNum < m_header.nImageColorNum; ColorNum++)
        {
            // Fetch the source line of this colour plane into a zeroed buffer.
            memset(pSourceLineData, 0, m_header.nBytePerLine);
            ReadLine(pSourceLineData, srcY * m_header.nImageColorNum + ColorNum, 0, m_header.nBytePerLine);

            // Start of the target line; independent of the column, and kept in
            // size_t so large outputs do not overflow int arithmetic.
            const size_t targetOffset =
                (static_cast<size_t>(row) * m_header.nImageColorNum + ColorNum) * nTargetLineSize
                + sizeof(m_header);
            BYTE* desDataPointer = pTargetData + targetOffset;

            for (int column = 0; column < m_zoomWidth; column += 4)
            {
                // Four consecutive target column indices.
                const __m128i xmmColumn = _mm_setr_epi32(column, column + 1, column + 2, column + 3);

                // Source column = trunc(column * xRatio), same as the scalar cast.
                const __m128 xmmColumnF = _mm_cvtepi32_ps(xmmColumn);
                const __m128i xmmSrcX = _mm_cvttps_epi32(_mm_mul_ps(xmmColumnF, xmmRatio));

                // Source: bit position -> byte index (>> 3) and bit index (& 7).
                const __m128i xmmSrcBitPos = _mm_mullo_epi32(xmmSrcX, colorDeep);
                const __m128i xmmSrcXByteIndex = _mm_srli_epi32(xmmSrcBitPos, 3);
                const __m128i xmmSrcXBitIndex = _mm_and_si128(xmmSrcBitPos, bitMask);

                // Target: bit position -> byte index and bit index.
                const __m128i xmmDesBitPos = _mm_mullo_epi32(xmmColumn, colorDeep);
                const __m128i xmmDesXByteIndex = _mm_srli_epi32(xmmDesBitPos, 3);
                const __m128i xmmDesXBitIndex = _mm_and_si128(xmmDesBitPos, bitMask);

                // Spill all four index vectors so they can be used as scalar
                // array subscripts; every array read below must be stored first.
                alignas(16) int srcXByteIndex[4];
                alignas(16) int srcXBitIndex[4];
                alignas(16) int DesXByteIndex[4];
                alignas(16) int DesXBitIndex[4];
                _mm_store_si128(reinterpret_cast<__m128i*>(srcXByteIndex), xmmSrcXByteIndex);
                _mm_store_si128(reinterpret_cast<__m128i*>(srcXBitIndex), xmmSrcXBitIndex);
                _mm_store_si128(reinterpret_cast<__m128i*>(DesXByteIndex), xmmDesXByteIndex);
                _mm_store_si128(reinterpret_cast<__m128i*>(DesXBitIndex), xmmDesXBitIndex);

                // Only the lanes that are real columns are processed; when the
                // width is not a multiple of 4 the surplus lanes would read
                // past the source line and write past the target line.
                const int remaining = static_cast<int>(m_zoomWidth - column);
                const int lanes = remaining < 4 ? remaining : 4;

                for (int lane = 0; lane < lanes; ++lane)
                {
                    const unsigned char pixel =
                        (pSourceLineData[srcXByteIndex[lane]] >> srcXBitIndex[lane]) & mask;

                    // Clear the pixel's bit field in the target byte, then
                    // insert the new value.
                    BYTE& desByte = desDataPointer[DesXByteIndex[lane]];
                    desByte = static_cast<BYTE>(
                        (desByte & ~(mask << DesXBitIndex[lane])) | (pixel << DesXBitIndex[lane]));
                }
            }
        }
    }

    // Release the line buffer; without this every call leaks nBytePerLine bytes.
    delete[] pSourceLineData;
}