将0x1234转换为0x11223344

如何以高性能的方式将hex数0x1234扩展到0x11223344?

/*
 * Expand each nibble of a 16-bit value into a byte by duplicating it:
 * 0x1234 -> 0x11223344 (the asker's original bit-shuffle, as a function).
 *
 * The five mask/shift terms each place one copy of a nibble; the two copies
 * of every nibble land adjacent, e.g. 0x1 ends up at bits 28-31 and 24-27.
 *
 * NOTE: the original snippet printed the values with "%p", which is
 * undefined behavior for an unsigned int (C11 7.21.6.1); use "%x" instead:
 *   printf("0x%x -> 0x%x\n", c, expand_nibbles(c));
 */
static unsigned expand_nibbles(unsigned c)
{
    return (c & 0x00FF) << 4
         | (c & 0x000F)
         | (c & 0x0FF0) << 8
         | (c & 0xFF00) << 12
         | (c & 0xF000) << 16;
}

输出:

 0x1234 -> 0x11223344 

我需要这个颜色转换。用户以0xARGB的形式提供他们的数据,我需要将其转换为0xAARRGGBB。是的,可能有数百万个,因为每个值可能对应一个像素。1000×1000像素就是一百万。

实际情况更加复杂,因为一个32位值包含前景色和背景色。 所以0xARGBargb变成: [ 0xAARRGGBB, 0xaarrggbb ]

哦,是的,还有一件事:在真正的应用程序中,我还会对alpha取反,因为在OpenGL中0xFF表示不透明,0x00表示完全透明,这在大多数情况下不方便——因为通常你只有RGB部分,alpha并不存在。

这可以使用SSE2完成如下:

 // ExpandSSE2: widen one 64-bit 0xARGBargb pair into two 0xAARRGGBB values
 // using five vector ops (unpack, and, mullo, mulhi, mullo) per 16 bytes.
 // (Collapsed onto one physical line by the page extraction — everything
 // after the first // below is commented out until line breaks are restored.)
 // NOTE(review): despite the name, _mm_extract_epi64 is an SSE4.1
 // instruction (PEXTRQ) and _mm_cvtsi64_si128 requires an x64 target, so
 // this helper as written is not pure SSE2 — confirm the target ISA.
 // The `unsigned __int64` type and reference parameters are MSVC/C++.
 void ExpandSSE2(unsigned __int64 in, unsigned __int64 &outLo, unsigned __int64 &outHi) { __m128i const mask = _mm_set1_epi16((short)0xF00F); __m128i const mul0 = _mm_set1_epi16(0x0011); __m128i const mul1 = _mm_set1_epi16(0x1000); __m128i v; v = _mm_cvtsi64_si128(in); // Move the 64-bit value to a 128-bit register v = _mm_unpacklo_epi8(v, v); // 0x12 -> 0x1212 v = _mm_and_si128(v, mask); // 0x1212 -> 0x1002 v = _mm_mullo_epi16(v, mul0); // 0x1002 -> 0x1022 v = _mm_mulhi_epu16(v, mul1); // 0x1022 -> 0x0102 v = _mm_mullo_epi16(v, mul0); // 0x0102 -> 0x1122 outLo = _mm_extract_epi64(v, 0); outHi = _mm_extract_epi64(v, 1); } 

当然,你会想把这个函数中的内联函数(intrinsics)放到内层循环中,并把常量提取出去。你还会想跳过x64通用寄存器,把值直接加载到128位SSE寄存器中。有关如何执行此操作的示例,请参阅下面性能测试中的SSE2实现。

其核心只有五条指令,而且一次对四个颜色值执行操作。所以,每个颜色值只需要大约1.25条指令。还应该指出,凡是x64可用的地方SSE2都可用。

针对各种解决方案的性能测试:这里有几个人提到,想知道哪个更快,唯一的方法就是运行代码,这毫无疑问是事实。所以我把一些解决方案编译成性能测试,以便进行同等条件的比较。我选择了那些我认为与其他方案差异较大、值得单独测试的解决方案。所有的解决方案都从内存中读取数据,对数据进行操作,然后写回内存。在实践中,当输入数据中没有剩余的16个字节可处理时,一些SSE解决方案还需要对对齐和收尾情况做额外处理。我测试的代码是使用Visual Studio 2013以Release配置编译的,运行在4 GHz的Core i7上。

这是我的结果:

 ExpandOrig: 56.234 seconds // From asker's original question ExpandSmallLUT: 30.209 seconds // From Dmitry's answer ExpandLookupSmallOneLUT: 33.689 seconds // from Dmitry's answer ExpandLookupLarge: 51.312 seconds // A straightforward lookup table ExpandAShelly: 43.829 seconds // From AShelly's answer ExpandAShellyMulOp: 43.580 seconds // AShelly's answer with an optimization ExpandSSE4: 17.854 seconds // My original SSE4 answer ExpandSSE4Unroll: 17.405 seconds // My original SSE4 answer with loop unrolling ExpandSSE2: 17.281 seconds // My current SSE2 answer ExpandSSE2Unroll: 17.152 seconds // My current SSE2 answer with loop unrolling 

在上面的测试结果中,您会看到我包含了提问者的代码、三个查找表实现(包括Dmitry的答案中提出的小查找表实现)。AShelly的解决方案也包括在内,还有我对它做了优化(可以省去一次运算)的版本。我包括了我原来的SSE4实现,以及我后来做出的SSE2版本(现在作为答案给出),还有两者的循环展开版本——因为它们是这里最快的,我想看看展开能提升多少速度。我还包括了用SSE4实现AShelly答案的版本。

到目前为止,我必须宣布自己是赢家。但是源代码在下面,所以任何人都可以在他们的平台上进行测试,并把自己的解决方案纳入测试,看看自己是否得到了更快的方案。

 /*
  * Complete benchmark harness (collapsed onto a few physical lines by the
  * page extraction — each #define/#include directive and each // comment
  * must be restored to its own line before this will compile).
  * Contents: the asker's scalar routine (ExpandOrig), small/large lookup
  * table variants, AShelly's multiply-spread variants, SSE4/SSE2 vector
  * versions plus unrolled forms, and a main() that fills a random input
  * buffer, times RERUN_COUNT passes of each routine with clock(), and
  * memcmp-checks each routine's output buffer against the previous one.
  * NOTE(review): memcmp is used but <cstring> is not in the include list —
  * verify it compiles on your toolchain. Timing divides clock() by 1000,
  * which assumes CLOCKS_PER_SEC == 1000 (true on MSVC, not portable).
  */
 #define DATA_SIZE_IN ((unsigned)(1024 * 1024 * 128)) #define DATA_SIZE_OUT ((unsigned)(2 * DATA_SIZE_IN)) #define RERUN_COUNT 500 #include <cstdlib> #include <ctime> #include <iostream> #include <utility> #include <emmintrin.h> // SSE2 #include <tmmintrin.h> // SSSE3 #include <smmintrin.h> // SSE4 void ExpandOrig(unsigned char const *in, unsigned char const *past, unsigned char *out) { unsigned u, v; do { // Read in data u = *(unsigned const*)in; v = u >> 16; u &= 0x0000FFFF; // Do computation u = (u & 0x00FF) << 4 | (u & 0x000F) | (u & 0x0FF0) << 8 | (u & 0xFF00) << 12 | (u & 0xF000) << 16; v = (v & 0x00FF) << 4 | (v & 0x000F) | (v & 0x0FF0) << 8 | (v & 0xFF00) << 12 | (v & 0xF000) << 16; // Store data *(unsigned*)(out) = u; *(unsigned*)(out + 4) = v; in += 4; out += 8; } while (in != past); } unsigned LutLo[256], LutHi[256]; void MakeLutLo(void) { for (unsigned i = 0, x; i < 256; ++i) { x = i; x = ((x & 0xF0) << 4) | (x & 0x0F); x |= (x << 4); LutLo[i] = x; } } void MakeLutHi(void) { for (unsigned i = 0, x; i < 256; ++i) { x = i; x = ((x & 0xF0) << 20) | ((x & 0x0F) << 16); x |= (x << 4); LutHi[i] = x; } } void ExpandLookupSmall(unsigned char const *in, unsigned char const *past, unsigned char *out) { unsigned u, v; do { // Read in data u = *(unsigned const*)in; v = u >> 16; u &= 0x0000FFFF; // Do computation u = LutHi[u >> 8] | LutLo[u & 0xFF]; v = LutHi[v >> 8] | LutLo[v & 0xFF]; // Store data *(unsigned*)(out) = u; *(unsigned*)(out + 4) = v; in += 4; out += 8; } while (in != past); } void ExpandLookupSmallOneLUT(unsigned char const *in, unsigned char const *past, unsigned char *out) { unsigned u, v; do { // Read in data u = *(unsigned const*)in; v = u >> 16; u &= 0x0000FFFF; // Do computation u = ((LutLo[u >> 8] << 16) | LutLo[u & 0xFF]); v = ((LutLo[v >> 8] << 16) | LutLo[v & 0xFF]); // Store data *(unsigned*)(out) = u; *(unsigned*)(out + 4) = v; in += 4; out += 8; } while (in != past); } unsigned LutLarge[256 * 256]; void MakeLutLarge(void) { for (unsigned 
i = 0; i < (256 * 256); ++i) LutLarge[i] = LutHi[i >> 8] | LutLo[i & 0xFF]; } void ExpandLookupLarge(unsigned char const *in, unsigned char const *past, unsigned char *out) { unsigned u, v; do { // Read in data u = *(unsigned const*)in; v = u >> 16; u &= 0x0000FFFF; // Do computation u = LutLarge[u]; v = LutLarge[v]; // Store data *(unsigned*)(out) = u; *(unsigned*)(out + 4) = v; in += 4; out += 8; } while (in != past); } void ExpandAShelly(unsigned char const *in, unsigned char const *past, unsigned char *out) { unsigned u, v, w, x; do { // Read in data u = *(unsigned const*)in; v = u >> 16; u &= 0x0000FFFF; // Do computation w = (((u & 0xF0F) * 0x101) & 0xF000F) + (((u & 0xF0F0) * 0x1010) & 0xF000F00); x = (((v & 0xF0F) * 0x101) & 0xF000F) + (((v & 0xF0F0) * 0x1010) & 0xF000F00); w += w * 0x10; x += x * 0x10; // Store data *(unsigned*)(out) = w; *(unsigned*)(out + 4) = x; in += 4; out += 8; } while (in != past); } void ExpandAShellyMulOp(unsigned char const *in, unsigned char const *past, unsigned char *out) { unsigned u, v; do { // Read in data u = *(unsigned const*)in; v = u >> 16; u &= 0x0000FFFF; // Do computation u = ((((u & 0xF0F) * 0x101) & 0xF000F) + (((u & 0xF0F0) * 0x1010) & 0xF000F00)) * 0x11; v = ((((v & 0xF0F) * 0x101) & 0xF000F) + (((v & 0xF0F0) * 0x1010) & 0xF000F00)) * 0x11; // Store data *(unsigned*)(out) = u; *(unsigned*)(out + 4) = v; in += 4; out += 8; } while (in != past); } void ExpandSSE4(unsigned char const *in, unsigned char const *past, unsigned char *out) { __m128i const mask0 = _mm_set1_epi16((short)0x8000), mask1 = _mm_set1_epi8(0x0F), mul = _mm_set1_epi16(0x0011); __m128i u, v, w, x; do { // Read input into low 8 bytes of u and v u = _mm_load_si128((__m128i const*)in); v = _mm_unpackhi_epi8(u, u); // Expand each single byte to two bytes u = _mm_unpacklo_epi8(u, u); // Do it again for v w = _mm_srli_epi16(u, 4); // Copy the value into w and shift it right half a byte x = _mm_srli_epi16(v, 4); // Do it again for v u = 
_mm_blendv_epi8(u, w, mask0); // Select odd bytes from w, and even bytes from v, giving the the desired value in the upper nibble of each byte v = _mm_blendv_epi8(v, x, mask0); // Do it again for v u = _mm_and_si128(u, mask1); // Clear the all the upper nibbles v = _mm_and_si128(v, mask1); // Do it again for v u = _mm_mullo_epi16(u, mul); // Multiply each 16-bit value by 0x0011 to duplicate the lower nibble in the upper nibble of each byte v = _mm_mullo_epi16(v, mul); // Do it again for v // Write output _mm_store_si128((__m128i*)(out ), u); _mm_store_si128((__m128i*)(out + 16), v); in += 16; out += 32; } while (in != past); } void ExpandSSE4Unroll(unsigned char const *in, unsigned char const *past, unsigned char *out) { __m128i const mask0 = _mm_set1_epi16((short)0x8000), mask1 = _mm_set1_epi8(0x0F), mul = _mm_set1_epi16(0x0011); __m128i u0, v0, w0, x0, u1, v1, w1, x1, u2, v2, w2, x2, u3, v3, w3, x3; do { // Read input into low 8 bytes of u and v u0 = _mm_load_si128((__m128i const*)(in )); u1 = _mm_load_si128((__m128i const*)(in + 16)); u2 = _mm_load_si128((__m128i const*)(in + 32)); u3 = _mm_load_si128((__m128i const*)(in + 48)); v0 = _mm_unpackhi_epi8(u0, u0); // Expand each single byte to two bytes u0 = _mm_unpacklo_epi8(u0, u0); // Do it again for v v1 = _mm_unpackhi_epi8(u1, u1); // Do it again u1 = _mm_unpacklo_epi8(u1, u1); // Again for u1 v2 = _mm_unpackhi_epi8(u2, u2); // Again for v1 u2 = _mm_unpacklo_epi8(u2, u2); // Again for u2 v3 = _mm_unpackhi_epi8(u3, u3); // Again for v2 u3 = _mm_unpacklo_epi8(u3, u3); // Again for u3 w0 = _mm_srli_epi16(u0, 4); // Copy the value into w and shift it right half a byte x0 = _mm_srli_epi16(v0, 4); // Do it again for v w1 = _mm_srli_epi16(u1, 4); // Again for u1 x1 = _mm_srli_epi16(v1, 4); // Again for v1 w2 = _mm_srli_epi16(u2, 4); // Again for u2 x2 = _mm_srli_epi16(v2, 4); // Again for v2 w3 = _mm_srli_epi16(u3, 4); // Again for u3 x3 = _mm_srli_epi16(v3, 4); // Again for v3 u0 = _mm_blendv_epi8(u0, w0, mask0); // 
Select even bytes from w, and odd bytes from v, giving the the desired value in the upper nibble of each byte v0 = _mm_blendv_epi8(v0, x0, mask0); // Do it again for v u1 = _mm_blendv_epi8(u1, w1, mask0); // Again for u1 v1 = _mm_blendv_epi8(v1, x1, mask0); // Again for v1 u2 = _mm_blendv_epi8(u2, w2, mask0); // Again for u2 v2 = _mm_blendv_epi8(v2, x2, mask0); // Again for v2 u3 = _mm_blendv_epi8(u3, w3, mask0); // Again for u3 v3 = _mm_blendv_epi8(v3, x3, mask0); // Again for v3 u0 = _mm_and_si128(u0, mask1); // Clear the all the upper nibbles v0 = _mm_and_si128(v0, mask1); // Do it again for v u1 = _mm_and_si128(u1, mask1); // Again for u1 v1 = _mm_and_si128(v1, mask1); // Again for v1 u2 = _mm_and_si128(u2, mask1); // Again for u2 v2 = _mm_and_si128(v2, mask1); // Again for v2 u3 = _mm_and_si128(u3, mask1); // Again for u3 v3 = _mm_and_si128(v3, mask1); // Again for v3 u0 = _mm_mullo_epi16(u0, mul); // Multiply each 16-bit value by 0x0011 to duplicate the lower nibble in the upper nibble of each byte v0 = _mm_mullo_epi16(v0, mul); // Do it again for v u1 = _mm_mullo_epi16(u1, mul); // Again for u1 v1 = _mm_mullo_epi16(v1, mul); // Again for v1 u2 = _mm_mullo_epi16(u2, mul); // Again for u2 v2 = _mm_mullo_epi16(v2, mul); // Again for v2 u3 = _mm_mullo_epi16(u3, mul); // Again for u3 v3 = _mm_mullo_epi16(v3, mul); // Again for v3 // Write output _mm_store_si128((__m128i*)(out ), u0); _mm_store_si128((__m128i*)(out + 16), v0); _mm_store_si128((__m128i*)(out + 32), u1); _mm_store_si128((__m128i*)(out + 48), v1); _mm_store_si128((__m128i*)(out + 64), u2); _mm_store_si128((__m128i*)(out + 80), v2); _mm_store_si128((__m128i*)(out + 96), u3); _mm_store_si128((__m128i*)(out + 112), v3); in += 64; out += 128; } while (in != past); } void ExpandSSE2(unsigned char const *in, unsigned char const *past, unsigned char *out) { __m128i const mask = _mm_set1_epi16((short)0xF00F), mul0 = _mm_set1_epi16(0x0011), mul1 = _mm_set1_epi16(0x1000); __m128i u, v; do { // Read input into 
low 8 bytes of u and v u = _mm_load_si128((__m128i const*)in); v = _mm_unpackhi_epi8(u, u); // Expand each single byte to two bytes u = _mm_unpacklo_epi8(u, u); // Do it again for v u = _mm_and_si128(u, mask); v = _mm_and_si128(v, mask); u = _mm_mullo_epi16(u, mul0); v = _mm_mullo_epi16(v, mul0); u = _mm_mulhi_epu16(u, mul1); // This can also be done with a right shift of 4 bits, but this seems to mesure faster v = _mm_mulhi_epu16(v, mul1); u = _mm_mullo_epi16(u, mul0); v = _mm_mullo_epi16(v, mul0); // write output _mm_store_si128((__m128i*)(out ), u); _mm_store_si128((__m128i*)(out + 16), v); in += 16; out += 32; } while (in != past); } void ExpandSSE2Unroll(unsigned char const *in, unsigned char const *past, unsigned char *out) { __m128i const mask = _mm_set1_epi16((short)0xF00F), mul0 = _mm_set1_epi16(0x0011), mul1 = _mm_set1_epi16(0x1000); __m128i u0, v0, u1, v1; do { // Read input into low 8 bytes of u and v u0 = _mm_load_si128((__m128i const*)(in )); u1 = _mm_load_si128((__m128i const*)(in + 16)); v0 = _mm_unpackhi_epi8(u0, u0); // Expand each single byte to two bytes u0 = _mm_unpacklo_epi8(u0, u0); // Do it again for v v1 = _mm_unpackhi_epi8(u1, u1); // Do it again u1 = _mm_unpacklo_epi8(u1, u1); // Again for u1 u0 = _mm_and_si128(u0, mask); v0 = _mm_and_si128(v0, mask); u1 = _mm_and_si128(u1, mask); v1 = _mm_and_si128(v1, mask); u0 = _mm_mullo_epi16(u0, mul0); v0 = _mm_mullo_epi16(v0, mul0); u1 = _mm_mullo_epi16(u1, mul0); v1 = _mm_mullo_epi16(v1, mul0); u0 = _mm_mulhi_epu16(u0, mul1); v0 = _mm_mulhi_epu16(v0, mul1); u1 = _mm_mulhi_epu16(u1, mul1); v1 = _mm_mulhi_epu16(v1, mul1); u0 = _mm_mullo_epi16(u0, mul0); v0 = _mm_mullo_epi16(v0, mul0); u1 = _mm_mullo_epi16(u1, mul0); v1 = _mm_mullo_epi16(v1, mul0); // write output _mm_store_si128((__m128i*)(out ), u0); _mm_store_si128((__m128i*)(out + 16), v0); _mm_store_si128((__m128i*)(out + 32), u1); _mm_store_si128((__m128i*)(out + 48), v1); in += 32; out += 64; } while (in != past); } void 
ExpandAShellySSE4(unsigned char const *in, unsigned char const *past, unsigned char *out) { __m128i const zero = _mm_setzero_si128(), v0F0F = _mm_set1_epi32(0x0F0F), vF0F0 = _mm_set1_epi32(0xF0F0), v0101 = _mm_set1_epi32(0x0101), v1010 = _mm_set1_epi32(0x1010), v000F000F = _mm_set1_epi32(0x000F000F), v0F000F00 = _mm_set1_epi32(0x0F000F00), v0011 = _mm_set1_epi32(0x0011); __m128i u, v, w, x; do { // Read in data u = _mm_load_si128((__m128i const*)in); v = _mm_unpackhi_epi16(u, zero); u = _mm_unpacklo_epi16(u, zero); // original source: ((((a & 0xF0F) * 0x101) & 0xF000F) + (((a & 0xF0F0) * 0x1010) & 0xF000F00)) * 0x11; w = _mm_and_si128(u, v0F0F); x = _mm_and_si128(v, v0F0F); u = _mm_and_si128(u, vF0F0); v = _mm_and_si128(v, vF0F0); w = _mm_mullo_epi32(w, v0101); // _mm_mullo_epi32 is what makes this require SSE4 instead of SSE2 x = _mm_mullo_epi32(x, v0101); u = _mm_mullo_epi32(u, v1010); v = _mm_mullo_epi32(v, v1010); w = _mm_and_si128(w, v000F000F); x = _mm_and_si128(x, v000F000F); u = _mm_and_si128(u, v0F000F00); v = _mm_and_si128(v, v0F000F00); u = _mm_add_epi32(u, w); v = _mm_add_epi32(v, x); u = _mm_mullo_epi32(u, v0011); v = _mm_mullo_epi32(v, v0011); // write output _mm_store_si128((__m128i*)(out ), u); _mm_store_si128((__m128i*)(out + 16), v); in += 16; out += 32; } while (in != past); } int main() { unsigned char *const indat = new unsigned char[DATA_SIZE_IN ], *const outdat0 = new unsigned char[DATA_SIZE_OUT], *const outdat1 = new unsigned char[DATA_SIZE_OUT], * curout = outdat0, * lastout = outdat1, * place; unsigned start, stop; place = indat + DATA_SIZE_IN - 1; do { *place = (unsigned char)rand(); } while (place-- != indat); MakeLutLo(); MakeLutHi(); MakeLutLarge(); for (unsigned testcount = 0; testcount < 1000; ++testcount) { // Solution posted by the asker start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandOrig(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandOrig:\t\t\t" << (((stop - start) / 1000) 
/ 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); // Dmitry's small lookup table solution start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandLookupSmall(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandSmallLUT:\t\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; // Dmitry's small lookup table solution using only one lookup table start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandLookupSmallOneLUT(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandLookupSmallOneLUT:\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; // Large lookup table solution start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandLookupLarge(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandLookupLarge:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; // AShelly's Interleave bits by Binary Magic Numbers solution start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandAShelly(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandAShelly:\t\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." 
<< ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; // AShelly's Interleave bits by Binary Magic Numbers solution optimizing out an addition start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandAShellyMulOp(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandAShellyMulOp:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; // My SSE4 solution start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandSSE4(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandSSE4:\t\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; // My SSE4 solution unrolled start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandSSE4Unroll(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandSSE4Unroll:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; // My SSE2 solution start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandSSE2(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandSSE2:\t\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." 
<< ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; // My SSE2 solution unrolled start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandSSE2Unroll(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandSSE2Unroll:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; // AShelly's Interleave bits by Binary Magic Numbers solution implemented using SSE2 start = clock(); for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun) ExpandAShellySSE4(indat, indat + DATA_SIZE_IN, curout); stop = clock(); std::cout << "ExpandAShellySSE4:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl; std::swap(curout, lastout); if (memcmp(outdat0, outdat1, DATA_SIZE_OUT)) std::cout << "INCORRECT OUTPUT" << std::endl; } delete[] indat; delete[] outdat0; delete[] outdat1; return 0; } 

注意:

我最初在这里给出的是一个SSE4实现。后来我找到了用SSE2实现同样功能的方法,这样可以在更多的平台上运行,而且SSE2的实现也更快。所以上面给出的解决方案现在是SSE2实现,而不是SSE4。SSE4的实现仍然可以在性能测试或编辑历史中看到。

我不确定最有效的方法是什么,但是这个更短一些:

 #include <stdio.h>

 /* Expand 0x1234 to 0x11223344 by repeatedly spreading the nibbles apart
  * and OR-ing a 4-bit-shifted copy back in (three steps, no lookup table). */
 int main()
 {
     unsigned x = 0x1234;

     x |= x << 8;                                    /* 0x00001234 -> 0x00123634 */
     x = ((x & 0x00f000f0) << 4) | (x & 0x000f000f); /* -> 0x01020304 */
     x |= x << 4;                                    /* -> 0x11223344 */

     printf("0x1234 -> 0x%08x\n", x);
     return 0;
 }

如果您需要按照编辑中的建议反复执行此操作,则可以考虑生成查找表并使用它。下面的函数动态分配并初始化这样一个表:

 /*
  * Allocate and fill a 65536-entry table mapping every 16-bit value 0xWXYZ
  * to its nibble-doubled 32-bit form 0xWWXXYYZZ.
  * Returns NULL if allocation fails; the caller owns (and must free()) the
  * returned buffer.
  */
 unsigned *makeLookupTable(void)
 {
     unsigned *tbl = malloc(65536 * sizeof *tbl);
     if (tbl == NULL)
         return NULL;

     for (unsigned idx = 0; idx < 65536; idx++) {
         unsigned v = idx;
         v |= v << 8;                                    /* spread high byte up */
         v = ((v & 0x00f000f0) << 4) | (v & 0x000f000f); /* one nibble per byte */
         v |= v << 4;                                    /* duplicate each nibble */
         /* Uncomment the next line to invert the high (alpha) byte as
            mentioned in the edit. */
         /* v = v ^ 0xff000000; */
         tbl[idx] = v;
     }
     return tbl;
 }

之后,每个转换只是像这样的:

 result = lookuptable[input]; 

..或者可能:

 result = lookuptable[input & 0xffff]; 

或者,可以对高字节和低字节分别使用一个(或一对)更小、对缓存更友好的查找表(正如@LưuVĩnhPhúc在注释中所指出的)。在这种情况下,表生成代码可能是:

 /*
  * Build a 256-entry table for the LOW byte of a 16-bit 0xARGB value:
  * entry 0xGB maps to 0x0000GGBB.  Returns NULL on allocation failure;
  * the caller owns (and must free()) the result.
  */
 unsigned *makeLookupTableLow(void)
 {
     unsigned *tbl = malloc(256 * sizeof *tbl);
     if (tbl == NULL)
         return NULL;

     for (unsigned i = 0; i < 256; i++) {
         unsigned v = ((i & 0xf0) << 4) | (i & 0x0f); /* 0xGB -> 0x0G0B */
         tbl[i] = v | (v << 4);                       /* 0x0G0B -> 0xGGBB */
     }
     return tbl;
 }

…和一个可选的第二个表格:

 /*
  * Build a 256-entry table for the HIGH byte of a 16-bit 0xARGB value:
  * entry 0xAR maps to 0xAARR0000.  Returns NULL on allocation failure;
  * the caller owns (and must free()) the result.
  */
 unsigned *makeLookupTableHigh(void)
 {
     unsigned *tbl = malloc(256 * sizeof *tbl);
     if (tbl == NULL)
         return NULL;

     for (unsigned i = 0; i < 256; i++) {
         unsigned v = ((i & 0xf0) << 20) | ((i & 0x0f) << 16); /* 0xAR -> 0x0A0R0000 */
         /* To invert the high (alpha) byte as mentioned in the edit,
            additionally XOR the result with 0xff000000. */
         tbl[i] = v | (v << 4);                                /* -> 0xAARR0000 */
     }
     return tbl;
 }

…并用两个表格转换值:

 result = hightable[input >> 8] | lowtable[input & 0xff]; 

…或一个(只是上面的低桌):

 result = (lowtable[input >> 8] << 16) | lowtable[input & 0xff]; result ^= 0xff000000; /* to invert high byte */ 

如果值(alpha?)的上半部分变化不大,那么即使单个大表也可能performance良好,因为连续查找在表中更接近。


我把@Apriori发布的性能测试代码拿来,做了一些调整,并为最初没有包含的其他回答添加了测试,然后用不同的设置编译了三个版本:一个是启用了SSE4.1的64位版本,编译器可以利用SSE进行优化;另外是两个32位版本,一个有SSE,一个没有。尽管三者都运行在同一个相当新的处理器上,但结果显示了最佳解决方案如何随处理器能力而变化:

  64b SSE4.1 32b SSE4.1 32b no SSE -------------------------- ---------- ---------- ---------- ExpandOrig time: 3.502 s 3.501 s 6.260 s ExpandLookupSmall time: 3.530 s 3.997 s 3.996 s ExpandLookupLarge time: 3.434 s 3.419 s 3.427 s ExpandIsalamon time: 3.654 s 3.673 s 8.870 s ExpandIsalamonOpt time: 3.784 s 3.720 s 8.719 s ExpandChronoKitsune time: 3.658 s 3.463 s 6.546 s ExpandEvgenyKluev time: 6.790 s 7.697 s 13.383 s ExpandIammilind time: 3.485 s 3.498 s 6.436 s ExpandDmitri time: 3.457 s 3.477 s 5.461 s ExpandNitish712 time: 3.574 s 3.800 s 6.789 s ExpandAdamLiss time: 3.673 s 5.680 s 6.969 s ExpandAShelly time: 3.524 s 4.295 s 5.867 s ExpandAShellyMulOp time: 3.527 s 4.295 s 5.852 s ExpandSSE4 time: 3.428 s ExpandSSE4Unroll time: 3.333 s ExpandSSE2 time: 3.392 s ExpandSSE2Unroll time: 3.318 s ExpandAShellySSE4 time: 3.392 s 

可执行文件是在64位Linux上使用gcc 4.8.1编译的,三个版本分别使用 -m64 -O3 -march=core2 -msse4.1、-m32 -O3 -march=core2 -msse4.1 和 -m32 -O3 -march=core2 -mno-sse。对于32位版本,Apriori的SSE测试被省略了(在启用SSE的32位环境下会崩溃,而在禁用SSE时显然无法工作)。

其中所做的调整是使用实际图像数据而不是随机值(具有透明背景的对象的照片),这大大提高了大型查找表的性能,但是对于其他的几乎没有什么影响。

从本质上讲,当SSE不可用(或未被使用)时,查找表以压倒性优势获胜,而手工编写的SSE解决方案在其他情况下获胜。不过也值得注意的是,当编译器可以利用SSE进行优化时,大多数位操作解决方案几乎与手工编写的SSE一样快——仍然更慢,但只是稍微慢一点。

这是另一个尝试,使用八个操作:

 /* AShelly's 8-operation form: (c & 0x0F0F) * 0x101 makes a copy of two of
  * the nibbles shifted up 8 bits, (c & 0xF0F0) * 0x1010 does the same for
  * the other two, the masks keep exactly one spread copy of each nibble
  * (giving 0x01020304 for c == 0x1234), and b += b * 0x10 — i.e. b *= 0x11
  * — duplicates every nibble into a full byte. */
 b = (((c & 0x0F0F) * 0x101) & 0x00F000F) + (((c & 0xF0F0) * 0x1010) & 0xF000F00); b += b * 0x10; printf("%x\n",b); //Shows '0x11223344' 

*请注意,这个post最初包含了完全不同的代码,基于Sean Anderson的bithacks页面的Binary Magic Numbers中的Interleave位 。 但这不是OP所要求的。 所以它已经删除。 下面的大部分评论都是指那个缺less的版本 。

我想把这个链接添加到答案池中,因为我认为在谈论优化时,记住我们正在运行的硬件,以及编译我们的平台代码的技术是非常重要的。

博客文章《利用CPU流水线》考察了如何针对CPU流水线优化一段代码。文中有一个例子:作者试图把数学运算简化为最少的实际运算次数,然而从耗时来看,它离最优解相差甚远。我在这里看到了几个这样的答案,它们可能是对的,也可能不对。唯一确定的方法,就是实际测量你那段代码从头到尾的时间,并与其他代码比较。读读这篇博客,它非常有趣。

I think I should mention that I am in this particular case not going to put ANY code up here unless I have truly tried multiple attempts, and actually gotten on that is particularly faster through multiple tries.

I think that the lookup table approach suggested by Dimitri is a good choice, but I suggest to go one step further and generate the table in compile time; doing the work at compile time will obviously lessen the execution time.

First, we create a compile-time value, using any of the suggested methods:

 // Compile-time variants (C++11 constexpr): transform() chains the three
 // shift/mask steps from Dmitri's runtime answer, and two class templates
 // (aarrggbb_dimitri / aarrggbb_adamLiss) expose the expanded value as a
 // static constant so a lookup table can be generated at compile time.
 // (Collapsed onto one physical line by the page extraction — the text
 // after the first // below is commented out until line breaks are restored.)
 constexpr unsigned int transform1(unsigned int x) { return ((x << 8) | x); } constexpr unsigned int transform2(unsigned int x) { return (((x & 0x00f000f0) << 4) | (x & 0x000f000f)); } constexpr unsigned int transform3(unsigned int x) { return ((x << 4) | x); } constexpr unsigned int transform(unsigned int x) { return transform3(transform2(transform1(x))); } // Dimitri version, using constexprs template <unsigned int argb> struct aarrggbb_dimitri { static const unsigned int value = transform(argb); }; // Adam Liss version template <unsigned int argb> struct aarrggbb_adamLiss { static const unsigned int value = (argb & 0xf000) * 0x11000 + (argb & 0x0f00) * 0x01100 + (argb & 0x00f0) * 0x00110 + (argb & 0x000f) * 0x00011; }; 

And then, we create the compile-time lookup table with whatever method we have available, I'll wish to use the C++14 integer sequence but I don't know which compiler will the OP be using. So another possible approach would be to use a pretty ugly macro:

 // Illustrative sketch only — the "... and so on" placeholders mean this is
 // NOT compilable as written.  EXPAND16 instantiates aarrggbb<> for 16
 // consecutive arguments and EXPAND chains EXPAND16 over successive 0x10
 // blocks, so the full expansion could initialize a compile-time table.
 #define EXPAND16(x) aarrggbb<x + 0>::value, \ aarrggbb<x + 1>::value, \ aarrggbb<x + 2>::value, \ aarrggbb<x + 3>::value, \ aarrggbb<x + 4>::value, \ aarrggbb<x + 5>::value, \ aarrggbb<x + 6>::value, \ ... and so on #define EXPAND EXPAND16(0), \ EXPAND16(0x10), \ EXPAND16(0x20), \ EXPAND16(0x30), \ EXPAND16(0x40), \ ... and so on ... and so on 

See demo here .

PS: The Adam Liss approach could be used without C++11.

If multiplication is cheap and 64-bit arithmetics is available, you could use this code:

 /*
  * Expand 0x0000WXYZ -> 0xWWXXYYZZ using three 64-bit multiplications
  * (same idea as AShelly's version, recast so the whole 16-bit input is
  * handled at once):
  *   1) * 0x0001000100010001 replicates the input into all four 16-bit lanes;
  *   2) the first mask keeps a different source nibble in each lane;
  *   3) * 0x0000001001001001 sums shifted copies so the four nibbles line up;
  *   4) the second mask isolates them as 0xW0X0Y0Z0 << 36;
  *   5) (x >> 36) * 0x11 duplicates each nibble into a full byte.
  * Worth using only where 64-bit multiplication is cheap.
  * (The original fragment was a loose statement list printing via
  * std::cout; packaged here as a reusable function.)
  */
 static uint64_t expand_mul64(uint64_t x)
 {
     x *= 0x0001000100010001ull;
     x &= 0xF0000F0000F0000Full;
     x *= 0x0000001001001001ull;
     x &= 0xF0F0F0F000000000ull;
     return (x >> 36) * 0x11;
 }

In fact, it uses the same idea as the original attempt by AShelly.

This works and may be easier to understand, but bit manipulations are so cheap that I wouldn't worry much about efficiency.

 #include <stdio.h>

 /*
  * Expand 0x1234 -> 0x11223344 with four multiplications: multiplying a
  * nibble by 0x11 duplicates it, and the extra factor of 0x1000/0x100/0x10/1
  * folded into each constant simultaneously shifts the doubled nibble into
  * its final byte position.
  * Fixes vs the original: `void main()` is not a standard signature —
  * C11 5.1.2.2.1 requires `int main(void)` — and the unused <stdlib.h>
  * include is dropped.
  */
 int main(void)
 {
     unsigned int c = 0x1234, b;
     b = (c & 0xf000) * 0x11000
       + (c & 0x0f00) * 0x01100
       + (c & 0x00f0) * 0x00110
       + (c & 0x000f) * 0x00011;
     printf("%x -> %x\n", c, b);
     return 0;
 }

Assuming that, you want to always convert 0xWXYZ to 0xWWXXYYZZ , I believe that below solution would be little faster than the one you suggested:

 /* Variant of the asker's expression that needs one fewer AND: each nibble
  * is first placed in the LOW nibble of its destination byte
  * (b == 0x01020304 for c == 0x1234), then b |= b << 4 duplicates every
  * nibble upward at once, giving 0x11223344. */
 unsigned int c = 0x1234; unsigned int b = (c & 0xf) | ((c & 0xf0) << 4) | ((c & 0xf00) << 8) | ((c & 0xf000) << 12); b |= (b << 4); 

Notice that, one & ( and ) operation is saved from your solution. 🙂
演示 。

另一种方法是:

 /* Nibble-walking variant (DWORD is the Win32 32-bit unsigned type):
  * OrVal() advances the nibble mask one hex digit per call, extracts that
  * nibble of input_val, and ORs it into temp_val twice — at `shift` and at
  * `shift + 4` — so the nibble becomes a full byte of the result.
  * Converter2() applies it to all four nibbles; the trailing statement is
  * a usage example. */
 DWORD OrVal(DWORD & nible_pos, DWORD input_val, DWORD temp_val, int shift) { if (nible_pos==0) nible_pos = 0x0000000F; else nible_pos = nible_pos << 4; DWORD nible = input_val & nible_pos; temp_val |= (nible << shift); temp_val |= (nible << (shift + 4)); return temp_val; } DWORD Converter2(DWORD input_val) { DWORD nible_pos = 0x00000000; DWORD temp_val = 0x00000000; temp_val = OrVal(nible_pos, input_val, temp_val, 0); temp_val = OrVal(nible_pos, input_val, temp_val, 4); temp_val = OrVal(nible_pos, input_val, temp_val, 8); temp_val = OrVal(nible_pos, input_val, temp_val, 12); return temp_val; } DWORD val2 = Converter2(0x1234); 

An optimized version (3 times faster):

/*
 * Expand 0xWXYZ -> 0xWWXXYYZZ by walking the four input nibbles.
 * Each pass advances the nibble mask one hex digit, extracts that nibble,
 * and ORs it into the result twice — at bit offset `shift` and 4 bits
 * higher — which duplicates it into a full byte (0x1234 -> 0x11223344).
 *
 * Fixes vs the original listing:
 *  - the keyword `else` had been corrupted into the Chinese word 其他 by
 *    the page translation, which does not compile; restored here;
 *  - the unused local array `bit_nible[4]` is removed (the advancing
 *    `nible_pos` mask already serves that purpose).
 * DWORD is the Win32 32-bit unsigned integer type.
 */
DWORD Converter3(DWORD input_val)
{
    DWORD nible_pos = 0;
    DWORD temp_val = 0;

    for (int shift = 0; shift < 16; shift += 4)
    {
        if (nible_pos == 0)
            nible_pos = 0x0000000F;
        else
            nible_pos = nible_pos << 4;

        DWORD nible = input_val & nible_pos;
        temp_val |= (nible << shift);
        temp_val |= (nible << (shift + 4));
    }

    return temp_val;
}

Perhaps this could be more simpler & efficient.

 /*
  * Expand 0xWXYZ -> 0xWWXXYYZZ: first place each nibble in the HIGH nibble
  * of its destination byte (0x1234 -> 0x10203040), then OR with a 4-bit
  * right shift to duplicate every nibble downward (-> 0x11223344).
  *
  * NOTE: the original snippet printed the values with "%p", which is
  * undefined behavior for unsigned int arguments (C11 7.21.6.1); print
  * with "%x" instead:  printf("0x%x -> 0x%x\n", g, expand_shift_or(g));
  */
 static unsigned expand_shift_or(unsigned g)
 {
     unsigned ans = ((g & 0xf000) << 16)
                  + ((g & 0x0f00) << 12)
                  + ((g & 0x00f0) << 8)
                  + ((g & 0x000f) << 4);
     return ans | (ans >> 4);
 }
 /*
  * Expand 0x0000ARGB -> 0xAARRGGBB by doubling each nibble.
  * Steps: move the AR byte into the high half (00AR00GB), open one nibble
  * of headroom with a left shift (0AR00GB0), then copy A and G one nibble
  * left (AAR0GGB0) and R and B one nibble right (AARRGGBB).
  */
 unsigned long transform(unsigned long n)
 {
     unsigned long spread = (((n & 0xff00) << 8) | (n & 0x00ff)) << 4;
     spread |= (spread & 0x0f000f00L) << 4;
     spread |= (spread & 0x00f000f0L) >> 4;
     return spread;
 }

The alpha and red components are shifted into the higher 2 bytes where they belong, and the result is then shifted left by 4 bits, resulting in every component being exactly where it needs to be.

With a form of 0AR0 0GB0, a bit mask and left-shift combination is OR'ed with the current value. This copies the A and G components to the position just left of them. The same thing is done for the R and B components, except in the opposite direction.

If you are going to do this for OpenGL , I suggest you to use a glTexImageXD function with type parameter set to GL_UNSIGNED_SHORT_4_4_4_4 . Your OpenGL driver should do the rest. And about the transparency inversion you can always manipulate blending via the glBlendFunc and glBlendEquation functions.

While others operate on hard-core optimization…

Take this as your best bet:

 // String-level alternative: skip the "0x" prefix and append every hex
 // digit twice, turning "0xACED" into "0xAACCEEDD".  No bit manipulation.
 // (Collapsed onto one physical line by the page extraction — note the
 // final `}` of main is swallowed by the trailing // comment until line
 // breaks are restored.)
 // NOTE(review): `int i` compared against argb.length() (size_t) mixes
 // signedness — harmless for short inputs but warns under -Wsign-compare.
 std::string toAARRGGBB(const std::string &argb) { std::string ret("0x"); int start = 2; //"0x####"; // ^^ skipped for (int i = start;i < argb.length(); ++i) { ret += argb[i]; ret += argb[i]; } return ret; } int main() { std::string argb = toAARRGGBB("0xACED"); //!!! } 

Haha