SSE Optimized Compositing
As part of a graphics API I've been working on (for my own use; it's hardly ready for production), I decided to try learning SSE optimization by making the compositing routine faster.
I came up with an implementation that, according to my tests, is 3-5X faster than the non-optimized version. It composites a single color into a destination pixel buffer, starting at a given pixel, for a specified run of pixels, using source-over compositing on an RGBA, 8bpc, integer pixel buffer.
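Source over means the new color is laid on top of whatever is already in the buffer, weighted by the color's alpha; per channel that works out to dst = comp + dst - dst*a/255, which corresponds to source over with a premultiplied source color and is the formula both paths below compute (with the division by 255 approximated by a shift). A minimal scalar sketch of that per-pixel blend, using a hypothetical helper name:

    /* A minimal scalar sketch of the per-channel blend used below; the helper
       name is hypothetical. comp holds a premultiplied RGBA color and dst
       points at one destination pixel. dst*a/255 is approximated as
       ((dst*a) >> 8) + 1 for dst > 0, exactly as in the SSE path. */
    static void composite_pixel_scalar(unsigned char *dst, const unsigned char *comp)
    {
        unsigned char a = comp[3];
        for (int c = 0; c < 4; c++) {
            unsigned short scaled = (unsigned short)((unsigned short)dst[c] * a);
            dst[c] = (unsigned char)(comp[c] + dst[c] - (dst[c] > 0 ? (scaled >> 8) + 1 : 0));
        }
    }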
The optimized version is presented below:
    static __m128i zero  = _mm_set_epi32(0x0, 0x0, 0x0, 0x0);
    static __m128i one   = _mm_set_epi32(0x00010001U, 0x00010001U, 0x00010001U, 0x00010001U);
    static __m128i mask1 = _mm_set_epi32(0x00FF00FFU, 0x00FF00FFU, 0x00FF00FFU, 0x00FF00FFU);
    static __m128i mask2 = _mm_set_epi32(0xFF00FF00U, 0xFF00FF00U, 0xFF00FF00U, 0xFF00FF00U);

    if (_comp[3] == 255) {
        /* We're compositing a fully opaque pixel, just copy onto the destination */
        unsigned int *dst = (unsigned int *)_dst;
        size_t i = 0;   /* The pixel offset from dst */
        size_t j = 0;   /* The number of quad-pixels processed */

        /* Leading 0..3 pixels */
        while (((intptr_t)(dst + i) & (intptr_t)0xF) != 0) {
            *(dst + i) = *(unsigned int *)_comp;
            i++;
            run--;
        }

        /* Start at pixel i */
        __m128i *mdst = (__m128i *)(dst + i);

        /* Set 4-pixel chunks */
        for (j = 0; j < (run >> 2); j++) {
            _mm_prefetch((const char *)(mdst + 1), _MM_HINT_T0);
            _mm_store_si128(mdst++, _src);
        }

        /* If we couldn't get all of the run in 4-pixel chunks, get them now */
        for (size_t k = 0; k < run % 4; k++) {
            *(dst + i + k + (j << 2)) = *(unsigned int *)_comp;
        }
    } else {
        /* We need full composition as this pixel has transparency */
        unsigned char *dst = (unsigned char *)_dst;
        size_t i = 0;   /* char offset from dst */
        size_t j = 0;   /* Quad-pixels processed */

        /* Get the non-aligned starting 0..3 pixels */
        while (((intptr_t)(dst + i) & (intptr_t)0xF) != 0) {
            dst[0+i] = _comp[0] + dst[0+i] - (dst[0+i] > 0 ? ((((unsigned short)dst[0+i] * _comp[3]) >> 8) + 1) : 0); // R
            dst[1+i] = _comp[1] + dst[1+i] - (dst[1+i] > 0 ? ((((unsigned short)dst[1+i] * _comp[3]) >> 8) + 1) : 0); // G
            dst[2+i] = _comp[2] + dst[2+i] - (dst[2+i] > 0 ? ((((unsigned short)dst[2+i] * _comp[3]) >> 8) + 1) : 0); // B
            dst[3+i] = _comp[3] + dst[3+i] - (dst[3+i] > 0 ? ((((unsigned short)dst[3+i] * _comp[3]) >> 8) + 1) : 0); // A
            i += 4;
            run--;
        }

        /* Process 4 pixels (16 bytes) per iteration */
        __m128i *mdst = (__m128i *)(dst + i);
        __m128i d, d1, d2, res;

        for (; j < (run >> 2); j++) {
            /* Load 4 pixels */
            d = _mm_load_si128(mdst);

            /* Take chars 0,2,4,6,8,10,12,14 and composite */
            d1 = _mm_and_si128(d, mask1);
            d1 = _mm_add_epi16(_mm_subs_epi16(d1, _mm_add_epi16(
                     _mm_srli_epi16(_mm_mullo_epi16(d1, _c3), 8),
                     _mm_and_si128(_mm_cmpgt_epi16(d1, zero), one))), _ce);

            /* Take chars 1,3,5,7,9,11,13,15 and composite */
            d2 = _mm_srli_epi16(_mm_and_si128(d, mask2), 8);

            /* Prefetch the next 4 pixels */
            _mm_prefetch((const char *)(mdst + 1), _MM_HINT_T0);

            d2 = _mm_add_epi16(_mm_subs_epi16(d2, _mm_add_epi16(
                     _mm_srli_epi16(_mm_mullo_epi16(d2, _c3), 8),
                     _mm_and_si128(_mm_cmpgt_epi16(d2, zero), one))), _co);

            /* Shift odd chars (d2) back to the high 8 bits and OR with d1 */
            res = _mm_or_si128(d1, _mm_slli_epi16(d2, 8));

            /* Store 4 pixels */
            _mm_store_si128(mdst, res);
            mdst++;
        }

        /* Get the trailing 0..3 pixels */
        dst += i + (j << 4);
        for (size_t k = 0; k < (run % 4) * 4; k += 4) {
            dst[0+k] = _comp[0] + dst[0+k] - (dst[0+k] > 0 ? ((((unsigned short)dst[0+k] * _comp[3]) >> 8) + 1) : 0); // R
            dst[1+k] = _comp[1] + dst[1+k] - (dst[1+k] > 0 ? ((((unsigned short)dst[1+k] * _comp[3]) >> 8) + 1) : 0); // G
            dst[2+k] = _comp[2] + dst[2+k] - (dst[2+k] > 0 ? ((((unsigned short)dst[2+k] * _comp[3]) >> 8) + 1) : 0); // B
            dst[3+k] = _comp[3] + dst[3+k] - (dst[3+k] > 0 ? ((((unsigned short)dst[3+k] * _comp[3]) >> 8) + 1) : 0); // A
        }
    }
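Two details in that inner loop are worth calling out. The pixel is split into its even bytes (R and B, via mask1) and odd bytes (G and A, via mask2) so that every 8-bit channel gets a full 16-bit lane to multiply in, and the _mm_cmpgt_epi16/one masking reproduces the scalar "dst > 0 ? ... + 1 : 0" term. The shift-by-8 plus one is a divide-free stand-in for dividing by 255; a small standalone check (hypothetical test code, not part of the routine) shows it differs from the exact x * a / 255 by at most one:

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical standalone check, not part of the routine: compare the
       ((x * a) >> 8) + 1 shortcut used above against the exact x * a / 255
       over every input the blend can see, and report the worst difference. */
    int main(void)
    {
        int worst = 0;
        for (int x = 1; x < 256; x++) {          /* the shortcut is only applied when dst > 0 */
            for (int a = 0; a < 256; a++) {
                int approx = ((x * a) >> 8) + 1;
                int exact  = (x * a) / 255;
                int diff   = abs(approx - exact);
                if (diff > worst)
                    worst = diff;
            }
        }
        printf("worst difference: %d\n", worst); /* the approximation only ever overshoots slightly */
        return 0;
    }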
_c3 is a vector composed solely of the alpha value of the color.
_ce is a vector composed of pairs of the red and blue components.
_co is a vector composed of pairs of the green and alpha components.
_src is a vector holding four copies of the 32-bit source color, used by the opaque fast path.
_c3, _ce, _co, and _src are set up by the following code:
    unsigned short r = _comp[0], g = _comp[1], b = _comp[2], a = _comp[3];

    _c3  = _mm_setr_epi16(a, a, a, a, a, a, a, a);
    _ce  = _mm_setr_epi16(r, b, r, b, r, b, r, b);
    _co  = _mm_setr_epi16(g, a, g, a, g, a, g, a);
    _src = _mm_set_epi32(*(int *)_comp, *(int *)_comp, *(int *)_comp, *(int *)_comp);
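To show how the pieces might fit together, here is a hypothetical call site; set_comp_color and composite_run are made-up wrapper names standing in for however the API actually exposes the setup snippet above and the run loop, and the 16-byte-aligned buffer just keeps more of each run in the aligned SSE stores rather than the scalar lead-in and tail loops:

    #include <stddef.h>

    /* Hypothetical wrapper declarations; the real API's names and signatures
       may differ. set_comp_color() would run the _c3/_ce/_co/_src setup shown
       above, and composite_run() would execute the run loop shown earlier. */
    void set_comp_color(const unsigned char *comp);
    void composite_run(unsigned char *dst, const unsigned char *comp, size_t run);

    /* Fill a width x height RGBA buffer with a half-transparent red.
       pixels is assumed to be 16-byte aligned so most of each run lands in
       the aligned SSE stores instead of the scalar lead-in/tail loops. */
    void fill_example(unsigned char *pixels, size_t width, size_t height)
    {
        unsigned char color[4] = { 128, 0, 0, 128 };  /* premultiplied RGBA */

        set_comp_color(color);                        /* fills _c3, _ce, _co and _src */
        for (size_t y = 0; y < height; y++)
            composite_run(pixels + y * width * 4, color, width);  /* one scanline per run */
    }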