SSE Optimized Compositing

As part of a graphics API I’ve been working on (for my own use; it’s hardly ready for production), I decided to try learning SSE optimization by making the compositing routine faster.

I came up with an implementation which, according to my tests, is 3-5X faster than the non-optimized version. The optimized version below composites a single color starting at a destination pixel buffer for a specified run of pixels. It uses source over compositing on an RGBA, 8bpc, integer pixel buffer.

The optimized version is presented below:

/* All-zero vector; used with _mm_cmpgt_epi16 to test lanes for > 0. */
/* NOTE(review): _mm_set_epi32 is not a constant expression in strict C, so
 * these static initializers rely on a compiler extension (or C++); in
 * portable C they would need to be initialized at runtime — confirm. */
static __m128i zero = 
        _mm_set_epi32(0x0, 0x0, 0x0, 0x0);
/* 0x0001 in every 16-bit lane; the "+1" truncation-compensation term. */
static __m128i one = 
        _mm_set_epi32(0x00010001U, 0x00010001U, 0x00010001U, 0x00010001U);
/* Selects the even bytes (low byte of each 16-bit lane): R and B channels. */
static __m128i mask1 = 
        _mm_set_epi32(0x00FF00FFU, 0x00FF00FFU, 0x00FF00FFU, 0x00FF00FFU);
/* Selects the odd bytes (high byte of each 16-bit lane): G and A channels. */
static __m128i mask2 = 
        _mm_set_epi32(0xFF00FF00U, 0xFF00FF00U, 0xFF00FF00U, 0xFF00FF00U);

if (_comp[3] == 255) {
    /*
     * Fully opaque source color: source-over reduces to a plain copy,
     * so splat the packed color across the whole run.
     */
    unsigned int *dst = (unsigned int *)_dst;

    size_t i = 0;   /* pixel offset from dst */
    size_t j = 0;   /* quad-pixel (16-byte) chunks processed */

    /* NOTE(review): reading _comp through an unsigned int* assumes the
     * little-endian byte order of x86 and bends strict aliasing — kept
     * as in the original, since SSE already pins us to x86. */

    /* Copy the leading 0..3 pixels until dst+i is 16-byte aligned.
     * Guard on run so a short run cannot overrun the buffer. */
    while (run > 0 && (((uintptr_t)(dst + i)) & 0xF) != 0) {
        *(dst + i) = *(unsigned int *)_comp;
        i++; run--;
    }

    /* Store 4 pixels (16 bytes) per iteration from the aligned position. */
    __m128i *mdst = (__m128i *)(dst + i);
    for (j = 0; j < (run >> 2); j++) {
        _mm_prefetch((const char *)(mdst + 1), _MM_HINT_T0);
        _mm_store_si128(mdst++, _src);
    }

    /* Trailing 0..3 pixels that did not fill a whole chunk;
     * (j << 2) converts chunks back to a pixel count. */
    for (size_t k = 0; k < run % 4; k++) {
        *(dst + i + k + (j << 2)) = *(unsigned int *)_comp;
    }
} else {
    /*
     * Partially transparent source: full source-over blend per channel,
     *   dst' = src + dst - ((dst * alpha) >> 8) - (dst > 0 ? 1 : 0)
     * where the ">> 8" plus the conditional "+1" approximates
     * division by 255 with integer truncation compensation.
     */
    unsigned char *dst = (unsigned char *)_dst;

    size_t i = 0;   /* BYTE offset from dst (4 bytes per pixel) */
    size_t j = 0;   /* quad-pixel (16-byte) chunks processed */

    /* Blend the leading 0..3 unaligned pixels with scalar code.
     * Guard on run so a short run cannot overrun the buffer. */
    while (run > 0 && (((uintptr_t)(dst + i)) & 0xF) != 0) {
        for (int c = 0; c < 4; c++) {   /* R, G, B, A */
            dst[c + i] = _comp[c] + dst[c + i] -
                (dst[c + i] > 0 ? ((((unsigned short)dst[c + i] * _comp[3]) >> 8) + 1) : 0);
        }
        i += 4; run--;
    }

    /* Process 4 pixels (16 bytes) per iteration from the aligned position. */
    __m128i *mdst = (__m128i *)(dst + i);
    __m128i d, d1, d2, res;
    for (; j < (run >> 2); j++) {
        /* Load 4 pixels */
        d = _mm_load_si128(mdst);

        /* Even bytes (R and B of each pixel) widened into 16-bit lanes. */
        d1 = _mm_and_si128(d, mask1);
        d1 = _mm_add_epi16(_mm_subs_epi16(d1, _mm_add_epi16(
                    _mm_srli_epi16(_mm_mullo_epi16(d1, _c3), 8), 
                    _mm_and_si128(_mm_cmpgt_epi16(d1, zero), one))), _ce);

        /* Odd bytes (G and A of each pixel) shifted down into 16-bit lanes. */
        d2 = _mm_srli_epi16(_mm_and_si128(d, mask2), 8);

        /* Prefetch the next 4 pixels while the ALU work completes. */
        _mm_prefetch((const char *)(mdst + 1), _MM_HINT_T0);

        d2 = _mm_add_epi16(_mm_subs_epi16(d2, _mm_add_epi16(
                    _mm_srli_epi16(_mm_mullo_epi16(d2, _c3), 8),
                    _mm_and_si128(_mm_cmpgt_epi16(d2, zero), one))), _co);

        /* Re-interleave: odd channels back into the high byte, OR with even. */
        res = _mm_or_si128(d1, _mm_slli_epi16(d2, 8));

        /* Store 4 pixels */
        _mm_store_si128(mdst, res);

        mdst++;
    }

    /* Trailing 0..3 pixels; (j << 4) converts chunks to a byte offset. */
    dst += i + (j << 4);
    for (size_t k = 0; k < (run % 4) * 4; k += 4) {
        for (int c = 0; c < 4; c++) {   /* R, G, B, A */
            dst[c + k] = _comp[c] + dst[c + k] -
                (dst[c + k] > 0 ? ((((unsigned short)dst[c + k] * _comp[3]) >> 8) + 1) : 0);
        }
    }
}

_c3 is a vector composed solely of the alpha value of the color.
_ce is a vector composed of pairs of the red and blue components.
_co is a vector composed of pairs of the green and alpha components.
_src is a vector holding four copies of the packed 32-bit composite color (used by the opaque fast path).

_c3, _ce, _co, and _src are set from the following function:

/* Widen the 8-bit color components so 16-bit lane math cannot truncate. */
unsigned short r = _comp[0], g = _comp[1], b = _comp[2], a = _comp[3];

/* Alpha broadcast to all eight 16-bit lanes (multiplier for the blend). */
_c3 = _mm_set1_epi16((short)a);

/* Even-byte channels (R, B) paired per pixel, four pixels per vector. */
_ce = _mm_setr_epi16(r, b, r, b, r, b, r, b);

/* Odd-byte channels (G, A) paired per pixel, four pixels per vector. */
_co = _mm_setr_epi16(g, a, g, a, g, a, g, a);

/* Four copies of the packed 32-bit color, for the opaque fast path.
 * NOTE(review): the int* cast assumes x86 little-endian layout — fine
 * here since SSE pins us to x86, but not portable. */
_src = _mm_set1_epi32(*(int *)_comp);

Posted in Software | Leave a comment

Leave a Reply

Your email address will not be published.