diff --git a/alg.c b/alg.c index 93c260f..1692537 100644 --- a/alg.c +++ b/alg.c @@ -14,6 +14,12 @@ #include "mmx.h" #endif +#ifdef __SSE2__ +#define HAVE_SSE2 +#include <emmintrin.h> /* SSE2 intrinsics: __m128i and the _mm_* calls used by alg/sse2.h */ +#include "alg/sse2.h" +#endif + #define MAX2(x, y) ((x) > (y) ? (x) : (y)) #define MAX3(x, y, z) ((x) > (y) ? ((x) > (z) ? (x) : (z)) : ((y) > (z) ? (y) : (z))) @@ -352,42 +358,23 @@ void alg_draw_red_location(struct coord *cent, struct images *imgs, int width, u #define DIFF(x, y) (ABS((x)-(y))) #define NDIFF(x, y) (ABS(x) * NORM / (ABS(x) + 2 * DIFF(x, y))) +#ifdef HAVE_SSE2 +#include "alg/alg_noise_tune.sse2.c" +#else +#include "alg/alg_noise_tune.plain.c" +#endif + /** * alg_noise_tune * */ void alg_noise_tune(struct context *cnt, unsigned char *new) { - struct images *imgs = &cnt->imgs; - int i; - unsigned char *ref = imgs->ref; - int diff, sum = 0, count = 0; - unsigned char *mask = imgs->mask; - unsigned char *smartmask = imgs->smartmask_final; - - i = imgs->motionsize; - - for (; i > 0; i--) { - diff = ABS(*ref - *new); - - if (mask) - diff = ((diff * *mask++) / 255); - - if (*smartmask) { - sum += diff + 1; - count++; - } - - ref++; - new++; - smartmask++; - } - - if (count > 3) /* Avoid divide by zero. 
*/ - sum /= count / 3; - - /* 5: safe, 4: regular, 3: more sensitive */ - cnt->noise = 4 + (cnt->noise + sum) / 2; +#ifdef HAVE_SSE2 + alg_noise_tune_sse2(cnt, new); +#else + alg_noise_tune_plain(cnt, new); +#endif } /** @@ -1301,6 +1288,15 @@ int alg_switchfilter(struct context *cnt, int diffs, unsigned char *newimg) return 0; } +#define ACCEPT_STATIC_OBJECT_TIME 10 /* Seconds */ +#define EXCLUDE_LEVEL_PERCENT 20 + +#ifdef HAVE_SSE2 +#include "alg/alg_update_reference_frame.sse2.c" +#else +#include "alg/alg_update_reference_frame.plain.c" +#endif + /** * alg_update_reference_frame * @@ -1314,55 +1310,11 @@ int alg_switchfilter(struct context *cnt, int diffs, unsigned char *newimg) * action - UPDATE_REF_FRAME or RESET_REF_FRAME * */ -#define ACCEPT_STATIC_OBJECT_TIME 10 /* Seconds */ -#define EXCLUDE_LEVEL_PERCENT 20 void alg_update_reference_frame(struct context *cnt, int action) { - int accept_timer = cnt->lastrate * ACCEPT_STATIC_OBJECT_TIME; - int i, threshold_ref; - int *ref_dyn = cnt->imgs.ref_dyn; - unsigned char *image_virgin = cnt->imgs.image_virgin; - unsigned char *ref = cnt->imgs.ref; - unsigned char *smartmask = cnt->imgs.smartmask_final; - unsigned char *out = cnt->imgs.out; - - if (cnt->lastrate > 5) /* Match rate limit */ - accept_timer /= (cnt->lastrate / 3); - - if (action == UPDATE_REF_FRAME) { /* Black&white only for better performance. */ - threshold_ref = cnt->noise * EXCLUDE_LEVEL_PERCENT / 100; - - for (i = cnt->imgs.motionsize; i > 0; i--) { - /* Exclude pixels from ref frame well below noise level. */ - if (((int)(abs(*ref - *image_virgin)) > threshold_ref) && (*smartmask)) { - if (*ref_dyn == 0) { /* Always give new pixels a chance. */ - *ref_dyn = 1; - } else if (*ref_dyn > accept_timer) { /* Include static Object after some time. */ - *ref_dyn = 0; - *ref = *image_virgin; - } else if (*out) { - (*ref_dyn)++; /* Motionpixel? Keep excluding from ref frame. */ - } else { - *ref_dyn = 0; /* Nothing special - release pixel. 
*/ - *ref = (*ref + *image_virgin) / 2; - } - - } else { /* No motion: copy to ref frame. */ - *ref_dyn = 0; /* Reset pixel */ - *ref = *image_virgin; - } - - ref++; - image_virgin++; - smartmask++; - ref_dyn++; - out++; - } /* end for i */ - - } else { /* action == RESET_REF_FRAME - also used to initialize the frame at startup. */ - /* Copy fresh image */ - memcpy(cnt->imgs.ref, cnt->imgs.image_virgin, cnt->imgs.size); - /* Reset static objects */ - memset(cnt->imgs.ref_dyn, 0, cnt->imgs.motionsize * sizeof(cnt->imgs.ref_dyn)); - } +#ifdef HAVE_SSE2 + alg_update_reference_frame_sse2(cnt, action); +#else + alg_update_reference_frame_plain(cnt, action); +#endif } diff --git a/alg/alg_noise_tune.plain.c b/alg/alg_noise_tune.plain.c new file mode 100644 index 0000000..ddb861e --- /dev/null +++ b/alg/alg_noise_tune.plain.c @@ -0,0 +1,36 @@ +/** + * alg_noise_tune_plain + * + */ +static void alg_noise_tune_plain(struct context *cnt, unsigned char *new) +{ + struct images *imgs = &cnt->imgs; + unsigned char *ref = imgs->ref; + unsigned int sum = 0, count = 0; + unsigned char *mask = imgs->mask; + unsigned char *smartmask = imgs->smartmask_final; + + int i = imgs->motionsize; + + for (; i > 0; i--) { + unsigned char absdiff = (*ref > *new) ? (*ref - *new) : (*new - *ref); + + if (mask) + absdiff = ((absdiff * *mask++) / 255); + + if (*smartmask) { + sum += absdiff + 1; + count++; + } + + ref++; + new++; + smartmask++; + } + + if (count > 3) /* Avoid divide by zero. 
*/ + sum /= count / 3; + + /* 5: safe, 4: regular, 3: more sensitive */ + cnt->noise = 4 + (cnt->noise + sum) / 2; +} diff --git a/alg/alg_noise_tune.sse2.c b/alg/alg_noise_tune.sse2.c new file mode 100644 index 0000000..9c6f042 --- /dev/null +++ b/alg/alg_noise_tune.sse2.c @@ -0,0 +1,129 @@ +/** + * alg_noise_tune_sse2 + * + */ +static void alg_noise_tune_sse2(struct context *cnt, unsigned char *new) +{ + struct images *imgs = &cnt->imgs; + unsigned char *ref = imgs->ref; + unsigned int sum = 0, count = 0; + unsigned char *mask = imgs->mask; + unsigned char *smartmask = imgs->smartmask_final; + + int j, i = imgs->motionsize; + + int sse_iters; + __m128i maskrow, zeromask; + __m128i alo, ahi; + __m128i ones = _mm_set1_epi8(1); + __m128i sum16lo = _mm_setzero_si128(); + __m128i sum16hi = _mm_setzero_si128(); + __m128i sum32 = _mm_setzero_si128(); + __m128i count8 = _mm_setzero_si128(); + uint32_t total[4]; + uint8_t counts[16] __attribute__((aligned(16))); + + /* SSE reads 16 bytes at a time; truncating division: */ + for (sse_iters = i >> 4; sse_iters > 0; sse_iters--) + { + /* Load 16 bytes from images. 
Addresses need not be 16-byte aligned: */ + __m128i refrow = _mm_loadu_si128((__m128i *)ref); + __m128i newrow = _mm_loadu_si128((__m128i *)new); + + /* Calculate absolute difference per byte: abs(ref - new): */ + __m128i absdiff = _mm_absdiff_epu8(refrow, newrow); + + /* If there is a mask image, alpha blend the absdiff by its pixels: */ + if (mask) + { + /* Load mask image data: */ + maskrow = _mm_loadu_si128((__m128i *)mask); + mask += 16; + + /* "Alpha blend" absdiff with mask, absdiff *= (mask / 255): */ + absdiff = _mm_scale_epu8(absdiff, maskrow); + } + /* The increment of one per counted pixel is applied after widening to 16 bits (below): + * a saturating 8-bit add would clamp 255 plus 1 to 255, whereas the plain loop adds 256. */ + + /* Fetch the smartmask values: */ + maskrow = _mm_loadu_si128((__m128i *)smartmask); + + /* Set diff values to 0 where smartmask is 0: */ + zeromask = _mm_cmpeq_epi8(maskrow, _mm_setzero_si128()); + absdiff = _mm_andnot_si128(zeromask, absdiff); + + /* Increment count for every nonzero value of smartmask: */ + count8 = _mm_adds_epu8(count8, _mm_andnot_si128(zeromask, ones)); + + /* Split 16 bytes of sum into 16x16-bit values: + * 0 . 1 . 2 . 3 . 4 . 5 . 6 . 7 . + * 8 . 9 . A . B . C . D . E . F . + */ + sse_u8_to_u16(absdiff, &alo, &ahi); + sum16lo = _mm_adds_epu16(sum16lo, _mm_andnot_si128(_mm_unpacklo_epi8(zeromask, zeromask), _mm_add_epi16(alo, _mm_set1_epi16(1)))); /* diff plus one where smartmask != 0, else 0 */ + sum16hi = _mm_adds_epu16(sum16hi, _mm_andnot_si128(_mm_unpackhi_epi8(zeromask, zeromask), _mm_add_epi16(ahi, _mm_set1_epi16(1)))); /* at most 256 per lane per round, so 128 rounds still fit in 16 bits */ + + /* Offload these 16-bit counters into a 32-bit counter at least once + * every 128 rounds to prevent overflow: + * Also do this in the last iteration to empty out the counters: */ + if (!(sse_iters & 0x7F) || sse_iters == 1) + { + /* Split these two into 4x32 bits and do 32-bit additions: + * 0 . . . 1 . . . 2 . . . 3 . . . + + * 4 . . . 5 . . . 6 . . . 7 . . . + + * 8 . . . 9 . . . A . . . B . . . + + * C . . . D . . . E . . . F . . . 
+ * Add all of these to the running sum: */ + + sse_u16_to_u32(sum16lo, &alo, &ahi); + sum32 = _mm_add_epi32(sum32, _mm_add_epi32(alo, ahi)); + + sse_u16_to_u32(sum16hi, &alo, &ahi); + sum32 = _mm_add_epi32(sum32, _mm_add_epi32(alo, ahi)); + + sum16lo = _mm_setzero_si128(); + sum16hi = _mm_setzero_si128(); + + _mm_store_si128((__m128i *)counts, count8); + for (j = 0; j < 16; j++) { + count += counts[j]; + } + count8 = _mm_setzero_si128(); + } + + ref += 16; + new += 16; + smartmask += 16; + } + /* Outside the hot loop, write out the running sum to memory + * and add the four component uint32's to get the total sum: */ + _mm_storeu_si128((__m128i *)&total, sum32); + sum = total[0] + total[1] + total[2] + total[3]; + + /* We handled all 16-bit blocks. Truncate i to its value mod 16, so that + * the regular bytewise code can handle the remainder: */ + i &= 0x0F; + + for (; i > 0; i--) { + unsigned char absdiff = (*ref > *new) ? (*ref - *new) : (*new - *ref); + + if (mask) + absdiff = ((absdiff * *mask++) / 255); + + if (*smartmask) { + sum += absdiff + 1; + count++; + } + + ref++; + new++; + smartmask++; + } + + if (count > 3) /* Avoid divide by zero. 
*/ + sum /= count / 3; + + /* 5: safe, 4: regular, 3: more sensitive */ + cnt->noise = 4 + (cnt->noise + sum) / 2; +} diff --git a/alg/alg_update_reference_frame.plain.c b/alg/alg_update_reference_frame.plain.c new file mode 100644 index 0000000..71c0ab1 --- /dev/null +++ b/alg/alg_update_reference_frame.plain.c @@ -0,0 +1,53 @@ +static void alg_update_reference_frame_plain(struct context *cnt, int action) +{ + int accept_timer = cnt->lastrate * ACCEPT_STATIC_OBJECT_TIME; + int i, threshold_ref; + uint16_t *ref_dyn = cnt->imgs.ref_dyn; + unsigned char *image_virgin = cnt->imgs.image_virgin; + unsigned char *ref = cnt->imgs.ref; + unsigned char *smartmask = cnt->imgs.smartmask_final; + unsigned char *out = cnt->imgs.out; + + if (cnt->lastrate > 5) /* Match rate limit */ + accept_timer /= (cnt->lastrate / 3); + + if (action == UPDATE_REF_FRAME) { /* Black&white only for better performance. */ + threshold_ref = cnt->noise * EXCLUDE_LEVEL_PERCENT / 100; + + for (i = cnt->imgs.motionsize; i > 0; i--) { + int thresholdmask = ((int)(abs(*ref - *image_virgin)) > threshold_ref); + int includemask = (thresholdmask && (*smartmask != 0)); + + /* Exclude pixels from ref frame well below noise level. */ + if (includemask) { + if (*ref_dyn == 0) { /* Always give new pixels a chance. */ + *ref_dyn = 1; + } else if (*ref_dyn > accept_timer) { /* Include static Object after some time. */ + *ref_dyn = 0; + *ref = *image_virgin; + } else if (*out) { + (*ref_dyn)++; /* Motionpixel? Keep excluding from ref frame. */ + } else { + *ref_dyn = 0; /* Nothing special - release pixel. */ + *ref = (*ref + *image_virgin) / 2; + } + + } else { /* No motion: copy to ref frame. */ + *ref_dyn = 0; /* Reset pixel */ + *ref = *image_virgin; + } + + ref++; + image_virgin++; + smartmask++; + ref_dyn++; + out++; + } /* end for i */ + + } else { /* action == RESET_REF_FRAME - also used to initialize the frame at startup. 
*/ + /* Copy fresh image */ + memcpy(cnt->imgs.ref, cnt->imgs.image_virgin, cnt->imgs.size); + /* Reset static objects */ + memset(cnt->imgs.ref_dyn, 0, cnt->imgs.motionsize * sizeof(*cnt->imgs.ref_dyn)); + } +} diff --git a/alg/alg_update_reference_frame.sse2-algo.c b/alg/alg_update_reference_frame.sse2-algo.c new file mode 100644 index 0000000..b8fbef8 --- /dev/null +++ b/alg/alg_update_reference_frame.sse2-algo.c @@ -0,0 +1,55 @@ +/* This file is not meant to be included into the main program; it's intended + * to showcase, benchmark and test the algorithm used in the SSE2 version of + * this routine, in simple, non-vectorized code. + * The idea is to replace all conditionals from the "plain" function with a + * series of mask operations. This is slow when done per pixel (since we do all + * calculations for all pixels), but fast in parallel. + */ +static void alg_update_reference_frame_sse2_algo(struct context *cnt, int action) +{ + int accept_timer = cnt->lastrate * ACCEPT_STATIC_OBJECT_TIME; + int i, threshold_ref; + uint16_t *ref_dyn = cnt->imgs.ref_dyn; + unsigned char *image_virgin = cnt->imgs.image_virgin; + unsigned char *ref = cnt->imgs.ref; + unsigned char *smartmask = cnt->imgs.smartmask_final; + unsigned char *out = cnt->imgs.out; + + if (cnt->lastrate > 5) /* Match rate limit */ + accept_timer /= (cnt->lastrate / 3); + + if (action == UPDATE_REF_FRAME) { /* Black&white only for better performance. 
*/ + threshold_ref = cnt->noise * EXCLUDE_LEVEL_PERCENT / 100; + + for (i = cnt->imgs.motionsize; i > 0; i--) { + int thresholdmask = ((int)(abs(*ref - *image_virgin)) > threshold_ref); + int includemask = (thresholdmask && !(*smartmask == 0)); + int refdynzero = (*ref_dyn == 0); + int refdyntimer = (*ref_dyn > accept_timer); + int outzero = (*out == 0); + + *ref_dyn &= (includemask && !(refdynzero || refdyntimer || outzero)) ? 0xFFFF : 0; /* all-ones keeps the counter, zero clears it; a bare 0/1 mask would keep only bit 0 */ + + if (includemask && !(refdynzero || refdyntimer) && outzero) { + *ref = (*ref + *image_virgin) / 2; + } + if (includemask && !((refdyntimer || outzero) && !refdynzero)) { + *ref_dyn += 1; + } + if (!(includemask && !(refdyntimer && !refdynzero))) { + *ref = *image_virgin; + } + ref++; + image_virgin++; + smartmask++; + ref_dyn++; + out++; + } /* end for i */ + + } else { /* action == RESET_REF_FRAME - also used to initialize the frame at startup. */ + /* Copy fresh image */ + memcpy(cnt->imgs.ref, cnt->imgs.image_virgin, cnt->imgs.size); + /* Reset static objects */ + memset(cnt->imgs.ref_dyn, 0, cnt->imgs.motionsize * sizeof(*cnt->imgs.ref_dyn)); + } +} diff --git a/alg/alg_update_reference_frame.sse2.c b/alg/alg_update_reference_frame.sse2.c new file mode 100644 index 0000000..4ce6668 --- /dev/null +++ b/alg/alg_update_reference_frame.sse2.c @@ -0,0 +1,142 @@ +/* The basic algorithm is demonstrated in 'alg_update_reference_frame.sse2-algo.c' + * as regular (non-SIMD), more readable code. Comments below allude to + * snippets from that file. The idea is to use masks instead of + * branches to compose the output, then do it in parallel. 
*/ + +static void alg_update_reference_frame_sse2(struct context *cnt, int action) +{ + int accept_timer = cnt->lastrate * ACCEPT_STATIC_OBJECT_TIME; + int i, threshold_ref; + uint16_t *ref_dyn = cnt->imgs.ref_dyn; + unsigned char *image_virgin = cnt->imgs.image_virgin; + unsigned char *ref = cnt->imgs.ref; + unsigned char *smartmask = cnt->imgs.smartmask_final; + unsigned char *out = cnt->imgs.out; + + int sse_iters; + __m128i threshrow, accepttimerrow, mask; + + if (cnt->lastrate > 5) /* Match rate limit */ + accept_timer /= (cnt->lastrate / 3); + + if (action == UPDATE_REF_FRAME) { /* Black&white only for better performance. */ + threshold_ref = cnt->noise * EXCLUDE_LEVEL_PERCENT / 100; + + i = cnt->imgs.motionsize; + + /* Below we'll do a calculation to see whether our 8-bit uints + * are *larger* than threshold_ref. Threshold_ref is an int, but + * for the purposes of this check we can cast it to an 8-bit uint + * and clamp it to 255; the comparator can never exceed that value: */ + threshrow = _mm_set1_epi8((threshold_ref > 0xFF) ? 0xFF : threshold_ref); + + /* Create a row of 8 uint16_t's with almost clamped accept timer: */ + accepttimerrow = _mm_set1_epi16((accept_timer > 0xFFFE) ? 
0xFFFE : accept_timer); + + /* SSE row size is 16 bytes: */ + for (sse_iters = i >> 4; sse_iters > 0; sse_iters--) + { + /* Load reference row and virgin image: */ + __m128i refrow = _mm_loadu_si128((__m128i *)ref); + __m128i vgnrow = _mm_loadu_si128((__m128i *)image_virgin); + + /* int thresholdmask = ((int)(abs(*ref - *image_virgin)) > threshold_ref); */ + __m128i thresholdmask = _mm_cmpgt_epu8(_mm_absdiff_epu8(refrow, vgnrow), threshrow); + + /* int includemask = (thresholdmask && !(*smartmask == 0)); */ + __m128i smartmaskzero = _mm_cmpeq_epi8(_mm_loadu_si128((__m128i *)smartmask), _mm_setzero_si128()); + __m128i includemask = _mm_andnot_si128(smartmaskzero, thresholdmask); + + /* Load the two ref_dyn's: */ + __m128i refdynlo = _mm_loadu_si128((__m128i *)(ref_dyn + 0)); + __m128i refdynhi = _mm_loadu_si128((__m128i *)(ref_dyn + 8)); + + /* int refdynzero = (*ref_dyn == 0); */ + /* Make an 8-bit mask with 0xFF where ref_dyn == 0: */ + __m128i refdynzero = _mm_packs_epi16( + _mm_cmpeq_epi16(refdynlo, _mm_setzero_si128()), + _mm_cmpeq_epi16(refdynhi, _mm_setzero_si128()) + ); + + /* int refdyntimer = (*ref_dyn > accept_timer); */ + /* Make an 8-bit mask with 0xFF where ref_dyn > accept_timer: */ + __m128i refdyntimer = _mm_packs_epi16( + _mm_cmpgt_epu16(refdynlo, accepttimerrow), + _mm_cmpgt_epu16(refdynhi, accepttimerrow) + ); + + /* int outzero = (*out == 0); */ + __m128i outzero = _mm_cmpeq_epi8(_mm_loadu_si128((__m128i *)out), _mm_setzero_si128()); + + /* *ref_dyn &= (includemask && !(refdynzero || refdyntimer || outzero)); */ + mask = _mm_andnot_si128(_mm_or_si128(_mm_or_si128(refdynzero, refdyntimer), outzero), includemask); + + /* Duplicate mask to 16-bit widths: */ + refdynlo = _mm_and_si128(refdynlo, _mm_unpacklo_epi8(mask, mask)); + refdynhi = _mm_and_si128(refdynhi, _mm_unpackhi_epi8(mask, mask)); + + /* if (includemask && !(refdynzero || refdyntimer) && outzero) *ref = (*ref + *image_virgin) / 2; */ + mask = 
_mm_and_si128(_mm_andnot_si128(_mm_or_si128(refdynzero, refdyntimer), includemask), outzero); + refrow = _mm_blendv_si128(refrow, _mm_sub_epi8(_mm_avg_epu8(refrow, vgnrow), _mm_and_si128(_mm_xor_si128(refrow, vgnrow), _mm_set1_epi8(1))), mask); /* _mm_avg_epu8 rounds up; subtracting the parity bit gives the truncating (a+b)/2 of the plain code */ + + /* if (includemask && !((refdyntimer || outzero) && !refdynzero)) *ref_dyn += 1; */ + mask = _mm_andnot_si128(_mm_andnot_si128(refdynzero, _mm_or_si128(refdyntimer, outzero)), includemask); + refdynlo = _mm_adds_epu16(refdynlo, _mm_and_si128(_mm_set1_epi16(1), _mm_unpacklo_epi8(mask, mask))); + refdynhi = _mm_adds_epu16(refdynhi, _mm_and_si128(_mm_set1_epi16(1), _mm_unpackhi_epi8(mask, mask))); + + /* Store the two ref dyn's back: */ + _mm_storeu_si128((__m128i *)(ref_dyn + 0), refdynlo); + _mm_storeu_si128((__m128i *)(ref_dyn + 8), refdynhi); + + /* if (!(includemask && !(refdyntimer && !refdynzero))) *ref = *image_virgin; */ + mask = _mm_andnot_si128(_mm_andnot_si128(refdynzero, refdyntimer), includemask); + refrow = _mm_blendv_si128(vgnrow, refrow, mask); + + /* Store ref back: */ + _mm_storeu_si128((__m128i *)ref, refrow); + + ref += 16; + image_virgin += 16; + smartmask += 16; + ref_dyn += 16; + out += 16; + } + + /* Let the bytewise code handle the remaining bytes: */ + for (i = cnt->imgs.motionsize & 0x0F; i > 0; i--) { + int thresholdmask = ((int)(abs(*ref - *image_virgin)) > threshold_ref); + int includemask = (thresholdmask && (*smartmask != 0)); + + /* Exclude pixels from ref frame well below noise level. */ + if (includemask) { + if (*ref_dyn == 0) { /* Always give new pixels a chance. */ + *ref_dyn = 1; + } else if (*ref_dyn > accept_timer) { /* Include static Object after some time. */ + *ref_dyn = 0; + *ref = *image_virgin; + } else if (*out) { + (*ref_dyn)++; /* Motionpixel? Keep excluding from ref frame. */ + } else { + *ref_dyn = 0; /* Nothing special - release pixel. */ + *ref = (*ref + *image_virgin) / 2; + } + + } else { /* No motion: copy to ref frame. 
*/ + *ref_dyn = 0; /* Reset pixel */ + *ref = *image_virgin; + } + + ref++; + image_virgin++; + smartmask++; + ref_dyn++; + out++; + } /* end for i */ + + } else { /* action == RESET_REF_FRAME - also used to initialize the frame at startup. */ + /* Copy fresh image */ + memcpy(cnt->imgs.ref, cnt->imgs.image_virgin, cnt->imgs.size); + /* Reset static objects */ + memset(cnt->imgs.ref_dyn, 0, cnt->imgs.motionsize * sizeof(*cnt->imgs.ref_dyn)); + } +} diff --git a/alg/sse2.h b/alg/sse2.h new file mode 100644 index 0000000..f7e5a64 --- /dev/null +++ b/alg/sse2.h @@ -0,0 +1,89 @@ +static __inline __m128i +_mm_cmpgt_epu8 (__m128i x, __m128i y) +{ + /* Returns 0xFF where x > y: */ + return _mm_andnot_si128( + _mm_cmpeq_epi8(x, y), + _mm_cmpeq_epi8(_mm_max_epu8(x, y), x) + ); +} + +static __inline __m128i +_mm_cmple_epu16 (__m128i x, __m128i y) +{ + /* Returns 0xFFFF where x <= y: */ + return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128()); +} + +static __inline __m128i +_mm_cmpgt_epu16 (__m128i x, __m128i y) +{ + /* Returns 0xFFFF where x > y: */ + return _mm_andnot_si128(_mm_cmpeq_epi16(x, y), _mm_cmple_epu16(y, x)); +} + +static __inline __m128i +_mm_absdiff_epu8 (__m128i x, __m128i y) +{ + /* Calculate absolute difference: abs(x - y): */ + return _mm_or_si128(_mm_subs_epu8(x, y), _mm_subs_epu8(y, x)); +} + +static __inline __m128i +_mm_blendv_si128 (__m128i x, __m128i y, __m128i mask) +{ + /* Replace bit in x with bit in y when matching bit in mask is set: */ + return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(mask, y)); +} + +static __inline __m128i +_mm_div255_epu16 (__m128i x) +{ + /* Divide 8 16-bit uints by 255: + * x := ((x + 1) + (x >> 8)) >> 8: */ + return _mm_srli_epi16(_mm_adds_epu16( + _mm_adds_epu16(x, _mm_set1_epi16(1)), + _mm_srli_epi16(x, 8)), 8); +} + +static __inline void +sse_u8_to_u16 (__m128i in, __m128i *__restrict lo, __m128i *__restrict hi) +{ + /* Zero-extend an 8-bit vector to two 16-bit vectors: */ + *lo = 
_mm_unpacklo_epi8(in, _mm_setzero_si128()); + *hi = _mm_unpackhi_epi8(in, _mm_setzero_si128()); +} + +static __inline void +sse_u16_to_u32 (__m128i in, __m128i *__restrict lo, __m128i *__restrict hi) +{ + /* Zero-extend a 16-bit vector to two 32-bit vectors: */ + *lo = _mm_unpacklo_epi16(in, _mm_setzero_si128()); + *hi = _mm_unpackhi_epi16(in, _mm_setzero_si128()); +} + +static __inline __m128i +_mm_scale_epu8 (__m128i x, __m128i y) +{ + /* Returns an "alpha blend" of x with y; + * x := x * (y / 255) + * Reorder: x := (x * y) / 255 + */ + __m128i xlo, xhi; + __m128i ylo, yhi; + + /* Unpack x and y into 16-bit uints: */ + sse_u8_to_u16(x, &xlo, &xhi); + sse_u8_to_u16(y, &ylo, &yhi); + + /* Multiply x with y, keeping the low 16 bits: */ + xlo = _mm_mullo_epi16(xlo, ylo); + xhi = _mm_mullo_epi16(xhi, yhi); + + /* Divide by 255: */ + xlo = _mm_div255_epu16(xlo); + xhi = _mm_div255_epu16(xhi); + + /* Repack the 16-bit uints to 8-bit values: */ + return _mm_packus_epi16(xlo, xhi); +} diff --git a/alg/tests/Makefile b/alg/tests/Makefile new file mode 100644 index 0000000..a02b4f5 --- /dev/null +++ b/alg/tests/Makefile @@ -0,0 +1,28 @@ +CFLAGS += -std=c89 -Werror -Wall -Wextra -pedantic -msse2 -O3 +LDFLAGS += -lrt + +.PHONY: all clean + +all: test_alg_noise_tune test_alg_update_reference_frame + +test_alg_noise_tune: test_alg_noise_tune.o timer.o + $(CC) $(LDFLAGS) -o $@ $^ + +test_alg_noise_tune.o: ../alg_noise_tune.plain.c ../alg_noise_tune.sse2.c test_alg_noise_tune.c + $(CC) $(CFLAGS) -o $@ -c test_alg_noise_tune.c + +test_alg_update_reference_frame: test_alg_update_reference_frame.o timer.o + $(CC) $(LDFLAGS) -o $@ $^ + +test_alg_update_reference_frame.o: ../alg_update_reference_frame.plain.c ../alg_update_reference_frame.sse2-algo.c ../alg_update_reference_frame.sse2.c test_alg_update_reference_frame.c + $(CC) $(CFLAGS) -o $@ -c test_alg_update_reference_frame.c + +timer.o: timer.c + $(CC) $(CFLAGS) -o $@ -c $^ + +# This one is just for curiosity: 
+test_alg_update_reference_frame.s: test_alg_update_reference_frame.c + $(CC) $(CFLAGS) -S -o $@ -c $^ + +clean: + rm -f *.o *.s test_alg_noise_tune test_alg_update_reference_frame diff --git a/alg/tests/test_alg_noise_tune.c b/alg/tests/test_alg_noise_tune.c new file mode 100644 index 0000000..50cf807 --- /dev/null +++ b/alg/tests/test_alg_noise_tune.c @@ -0,0 +1,138 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdint.h> +#include <emmintrin.h> + +#include "../sse2.h" +#include "timer.h" + +/* Stub structures for test purposes: */ +struct images +{ + unsigned char *ref; + unsigned char *mask; + unsigned char *smartmask_final; + int motionsize; +}; + +struct context +{ + struct images imgs; + int noise; +}; + +#define WIDTH 600 +#define HEIGHT 400 +#define BLOCKPX 50 + +static void +init (struct context *ctx, unsigned char **new) +{ + ctx->imgs.motionsize = WIDTH * HEIGHT; + ctx->imgs.ref = malloc(ctx->imgs.motionsize); + ctx->imgs.mask = malloc(ctx->imgs.motionsize); + ctx->imgs.smartmask_final = malloc(ctx->imgs.motionsize); + *new = malloc(ctx->imgs.motionsize); +} + +static void +clean (struct context *ctx, unsigned char *new) +{ + ctx->noise = 0; + memset(ctx->imgs.ref, 0, WIDTH * HEIGHT); + memset(ctx->imgs.mask, 0, WIDTH * HEIGHT); + memset(ctx->imgs.smartmask_final, 0, WIDTH * HEIGHT); + memset(new, 0, WIDTH * HEIGHT); +} + +static void +apply_pattern (unsigned char *pattern, unsigned char *img) +{ + int x = 0, y = 0; + + /* Each pattern represents BLOCKPX * BLOCKPX pixels in the output: */ + while (y < HEIGHT) { + unsigned char *col = pattern; + for (x = 0; x < WIDTH;) { /* restart at column 0 on every row; x used to stay at WIDTH, so only row 0 was ever written */ + *img++ = *col; + if (++x % BLOCKPX == 0) { + col++; + } + } + /* After BLOCKPX rows, move to next: */ + if (++y % BLOCKPX == 0) { + pattern += WIDTH / BLOCKPX; + } + } +} + +static void +random_patterns (int seed, struct context *ctx, unsigned char *new) +{ + int i; + unsigned char *c; + unsigned char pattern[(HEIGHT * WIDTH) / BLOCKPX]; + unsigned char *ptrs[4]; + + ptrs[0] = ctx->imgs.ref; + ptrs[1] = ctx->imgs.mask; 
+ ptrs[2] = ctx->imgs.smartmask_final; + ptrs[3] = new; + + srand(seed); + + for (i = 0; i < 4; i++) { + for (c = pattern; c < (pattern + sizeof(pattern)); c++) { + *c = rand() / (RAND_MAX / 256); + } + apply_pattern(pattern, ptrs[i]); + } +} + +static void +testsuite (char *name, struct context *ctx, unsigned char *new, void (*func)(struct context *, unsigned char *)) +{ + int i; + + printf("---\n%s\n", name); + clean(ctx, new); + + timer_start(); + for (i = 100; i > 0; i--) { + func(ctx, new); + } + timer_stop(); + + printf("Noise level: %d\nTime: %.4f sec\n", ctx->noise, timer_sec()); + + for (i = 100; i > 0; i--) { + clean(ctx, new); + random_patterns(i, ctx, new); + func(ctx, new); + printf("%d ", ctx->noise); + } + puts(""); +} + +#include "../alg_noise_tune.plain.c" +#include "../alg_noise_tune.sse2.c" + +int +main () +{ + struct context ctx; + unsigned char *new; + + init(&ctx, &new); + + testsuite("plain", &ctx, new, alg_noise_tune_plain); + testsuite("sse2", &ctx, new, alg_noise_tune_sse2); + + free(new); + free(ctx.imgs.ref); + free(ctx.imgs.mask); + free(ctx.imgs.smartmask_final); + + return 0; +} diff --git a/alg/tests/test_alg_update_reference_frame.c b/alg/tests/test_alg_update_reference_frame.c new file mode 100644 index 0000000..218fa31 --- /dev/null +++ b/alg/tests/test_alg_update_reference_frame.c @@ -0,0 +1,208 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdint.h> +#include <emmintrin.h> + +#include "../sse2.h" +#include "timer.h" + +/* Stub structures for test purposes: */ +struct images +{ + unsigned char *ref; + unsigned char *out; + uint16_t *ref_dyn; + unsigned char *image_virgin; + unsigned char *smartmask_final; + int size; + int motionsize; +}; + +struct context +{ + struct images imgs; + int noise; + unsigned int lastrate; +}; + +#define WIDTH 600 +#define HEIGHT 400 +#define BLOCKPX 50 + +static void +init (struct context *ctx) +{ + ctx->imgs.motionsize = WIDTH * HEIGHT; + ctx->imgs.ref = malloc(ctx->imgs.motionsize); + ctx->imgs.out = 
malloc(ctx->imgs.motionsize); + ctx->imgs.ref_dyn = malloc(ctx->imgs.motionsize * sizeof(*ctx->imgs.ref_dyn)); + ctx->imgs.image_virgin = malloc(ctx->imgs.motionsize); + ctx->imgs.smartmask_final = malloc(ctx->imgs.motionsize); +} + +static void +clean (struct context *ctx) +{ + ctx->noise = 0; + ctx->lastrate = 0; + memset(ctx->imgs.ref, 0, WIDTH * HEIGHT); + memset(ctx->imgs.out, 0, WIDTH * HEIGHT); + memset(ctx->imgs.ref_dyn, 0, WIDTH * HEIGHT * sizeof(*ctx->imgs.ref_dyn)); + memset(ctx->imgs.image_virgin, 0, WIDTH * HEIGHT); + memset(ctx->imgs.smartmask_final, 0, WIDTH * HEIGHT); + ctx->imgs.size = WIDTH * HEIGHT; + ctx->imgs.motionsize = WIDTH * HEIGHT; +} + +static int +equal_output (struct context *ctx, int action, void (*func_a)(struct context *, int), void (*func_b)(struct context *, int)) +{ + int i, ret = 1; + struct context cxs[2]; + + for (i = 0; i < 2; i++) + { + /* Copy original context: */ + memcpy(&cxs[i], ctx, sizeof(*ctx)); + memcpy(&cxs[i].imgs, &ctx->imgs, sizeof(ctx->imgs)); + + /* Copy the original image structures: */ + #define CPY(x) cxs[i].imgs.x = malloc(ctx->imgs.size * sizeof(*ctx->imgs.x)); memcpy(cxs[i].imgs.x, ctx->imgs.x, ctx->imgs.size * sizeof(*ctx->imgs.x)); + CPY(ref) + CPY(out) + CPY(image_virgin) + CPY(smartmask_final) + CPY(ref_dyn) + #undef CPY + } + /* Run both functions on their own copy: */ + func_a(&cxs[0], action); + func_b(&cxs[1], action); + + /* Compare the complete output buffers (imgs.size elements each, same extent as CPY copies), not just the first element: */ + #define CMP(x) if (memcmp(cxs[0].imgs.x, cxs[1].imgs.x, ctx->imgs.size * sizeof(*cxs[0].imgs.x)) != 0) { ret = 0; goto out; } + CMP(ref) + CMP(ref_dyn) + #undef CMP + +out: /* Free memory, return: */ + for (i = 0; i < 2; i++) { + free(cxs[i].imgs.ref); + free(cxs[i].imgs.out); + free(cxs[i].imgs.image_virgin); + free(cxs[i].imgs.smartmask_final); + free(cxs[i].imgs.ref_dyn); + } + return ret; +} + +static void +permutate (int action, void (*func_a)(struct context *, int), void (*func_b)(struct context *, int)) +{ + #define STRIPSZ 41 + + unsigned char ref[STRIPSZ]; 
+ unsigned char out[STRIPSZ]; + unsigned char image_virgin[STRIPSZ]; + unsigned char smartmask_final[STRIPSZ]; + uint16_t ref_dyn[STRIPSZ]; + struct context ctx; + + int i, iter_ref_dyn, iter_smartmask, iter_image_virgin, iter_out, iter_ref; + + ctx.noise = 0; + ctx.lastrate = 0; + ctx.imgs.ref = ref; + ctx.imgs.out = out; + ctx.imgs.image_virgin = image_virgin; + ctx.imgs.smartmask_final = smartmask_final; + ctx.imgs.ref_dyn = ref_dyn; + ctx.imgs.size = STRIPSZ; + ctx.imgs.motionsize = STRIPSZ; + + /* For the purposes of the routine, smartmask is zero or nonzero: */ + for (iter_smartmask = 0; iter_smartmask < 2; iter_smartmask++) { + memset(smartmask_final, iter_smartmask, ctx.imgs.size); + + /* For the purposes of the routine, out is zero or nonzero: */ + for (iter_out = 0; iter_out < 2; iter_out++) { + memset(out, iter_out, ctx.imgs.size); + + for (iter_image_virgin = 0; iter_image_virgin < 256; iter_image_virgin++) { + for (i = 0; i < ctx.imgs.size; i++) { + image_virgin[i] = iter_image_virgin + i; + } + /* ref_dyn has a limited range: */ + for (iter_ref_dyn = 0; iter_ref_dyn < 10; iter_ref_dyn++) { + for (i = 0; i < ctx.imgs.size; i++) { + ref_dyn[i] = iter_ref_dyn + i; + } + for (iter_ref = 0; iter_ref < 256; iter_ref++) { + for (i = 0; i < ctx.imgs.size; i++) { + ref[i] = iter_ref + i; + } + /* For this permutation, check that both functions + * return the same output data: */ + if (equal_output(&ctx, action, func_a, func_b) == 0) { + printf("Functions do NOT match!\n"); + return; + } + } + } + } + } + } + printf("Functions MATCH\n"); +} + +static void +timing (char *name, struct context *ctx, int action, void (*func)(struct context *, int)) +{ + int i; + float total_time = 0.0f; + + printf("---\n%s\n", name); + clean(ctx); + + for (i = 300; i > 0; i--) { + timer_start(); + func(ctx, action); + timer_stop(); + total_time += timer_sec(); + } + + /* Print bogus value to prevent the loop from being optimized out: */ + printf("Value: %d\nTime: %.4f sec\n", 
ctx->imgs.ref[0], total_time); +} + +#define UPDATE_REF_FRAME 1 +#define ACCEPT_STATIC_OBJECT_TIME 10 /* Seconds */ +#define EXCLUDE_LEVEL_PERCENT 20 + +#include "../alg_update_reference_frame.plain.c" +#include "../alg_update_reference_frame.sse2-algo.c" +#include "../alg_update_reference_frame.sse2.c" + +int +main () +{ + struct context ctx; + + init(&ctx); + + timing("plain", &ctx, UPDATE_REF_FRAME, alg_update_reference_frame_plain); + timing("plain, SSE2 algorithm demo", &ctx, UPDATE_REF_FRAME, alg_update_reference_frame_sse2_algo); + timing("SSE2", &ctx, UPDATE_REF_FRAME, alg_update_reference_frame_sse2); + + permutate(UPDATE_REF_FRAME, alg_update_reference_frame_plain, alg_update_reference_frame_sse2_algo); + permutate(UPDATE_REF_FRAME, alg_update_reference_frame_plain, alg_update_reference_frame_sse2); + + free(ctx.imgs.ref); + free(ctx.imgs.out); + free(ctx.imgs.ref_dyn); + free(ctx.imgs.image_virgin); + free(ctx.imgs.smartmask_final); + + return 0; +} diff --git a/alg/tests/timer.c b/alg/tests/timer.c new file mode 100644 index 0000000..10e64a5 --- /dev/null +++ b/alg/tests/timer.c @@ -0,0 +1,35 @@ +#define _POSIX_C_SOURCE 199309L + +#include <time.h> /* clock_gettime, struct timespec, CLOCK_MONOTONIC */ + +/* This is not threadsafe at all, but that's fine for our purposes. 
*/ + +static struct timespec start; +static struct timespec end; + +void +timer_start () +{ + clock_gettime(CLOCK_MONOTONIC, &start); +} + +void +timer_stop () +{ + clock_gettime(CLOCK_MONOTONIC, &end); +} + +float +timer_sec () +{ + struct timespec temp; + + if ((end.tv_nsec - start.tv_nsec) < 0) { + temp.tv_sec = end.tv_sec - start.tv_sec - 1; + temp.tv_nsec = 1000000000 + end.tv_nsec - start.tv_nsec; + } else { + temp.tv_sec = end.tv_sec - start.tv_sec; + temp.tv_nsec = end.tv_nsec - start.tv_nsec; + } + return (float)(temp.tv_sec + ((float)temp.tv_nsec / 1000000000.0)); +} diff --git a/alg/tests/timer.h b/alg/tests/timer.h new file mode 100644 index 0000000..8c90baf --- /dev/null +++ b/alg/tests/timer.h @@ -0,0 +1,3 @@ +void timer_start (); +void timer_stop (); +float timer_sec (); diff --git a/motion.h b/motion.h index c08d84f..9c12255 100644 --- a/motion.h +++ b/motion.h @@ -289,7 +289,7 @@ struct images { unsigned char *ref; /* The reference frame */ unsigned char *out; /* Picture buffer for motion images */ - int *ref_dyn; /* Dynamic objects to be excluded from reference frame */ + uint16_t *ref_dyn; /* Dynamic objects to be excluded from reference frame */ unsigned char *image_virgin; /* Last picture frame with no text or locate overlay */ struct image_data preview_image; /* Picture buffer for best image when enables */ unsigned char *mask; /* Buffer for the mask file */