diff --git a/alg.c b/alg.c
index 93c260f..1692537 100644
--- a/alg.c
+++ b/alg.c
@@ -14,6 +14,12 @@
 #include "mmx.h"
 #endif
 
+#ifdef __SSE2__
+#define HAVE_SSE2
+#include <emmintrin.h>
+#include "alg/sse2.h"
+#endif
+
 #define MAX2(x, y) ((x) > (y) ? (x) : (y))
 #define MAX3(x, y, z) ((x) > (y) ? ((x) > (z) ? (x) : (z)) : ((y) > (z) ? (y) : (z)))
 
@@ -352,42 +358,23 @@ void alg_draw_red_location(struct coord *cent, struct images *imgs, int width, u
 #define DIFF(x, y)         (ABS((x)-(y)))
 #define NDIFF(x, y)        (ABS(x) * NORM / (ABS(x) + 2 * DIFF(x, y)))
 
+#ifdef HAVE_SSE2
+#include "alg/alg_noise_tune.sse2.c"
+#else
+#include "alg/alg_noise_tune.plain.c"
+#endif
+
 /**
  * alg_noise_tune
  *
  */
 void alg_noise_tune(struct context *cnt, unsigned char *new)
 {
-    struct images *imgs = &cnt->imgs;
-    int i;
-    unsigned char *ref = imgs->ref;
-    int diff, sum = 0, count = 0;
-    unsigned char *mask = imgs->mask;
-    unsigned char *smartmask = imgs->smartmask_final;
-
-    i = imgs->motionsize;
-            
-    for (; i > 0; i--) {
-        diff = ABS(*ref - *new);
-
-        if (mask)
-            diff = ((diff * *mask++) / 255);
-
-        if (*smartmask) {
-            sum += diff + 1;
-            count++;
-        }
-
-        ref++;
-        new++;
-        smartmask++;
-    }
-
-    if (count > 3)  /* Avoid divide by zero. */
-        sum /= count / 3;
-    
-    /* 5: safe, 4: regular, 3: more sensitive */
-    cnt->noise = 4 + (cnt->noise + sum) / 2;
+#ifdef HAVE_SSE2  /* Compile-time dispatch: HAVE_SSE2 builds call the vectorized routine. */
+    alg_noise_tune_sse2(cnt, new);
+#else             /* Otherwise use the portable C implementation. */
+    alg_noise_tune_plain(cnt, new);
+#endif /* HAVE_SSE2 */
 }
 
 /**
@@ -1301,6 +1288,15 @@ int alg_switchfilter(struct context *cnt, int diffs, unsigned char *newimg)
     return 0;
 }
 
+#define ACCEPT_STATIC_OBJECT_TIME 10  /* Seconds */
+#define EXCLUDE_LEVEL_PERCENT 20
+
+#ifdef HAVE_SSE2
+#include "alg/alg_update_reference_frame.sse2.c"
+#else
+#include "alg/alg_update_reference_frame.plain.c"
+#endif
+
 /** 
  * alg_update_reference_frame
  *
@@ -1314,55 +1310,11 @@ int alg_switchfilter(struct context *cnt, int diffs, unsigned char *newimg)
  *   action - UPDATE_REF_FRAME or RESET_REF_FRAME
  *
  */
-#define ACCEPT_STATIC_OBJECT_TIME 10  /* Seconds */
-#define EXCLUDE_LEVEL_PERCENT 20
 void alg_update_reference_frame(struct context *cnt, int action) 
 {
-    int accept_timer = cnt->lastrate * ACCEPT_STATIC_OBJECT_TIME;
-    int i, threshold_ref;
-    int *ref_dyn = cnt->imgs.ref_dyn;
-    unsigned char *image_virgin = cnt->imgs.image_virgin;
-    unsigned char *ref = cnt->imgs.ref;
-    unsigned char *smartmask = cnt->imgs.smartmask_final;
-    unsigned char *out = cnt->imgs.out;
-
-    if (cnt->lastrate > 5)  /* Match rate limit */
-        accept_timer /= (cnt->lastrate / 3);
-
-    if (action == UPDATE_REF_FRAME) { /* Black&white only for better performance. */
-        threshold_ref = cnt->noise * EXCLUDE_LEVEL_PERCENT / 100;
-
-        for (i = cnt->imgs.motionsize; i > 0; i--) {
-            /* Exclude pixels from ref frame well below noise level. */
-            if (((int)(abs(*ref - *image_virgin)) > threshold_ref) && (*smartmask)) {
-                if (*ref_dyn == 0) { /* Always give new pixels a chance. */
-                    *ref_dyn = 1;
-                } else if (*ref_dyn > accept_timer) { /* Include static Object after some time. */
-                    *ref_dyn = 0;
-                    *ref = *image_virgin;
-                } else if (*out) {
-                    (*ref_dyn)++; /* Motionpixel? Keep excluding from ref frame. */
-                } else {
-                    *ref_dyn = 0; /* Nothing special - release pixel. */
-                    *ref = (*ref + *image_virgin) / 2;
-                }
-
-            } else {  /* No motion: copy to ref frame. */
-                *ref_dyn = 0; /* Reset pixel */
-                *ref = *image_virgin;
-            }
-
-            ref++;
-            image_virgin++;
-            smartmask++;
-            ref_dyn++;
-            out++;
-        } /* end for i */
-
-    } else {   /* action == RESET_REF_FRAME - also used to initialize the frame at startup. */
-        /* Copy fresh image */
-        memcpy(cnt->imgs.ref, cnt->imgs.image_virgin, cnt->imgs.size);
-        /* Reset static objects */
-        memset(cnt->imgs.ref_dyn, 0, cnt->imgs.motionsize * sizeof(cnt->imgs.ref_dyn)); 
-    }
+#ifdef HAVE_SSE2  /* Compile-time dispatch: HAVE_SSE2 builds call the vectorized routine. */
+    alg_update_reference_frame_sse2(cnt, action);
+#else             /* Otherwise use the portable C implementation. */
+    alg_update_reference_frame_plain(cnt, action);
+#endif /* HAVE_SSE2 */
 }
diff --git a/alg/alg_noise_tune.plain.c b/alg/alg_noise_tune.plain.c
new file mode 100644
index 0000000..ddb861e
--- /dev/null
+++ b/alg/alg_noise_tune.plain.c
@@ -0,0 +1,36 @@
+/**
+ * alg_noise_tune_plain
+ *   Portable C version: updates cnt->noise from abs(ref - new) over the pixels whose smartmask is nonzero.
+ */
+static void alg_noise_tune_plain(struct context *cnt, unsigned char *new)
+{
+    struct images *imgs = &cnt->imgs;
+    unsigned char *ref = imgs->ref;
+    unsigned int sum = 0, count = 0;
+    unsigned char *mask = imgs->mask;
+    unsigned char *smartmask = imgs->smartmask_final;
+
+    int i = imgs->motionsize;
+
+    for (; i > 0; i--) {
+        unsigned char absdiff = (*ref > *new) ? (*ref - *new) : (*new - *ref);
+
+        if (mask)  /* Optional mask image: scale the difference by mask / 255. */
+            absdiff = ((absdiff * *mask++) / 255);
+
+        if (*smartmask) {  /* Only pixels with a nonzero smartmask are summed and counted. */
+            sum += absdiff + 1;
+            count++;
+        }
+
+        ref++;
+        new++;
+        smartmask++;
+    }
+
+    if (count > 3)  /* Avoid divide by zero. */
+        sum /= count / 3;
+
+    /* 5: safe, 4: regular, 3: more sensitive */
+    cnt->noise = 4 + (cnt->noise + sum) / 2;
+}
diff --git a/alg/alg_noise_tune.sse2.c b/alg/alg_noise_tune.sse2.c
new file mode 100644
index 0000000..9c6f042
--- /dev/null
+++ b/alg/alg_noise_tune.sse2.c
@@ -0,0 +1,129 @@
+/**
+ * alg_noise_tune_sse2
+ *   SSE2 version of alg_noise_tune_plain: 16 pixels per round, bytewise code for the remainder.
+ */
+static void alg_noise_tune_sse2(struct context *cnt, unsigned char *new)
+{
+    struct images *imgs = &cnt->imgs;
+    unsigned char *ref = imgs->ref;
+    unsigned int sum = 0, count = 0;
+    unsigned char *mask = imgs->mask;
+    unsigned char *smartmask = imgs->smartmask_final;
+
+    int j, i = imgs->motionsize;
+
+    int sse_iters;
+    __m128i maskrow, zeromask;
+    __m128i alo, ahi;
+    __m128i ones = _mm_set1_epi8(1);
+    __m128i sum16lo = _mm_setzero_si128();
+    __m128i sum16hi = _mm_setzero_si128();
+    __m128i sum32 = _mm_setzero_si128();
+    __m128i count8 = _mm_setzero_si128();
+    uint32_t total[4];
+    uint8_t counts[16] __attribute__((aligned(16)));
+
+    /* SSE reads 16 bytes at a time; truncating division: */
+    for (sse_iters = i >> 4; sse_iters > 0; sse_iters--)
+    {
+        /* Load 16 bytes from images. Addresses need not be 16-byte aligned: */
+        __m128i refrow = _mm_loadu_si128((__m128i *)ref);
+        __m128i newrow = _mm_loadu_si128((__m128i *)new);
+
+        /* Calculate absolute difference per byte: abs(ref - new): */
+        __m128i absdiff = _mm_absdiff_epu8(refrow, newrow);
+
+        /* If there is a mask image, alpha blend the absdiff by its pixels: */
+        if (mask)
+        {
+            /* Load mask image data: */
+            maskrow = _mm_loadu_si128((__m128i *)mask);
+            mask += 16;
+
+            /* "Alpha blend" absdiff with mask, absdiff *= (mask / 255): */
+            absdiff = _mm_scale_epu8(absdiff, maskrow);
+        }
+        /* The "+ 1" per counted pixel is added after the loop as "+ count";
+         * a saturating 8-bit add here would clip a difference of 255 to 255: */
+
+        /* Fetch the smartmask values: */
+        maskrow = _mm_loadu_si128((__m128i *)smartmask);
+
+        /* Set diff values to 0 where smartmask is 0: */
+        zeromask = _mm_cmpeq_epi8(maskrow, _mm_setzero_si128());
+        absdiff = _mm_andnot_si128(zeromask, absdiff);
+
+        /* Increment count for every nonzero value of smartmask: */
+        count8 = _mm_adds_epu8(count8, _mm_andnot_si128(zeromask, ones));
+
+        /* Split 16 bytes of sum into 16x16-bit values:
+         * 0 . 1 . 2 . 3 . 4 . 5 . 6 . 7 .
+         * 8 . 9 . A . B . C . D . E . F .
+         */
+        sse_u8_to_u16(absdiff, &alo, &ahi);
+        sum16lo = _mm_adds_epu16(sum16lo, alo);
+        sum16hi = _mm_adds_epu16(sum16hi, ahi);
+
+        /* Offload these 16-bit counters into a 32-bit counter at least once
+         * every 128 rounds to prevent overflow:
+         * Also do this in the last iteration to empty out the counters: */
+        if (!(sse_iters & 0x7F) || sse_iters == 1)
+        {
+            /* Split these two into 4x32 bits and do 32-bit additions:
+             * 0 . . . 1 . . . 2 . . . 3 . . . +
+             * 4 . . . 5 . . . 6 . . . 7 . . . +
+             * 8 . . . 9 . . . A . . . B . . . +
+             * C . . . D . . . E . . . F . . .
+             * Add all of these to the running sum: */
+
+            sse_u16_to_u32(sum16lo, &alo, &ahi);
+            sum32 = _mm_add_epi32(sum32, _mm_add_epi32(alo, ahi));
+
+            sse_u16_to_u32(sum16hi, &alo, &ahi);
+            sum32 = _mm_add_epi32(sum32, _mm_add_epi32(alo, ahi));
+
+            sum16lo = _mm_setzero_si128();
+            sum16hi = _mm_setzero_si128();
+
+            _mm_store_si128((__m128i *)counts, count8);
+            for (j = 0; j < 16; j++) {
+                count += counts[j];
+            }
+            count8 = _mm_setzero_si128();
+        }
+
+        ref += 16;
+        new += 16;
+        smartmask += 16;
+    }
+    /* Outside the hot loop, write out the running sum to memory and add the
+     * four uint32's, plus the deferred "+ 1" for each pixel counted so far: */
+    _mm_storeu_si128((__m128i *)&total, sum32);
+    sum = total[0] + total[1] + total[2] + total[3] + count;
+
+    /* We handled all 16-byte blocks. Truncate i to its value mod 16, so that
+     * the regular bytewise code can handle the remainder: */
+    i &= 0x0F;
+
+    for (; i > 0; i--) {
+        unsigned char absdiff = (*ref > *new) ? (*ref - *new) : (*new - *ref);
+
+        if (mask)
+            absdiff = ((absdiff * *mask++) / 255);
+
+        if (*smartmask) {
+            sum += absdiff + 1;
+            count++;
+        }
+
+        ref++;
+        new++;
+        smartmask++;
+    }
+
+    if (count > 3)  /* Avoid divide by zero. */
+        sum /= count / 3;
+
+    /* 5: safe, 4: regular, 3: more sensitive */
+    cnt->noise = 4 + (cnt->noise + sum) / 2;
+}
diff --git a/alg/alg_update_reference_frame.plain.c b/alg/alg_update_reference_frame.plain.c
new file mode 100644
index 0000000..71c0ab1
--- /dev/null
+++ b/alg/alg_update_reference_frame.plain.c
@@ -0,0 +1,53 @@
+static void alg_update_reference_frame_plain(struct context *cnt, int action)  /* Portable C version. */
+{
+    int accept_timer = cnt->lastrate * ACCEPT_STATIC_OBJECT_TIME;
+    int i, threshold_ref;
+    uint16_t *ref_dyn = cnt->imgs.ref_dyn;  /* Per-pixel counter; nonzero while a pixel is kept out of ref. */
+    unsigned char *image_virgin = cnt->imgs.image_virgin;
+    unsigned char *ref = cnt->imgs.ref;
+    unsigned char *smartmask = cnt->imgs.smartmask_final;
+    unsigned char *out = cnt->imgs.out;
+
+    if (cnt->lastrate > 5)  /* Match rate limit */
+        accept_timer /= (cnt->lastrate / 3);
+
+    if (action == UPDATE_REF_FRAME) { /* Black&white only for better performance. */
+        threshold_ref = cnt->noise * EXCLUDE_LEVEL_PERCENT / 100;  /* EXCLUDE_LEVEL_PERCENT percent of cnt->noise. */
+
+        for (i = cnt->imgs.motionsize; i > 0; i--) {
+            int thresholdmask = ((int)(abs(*ref - *image_virgin)) > threshold_ref);
+            int includemask = (thresholdmask && (*smartmask != 0));
+
+            /* Pixel differs from ref by more than the threshold and its smartmask is nonzero: */
+            if (includemask) {
+                if (*ref_dyn == 0) { /* Always give new pixels a chance. */
+                    *ref_dyn = 1;
+                } else if (*ref_dyn > accept_timer) { /* Include static Object after some time. */
+                    *ref_dyn = 0;
+                    *ref = *image_virgin;
+                } else if (*out) {
+                    (*ref_dyn)++; /* Motionpixel? Keep excluding from ref frame. */
+                } else {
+                    *ref_dyn = 0; /* Nothing special - release pixel. */
+                    *ref = (*ref + *image_virgin) / 2;  /* Truncating average of old ref and new pixel. */
+                }
+
+            } else {  /* No motion: copy to ref frame. */
+                *ref_dyn = 0; /* Reset pixel */
+                *ref = *image_virgin;
+            }
+
+            ref++;
+            image_virgin++;
+            smartmask++;
+            ref_dyn++;
+            out++;
+        } /* end for i */
+
+    } else {   /* action == RESET_REF_FRAME - also used to initialize the frame at startup. */
+        /* Copy fresh image */
+        memcpy(cnt->imgs.ref, cnt->imgs.image_virgin, cnt->imgs.size);
+        /* Reset static objects */
+        memset(cnt->imgs.ref_dyn, 0, cnt->imgs.motionsize * sizeof(*cnt->imgs.ref_dyn));
+    }
+}
diff --git a/alg/alg_update_reference_frame.sse2-algo.c b/alg/alg_update_reference_frame.sse2-algo.c
new file mode 100644
index 0000000..b8fbef8
--- /dev/null
+++ b/alg/alg_update_reference_frame.sse2-algo.c
@@ -0,0 +1,55 @@
+/* This file is not meant to be included into the main program; it's intended
+ * to showcase, benchmark and test the algorithm used in the SSE2 version of
+ * this routine, in simple, non-vectorized code.
+ * The idea is to replace all conditionals from the "plain" function with a
+ * series of mask operations. This is slow when done per pixel (since we do all
+ * calculations for all pixels), but fast in parallel.
+ */
+static void alg_update_reference_frame_sse2_algo(struct context *cnt, int action)
+{
+    int accept_timer = cnt->lastrate * ACCEPT_STATIC_OBJECT_TIME;
+    int i, threshold_ref;
+    uint16_t *ref_dyn = cnt->imgs.ref_dyn;
+    unsigned char *image_virgin = cnt->imgs.image_virgin;
+    unsigned char *ref = cnt->imgs.ref;
+    unsigned char *smartmask = cnt->imgs.smartmask_final;
+    unsigned char *out = cnt->imgs.out;
+
+    if (cnt->lastrate > 5)  /* Match rate limit */
+        accept_timer /= (cnt->lastrate / 3);
+
+    if (action == UPDATE_REF_FRAME) { /* Black&white only for better performance. */
+        threshold_ref = cnt->noise * EXCLUDE_LEVEL_PERCENT / 100;
+
+        for (i = cnt->imgs.motionsize; i > 0; i--) {
+            int thresholdmask = ((int)(abs(*ref - *image_virgin)) > threshold_ref);
+            int includemask = (thresholdmask && !(*smartmask == 0));
+            int refdynzero = (*ref_dyn == 0);
+            int refdyntimer = (*ref_dyn > accept_timer);
+            int outzero = (*out == 0);
+
+            *ref_dyn &= -(includemask && !(refdynzero || refdyntimer || outzero)); /* Negate: 0 or all-ones mask, not 0 or 1. */
+
+            if (includemask && !(refdynzero || refdyntimer) && outzero) {
+                *ref = (*ref + *image_virgin) / 2;
+            }
+            if (includemask && !((refdyntimer || outzero) && !refdynzero)) {
+                *ref_dyn += 1;
+            }
+            if (!(includemask && !(refdyntimer && !refdynzero))) {
+                *ref = *image_virgin;
+            }
+            ref++;
+            image_virgin++;
+            smartmask++;
+            ref_dyn++;
+            out++;
+        } /* end for i */
+
+    } else {   /* action == RESET_REF_FRAME - also used to initialize the frame at startup. */
+        /* Copy fresh image */
+        memcpy(cnt->imgs.ref, cnt->imgs.image_virgin, cnt->imgs.size);
+        /* Reset static objects */
+        memset(cnt->imgs.ref_dyn, 0, cnt->imgs.motionsize * sizeof(*cnt->imgs.ref_dyn));
+    }
+}
diff --git a/alg/alg_update_reference_frame.sse2.c b/alg/alg_update_reference_frame.sse2.c
new file mode 100644
index 0000000..4ce6668
--- /dev/null
+++ b/alg/alg_update_reference_frame.sse2.c
@@ -0,0 +1,142 @@
+/* The basic algorithm is demonstrated in 'alg_update_reference_frame.sse2-algo.c'
+ *  as regular (non-SIMD), more readable code. Comments below allude to
+ *  snippets from that file. The idea is to use masks instead of
+ *  branches to compose the output, then do it in parallel. */
+
+static void alg_update_reference_frame_sse2(struct context *cnt, int action)
+{
+    int accept_timer = cnt->lastrate * ACCEPT_STATIC_OBJECT_TIME;
+    int i, threshold_ref;
+    uint16_t *ref_dyn = cnt->imgs.ref_dyn;
+    unsigned char *image_virgin = cnt->imgs.image_virgin;
+    unsigned char *ref = cnt->imgs.ref;
+    unsigned char *smartmask = cnt->imgs.smartmask_final;
+    unsigned char *out = cnt->imgs.out;
+
+    int sse_iters;
+    __m128i threshrow, accepttimerrow, mask;
+
+    if (cnt->lastrate > 5)  /* Match rate limit */
+        accept_timer /= (cnt->lastrate / 3);
+
+    if (action == UPDATE_REF_FRAME) { /* Black&white only for better performance. */
+        threshold_ref = cnt->noise * EXCLUDE_LEVEL_PERCENT / 100;
+
+        i = cnt->imgs.motionsize;
+
+        /* Below we'll do a calculation to see whether our 8-bit uints
+         * are *larger* than threshold_ref. Threshold_ref is an int, but
+         * for the purposes of this check we can cast it to an 8-bit uint
+         * and clamp it to 255; the comparator can never exceed that value: */
+        threshrow = _mm_set1_epi8((threshold_ref > 0xFF) ? 0xFF : threshold_ref);
+
+        /* Create a row of 8 uint16_t's with almost clamped accept timer: */
+        accepttimerrow = _mm_set1_epi16((accept_timer > 0xFFFE) ? 0xFFFE : accept_timer);
+
+        /* SSE row size is 16 bytes: */
+        for (sse_iters = i >> 4; sse_iters > 0; sse_iters--)
+        {
+            /* Load reference row and virgin image: */
+            __m128i refrow = _mm_loadu_si128((__m128i *)ref);
+            __m128i vgnrow = _mm_loadu_si128((__m128i *)image_virgin);
+
+            /* int thresholdmask = ((int)(abs(*ref - *image_virgin)) > threshold_ref); */
+            __m128i thresholdmask = _mm_cmpgt_epu8(_mm_absdiff_epu8(refrow, vgnrow), threshrow);
+
+            /* int includemask = (thresholdmask && !(*smartmask == 0)); */
+            __m128i smartmaskzero = _mm_cmpeq_epi8(_mm_loadu_si128((__m128i *)smartmask), _mm_setzero_si128());
+            __m128i includemask = _mm_andnot_si128(smartmaskzero, thresholdmask);
+
+            /* Load the two ref_dyn's: */
+            __m128i refdynlo = _mm_loadu_si128((__m128i *)(ref_dyn + 0));
+            __m128i refdynhi = _mm_loadu_si128((__m128i *)(ref_dyn + 8));
+
+            /* int refdynzero = (*ref_dyn == 0); */
+            /* Make an 8-bit mask with 0xFF where ref_dyn == 0: */
+            __m128i refdynzero = _mm_packs_epi16(
+                _mm_cmpeq_epi16(refdynlo, _mm_setzero_si128()),
+                _mm_cmpeq_epi16(refdynhi, _mm_setzero_si128())
+            );
+
+            /* int refdyntimer = (*ref_dyn > accept_timer); */
+            /* Make an 8-bit mask with 0xFF where ref_dyn > accept_timer: */
+            __m128i refdyntimer = _mm_packs_epi16(
+                _mm_cmpgt_epu16(refdynlo, accepttimerrow),
+                _mm_cmpgt_epu16(refdynhi, accepttimerrow)
+            );
+
+            /* int outzero = (*out == 0); */
+            __m128i outzero = _mm_cmpeq_epi8(_mm_loadu_si128((__m128i *)out), _mm_setzero_si128());
+
+            /* *ref_dyn &= (includemask && !(refdynzero || refdyntimer || outzero)); */
+            mask = _mm_andnot_si128(_mm_or_si128(_mm_or_si128(refdynzero, refdyntimer), outzero), includemask);
+
+            /* Duplicate mask to 16-bit widths: */
+            refdynlo = _mm_and_si128(refdynlo, _mm_unpacklo_epi8(mask, mask));
+            refdynhi = _mm_and_si128(refdynhi, _mm_unpackhi_epi8(mask, mask));
+
+            /* if (includemask && !(refdynzero || refdyntimer) && outzero) *ref = (*ref + *image_virgin) / 2; (_mm_avg_epu8 rounds up, so subtract (ref ^ vgn) & 1 to truncate like the C code) */
+            mask = _mm_and_si128(_mm_andnot_si128(_mm_or_si128(refdynzero, refdyntimer), includemask), outzero);
+            refrow = _mm_blendv_si128(refrow, _mm_sub_epi8(_mm_avg_epu8(refrow, vgnrow), _mm_and_si128(_mm_xor_si128(refrow, vgnrow), _mm_set1_epi8(1))), mask);
+
+            /* if (includemask && !((refdyntimer || outzero) && !refdynzero)) *ref_dyn += 1; */
+            mask = _mm_andnot_si128(_mm_andnot_si128(refdynzero, _mm_or_si128(refdyntimer, outzero)), includemask);
+            refdynlo = _mm_adds_epu16(refdynlo, _mm_and_si128(_mm_set1_epi16(1), _mm_unpacklo_epi8(mask, mask)));
+            refdynhi = _mm_adds_epu16(refdynhi, _mm_and_si128(_mm_set1_epi16(1), _mm_unpackhi_epi8(mask, mask)));
+
+            /* Store the two ref dyn's back: */
+            _mm_storeu_si128((__m128i *)(ref_dyn + 0), refdynlo);
+            _mm_storeu_si128((__m128i *)(ref_dyn + 8), refdynhi);
+
+            /* if (!(includemask && !(refdyntimer && !refdynzero))) *ref = *image_virgin; */
+            mask = _mm_andnot_si128(_mm_andnot_si128(refdynzero, refdyntimer), includemask);
+            refrow = _mm_blendv_si128(vgnrow, refrow, mask);
+
+            /* Store ref back: */
+            _mm_storeu_si128((__m128i *)ref, refrow);
+
+            ref += 16;
+            image_virgin += 16;
+            smartmask += 16;
+            ref_dyn += 16;
+            out += 16;
+        }
+
+        /* Let the bytewise code handle the remaining bytes: */
+        for (i = cnt->imgs.motionsize & 0x0F; i > 0; i--) {
+            int thresholdmask = ((int)(abs(*ref - *image_virgin)) > threshold_ref);
+            int includemask = (thresholdmask && (*smartmask != 0));
+
+            /* Exclude pixels from ref frame well below noise level. */
+            if (includemask) {
+                if (*ref_dyn == 0) { /* Always give new pixels a chance. */
+                    *ref_dyn = 1;
+                } else if (*ref_dyn > accept_timer) { /* Include static Object after some time. */
+                    *ref_dyn = 0;
+                    *ref = *image_virgin;
+                } else if (*out) {
+                    (*ref_dyn)++; /* Motionpixel? Keep excluding from ref frame. */
+                } else {
+                    *ref_dyn = 0; /* Nothing special - release pixel. */
+                    *ref = (*ref + *image_virgin) / 2;
+                }
+
+            } else {  /* No motion: copy to ref frame. */
+                *ref_dyn = 0; /* Reset pixel */
+                *ref = *image_virgin;
+            }
+
+            ref++;
+            image_virgin++;
+            smartmask++;
+            ref_dyn++;
+            out++;
+        } /* end for i */
+
+    } else {   /* action == RESET_REF_FRAME - also used to initialize the frame at startup. */
+        /* Copy fresh image */
+        memcpy(cnt->imgs.ref, cnt->imgs.image_virgin, cnt->imgs.size);
+        /* Reset static objects */
+        memset(cnt->imgs.ref_dyn, 0, cnt->imgs.motionsize * sizeof(*cnt->imgs.ref_dyn));
+    }
+}
diff --git a/alg/sse2.h b/alg/sse2.h
new file mode 100644
index 0000000..f7e5a64
--- /dev/null
+++ b/alg/sse2.h
@@ -0,0 +1,89 @@
+static __inline __m128i
+_mm_cmpgt_epu8 (__m128i x, __m128i y)
+{
+    /* Unsigned 8-bit compare, 0xFF where x > y: (max(x, y) == x) AND NOT (x == y): */
+    return _mm_andnot_si128(
+        _mm_cmpeq_epi8(x, y),
+        _mm_cmpeq_epi8(_mm_max_epu8(x, y), x)
+    );
+}
+
+static __inline __m128i
+_mm_cmple_epu16 (__m128i x, __m128i y)
+{
+    /* Unsigned 16-bit compare, 0xFFFF where x <= y: the saturating x - y is 0 exactly in those lanes: */
+    return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128());
+}
+
+static __inline __m128i
+_mm_cmpgt_epu16 (__m128i x, __m128i y)
+{
+    /* Unsigned 16-bit compare, 0xFFFF where x > y, i.e. (y <= x) AND NOT (x == y): */
+    return _mm_andnot_si128(_mm_cmpeq_epi16(x, y), _mm_cmple_epu16(y, x));
+}
+
+static __inline __m128i
+_mm_absdiff_epu8 (__m128i x, __m128i y)
+{
+    /* Per-byte abs(x - y): one of the two saturating differences is 0, so the OR yields the other: */
+    return _mm_or_si128(_mm_subs_epu8(x, y), _mm_subs_epu8(y, x));
+}
+
+static __inline __m128i
+_mm_blendv_si128 (__m128i x, __m128i y, __m128i mask)
+{
+    /* Bitwise select: take the bit from y where the matching mask bit is set, else the bit from x: */
+    return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(mask, y));
+}
+
+static __inline __m128i
+_mm_div255_epu16 (__m128i x)
+{
+    /* Divide 8 16-bit uints by 255 using only adds and shifts:
+     * x := ((x + 1) + (x >> 8)) >> 8; both additions saturate rather than wrap: */
+    return _mm_srli_epi16(_mm_adds_epu16(
+        _mm_adds_epu16(x, _mm_set1_epi16(1)),
+        _mm_srli_epi16(x, 8)), 8);
+}
+
+static __inline void
+sse_u8_to_u16 (__m128i in, __m128i *__restrict lo, __m128i *__restrict hi)
+{
+    /* Zero-extend an 8-bit vector to two 16-bit vectors (*lo: bytes 0-7, *hi: bytes 8-15): */
+    *lo = _mm_unpacklo_epi8(in, _mm_setzero_si128());
+    *hi = _mm_unpackhi_epi8(in, _mm_setzero_si128());
+}
+
+static __inline void
+sse_u16_to_u32 (__m128i in, __m128i *__restrict lo, __m128i *__restrict hi)
+{
+    /* Zero-extend a 16-bit vector to two 32-bit vectors (*lo: words 0-3, *hi: words 4-7): */
+    *lo = _mm_unpacklo_epi16(in, _mm_setzero_si128());
+    *hi = _mm_unpackhi_epi16(in, _mm_setzero_si128());
+}
+
+static __inline __m128i
+_mm_scale_epu8 (__m128i x, __m128i y)
+{
+    /* Returns an "alpha blend" of x with y, per unsigned byte;
+     *   x := x * (y / 255)
+     * Reorder: x := (x * y) / 255, computed in 16-bit lanes
+     */
+    __m128i xlo, xhi;
+    __m128i ylo, yhi;
+
+    /* Unpack x and y into 16-bit uints: */
+    sse_u8_to_u16(x, &xlo, &xhi);
+    sse_u8_to_u16(y, &ylo, &yhi);
+
+    /* Multiply x with y, keeping the low 16 bits (255 * 255 still fits): */
+    xlo = _mm_mullo_epi16(xlo, ylo);
+    xhi = _mm_mullo_epi16(xhi, yhi);
+
+    /* Divide by 255: */
+    xlo = _mm_div255_epu16(xlo);
+    xhi = _mm_div255_epu16(xhi);
+
+    /* Repack the 16-bit uints to 8-bit values (unsigned saturating pack): */
+    return _mm_packus_epi16(xlo, xhi);
+}
diff --git a/alg/tests/Makefile b/alg/tests/Makefile
new file mode 100644
index 0000000..a02b4f5
--- /dev/null
+++ b/alg/tests/Makefile
@@ -0,0 +1,28 @@
+CFLAGS += -std=c89 -Werror -Wall -Wextra -pedantic -msse2 -O3
+LDLIBS += -lrt
+
+.PHONY: all clean
+
+all: test_alg_noise_tune test_alg_update_reference_frame
+
+test_alg_noise_tune: test_alg_noise_tune.o timer.o
+	$(CC) $(LDFLAGS) -o $@ $^ $(LDLIBS)
+
+test_alg_noise_tune.o: ../alg_noise_tune.plain.c ../alg_noise_tune.sse2.c test_alg_noise_tune.c
+	$(CC) $(CFLAGS) -o $@ -c test_alg_noise_tune.c
+
+test_alg_update_reference_frame: test_alg_update_reference_frame.o timer.o
+	$(CC) $(LDFLAGS) -o $@ $^ $(LDLIBS)
+
+test_alg_update_reference_frame.o: ../alg_update_reference_frame.plain.c ../alg_update_reference_frame.sse2-algo.c ../alg_update_reference_frame.sse2.c test_alg_update_reference_frame.c
+	$(CC) $(CFLAGS) -o $@ -c test_alg_update_reference_frame.c
+
+timer.o: timer.c
+	$(CC) $(CFLAGS) -o $@ -c $^
+
+# This one is just for curiosity (assembly listing of the test program):
+test_alg_update_reference_frame.s: test_alg_update_reference_frame.c
+	$(CC) $(CFLAGS) -S -o $@ -c $^
+
+clean:
+	rm -f *.o *.s test_alg_noise_tune test_alg_update_reference_frame
diff --git a/alg/tests/test_alg_noise_tune.c b/alg/tests/test_alg_noise_tune.c
new file mode 100644
index 0000000..50cf807
--- /dev/null
+++ b/alg/tests/test_alg_noise_tune.c
@@ -0,0 +1,138 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <emmintrin.h>
+
+#include "../sse2.h"
+#include "timer.h"
+
+/* Stub structures for test purposes; only the fields the noise-tune routines read are declared: */
+struct images
+{
+	unsigned char *ref;		/* Reference image */
+	unsigned char *mask;		/* Optional mask image; the routines skip masking when NULL */
+	unsigned char *smartmask_final;	/* Nonzero entries select the pixels that are counted */
+	int motionsize;			/* Number of pixels the routines process */
+};
+
+struct context
+{
+	struct images imgs;	/* Image buffers, see struct images */
+	int noise;		/* Noise level; read and updated by alg_noise_tune_*() */
+};
+
+#define WIDTH    600
+#define HEIGHT   400
+#define BLOCKPX   50
+
+static void
+init (struct context *ctx, unsigned char **new)
+{
+	ctx->imgs.motionsize = WIDTH * HEIGHT;	/* One byte per pixel in every buffer below */
+	ctx->imgs.ref = malloc(ctx->imgs.motionsize);	/* NOTE(review): malloc results are not checked */
+	ctx->imgs.mask = malloc(ctx->imgs.motionsize);
+	ctx->imgs.smartmask_final = malloc(ctx->imgs.motionsize);
+	*new = malloc(ctx->imgs.motionsize);	/* The "new frame" buffer is returned through *new */
+}
+
+static void
+clean (struct context *ctx, unsigned char *new)
+{
+	ctx->noise = 0;	/* Reset the noise level and zero all four image buffers */
+	memset(ctx->imgs.ref, 0, WIDTH * HEIGHT);
+	memset(ctx->imgs.mask, 0, WIDTH * HEIGHT);
+	memset(ctx->imgs.smartmask_final, 0, WIDTH * HEIGHT);
+	memset(new, 0, WIDTH * HEIGHT);
+}
+
+static void
+apply_pattern (unsigned char *pattern, unsigned char *img)
+{
+	int x = 0, y = 0;
+
+	/* Fill img (WIDTH x HEIGHT); each pattern byte covers BLOCKPX * BLOCKPX pixels: */
+	while (y < HEIGHT) {
+		unsigned char *col = pattern;
+		for (x = 0; x < WIDTH; ) {	/* Restart at column 0 on every row */
+			*img++ = *col;
+			if (++x % BLOCKPX == 0) {
+				col++;
+			}
+		}
+		/* After BLOCKPX rows, move to the next row of pattern bytes: */
+		if (++y % BLOCKPX == 0) {
+			pattern += WIDTH / BLOCKPX;
+		}
+	}
+}
+
+static void
+random_patterns (int seed, struct context *ctx, unsigned char *new)
+{
+	int i;
+	unsigned char *c;
+	unsigned char pattern[(HEIGHT * WIDTH) / BLOCKPX];	/* NOTE(review): looks larger than apply_pattern needs - confirm */
+	unsigned char *ptrs[4];
+
+	ptrs[0] = ctx->imgs.ref;	/* The four buffers that each get their own random pattern */
+	ptrs[1] = ctx->imgs.mask;
+	ptrs[2] = ctx->imgs.smartmask_final;
+	ptrs[3] = new;
+
+	srand(seed);	/* Same seed, same rand() sequence for every routine under test */
+
+	for (i = 0; i < 4; i++) {
+		for (c = pattern; c < (pattern + sizeof(pattern)); c++) {
+			*c = rand() / (RAND_MAX / 256);	/* Scale rand() down to roughly one byte */
+		}
+		apply_pattern(pattern, ptrs[i]);
+	}
+}
+
+static void
+testsuite (char *name, struct context *ctx, unsigned char *new, void (*func)(struct context *, unsigned char *))
+{
+	int i;
+
+	printf("---\n%s\n", name);
+	clean(ctx, new);
+
+	timer_start();	/* Timed run: 100 calls on the zeroed buffers */
+	for (i = 100; i > 0; i--) {
+		func(ctx, new);
+	}
+	timer_stop();
+
+	printf("Noise level: %d\nTime: %.4f sec\n", ctx->noise, timer_sec());
+
+	for (i = 100; i > 0; i--) {	/* Output run: 100 seeded random inputs, one noise level printed per seed */
+		clean(ctx, new);
+		random_patterns(i, ctx, new);
+		func(ctx, new);
+		printf("%d ", ctx->noise);
+	}
+	puts("");
+}
+
+#include "../alg_noise_tune.plain.c"
+#include "../alg_noise_tune.sse2.c"
+
+int
+main ()
+{
+	struct context ctx;
+	unsigned char *new;
+
+	init(&ctx, &new);
+
+	testsuite("plain", &ctx, new, alg_noise_tune_plain);	/* Same suite for both implementations */
+	testsuite("sse2", &ctx, new, alg_noise_tune_sse2);
+
+	free(new);	/* Release the buffers allocated by init() */
+	free(ctx.imgs.ref);
+	free(ctx.imgs.mask);
+	free(ctx.imgs.smartmask_final);
+
+	return 0;
+}
diff --git a/alg/tests/test_alg_update_reference_frame.c b/alg/tests/test_alg_update_reference_frame.c
new file mode 100644
index 0000000..218fa31
--- /dev/null
+++ b/alg/tests/test_alg_update_reference_frame.c
@@ -0,0 +1,208 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <emmintrin.h>
+
+#include "../sse2.h"
+#include "timer.h"
+
+/* Stub structures for test purposes: */
+struct images
+{
+	unsigned char *ref;              /* The reference frame */
+	unsigned char *out;              /* Picture buffer for motion images */
+	uint16_t *ref_dyn;               /* Dynamic objects to be excluded from reference frame */
+	unsigned char *image_virgin;     /* Last picture frame with no text or locate overlay */
+	unsigned char *smartmask_final;  /* permutate() only varies this between all-zero and all-nonzero */
+	int size;                        /* Element count equal_output() uses when copying the buffers */
+	int motionsize;                  /* WIDTH * HEIGHT after init()/clean(), STRIPSZ inside permutate() */
+};
+
+struct context
+{
+	struct images imgs;     /* Buffers handed to the functions under test */
+	int noise;              /* Set to 0 by clean() and by permutate() */
+	unsigned int lastrate;  /* Set to 0 by clean() and by permutate() */
+};
+
+#define WIDTH    600
+#define HEIGHT   400
+#define BLOCKPX   50
+
+static void
+init (struct context *ctx)
+{
+	const int px = ctx->imgs.motionsize = WIDTH * HEIGHT;	/* elements per buffer */
+	ctx->imgs.ref = malloc(px);
+	ctx->imgs.out = malloc(px);
+	ctx->imgs.ref_dyn = malloc(px * sizeof(*ctx->imgs.ref_dyn));
+	ctx->imgs.image_virgin = malloc(px);
+	ctx->imgs.smartmask_final = malloc(px);
+}
+
+static void
+clean (struct context *ctx)
+{
+	struct images *im = &ctx->imgs;	/* zero every buffer, reset every scalar */
+	im->size = im->motionsize = WIDTH * HEIGHT;
+	memset(im->ref, 0, im->size);
+	memset(im->out, 0, im->size);
+	memset(im->ref_dyn, 0, im->size * sizeof(*im->ref_dyn));
+	memset(im->image_virgin, 0, im->size);
+	memset(im->smartmask_final, 0, im->size);
+	ctx->noise = 0;
+	ctx->lastrate = 0;
+}
+
+/*
+ * Run func_a and func_b on private deep copies of ctx. Returns 1 when both
+ * leave identical ref and ref_dyn buffers (all imgs.size elements), else 0.
+ */
+static int
+equal_output (struct context *ctx, int action, void (*func_a)(struct context *, int), void (*func_b)(struct context *, int))
+{
+	int i, ret = 1;
+	struct context cxs[2];
+
+	for (i = 0; i < 2; i++) {
+		/* Struct assignment copies scalars and pointers; the pointers are replaced below: */
+		cxs[i] = *ctx;
+		#define CPY(x)  cxs[i].imgs.x = malloc(ctx->imgs.size * sizeof(*ctx->imgs.x)); memcpy(cxs[i].imgs.x, ctx->imgs.x, ctx->imgs.size * sizeof(*ctx->imgs.x));
+		CPY(ref)
+		CPY(out)
+		CPY(image_virgin)
+		CPY(smartmask_final)
+		CPY(ref_dyn)
+		#undef CPY
+	}
+	/* Run both functions on their own copy: */
+	func_a(&cxs[0], action);
+	func_b(&cxs[1], action);
+
+	/* Compare the whole output buffers; sizeof(*buf) alone would only cover element 0: */
+	#define CMP(x)  if (memcmp(cxs[0].imgs.x, cxs[1].imgs.x, ctx->imgs.size * sizeof(*cxs[0].imgs.x)) != 0) { ret = 0; goto out; }
+	CMP(ref)
+	CMP(ref_dyn)
+	#undef CMP
+
+out:	/* Free memory, return: */
+	for (i = 0; i < 2; i++) {
+		free(cxs[i].imgs.ref);
+		free(cxs[i].imgs.out);
+		free(cxs[i].imgs.image_virgin);
+		free(cxs[i].imgs.smartmask_final);
+		free(cxs[i].imgs.ref_dyn);
+	}
+	return ret;
+}
+
+/* Sweep input combinations over a STRIPSZ-element strip and print whether
+ * func_a and func_b produced matching output for all of them. */
+static void
+permutate (int action, void (*func_a)(struct context *, int), void (*func_b)(struct context *, int))
+{
+	#define STRIPSZ 41
+
+	unsigned char ref[STRIPSZ];
+	unsigned char out[STRIPSZ];
+	unsigned char image_virgin[STRIPSZ];
+	unsigned char smartmask_final[STRIPSZ];
+	uint16_t ref_dyn[STRIPSZ];
+	struct context ctx;
+	int px, mask_val, out_val, virgin_base, dyn_base, ref_base;
+
+	/* Point the context at the stack-allocated strips: */
+	ctx.imgs.ref = ref;
+	ctx.imgs.out = out;
+	ctx.imgs.image_virgin = image_virgin;
+	ctx.imgs.smartmask_final = smartmask_final;
+	ctx.imgs.ref_dyn = ref_dyn;
+	ctx.imgs.size = ctx.imgs.motionsize = STRIPSZ;
+	ctx.noise = 0;
+	ctx.lastrate = 0;
+
+	/* For the purposes of the routine, smartmask and out are zero or nonzero: */
+	for (mask_val = 0; mask_val <= 1; mask_val++) {
+		memset(smartmask_final, mask_val, STRIPSZ);
+
+		for (out_val = 0; out_val <= 1; out_val++) {
+			memset(out, out_val, STRIPSZ);
+
+			/* Each strip is a ramp: element px holds base + px (wrapping in the byte buffers). */
+			for (virgin_base = 0; virgin_base <= 255; virgin_base++) {
+				for (px = 0; px < STRIPSZ; px++) {
+					image_virgin[px] = virgin_base + px;
+				}
+				/* ref_dyn has a limited range: */
+				for (dyn_base = 0; dyn_base <= 9; dyn_base++) {
+					for (px = 0; px < STRIPSZ; px++) {
+						ref_dyn[px] = dyn_base + px;
+					}
+					for (ref_base = 0; ref_base <= 255; ref_base++) {
+						for (px = 0; px < STRIPSZ; px++) {
+							ref[px] = ref_base + px;
+						}
+						/* Stop at the first permutation where the outputs differ: */
+						if (!equal_output(&ctx, action, func_a, func_b)) {
+							printf("Functions do NOT match!\n");
+							return;
+						}
+					}
+				}
+			}
+		}
+	}
+	printf("Functions MATCH\n");
+}
+
+static void
+timing (char *name, struct context *ctx, int action, void (*func)(struct context *, int))
+{
+	float elapsed = 0.0f;
+	int run;
+
+	printf("---\n%s\n", name);
+	clean(ctx);
+
+	/* Time each of the 300 calls on its own and accumulate the results: */
+	for (run = 0; run < 300; run++) {
+		timer_start();
+		func(ctx, action);
+		timer_stop();
+		elapsed += timer_sec();
+	}
+	/* Print bogus value to prevent the loop from being optimized out: */
+	printf("Value: %d\nTime: %.4f sec\n", ctx->imgs.ref[0], elapsed);
+}
+
+#define UPDATE_REF_FRAME  1
+#define ACCEPT_STATIC_OBJECT_TIME 10  /* Seconds */
+#define EXCLUDE_LEVEL_PERCENT 20
+
+#include "../alg_update_reference_frame.plain.c"
+#include "../alg_update_reference_frame.sse2-algo.c"
+#include "../alg_update_reference_frame.sse2.c"
+
+int
+main (void)
+{
+	struct context ctx;
+
+	init(&ctx);
+	/* Speed first: */
+	timing("plain", &ctx, UPDATE_REF_FRAME, alg_update_reference_frame_plain);
+	timing("plain, SSE2 algorithm demo", &ctx, UPDATE_REF_FRAME, alg_update_reference_frame_sse2_algo);
+	timing("SSE2", &ctx, UPDATE_REF_FRAME, alg_update_reference_frame_sse2);
+
+	/* Then output equality, each variant checked against the plain version: */
+	permutate(UPDATE_REF_FRAME, alg_update_reference_frame_plain, alg_update_reference_frame_sse2_algo);
+	permutate(UPDATE_REF_FRAME, alg_update_reference_frame_plain, alg_update_reference_frame_sse2);
+
+	free(ctx.imgs.smartmask_final);
+	free(ctx.imgs.image_virgin);
+	free(ctx.imgs.ref_dyn);
+	free(ctx.imgs.out);
+	free(ctx.imgs.ref);
+	return 0;
+}
diff --git a/alg/tests/timer.c b/alg/tests/timer.c
new file mode 100644
index 0000000..10e64a5
--- /dev/null
+++ b/alg/tests/timer.c
@@ -0,0 +1,35 @@
+#define _POSIX_C_SOURCE 199309L
+
+#include <time.h>
+
+/* This is not threadsafe at all, but that's fine for our purposes. */
+
+static struct timespec start;
+static struct timespec end;
+
+void
+timer_start ()
+{
+	clock_gettime(CLOCK_MONOTONIC, &start);	/* store the current monotonic time in the file-level `start` */
+}
+
+void
+timer_stop ()
+{
+	clock_gettime(CLOCK_MONOTONIC, &end);	/* store the current monotonic time in the file-level `end` */
+}
+
+float
+timer_sec (void)
+{
+	struct timespec delta;
+
+	delta.tv_sec = end.tv_sec - start.tv_sec;
+	delta.tv_nsec = end.tv_nsec - start.tv_nsec;
+	/* Borrow one second when the nanosecond difference is negative: */
+	if (delta.tv_nsec < 0) {
+		delta.tv_sec -= 1;
+		delta.tv_nsec += 1000000000;
+	}
+	return (float)(delta.tv_sec + ((float)delta.tv_nsec / 1000000000.0));
+}
diff --git a/alg/tests/timer.h b/alg/tests/timer.h
new file mode 100644
index 0000000..8c90baf
--- /dev/null
+++ b/alg/tests/timer.h
@@ -0,0 +1,3 @@
+void timer_start (void);	/* Record the start timestamp. */
+void timer_stop (void);	/* Record the end timestamp. */
+float timer_sec (void);	/* Seconds between the recorded start and end timestamps. */
diff --git a/motion.h b/motion.h
index c08d84f..9c12255 100644
--- a/motion.h
+++ b/motion.h
@@ -289,7 +289,7 @@ struct images {
 
     unsigned char *ref;               /* The reference frame */
     unsigned char *out;               /* Picture buffer for motion images */
-    int *ref_dyn;                     /* Dynamic objects to be excluded from reference frame */
+    uint16_t *ref_dyn;                /* Dynamic objects to be excluded from reference frame */
     unsigned char *image_virgin;      /* Last picture frame with no text or locate overlay */
     struct image_data preview_image;  /* Picture buffer for best image when enables */
     unsigned char *mask;              /* Buffer for the mask file */