diff --git a/crap/tube.h b/crap/tube.h
index 159fec9..b322e2c 100644
--- a/crap/tube.h
+++ b/crap/tube.h
@@ -15,7 +15,7 @@
 
 #include "util.h"
 #include "param.h"
-#include "os2piir.h"
+#include "os2piir_stereo.h"
 
 typedef struct {
 	double desired, actual, speed;
@@ -23,7 +23,7 @@ typedef struct {
 } smoothval;
 
 typedef struct {
-	halfband_t hbu_L, hbu_R, hbd_L, hbd_R;
+	halfband_t hb_up, hb_down;
 	smoothval drive, wet;
 } personal;
 
@@ -47,16 +47,16 @@ smooth(smoothval *val)
 	return a;
 }
 
-INNER double
-distort(double x)
+INNER CONST v2df
+distort(v2df x)
 {
-	return (27*x + 9) / (9*x*x + 6*x + 19) - 9/19.;
+	return (V(27.)*x + V(9.)) / (V(9.)*x*x + V(6.)*x + V(19.)) - V(9./19.);
 }
 
-INNER double
-process_one(double x, double drive, double wet)
+INNER CONST v2df
+process_one(v2df x, v2df drive, v2df wet)
 {
-	return (distort(x*drive)/drive*0.79 - x)*wet + x;
+	return (distort(x*drive)/drive*V(0.79) - x)*wet + x;
 }
 
 static void
@@ -64,74 +64,20 @@ process_double(personal *data,
     double *in_L, double *in_R,
     double *out_L, double *out_R,
     ulong count)
-{
-	disable_denormals();
+#include "process_nonlinear.h"
 
-	double drives[FULL_SIZE], wets[FULL_SIZE];
-	double in_os[FULL_SIZE], out_os[FULL_SIZE];
-
-	for (ulong pos = 0; pos < count; pos += BLOCK_SIZE) {
-		ulong rem = BLOCK_SIZE;
-		if (pos + BLOCK_SIZE > count)
-			rem = count - pos;
-
-		for (ulong i = 0; i < rem*OVERSAMPLING; i++)
-			drives[i] = smooth(&data->drive);
-		for (ulong i = 0; i < rem*OVERSAMPLING; i++)
-			wets[i] = smooth(&data->wet);
-
-		halfband_t *hb;
-
-		// left channel
-		hb = &data->hbu_L;
-		for (ulong i = 0, j = 0; j < rem; i += OVERSAMPLING, j++) {
-			in_os[i+0] = interpolate(hb, in_L[j]);
-			in_os[i+1] = interpolate(hb, in_L[j]);
-		}
-
-		for (ulong i = 0; i < rem*OVERSAMPLING; i++) {
-			out_os[i] = process_one(in_os[i], drives[i], wets[i]);
-		}
-
-		hb = &data->hbd_L;
-		for (ulong i = 0, j = 0; j < rem; i += OVERSAMPLING, j++) {
-			decimate(hb, out_os[i+0]);
-			out_L[j] = decimate(hb, out_os[i+1]);
-		}
-
-		// right channel
-		hb = &data->hbu_R;
-		for (ulong i = 0, j = 0; j < rem; i += OVERSAMPLING, j++) {
-			in_os[i+0] = interpolate(hb, in_R[j]);
-			in_os[i+1] = interpolate(hb, in_R[j]);
-		}
-
-		for (ulong i = 0; i < rem*OVERSAMPLING; i++) {
-			out_os[i] = process_one(in_os[i], drives[i], wets[i]);
-		}
-
-		hb = &data->hbd_R;
-		for (ulong i = 0, j = 0; j < rem; i += OVERSAMPLING, j++) {
-			decimate(hb, out_os[i+0]);
-			out_R[j] = decimate(hb, out_os[i+1]);
-		}
-
-		in_L += BLOCK_SIZE;
-		in_R += BLOCK_SIZE;
-		out_L += BLOCK_SIZE;
-		out_R += BLOCK_SIZE;
-	}
-}
-
-#include "process.h"
+static void
+process(personal *data,
+    float *in_L, float *in_R,
+    float *out_L, float *out_R,
+    ulong count)
+#include "process_nonlinear.h"
 
 INNER void
 resume(personal *data)
 {
-	memset(&data->hbu_L, 0, sizeof(halfband_t));
-	memset(&data->hbu_R, 0, sizeof(halfband_t));
-	memset(&data->hbd_L, 0, sizeof(halfband_t));
-	memset(&data->hbd_R, 0, sizeof(halfband_t));
+	memset(&data->hb_up,   0, sizeof(halfband_t));
+	memset(&data->hb_down, 0, sizeof(halfband_t));
 }
 
 INNER void
diff --git a/include/os2piir_stereo.h b/include/os2piir_stereo.h
new file mode 100644
index 0000000..5710154
--- /dev/null
+++ b/include/os2piir_stereo.h
@@ -0,0 +1,98 @@
+/* halfband polyphase IIR filter
+   coefficients designed with halfband program:
+     https://gist.github.com/3be345efb6c97d757398#file-halfband-c
+   parameters: 16 coefficients, 0.1 transition band
+   stopband: ~-150dB
+   overall delay: ~8 samples
+*/
+
+#define copy(dst, src) memcpy(dst, src, sizeof(v2df)*8)
+//#define copy(dst, src) _copy(dst, src)
+
+// all should be initialized to 0
+typedef struct {
+	v2df ao[8], bo[8];
+	v2df at[8], bt[8];
+	v2df x1, x2, x3;
+	int i;
+} halfband_t;
+
+INNER void
+halfband_a(v2df a[8], v2df ao[8], v2df x0, v2df x2)
+{
+	a[0] = x2    + (x0   - ao[0])*V(0.006185967461045014);
+	a[1] = ao[0] + (a[0] - ao[1])*V(0.054230780876613788);
+	a[2] = ao[1] + (a[1] - ao[2])*V(0.143280861566087270);
+	a[3] = ao[2] + (a[2] - ao[3])*V(0.262004358403954640);
+	a[4] = ao[3] + (a[3] - ao[4])*V(0.398796973552973666);
+	a[5] = ao[4] + (a[4] - ao[5])*V(0.545323651071132232);
+	a[6] = ao[5] + (a[5] - ao[6])*V(0.698736833646440347);
+	a[7] = ao[6] + (a[6] - ao[7])*V(0.862917812650502936);
+}
+
+INNER void
+halfband_b(v2df b[8], v2df bo[8], v2df x1, v2df x3)
+{
+	b[0] = x3    + (x1   - bo[0])*V(0.024499027624721819);
+	b[1] = bo[0] + (b[0] - bo[1])*V(0.094283481125726432);
+	b[2] = bo[1] + (b[1] - bo[2])*V(0.199699579426327684);
+	b[3] = bo[2] + (b[2] - bo[3])*V(0.328772348316831664);
+	b[4] = bo[3] + (b[3] - bo[4])*V(0.471167216679969414);
+	b[5] = bo[4] + (b[4] - bo[5])*V(0.621096845120503893);
+	b[6] = bo[5] + (b[5] - bo[6])*V(0.778944517099529166);
+	b[7] = bo[6] + (b[6] - bo[7])*V(0.952428157718303137);
+}
+
+INNER v2df
+halfband(halfband_t *h, v2df x0)
+{
+	v2df a[8], b[8];
+	halfband_a(a, h->ao,    x0, h->x2);
+	halfband_b(b, h->bo, h->x1, h->x3);
+	copy(h->ao, h->at);
+	copy(h->bo, h->bt);
+	copy(h->at, a);
+	copy(h->bt, b);
+	h->x3 = h->x2;
+	h->x2 = h->x1;
+	h->x1 = x0;
+	return (a[7] + b[7])*V(0.5);
+}
+
+INNER v2df
+decimate(halfband_t *h, v2df x0)
+{
+	v2df c[8];
+	if ((h->i = !h->i)) {
+		halfband_b(c, h->bo, x0, h->x2);
+		copy(h->bo, c);
+		h->x2 = h->x1;
+		h->x1 = x0;
+		return V(0);
+	} else {
+		halfband_a(c, h->ao, x0, h->x2);
+		copy(h->ao, c);
+		h->x2 = h->x1;
+		h->x1 = x0;
+		return (c[7] + h->bo[7])*V(0.5);
+	}
+}
+
+// note: do not zero-stuff! send the input each time.
+INNER v2df
+interpolate(halfband_t *h, v2df x0)
+{
+	v2df c[8];
+	if ((h->i = !h->i)) {
+		halfband_a(c, h->ao, x0, h->x1);
+		copy(h->ao, c);
+		return c[7];
+	} else {
+		halfband_b(c, h->bo, x0, h->x1);
+		copy(h->bo, c);
+		h->x1 = x0;
+		return c[7];
+	}
+}
+
+#undef copy
diff --git a/include/process_nonlinear.h b/include/process_nonlinear.h
new file mode 100644
index 0000000..603e489
--- /dev/null
+++ b/include/process_nonlinear.h
@@ -0,0 +1,58 @@
+{
+	disable_denormals();
+
+	v2df drives[FULL_SIZE], wets[FULL_SIZE];
+	v2df buf[BLOCK_SIZE];
+	v2df over[FULL_SIZE];
+
+	halfband_t *hb_up   = &data->hb_up;
+	halfband_t *hb_down = &data->hb_down;
+
+	for (ulong pos = 0; pos < count; pos += BLOCK_SIZE) {
+		ulong rem = BLOCK_SIZE;
+		if (pos + BLOCK_SIZE > count)
+			rem = count - pos;
+
+		ulong rem2 = rem*OVERSAMPLING;
+
+		for (ulong i = 0; i < rem2; i++) {
+			double y = smooth(&data->drive);
+			drives[i] = V(y);
+		}
+		for (ulong i = 0; i < rem2; i++) {
+			double y = smooth(&data->wet);
+			wets[i] = V(y);
+		}
+
+		for (ulong i = 0; i < rem; i++) {
+			buf[i][0] = in_L[i];
+			buf[i][1] = in_R[i];
+		}
+
+		for (ulong i = 0; i < rem; i++) {
+			hb_up->i = 0; // so compiler can optimize
+			over[i*2+0] = interpolate(hb_up, buf[i]);
+			over[i*2+1] = interpolate(hb_up, buf[i]);
+		}
+
+		for (ulong i = 0; i < rem2; i++) {
+			over[i] = process_one(over[i], drives[i], wets[i]);
+		}
+
+		for (ulong i = 0; i < rem; i++) {
+			hb_down->i = 0; // so compiler can optimize
+			         decimate(hb_down, over[i*2+0]);
+			buf[i] = decimate(hb_down, over[i*2+1]);
+		}
+
+		for (ulong i = 0; i < rem; i++) {
+			out_L[i] = buf[i][0];
+			out_R[i] = buf[i][1];
+		}
+
+		in_L += BLOCK_SIZE;
+		in_R += BLOCK_SIZE;
+		out_L += BLOCK_SIZE;
+		out_R += BLOCK_SIZE;
+	}
+}
diff --git a/include/util.h b/include/util.h
index 971fb8f..665ff8e 100644
--- a/include/util.h
+++ b/include/util.h
@@ -1,12 +1,17 @@
 #include "math.h"
 
 #ifdef __SSE2__
-#include <xmmintrin.h>
+#include <emmintrin.h>
 #endif
 
 #define INNER static inline
+#define PURE __attribute__((pure))
+#define CONST __attribute__((const))
 typedef double v2df __attribute__((vector_size(16), aligned(16)));
-typedef unsigned long ulong;
+typedef float v4sf __attribute__((vector_size(16), aligned(16)));
+typedef unsigned long ulong; // __attribute((aligned(16)));
+
+#define V(x) (v2df){(x), (x)}
 
 INNER void
 disable_denormals();