vectorize tube

2015-04-07 11:25:26 -07:00 · 2015-04-07 11:25:26 -07:00 · 33c0ef8f14
commit 33c0ef8f14
parent 8e7dde59f6
4 changed files with 180 additions and 73 deletions
--- a/crap/tube.h
+++ b/crap/tube.h
@ -15,7 +15,7 @@
 #include "util.h"
 #include "param.h"
-#include "os2piir.h"
+#include "os2piir_stereo.h"
 typedef struct {
 	double desired, actual, speed;
@ -23,7 +23,7 @@ typedef struct {
 } smoothval;
 typedef struct {
-	halfband_t hbu_L, hbu_R, hbd_L, hbd_R;
+	halfband_t hb_up, hb_down;
 	smoothval drive, wet;
 } personal;
@ -47,16 +47,16 @@ smooth(smoothval *val)
 	return a;
 }
-INNER double
+INNER CONST v2df
-distort(double x)
+distort(v2df x)
 {
-	return (27*x + 9) / (9*x*x + 6*x + 19) - 9/19.;
+	return (V(27.)*x + V(9.)) / (V(9.)*x*x + V(6.)*x + V(19.)) - V(9./19.);
 }
-INNER double
+INNER CONST v2df
-process_one(double x, double drive, double wet)
+process_one(v2df x, v2df drive, v2df wet)
 {
-	return (distort(x*drive)/drive*0.79 - x)*wet + x;
+	return (distort(x*drive)/drive*V(0.79) - x)*wet + x;
 }
 static void
@ -64,74 +64,20 @@ process_double(personal *data,
    double *in_L, double *in_R,
    double *out_L, double *out_R,
    ulong count)
-{
+#include "process_nonlinear.h"
 	disable_denormals();
-	double drives[FULL_SIZE], wets[FULL_SIZE];
+static void
-	double in_os[FULL_SIZE], out_os[FULL_SIZE];
+process(personal *data,
-
+    float *in_L, float *in_R,
-	for (ulong pos = 0; pos < count; pos += BLOCK_SIZE) {
+    float *out_L, float *out_R,
-		ulong rem = BLOCK_SIZE;
+    ulong count)
-		if (pos + BLOCK_SIZE > count)
+#include "process_nonlinear.h"
 			rem = count - pos;
 		for (ulong i = 0; i < rem*OVERSAMPLING; i++)
 			drives[i] = smooth(&data->drive);
 		for (ulong i = 0; i < rem*OVERSAMPLING; i++)
 			wets[i] = smooth(&data->wet);
 		halfband_t *hb;
 		// left channel
 		hb = &data->hbu_L;
 		for (ulong i = 0, j = 0; j < rem; i += OVERSAMPLING, j++) {
 			in_os[i+0] = interpolate(hb, in_L[j]);
 			in_os[i+1] = interpolate(hb, in_L[j]);
 		}
 		for (ulong i = 0; i < rem*OVERSAMPLING; i++) {
 			out_os[i] = process_one(in_os[i], drives[i], wets[i]);
 		}
 		hb = &data->hbd_L;
 		for (ulong i = 0, j = 0; j < rem; i += OVERSAMPLING, j++) {
 			decimate(hb, out_os[i+0]);
 			out_L[j] = decimate(hb, out_os[i+1]);
 		}
 		// right channel
 		hb = &data->hbu_R;
 		for (ulong i = 0, j = 0; j < rem; i += OVERSAMPLING, j++) {
 			in_os[i+0] = interpolate(hb, in_R[j]);
 			in_os[i+1] = interpolate(hb, in_R[j]);
 		}
 		for (ulong i = 0; i < rem*OVERSAMPLING; i++) {
 			out_os[i] = process_one(in_os[i], drives[i], wets[i]);
 		}
 		hb = &data->hbd_R;
 		for (ulong i = 0, j = 0; j < rem; i += OVERSAMPLING, j++) {
 			decimate(hb, out_os[i+0]);
 			out_R[j] = decimate(hb, out_os[i+1]);
 		}
 		in_L += BLOCK_SIZE;
 		in_R += BLOCK_SIZE;
 		out_L += BLOCK_SIZE;
 		out_R += BLOCK_SIZE;
 	}
 }
 #include "process.h"
 INNER void
 resume(personal *data)
 {
-	memset(&data->hbu_L, 0, sizeof(halfband_t));
+	memset(&data->hb_up,   0, sizeof(halfband_t));
-	memset(&data->hbu_R, 0, sizeof(halfband_t));
+	memset(&data->hb_down, 0, sizeof(halfband_t));
 	memset(&data->hbd_L, 0, sizeof(halfband_t));
 	memset(&data->hbd_R, 0, sizeof(halfband_t));
 }
 INNER void
--- a/include/os2piir_stereo.h
+++ b/include/os2piir_stereo.h
@ -0,0 +1,98 @@
 /* halfband polyphase IIR filter
   coefficients designed with halfband program:
     https://gist.github.com/3be345efb6c97d757398#file-halfband-c
   parameters: 16 coefficients, 0.1 transition band
   stopband: ~-150dB
   overall delay: ~8 samples
 */
 #define copy(dst, src) memcpy(dst, src, sizeof(v2df)*8)
 //#define copy(dst, src) _copy(dst, src)
 // all should be initialized to 0
 typedef struct {
 	v2df ao[8], bo[8];
 	v2df at[8], bt[8];
 	v2df x1, x2, x3;
 	int i;
 } halfband_t;
 INNER void
 halfband_a(v2df a[8], v2df ao[8], v2df x0, v2df x2)
 {
 	a[0] = x2    + (x0   - ao[0])*V(0.006185967461045014);
 	a[1] = ao[0] + (a[0] - ao[1])*V(0.054230780876613788);
 	a[2] = ao[1] + (a[1] - ao[2])*V(0.143280861566087270);
 	a[3] = ao[2] + (a[2] - ao[3])*V(0.262004358403954640);
 	a[4] = ao[3] + (a[3] - ao[4])*V(0.398796973552973666);
 	a[5] = ao[4] + (a[4] - ao[5])*V(0.545323651071132232);
 	a[6] = ao[5] + (a[5] - ao[6])*V(0.698736833646440347);
 	a[7] = ao[6] + (a[6] - ao[7])*V(0.862917812650502936);
 }
 INNER void
 halfband_b(v2df b[8], v2df bo[8], v2df x1, v2df x3)
 {
 	b[0] = x3    + (x1   - bo[0])*V(0.024499027624721819);
 	b[1] = bo[0] + (b[0] - bo[1])*V(0.094283481125726432);
 	b[2] = bo[1] + (b[1] - bo[2])*V(0.199699579426327684);
 	b[3] = bo[2] + (b[2] - bo[3])*V(0.328772348316831664);
 	b[4] = bo[3] + (b[3] - bo[4])*V(0.471167216679969414);
 	b[5] = bo[4] + (b[4] - bo[5])*V(0.621096845120503893);
 	b[6] = bo[5] + (b[5] - bo[6])*V(0.778944517099529166);
 	b[7] = bo[6] + (b[6] - bo[7])*V(0.952428157718303137);
 }
 INNER v2df
 halfband(halfband_t *h, v2df x0)
 {
 	v2df a[8], b[8];
 	halfband_a(a, h->ao,    x0, h->x2);
 	halfband_b(b, h->bo, h->x1, h->x3);
 	copy(h->ao, h->at);
 	copy(h->bo, h->bt);
 	copy(h->at, a);
 	copy(h->bt, b);
 	h->x3 = h->x2;
 	h->x2 = h->x1;
 	h->x1 = x0;
 	return (a[7] + b[7])*V(0.5);
 }
 INNER v2df
 decimate(halfband_t *h, v2df x0)
 {
 	v2df c[8];
 	if ((h->i = !h->i)) {
 		halfband_b(c, h->bo, x0, h->x2);
 		copy(h->bo, c);
 		h->x2 = h->x1;
 		h->x1 = x0;
 		return V(0);
 	} else {
 		halfband_a(c, h->ao, x0, h->x2);
 		copy(h->ao, c);
 		h->x2 = h->x1;
 		h->x1 = x0;
 		return (c[7] + h->bo[7])*V(0.5);
 	}
 }
 // note: do not zero-stuff! send the input each time.
 INNER v2df
 interpolate(halfband_t *h, v2df x0)
 {
 	v2df c[8];
 	if ((h->i = !h->i)) {
 		halfband_a(c, h->ao, x0, h->x1);
 		copy(h->ao, c);
 		return c[7];
 	} else {
 		halfband_b(c, h->bo, x0, h->x1);
 		copy(h->bo, c);
 		h->x1 = x0;
 		return c[7];
 	}
 }
 #undef copy
--- a/include/process_nonlinear.h
+++ b/include/process_nonlinear.h
@ -0,0 +1,58 @@
 {
 	disable_denormals();
 	v2df drives[FULL_SIZE], wets[FULL_SIZE];
 	v2df buf[BLOCK_SIZE];
 	v2df over[FULL_SIZE];
 	halfband_t *hb_up   = &data->hb_up;
 	halfband_t *hb_down = &data->hb_down;
 	for (ulong pos = 0; pos < count; pos += BLOCK_SIZE) {
 		ulong rem = BLOCK_SIZE;
 		if (pos + BLOCK_SIZE > count)
 			rem = count - pos;
 		ulong rem2 = rem*OVERSAMPLING;
 		for (ulong i = 0; i < rem2; i++) {
 			double y = smooth(&data->drive);
 			drives[i] = V(y);
 		}
 		for (ulong i = 0; i < rem2; i++) {
 			double y = smooth(&data->wet);
 			wets[i] = V(y);
 		}
 		for (ulong i = 0; i < rem; i++) {
 			buf[i][0] = in_L[i];
 			buf[i][1] = in_R[i];
 		}
 		for (ulong i = 0; i < rem; i++) {
 			hb_up->i = 0; // so compiler can optimize
 			over[i*2+0] = interpolate(hb_up, buf[i]);
 			over[i*2+1] = interpolate(hb_up, buf[i]);
 		}
 		for (ulong i = 0; i < rem2; i++) {
 			over[i] = process_one(over[i], drives[i], wets[i]);
 		}
 		for (ulong i = 0; i < rem; i++) {
 			hb_down->i = 0; // so compiler can optimize
 			         decimate(hb_down, over[i*2+0]);
 			buf[i] = decimate(hb_down, over[i*2+1]);
 		}
 		for (ulong i = 0; i < rem; i++) {
 			out_L[i] = buf[i][0];
 			out_R[i] = buf[i][1];
 		}
 		in_L += BLOCK_SIZE;
 		in_R += BLOCK_SIZE;
 		out_L += BLOCK_SIZE;
 		out_R += BLOCK_SIZE;
 	}
 }
--- a/include/util.h
+++ b/include/util.h
@ -1,12 +1,17 @@
 #include "math.h"
 #ifdef __SSE2__
-#include <xmmintrin.h>
+#include <emmintrin.h>
 #endif
 #define INNER static inline
 #define PURE __attribute__((pure))
 #define CONST __attribute__((const))
 typedef double v2df __attribute__((vector_size(16), aligned(16)));
-typedef unsigned long ulong;
+typedef float v4sf __attribute__((vector_size(16), aligned(16)));
 typedef unsigned long ulong; // __attribute((aligned(16)));
 #define V(x) (v2df){(x), (x)}
 INNER void
 disable_denormals();