From 4de1731d9315038196b849893c769f5996275fa9 Mon Sep 17 00:00:00 2001
From: Connor Olding <cloningdonor@gmail.com>
Date: Wed, 5 Feb 2014 20:32:16 -0800
Subject: [PATCH] tube plugin, begin on SSE2 optimizations

---
 Makefile    |   2 +-
 README.md   |  14 ++++--
 crap_tube.h | 135 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 147 insertions(+), 4 deletions(-)
 create mode 100644 crap_tube.h

diff --git a/Makefile b/Makefile
index cb94fc9..f185ecb 100755
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ DISTNAME = crap
 VERSION = git
 FULLNAME = ${DISTNAME}-${VERSION}
 
-BOTH = crap_eq crap_eq_const
+BOTH = crap_eq crap_eq_const crap_tube
 LADSPA_ONLY = crap_noise
 VST_ONLY = crap_delay_test
 LADSPA = ${BOTH:=-ladspa} ${LADSPA_ONLY:=-ladspa}
diff --git a/README.md b/README.md
index 37833c7..9aab33f 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,14 @@ __crap\_noise (0xEC57A71C)__
 
 white noise generator. loud, full-range, 0dBFS. don't say i didn't warn you.
 
+### crap Tube Distortion
+
+__crap\_tube (0x50F7BA11)__
+
+static waveshaper with 4x oversampling, sounds kinda like a tube I guess?
+be aware that the oversampling is a bit naive and attenuates the signal
+past 17kHz, assuming a 44.1kHz sample rate.
+
 ### crap delay test
 
 __crap\_delay\_test (0xDEDEDEDE)__
@@ -37,7 +45,8 @@ experimentation with delay compensation and EQ oversampling, not for use.
 
 a `benchmark` target is included, however it doesn't build on Windows.
 
-try `CFLAGS="-O3 -ffast-math -march=core2"`
+for speed, try `CFLAGS="-O3 -ffast-math -march=core2 -mfpmath=sse"`
+and the same for CXXFLAGS.
 
 on Linux, you'll need `CFLAGS+=" -fpic" CXXFLAGS+=" -fpic -D__cdecl="`
 
@@ -47,10 +56,9 @@ remember to export `VST_SDK_DIR` to the path of your `vstsdk2.4/`
 
 * convert crap\_noise to the template format
 * rename plugins (fix capitalization consistency and such)
-* make style consistent
+* make code style consistent
 * remove crap\_ prefixes?
 * move to subdirs?
-* make crap faster (hand-written SSE2? compiler directives?)
 * reduce input/output buffers on biquads (shared)
 * ease up on the preprocessor ifs
 * polish parameter support
diff --git a/crap_tube.h b/crap_tube.h
new file mode 100644
index 0000000..f0035c7
--- /dev/null
+++ b/crap_tube.h
@@ -0,0 +1,135 @@
+#include <string.h>
+
+#ifdef __SSE2__
+#include <xmmintrin.h>
+#ifndef __SSE2_MATH__
+#warning SSE2 enabled but not forced, beware denormals
+#endif
+#else
+#warning built without SSE2, denormals will be painful
+#endif
+
+#define ID 0x50F7BA11 // same value the README lists next to crap_tube
+#define LABEL "crap_tube" // NOTE(review): presumably the short machine-readable name - confirm against the including template
+#define NAME "crap Tube Distortion" // matches the README section heading
+#define AUTHOR "Connor Olding"
+#define COPYRIGHT "MIT"
+#define PARAMETERS 0 // NOTE(review): presumably the number of user controls; nothing in this header reads it
+
+// Per-instance state: one 64-double history per channel, handed whole to process_one().
+typedef struct {
+	double history_L[64], history_R[64];
+} personal;
+
+// Set MXCSR flush-to-zero (0x8000) and denormals-are-zero (0x0040); does nothing without SSE2.
+static void disable_denormals(void)
+{
+	#ifdef __SSE2__
+	_mm_setcsr(_mm_getcsr() | 0x8040);
+	#endif
+}
+
+static double distort(double x)
+{
+	const double numer = 27*x + 9, denom = 9*x*x + 6*x + 19; // denom == (3x+1)^2 + 18, never zero
+	return numer / denom - 9/19.; // 9/19 is the curve's value at x == 0, so distort(0) == 0
+}
+
+// b2 is always b0 with lowpasses
+// a0 is already factored into the rest of the coefficients
+#define LOWPASS(i, b0, b1, a1, a2) \
+	y = b0*x + b1*xn[i*2] + b0*xn[i*2 + 1] \
+		 - a1*yn[i*2] - a2*yn[i*2 + 1]; \
+	xn[i*2 + 1] = xn[i*2]; \
+	xn[i*2] = x; \
+	yn[i*2 + 1] = yn[i*2]; \
+	yn[i*2] = y; \
+	x = y;
+
+static double // returns x after it has passed through all eight LOWPASS stages
+upsample(double xn[16], double yn[16], double x) // xn/yn: input/output history, two slots per stage
+{ // process_one() calls this four times per input sample: the sample itself, then three zeros
+	double y; // assigned inside every LOWPASS expansion
+	LOWPASS(0, +0.71327159,+0.00688573,-0.45391337,+0.88734229); // arguments: stage index, b0, b1, a1, a2
+	LOWPASS(1, +0.63347126,+0.05572752,-0.36946634,+0.69213639); // each stage ends with x = y, feeding the next
+	LOWPASS(2, +0.55963645,+0.13990391,-0.26487901,+0.52405582);
+	LOWPASS(3, +0.49037095,+0.24706928,-0.14763065,+0.37544183);
+	LOWPASS(4, +0.42692239,+0.36379839,-0.02763286,+0.24527604);
+	LOWPASS(5, +0.37268890,+0.47433865,+0.08224090,+0.13747554);
+	LOWPASS(6, +0.33241251,+0.56148939,+0.16727062,+0.05904378);
+	LOWPASS(7, +0.31079382,+0.60975767,+0.21392163,+0.01742368); // last stage: uses xn[14..15], yn[14..15]
+	return y; // output of stage 7
+}
+
+static double // returns x after it has passed through all eight LOWPASS stages
+downsample(double xn[16], double yn[16], double x) // xn/yn: input/output history, two slots per stage
+{ // process_one() calls this four times per input sample and keeps only the first result
+	double y; // assigned inside every LOWPASS expansion
+	LOWPASS(0, +0.62136966,-0.87573986,-1.56336581,+0.93036527); // arguments: stage index, b0, b1, a1, a2
+	LOWPASS(1, +0.56540370,-0.77393348,-1.44258778,+0.79946170); // each stage ends with x = y, feeding the next
+	LOWPASS(2, +0.49824084,-0.63630306,-1.31114921,+0.67132784);
+	LOWPASS(3, +0.41949184,-0.46466704,-1.16600279,+0.54031944);
+	LOWPASS(4, +0.33172375,-0.26684785,-1.00993399,+0.40653364);
+	LOWPASS(5, +0.24269774,-0.06242297,-0.85492245,+0.27789496);
+	LOWPASS(6, +0.16673206,+0.11379847,-0.72421195,+0.17147454);
+	LOWPASS(7, +0.12199271,+0.21811002,-0.64769184,+0.10978728); // last stage: uses xn[14..15], yn[14..15]
+	return y; // output of stage 7
+}
+
+// One sample in, one out: the sample plus three zeros go through upsample -> distort -> downsample.
+static double
+process_one(double h[64], double x)
+{
+	double *ux = h, *uy = h + 16, *dx = h + 32, *dy = h + 48; // the four 16-slot histories inside h
+	const double kept = downsample(dx, dy, distort(4*upsample(ux, uy, x)));
+	for (int zero = 0; zero < 3; zero++) // zero inputs only advance the filters; results are dropped
+		downsample(dx, dy, distort(4*upsample(ux, uy, 0)));
+	return kept*0.71;
+}
+
+// Run `count` float frames through process_one(), one history per channel; results narrow to float.
+static void
+process(personal *data, float *in_L, float *in_R,
+    float *out_L, float *out_R, unsigned long count)
+{
+	disable_denormals();
+	for (unsigned long n = 0; n != count; ++n) {
+		out_L[n] = process_one(data->history_L, in_L[n]);
+		out_R[n] = process_one(data->history_R, in_R[n]);
+	}
+}
+
+// Double-precision twin of process(): same per-frame work, buffers stay double throughout.
+static void
+process_double(personal *data, double *in_L, double *in_R,
+    double *out_L, double *out_R, unsigned long count)
+{
+	disable_denormals();
+	for (unsigned long n = 0; n != count; ++n) {
+		out_L[n] = process_one(data->history_L, in_L[n]);
+		out_R[n] = process_one(data->history_R, in_R[n]);
+	}
+}
+
+// Clear both channels' filter history; memset takes a byte count, so size it with sizeof.
+static void resume(personal *data) {
+	memset(data->history_L, 0, sizeof data->history_L);
+	memset(data->history_R, 0, sizeof data->history_R);
+}
+
+// Intentionally empty: this plugin does no work when pause() is called.
+static void pause(personal *data)
+{ (void)data; }
+// A freshly constructed instance starts in whatever state resume() establishes.
+static void construct(personal *data)
+{	resume(data);
+}
+// Nothing to release: `personal` holds only inline arrays, no owned resources.
+static void destruct(personal *data)
+{ (void)data; }
+
+// Reset all state through resume(). NOTE(review): `fs` looks like the sample
+// rate, but it is never read, so the filter coefficients do not adapt to it.
+static void adjust(personal *data, unsigned long fs)
+{ (void)fs; resume(data); }
+