diff --git a/Makefile b/Makefile
index d645cfd..f98b960 100644
--- a/Makefile
+++ b/Makefile
@@ -34,8 +34,9 @@ VST_SRC = ${VST_CPP:%=$(VST_CPP_DIR)/%}
 VST_OBJ = ${VST_CPP:%.cpp=$(BIN)/%.o}
 VST_DEF = $(VST_SDK_DIR)/public.sdk/samples/vst2.x/win/vstplug.def
 
-GENERAL_FLAGS = -Wall -Wno-unused-function -I include
-ALL_CFLAGS = $(GENERAL_FLAGS) -std=gnu99 $(CFLAGS)
+INLINE_FLAGS = -Winline -finline-limit=1000
+GENERAL_FLAGS = -Wall -Wno-unused-function -I include $(INLINE_FLAGS)
+ALL_CFLAGS = $(GENERAL_FLAGS) -std=gnu11 $(CFLAGS)
 ALL_CXXFLAGS = $(GENERAL_FLAGS) $(CXXFLAGS)
 ALL_LDFLAGS = -lm $(LDFLAGS)
 
@@ -43,6 +44,8 @@ LADSPA_FLAGS =
 VST_FLAGS = -Wno-write-strings -Wno-narrowing
 VST_FLAGS += -I $(VST_SDK_DIR) -DBUILDING_DLL=1
 
+# targeting core2 seems to produce significantly faster code,
+# even on newer processors. your mileage may vary; benchmark to confirm.
 OPT_FLAGS = -Ofast -march=core2 -mfpmath=sse
 
 # any possibly produced files besides intermediates
diff --git a/crap/delay_test.h b/crap/delay_test.h
index f7cf78b..d5344e5 100644
--- a/crap/delay_test.h
+++ b/crap/delay_test.h
@@ -27,7 +27,7 @@ typedef struct {
 	channel c[2];
 } personal;
 
-static double
+INNER double
 fir_up(double *x, double s)
 {
 	x[0] = s;
@@ -55,7 +55,7 @@ fir_up(double *x, double s)
 	return s;
 }
 
-static double
+INNER double
 fir_down(double *x, double s)
 {
 	x[0] = s;
@@ -69,7 +69,7 @@ fir_down(double *x, double s)
 	return s;
 }
 
-static double
+INNER double
 process_one(channel *c, double s)
 {
 	s =    fir_down(c->down, biquad_run(&c->filter, fir_up(c->up, s)));
@@ -79,7 +79,7 @@ process_one(channel *c, double s)
 	return s;
 }
 
-static void
+INNER void
 process(personal *data,
     float *in_L, float *in_R,
     float *out_L, float *out_R,
@@ -91,7 +91,7 @@ process(personal *data,
 	}
 }
 
-static void
+INNER void
 process_double(personal *data,
     double *in_L, double *in_R,
     double *out_L, double *out_R,
@@ -103,23 +103,23 @@ process_double(personal *data,
 	}
 }
 
-static void
+INNER void
 construct(personal *data)
 {}
 
-static void
+INNER void
 destruct(personal *data)
 {}
 
-static void
+INNER void
 resume(personal *data)
 {}
 
-static void
+INNER void
 pause(personal *data)
 {}
 
-static void
+INNER void
 adjust(personal *data, ulong fs)
 {
 	for (int k = 0; k < 2; k++) {
diff --git a/crap/eq.h b/crap/eq.h
index 2d33491..578ca90 100644
--- a/crap/eq.h
+++ b/crap/eq.h
@@ -18,7 +18,7 @@ typedef struct {
 	float fs;
 } personal;
 
-static double
+INNER double
 process_one(biquad *filters, double samp)
 {
 	for (int i = 0; i < BANDS; i++)
@@ -26,7 +26,7 @@ process_one(biquad *filters, double samp)
 	return samp;
 }
 
-static void
+INNER void
 process(personal *data,
     float *in_L, float *in_R,
     float *out_L, float *out_R,
@@ -39,7 +39,7 @@ process(personal *data,
 	}
 }
 
-static void
+INNER void
 process_double(personal *data,
     double *in_L, double *in_R,
     double *out_L, double *out_R,
@@ -52,7 +52,7 @@ process_double(personal *data,
 	}
 }
 
-static void
+INNER void
 resume(personal *data)
 {
 	biquad *filters = data->filters[0];
@@ -61,11 +61,11 @@ resume(personal *data)
 	memcpy(data->filters[1], filters, BANDS*sizeof(biquad));
 }
 
-static void
+INNER void
 pause(personal *data)
 {}
 
-static void
+INNER void
 construct_params(param *params)
 {
 	for (int i = 0; i < BANDS; i++) {
@@ -94,15 +94,15 @@ construct_params(param *params)
 	}
 }
 
-static void
+INNER void
 construct(personal *data)
 {}
 
-static void
+INNER void
 destruct(personal *data)
 {}
 
-static void
+INNER void
 adjust(personal *data, param *params, unsigned long fs)
 {
 	data->fs = fs;
@@ -115,7 +115,7 @@ adjust(personal *data, param *params, unsigned long fs)
 	resume(data);
 }
 
-static void
+INNER void
 adjust_one(personal *data, param *params, unsigned int index)
 {
 	float fs = data->fs;
diff --git a/crap/eq_const.h b/crap/eq_const.h
index 66c716e..d69221e 100644
--- a/crap/eq_const.h
+++ b/crap/eq_const.h
@@ -14,7 +14,7 @@ typedef struct {
 	biquad filters[2][BANDS];
 } personal;
 
-static double
+INNER double
 process_one(biquad *filters, double samp)
 {
 	for (int i = 0; i < BANDS; i++)
@@ -22,7 +22,7 @@ process_one(biquad *filters, double samp)
 	return samp;
 }
 
-static void
+INNER void
 process(personal *data,
     float *in_L, float *in_R,
     float *out_L, float *out_R,
@@ -35,7 +35,7 @@ process(personal *data,
 	}
 }
 
-static void
+INNER void
 process_double(personal *data,
     double *in_L, double *in_R,
     double *out_L, double *out_R,
@@ -48,15 +48,15 @@ process_double(personal *data,
 	}
 }
 
-static void
+INNER void
 construct(personal *data)
 {}
 
-static void
+INNER void
 destruct(personal *data)
 {}
 
-static void
+INNER void
 resume(personal *data)
 {
 	biquad *filters = data->filters[0];
@@ -65,11 +65,11 @@ resume(personal *data)
 	memcpy(data->filters[1], filters, BANDS*sizeof(biquad));
 }
 
-static void
+INNER void
 pause(personal *data)
 {}
 
-static void
+INNER void
 adjust(personal *data, unsigned long fs)
 {
 	biquad *filters = data->filters[0];
diff --git a/crap/noise.h b/crap/noise.h
index 7bcc6ab..0116a7d 100644
--- a/crap/noise.h
+++ b/crap/noise.h
@@ -10,7 +10,7 @@
 typedef struct {
 } personal;
 
-static void
+INNER void
 process(personal *data,
     float *in_L, float *in_R,
     float *out_L, float *out_R,
@@ -23,7 +23,7 @@ process(personal *data,
 		out_R[pos] = whitenoise();
 }
 
-static void
+INNER void
 process_double(personal *data,
     double *in_L, double *in_R,
     double *out_L, double *out_R,
@@ -35,22 +35,22 @@ process_double(personal *data,
 		out_R[pos] = whitenoise();
 }
 
-static void
+INNER void
 construct(personal *data)
 {}
 
-static void
+INNER void
 destruct(personal *data)
 {}
 
-static void
+INNER void
 resume(personal *data)
 {}
 
-static void
+INNER void
 pause(personal *data)
 {}
 
-static void
+INNER void
 adjust(personal *data, unsigned long fs)
 {}
diff --git a/crap/tube.h b/crap/tube.h
index 3d64bd5..aa37ed5 100644
--- a/crap/tube.h
+++ b/crap/tube.h
@@ -1,3 +1,4 @@
+#include <alloca.h>
 #include <string.h>
 #include <stdio.h>
 
@@ -13,6 +14,10 @@
 #define PARAMETERS 2
 
 #define OVERSAMPLING 2
+#define BLOCK_SIZE 256
+#define FULL_SIZE (BLOCK_SIZE*OVERSAMPLING)
+
+typedef unsigned long ulong;
 
 typedef struct {
 	double desired, actual, speed;
@@ -24,7 +29,7 @@ typedef struct {
 	smoothval drive, wet;
 } personal;
 
-static double
+INNER double
 smooth(smoothval *val)
 {
 	double a = val->actual;
@@ -44,62 +49,116 @@ smooth(smoothval *val)
 	return a;
 }
 
-static double
+INNER double
 distort(double x)
 {
 	return (27*x + 9) / (9*x*x + 6*x + 19) - 9/19.;
 }
 
-static double
+INNER double
 process_one(double x, double drive, double wet)
 {
 	return (distort(x*drive)/drive*0.79 - x)*wet + x;
 }
 
-static double
-process_os(personal *data, double x, int right)
-{
-	halfband_t *hbu = (!right) ? &data->hbu_L : &data->hbu_R;
-	halfband_t *hbd = (!right) ? &data->hbd_L : &data->hbd_R;
-	double y;
-
-	#define doit(SAMP) \
-	decimate(hbd, process_one(interpolate(hbu, SAMP), \
-	    smooth(&data->drive), smooth(&data->wet)))
-	    doit(x);
-	y = doit(x);
-	#undef doit
-
-	return y;
-}
-
-static void
-process(personal *data,
-    float *in_L, float *in_R,
-    float *out_L, float *out_R,
-    unsigned long count)
-{
-	disable_denormals();
-	for (unsigned long pos = 0; pos < count; pos++) {
-		out_L[pos] = process_os(data, in_L[pos], 0);
-		out_R[pos] = process_os(data, in_R[pos], 1);
-	}
-}
-
-static void
+/* Apply the distortion to `count` stereo samples with 2x oversampling,
+ * in chunks of at most BLOCK_SIZE samples. Buffers are indexed from the
+ * chunk offset instead of advancing the caller's pointers, which would
+ * step past the end of the buffers after a short final chunk. */
+INNER void
 process_double(personal *data,
     double *in_L, double *in_R,
     double *out_L, double *out_R,
-    unsigned long count)
+    ulong count)
 {
 	disable_denormals();
-	for (unsigned long pos = 0; pos < count; pos++) {
-		out_L[pos] = process_os(data, in_L[pos], 0);
-		out_R[pos] = process_os(data, in_R[pos], 1);
+
+	double drives[FULL_SIZE], wets[FULL_SIZE];
+	double in_os[FULL_SIZE], out_os[FULL_SIZE];
+
+	for (ulong pos = 0; pos < count; pos += BLOCK_SIZE) {
+		ulong rem = count - pos;
+		if (rem > BLOCK_SIZE)
+			rem = BLOCK_SIZE;
+
+		// smoothed once per oversampled sample; shared by both channels
+		for (ulong i = 0; i < rem*OVERSAMPLING; i++)
+			drives[i] = smooth(&data->drive);
+		for (ulong i = 0; i < rem*OVERSAMPLING; i++)
+			wets[i] = smooth(&data->wet);
+
+		halfband_t *hb;
+
+		// left channel
+		hb = &data->hbu_L;
+		for (ulong i = 0, j = 0; j < rem; i += OVERSAMPLING, j++) {
+			in_os[i+0] = interpolate(hb, in_L[pos + j]);
+			in_os[i+1] = interpolate(hb, in_L[pos + j]);
+		}
+
+		for (ulong i = 0; i < rem*OVERSAMPLING; i++) {
+			out_os[i] = process_one(in_os[i], drives[i], wets[i]);
+		}
+
+		hb = &data->hbd_L;
+		for (ulong i = 0, j = 0; j < rem; i += OVERSAMPLING, j++) {
+			decimate(hb, out_os[i+0]); // return value unused here
+			out_L[pos + j] = decimate(hb, out_os[i+1]);
+		}
+
+		// right channel
+		hb = &data->hbu_R;
+		for (ulong i = 0, j = 0; j < rem; i += OVERSAMPLING, j++) {
+			in_os[i+0] = interpolate(hb, in_R[pos + j]);
+			in_os[i+1] = interpolate(hb, in_R[pos + j]);
+		}
+
+		for (ulong i = 0; i < rem*OVERSAMPLING; i++) {
+			out_os[i] = process_one(in_os[i], drives[i], wets[i]);
+		}
+
+		hb = &data->hbd_R;
+		for (ulong i = 0, j = 0; j < rem; i += OVERSAMPLING, j++) {
+			decimate(hb, out_os[i+0]); // return value unused here
+			out_R[pos + j] = decimate(hb, out_os[i+1]);
+		}
 	}
 }
 
-static void
+/* Single-precision entry point: converts each chunk of up to BLOCK_SIZE
+ * samples to double, runs process_double() on the chunk, and converts
+ * the result back into the caller's float buffers. */
+INNER void
+process(personal *data,
+    float *in_L, float *in_R,
+    float *out_L, float *out_R,
+    ulong count)
+{
+	double  in_L2[BLOCK_SIZE],  in_R2[BLOCK_SIZE];
+	double out_L2[BLOCK_SIZE], out_R2[BLOCK_SIZE];
+
+	for (ulong pos = 0; pos < count; pos += BLOCK_SIZE) {
+		ulong rem = count - pos;
+		if (rem > BLOCK_SIZE)
+			rem = BLOCK_SIZE;
+
+		// index from pos rather than advancing the caller's pointers,
+		// which would overshoot the buffers after a short final chunk.
+		for (ulong i = 0; i < rem; i++)
+			in_L2[i] = in_L[pos + i];
+		for (ulong i = 0; i < rem; i++)
+			in_R2[i] = in_R[pos + i];
+
+		process_double(data, in_L2, in_R2, out_L2, out_R2, rem);
+
+		for (ulong i = 0; i < rem; i++)
+			out_L[pos + i] = out_L2[i];
+		for (ulong i = 0; i < rem; i++)
+			out_R[pos + i] = out_R2[i];
+	}
+}
+
+INNER void
 resume(personal *data)
 {
 	memset(&data->hbu_L, 0, sizeof(halfband_t));
@@ -108,17 +167,17 @@ resume(personal *data)
 	memset(&data->hbd_R, 0, sizeof(halfband_t));
 }
 
-static void
+INNER void
 pause(personal *data)
 {}
 
-static void
+INNER void
 construct(personal *data)
 {
 	memset(data, 0, sizeof(personal));
 }
 
-static void
+INNER void
 construct_params(param *params)
 {
 	sprintf(params[0].name, "Drive");
@@ -137,12 +196,12 @@ construct_params(param *params)
 	param_reset(&params[1]);
 }
 
-static void
+INNER void
 destruct(personal *data)
 {}
 
-static void
-adjust(personal *data, param *params, unsigned long fs_long)
+INNER void
+adjust(personal *data, param *params, ulong fs_long)
 {
 	resume(data);
 	double fs = fs_long;
@@ -156,7 +215,7 @@ adjust(personal *data, param *params, unsigned long fs_long)
 	data->wet.log = 0;
 }
 
-static void
+INNER void
 adjust_one(personal *data, param *params, unsigned int index)
 {
 	data->drive.desired = DB2LIN(params[0].value);
diff --git a/include/os2piir.h b/include/os2piir.h
index e08d1b5..4da9704 100644
--- a/include/os2piir.h
+++ b/include/os2piir.h
@@ -16,7 +16,7 @@ typedef struct {
 	int i;
 } halfband_t;
 
-static void
+INNER void
 halfband_a(double a[8], double ao[8], double x0, double x2)
 {
 	a[0] = x2    + (x0   - ao[0])*0.006185967461045014;
@@ -29,7 +29,7 @@ halfband_a(double a[8], double ao[8], double x0, double x2)
 	a[7] = ao[6] + (a[6] - ao[7])*0.862917812650502936;
 }
 
-static void
+INNER void
 halfband_b(double b[8], double bo[8], double x1, double x3)
 {
 	b[0] = x3    + (x1   - bo[0])*0.024499027624721819;
@@ -42,7 +42,7 @@ halfband_b(double b[8], double bo[8], double x1, double x3)
 	b[7] = bo[6] + (b[6] - bo[7])*0.952428157718303137;
 }
 
-static double
+INNER double
 halfband(halfband_t *h, double x0)
 {
 	double a[8], b[8];
@@ -58,7 +58,7 @@ halfband(halfband_t *h, double x0)
 	return (a[7] + b[7])*0.5;
 }
 
-static double
+INNER double
 decimate(halfband_t *h, double x0)
 {
 	double c[8];
@@ -78,7 +78,7 @@ decimate(halfband_t *h, double x0)
 }
 
 // note: do not zero-stuff! send the input each time.
-static double
+INNER double
 interpolate(halfband_t *h, double x0)
 {
 	double c[8];
diff --git a/include/util.h b/include/util.h
index 8ec4f57..49a9890 100644
--- a/include/util.h
+++ b/include/util.h
@@ -4,7 +4,9 @@
 #include <xmmintrin.h>
 #endif
 
-static void
+#define INNER static inline
+
+INNER void
 disable_denormals();
 
 #define LIMIT(v,l,u) ((v)<(l)?(l):((v)>(u)?(u):(v)))
@@ -22,10 +24,10 @@ typedef struct {
 	double b0, b1, b2, a0, a1, a2;
 } biquad_interim;
 
-static float
+INNER float
 whitenoise();
 
-static void
+INNER void
 biquad_init(biquad *bq);
 
 typedef enum {
@@ -41,16 +43,16 @@ typedef enum {
 	FILT_GAIN
 } filter_t;
 
-static biquad
+INNER biquad
 biquad_gen(filter_t type, double fc, double gain, double bw, double fs);
 
 /* s-plane to z-plane */
-static biquad_interim
+INNER biquad_interim
 design(double cw, double sw,
     double num0, double num1, double num2,
     double den0, double den1, double den2);
 
-static double
+INNER double
 biquad_run(biquad *bq, double x);
 
 #include "util_def.h"
diff --git a/include/util_def.h b/include/util_def.h
index 3d84f32..10814dd 100644
--- a/include/util_def.h
+++ b/include/util_def.h
@@ -2,7 +2,7 @@
 #include <math.h>
 #include <stdint.h>
 
-static void
+INNER void
 disable_denormals()
 {
 	#if __SSE2__
@@ -13,7 +13,7 @@ disable_denormals()
 /* via http://www.rgba.org/articles/sfrand/sfrand.htm */
 static unsigned int mirand = 1;
 
-static float
+INNER float
 whitenoise()
 {
 	union either {
@@ -27,13 +27,13 @@ whitenoise()
 
 /* used to resemble https://github.com/swh/ladspa/blob/master/util/biquad.h */
 
-static void
+INNER void
 biquad_init(biquad *bq)
 {
 	bq->x1 = bq->x2 = bq->y1 = bq->y2 = 0;
 }
 
-static biquad_interim
+INNER biquad_interim
 design(double cw, double sw,
     double num0, double num1, double num2,
     double den0, double den1, double den2)
@@ -48,7 +48,7 @@ design(double cw, double sw,
 	};
 }
 
-static biquad
+INNER biquad
 biquad_gen(filter_t type, double fc, double gain, double bw, double fs)
 {
 	double w0, cw, sw, A, As, Q;
@@ -88,7 +88,7 @@ biquad_gen(filter_t type, double fc, double gain, double bw, double fs)
 	return out;
 }
 
-static double
+INNER double
 biquad_run(biquad *bq, double x)
 {
 	double y;
diff --git a/util/bench.c b/util/bench.c
index 6f303b3..9ce1ad9 100644
--- a/util/bench.c
+++ b/util/bench.c
@@ -1,28 +1,28 @@
+#include <alloca.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <assert.h>
 #include <time.h>
+#include <math.h>
 
 #include "dlfcn.h"
 #include "ladspa.h"
 #include "util.h"
 
-enum {
-	BLOCK_SIZE=2048
-};
+#define BLOCK_SIZE 2048
 
 void *plug = NULL;
 static float *audio_buffer;
 static int audio_count = 0;
 
-static void
+INNER void
 cleanup()
 {
 	dlclose(plug);
 	if (audio_count) free(audio_buffer);
 }
 
-static const LADSPA_Descriptor*
+INNER const LADSPA_Descriptor*
 load_ladspa(char *path)
 {
 	plug = dlopen(path, RTLD_NOW);
@@ -38,6 +38,52 @@ load_ladspa(char *path)
 	return d;
 }
 
+/* value `percent` (0..1) of the way from min to max; geometric when
+ * logscale is set. follows the DEFAULT_LOW/MIDDLE/HIGH rules in ladspa.h. */
+INNER float
+between(float percent, float min, float max, int logscale)
+{
+	if (logscale) return exp(log(min)*(1 - percent) + log(max)*percent);
+	return min*(1 - percent) + max*percent;
+}
+
+/* choose a starting value for a control port from its LADSPA range hint */
+INNER float
+get_default(LADSPA_PortRangeHint hint)
+{
+	/* stays 0 for DEFAULT_0 and for ports that give no default */
+	float x = 0;
+	int hd = hint.HintDescriptor;
+	float min = hint.LowerBound;
+	float max = hint.UpperBound;
+	int logscale = LADSPA_IS_HINT_LOGARITHMIC(hd);
+	if (LADSPA_IS_HINT_DEFAULT_1(hd))
+		x = 1;
+	if (LADSPA_IS_HINT_DEFAULT_100(hd))
+		x = 100;
+	if (LADSPA_IS_HINT_DEFAULT_440(hd))
+		x = 440;
+	if (LADSPA_IS_HINT_DEFAULT_MINIMUM(hd))
+		x = min;
+	if (LADSPA_IS_HINT_DEFAULT_LOW(hd))
+		x = between(0.25, min, max, logscale);
+	if (LADSPA_IS_HINT_DEFAULT_MIDDLE(hd))
+		x = between(0.50, min, max, logscale);
+	if (LADSPA_IS_HINT_DEFAULT_HIGH(hd))
+		x = between(0.75, min, max, logscale);
+	if (LADSPA_IS_HINT_DEFAULT_MAXIMUM(hd))
+		x = max;
+	if (LADSPA_IS_HINT_INTEGER(hd))
+		x = round(x);
+	/* toggled ports carry no bounds in ladspa.h: <= 0 is off, > 0 is on */
+	if (LADSPA_IS_HINT_TOGGLED(hd))
+		return x > 0 ? 1 : 0;
+	/* a bound field is only meaningful when the hint flags it as present */
+	if (LADSPA_IS_HINT_BOUNDED_BELOW(hd) && x < min) x = min;
+	if (LADSPA_IS_HINT_BOUNDED_ABOVE(hd) && x > max) x = max;
+	return x;
+}
+
 int
 main(int argc, char **argv)
 {
@@ -56,9 +102,15 @@ main(int argc, char **argv)
 	audio_buffer = calloc(audio_count*BLOCK_SIZE, sizeof(float));
 
 	int a = 0;
-	for (int i = 0; i < d->PortCount; i++)
-		if (LADSPA_IS_PORT_AUDIO(d->PortDescriptors[i]))
+	for (int i = 0; i < d->PortCount; i++) {
+		if (LADSPA_IS_PORT_AUDIO(d->PortDescriptors[i])) {
 			d->connect_port(h, i, audio_buffer + a++*BLOCK_SIZE);
+		} else {
+			float *x = alloca(sizeof(float));
+			*x = get_default(d->PortRangeHints[i]);
+			d->connect_port(h, i, x);
+		}
+	}
 
 	mirand = time(NULL);
 	for (int i = 0; i < audio_count*BLOCK_SIZE; i++)
diff --git a/util/benchtime b/util/benchtime
index 3cec957..9748972 100755
--- a/util/benchtime
+++ b/util/benchtime
@@ -13,4 +13,4 @@ for i in {1..8}; do
     time "$bench" "$against"
 done 2>&1 >/dev/null | awk 'BEGIN{m=999;printf " …\033[90m"}
 {a+=$1;n++;m=$1<m?$1:m;printf " %6.3f",$1}
-END{printf "\033[0m\nmin %6.3f  —  avg %6.3f  —  total %7.3f\n",m,a/n,a}'
+END{printf "\033[0m\nmin %6.3f  —  avg %6.3f  —  sum %7.3f\n",m,a/n,a}'