diff --git a/Makefile b/Makefile
index d9a3e18..adfdb79 100644
--- a/Makefile
+++ b/Makefile
@@ -34,8 +34,8 @@ VST_SRC = ${VST_CPP:%=$(VST_CPP_DIR)/%}
 VST_OBJ = ${VST_CPP:%.cpp=$(BIN)/%.o}
 VST_DEF = $(VST_SDK_DIR)/public.sdk/samples/vst2.x/win/vstplug.def
 
-INLINE_FLAGS = -Winline -finline-limit=1000
-GENERAL_FLAGS = -Wall -Wno-unused-function -I include $(INLINE_FLAGS)
+INLINE_FLAGS = -Winline
+GENERAL_FLAGS = -Wall -Wno-unused-function -Wno-sign-compare -I include $(INLINE_FLAGS)
 ALL_CFLAGS = $(GENERAL_FLAGS) -std=gnu11 $(CFLAGS)
 ALL_CXXFLAGS = $(GENERAL_FLAGS) $(CXXFLAGS)
 ALL_LDFLAGS = -lm $(LDFLAGS)
@@ -44,9 +44,7 @@ LADSPA_FLAGS =
 VST_FLAGS = -Wno-write-strings -Wno-narrowing
 VST_FLAGS += -I $(VST_SDK_DIR) -DBUILDING_DLL=1
 
-# specifying core2 as the target architecture
-# seems significantly faster, even on newer processors. ymmv.
-OPT_FLAGS = -Ofast -march=core2 -mfpmath=sse
+OPT_FLAGS = -Ofast -march=native -mfpmath=sse
 
 # any possibly produced files besides intermediates
 ALL = $(SHOBJ) $(PROGRAM) $(BIN)/vstsdk.o $(EXE) $(DLL)
@@ -57,7 +55,7 @@ ALL = $(SHOBJ) $(PROGRAM) $(BIN)/vstsdk.o $(EXE) $(DLL)
 
 .PHONY: all options clean dist pretest ladspa vst $(UTILS)
 .PHONY: benchmark windows linux
-all: pretest ladspa
+all: pretest ladspa vst
 
 exe: $(EXE)
 
diff --git a/crap/eq_const.h b/crap/eq_const.h
index dd9e995..8aa9644 100644
--- a/crap/eq_const.h
+++ b/crap/eq_const.h
@@ -24,7 +24,7 @@ process_double(personal *data,
 {
 	disable_denormals();
 
-	double buf[2*BLOCK_SIZE];
+	v2df buf[BLOCK_SIZE];
 
 	biquad *f0, *f1;
 
@@ -34,8 +34,8 @@ process_double(personal *data,
 			rem = count - pos;
 
 		for (ulong i = 0; i < rem; i++) {
-			buf[i*2+0] = in_L[i];
-			buf[i*2+1] = in_R[i];
+			buf[i][0] = in_L[i];
+			buf[i][1] = in_R[i];
 		}
 
 		f0 = data->filters[0];
@@ -47,8 +47,8 @@ process_double(personal *data,
 		}
 
 		for (ulong i = 0; i < rem; i++) {
-			out_L[i] = buf[i*2+0];
-			out_R[i] = buf[i*2+1];
+			out_L[i] = buf[i][0];
+			out_R[i] = buf[i][1];
 		}
 
 		in_L += BLOCK_SIZE;
diff --git a/include/util.h b/include/util.h
index a443eb6..971fb8f 100644
--- a/include/util.h
+++ b/include/util.h
@@ -5,6 +5,7 @@
 #endif
 
 #define INNER static inline
+typedef double v2df __attribute__((vector_size(16), aligned(16))); /* two doubles packed in one 16-byte, 16-byte-aligned GCC vector */
 typedef unsigned long ulong;
 
 INNER void
@@ -48,7 +49,7 @@ INNER biquad
 biquad_gen(filter_t type, double fc, double gain, double bw, double fs);
 
 /* s-plane to z-plane */
-INNER biquad_interim
+static biquad_interim
 design(double cw, double sw,
     double num0, double num1, double num2,
     double den0, double den1, double den2);
diff --git a/include/util_def.h b/include/util_def.h
index 5a5c921..b0e4047 100644
--- a/include/util_def.h
+++ b/include/util_def.h
@@ -33,7 +33,7 @@ biquad_init(biquad *bq)
 	bq->x1 = bq->x2 = bq->y1 = bq->y2 = 0;
 }
 
-INNER biquad_interim
+static biquad_interim
 design(double cw, double sw,
     double num0, double num1, double num2,
     double den0, double den1, double den2)
@@ -105,51 +105,37 @@ biquad_run(biquad *bq, double x)
 
 INNER void
 biquad_run_block_stereo(biquad *bq_L, biquad *bq_R,
-    double *buf, ulong count)
-#ifdef __SSE2__
+    v2df *buf, ulong count) /* filters buf[0..count-1] in place, two lanes per element */
 {
-	__m128d b0, b1, b2, a1, a2, x1, x2, y1, y2;
+	v2df b0, b1, b2, a1, a2, x1, x2, y1, y2;
 
-	b0 = _mm_set1_pd(bq_L->b0);
-	b1 = _mm_set1_pd(bq_L->b1);
-	b2 = _mm_set1_pd(bq_L->b2);
-	a1 = _mm_set1_pd(bq_L->a1);
-	a2 = _mm_set1_pd(bq_L->a2);
+	b0 = (v2df){bq_L->b0, bq_L->b0}; /* both lanes use bq_L's coefficients; bq_R's are never read */
+	b1 = (v2df){bq_L->b1, bq_L->b1};
+	b2 = (v2df){bq_L->b2, bq_L->b2};
+	a1 = (v2df){bq_L->a1, bq_L->a1};
+	a2 = (v2df){bq_L->a2, bq_L->a2};
 
-	x1 = _mm_setr_pd(bq_L->x1, bq_R->x1);
-	x2 = _mm_setr_pd(bq_L->x2, bq_R->x2);
-	y1 = _mm_setr_pd(bq_L->y1, bq_R->y1);
-	y2 = _mm_setr_pd(bq_L->y2, bq_R->y2);
+	x1 = (v2df){bq_L->x1, bq_R->x1}; /* filter history: lane 0 from bq_L, lane 1 from bq_R */
+	x2 = (v2df){bq_L->x2, bq_R->x2};
+	y1 = (v2df){bq_L->y1, bq_R->y1};
+	y2 = (v2df){bq_L->y2, bq_R->y2};
 
-	for (int i = 0; i < 2*count; i += 2) {
-		__m128d x = _mm_load_pd(buf + i);
-		__m128d y = b0*x + b1*x1 + b2*x2 + a1*y1 + a2*y2;
+	for (ulong i = 0; i < count; i++) {
+		v2df x = buf[i];
+		v2df y = b0*x + b1*x1 + b2*x2 + a1*y1 + a2*y2; /* element-wise per lane; the a1/a2 feedback terms are added, not subtracted */
 		x2 = x1;
 		y2 = y1;
 		x1 = x;
 		y1 = y;
-		_mm_store_pd(buf + i, y);
+		buf[i] = y;
 	}
 
-	double temp[8];
-	_mm_store_pd(temp+0, x1);
-	_mm_store_pd(temp+2, x2);
-	_mm_store_pd(temp+4, y1);
-	_mm_store_pd(temp+6, y2);
-	bq_L->x1 = temp[0];
-	bq_R->x1 = temp[1];
-	bq_L->x2 = temp[2];
-	bq_R->x2 = temp[3];
-	bq_L->y1 = temp[4];
-	bq_R->y1 = temp[5];
-	bq_L->y2 = temp[6];
-	bq_R->y2 = temp[7];
+	bq_L->x1 = x1[0]; /* write the updated history back: lane 0 to bq_L, lane 1 to bq_R */
+	bq_R->x1 = x1[1];
+	bq_L->x2 = x2[0];
+	bq_R->x2 = x2[1];
+	bq_L->y1 = y1[0];
+	bq_R->y1 = y1[1];
+	bq_L->y2 = y2[0];
+	bq_R->y2 = y2[1];
 }
-#else
-{
-	for (ulong i = 0; i < 2*count; i += 2) {
-		buf[i+0] = biquad_run(bq_L, buf[i+0]);
-		buf[i+1] = biquad_run(bq_R, buf[i+1]);
-	}
-}
-#endif