diff --git a/Makefile b/Makefile index d9a3e18..adfdb79 100644 --- a/Makefile +++ b/Makefile @@ -34,8 +34,8 @@ VST_SRC = ${VST_CPP:%=$(VST_CPP_DIR)/%} VST_OBJ = ${VST_CPP:%.cpp=$(BIN)/%.o} VST_DEF = $(VST_SDK_DIR)/public.sdk/samples/vst2.x/win/vstplug.def -INLINE_FLAGS = -Winline -finline-limit=1000 -GENERAL_FLAGS = -Wall -Wno-unused-function -I include $(INLINE_FLAGS) +INLINE_FLAGS = -Winline +GENERAL_FLAGS = -Wall -Wno-unused-function -Wno-sign-compare -I include $(INLINE_FLAGS) ALL_CFLAGS = $(GENERAL_FLAGS) -std=gnu11 $(CFLAGS) ALL_CXXFLAGS = $(GENERAL_FLAGS) $(CXXFLAGS) ALL_LDFLAGS = -lm $(LDFLAGS) @@ -44,9 +44,7 @@ LADSPA_FLAGS = VST_FLAGS = -Wno-write-strings -Wno-narrowing VST_FLAGS += -I $(VST_SDK_DIR) -DBUILDING_DLL=1 -# specifying core2 as the target architecture -# seems significantly faster, even on newer processors. ymmv. -OPT_FLAGS = -Ofast -march=core2 -mfpmath=sse +OPT_FLAGS = -Ofast -march=native -mfpmath=sse # any possibly produced files besides intermediates ALL = $(SHOBJ) $(PROGRAM) $(BIN)/vstsdk.o $(EXE) $(DLL) @@ -57,7 +55,7 @@ ALL = $(SHOBJ) $(PROGRAM) $(BIN)/vstsdk.o $(EXE) $(DLL) .PHONY: all options clean dist pretest ladspa vst $(UTILS) .PHONY: benchmark windows linux -all: pretest ladspa +all: pretest ladspa vst exe: $(EXE) diff --git a/crap/eq_const.h b/crap/eq_const.h index dd9e995..8aa9644 100644 --- a/crap/eq_const.h +++ b/crap/eq_const.h @@ -24,7 +24,7 @@ process_double(personal *data, { disable_denormals(); - double buf[2*BLOCK_SIZE]; + v2df buf[BLOCK_SIZE]; biquad *f0, *f1; @@ -34,8 +34,8 @@ process_double(personal *data, rem = count - pos; for (ulong i = 0; i < rem; i++) { - buf[i*2+0] = in_L[i]; - buf[i*2+1] = in_R[i]; + buf[i][0] = in_L[i]; + buf[i][1] = in_R[i]; } f0 = data->filters[0]; @@ -47,8 +47,8 @@ process_double(personal *data, } for (ulong i = 0; i < rem; i++) { - out_L[i] = buf[i*2+0]; - out_R[i] = buf[i*2+1]; + out_L[i] = buf[i][0]; + out_R[i] = buf[i][1]; } in_L += BLOCK_SIZE; diff --git a/include/util.h b/include/util.h index a443eb6..971fb8f 100644 --- a/include/util.h +++ b/include/util.h @@ -5,6 +5,7 @@ #endif #define INNER static inline +typedef double v2df __attribute__((vector_size(16), aligned(16))); typedef unsigned long ulong; INNER void @@ -48,7 +49,7 @@ INNER biquad biquad_gen(filter_t type, double fc, double gain, double bw, double fs); /* s-plane to z-plane */ -INNER biquad_interim +static biquad_interim design(double cw, double sw, double num0, double num1, double num2, double den0, double den1, double den2); diff --git a/include/util_def.h b/include/util_def.h index 5a5c921..b0e4047 100644 --- a/include/util_def.h +++ b/include/util_def.h @@ -33,7 +33,7 @@ biquad_init(biquad *bq) bq->x1 = bq->x2 = bq->y1 = bq->y2 = 0; } -INNER biquad_interim +static biquad_interim design(double cw, double sw, double num0, double num1, double num2, double den0, double den1, double den2) @@ -105,51 +105,37 @@ biquad_run(biquad *bq, double x) INNER void biquad_run_block_stereo(biquad *bq_L, biquad *bq_R, - double *buf, ulong count) -#ifdef __SSE2__ + v2df *buf, ulong count) { - __m128d b0, b1, b2, a1, a2, x1, x2, y1, y2; + v2df b0, b1, b2, a1, a2, x1, x2, y1, y2; - b0 = _mm_set1_pd(bq_L->b0); - b1 = _mm_set1_pd(bq_L->b1); - b2 = _mm_set1_pd(bq_L->b2); - a1 = _mm_set1_pd(bq_L->a1); - a2 = _mm_set1_pd(bq_L->a2); + b0 = (v2df){bq_L->b0, bq_L->b0}; + b1 = (v2df){bq_L->b1, bq_L->b1}; + b2 = (v2df){bq_L->b2, bq_L->b2}; + a1 = (v2df){bq_L->a1, bq_L->a1}; + a2 = (v2df){bq_L->a2, bq_L->a2}; - x1 = _mm_setr_pd(bq_L->x1, bq_R->x1); - x2 = _mm_setr_pd(bq_L->x2, bq_R->x2); - y1 = _mm_setr_pd(bq_L->y1, bq_R->y1); - y2 = _mm_setr_pd(bq_L->y2, bq_R->y2); + x1 = (v2df){bq_L->x1, bq_R->x1}; + x2 = (v2df){bq_L->x2, bq_R->x2}; + y1 = (v2df){bq_L->y1, bq_R->y1}; + y2 = (v2df){bq_L->y2, bq_R->y2}; - for (int i = 0; i < 2*count; i += 2) { - __m128d x = _mm_load_pd(buf + i); - __m128d y = b0*x + b1*x1 + b2*x2 + a1*y1 + a2*y2; + for (ulong i = 0; i < count; i++) { + v2df x = buf[i]; + v2df y = b0*x + b1*x1 + b2*x2 + a1*y1 + a2*y2; x2 = x1; y2 = y1; x1 = x; y1 = y; - _mm_store_pd(buf + i, y); + buf[i] = y; } - double temp[8]; - _mm_store_pd(temp+0, x1); - _mm_store_pd(temp+2, x2); - _mm_store_pd(temp+4, y1); - _mm_store_pd(temp+6, y2); - bq_L->x1 = temp[0]; - bq_R->x1 = temp[1]; - bq_L->x2 = temp[2]; - bq_R->x2 = temp[3]; - bq_L->y1 = temp[4]; - bq_R->y1 = temp[5]; - bq_L->y2 = temp[6]; - bq_R->y2 = temp[7]; + bq_L->x1 = x1[0]; + bq_R->x1 = x1[1]; + bq_L->x2 = x2[0]; + bq_R->x2 = x2[1]; + bq_L->y1 = y1[0]; + bq_R->y1 = y1[1]; + bq_L->y2 = y2[0]; + bq_R->y2 = y2[1]; } -#else -{ - for (ulong i = 0; i < 2*count; i += 2) { - buf[i+0] = biquad_run(bq_L, buf[i+0]); - buf[i+1] = biquad_run(bq_R, buf[i+1]); - } -} -#endif