use vectors instead of intrinsics

This commit is contained in:
Connor Olding 2015-04-06 11:26:47 -07:00
parent 75fa193a90
commit 4022d11349
4 changed files with 35 additions and 50 deletions

View file

@@ -34,8 +34,8 @@ VST_SRC = ${VST_CPP:%=$(VST_CPP_DIR)/%}
 VST_OBJ = ${VST_CPP:%.cpp=$(BIN)/%.o}
 VST_DEF = $(VST_SDK_DIR)/public.sdk/samples/vst2.x/win/vstplug.def
-INLINE_FLAGS = -Winline -finline-limit=1000
-GENERAL_FLAGS = -Wall -Wno-unused-function -I include $(INLINE_FLAGS)
+INLINE_FLAGS = -Winline
+GENERAL_FLAGS = -Wall -Wno-unused-function -Wno-sign-compare -I include $(INLINE_FLAGS)
 ALL_CFLAGS = $(GENERAL_FLAGS) -std=gnu11 $(CFLAGS)
 ALL_CXXFLAGS = $(GENERAL_FLAGS) $(CXXFLAGS)
 ALL_LDFLAGS = -lm $(LDFLAGS)
@@ -44,9 +44,7 @@ LADSPA_FLAGS =
 VST_FLAGS = -Wno-write-strings -Wno-narrowing
 VST_FLAGS += -I $(VST_SDK_DIR) -DBUILDING_DLL=1
-# specifying core2 as the target architecture
-# seems significantly faster, even on newer processors. ymmv.
-OPT_FLAGS = -Ofast -march=core2 -mfpmath=sse
+OPT_FLAGS = -Ofast -march=native -mfpmath=sse
 # any possibly produced files besides intermediates
 ALL = $(SHOBJ) $(PROGRAM) $(BIN)/vstsdk.o $(EXE) $(DLL)
@@ -57,7 +55,7 @@ ALL = $(SHOBJ) $(PROGRAM) $(BIN)/vstsdk.o $(EXE) $(DLL)
 .PHONY: all options clean dist pretest ladspa vst $(UTILS)
 .PHONY: benchmark windows linux
-all: pretest ladspa
+all: pretest ladspa vst
 exe: $(EXE)

View file

@@ -24,7 +24,7 @@ process_double(personal *data,
 {
 	disable_denormals();
-	double buf[2*BLOCK_SIZE];
+	v2df buf[BLOCK_SIZE];
 	biquad *f0, *f1;
@@ -34,8 +34,8 @@ process_double(personal *data,
 	rem = count - pos;
 	for (ulong i = 0; i < rem; i++) {
-		buf[i*2+0] = in_L[i];
-		buf[i*2+1] = in_R[i];
+		buf[i][0] = in_L[i];
+		buf[i][1] = in_R[i];
 	}
 	f0 = data->filters[0];
@@ -47,8 +47,8 @@ process_double(personal *data,
 	}
 	for (ulong i = 0; i < rem; i++) {
-		out_L[i] = buf[i*2+0];
-		out_R[i] = buf[i*2+1];
+		out_L[i] = buf[i][0];
+		out_R[i] = buf[i][1];
 	}
 	in_L += BLOCK_SIZE;

View file

@@ -5,6 +5,7 @@
 #endif
 #define INNER static inline
+typedef double v2df __attribute__((vector_size(16), aligned(16)));
 typedef unsigned long ulong;
 INNER void
@@ -48,7 +49,7 @@ INNER biquad
 biquad_gen(filter_t type, double fc, double gain, double bw, double fs);
 /* s-plane to z-plane */
-INNER biquad_interim
+static biquad_interim
 design(double cw, double sw,
        double num0, double num1, double num2,
        double den0, double den1, double den2);

View file

@@ -33,7 +33,7 @@ biquad_init(biquad *bq)
 	bq->x1 = bq->x2 = bq->y1 = bq->y2 = 0;
 }
-INNER biquad_interim
+static biquad_interim
 design(double cw, double sw,
        double num0, double num1, double num2,
        double den0, double den1, double den2)
@@ -105,51 +105,37 @@ biquad_run(biquad *bq, double x)
 INNER void
 biquad_run_block_stereo(biquad *bq_L, biquad *bq_R,
-                        double *buf, ulong count)
-#ifdef __SSE2__
+                        v2df *buf, ulong count)
 {
-	__m128d b0, b1, b2, a1, a2, x1, x2, y1, y2;
-	b0 = _mm_set1_pd(bq_L->b0);
-	b1 = _mm_set1_pd(bq_L->b1);
-	b2 = _mm_set1_pd(bq_L->b2);
-	a1 = _mm_set1_pd(bq_L->a1);
-	a2 = _mm_set1_pd(bq_L->a2);
-	x1 = _mm_setr_pd(bq_L->x1, bq_R->x1);
-	x2 = _mm_setr_pd(bq_L->x2, bq_R->x2);
-	y1 = _mm_setr_pd(bq_L->y1, bq_R->y1);
-	y2 = _mm_setr_pd(bq_L->y2, bq_R->y2);
-	for (int i = 0; i < 2*count; i += 2) {
-		__m128d x = _mm_load_pd(buf + i);
-		__m128d y = b0*x + b1*x1 + b2*x2 + a1*y1 + a2*y2;
+	v2df b0, b1, b2, a1, a2, x1, x2, y1, y2;
+	b0 = (v2df){bq_L->b0, bq_L->b0};
+	b1 = (v2df){bq_L->b1, bq_L->b1};
+	b2 = (v2df){bq_L->b2, bq_L->b2};
+	a1 = (v2df){bq_L->a1, bq_L->a1};
+	a2 = (v2df){bq_L->a2, bq_L->a2};
+	x1 = (v2df){bq_L->x1, bq_R->x1};
+	x2 = (v2df){bq_L->x2, bq_R->x2};
+	y1 = (v2df){bq_L->y1, bq_R->y1};
+	y2 = (v2df){bq_L->y2, bq_R->y2};
+	for (ulong i = 0; i < count; i++) {
+		v2df x = buf[i];
+		v2df y = b0*x + b1*x1 + b2*x2 + a1*y1 + a2*y2;
 		x2 = x1;
 		y2 = y1;
 		x1 = x;
 		y1 = y;
-		_mm_store_pd(buf + i, y);
+		buf[i] = y;
 	}
-	double temp[8];
-	_mm_store_pd(temp+0, x1);
-	_mm_store_pd(temp+2, x2);
-	_mm_store_pd(temp+4, y1);
-	_mm_store_pd(temp+6, y2);
-	bq_L->x1 = temp[0];
-	bq_R->x1 = temp[1];
-	bq_L->x2 = temp[2];
-	bq_R->x2 = temp[3];
-	bq_L->y1 = temp[4];
-	bq_R->y1 = temp[5];
-	bq_L->y2 = temp[6];
-	bq_R->y2 = temp[7];
+	bq_L->x1 = x1[0];
+	bq_R->x1 = x1[1];
+	bq_L->x2 = x2[0];
+	bq_R->x2 = x2[1];
+	bq_L->y1 = y1[0];
+	bq_R->y1 = y1[1];
+	bq_L->y2 = y2[0];
+	bq_R->y2 = y2[1];
 }
-#else
-{
-	for (ulong i = 0; i < 2*count; i += 2) {
-		buf[i+0] = biquad_run(bq_L, buf[i+0]);
-		buf[i+1] = biquad_run(bq_R, buf[i+1]);
-	}
-}
-#endif