use vectors instead of intrinsics
This commit is contained in:
parent
75fa193a90
commit
4022d11349
4 changed files with 35 additions and 50 deletions
10
Makefile
10
Makefile
|
@ -34,8 +34,8 @@ VST_SRC = ${VST_CPP:%=$(VST_CPP_DIR)/%}
|
||||||
VST_OBJ = ${VST_CPP:%.cpp=$(BIN)/%.o}
|
VST_OBJ = ${VST_CPP:%.cpp=$(BIN)/%.o}
|
||||||
VST_DEF = $(VST_SDK_DIR)/public.sdk/samples/vst2.x/win/vstplug.def
|
VST_DEF = $(VST_SDK_DIR)/public.sdk/samples/vst2.x/win/vstplug.def
|
||||||
|
|
||||||
INLINE_FLAGS = -Winline -finline-limit=1000
|
INLINE_FLAGS = -Winline
|
||||||
GENERAL_FLAGS = -Wall -Wno-unused-function -I include $(INLINE_FLAGS)
|
GENERAL_FLAGS = -Wall -Wno-unused-function -Wno-sign-compare -I include $(INLINE_FLAGS)
|
||||||
ALL_CFLAGS = $(GENERAL_FLAGS) -std=gnu11 $(CFLAGS)
|
ALL_CFLAGS = $(GENERAL_FLAGS) -std=gnu11 $(CFLAGS)
|
||||||
ALL_CXXFLAGS = $(GENERAL_FLAGS) $(CXXFLAGS)
|
ALL_CXXFLAGS = $(GENERAL_FLAGS) $(CXXFLAGS)
|
||||||
ALL_LDFLAGS = -lm $(LDFLAGS)
|
ALL_LDFLAGS = -lm $(LDFLAGS)
|
||||||
|
@ -44,9 +44,7 @@ LADSPA_FLAGS =
|
||||||
VST_FLAGS = -Wno-write-strings -Wno-narrowing
|
VST_FLAGS = -Wno-write-strings -Wno-narrowing
|
||||||
VST_FLAGS += -I $(VST_SDK_DIR) -DBUILDING_DLL=1
|
VST_FLAGS += -I $(VST_SDK_DIR) -DBUILDING_DLL=1
|
||||||
|
|
||||||
# specifying core2 as the target architecture
|
OPT_FLAGS = -Ofast -march=native -mfpmath=sse
|
||||||
# seems significantly faster, even on newer processors. ymmv.
|
|
||||||
OPT_FLAGS = -Ofast -march=core2 -mfpmath=sse
|
|
||||||
|
|
||||||
# any possibly produced files besides intermediates
|
# any possibly produced files besides intermediates
|
||||||
ALL = $(SHOBJ) $(PROGRAM) $(BIN)/vstsdk.o $(EXE) $(DLL)
|
ALL = $(SHOBJ) $(PROGRAM) $(BIN)/vstsdk.o $(EXE) $(DLL)
|
||||||
|
@ -57,7 +55,7 @@ ALL = $(SHOBJ) $(PROGRAM) $(BIN)/vstsdk.o $(EXE) $(DLL)
|
||||||
|
|
||||||
.PHONY: all options clean dist pretest ladspa vst $(UTILS)
|
.PHONY: all options clean dist pretest ladspa vst $(UTILS)
|
||||||
.PHONY: benchmark windows linux
|
.PHONY: benchmark windows linux
|
||||||
all: pretest ladspa
|
all: pretest ladspa vst
|
||||||
|
|
||||||
exe: $(EXE)
|
exe: $(EXE)
|
||||||
|
|
||||||
|
|
|
@ -24,7 +24,7 @@ process_double(personal *data,
|
||||||
{
|
{
|
||||||
disable_denormals();
|
disable_denormals();
|
||||||
|
|
||||||
double buf[2*BLOCK_SIZE];
|
v2df buf[BLOCK_SIZE];
|
||||||
|
|
||||||
biquad *f0, *f1;
|
biquad *f0, *f1;
|
||||||
|
|
||||||
|
@ -34,8 +34,8 @@ process_double(personal *data,
|
||||||
rem = count - pos;
|
rem = count - pos;
|
||||||
|
|
||||||
for (ulong i = 0; i < rem; i++) {
|
for (ulong i = 0; i < rem; i++) {
|
||||||
buf[i*2+0] = in_L[i];
|
buf[i][0] = in_L[i];
|
||||||
buf[i*2+1] = in_R[i];
|
buf[i][1] = in_R[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
f0 = data->filters[0];
|
f0 = data->filters[0];
|
||||||
|
@ -47,8 +47,8 @@ process_double(personal *data,
|
||||||
}
|
}
|
||||||
|
|
||||||
for (ulong i = 0; i < rem; i++) {
|
for (ulong i = 0; i < rem; i++) {
|
||||||
out_L[i] = buf[i*2+0];
|
out_L[i] = buf[i][0];
|
||||||
out_R[i] = buf[i*2+1];
|
out_R[i] = buf[i][1];
|
||||||
}
|
}
|
||||||
|
|
||||||
in_L += BLOCK_SIZE;
|
in_L += BLOCK_SIZE;
|
||||||
|
|
|
@ -5,6 +5,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define INNER static inline
|
#define INNER static inline
|
||||||
|
typedef double v2df __attribute__((vector_size(16), aligned(16)));
|
||||||
typedef unsigned long ulong;
|
typedef unsigned long ulong;
|
||||||
|
|
||||||
INNER void
|
INNER void
|
||||||
|
@ -48,7 +49,7 @@ INNER biquad
|
||||||
biquad_gen(filter_t type, double fc, double gain, double bw, double fs);
|
biquad_gen(filter_t type, double fc, double gain, double bw, double fs);
|
||||||
|
|
||||||
/* s-plane to z-plane */
|
/* s-plane to z-plane */
|
||||||
INNER biquad_interim
|
static biquad_interim
|
||||||
design(double cw, double sw,
|
design(double cw, double sw,
|
||||||
double num0, double num1, double num2,
|
double num0, double num1, double num2,
|
||||||
double den0, double den1, double den2);
|
double den0, double den1, double den2);
|
||||||
|
|
|
@ -33,7 +33,7 @@ biquad_init(biquad *bq)
|
||||||
bq->x1 = bq->x2 = bq->y1 = bq->y2 = 0;
|
bq->x1 = bq->x2 = bq->y1 = bq->y2 = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
INNER biquad_interim
|
static biquad_interim
|
||||||
design(double cw, double sw,
|
design(double cw, double sw,
|
||||||
double num0, double num1, double num2,
|
double num0, double num1, double num2,
|
||||||
double den0, double den1, double den2)
|
double den0, double den1, double den2)
|
||||||
|
@ -105,51 +105,37 @@ biquad_run(biquad *bq, double x)
|
||||||
|
|
||||||
INNER void
|
INNER void
|
||||||
biquad_run_block_stereo(biquad *bq_L, biquad *bq_R,
|
biquad_run_block_stereo(biquad *bq_L, biquad *bq_R,
|
||||||
double *buf, ulong count)
|
v2df *buf, ulong count)
|
||||||
#ifdef __SSE2__
|
|
||||||
{
|
{
|
||||||
__m128d b0, b1, b2, a1, a2, x1, x2, y1, y2;
|
v2df b0, b1, b2, a1, a2, x1, x2, y1, y2;
|
||||||
|
|
||||||
b0 = _mm_set1_pd(bq_L->b0);
|
b0 = (v2df){bq_L->b0, bq_L->b0};
|
||||||
b1 = _mm_set1_pd(bq_L->b1);
|
b1 = (v2df){bq_L->b1, bq_L->b1};
|
||||||
b2 = _mm_set1_pd(bq_L->b2);
|
b2 = (v2df){bq_L->b2, bq_L->b2};
|
||||||
a1 = _mm_set1_pd(bq_L->a1);
|
a1 = (v2df){bq_L->a1, bq_L->a1};
|
||||||
a2 = _mm_set1_pd(bq_L->a2);
|
a2 = (v2df){bq_L->a2, bq_L->a2};
|
||||||
|
|
||||||
x1 = _mm_setr_pd(bq_L->x1, bq_R->x1);
|
x1 = (v2df){bq_L->x1, bq_R->x1};
|
||||||
x2 = _mm_setr_pd(bq_L->x2, bq_R->x2);
|
x2 = (v2df){bq_L->x2, bq_R->x2};
|
||||||
y1 = _mm_setr_pd(bq_L->y1, bq_R->y1);
|
y1 = (v2df){bq_L->y1, bq_R->y1};
|
||||||
y2 = _mm_setr_pd(bq_L->y2, bq_R->y2);
|
y2 = (v2df){bq_L->y2, bq_R->y2};
|
||||||
|
|
||||||
for (int i = 0; i < 2*count; i += 2) {
|
for (ulong i = 0; i < count; i++) {
|
||||||
__m128d x = _mm_load_pd(buf + i);
|
v2df x = buf[i];
|
||||||
__m128d y = b0*x + b1*x1 + b2*x2 + a1*y1 + a2*y2;
|
v2df y = b0*x + b1*x1 + b2*x2 + a1*y1 + a2*y2;
|
||||||
x2 = x1;
|
x2 = x1;
|
||||||
y2 = y1;
|
y2 = y1;
|
||||||
x1 = x;
|
x1 = x;
|
||||||
y1 = y;
|
y1 = y;
|
||||||
_mm_store_pd(buf + i, y);
|
buf[i] = y;
|
||||||
}
|
}
|
||||||
|
|
||||||
double temp[8];
|
bq_L->x1 = x1[0];
|
||||||
_mm_store_pd(temp+0, x1);
|
bq_R->x1 = x1[1];
|
||||||
_mm_store_pd(temp+2, x2);
|
bq_L->x2 = x2[0];
|
||||||
_mm_store_pd(temp+4, y1);
|
bq_R->x2 = x2[1];
|
||||||
_mm_store_pd(temp+6, y2);
|
bq_L->y1 = y1[0];
|
||||||
bq_L->x1 = temp[0];
|
bq_R->y1 = y1[1];
|
||||||
bq_R->x1 = temp[1];
|
bq_L->y2 = y2[0];
|
||||||
bq_L->x2 = temp[2];
|
bq_R->y2 = y2[1];
|
||||||
bq_R->x2 = temp[3];
|
|
||||||
bq_L->y1 = temp[4];
|
|
||||||
bq_R->y1 = temp[5];
|
|
||||||
bq_L->y2 = temp[6];
|
|
||||||
bq_R->y2 = temp[7];
|
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
{
|
|
||||||
for (ulong i = 0; i < 2*count; i += 2) {
|
|
||||||
buf[i+0] = biquad_run(bq_L, buf[i+0]);
|
|
||||||
buf[i+1] = biquad_run(bq_R, buf[i+1]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
Loading…
Reference in a new issue