From 75fa193a902e093ee52ced1839f4f84fdb936501 Mon Sep 17 00:00:00 2001 From: Connor Olding Date: Sun, 5 Apr 2015 17:45:59 -0700 Subject: [PATCH] optimize biquads using process.h and running in-place is 19% faster on this machine. from there, using intrinsics yields another 94%, for a total speedup of 130%. --- crap/eq_const.h | 70 +++++++++++++++++++++++++++------------------- crap/tube.h | 10 +++---- include/util.h | 1 + include/util_def.h | 53 ++++++++++++++++++++++++++++++++++- 4 files changed, 98 insertions(+), 36 deletions(-) diff --git a/crap/eq_const.h b/crap/eq_const.h index 4dbd7b4..dd9e995 100644 --- a/crap/eq_const.h +++ b/crap/eq_const.h @@ -1,7 +1,5 @@ #include -#include "util.h" - #define ID 0x0DEFACED #define LABEL "crap_eq_const" #define NAME "crap Constant Equalizer" @@ -9,45 +7,59 @@ #define COPYRIGHT "MIT" #define PARAMETERS 0 +#define BLOCK_SIZE 256 + +#include "util.h" + #define BANDS 12 typedef struct { biquad filters[2][BANDS]; } personal; -INNER double -process_one(biquad *filters, double samp) -{ - for (int i = 0; i < BANDS; i++) - samp = biquad_run(&filters[i], samp); - return samp; -} - -INNER void -process(personal *data, - float *in_L, float *in_R, - float *out_L, float *out_R, - unsigned long count) -{ - disable_denormals(); - for (unsigned long pos = 0; pos < count; pos++) { - out_L[pos] = process_one(data->filters[0], in_L[pos]); - out_R[pos] = process_one(data->filters[1], in_R[pos]); - } -} - -INNER void +static void process_double(personal *data, double *in_L, double *in_R, double *out_L, double *out_R, - unsigned long count) + ulong count) { disable_denormals(); - for (unsigned long pos = 0; pos < count; pos++) { - out_L[pos] = process_one(data->filters[0], in_L[pos]); - out_R[pos] = process_one(data->filters[1], in_R[pos]); + + double buf[2*BLOCK_SIZE]; + + biquad *f0, *f1; + + for (ulong pos = 0; pos < count; pos += BLOCK_SIZE) { + ulong rem = BLOCK_SIZE; + if (pos + BLOCK_SIZE > count) + rem = count - pos; + + for (ulong i = 
0; i < rem; i++) { + buf[i*2+0] = in_L[i]; + buf[i*2+1] = in_R[i]; + } + + f0 = data->filters[0]; + f1 = data->filters[1]; + for (ulong i = 0; i < BANDS; i++) { + biquad_run_block_stereo(f0, f1, buf, rem); + f0++; + f1++; + } + + for (ulong i = 0; i < rem; i++) { + out_L[i] = buf[i*2+0]; + out_R[i] = buf[i*2+1]; + } + + in_L += BLOCK_SIZE; + in_R += BLOCK_SIZE; + out_L += BLOCK_SIZE; + out_R += BLOCK_SIZE; } } +#include "process.h" + INNER void construct(personal *data) {} @@ -70,7 +82,7 @@ pause(personal *data) {} INNER void -adjust(personal *data, unsigned long fs) +adjust(personal *data, ulong fs) { biquad *filters = data->filters[0]; filters[ 0] = biquad_gen(FILT_PEAKING, 62.0, 5.3, 0.55, fs); diff --git a/crap/tube.h b/crap/tube.h index 2d0e646..159fec9 100644 --- a/crap/tube.h +++ b/crap/tube.h @@ -2,10 +2,6 @@ #include #include -#include "util.h" -#include "param.h" -#include "os2piir.h" - #define ID 0x50F7BA11 #define LABEL "crap_tube" #define NAME "crap Tube Distortion" @@ -17,7 +13,9 @@ #define BLOCK_SIZE 256 #define FULL_SIZE (BLOCK_SIZE*OVERSAMPLING) -typedef unsigned long ulong; +#include "util.h" +#include "param.h" +#include "os2piir.h" typedef struct { double desired, actual, speed; @@ -61,7 +59,7 @@ process_one(double x, double drive, double wet) return (distort(x*drive)/drive*0.79 - x)*wet + x; } -INNER void +static void process_double(personal *data, double *in_L, double *in_R, double *out_L, double *out_R, diff --git a/include/util.h b/include/util.h index 49a9890..a443eb6 100644 --- a/include/util.h +++ b/include/util.h @@ -5,6 +5,7 @@ #endif #define INNER static inline +typedef unsigned long ulong; INNER void disable_denormals(); diff --git a/include/util_def.h b/include/util_def.h index 10814dd..5a5c921 100644 --- a/include/util_def.h +++ b/include/util_def.h @@ -48,7 +48,7 @@ design(double cw, double sw, }; } -INNER biquad +static biquad biquad_gen(filter_t type, double fc, double gain, double bw, double fs) { double w0, cw, sw, A, As, Q; @@ 
-102,3 +102,72 @@ biquad_run(biquad *bq, double x)

 	return y;
 }
+
+/*
+ * Filter `count` interleaved stereo frames in place with one biquad per
+ * channel.
+ *
+ * buf holds 2*count doubles ordered L0 R0 L1 R1 ...: even slots go
+ * through bq_L, odd slots through bq_R, and every sample is overwritten
+ * with its filtered value. The SSE2 path loads both filters' x1/x2/y1/y2
+ * history up front and stores it back before returning.
+ */
+INNER void
+biquad_run_block_stereo(biquad *bq_L, biquad *bq_R,
+	double *buf, ulong count)
+#ifdef __SSE2__
+{
+	__m128d b0, b1, b2, a1, a2, x1, x2, y1, y2;
+
+	/* low lane = left, high lane = right. Each lane gets its own
+	 * filter's coefficients, so the two channels may differ, just as
+	 * the scalar path below runs bq_L and bq_R separately. */
+	b0 = _mm_setr_pd(bq_L->b0, bq_R->b0);
+	b1 = _mm_setr_pd(bq_L->b1, bq_R->b1);
+	b2 = _mm_setr_pd(bq_L->b2, bq_R->b2);
+	a1 = _mm_setr_pd(bq_L->a1, bq_R->a1);
+	a2 = _mm_setr_pd(bq_L->a2, bq_R->a2);
+
+	x1 = _mm_setr_pd(bq_L->x1, bq_R->x1);
+	x2 = _mm_setr_pd(bq_L->x2, bq_R->x2);
+	y1 = _mm_setr_pd(bq_L->y1, bq_R->y1);
+	y2 = _mm_setr_pd(bq_L->y2, bq_R->y2);
+
+	/* ulong index: same type as count, so no signed/unsigned compare
+	 * and no int overflow on large blocks */
+	for (ulong i = 0; i < 2*count; i += 2) {
+		/* unaligned load/store: nothing here guarantees that buf
+		 * is 16-byte aligned */
+		__m128d x = _mm_loadu_pd(buf + i);
+		__m128d y = b0*x + b1*x1 + b2*x2 + a1*y1 + a2*y2;
+		x2 = x1;
+		y2 = y1;
+		x1 = x;
+		y1 = y;
+		_mm_storeu_pd(buf + i, y);
+	}
+
+	/* spill the lanes to scratch (unaligned stores, temp has no
+	 * alignment guarantee either) and copy them into the structs */
+	double temp[8];
+	_mm_storeu_pd(temp+0, x1);
+	_mm_storeu_pd(temp+2, x2);
+	_mm_storeu_pd(temp+4, y1);
+	_mm_storeu_pd(temp+6, y2);
+	bq_L->x1 = temp[0];
+	bq_R->x1 = temp[1];
+	bq_L->x2 = temp[2];
+	bq_R->x2 = temp[3];
+	bq_L->y1 = temp[4];
+	bq_R->y1 = temp[5];
+	bq_L->y2 = temp[6];
+	bq_R->y2 = temp[7];
+}
+#else
+{
+	for (ulong i = 0; i < 2*count; i += 2) {
+		buf[i+0] = biquad_run(bq_L, buf[i+0]);
+		buf[i+1] = biquad_run(bq_R, buf[i+1]);
+	}
+}
+#endif