/*
 * (c) Copyright 2000 -- Anders Torger
 *
 * This software is free. You can redistribute it and/or modify it under the
 * terms of the GNU Public License as published by the Free Software Foundation.
 *
 */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <inttypes.h>
#include <dlfcn.h>
#include <math.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <fcntl.h>
#include <unistd.h>

#include "fir.h"
#include "coeffs.h"
#include "timestamp.h"

/* Size in bytes of the fixed buffers this file uses for file names and paths. */
#define MAX_FILENAME 4096

/*
 * Per-filter runtime state. fir_new() fills it in and stores it as the
 * private data of the returned filter; process_sample() and
 * toggle_processing() operate on it.
 */
struct firdata {
    /* number of filter taps, as reported by the filter library */
    int taps;
    /* index at which process_sample() stores the next sample in state[0] */
    int spos;
    /* start index of the sample window handed to process() */
    int xpos;
    /* selects which of the four state arrays process() reads next */
    int ypos;
    /* true: run the filter; false: only apply 'multiplier' */
    bool_t active;
    /* factor applied to the sample while the filter is inactive */
    int32_t multiplier;
    /* four sample history arrays; array k is written at spos + k */
    int16_t *state[4];
    /* coefficient array returned by the library's firlib_get_coeffs() */
    int16_t *coeffs;
    /* the library's firlib_process(), called as process(samples, coeffs) */
    int64_t (*process)(int16_t *, int16_t *);
};

/*
 * Reference scale (2^31 - 1, the largest int32_t value). fir_new() passes it
 * as 'ref' to the requantisation and library generation code and uses it as
 * the multiplier applied while the filter is inactive.
 */
static const int32_t sref = 2147483647;

/*
 * One block of four consecutive filter coefficients together with the
 * bookkeeping needed to pick the right shift used when quantising them
 * to 16 bits.
 */
struct mmx_coeffs {
    /* the four coefficients as read from the input */
    double val[4];
    /* absolute values of val[] */
    double absval[4];
    /* mean squared relative quantisation error, indexed by shift */
    double mse[32];
    /* position of this block in the original coefficient order */
    int index;
    /* smallest shift found usable by mmx_coeffs_least_squares() */
    int minshift;
    /* largest shift examined by mmx_coeffs_least_squares() */
    int maxshift;
    /* shift currently selected for this block */
    int shift;
};

/*
 * A run of coefficient blocks sharing the same shift, as consumed by
 * generate_code().
 */
struct mmx_coeff_group {
    /* number of four-coefficient blocks in the group */
    int len;
    /* points at 'len' original block indices (see struct mmx_coeffs.index) */
    int *index;
    /* shift common to every block of the group */
    int shift;
};

/*
 * Summary of a requantised filter. make_firlib() embeds the raw bytes of this
 * structure in the generated library and fir_new() reads them back through
 * firlib_get_info(), so the layout must be the same on both sides.
 */
struct mmxfir_info {
    /* total number of taps (four per coefficient block) */
    int taps;
    /* number of shift groups the coefficients were split into */
    int groups;
    /* mean over all blocks of the mse at the selected shift */
    double mean_squares;
};

/*
static int64_t *ref_coeffs;

int64_t
ref_firlib_process(int16_t samples[],
		   int64_t coeffs[],
		   int taps)
{
    int n;
    int64_t s = 0;
    
    for (n = 0; n < taps; n++) {
	s += (int64_t)samples[n] * coeffs[n];
    }
    return s;
}
*/

/*
 * Flip the filter between active (filtering) and inactive (gain only).
 */
static void
toggle_processing(struct firdata *firdata)
{
    if (firdata->active) {
	firdata->active = 0;
    } else {
	firdata->active = 1;
    }
}

/*
 * Process a single sample and apply gain, or apply gain only if the filter
 * is deactivated.
 *
 * firdata: filter state as initialised by fir_new().
 * sample:  input sample; it is shifted right by 16 bits before use, so only
 *          its upper half contributes.
 *
 * Returns the result of firdata->process() while the filter is active,
 * otherwise the shifted sample multiplied by firdata->multiplier.
 */
static int64_t
process_sample(struct firdata *firdata,
	       int32_t sample)
{
    int64_t a;
    /* Disabled profiling state:
    static int cc = 0;
    static uint64_t ts1, ts2, tssum = 0, tssum2 = 0;
    static double t1, t2;
    */

    /*
     * Initial values, as set by fir_new():
     * firdata->xpos = firdata->spos = firdata->taps
     * firdata->ypos = 0
     */
       
    //return 0;
    /* Disabled profiling report:
    if (++cc == 491520) {	
	t1 = (double)tssum / 266000000.0;
	t2 = (double)tssum2 / 266000000.0;
	fprintf(stderr, "%f %f %f\n", t1, t2, t1 + t2);
	tssum = 0;
	tssum2 = 0;
	cc = 0;
    }
    */
/*
 * For optimal performance of the firlib_process function, both coefficients
 * and sample data must be aligned on even 8 bytes. One way to achieve that
 * would be four different coefficient sets, each shifted one coefficient
 * (which is 2 bytes), so that the coefficients could be moved one sample at a
 * time over the sample set without misaligned access. Here we instead store
 * the latest <taps> samples in four separate sets. This avoids the need for
 * four different firlib_process functions, as shifted coefficient sets would
 * demand new code.
 *
 * The sample sets (state) are ordered this way:
 *
 * - set k receives every sample at index spos + k, so set k holds the same
 *   history as set 0 moved k positions up.
 * - every sample is stored twice in each set, <taps> positions apart.
 * - spos moves down by one per sample. ypos selects the set that is read and
 *   advances by one per sample; after every fourth sample it wraps to zero
 *   and xpos moves down by four. When xpos reaches zero, xpos and spos
 *   restart at <taps>.
 */    

    sample >>= 16;

    /* Loop form of the unrolled stores below (note the net spos change):
    for (n = 0; n < 4; n++) {
	firdata->state[n][firdata->spos + n] = 
	    firdata->state[n][firdata->spos + firdata->taps + n] =
	    (int16_t)sample;
    }
    firdata->spos--;
    */
    
    /* this is a bit faster than the commented for loop above */
    firdata->state[0][firdata->spos] = 
	firdata->state[0][firdata->spos + firdata->taps] = (int16_t)sample;
    firdata->spos++;
    firdata->state[1][firdata->spos] = 
	firdata->state[1][firdata->spos + firdata->taps] = (int16_t)sample;
    firdata->spos++;
    firdata->state[2][firdata->spos] = 
	firdata->state[2][firdata->spos + firdata->taps] = (int16_t)sample;
    firdata->spos++;
    firdata->state[3][firdata->spos] = 
	firdata->state[3][firdata->spos + firdata->taps] = (int16_t)sample;
    /* three increments above, so this leaves spos one lower than on entry */
    firdata->spos -= 4;
    
    if (firdata->active) {
	/* Disabled profiling:
	timestamp(&ts2);
	tssum2 += ts2 - ts1;
	timestamp(&ts1);
	*/
	a = firdata->process(&firdata->state[firdata->ypos][firdata->xpos],
			     firdata->coeffs);
	/* Disabled profiling:
	timestamp(&ts2);
	tssum += ts2 - ts1;
	timestamp(&ts1);
	*/
	/* Disabled cross-check against the reference implementation:
	b = ref_firlib_process(&firdata->state[firdata->ypos][firdata->xpos],
	                       ref_coeffs, firdata->taps);
	*/
    } else {
	/* bypass: scale the 16-bit sample by the stored multiplier */
	a = sample;
	a *= firdata->multiplier;
    }
    firdata->ypos++;

    if (firdata->ypos == 4) {
	/* all four sets have been read at this offset, move the window */
	firdata->ypos = 0;
	firdata->xpos -= 4;
	if (firdata->xpos == 0) {
	    /* start over at the top; the duplicate stores made <taps>
	       positions higher already hold the recent history there */
	    firdata->xpos = firdata->spos = firdata->taps;
	}
    }
    return a;
}

/*
 * Quantise the four signed coefficients of 'mc' using its current shift:
 * each value is scaled by 'ref', rounded with rint(), shifted right by
 * mc->shift and stored in 'mmxvalues'.
 */
static void
make_mmxvalues(int16_t mmxvalues[4],
	       struct mmx_coeffs *mc,
	       int32_t ref)
{
    const double scale = (double)ref;
    const double *src = mc->val;
    int16_t *dst = mmxvalues;
    int left;

    for (left = 4; left > 0; left--) {
	int32_t scaled = (int32_t)rint(*src++ * scale);

	*dst++ = scaled >> mc->shift;
    }
}

/*
 * Same as make_mmxvalues(), but works on the absolute coefficient values
 * (mc->absval) so that every result is a magnitude.
 */
static void
make_absmmxvalues(int16_t mmxvalues[4],
		  struct mmx_coeffs *mc,
		  int32_t ref)
{
    const double scale = (double)ref;
    const double *src = mc->absval;
    int16_t *dst = mmxvalues;
    int left;

    for (left = 4; left > 0; left--) {
	int32_t scaled = (int32_t)rint(*src++ * scale);

	*dst++ = scaled >> mc->shift;
    }
}

/*
 * Upper bound for the magnitude that one block of four coefficients can
 * contribute to the filter sum at the block's current shift: every quantised
 * coefficient magnitude is multiplied by 32768 and the four products are
 * added.
 *
 * The quantised values are fetched into an int16_t buffer, which is the type
 * make_absmmxvalues() is declared to fill, and are then reinterpreted as
 * unsigned 16-bit values for the summation.
 */
static uint64_t
largest_result(struct mmx_coeffs *mc,
	       int32_t ref)
{
    int16_t mmxvalues[4];
    uint64_t total = 0;
    int n;

    make_absmmxvalues(mmxvalues, mc, ref);
    for (n = 0; n < 4; n++) {
	total += (uint64_t)(uint16_t)mmxvalues[n] * 32768;
    }
    return total;
}

/*
 * Find the shift that quantises the four coefficients in 'mc' to 16 bits
 * with the smallest mean squared relative error.
 *
 * On return:
 *   mc->mse[n]    holds the mean squared relative error for every usable
 *                 shift n, and 0.0 for all other entries,
 *   mc->minshift  is the smallest usable shift,
 *   mc->maxshift  is the largest shift examined,
 *   mc->shift     is the usable shift with the smallest error (the smallest
 *                 such shift on ties).
 *
 * A shift n is examined while 2^n < ref and n < 32. It is usable when the
 * largest coefficient fits in 16 bits and largest_result() does not exceed
 * 'ref'.
 */
static void
mmx_coeffs_least_squares(struct mmx_coeffs *mc,
			 int32_t ref)
{
    int n, i, maxpos = 0, bestshift = 0;
    int32_t target[4];
    double max = 0.0, err, minerr = 0.0;
    int16_t mmxvalues[4];

    /* entries that are never examined must still be defined for readers */
    for (n = 0; n < 32; n++) {
	mc->mse[n] = 0.0;
    }

    for (n = 0; n < 4; n++) {
	/* the targeted value after scaling */
	target[n] = (int32_t)rint(mc->absval[n] * (double)ref);
	if (max < mc->absval[n]) {
	    maxpos = n;
	    max = mc->absval[n];
	}
    }
    /* find out the least we must shift the target array values to fit the
       largest into 16 bits */
    for (mc->minshift = 0;
	 target[maxpos] >> mc->minshift > 32767;
	 mc->minshift++);

    /* calculate mean squares errors for different shifts; the power of two
       is formed in 64 bits so that it is well defined for every n < 32 */
    for (n = 0; n < 32 && ((int64_t)1 << n) < (int64_t)ref; n++) {
	if (n < mc->minshift) {
	    continue;
	}
	mc->shift = n;
	if (largest_result(mc, ref) > (uint64_t)ref) {
	    /* still too large at this shift: the next one becomes the
	       smallest candidate */
	    mc->minshift++;
	    continue;
	}
	make_mmxvalues(mmxvalues, mc, ref);
	for (i = 0; i < 4; i++) {
	    if (mc->val[i] == 0.0) {
		/* a zero coefficient quantises exactly to zero; a relative
		   error would be 0/0 and turn the whole mse into NaN */
		continue;
	    }
	    /* ldexp() scales back by 2^n without shifting a negative value */
	    err = (ldexp((double)mmxvalues[i], n) -
		   mc->val[i] * (double)ref) / mc->val[i] / (double)ref;
	    err *= err;
	    mc->mse[n] += err;
	}
	mc->mse[n] /= 4;
	if (n == mc->minshift || mc->mse[n] < minerr) {
	    minerr = mc->mse[n];
	    bestshift = n;
	}
    }
    mc->maxshift = n - 1;
    mc->shift = bestshift;
}

static int
cmp_mc_shift(const void *mc1,
	     const void *mc2)
{
    int n;
    if ((n = ((struct mmx_coeffs *)mc1)->shift -
	 ((struct mmx_coeffs *)mc2)->shift) == 0)
    {
	if (((struct mmx_coeffs *)mc1)
	    ->mse[((struct mmx_coeffs *)mc1)->shift] -
		((struct mmx_coeffs *)mc2)
	    ->mse[((struct mmx_coeffs *)mc2)->shift] < 0)
	{
	    return -1;
	}
	return 1;
    }
    return n;
}

/*
 * Sort the first 'len' entries of 'amc' in place with cmp_mc_shift().
 */
static void
shift_sort(struct mmx_coeffs amc[],
	   int len)
{
    const size_t count = len;

    qsort(amc, count, sizeof amc[0], cmp_mc_shift);
}


/*
 * Convert double coefficients to fit the FIR filter algorithm implemented
 * with MMX instructions, with the central instruction 'pmaddwd'.
 *
 * coeffs:      'len' coefficients, each within [-1.0, 1.0].
 * len:         number of coefficients; must be positive and a multiple of 4.
 * ref:         scale that corresponds to the coefficient value 1.0.
 * mmxfir_info: receives the tap count, the number of shift groups and the
 *              mean of the per-block mse at the selected shifts.
 *
 * Returns a malloc()ed array whose first len / 4 entries describe the
 * coefficient blocks, sorted by cmp_mc_shift(); the caller frees it. Returns
 * NULL, after printing a message to stderr, on invalid input, on allocation
 * failure, or when no valid assignment of shifts exists.
 */
static struct mmx_coeffs *
mmx_coeff_requantisation(double coeffs[],
			 int len,
			 int32_t ref,
			 struct mmxfir_info *mmxfir_info)
{
    struct mmx_coeffs *amc;
    /* shifts lie within 0..31, which gives at most 32 groups; groupstart
       holds one additional end marker */
    uint64_t sum[32], maxsum, movesum;
    int n, i, n_groups, groupstart[33], maxpos;

    if (len <= 0 || len % 4 != 0) {
	fprintf(stderr, "mmx_coeff_requantisation: the number of coefficients "
		"must be divisable with four\n");
	return NULL;
    }
    /* sized by the coefficient count, which is more than the len / 4 entries
       that are filled in; the size is left as it was */
    if ((amc = malloc(len * sizeof(struct mmx_coeffs))) == NULL) {
	fprintf(stderr, "mmx_coeff_requantisation: failed to "
		"allocate memory\n");
	return NULL;
    }

    /* fill the mmx_coeffs array; from here on len counts blocks of four */
    len /= 4;
    for (n = 0; n < len; n++) {
	amc[n].index = n;
	for (i = 0; i < 4; i++) {
	    amc[n].val[i] = coeffs[4*n+i];
	    amc[n].absval[i] = fabs(coeffs[4*n+i]);
	    if (amc[n].absval[i] > 1.0) {
		fprintf(stderr, "mmx_coeff_requantisation: at least one filter "
			"coefficient is larger than 1.0 or less than -1.0\n");
		free(amc);
		return NULL;
	    }
	}
	mmx_coeffs_least_squares(&amc[n], ref);
    }

    /*
     * Blocks with the same shift are summed together by the generated code
     * before the shift is applied, so the worst case sum of every group must
     * stay within 'ref'. Repeat until that holds: sort, add up each group,
     * and push blocks out of the largest group to the next higher shift.
     */
    do {
	shift_sort(amc, len);

	memset(sum, 0, sizeof(sum));
	n_groups = 0;
	groupstart[0] = 0;
	i = amc[0].shift;
	for (n = 0; n < len; n++) {
	    if (amc[n].shift != i) {
		i = amc[n].shift;
		n_groups++;
		groupstart[n_groups] = n;
	    }
	    sum[n_groups] += largest_result(&amc[n], ref);
	}
	groupstart[++n_groups] = len;

	maxsum = 0;
	maxpos = 0;
	for (n = 0; n < n_groups; n++) {
	    if (sum[n] > maxsum) {
		maxsum = sum[n];
		maxpos = n;
	    }
	}
	if (maxsum > (uint64_t)ref) {
	    for (n = groupstart[maxpos], movesum = 0;
		 n < groupstart[maxpos+1] && maxsum - movesum > (uint64_t)ref;
		 n++)
	    {
		if (amc[n].shift >= amc[n].maxshift) {
		    /* shifting further would leave the range for which the
		       mse table and the quantisation are defined */
		    fprintf(stderr, "mmx_coeff_requantisation: coefficients "
			    "are impossible to requantisise\n");
		    free(amc);
		    return NULL;
		}
		movesum += largest_result(&amc[n], ref);
		amc[n].shift++;
	    }
	}
    } while (maxsum > (uint64_t)ref);

    mmxfir_info->groups = n_groups;
    mmxfir_info->taps = 4 * len;
    mmxfir_info->mean_squares = 0;
    for (n = 0; n < len; n++) {
	mmxfir_info->mean_squares += amc[n].mse[amc[n].shift];
    }
    mmxfir_info->mean_squares /= len;
    return amc;
}

/*
 * Write the assembly source of the filter function 'funname' to 'stream'.
 *
 * mcg holds 'len' coefficient groups. For every group the generated code
 * multiplies and accumulates all of its blocks with pmaddwd/paddd, then
 * shifts the group sum by the group's shift and adds it to the 64-bit result
 * in %edx:%eax. Sample offsets are 8 * block index; coefficient offsets
 * simply advance by 8 bytes per block in the order of emission.
 */
void
generate_code(FILE *stream,
	      struct mmx_coeff_group mcg[],
	      int len,
	      char funname[])
{
    /* function entry: save registers, clear the result, load the pointers */
    static const char prologue[] =
	".globl %s\n"
	"\t.type %s,@function\n"
	"%s:\n"
	"\tpushl %%ebp\n"
	"\tmovl %%esp,%%ebp\n"
	"\tpushl %%ebx\n"
	"\tpushl %%esi\n"
	"\tpushl %%edi\n"
	"\txorl %%eax, %%eax     # lower 32 bits of 64 bit return value\n"
	"\txorl %%edx, %%edx     # higher 32 bits of 64 bit return value\n"
	"\tmovl 8(%%ebp), %%esi  # sample pointer\n"
	"\tmovl 12(%%ebp), %%edi # coefficient pointer\n";

    /* function exit: restore registers and return */
    static const char epilogue[] =
	"\n\tmovl -4(%%ebp),%%ebx\n"
	"\tmovl -8(%%ebp),%%esi\n"
	"\tmovl -12(%%ebp),%%edi\n"
	"\temms\n"
	"\tleave\n"
	"\tret\n";

    /* fold the group sum in %mm7, shift it and add it to %edx:%eax */
    static const char shiftadd[] =
	"\n\t# shiftadd\n"
	"\tmovl %%edx, %%ecx\n"
	"\tmovq %%mm7, %%mm6\n"
	"\tpsrlq $32, %%mm7\n"
	"\tpaddd %%mm6, %%mm7\n"
	"\tmovl %%eax, %%ebx\n"
	"\tmovq %%mm7, %%mm6\n"
	"\tpsllq $%u, %%mm7\n"
	"\tpsrad $%u, %%mm6\n"
	"\tmovd %%mm7, %%eax\n"
	"\tmovd %%mm6, %%edx\n"
	"\taddl %%ebx, %%eax\n"
	"\tadcl %%ecx, %%edx\n";

    /* clear the accumulator and load one block */
    static const char startgroup1[] =
	"\n\t# startgroup1\n"
	"\tpxor %%mm7, %%mm7\n"
	"\tmovq %d(%%esi), %%mm0\n"
	"\tmovq %d(%%edi), %%mm1\n";

    /* clear the accumulator and load two blocks */
    static const char startgroup2[] =
	"\n\t# startgroup2\n"
	"\tpxor %%mm7, %%mm7\n"
	"\tmovq %d(%%esi), %%mm0\n"
	"\tmovq %d(%%edi), %%mm1\n"
	"\tmovq %d(%%esi), %%mm2\n"
	"\tmovq %d(%%edi), %%mm3\n";

    /* accumulate three blocks while loading three more; two stay pending */
    static const char blockgroup[] =
	"\t# blockgroup\n"
	"\tpmaddwd %%mm1, %%mm0\n"
	"\tmovq %d(%%esi), %%mm4\n"
	"\tmovq %d(%%edi), %%mm5\n"
	"\tpaddd %%mm0, %%mm7\n"
	"\tpmaddwd %%mm3, %%mm2\n"
	"\tmovq %d(%%esi), %%mm0\n"
	"\tmovq %d(%%edi), %%mm1\n"
	"\tpaddd %%mm2, %%mm7\n"
	"\tpmaddwd %%mm5, %%mm4\n"
	"\tmovq %d(%%esi), %%mm2\n"
	"\tmovq %d(%%edi), %%mm3\n"
	"\tpaddd %%mm4, %%mm7\n";

    /* finish with the two pending blocks plus two newly loaded ones */
    static const char endgroup4[] =
	"\t# endgroup4\n"
	"\tpmaddwd %%mm1, %%mm0\n"
	"\tmovq %d(%%esi), %%mm4\n"
	"\tmovq %d(%%edi), %%mm5\n"
	"\tpaddd %%mm0, %%mm7\n"
	"\tmovq %d(%%esi), %%mm0\n"
	"\tmovq %d(%%edi), %%mm1\n"
	"\tpmaddwd %%mm3, %%mm2\n"
	"\tpmaddwd %%mm5, %%mm4\n"
	"\tpmaddwd %%mm1, %%mm0\n"
	"\tpaddd %%mm2, %%mm7\n"
	"\tpaddd %%mm4, %%mm7\n"
	"\tpaddd %%mm0, %%mm7\n";

    /* finish with the two pending blocks plus one newly loaded one */
    static const char endgroup3[] =
	"\t# endgroup3\n"
	"\tpmaddwd %%mm1, %%mm0\n"
	"\tmovq %d(%%esi), %%mm4\n"
	"\tmovq %d(%%edi), %%mm5\n"
	"\tpaddd %%mm0, %%mm7\n"
	"\tpmaddwd %%mm3, %%mm2\n"
	"\tpmaddwd %%mm5, %%mm4\n"
	"\tpaddd %%mm2, %%mm7\n"
	"\tpaddd %%mm4, %%mm7\n";

    /* finish with the two pending blocks */
    static const char endgroup2[] =
	"\t# endgroup2\n"
	"\tpmaddwd %%mm1, %%mm0\n"
	"\tpmaddwd %%mm3, %%mm2\n"
	"\tpaddd %%mm0, %%mm7\n"
	"\tpaddd %%mm2, %%mm7\n";

    /* finish with the single pending block */
    static const char endgroup1[] =
	"\t# endgroup1\n"
	"\tpmaddwd %%mm1, %%mm0\n"
	"\tpaddd %%mm0, %%mm7\n";

    int g, coffset = 0;

    fprintf(stream, prologue, funname, funname, funname);

    for (g = 0; g < len; g++) {
	const int *idx = mcg[g].index;
	const int blocks = mcg[g].len;
	int i, rest;

	fprintf(stream, "\n\n#### coefficient group %d (length 4 * %d)\n",
		g + 1, blocks);

	if (blocks == 1) {
	    fprintf(stream, startgroup1, 8 * idx[0], coffset);
	    fprintf(stream, endgroup1);
	    coffset += 8;
	} else {
	    /* every other length starts with two blocks, continues with as
	       many three-block steps as fit, and ends with whichever
	       template consumes the blocks that remain */
	    fprintf(stream, startgroup2, 8 * idx[0], coffset,
		    8 * idx[1], coffset + 8);
	    coffset += 16;
	    for (i = 2; i < blocks - 2; i += 3) {
		fprintf(stream, blockgroup,
			8 * idx[i], coffset,
			8 * idx[i+1], coffset + 8,
			8 * idx[i+2], coffset + 16);
		coffset += 24;
	    }
	    rest = blocks - i;
	    if (rest == 0) {
		fprintf(stream, endgroup2);
	    } else if (rest == 1) {
		fprintf(stream, endgroup3, 8 * idx[i], coffset);
		coffset += 8;
	    } else if (rest == 2) {
		fprintf(stream, endgroup4, 8 * idx[i], coffset,
			8 * idx[i+1], coffset + 8);
		coffset += 16;
	    }
	}
	fprintf(stream, shiftadd, mcg[g].shift, 32 - mcg[g].shift);
    }

    fprintf(stream, epilogue);
}
/*
int64_t *
make_coeffs(struct mmx_coeff *amc,
	    double oc[],
	    int len)
{
    int64_t *coeffs;
    int n, i;
    double f;

    if ((coeffs = malloc(4 * len * sizeof(int64_t))) == NULL) {
	return NULL;
    }
    for (n = 0; n < len; n++) {
	for (i = 0; i < 4; i++) {
	    coeffs[4 * amc[n].index + i] =
		(int64_t)amc[n].mmxval[i] << amc[n].shift;
	}
    }
    for (n = 0; n < 4 * len; n++) {
	f = (double)coeffs[n] / sref;
	fprintf(stderr, "%lld %f %f, %f\n", coeffs[n], f, oc[n], oc[n] / f);
    }
    return coeffs;
}
*/
/*
 * Group the coefficient blocks by shift and write the assembly source of the
 * filter function 'funname' to the file "<tempname>.s".
 *
 * amc:      'len' coefficient blocks; the array is sorted in place with
 *           cmp_mc_shift().
 * len:      number of blocks, must be positive.
 * tempname: path prefix of the file to create.
 * funname:  name given to the generated function.
 *
 * Returns true on success. On failure a message is printed to stderr and
 * false is returned.
 */
bool_t
mmx_coeffs_to_code(struct mmx_coeffs *amc,
		   int len,
		   char tempname[],
		   char funname[])
{
    struct mmx_coeff_group *mcg = NULL;
    int n, start, written, mcglen = 0, *index = NULL;
    char name[MAX_FILENAME];
    FILE *stream;

    if (len <= 0) {
	fprintf(stderr, "mmx_coeffs_to_code: no coefficients to convert\n");
	return false;
    }
    written = snprintf(name, sizeof(name), "%s.s", tempname);
    if (written < 0 || (size_t)written >= sizeof(name)) {
	fprintf(stderr, "mmx_coeffs_to_code: file name is too long\n");
	return false;
    }
    if ((mcg = malloc(len * sizeof(struct mmx_coeff_group))) == NULL ||
	(index = malloc(len * sizeof(int))) == NULL)
    {
	free(mcg); free(index);
	fprintf(stderr, "mmx_coeffs_to_code: could not allocate memory\n");
	return false;
    }

    qsort(amc, len, sizeof(struct mmx_coeffs), cmp_mc_shift);

    /* a group ends at the last block or where the next block has another
       shift; only entries 0..len-1 are ever examined */
    for (n = start = 0; n < len; n++) {
	index[n] = amc[n].index;
	if (n + 1 == len || amc[n + 1].shift != amc[n].shift) {
	    mcg[mcglen].shift = amc[n].shift;
	    mcg[mcglen].index = &index[start];
	    mcg[mcglen].len = n + 1 - start;
	    start = n + 1;
	    mcglen++;
	}
    }

    if ((stream = fopen(name, "wt")) == NULL) {
	fprintf(stderr, "mmx_coeffs_to_code: could not open file \"%s\" for "
		"writing: %s\n", name, strerror(errno));
	free(mcg); free(index);
	return false;
    }
    generate_code(stream, mcg, mcglen, funname);
    free(mcg);
    free(index);
    /* buffered output reaches the file at the latest here, so this is where
       a write error shows up */
    if (fclose(stream) != 0) {
	fprintf(stderr, "mmx_coeffs_to_code: could not write file \"%s\": "
		"%s\n", name, strerror(errno));
	return false;
    }
    return true;
}

/*
 * Build the shared filter library 'firlibname'.
 *
 * A C source "<tempname>1.c" is generated which provides
 * firlib_get_coeffs() (returns a malloc()ed copy of the quantised
 * coefficients) and firlib_get_info() (copies the bytes of *mmx_firinfo into
 * a caller buffer). It is compiled with gcc together with "<tempname>.s",
 * which must already exist, and both objects are linked into the library.
 * The gcc commands run with P_tmpdir as working directory; the previous
 * working directory is restored afterwards. The intermediate source and
 * object files are removed.
 *
 * amc:         'len' coefficient blocks in the order used by the assembly.
 * mmx_firinfo: information embedded in the library.
 * len:         number of coefficient blocks.
 * ref:         scale that corresponds to the coefficient value 1.0.
 * tempname:    path prefix of the intermediate files.
 * firlibname:  path of the library to create.
 *
 * Returns true on success, false (after a message on stderr) otherwise.
 *
 * NOTE(review): tempname and firlibname are inserted unquoted into the shell
 * commands given to system(), so they must not contain whitespace or shell
 * metacharacters.
 */
bool_t
make_firlib(struct mmx_coeffs *amc,
	    struct mmxfir_info *mmx_firinfo,
	    int len,
	    int32_t ref,
	    char tempname[],
	    char firlibname[])
{
    const uint8_t *info = (const uint8_t *)mmx_firinfo;
    const int infosz = (int)sizeof(struct mmxfir_info);
    char gcccmd[4 * MAX_FILENAME + 200];
    char asmname[MAX_FILENAME], srcname[MAX_FILENAME];
    char asmobj[MAX_FILENAME], srcobj[MAX_FILENAME];
    char cwd[MAX_FILENAME];
    int16_t mmxvalues[4];
    FILE *stream;
    bool_t success;
    int n, i, r;

    /* "1.c" is the longest suffix added to tempname; if that fits in a
       name buffer, all the others do as well */
    if (strlen(tempname) + sizeof("1.c") > MAX_FILENAME) {
	fprintf(stderr, "make_firlib: temporary file name is too long\n");
	return false;
    }
    snprintf(asmname, sizeof(asmname), "%s.s", tempname);
    snprintf(srcname, sizeof(srcname), "%s1.c", tempname);
    snprintf(asmobj, sizeof(asmobj), "%s.o", tempname);
    snprintf(srcobj, sizeof(srcobj), "%s1.o", tempname);

    if ((stream = fopen(srcname, "wt")) == NULL) {
	fprintf(stderr, "make_firlib: could not open file \"%s\" for writing: "
		"%s\n",	srcname, strerror(errno));
	return false;
    }
    /* <string.h> is needed by the generated firlib_get_info() for memcpy() */
    fprintf(stream,
	    "#include <stdlib.h>\n"
	    "#include <string.h>\n"
	    "#include <inttypes.h>\n"
	    "\n"
	    "#define INFOSZ %d\n"
	    "\n"
	    "static uint8_t info[INFOSZ] = { ",
	    infosz);
    for (n = 0; n < infosz; n++) {
	fprintf(stream, n + 1 < infosz ? "%u, " : "%u };\n\n",
		(unsigned)info[n]);
    }
    fprintf(stream,
	    "int16_t *\n"
	    "firlib_get_coeffs(void)\n"
	    "{\n"
	    "    int16_t *c;\n"
	    "\n"
	    "    if ((c = malloc(%d * sizeof(int16_t))) == NULL) {\n"
	    "        return NULL;\n"
	    "    }\n\n",
	    4 * len);
    for (n = 0; n < len; n++) {
	make_mmxvalues(mmxvalues, &amc[n], ref);
	for (i = 0; i < 4; i++) {
	    /* written as signed values, which is what int16_t c[] holds */
	    fprintf(stream, "    c[%d] = %d;\n", 4 * n + i, (int)mmxvalues[i]);
	}
    }
    fprintf(stream,
	    "\n    return c;\n"
	    "}\n"
	    "\n"
	    "void *\n"
	    "firlib_get_info(void *buf)\n"
	    "{\n"
	    "    return memcpy(buf, info, INFOSZ);\n"
	    "}\n");
    if (fclose(stream) != 0) {
	fprintf(stderr, "make_firlib: could not write file \"%s\": %s\n",
		srcname, strerror(errno));
	remove(srcname);
	return false;
    }

    /* gcc places the object files in the working directory, and the link
       step looks for them under the tempname prefix */
    if (getcwd(cwd, sizeof(cwd)) == NULL || chdir(P_tmpdir) != 0) {
	fprintf(stderr, "make_firlib: could not change to directory \"%s\": "
		"%s\n", P_tmpdir, strerror(errno));
	remove(asmname);
	remove(srcname);
	return false;
    }

    r = snprintf(gcccmd, sizeof(gcccmd), "gcc -fPIC -c %s.s %s1.c",
		 tempname, tempname);
    success = (r > 0 && (size_t)r < sizeof(gcccmd) && system(gcccmd) == 0);
    remove(asmname);
    remove(srcname);
    if (success) {
	r = snprintf(gcccmd, sizeof(gcccmd),
		     "gcc -shared -Wl,-soname,%s -o %s %s.o %s1.o",
		     firlibname, firlibname, tempname, tempname);
	success = (r > 0 && (size_t)r < sizeof(gcccmd) &&
		   system(gcccmd) == 0);
	remove(asmobj);
	remove(srcobj);
    }
    if (chdir(cwd) != 0) {
	fprintf(stderr, "make_firlib: could not return to directory \"%s\": "
		"%s\n", cwd, strerror(errno));
    }
    if (!success) {
	fprintf(stderr, "make_firlib: filter compilation failed\n");
	return false;
    }
    return true;
}

/*
 * For use inside fir_new() right after a dlsym() call. If dlerror() reports
 * an error, print it, release 'filter', 'firdata' and 'firlib' and make
 * fir_new() return NULL. Relies on the local variables 'error', 'filter',
 * 'firdata' and 'firlib' of the calling function.
 */
#define FIRNEW_OPENLIBFUNC_ERRORCHECK()                                        \
    do {                                                                       \
	error = dlerror();                                                     \
	if (error != NULL) {                                                   \
	    fprintf(stderr, "fir_new: could not open FIRLIB function: %s\n",   \
		    error);                                                    \
	    free(filter);                                                      \
	    free(firdata);                                                     \
	    dlclose(firlib);                                                   \
	    return NULL;                                                       \
	}                                                                      \
    } while (0)


/*
 * Create a FIR filter that runs the code of a generated filter library.
 *
 * If 'filename' is not NULL, the coefficients in that file are parsed,
 * normalised so that no absolute value exceeds 1.0, requantised, and compiled
 * into the library 'firlibname' first. In either case the library is then
 * loaded with dlopen() and the filter state is set up from the information
 * the library reports about itself.
 *
 * On success the new filter is returned and *taps, *groups and *coeffs_error
 * are set from the library's info block. On failure a message is printed to
 * stderr, everything acquired so far is released and NULL is returned.
 *
 * NOTE(review): tmpnam() only produces a name, it does not create the file,
 * so another process can claim the name before it is used here.
 */
filterproc_t *
fir_new(char filename[],         /* file containing filter coefficients, may be
				  * NULL. */
	char firlibname[],       /* name of the fir library */
	double db_gain,          /* sample rescaling in decibel */
	int *taps,               /* receives the number of taps */
	int *groups,             /* receives the number of shift groups */
	double *coeffs_error)    /* receives the mean squared coefficient
				  * error */
{
    filterproc_t *filter = malloc(sizeof(filterproc_t));
    struct firdata *firdata = malloc(sizeof(struct firdata));
    struct mmx_coeffs *amc;
    char *error, flname[MAX_FILENAME+1], tempname[MAX_FILENAME];
    char cwd[MAX_FILENAME];
    double **coeffs, max;
    FILE *stream;
    void *firlib;
    int16_t *state;
    size_t setlen;
    int n;
    struct mmxfir_info mmxfir_info;
    struct mmxfir_info *(*get_info)(struct mmxfir_info *buf);
    int16_t *(*get_coeffs)(void);

    if (filter == NULL || firdata == NULL) {
	free(filter); free(firdata);
	fprintf(stderr, "fir_new: could not allocate memory\n");
	return NULL;
    }

    /* The library is addressed by absolute path: make_firlib() changes the
       working directory while it links the library. */
    if (firlibname[0] != '/') {
	if (getcwd(cwd, sizeof(cwd)) == NULL) {
	    fprintf(stderr, "fir_new: could not get current directory: %s\n",
		    strerror(errno));
	    free(filter); free(firdata);
	    return NULL;
	}
	n = snprintf(flname, sizeof(flname), "%s/%s", cwd, firlibname);
    } else {
	n = snprintf(flname, sizeof(flname), "%s", firlibname);
    }
    if (n < 0 || (size_t)n >= sizeof(flname)) {
	fprintf(stderr, "fir_new: filter library name is too long\n");
	free(filter); free(firdata);
	return NULL;
    }

    if (filename != NULL) {
	if (tmpnam(tempname) == NULL) {
	    fprintf(stderr, "fir_new: could not generate temporary filename\n");
	    free(filter); free(firdata);
	    return NULL;
	}
	if ((stream = fopen(filename, "rt")) == NULL) {
	    fprintf(stderr, "fir_new: could not open file \"%s\"\n", filename);
	    free(filter); free(firdata);
	    return NULL;
	}
	if ((coeffs = coeffs_parse(taps, stream)) == NULL) {
	    fprintf(stderr, "fir_new: failed to parse coefficients\n");
	    fclose(stream);
	    free(filter); free(firdata);
	    return NULL;
	}
	fclose(stream);
	for (n = 0, max = 0; n < *taps; n++) {
	    if (fabs(coeffs[0][n]) > max) {
		max = fabs(coeffs[0][n]);
	    }
	}
	if (max > 1.0) {
	    fprintf(stderr, "largest absolute value is larger than 1.0, "
		    "normalising...");
	    for (n = 0; n < *taps; n++) {
		coeffs[0][n] = coeffs[0][n] / max;
	    }
	    fprintf(stderr, "finished!\n");
	}
	fprintf(stderr, "requantisating coefficients...");
	if ((amc = mmx_coeff_requantisation(coeffs[0], *taps, sref,
					    &mmxfir_info)) == NULL) {
	    fprintf(stderr, "fir_new: coefficient requantisation failed\n");
	    free(filter); free(firdata); coeffs_free(coeffs);
	    return NULL;
	}
	coeffs_free(coeffs);
	/*ref_coeffs = make_coeffs(amc, coeffs[0], *taps / 4);*/
	if (!mmx_coeffs_to_code(amc, *taps / 4, tempname, "firlib_process")) {
	    fprintf(stderr, "fir_new: filter code generation failed\n");
	    free(filter); free(firdata); free(amc);
	    return NULL;
	}
	fprintf(stderr, "finished!\n");
	fprintf(stderr, "compiling filter library...");
	if (!make_firlib(amc, &mmxfir_info, *taps / 4, sref,
			 tempname, flname))
	{
	    fprintf(stderr, "fir_new: filter compilation failed\n");
	    free(filter); free(firdata); free(amc);
	    return NULL;
	}
	free(amc);
	fprintf(stderr, "finished!\n");
    }

    if ((firlib = dlopen(flname, RTLD_LAZY)) == NULL) {
	fprintf(stderr, "fir_new: could not open filter library: %s\n",
		dlerror());
	free(filter); free(firdata);
	return NULL;
    }

    /* dlerror() is called before each lookup to discard any older error, so
       that the check afterwards only sees the result of that lookup */
    dlerror();
    get_coeffs = dlsym(firlib, "firlib_get_coeffs");
    FIRNEW_OPENLIBFUNC_ERRORCHECK();
    dlerror();
    get_info = dlsym(firlib, "firlib_get_info");
    FIRNEW_OPENLIBFUNC_ERRORCHECK();
    dlerror();
    firdata->process = dlsym(firlib, "firlib_process");
    FIRNEW_OPENLIBFUNC_ERRORCHECK();

    /* The library is the authority for the filter dimensions; this is the
       only source for them when no coefficient file was given. */
    get_info(&mmxfir_info);
    if (mmxfir_info.taps <= 0) {
	fprintf(stderr, "fir_new: filter library reports an invalid number "
		"of taps\n");
	free(filter); free(firdata); dlclose(firlib);
	return NULL;
    }

    if ((firdata->coeffs = get_coeffs()) == NULL) {
	fprintf(stderr, "fir_new: could not allocate memory for "
		"coefficients\n");
	free(filter); free(firdata); dlclose(firlib);
	return NULL;
    }

    /* Each of the four sample sets holds 2 * taps + 4 samples: every sample
       is stored twice, taps positions apart, at up to three positions above
       the write index. All four sets share one zeroed allocation. */
    setlen = 2 * (size_t)mmxfir_info.taps + 4;
    if ((state = calloc(4 * setlen, sizeof(int16_t))) == NULL) {
	fprintf(stderr, "fir_new: could not allocate state memory\n");
	free(firdata->coeffs);
	free(filter); free(firdata); dlclose(firlib);
	return NULL;
    }
    for (n = 0; n < 4; n++) {
	firdata->state[n] = state + n * setlen;
    }

    *taps = mmxfir_info.taps;
    *groups = mmxfir_info.groups;
    *coeffs_error = mmxfir_info.mean_squares;
    firdata->taps = *taps;
    firdata->xpos = *taps;
    firdata->spos = *taps;
    firdata->ypos = 0;
    firdata->active = true;
    firdata->multiplier = sref;

    filter->process_sample = (int64_t (*)(void *, int32_t))process_sample;
    filter->toggle_processing = (void (*)(void *))toggle_processing;
    filter->private = (void *)firdata;
    filter->multiplier = sref >> 16;
    filter->db_gain = db_gain;

    return filter;
}
