47 files changed, 4450 insertions, 1723 deletions
diff --git a/ac3dec/Makefile b/ac3dec/Makefile
index 4599a912..8a4791f2 100644
--- a/ac3dec/Makefile
+++ b/ac3dec/Makefile
@@ -1,10 +1,16 @@
 #
 # Makefile for 'ac3dec'
 #
-# $Id: Makefile 1.1 2001/08/02 13:18:08 kls Exp $
+# $Id: Makefile 1.2 2001/08/09 11:41:39 kls Exp $
+
+#OBJS = coeff.o decode.o exponent.o rematrix.o bit_allocate.o crc.o dither.o\
+#       imdct.o sanity_check.o bitstream.o debug.o downmix.o parse.o stats.o\
+#       downmix_c.o imdct.o                             
+
+OBJS = bit_allocate.o bitstream.o coeff.o cpu_accel.o crc.o debug.o decode.o\
+       dither.o downmix.o downmix_c.o exponent.o imdct.o imdct_c.o imdct_kni.o\
+       parse.o rematrix.o sanity_check.o srfft.o srfft_kni_c.o stats.o 
 
-OBJS = coeff.o decode.o exponent.o rematrix.o bit_allocate.o crc.o dither.o\
-       imdct.o sanity_check.o bitstream.o debug.o downmix.o parse.o stats.o                                  
 
 DEFINES += -DDOLBY_SURROUND
 
@@ -16,7 +22,7 @@ libac3.a: $(OBJS)
 # Implicit rules:
 
 %.o: %.c
-	gcc -g -O2 -Wall -m486 -c $(DEFINES) $<
+	gcc -g -O2 -Wall -m486 -I./ -c $(DEFINES) $<
 
 clean:
 	rm -f *~ libac3.a $(OBJS)
diff --git a/ac3dec/ac3.h b/ac3dec/ac3.h
index 4919fc50..d325f3bb 100644
--- a/ac3dec/ac3.h
+++ b/ac3dec/ac3.h
@@ -22,39 +22,37 @@
  *
  */
 
-#ifndef AARONS_TYPES
-#define AARONS_TYPES
-typedef unsigned long long uint_64;
-typedef unsigned int   uint_32;
-typedef unsigned short uint_16;
-typedef unsigned char  uint_8;
-
-typedef signed long long sint_64;
-typedef signed int     sint_32;
-typedef signed short   sint_16;
-typedef signed char    sint_8;
+#define AC3_BUFFER_SIZE (6*1024*16)
+
+#ifndef __AC3_H__
+#define __AC3_H__
+
+#ifdef __OMS__
+#include <oms/plugin/output_audio.h>
+#else
+//#include "audio_out.h"
 #endif
 
-#define AC3_DOLBY_SURR_ENABLE 0x1
-#define AC3_3DNOW_ENABLE      0x2
-#define AC3_MMX_ENABLE        0x4
-#define AC3_ALTIVEC_ENABLE    0x8
-
-typedef struct ac3_config_s
-{
-	//Bit flags that enable various things
-	uint_32 flags;
-	//Callback that points the decoder to new stream data
-  void   (*fill_buffer_callback)(uint_8 **, uint_8 **);
-	//Number of discrete channels in final output (for downmixing)
-	uint_16 num_output_ch;
-	//Which channel of a dual mono stream to select
-	uint_16 dual_mono_ch_sel;
-} ac3_config_t;
+#include <inttypes.h>
 
-void ac3_init(ac3_config_t *);
-uint_32 ac3_decode_data(uint_8 *data_start,uint_8 *data_end, int ac3reset, int *input_pointer, int *output_pointer, char *ac3_data);
+#define AC3_DOLBY_SURR_ENABLE	(1<<0)
+#define AC3_ALTIVEC_ENABLE	(1<<1)
 
+typedef struct ac3_config_s {
+	// Bit flags that enable various things
+	uint32_t flags;
+	// Number of discrete channels in final output (for downmixing)
+	uint16_t num_output_ch;
+	// Which channel of a dual mono stream to select
+	uint16_t dual_mono_ch_sel;
+} ac3_config_t;
 
+void ac3dec_init (void);
 
+#ifdef __OMS__
+size_t ac3dec_decode_data (plugin_output_audio_t *output, uint8_t *data_start, uint8_t *data_end);
+#else
+size_t ac3dec_decode_data (uint8_t *data_start ,uint8_t *data_end, int ac3reset, int *input_pointer, int *output_pointer, char *ac3_data);
+#endif
 
+#endif
diff --git a/ac3dec/ac3_internal.h b/ac3dec/ac3_internal.h
index 235189ed..259396fd 100644
--- a/ac3dec/ac3_internal.h
+++ b/ac3dec/ac3_internal.h
@@ -25,6 +25,8 @@
 #define inline 
 #endif
 
+#include <setjmp.h>
+
 /* Exponent strategy constants */
 #define EXP_REUSE (0)
 #define EXP_D15   (1)
@@ -43,7 +45,8 @@ typedef float stream_samples_t[6][256];
 /* global config structure */
 extern ac3_config_t ac3_config;
 /* global error flag */
-extern uint_32 error_flag;
+extern jmp_buf error_jmp_mark;
+#define HANDLE_ERROR() longjmp (error_jmp_mark, -1)
 
 /* Everything you wanted to know about band structure */
 /*
@@ -72,273 +75,279 @@ extern uint_32 error_flag;
 
 typedef struct syncinfo_s
 {
-	uint_32	magic;
+	uint32_t	magic;
 	/* Sync word == 0x0B77 */
-	 uint_16   syncword; 
+	 uint16_t   syncword; 
 	/* crc for the first 5/8 of the sync block */
-	/* uint_16   crc1; */
+	/* uint16_t   crc1; */
 	/* Stream Sampling Rate (kHz) 0 = 48, 1 = 44.1, 2 = 32, 3 = reserved */
-	uint_16		fscod;	
+	uint16_t		fscod;	
 	/* Frame size code */
-	uint_16		frmsizecod;
+	uint16_t		frmsizecod;
 
 	/* Information not in the AC-3 bitstream, but derived */
 	/* Frame size in 16 bit words */
-	uint_16 frame_size;
+	uint16_t frame_size;
 	/* Bit rate in kilobits */
-	uint_16 bit_rate;
+	uint16_t bit_rate;
 	/* sampling rate in hertz */
-	uint_32 sampling_rate;
+	uint32_t sampling_rate;
 
 } syncinfo_t;
 
 typedef struct bsi_s
 {
-	uint_32	magic;
+	uint32_t	magic;
 	/* Bit stream identification == 0x8 */
-	uint_16 bsid;	
+	uint16_t bsid;	
 	/* Bit stream mode */
-	uint_16 bsmod;
+	uint16_t bsmod;
 	/* Audio coding mode */
-	uint_16 acmod;
+	uint16_t acmod;
 	/* If we're using the centre channel then */
 		/* centre mix level */
-		uint_16 cmixlev;
+		uint16_t cmixlev;
 	/* If we're using the surround channel then */
 		/* surround mix level */
-		uint_16 surmixlev;
+		uint16_t surmixlev;
 	/* If we're in 2/0 mode then */
 		/* Dolby surround mix level - NOT USED - */
-		uint_16 dsurmod;
+		uint16_t dsurmod;
 	/* Low frequency effects on */
-	uint_16 lfeon;
+	uint16_t lfeon;
 	/* Dialogue Normalization level */
-	uint_16 dialnorm;
+	uint16_t dialnorm;
 	/* Compression exists */
-	uint_16 compre;
+	uint16_t compre;
 		/* Compression level */
-		uint_16 compr;
+		uint16_t compr;
 	/* Language code exists */
-	uint_16 langcode;
+	uint16_t langcode;
 		/* Language code */
-		uint_16 langcod;
+		uint16_t langcod;
 	/* Audio production info exists*/
-	uint_16 audprodie;
-		uint_16 mixlevel;
-		uint_16 roomtyp;
+	uint16_t audprodie;
+		uint16_t mixlevel;
+		uint16_t roomtyp;
 	/* If we're in dual mono mode (acmod == 0) then extra stuff */
-		uint_16 dialnorm2;
-		uint_16 compr2e;
-			uint_16 compr2;
-		uint_16 langcod2e;
-			uint_16 langcod2;
-		uint_16 audprodi2e;
-			uint_16 mixlevel2;
-			uint_16 roomtyp2;
+		uint16_t dialnorm2;
+		uint16_t compr2e;
+			uint16_t compr2;
+		uint16_t langcod2e;
+			uint16_t langcod2;
+		uint16_t audprodi2e;
+			uint16_t mixlevel2;
+			uint16_t roomtyp2;
 	/* Copyright bit */
-	uint_16 copyrightb;
+	uint16_t copyrightb;
 	/* Original bit */
-	uint_16 origbs;
+	uint16_t origbs;
 	/* Timecode 1 exists */
-	uint_16 timecod1e;
+	uint16_t timecod1e;
 		/* Timecode 1 */
-		uint_16 timecod1;
+		uint16_t timecod1;
 	/* Timecode 2 exists */
-	uint_16 timecod2e;
+	uint16_t timecod2e;
 		/* Timecode 2 */
-		uint_16 timecod2;
+		uint16_t timecod2;
 	/* Additional bit stream info exists */
-	uint_16 addbsie;
+	uint16_t addbsie;
 		/* Additional bit stream length - 1 (in bytes) */
-		uint_16 addbsil;
+		uint16_t addbsil;
 		/* Additional bit stream information (max 64 bytes) */
-		uint_8	addbsi[64];
+		uint8_t	addbsi[64];
 
 	/* Information not in the AC-3 bitstream, but derived */
 	/* Number of channels (excluding LFE)
 	 * Derived from acmod */
-	uint_16 nfchans;
+	uint16_t nfchans;
 } bsi_t;
 
 
 /* more pain */
 typedef struct audblk_s
 {
-	uint_32	magic1;
+	uint32_t	magic1;
 	/* block switch bit indexed by channel num */
-	uint_16 blksw[5];
+	uint16_t blksw[5];
 	/* dither enable bit indexed by channel num */
-	uint_16 dithflag[5];
+	uint16_t dithflag[5];
 	/* dynamic range gain exists */
-	uint_16 dynrnge;
+	uint16_t dynrnge;
 		/* dynamic range gain */
-		uint_16 dynrng;
+		uint16_t dynrng;
 	/* if acmod==0 then */
 	/* dynamic range 2 gain exists */
-	uint_16 dynrng2e;
+	uint16_t dynrng2e;
 		/* dynamic range 2 gain */
-		uint_16 dynrng2;
+		uint16_t dynrng2;
 	/* coupling strategy exists */
-	uint_16 cplstre;
+	uint16_t cplstre;
 		/* coupling in use */
-		uint_16 cplinu;
+		uint16_t cplinu;
 			/* channel coupled */
-			uint_16 chincpl[5];
+			uint16_t chincpl[5];
 			/* if acmod==2 then */
 				/* Phase flags in use */
-				uint_16 phsflginu;
+				uint16_t phsflginu;
 			/* coupling begin frequency code */
-			uint_16 cplbegf;
+			uint16_t cplbegf;
 			/* coupling end frequency code */
-			uint_16 cplendf;
+			uint16_t cplendf;
 			/* coupling band structure bits */
-			uint_16 cplbndstrc[18];
+			uint16_t cplbndstrc[18];
 			/* Do coupling co-ords exist for this channel? */
-			uint_16 cplcoe[5];
+			uint16_t cplcoe[5];
 			/* Master coupling co-ordinate */
-			uint_16 mstrcplco[5];
+			uint16_t mstrcplco[5];
 			/* Per coupling band coupling co-ordinates */
-			uint_16 cplcoexp[5][18];
-			uint_16 cplcomant[5][18];
+			uint16_t cplcoexp[5][18];
+			uint16_t cplcomant[5][18];
 			/* Phase flags for dual mono */
-			uint_16 phsflg[18];
+			uint16_t phsflg[18];
 	/* Is there a rematrixing strategy */
-	uint_16 rematstr;
+	uint16_t rematstr;
 		/* Rematrixing bits */
-		uint_16 rematflg[4];
+		uint16_t rematflg[4];
 	/* Coupling exponent strategy */
-	uint_16 cplexpstr;
+	uint16_t cplexpstr;
 	/* Exponent strategy for full bandwidth channels */
-	uint_16 chexpstr[5];
+	uint16_t chexpstr[5];
 	/* Exponent strategy for lfe channel */
-	uint_16 lfeexpstr;
+	uint16_t lfeexpstr;
 	/* Channel bandwidth for independent channels */
-	uint_16 chbwcod[5];
+	uint16_t chbwcod[5];
 		/* The absolute coupling exponent */
-		uint_16 cplabsexp;
+		uint16_t cplabsexp;
 		/* Coupling channel exponents (D15 mode gives 18 * 12 /3  encoded exponents */
-		uint_16 cplexps[18 * 12 / 3];
+		uint16_t cplexps[18 * 12 / 3];
 	/* Sanity checking constant */
-	uint_32	magic2;
+	uint32_t	magic2;
 	/* fbw channel exponents */
-	uint_16 exps[5][252 / 3];
+	uint16_t exps[5][252 / 3];
 	/* channel gain range */
-	uint_16 gainrng[5];
+	uint16_t gainrng[5];
 	/* low frequency exponents */
-	uint_16 lfeexps[3];
+	uint16_t lfeexps[3];
 
 	/* Bit allocation info */
-	uint_16 baie;
+	uint16_t baie;
 		/* Slow decay code */
-		uint_16 sdcycod;
+		uint16_t sdcycod;
 		/* Fast decay code */
-		uint_16 fdcycod;
+		uint16_t fdcycod;
 		/* Slow gain code */
-		uint_16 sgaincod;
+		uint16_t sgaincod;
 		/* dB per bit code */
-		uint_16 dbpbcod;
+		uint16_t dbpbcod;
 		/* masking floor code */
-		uint_16 floorcod;
+		uint16_t floorcod;
 
 	/* SNR offset info */
-	uint_16 snroffste;
+	uint16_t snroffste;
 		/* coarse SNR offset */
-		uint_16 csnroffst;
+		uint16_t csnroffst;
 		/* coupling fine SNR offset */
-		uint_16 cplfsnroffst;
+		uint16_t cplfsnroffst;
 		/* coupling fast gain code */
-		uint_16 cplfgaincod;
+		uint16_t cplfgaincod;
 		/* fbw fine SNR offset */
-		uint_16 fsnroffst[5];
+		uint16_t fsnroffst[5];
 		/* fbw fast gain code */
-		uint_16 fgaincod[5];
+		uint16_t fgaincod[5];
 		/* lfe fine SNR offset */
-		uint_16 lfefsnroffst;
+		uint16_t lfefsnroffst;
 		/* lfe fast gain code */
-		uint_16 lfefgaincod;
+		uint16_t lfefgaincod;
 	
 	/* Coupling leak info */
-	uint_16 cplleake;
+	uint16_t cplleake;
 		/* coupling fast leak initialization */
-		uint_16 cplfleak;
+		uint16_t cplfleak;
 		/* coupling slow leak initialization */
-		uint_16 cplsleak;
+		uint16_t cplsleak;
 	
 	/* delta bit allocation info */
-	uint_16 deltbaie;
+	uint16_t deltbaie;
 		/* coupling delta bit allocation exists */
-		uint_16 cpldeltbae;
+		uint16_t cpldeltbae;
 		/* fbw delta bit allocation exists */
-		uint_16 deltbae[5];
+		uint16_t deltbae[5];
 		/* number of cpl delta bit segments */
-		uint_16 cpldeltnseg;
+		uint16_t cpldeltnseg;
 			/* coupling delta bit allocation offset */
-			uint_16 cpldeltoffst[8];
+			uint16_t cpldeltoffst[8];
 			/* coupling delta bit allocation length */
-			uint_16 cpldeltlen[8];
+			uint16_t cpldeltlen[8];
 			/* coupling delta bit allocation length */
-			uint_16 cpldeltba[8];
+			uint16_t cpldeltba[8];
 		/* number of delta bit segments */
-		uint_16 deltnseg[5];
+		uint16_t deltnseg[5];
 			/* fbw delta bit allocation offset */
-			uint_16 deltoffst[5][8];
+			uint16_t deltoffst[5][8];
 			/* fbw delta bit allocation length */
-			uint_16 deltlen[5][8];
+			uint16_t deltlen[5][8];
 			/* fbw delta bit allocation length */
-			uint_16 deltba[5][8];
+			uint16_t deltba[5][8];
 
 	/* skip length exists */
-	uint_16 skiple;
+	uint16_t skiple;
 		/* skip length */
-		uint_16 skipl;
+		uint16_t skipl;
 
 	//Removed Feb 2000 -ah
+	//added Jul 2000 ++dent
 	/* channel mantissas */
-	//uint_16 chmant[5][256];
+	uint16_t chmant[5][256];
 
 	/* coupling mantissas */
-	uint_16 cplmant[256];
+//	uint16_t cplmant[256];
+
+	//Added Jun 2000 -MaXX
+	/* coupling floats */
+	float cpl_flt[ 256 ];
 
 	//Removed Feb 2000 -ah
+	//added Jul 2000 ++dent
 	/* coupling mantissas */
-	//uint_16 lfemant[7];
+	uint16_t lfemant[7];
 
 
 	/*  -- Information not in the bitstream, but derived thereof  -- */
 
 	/* Number of coupling sub-bands */
-	uint_16 ncplsubnd;
+	uint16_t ncplsubnd;
 
 	/* Number of combined coupling sub-bands
 	 * Derived from ncplsubnd and cplbndstrc */
-	uint_16 ncplbnd;
+	uint16_t ncplbnd;
 
 	/* Number of exponent groups by channel
 	 * Derived from strmant, endmant */
-	uint_16 nchgrps[5];
+	uint16_t nchgrps[5];
 
 	/* Number of coupling exponent groups
 	 * Derived from cplbegf, cplendf, cplexpstr */
-	uint_16 ncplgrps;
+	uint16_t ncplgrps;
 			
 	/* End mantissa numbers of fbw channels */
-	uint_16 endmant[5];
+	uint16_t endmant[5];
 
 	/* Start and end mantissa numbers for the coupling channel */
-	uint_16 cplstrtmant;
-	uint_16 cplendmant;
+	uint16_t cplstrtmant;
+	uint16_t cplendmant;
 
 	/* Decoded exponent info */
-	uint_16 fbw_exp[5][256];
-	uint_16 cpl_exp[256];
-	uint_16 lfe_exp[7];
+	uint16_t fbw_exp[5][256];
+	uint16_t cpl_exp[256];
+	uint16_t lfe_exp[7];
 
 	/* Bit allocation pointer results */
-	uint_16 fbw_bap[5][256];
-	uint_16 cpl_bap[256];
-	uint_16 lfe_bap[7];
+	uint16_t fbw_bap[5][256];
+	uint16_t cpl_bap[256];
+	uint16_t lfe_bap[7];
 	
-	uint_32	magic3;
+	uint32_t	magic3;
 } audblk_t;
 
 
diff --git a/ac3dec/bit_allocate.c b/ac3dec/bit_allocate.c
index 053e09cd..00874340 100644
--- a/ac3dec/bit_allocate.c
+++ b/ac3dec/bit_allocate.c
@@ -28,46 +28,46 @@
 
 
 
-static inline sint_16 logadd(sint_16 a,sint_16  b);
-static sint_16 calc_lowcomp(sint_16 a,sint_16 b0,sint_16 b1,sint_16 bin);
-static inline uint_16 min(sint_16 a,sint_16 b);
-static inline uint_16 max(sint_16 a,sint_16 b);
-static void ba_compute_psd(sint_16 start, sint_16 end, sint_16 exps[], 
-		sint_16 psd[], sint_16 bndpsd[]);
-
-static void ba_compute_excitation(sint_16 start, sint_16 end,sint_16 fgain,
-		sint_16 fastleak, sint_16 slowleak, sint_16 is_lfe, sint_16 bndpsd[],
-		sint_16 excite[]);
-static void ba_compute_mask(sint_16 start, sint_16 end, uint_16 fscod,
-		uint_16 deltbae, uint_16 deltnseg, uint_16 deltoffst[], uint_16 deltba[],
-		uint_16 deltlen[], sint_16 excite[], sint_16 mask[]);
-static void ba_compute_bap(sint_16 start, sint_16 end, sint_16 snroffset,
-		sint_16 psd[], sint_16 mask[], sint_16 bap[]);
+static inline int16_t logadd(int16_t a,int16_t  b);
+static int16_t calc_lowcomp(int16_t a,int16_t b0,int16_t b1,int16_t bin);
+static inline uint16_t min(int16_t a,int16_t b);
+static inline uint16_t max(int16_t a,int16_t b);
+static void ba_compute_psd(int16_t start, int16_t end, int16_t exps[], 
+		int16_t psd[], int16_t bndpsd[]);
+
+static void ba_compute_excitation(int16_t start, int16_t end,int16_t fgain,
+		int16_t fastleak, int16_t slowleak, int16_t is_lfe, int16_t bndpsd[],
+		int16_t excite[]);
+static void ba_compute_mask(int16_t start, int16_t end, uint16_t fscod,
+		uint16_t deltbae, uint16_t deltnseg, uint16_t deltoffst[], uint16_t deltba[],
+		uint16_t deltlen[], int16_t excite[], int16_t mask[]);
+static void ba_compute_bap(int16_t start, int16_t end, int16_t snroffset,
+		int16_t psd[], int16_t mask[], int16_t bap[]);
 
 /* Misc LUTs for bit allocation process */
 
-static sint_16 slowdec[]  = { 0x0f,  0x11,  0x13,  0x15  };
-static sint_16 fastdec[]  = { 0x3f,  0x53,  0x67,  0x7b  };
-static sint_16 slowgain[] = { 0x540, 0x4d8, 0x478, 0x410 };
-static sint_16 dbpbtab[]  = { 0x000, 0x700, 0x900, 0xb00 };
+static int16_t slowdec[]  = { 0x0f,  0x11,  0x13,  0x15  };
+static int16_t fastdec[]  = { 0x3f,  0x53,  0x67,  0x7b  };
+static int16_t slowgain[] = { 0x540, 0x4d8, 0x478, 0x410 };
+static int16_t dbpbtab[]  = { 0x000, 0x700, 0x900, 0xb00 };
 
-static uint_16 floortab[] = { 0x2f0, 0x2b0, 0x270, 0x230, 0x1f0, 0x170, 0x0f0, 0xf800 };
-static sint_16 fastgain[] = { 0x080, 0x100, 0x180, 0x200, 0x280, 0x300, 0x380, 0x400  };
+static uint16_t floortab[] = { 0x2f0, 0x2b0, 0x270, 0x230, 0x1f0, 0x170, 0x0f0, 0xf800 };
+static int16_t fastgain[] = { 0x080, 0x100, 0x180, 0x200, 0x280, 0x300, 0x380, 0x400  };
 
 
-static sint_16 bndtab[] = {  0,  1,  2,   3,   4,   5,   6,   7,   8,   9, 
+static int16_t bndtab[] = {  0,  1,  2,   3,   4,   5,   6,   7,   8,   9, 
                      10, 11, 12,  13,  14,  15,  16,  17,  18,  19,
                      20, 21, 22,  23,  24,  25,  26,  27,  28,  31,
                      34, 37, 40,  43,  46,  49,  55,  61,  67,  73,
                      79, 85, 97, 109, 121, 133, 157, 181, 205, 229 };
 
-static sint_16 bndsz[]  = { 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+static int16_t bndsz[]  = { 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
                      1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
                      1,  1,  1,  1,  1,  1,  1,  1,  3,  3,
                      3,  3,  3,  3,  3,  6,  6,  6,  6,  6,
                      6, 12, 12, 12, 12, 24, 24, 24, 24, 24 };
 
-static sint_16 masktab[] = { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+static int16_t masktab[] = { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
                      16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 28, 28, 29,
                      29, 29, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34,
                      34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 37, 37, 37,
@@ -85,7 +85,7 @@ static sint_16 masktab[] = { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,
                      49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49,  0,  0,  0 };
 
 
-static sint_16 latab[] = { 0x0040, 0x003f, 0x003e, 0x003d, 0x003c, 0x003b, 0x003a, 0x0039,
+static int16_t latab[] = { 0x0040, 0x003f, 0x003e, 0x003d, 0x003c, 0x003b, 0x003a, 0x0039,
                     0x0038, 0x0037, 0x0036, 0x0035, 0x0034, 0x0034, 0x0033, 0x0032,
                     0x0031, 0x0030, 0x002f, 0x002f, 0x002e, 0x002d, 0x002c, 0x002c,
                     0x002b, 0x002a, 0x0029, 0x0029, 0x0028, 0x0027, 0x0026, 0x0026,
@@ -119,7 +119,7 @@ static sint_16 latab[] = { 0x0040, 0x003f, 0x003e, 0x003d, 0x003c, 0x003b, 0x003
                     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
                     0x0000, 0x0000, 0x0000, 0x0000};
 
-static sint_16 hth[][50] = {{ 0x04d0, 0x04d0, 0x0440, 0x0400, 0x03e0, 0x03c0, 0x03b0, 0x03b0,  
+static int16_t hth[][50] = {{ 0x04d0, 0x04d0, 0x0440, 0x0400, 0x03e0, 0x03c0, 0x03b0, 0x03b0,  
                       0x03a0, 0x03a0, 0x03a0, 0x03a0, 0x03a0, 0x0390, 0x0390, 0x0390,  
                       0x0380, 0x0380, 0x0370, 0x0370, 0x0360, 0x0360, 0x0350, 0x0350,  
                       0x0340, 0x0340, 0x0330, 0x0320, 0x0310, 0x0300, 0x02f0, 0x02f0,
@@ -144,38 +144,50 @@ static sint_16 hth[][50] = {{ 0x04d0, 0x04d0, 0x0440, 0x0400, 0x03e0, 0x03c0, 0x
                       0x0450, 0x04e0 }};
 
 
-static sint_16 baptab[] = { 0,  1,  1,  1,  1,  1,  2,  2,  3,  3,  3,  4,  4,  5,  5,  6,
+static int16_t baptab[] = { 0,  1,  1,  1,  1,  1,  2,  2,  3,  3,  3,  4,  4,  5,  5,  6,
                      6,  6,  6,  7,  7,  7,  7,  8,  8,  8,  8,  9,  9,  9,  9, 10, 
                      10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14,
                      14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15 };
 
-static sint_16 sdecay;
-static sint_16 fdecay;
-static sint_16 sgain;
-static sint_16 dbknee;
-static sint_16 floor;
-static sint_16 psd[256];
-static sint_16 bndpsd[256];
-static sint_16 excite[256];
-static sint_16 mask[256];
-
-static inline uint_16
-max(sint_16 a,sint_16 b)
+static int16_t sdecay;
+static int16_t fdecay;
+static int16_t sgain;
+static int16_t dbknee;
+static int16_t floor;
+static int16_t psd[256];
+static int16_t bndpsd[256];
+static int16_t excite[256];
+static int16_t mask[256];
+
+
+/**
+ *
+ **/
+
+static inline uint16_t max(int16_t a,int16_t b)
 {
 	return (a > b ? a : b);
 }
+
+
+/**
+ *
+ **/
 	
-static inline uint_16
-min(sint_16 a,sint_16 b)
+static inline uint16_t min(int16_t a,int16_t b)
 {
 	return (a < b ? a : b);
 }
 
-static inline sint_16 
-logadd(sint_16 a,sint_16  b) 
+
+/**
+ *
+ **/
+
+static inline int16_t logadd(int16_t a,int16_t  b) 
 { 
-	sint_16 c;
-	sint_16 address;
+	int16_t c;
+	int16_t address;
 
 	c = a - b; 
 	address = min((abs(c) >> 1), 255); 
@@ -187,15 +199,19 @@ logadd(sint_16 a,sint_16  b)
 }
 
 
-void bit_allocate(uint_16 fscod, bsi_t *bsi, audblk_t *audblk)
+/**
+ *
+ **/
+
+void bit_allocate(uint16_t fscod, bsi_t *bsi, audblk_t *audblk)
 {
-	uint_16 i;
-	sint_16 fgain;
-	sint_16 snroffset;
-	sint_16 start;
-	sint_16 end;
-	sint_16 fastleak;
-	sint_16 slowleak;
+	uint16_t i;
+	int16_t fgain;
+	int16_t snroffset;
+	int16_t start;
+	int16_t end;
+	int16_t fastleak;
+	int16_t slowleak;
 
 	/* Only perform bit_allocation if the exponents have changed or we
 	 * have new sideband information */
@@ -217,11 +233,10 @@ void bit_allocate(uint_16 fscod, bsi_t *bsi, audblk_t *audblk)
 	if(!audblk->csnroffst    && !audblk->fsnroffst[0] && 
 		 !audblk->fsnroffst[1] && !audblk->fsnroffst[2] && 
 		 !audblk->fsnroffst[3] && !audblk->fsnroffst[4] &&
-		 !audblk->cplfsnroffst && !audblk->lfefsnroffst)
-	{
-		memset(audblk->fbw_bap,0,sizeof(uint_16) * 256 * 5);
-		memset(audblk->cpl_bap,0,sizeof(uint_16) * 256);
-		memset(audblk->lfe_bap,0,sizeof(uint_16) * 7);
+		 !audblk->cplfsnroffst && !audblk->lfefsnroffst) {
+		memset(audblk->fbw_bap,0,sizeof(uint16_t) * 256 * 5);
+		memset(audblk->cpl_bap,0,sizeof(uint16_t) * 256);
+		memset(audblk->lfe_bap,0,sizeof(uint16_t) * 7);
 		return;
 	}
 		 
@@ -245,8 +260,7 @@ void bit_allocate(uint_16 fscod, bsi_t *bsi, audblk_t *audblk)
 		ba_compute_bap(start, end, snroffset, psd, mask, audblk->fbw_bap[i]);
 	}
 
-	if(audblk->cplinu)
-	{
+	if(audblk->cplinu) {
 		start = audblk->cplstrtmant; 
 		end = audblk->cplendmant; 
 		fgain = fastgain[audblk->cplfgaincod];
@@ -264,8 +278,7 @@ void bit_allocate(uint_16 fscod, bsi_t *bsi, audblk_t *audblk)
 		ba_compute_bap(start, end, snroffset, psd, mask, audblk->cpl_bap);
 	}
 
-	if(bsi->lfeon)
-	{
+	if(bsi->lfeon) {
 		start = 0;
 		end = 7;
 		fgain = fastgain[audblk->lfefgaincod];
@@ -285,15 +298,18 @@ void bit_allocate(uint_16 fscod, bsi_t *bsi, audblk_t *audblk)
 }
 
 
-static void ba_compute_psd(sint_16 start, sint_16 end, sint_16 exps[], 
-		sint_16 psd[], sint_16 bndpsd[])
+/**
+ *
+ **/
+
+static void ba_compute_psd(int16_t start, int16_t end, int16_t exps[], 
+		int16_t psd[], int16_t bndpsd[])
 {
 	int bin,i,j,k;
-	sint_16 lastbin = 0;
+	int16_t lastbin = 0;
 	
 	/* Map the exponents into dBs */
-	for (bin=start; bin<end; bin++) 
-	{ 
+	for (bin=start; bin<end; bin++) { 
 		psd[bin] = (3072 - (exps[bin] << 7)); 
 	}
 
@@ -301,14 +317,12 @@ static void ba_compute_psd(sint_16 start, sint_16 end, sint_16 exps[],
 	j = start; 
 	k = masktab[start]; 
 	
-	do 
-	{ 
+	do { 
 		lastbin = min(bndtab[k] + bndsz[k], end); 
 		bndpsd[k] = psd[j]; 
 		j++; 
 
-		for (i = j; i < lastbin; i++) 
-		{ 
+		for (i = j; i < lastbin; i++) { 
 			bndpsd[k] = logadd(bndpsd[k], psd[j]);
 			j++; 
 		} 
@@ -317,49 +331,49 @@ static void ba_compute_psd(sint_16 start, sint_16 end, sint_16 exps[],
 	} while (end > lastbin);
 }
 
-static void ba_compute_excitation(sint_16 start, sint_16 end,sint_16 fgain,
-		sint_16 fastleak, sint_16 slowleak, sint_16 is_lfe, sint_16 bndpsd[],
-		sint_16 excite[])
+
+/**
+ *
+ **/
+
+static void ba_compute_excitation(int16_t start, int16_t end,int16_t fgain,
+		int16_t fastleak, int16_t slowleak, int16_t is_lfe, int16_t bndpsd[],
+		int16_t excite[])
 {
 	int bin;
-	sint_16 bndstrt;
-	sint_16 bndend;
-	sint_16 lowcomp = 0;
-	sint_16 begin = 0;
+	int16_t bndstrt;
+	int16_t bndend;
+	int16_t lowcomp = 0;
+	int16_t begin = 0;
 
 	/* Compute excitation function */
 	bndstrt = masktab[start]; 
 	bndend = masktab[end - 1] + 1; 
 	
-	if (bndstrt == 0) /* For fbw and lfe channels */ 
-	{ 
+	if (bndstrt == 0) { /* For fbw and lfe channels */ 
 		lowcomp = calc_lowcomp(lowcomp, bndpsd[0], bndpsd[1], 0); 
 		excite[0] = bndpsd[0] - fgain - lowcomp; 
 		lowcomp = calc_lowcomp(lowcomp, bndpsd[1], bndpsd[2], 1);
 		excite[1] = bndpsd[1] - fgain - lowcomp; 
 		begin = 7 ; 
 		
-		/* Note: Do not call calc_lowcomp() for the last band of the lfe channel, (bin = 6) */ 
-		for (bin = 2; bin < 7; bin++) 
-		{ 
+// Note: Do not call calc_lowcomp() for the last band of the lfe channel,(bin=6)
+		for (bin = 2; bin < 7; bin++) { 
 			if (!(is_lfe && (bin == 6)))
 				lowcomp = calc_lowcomp(lowcomp, bndpsd[bin], bndpsd[bin+1], bin); 
 			fastleak = bndpsd[bin] - fgain; 
 			slowleak = bndpsd[bin] - sgain; 
 			excite[bin] = fastleak - lowcomp; 
 			
-			if (!(is_lfe && (bin == 6)))
-			{
-				if (bndpsd[bin] <= bndpsd[bin+1]) 
-				{
+			if (!(is_lfe && (bin == 6))) {
+				if (bndpsd[bin] <= bndpsd[bin+1]) {
 					begin = bin + 1 ; 
 					break; 
 				} 
 			}
 		} 
 		
-		for (bin = begin; bin < min(bndend, 22); bin++) 
-		{ 
+		for (bin = begin; bin < min(bndend, 22); bin++) { 
 			if (!(is_lfe && (bin == 6)))
 				lowcomp = calc_lowcomp(lowcomp, bndpsd[bin], bndpsd[bin+1], bin); 
 			fastleak -= fdecay ; 
@@ -371,12 +385,9 @@ static void ba_compute_excitation(sint_16 start, sint_16 end,sint_16 fgain,
 		begin = 22; 
 	} 
 	else /* For coupling channel */ 
-	{ 
 		begin = bndstrt; 
-	} 
 
-	for (bin = begin; bin < bndend; bin++) 
-	{ 
+	for (bin = begin; bin < bndend; bin++) { 
 		fastleak -= fdecay; 
 		fastleak = max(fastleak, bndpsd[bin] - fgain); 
 		slowleak -= sdecay; 
@@ -385,49 +396,47 @@ static void ba_compute_excitation(sint_16 start, sint_16 end,sint_16 fgain,
 	} 
 }
 
-static void ba_compute_mask(sint_16 start, sint_16 end, uint_16 fscod,
-		uint_16 deltbae, uint_16 deltnseg, uint_16 deltoffst[], uint_16 deltba[],
-		uint_16 deltlen[], sint_16 excite[], sint_16 mask[])
+
+/**
+ *
+ **/
+
+static void ba_compute_mask(int16_t start, int16_t end, uint16_t fscod,
+		uint16_t deltbae, uint16_t deltnseg, uint16_t deltoffst[], uint16_t deltba[],
+		uint16_t deltlen[], int16_t excite[], int16_t mask[])
 {
 	int bin,k;
-	sint_16 bndstrt;
-	sint_16 bndend;
-	sint_16 delta;
+	int16_t bndstrt;
+	int16_t bndend;
+	int16_t delta;
 
 	bndstrt = masktab[start]; 
 	bndend = masktab[end - 1] + 1; 
 
 	/* Compute the masking curve */
 
-	for (bin = bndstrt; bin < bndend; bin++) 
-	{ 
-		if (bndpsd[bin] < dbknee) 
-		{ 
+	for (bin = bndstrt; bin < bndend; bin++) { 
+		if (bndpsd[bin] < dbknee) { 
 			excite[bin] += ((dbknee - bndpsd[bin]) >> 2); 
 		} 
 		mask[bin] = max(excite[bin], hth[fscod][bin]);
 	}
 	
 	/* Perform delta bit modulation if necessary */
-	if ((deltbae == DELTA_BIT_REUSE) || (deltbae == DELTA_BIT_NEW)) 
-	{ 
-		sint_16 band = 0; 
-		sint_16 seg = 0; 
+	if ((deltbae == DELTA_BIT_REUSE) || (deltbae == DELTA_BIT_NEW)) { 
+		int16_t band = 0; 
+		int16_t seg = 0; 
 		
-		for (seg = 0; seg < deltnseg+1; seg++) 
-		{ 
+		for (seg = 0; seg < deltnseg+1; seg++) { 
 			band += deltoffst[seg]; 
-			if (deltba[seg] >= 4) 
-			{ 
+
+			if (deltba[seg] >= 4) { 
 				delta = (deltba[seg] - 3) << 7;
-			} 
-			else 
-			{ 
+			} else { 
 				delta = (deltba[seg] - 4) << 7;
 			} 
 			
-			for (k = 0; k < deltlen[seg]; k++) 
-			{ 
+			for (k = 0; k < deltlen[seg]; k++) { 
 				mask[band] += delta; 
 				band++; 
 			} 
@@ -435,19 +444,23 @@ static void ba_compute_mask(sint_16 start, sint_16 end, uint_16 fscod,
 	}
 }
 
-static void ba_compute_bap(sint_16 start, sint_16 end, sint_16 snroffset,
-		sint_16 psd[], sint_16 mask[], sint_16 bap[])
+
+/**
+ *
+ **/
+
+static void ba_compute_bap(int16_t start, int16_t end, int16_t snroffset,
+		int16_t psd[], int16_t mask[], int16_t bap[])
 {
 	int i,j,k;
-	sint_16 lastbin = 0;
-	sint_16 address = 0;
+	int16_t lastbin = 0;
+	int16_t address = 0;
 
 	/* Compute the bit allocation pointer for each bin */
 	i = start; 
 	j = masktab[start]; 
 
-	do 
-	{ 
+	do { 
 		lastbin = min(bndtab[j] + bndsz[j], end); 
 		mask[j] -= snroffset; 
 		mask[j] -= floor; 
@@ -457,8 +470,7 @@ static void ba_compute_bap(sint_16 start, sint_16 end, sint_16 snroffset,
 
 		mask[j] &= 0x1fe0;
 		mask[j] += floor; 
-		for (k = i; k < lastbin; k++) 
-		{ 
+		for (k = i; k < lastbin; k++) { 
 			address = (psd[i] - mask[j]) >> 5; 
 			address = min(63, max(0, address)); 
 			bap[i] = baptab[address]; 
@@ -468,25 +480,25 @@ static void ba_compute_bap(sint_16 start, sint_16 end, sint_16 snroffset,
 	} while (end > lastbin);
 }
 
-static sint_16 
-calc_lowcomp(sint_16 a,sint_16 b0,sint_16 b1,sint_16 bin) 
+
+/**
+ *
+ **/
+
+static int16_t calc_lowcomp (int16_t a, int16_t b0, int16_t b1, int16_t bin) 
 { 
 
-	if (bin < 7) 
-	{ 
+	if (bin < 7) { 
 		if ((b0 + 256) == b1)
 			a = 384; 
 	 	else if (b0 > b1) 
 			a = max(0, a - 64); 
-	} 
-	else if (bin < 20) 
-	{ 
+	} else if (bin < 20) { 
 		if ((b0 + 256) == b1) 
 			a = 320; 
 		else if (b0 > b1) 
 			a = max(0, a - 64) ; 
-	}
-	else  
+	} else  
 		a = max(0, a - 128); 
 	
 	return(a);
diff --git a/ac3dec/bit_allocate.h b/ac3dec/bit_allocate.h
index e48b0b2b..a6a3c770 100644
--- a/ac3dec/bit_allocate.h
+++ b/ac3dec/bit_allocate.h
@@ -21,4 +21,4 @@
  *
  */
 
-void bit_allocate(uint_16 fscod, bsi_t *bsi, audblk_t *audblk);
+void bit_allocate(uint16_t fscod, bsi_t *bsi, audblk_t *audblk);
diff --git a/ac3dec/bitstream.c b/ac3dec/bitstream.c
index 296d5ee3..f48d2408 100644
--- a/ac3dec/bitstream.c
+++ b/ac3dec/bitstream.c
@@ -23,54 +23,57 @@
 
 #include <stdlib.h>
 #include <stdio.h>
+#include <inttypes.h>
+
+#include <bswap.h>
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
 
 #include "ac3.h"
 #include "ac3_internal.h"
 #include "bitstream.h"
 
 
-uint_8 *buffer_start = 0;
-uint_32 bits_left = 0;
-uint_32 current_word;
+uint32_t bits_left = 0;
+uint64_t current_word;
+uint64_t *buffer_start = 0;
+
+
+static inline uint64_t getdword (void)
+{
+	return be2me_64 (*buffer_start++);
+}
+
 
-static inline void
-bitstream_fill_current()
+static inline void bitstream_fill_current (void)
 {
-	current_word = *((uint_32*)buffer_start)++;
-	current_word = swab32(current_word);
+	//current_word = bswap_64 (*buffer_start++);
+	current_word = getdword ();
 }
 
-//
-// The fast paths for _get is in the
-// bitstream.h header file so it can be inlined.
-//
-// The "bottom half" of this routine is suffixed _bh
-//
-// -ah
-//
-
-uint_32
-bitstream_get_bh(uint_32 num_bits)
+
+uint32_t bitstream_get_bh (uint32_t num_bits)
 {
-	uint_32 result;
+	uint32_t result;
 
 	num_bits -= bits_left;
-	result = (current_word << (32 - bits_left)) >> (32 - bits_left);
+	result = (current_word << (64 - bits_left)) >> (64 - bits_left);
 
 	bitstream_fill_current();
 
 	if(num_bits != 0)
-		result = (result << num_bits) | (current_word >> (32 - num_bits));
+		result = (result << num_bits) | (current_word >> (64 - num_bits));
 	
-	bits_left = 32 - num_bits;
+	bits_left = 64 - num_bits;
 
 	return result;
 }
 
-void
-bitstream_init(uint_8 *start)
+
+void bitstream_init (uint8_t *start)
 {
 	//initialize the start of the buffer
-	buffer_start = start;
+	buffer_start = (uint64_t *) start;
 	bits_left = 0;
 }
diff --git a/ac3dec/bitstream.h b/ac3dec/bitstream.h
index 33519306..f34726a7 100644
--- a/ac3dec/bitstream.h
+++ b/ac3dec/bitstream.h
@@ -21,56 +21,24 @@
  *
  */
 
-//My new and improved vego-matic endian swapping routine
-//(stolen from the kernel)
-#ifdef WORDS_BIGENDIAN
 
-#	define swab32(x) (x)
+#include <inttypes.h>
 
-#else
+extern uint32_t bits_left;
+extern uint64_t current_word;
 
-#	if defined (__i386__)
+void bitstream_init(uint8_t *start);
+inline uint32_t bitstream_get_bh(uint32_t num_bits);
 
-#	define swab32(x) __i386_swab32(x)
-	static inline const uint_32 __i386_swab32(uint_32 x)
-	{
-		__asm__("bswap %0" : "=r" (x) : "0" (x));
-		return x;
-	}
-
-#	else
-
-#	define swab32(x)\
-((((uint_8*)&x)[0] << 24) | (((uint_8*)&x)[1] << 16) |  \
- (((uint_8*)&x)[2] << 8)  | (((uint_8*)&x)[3]))
-
-#	endif
-#endif
-
-extern uint_32 bits_left;
-extern uint_32 current_word;
-
-void bitstream_init(uint_8 *start);
-
-uint_8 bitstream_get_byte(void);
-
-uint_8 *bitstream_get_buffer_start(void);
-void bitstream_buffer_frame(uint_32 frame_size);
-
-uint_32 bitstream_get_bh(uint_32 num_bits);
-
-static inline uint_32 
-bitstream_get(uint_32 num_bits)
+static inline uint32_t bitstream_get (uint32_t num_bits)
 {
-	uint_32 result;
-	
-	if(num_bits < bits_left)
-	{
-		result = (current_word << (32 - bits_left)) >> (32 - num_bits);
+	uint32_t result;
+
+	if (num_bits < bits_left) {
+		result = (current_word << (64 - bits_left)) >> (64 - num_bits);
 		bits_left -= num_bits;
 		return result;
 	}
 
-	return bitstream_get_bh(num_bits);
+	return bitstream_get_bh (num_bits);
 }
-
diff --git a/ac3dec/bswap.h b/ac3dec/bswap.h
new file mode 100644
index 00000000..73d398aa
--- /dev/null
+++ b/ac3dec/bswap.h
@@ -0,0 +1,80 @@
+/*****
+*
+* This file is part of the OMS program.
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by 
+* the Free Software Foundation; either version 2, or (at your option)
+* any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; see the file COPYING.  If not, write to
+* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+*****/
+
+#ifndef __BSWAP_H__
+#define __BSWAP_H__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef HAVE_BYTESWAP_H
+#include <byteswap.h>
+#else
+
+#include <inttypes.h>
+
+#ifdef WORDS_BIGENDIAN
+// FIXME these need to actually swap ;)
+#define bswap_16(x) (x)
+#define bswap_32(x) (x)
+#define bswap_64(x) (x)
+#else
+// This is wrong, 'cannot take address of ...'
+#define bswap_16(x) ((((uint8_t*)&x)[2] << 8) \
+			| (((uint8_t*)&x)[3]))
+
+// code from bits/byteswap.h (C) 1997, 1998 Free Software Foundation, Inc.
+#define bswap_32(x) \
+     ((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >>  8) | \
+      (((x) & 0x0000ff00) <<  8) | (((x) & 0x000000ff) << 24))
+
+#define bswap_64(x) \
+     (__extension__						\
+      ({ union { __extension__ unsigned long long int __ll;	\
+                 unsigned long int __l[2]; } __w, __r;		\
+         __w.__ll = (x);					\
+         __r.__l[0] = bswap_32 (__w.__l[1]);			\
+         __r.__l[1] = bswap_32 (__w.__l[0]);			\
+         __r.__ll; }))
+#endif
+
+#endif
+
+// be2me ... BigEndian to MachineEndian
+// le2me ... LittleEndian to MachineEndian
+
+#ifdef WORDS_BIGENDIAN
+#define be2me_16(x) (x)
+#define be2me_32(x) (x)
+#define be2me_64(x) (x)
+#define le2me_16(x) bswap_16(x)
+#define le2me_32(x) bswap_32(x)
+#define le2me_64(x) bswap_64(x)
+#else
+#define be2me_16(x) bswap_16(x)
+#define be2me_32(x) bswap_32(x)
+#define be2me_64(x) bswap_64(x)
+#define le2me_16(x) (x)
+#define le2me_32(x) (x)
+#define le2me_64(x) (x)
+#endif
+
+#endif
diff --git a/ac3dec/cmplx.h b/ac3dec/cmplx.h
new file mode 100644
index 00000000..9f74721e
--- /dev/null
+++ b/ac3dec/cmplx.h
@@ -0,0 +1,29 @@
+/*****
+*
+* This file is part of the OMS program.
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by 
+* the Free Software Foundation; either version 2, or (at your option)
+* any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; see the file COPYING.  If not, write to
+* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+*****/
+
+#ifndef __COMPLEX_H__
+#define __COMPLEX_H__
+
+typedef struct complex {
+	float re;
+	float im;
+} complex_t;
+
+#endif
diff --git a/ac3dec/coeff.c b/ac3dec/coeff.c
index b9f03ff6..7d5579ca 100644
--- a/ac3dec/coeff.c
+++ b/ac3dec/coeff.c
@@ -28,51 +28,183 @@
 #include "ac3_internal.h"
 
 
-#include "decode.h"
 #include "bitstream.h"
 #include "dither.h"
 #include "coeff.h"
 
+
 //
 //Lookup tables of 0.15 two's complement quantization values
 //
-static const uint_16 q_1[3] = 
+#define Q0 ((-2 << 15) / 3.0)
+#define Q1 (0)
+#define Q2 ((2 << 15) / 3.0)
+
+static const float q_1_0[ 32 ] =
 {
-	( -2 << 15)/3, 0,(  2 << 15)/3 
-};
+	Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,
+	Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,
+	Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,
+	0,0,0,0,0
+};                                                                              
 
-static const uint_16 q_2[5] = 
+static const float q_1_1[ 32 ] =
 {
-	( -4 << 15)/5,( -2 << 15)/5, 0,
-	(  2 << 15)/5,(  4 << 15)/5
-};
+	Q0,Q0,Q0,Q1,Q1,Q1,Q2,Q2,Q2,
+	Q0,Q0,Q0,Q1,Q1,Q1,Q2,Q2,Q2,
+	Q0,Q0,Q0,Q1,Q1,Q1,Q2,Q2,Q2,
+	0,0,0,0,0
+};                                                                              
 
-static const uint_16 q_3[7] = 
+static const float q_1_2[ 32 ] =
+{
+	Q0,Q1,Q2,Q0,Q1,Q2,Q0,Q1,Q2,
+	Q0,Q1,Q2,Q0,Q1,Q2,Q0,Q1,Q2,
+	Q0,Q1,Q2,Q0,Q1,Q2,Q0,Q1,Q2,
+	0,0,0,0,0
+};                                                                              
+
+#undef Q0
+#undef Q1
+#undef Q2
+
+#define Q0 ((-4 << 15) / 5.0)
+#define Q1 ((-2 << 15) / 5.0)
+#define Q2 (0)
+#define Q3 ((2 << 15) / 5.0)
+#define Q4 ((4 << 15) / 5.0)
+
+static const float q_2_0[ 128 ] =
+{
+	Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,
+	Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,Q0,
+	Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,
+	Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,Q1,
+	Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,
+	Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,Q2,
+	Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,
+	Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,Q3,
+	Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,
+	Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,Q4,
+	0,0,0
+};                                                                              
+
+static const float q_2_1[ 128 ] =
+{
+	Q0,Q0,Q0,Q0,Q0,Q1,Q1,Q1,Q1,Q1,
+	Q2,Q2,Q2,Q2,Q2,Q3,Q3,Q3,Q3,Q3,
+	Q4,Q4,Q4,Q4,Q4,Q0,Q0,Q0,Q0,Q0,
+	Q1,Q1,Q1,Q1,Q1,Q2,Q2,Q2,Q2,Q2,
+	Q3,Q3,Q3,Q3,Q3,Q4,Q4,Q4,Q4,Q4,
+	Q0,Q0,Q0,Q0,Q0,Q1,Q1,Q1,Q1,Q1,
+	Q2,Q2,Q2,Q2,Q2,Q3,Q3,Q3,Q3,Q3,
+	Q4,Q4,Q4,Q4,Q4,Q0,Q0,Q0,Q0,Q0,
+	Q1,Q1,Q1,Q1,Q1,Q2,Q2,Q2,Q2,Q2,
+	Q3,Q3,Q3,Q3,Q3,Q4,Q4,Q4,Q4,Q4,
+	Q0,Q0,Q0,Q0,Q0,Q1,Q1,Q1,Q1,Q1,
+	Q2,Q2,Q2,Q2,Q2,Q3,Q3,Q3,Q3,Q3,
+	Q4,Q4,Q4,Q4,Q4,0,0,0
+ };
+
+static const float q_2_2[ 128 ] =
+ {
+	Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,
+	Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,
+	Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,
+	Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,
+	Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,
+	Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,
+	Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,
+	Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,
+	Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,
+	Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,
+	Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,
+	Q0,Q1,Q2,Q3,Q4,Q0,Q1,Q2,Q3,Q4,
+	Q0,Q1,Q2,Q3,Q4,0,0,0
+};                                                                              
+
+#undef Q0
+#undef Q1
+#undef Q2
+#undef Q3
+#undef Q4                                                                      
+
+static const float q_3[7] =
 {
-	( -6 << 15)/7,( -4 << 15)/7,( -2 << 15)/7, 0,
-	(  2 << 15)/7,(  4 << 15)/7,(  6 << 15)/7
+	(-6 << 15)/7.0, (-4 << 15)/7.0, (-2 << 15)/7.0, 0.0,
+	( 2 << 15)/7.0, ( 4 << 15)/7.0, ( 6 << 15)/7.0
 };
 
-static const uint_16 q_4[11] = 
+#define Q0 ((-10 << 15) / 11.0)
+#define Q1 ((-8 << 15) / 11.0)
+#define Q2 ((-6 << 15) / 11.0)
+#define Q3 ((-4 << 15) / 11.0)
+#define Q4 ((-2 << 15) / 11.0)
+#define Q5 (0)
+#define Q6 ((2 << 15) / 11.0)
+#define Q7 ((4 << 15) / 11.0)
+#define Q8 ((6 << 15) / 11.0)
+#define Q9 ((8 << 15) / 11.0)
+#define QA ((10 << 15) / 11.0)
+
+static const float q_4_0[ 128 ] =
 {
-	(-10 << 15)/11,(-8 << 15)/11,(-6 << 15)/11, ( -4 << 15)/11,(-2 << 15)/11,  0,
-	(  2 << 15)/11,( 4 << 15)/11,( 6 << 15)/11, (  8 << 15)/11,(10 << 15)/11
+	Q0, Q0, Q0, Q0, Q0, Q0, Q0, Q0, Q0, Q0, Q0,
+	Q1, Q1, Q1, Q1, Q1, Q1, Q1, Q1, Q1, Q1, Q1,
+	Q2, Q2, Q2, Q2, Q2, Q2, Q2, Q2, Q2, Q2, Q2,
+	Q3, Q3, Q3, Q3, Q3, Q3, Q3, Q3, Q3, Q3, Q3,
+	Q4, Q4, Q4, Q4, Q4, Q4, Q4, Q4, Q4, Q4, Q4,
+	Q5, Q5, Q5, Q5, Q5, Q5, Q5, Q5, Q5, Q5, Q5,
+	Q6, Q6, Q6, Q6, Q6, Q6, Q6, Q6, Q6, Q6, Q6,
+	Q7, Q7, Q7, Q7, Q7, Q7, Q7, Q7, Q7, Q7, Q7,
+	Q8, Q8, Q8, Q8, Q8, Q8, Q8, Q8, Q8, Q8, Q8,
+	Q9, Q9, Q9, Q9, Q9, Q9, Q9, Q9, Q9, Q9, Q9,
+	QA, QA, QA, QA, QA, QA, QA, QA, QA, QA, QA,
+	 0,  0,  0,  0,  0,  0,  0
+ };
+
+static const float q_4_1[ 128 ] =
+{
+	Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA,
+	Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA,
+	Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA,
+	Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA,
+	Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA,
+	Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA,
+	Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA,
+	Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA,
+	Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA,
+	Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA,
+	Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, QA,
+	 0,  0,  0,  0,  0,  0,  0
 };
 
-static const uint_16 q_5[15] = 
+#undef Q0
+#undef Q1
+#undef Q2
+#undef Q3
+#undef Q4
+#undef Q5
+#undef Q6
+#undef Q7
+#undef Q8
+#undef Q9
+#undef QA                                                                       
+
+static const float q_5[15] =
 {
-	(-14 << 15)/15,(-12 << 15)/15,(-10 << 15)/15,
-	( -8 << 15)/15,( -6 << 15)/15,( -4 << 15)/15,
-	( -2 << 15)/15,   0          ,(  2 << 15)/15,
-	(  4 << 15)/15,(  6 << 15)/15,(  8 << 15)/15,
-	( 10 << 15)/15,( 12 << 15)/15,( 14 << 15)/15
-};
+	(-14 << 15)/15.0,(-12 << 15)/15.0,(-10 << 15)/15.0,
+	( -8 << 15)/15.0,( -6 << 15)/15.0,( -4 << 15)/15.0,
+	( -2 << 15)/15.0,   0.0          ,(  2 << 15)/15.0,
+	(  4 << 15)/15.0,(  6 << 15)/15.0,(  8 << 15)/15.0,
+	( 10 << 15)/15.0,( 12 << 15)/15.0,( 14 << 15)/15.0
+};                                                                              
 
 //
 // Scale factors for convert_to_float
 //
 
-static const uint_32 u32_scale_factors[25] = 
+static const uint32_t u32_scale_factors[25] = 
 {
 	0x38000000, //2 ^ -(0 + 15)
 	0x37800000, //2 ^ -(1 + 15)
@@ -104,230 +236,179 @@ static const uint_32 u32_scale_factors[25] =
 static float *scale_factor = (float*)u32_scale_factors;
 
 //These store the persistent state of the packed mantissas
-static uint_16 m_1[3];
-static uint_16 m_2[3];
-static uint_16 m_4[2];
-static uint_16 m_1_pointer;
-static uint_16 m_2_pointer;
-static uint_16 m_4_pointer;
+static float q_1[2];
+static float q_2[2];
+static float q_4[1];
+static int32_t q_1_pointer;
+static int32_t q_2_pointer;
+static int32_t q_4_pointer;
+static float __inline__
+coeff_get_float(uint16_t bap, uint16_t dithflag, uint16_t exp);
 
 //Conversion from bap to number of bits in the mantissas
 //zeros account for cases 0,1,2,4 which are special cased
-static uint_16 qnttztab[16] = { 0, 0, 0, 3, 0 , 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16};
-
-static void    coeff_reset(void);
-static sint_16 coeff_get_mantissa(uint_16 bap, uint_16 dithflag);
-static void    coeff_uncouple_ch(float samples[],bsi_t *bsi,audblk_t *audblk,uint_32 ch);
-
-//
-// Convert a 0.15 fixed point number into IEEE single
-// precision floating point and scale by 2^-exp
-//
-static inline float
-convert_to_float(uint_16 exp, sint_16 mantissa)
+static uint16_t qnttztab[16] =
 {
-	float x;
-
-	//the scale by 2^-15 is built into the scale factor table
-	x = mantissa * scale_factor[exp];
+	 0,  0,  0,  3,
+	 0,  4,  5,  6,
+	 7,  8,  9, 10,
+	11, 12, 14, 16
+};
 
-	return x;
-}
+static void	coeff_reset(void);
+static float	coeff_get_float(uint16_t bap, uint16_t dithflag, uint16_t exp);
+static void	coeff_uncouple_ch(float samples[],bsi_t *bsi,audblk_t *audblk,uint32_t ch);
 
-void
-coeff_unpack(bsi_t *bsi, audblk_t *audblk, stream_samples_t samples)
+void coeff_unpack(bsi_t *bsi, audblk_t *audblk, stream_samples_t samples)
 {
-	uint_16 i,j;
-	uint_32 done_cpl = 0;
-	sint_16 mantissa;
+	uint16_t i,j;
+	uint32_t done_cpl = 0;
 
 	coeff_reset();
 
-	for(i=0; i< bsi->nfchans; i++)
-	{
+	for(i=0; i< bsi->nfchans; i++) {
 		for(j=0; j < audblk->endmant[i]; j++)
-		{
-			mantissa = coeff_get_mantissa(audblk->fbw_bap[i][j],audblk->dithflag[i]);
-			samples[i][j] = convert_to_float(audblk->fbw_exp[i][j],mantissa);
-		}
+			samples[i][j] = coeff_get_float(audblk->fbw_bap[i][j], audblk->dithflag[i], audblk->fbw_exp[i][j]);
 
-		if(audblk->cplinu && audblk->chincpl[i] && !(done_cpl))
-		{
+		if(audblk->cplinu && audblk->chincpl[i] && !(done_cpl)) {
 			// ncplmant is equal to 12 * ncplsubnd
-			// Don't dither coupling channel until channel separation so that
-			// interchannel noise is uncorrelated 
+			// Don't dither coupling channel until channel
+			// separation so that interchannel noise is uncorrelated 
 			for(j=audblk->cplstrtmant; j < audblk->cplendmant; j++)
-				audblk->cplmant[j] = coeff_get_mantissa(audblk->cpl_bap[j],0);
+				audblk->cpl_flt[j] = coeff_get_float(audblk->cpl_bap[j],0, audblk->cpl_exp[j]);
 			done_cpl = 1;
 		}
 	}
 
 	//uncouple the channel if necessary
-	if(audblk->cplinu)
-	{
-		for(i=0; i< bsi->nfchans; i++)
-		{
+	if(audblk->cplinu) {
+		for(i=0; i< bsi->nfchans; i++) {
 			if(audblk->chincpl[i])
 				coeff_uncouple_ch(samples[i],bsi,audblk,i);
 		}
 
 	}
 
-	if(bsi->lfeon)
-	{
+	if(bsi->lfeon) {
 		// There are always 7 mantissas for lfe, no dither for lfe 
 		for(j=0; j < 7 ; j++)
-		{
-			mantissa = coeff_get_mantissa(audblk->lfe_bap[j],0);
-			samples[5][j] = convert_to_float(audblk->lfe_exp[j],mantissa);
-		}
+			samples[5][j] = coeff_get_float(audblk->lfe_bap[j], 0, audblk->lfe_exp[j]);
 	}
 }
 
-//
-//Fetch a mantissa from the bitstream
-//
-//The mantissa returned is a signed 0.15 fixed point number
-//
-static sint_16
-coeff_get_mantissa(uint_16 bap, uint_16 dithflag)
+
+/**
+ * Fetch a float from the bitstream
+ **/
+
+static float inline coeff_get_float (uint16_t bap, uint16_t dithflag, uint16_t exp)
 {
-	uint_16 mantissa;
-	uint_16 group_code;
+	uint16_t dummy = 0;
 
 	//If the bap is 0-5 then we have special cases to take care of
-	switch(bap)
-	{
+	switch(bap) {
 		case 0:
 			if(dithflag)
-				mantissa = dither_gen();
-			else
-				mantissa = 0;
-			break;
+				return (dither_gen() * scale_factor[exp]);
+
+			return 0.0;
 
 		case 1:
-			if(m_1_pointer > 2)
-			{
-				group_code = bitstream_get(5);
-
-				if(group_code > 26)
-					goto error;
-
-				m_1[0] = group_code / 9; 
-				m_1[1] = (group_code % 9) / 3; 
-				m_1[2] = (group_code % 9) % 3; 
-				m_1_pointer = 0;
-			}
-			mantissa = m_1[m_1_pointer++];
-			mantissa = q_1[mantissa];
-			break;
+			if (q_1_pointer >= 0)
+				return(q_1[q_1_pointer--] * scale_factor[exp]);
+
+			if ((dummy = bitstream_get (5)) > 26)
+				goto error;
+
+			q_1[1] = q_1_1[dummy];
+			q_1[0] = q_1_2[dummy];
+			q_1_pointer = 1;
+
+			return (q_1_0[dummy] * scale_factor[exp]);
+
 		case 2:
+			if(q_2_pointer >= 0)
+				return (q_2[q_2_pointer--] * scale_factor[exp]);
 
-			if(m_2_pointer > 2)
-			{
-				group_code = bitstream_get(7);
+			if ((dummy = bitstream_get (7)) > 124)
+				goto error;
 
-				if(group_code > 124)
-					goto error;
+			q_2[1] = q_2_1[dummy];
+			q_2[0] = q_2_2[dummy];
+			q_2_pointer = 1;
 
-				m_2[0] = group_code / 25;
-				m_2[1] = (group_code % 25) / 5 ;
-				m_2[2] = (group_code % 25) % 5 ; 
-				m_2_pointer = 0;
-			}
-			mantissa = m_2[m_2_pointer++];
-			mantissa = q_2[mantissa];
-			break;
+			return (q_2_0[dummy] * scale_factor[exp]);
 
 		case 3:
-			mantissa = bitstream_get(3);
-
-			if(mantissa > 6)
+			if ((dummy = bitstream_get (3)) > 6)
 				goto error;
 
-			mantissa = q_3[mantissa];
-			break;
+			return (q_3[dummy] * scale_factor[exp]);
 
 		case 4:
-			if(m_4_pointer > 1)
-			{
-				group_code = bitstream_get(7);
+			if(q_4_pointer >= 0)
+				return (q_4[q_4_pointer--] * scale_factor[exp]);
 
-				if(group_code > 120)
-					goto error;
+			if ((dummy = bitstream_get (7)) > 120)
+				goto error;
 
-				m_4[0] = group_code / 11;
-				m_4[1] = group_code % 11;
-				m_4_pointer = 0;
-			}
-			mantissa = m_4[m_4_pointer++];
-			mantissa = q_4[mantissa];
-			break;
+			q_4[0] = q_4_1[dummy];
+			q_4_pointer = 0;
 
-		case 5:
-			mantissa = bitstream_get(4);
+			return (q_4_0[dummy] * scale_factor[exp]);  
 
-			if(mantissa > 14)
+		case 5:
+			if ((dummy = bitstream_get (4)) > 14)
 				goto error;
 
-			mantissa = q_5[mantissa];
-			break;
+			return (q_5[dummy] * scale_factor[exp]);
 
 		default:
-			mantissa = bitstream_get(qnttztab[bap]);
-			mantissa <<= 16 - qnttztab[bap];
+			dummy = bitstream_get(qnttztab[bap]);
+			dummy <<= 16 - qnttztab[bap];
+			return ((int16_t)dummy * scale_factor[exp]);  
 	}
 
-	return mantissa;
-
-
-
 error:
-	if(!error_flag)
-		fprintf(stderr,"** Invalid mantissa - skipping frame **\n");
-	error_flag = 1;
-
-	return 0;
+#ifdef DEBUG
+	fprintf(stderr,"** Invalid mantissa - skipping frame **\n");
+#endif
+	HANDLE_ERROR();
 }
 
-//
-// Reset the mantissa state
-//
-static void 
-coeff_reset(void)
+
+/**
+ * Reset the mantissa state
+ **/
+
+static void coeff_reset(void)
 {
-	m_1[2] = m_1[1] = m_1[0] = 0;
-	m_2[2] = m_2[1] = m_2[0] = 0;
-	m_4[1] = m_4[0] = 0;
-	m_1_pointer = m_2_pointer = m_4_pointer = 3;
+	q_1_pointer = q_2_pointer = q_4_pointer = -1;
 }
 
-//
-// Uncouple the coupling channel into a fbw channel
-//
-static void
-coeff_uncouple_ch(float samples[],bsi_t *bsi,audblk_t *audblk,uint_32 ch)
+
+/**
+ * Uncouple the coupling channel into a fbw channel
+ **/
+
+static void coeff_uncouple_ch (float samples[],bsi_t *bsi,audblk_t *audblk,uint32_t ch)
 {
-	uint_32 bnd = 0;
-	uint_32 sub_bnd = 0;
-	uint_32 i,j;
+	uint32_t bnd = 0;
+	uint32_t sub_bnd = 0;
+	uint32_t i,j;
 	float cpl_coord = 1.0;
-	uint_32 cpl_exp_tmp;
-	uint_32 cpl_mant_tmp;
-	sint_16 mantissa;
-
+	uint32_t cpl_exp_tmp;
+	uint32_t cpl_mant_tmp;
 
-	for(i=audblk->cplstrtmant;i<audblk->cplendmant;)
-	{
-		if(!audblk->cplbndstrc[sub_bnd++])
-		{
+	for (i=audblk->cplstrtmant;i<audblk->cplendmant;) {
+		if (!audblk->cplbndstrc[sub_bnd++]) {
 			cpl_exp_tmp = audblk->cplcoexp[ch][bnd] + 3 * audblk->mstrcplco[ch];
-			if(audblk->cplcoexp[ch][bnd] == 15)
+			if (audblk->cplcoexp[ch][bnd] == 15)
 				cpl_mant_tmp = (audblk->cplcomant[ch][bnd]) << 11;
 			else
 				cpl_mant_tmp = ((0x10) | audblk->cplcomant[ch][bnd]) << 10;
 			
-			cpl_coord = convert_to_float(cpl_exp_tmp,cpl_mant_tmp) * 8.0f;
+			cpl_coord = (cpl_mant_tmp * scale_factor[cpl_exp_tmp]) * 8.0f;
 
 			//Invert the phase for the right channel if necessary
 			if(bsi->acmod == 0x2 && audblk->phsflginu && ch == 1 && audblk->phsflg[bnd])
@@ -336,16 +417,13 @@ coeff_uncouple_ch(float samples[],bsi_t *bsi,audblk_t *audblk,uint_32 ch)
 			bnd++;
 		}
 
-		for(j=0;j < 12; j++)
-		{
-			//Get new dither values for each channel if necessary, so
-			//the channels are uncorrelated
-			if(audblk->dithflag[ch] && audblk->cpl_bap[i] == 0)
-				mantissa = dither_gen();
+		for(j=0;j < 12; j++) {
+			// Get new dither values for each channel if necessary,
+			// so the channels are uncorrelated
+			if(audblk->dithflag[ch] && !audblk->cpl_bap[i])
+				samples[i] = cpl_coord * (dither_gen() * scale_factor[audblk->cpl_exp[i]]);
 			else
-				mantissa = audblk->cplmant[i];
-
-			samples[i]  = cpl_coord * convert_to_float(audblk->cpl_exp[i],mantissa);
+				samples[i] = cpl_coord * audblk->cpl_flt[i];
 
 			i++;
 		}
diff --git a/ac3dec/cpu_accel.c b/ac3dec/cpu_accel.c
new file mode 100644
index 00000000..e01db83f
--- /dev/null
+++ b/ac3dec/cpu_accel.c
@@ -0,0 +1,129 @@
+/*****
+*
+* This file is part of the OMS program.
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by 
+* the Free Software Foundation; either version 2, or (at your option)
+* any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; see the file COPYING.  If not, write to
+* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+*****/
+
+#include <inttypes.h>
+#include "mm_accel.h"
+
+//#ifdef __i386__
+#if 0
+
+#ifdef __PIC__
+#define cpuid(op, eax, ebx, ecx, edx)		\
+	__asm__ __volatile__ ("pushl %%ebx\n\t"	\
+		  "cpuid\n\t"			\
+		  "movl %%ebx, %%esi\n\t"	\
+		  "popl %%ebx"			\
+		: "=a" (eax),			\
+		  "=S" (ebx),			\
+		  "=c" (ecx),			\
+		  "=d" (edx)			\
+		: "a" (op)			\
+		: "cc")
+#else
+#define cpuid(op, eax, ebx, ecx, edx)		\
+	__asm__("cpuid"				\
+		: "=a" (eax),			\
+		  "=b" (ebx),			\
+		  "=c" (ecx),			\
+		  "=d" (edx)			\
+		: "a" (op)			\
+		: "cc")
+#endif
+
+static inline int has_cpuid ()
+{
+	return 1;
+/*
+	uint32_t eax, ebx;
+
+	asm ("pushfl\n\t"
+		  "popl %0\n\t"
+		  "movl %0,%1\n\t"
+		  "xorl $0x200000,%0\n\t"
+		  "pushl %0\n\t"
+		  "popfl\n\t"
+		  "pushfl\n\t"
+		  "popl %0"
+		: "=a" (eax),
+		  "=b" (ebx)
+		:
+		: "cc");
+
+	return (eax != ebx);
+*/
+}
+
+
+static uint32_t x86_accel (void)
+{
+	uint32_t eax, ebx, ecx, edx;
+	int AMD;
+	uint32_t caps;
+
+	if (!has_cpuid ())		// no cpuid
+		return 0;
+
+	cpuid (0x00000000, eax, ebx, ecx, edx);
+	if (!eax)			// vendor string only
+		return 0;
+
+	AMD = (ebx == 0x68747541) && (ecx == 0x444d4163) && (edx == 0x69746e65);
+
+	cpuid (0x00000001, eax, ebx, ecx, edx);
+	if (! (edx & 0x00800000))	// no MMX
+		return 0;
+
+	caps = OMS_ACCEL_X86_MMX;
+	if (edx & 0x02000000)		// SSE - identical to AMD MMX extensions
+		caps = OMS_ACCEL_X86_MMX | OMS_ACCEL_X86_MMXEXT;
+
+	cpuid (0x80000000, eax, ebx, ecx, edx);
+	if (eax < 0x80000001)		// no extended capabilities
+		return caps;
+
+	cpuid (0x80000001, eax, ebx, ecx, edx);
+
+	if (edx & 0x80000000)
+		caps |= OMS_ACCEL_X86_3DNOW;
+
+	if (AMD && (edx & 0x00400000))	// AMD MMX extensions
+		caps |= OMS_ACCEL_X86_MMXEXT;
+
+	return caps;
+}
+#endif
+
+uint32_t mm_accel (void)
+{
+//#ifdef __i386__
+#if 0
+    static int got_accel = 0;
+    static uint32_t accel;
+
+    if (!got_accel) {
+	got_accel = 1;
+	accel = x86_accel ();
+    }
+
+    return accel;
+#else
+    return 0;
+#endif
+}
diff --git a/ac3dec/crc.c b/ac3dec/crc.c
index 3210ce7f..2d5ef806 100644
--- a/ac3dec/crc.c
+++ b/ac3dec/crc.c
@@ -30,7 +30,7 @@
 
 #include "crc.h"
 
-static const uint_16 crc_lut[256] = 
+static const uint16_t crc_lut[256] = 
 {
 	0x0000,0x8005,0x800f,0x000a,0x801b,0x001e,0x0014,0x8011,
 	0x8033,0x0036,0x003c,0x8039,0x0028,0x802d,0x8027,0x0022,
@@ -66,31 +66,31 @@ static const uint_16 crc_lut[256] =
 	0x8213,0x0216,0x021c,0x8219,0x0208,0x820d,0x8207,0x0202
 };
 
-static uint_16 state;
+static uint16_t state;
 
-void
-crc_init(void)
+
+void crc_init(void)
 {
 	state = 0;
 }
 
 
-inline void crc_process_byte(uint_8 data)
+inline void crc_process_byte (uint8_t data)
 {
 	state = crc_lut[data ^ (state>>8)] ^ (state<<8);
 }
 
-void
-crc_process_frame(uint_8 *data,uint_32 num_bytes)
+
+void crc_process_frame (uint8_t *data,uint32_t num_bytes)
 {
-	uint_32 i;
+	uint32_t i;
 
-	for(i=0;i<num_bytes;i++)
-		crc_process_byte(data[i]);
+	for(i=0; i<num_bytes; i++)
+		crc_process_byte (data[i]);
 }
 
-int
-crc_validate(void)
+
+int crc_validate(void)
 {
-	return(state  == 0);
+	return (state == 0);
 }
diff --git a/ac3dec/crc.h b/ac3dec/crc.h
index 07d57b15..16489656 100644
--- a/ac3dec/crc.h
+++ b/ac3dec/crc.h
@@ -23,5 +23,5 @@
  
 int crc_validate(void);
 void crc_init(void);
-void crc_process_byte(uint_8 data);
-void crc_process_frame(uint_8 *data,uint_32 num_bytes);
+void crc_process_byte(uint8_t data);
+void crc_process_frame(uint8_t *data,uint32_t num_bytes);
diff --git a/ac3dec/debug.c b/ac3dec/debug.c
index b7d6a3b0..b84511d7 100644
--- a/ac3dec/debug.c
+++ b/ac3dec/debug.c
@@ -31,18 +31,8 @@ static int debug_level = -1;
 // We could potentially have multiple levels of debug info
 int debug_is_on(void)
 {
-	char *env_var;
-	
-	if(debug_level < 0)
-	{
-	  env_var = getenv("AC3_DEBUG");
-
-		if (env_var)
-		{
-			debug_level = 1;
-		}
-		else
-			debug_level = 0;
+	if(debug_level < 0) {
+		debug_level = getenv ("AC3_DEBUG") ? 1 : 0;
 	}
 	
 	return debug_level;
@@ -52,7 +42,7 @@ int debug_is_on(void)
 #ifndef __GNUC__
 void dprintf(char fmt[],...)
 {
-	int foo = 0;
+	return;
 }
 #endif
 
diff --git a/ac3dec/debug.h b/ac3dec/debug.h
index f45cb5b1..691d4032 100644
--- a/ac3dec/debug.h
+++ b/ac3dec/debug.h
@@ -22,16 +22,20 @@
  *
  */
 
-int debug_is_on(void);
+int debug_is_on (void);
 
 #ifdef __GNUC__
-#define dprintf(format,args...)\
+#ifdef DEBUG
+#define dprintf(args...)\
 {\
 	if (debug_is_on())\
 	{\
-		fprintf(stderr,format,## args);\
+		fprintf(stderr, args);\
 	}\
 }
 #else
+#define dprintf(args...) { };
+#endif
+#else
 void dprintf(char fmt[],...);
 #endif
diff --git a/ac3dec/decode.c b/ac3dec/decode.c
index 5e7f034b..fb852055 100644
--- a/ac3dec/decode.c
+++ b/ac3dec/decode.c
@@ -3,9 +3,6 @@
  *
  *	Copyright (C) Aaron Holtzman - May 1999
  *
- *      Added support for DVB-s PCI card by:
- *      Matjaz Thaler <matjaz.thaler@rd.iskraemeco.si> - November 2000
- *
  *  This file is part of ac3dec, a free Dolby AC-3 stream decoder.
  *	
  *  ac3dec is free software; you can redistribute it and/or modify
@@ -22,6 +19,17 @@
  *  along with GNU Make; see the file COPYING.  If not, write to
  *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
  *
+ *------------------------------------------------------------
+ *
+ *	Thomas Mirlacher <dent@cosy.sbg.ac.at>
+ *	added OMS support
+ * 11 Jan 2001
+ *	Thomas Mirlacher <dent@cosy.sbg.ac.at>
+ *	faster error response using jmp functions
+ * 
+ *  9 Aug 2001      
+ *      Matjaz Thaler <matjaz.thaler@rd.iskraemeco.si>
+ *      Added support for DVB-s PCI card
  *
  */
 
@@ -34,14 +42,17 @@
 #include <errno.h>
 #include <string.h>
 #include <sys/time.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
+
+#ifdef __OMS__
+#include <oms/oms.h>
+#include <oms/plugin/output_audio.h>
+#endif
 
 #include "ac3.h"
 #include "ac3_internal.h"
 #include "bitstream.h"
+#include "downmix.h"
+#include "srfft.h"
 #include "imdct.h"
 #include "exponent.h"
 #include "coeff.h"
@@ -51,96 +62,91 @@
 #include "stats.h"
 #include "rematrix.h"
 #include "sanity_check.h"
-#include "downmix.h"
 #include "debug.h"
-
-#define AC3_BUFFER_SIZE (6*1024*16)      
-
+#ifndef __OMS__
+//#include "audio_out.h"
+#endif
 //our global config structure
 ac3_config_t ac3_config;
-uint_32 error_flag = 0;
 
 static audblk_t audblk;
 static bsi_t bsi;
 static syncinfo_t syncinfo;
-static uint_32 frame_count = 0;
-//static uint_32 is_output_initialized = 0;
+#ifndef __OMS__
+static uint32_t done_banner;
+#endif
+static uint32_t is_output_initialized = 0;
 
 //the floating point samples for one audblk
 static stream_samples_t samples;
 
 //the integer samples for the entire frame (with enough space for 2 ch out)
 //if this size change, be sure to change the size when muting
-static sint_16 s16_samples[2 * 6 * 256];
-
+static int16_t s16_samples[2 * 6 * 256] __attribute__ ((aligned(16)));
 
+// downmix stuff
+static float cmixlev_lut[4] = { 0.707, 0.595, 0.500, 0.707 };
+static float smixlev_lut[4] = { 0.707, 0.500, 0.0   , 0.500 };
+static dm_par_t dm_par;
+ 
 //Storage for the syncframe
-#define SYNC_BUFFER_MAX_SIZE 4096
-static uint_8 sync_buffer[SYNC_BUFFER_MAX_SIZE];
-static uint_32 sync_buffer_size = 0;;
+#define BUFFER_MAX_SIZE 4096
+static uint8_t buffer[BUFFER_MAX_SIZE];
+static uint32_t buffer_size = 0;;
+// for error handling
+jmp_buf error_jmp_mark;
 
-uint_32
-decode_sync_buffer_syncframe(syncinfo_t *syncinfo, uint_8 **start,uint_8 *end)
+static uint32_t decode_buffer_syncframe (syncinfo_t *syncinfo, uint8_t **start, uint8_t *end)
 {
-	uint_8 *cur = *start;
-	uint_16 syncword = syncinfo->syncword;
-	uint_32 ret = 0;
+	uint8_t *cur = *start;
+	uint16_t syncword = syncinfo->syncword;
+	uint32_t ret = 0;
 
-	// 
 	// Find an ac3 sync frame.
-	// 
-resync:
-
-	while(syncword != 0x0b77)
-	{
-		if(cur >= end)
+	while (syncword != 0x0b77) {
+		if (cur >= end)
 			goto done;
 		syncword = (syncword << 8) + *cur++;
 	}
 
 	//need the next 3 bytes to decide how big the frame is
-	while(sync_buffer_size < 3)
-	{
+	while (buffer_size < 3) {
 		if(cur >= end)
 			goto done;
 
-		sync_buffer[sync_buffer_size++] = *cur++;
+		buffer[buffer_size++] = *cur++;
 	}
 	
-	parse_syncinfo(syncinfo,sync_buffer);
-	stats_print_syncinfo(syncinfo);
+	parse_syncinfo (syncinfo,buffer);
+	stats_print_syncinfo (syncinfo);
 
-	while(sync_buffer_size < syncinfo->frame_size * 2 - 2)
-	{
+	while (buffer_size < syncinfo->frame_size * 2 - 2) {
 		if(cur >= end)
 			goto done;
 
-		sync_buffer[sync_buffer_size++] = *cur++;
+		buffer[buffer_size++] = *cur++;
 	}
-	
+
+#if 0
 	// Check the crc over the entire frame 
 	crc_init();
-	crc_process_frame(sync_buffer,syncinfo->frame_size * 2 - 2);
+	crc_process_frame (buffer, syncinfo->frame_size * 2 - 2);
 
-	if(!crc_validate())
-	{
+	if (!crc_validate()) {
 		fprintf(stderr,"** CRC failed - skipping frame **\n");
-		syncword = 0xffff;
-		sync_buffer_size = 0;
-		goto resync;
+		goto done;
 	}
-	
-	//
+#endif
+
 	//if we got to this point, we found a valid ac3 frame to decode
-	//
 
-	bitstream_init(sync_buffer);
+	bitstream_init (buffer);
 	//get rid of the syncinfo struct as we already parsed it
-	bitstream_get(24);
+	bitstream_get (24);
 
 	//reset the syncword for next time
 	syncword = 0xffff;
-	sync_buffer_size = 0;
+	buffer_size = 0;
 	ret = 1;
 
 done:
@@ -149,100 +155,131 @@ done:
 	return ret;
 }
 
-void
-decode_mute(void)
+
+void inline decode_mute (void)
 {
-	fprintf(stderr,"muting frame\n");
 	//mute the frame
-	memset(s16_samples,0,sizeof(sint_16) * 256 * 2 * 6);
-	error_flag = 0;
+	memset (s16_samples, 0, sizeof(int16_t) * 256 * 2 * 6);
 }
 
 
-void
-ac3_init(ac3_config_t *config)
+void ac3dec_init (void)
 {
-	memcpy(&ac3_config,config,sizeof(ac3_config_t));
-
-	imdct_init();
-	sanity_check_init(&syncinfo,&bsi,&audblk);
-
-	//	ac3_output = *foo;
+// FIXME - don't do that statically here
+	ac3_config.num_output_ch = 2;
+	ac3_config.flags = 0;
+
+	imdct_init ();
+	downmix_init ();
+	memset (&syncinfo, 0, sizeof (syncinfo));
+	memset (&bsi, 0, sizeof (bsi));
+	memset (&audblk, 0, sizeof (audblk));
+	sanity_check_init (&syncinfo,&bsi,&audblk);
 }
 
-uint_32 ac3_decode_data(uint_8 *data_start,uint_8 *data_end, int ac3reset, int *input_pointer, int *output_pointer, char *ac3_data)
+#ifdef __OMS__
+size_t ac3dec_decode_data (plugin_output_audio_t *output, uint8_t *data_start, uint8_t *data_end)
+#else
+size_t ac3dec_decode_data (uint8_t *data_start ,uint8_t *data_end, int ac3reset, int *input_pointer, int *output_pointer, char *ac3_data)
+#endif
 {
-	uint_32 i;
+	uint32_t i;
 	int datasize;
 	char *data;
 
+
 	if(ac3reset != 0){
 	  syncinfo.syncword = 0xffff;
-	  sync_buffer_size = 0;
+	  buffer_size = 0;
+	}
+		
+	if (setjmp (error_jmp_mark) < 0) {
+		ac3dec_init ();
+		return 0;
 	}
-	
-	while(decode_sync_buffer_syncframe(&syncinfo,&data_start,data_end))
-	{
-		dprintf("(decode) begin frame %d\n",frame_count++);
 
-		if(error_flag)
-		{
-			decode_mute();
-			continue;
+	while (decode_buffer_syncframe (&syncinfo, &data_start, data_end)) {
+		parse_bsi (&bsi);
+
+#ifndef __OMS__
+		if(!done_banner) {
+		  //			stats_print_banner(&syncinfo,&bsi);
+			done_banner = 1;
 		}
+#endif
+
+		// compute downmix parameters
+		// downmix to tow channels for now
+		dm_par.clev = 0.0; dm_par.slev = 0.0; dm_par.unit = 1.0;
+		if (bsi.acmod & 0x1)	// have center
+			dm_par.clev = cmixlev_lut[bsi.cmixlev];
+
+		if (bsi.acmod & 0x4)	// have surround channels
+			dm_par.slev = smixlev_lut[bsi.surmixlev];
 
-		parse_bsi(&bsi);
+		dm_par.unit /= 1.0 + dm_par.clev + dm_par.slev;
+		dm_par.clev *= dm_par.unit;
+		dm_par.slev *= dm_par.unit;
 
-		for(i=0; i < 6; i++)
-		{
+		for(i=0; i < 6; i++) {
 			//Initialize freq/time sample storage
-			memset(samples,0,sizeof(float) * 256 * (bsi.nfchans + bsi.lfeon));
+			memset (samples, 0, sizeof(float) * 256 * (bsi.nfchans + bsi.lfeon));
 
 			// Extract most of the audblk info from the bitstream
 			// (minus the mantissas 
-			parse_audblk(&bsi,&audblk);
+			parse_audblk (&bsi,&audblk);
 
 			// Take the differential exponent data and turn it into
 			// absolute exponents 
-			exponent_unpack(&bsi,&audblk); 
-			if(error_flag)
-				goto error;
+			exponent_unpack (&bsi,&audblk); 
 
 			// Figure out how many bits per mantissa 
-			bit_allocate(syncinfo.fscod,&bsi,&audblk);
+			bit_allocate (syncinfo.fscod,&bsi,&audblk);
 
 			// Extract the mantissas from the stream and
 			// generate floating point frequency coefficients
-			coeff_unpack(&bsi,&audblk,samples);
-			if(error_flag)
-				goto error;
+			coeff_unpack (&bsi,&audblk,samples);
 
-			if(bsi.acmod == 0x2)
-				rematrix(&audblk,samples);
+			if (bsi.acmod == 0x2)
+				rematrix (&audblk,samples);
 
 			// Convert the frequency samples into time samples 
-			imdct(&bsi,&audblk,samples);
+			imdct (&bsi,&audblk,samples, &s16_samples[i * 2 * 256], &dm_par);
 
 			// Downmix into the requested number of channels
-			// and convert floating point to sint_16
-			downmix(&bsi,samples,&s16_samples[i * 2 * 256]);
+			// and convert floating point to int16_t
+			// downmix(&bsi,samples,&s16_samples[i * 2 * 256]);
 
-			sanity_check(&syncinfo,&bsi,&audblk);
-			if(error_flag)
-				goto error;
+			if (sanity_check(&syncinfo,&bsi,&audblk) < 0) {
+				HANDLE_ERROR ();
+				return 0;
+			}
 
 			continue;
 		}
-		
-		parse_auxdata(&syncinfo);
 
-		/*
-		if(!is_output_initialized)
-		{
-			ac3_output.open(16,syncinfo.sampling_rate,2);
+		if (!is_output_initialized) {
+#ifdef __OMS__
+			plugin_output_audio_attr_t attr;
+#ifdef __sun__
+			attr.format = 16;
+#else
+			attr.format = AFMT_S16_NE;
+#endif
+			attr.speed = syncinfo.sampling_rate;
+			attr.channels = 2;
+
+			//	output->setup (&attr);
+#else
+			//			ao_functions->open (16, syncinfo.sampling_rate, 2);
+#endif
 			is_output_initialized = 1;
 		}
-		*/
+
+#ifdef __OMS__
+		output->write (s16_samples, 256 * 6 * 2 * 2);
+#else
+		//		ao_functions->play(s16_samples, 256 * 6 * 2);
 		data = (char *)s16_samples;
 		datasize = 0;
 		while(datasize < 6144){
@@ -257,13 +294,11 @@ uint_32 ac3_decode_data(uint_8 *data_start,uint_8 *data_end, int ac3reset, int *
 		  }
 		}
 
+#endif
 
-		//write(1, s16_samples, 256 * 6 * 2* 2);
-		//ac3_output.play(s16_samples, 256 * 6 * 2);
-error:
-		;
-		//find a new frame
 	}
 
+	decode_mute ();
+
 	return 0;	
 }
diff --git a/ac3dec/dither.c b/ac3dec/dither.c
index 31e74f62..079c04d6 100644
--- a/ac3dec/dither.c
+++ b/ac3dec/dither.c
@@ -31,7 +31,7 @@
 #include "dither.h"
 
 
-const uint_16 dither_lut[256] = 
+const uint16_t dither_lut[256] = 
 {
  0x0000, 0xa011, 0xe033, 0x4022, 0x6077, 0xc066, 0x8044, 0x2055,
  0xc0ee, 0x60ff, 0x20dd, 0x80cc, 0xa099, 0x0088, 0x40aa, 0xe0bb,
@@ -67,7 +67,7 @@ const uint_16 dither_lut[256] =
  0x8bf4, 0x2be5, 0x6bc7, 0xcbd6, 0xeb83, 0x4b92, 0x0bb0, 0xaba1
 };
 
-uint_16 lfsr_state = 1;
+uint16_t lfsr_state = 1;
 
 //
 // see dither_gen (inline-able) in dither.h
@@ -89,18 +89,17 @@ uint_16 lfsr_state = 1;
  *
  */
 
-uint_16 dither_gen(void)
+uint16_t dither_gen(void)
 {
 	int i;
-	uint_32 state;
+	uint32_t state;
 
 	//explicitly bring the state into a local var as gcc > 3.0?
 	//doesn't know how to optimize out the stores
 	state = lfsr_state;
 
 	//Generate eight pseudo random bits
-	for(i=0;i<8;i++)
-	{
+	for(i=0;i<8;i++) {
 		state <<= 1;	
 
 		if(state & 0x10000)
@@ -109,7 +108,7 @@ uint_16 dither_gen(void)
 
 	lfsr_state = state;
 
-	return (((((sint_32)state<<8)>>8) * (sint_32) (0.707106 * 256.0))>>16);
+	return (((((int32_t)state<<8)>>8) * (int32_t) (0.707106 * 256.0))>>16);
 }
 
 #endif
diff --git a/ac3dec/dither.h b/ac3dec/dither.h
index 6d68e1b5..abb9f518 100644
--- a/ac3dec/dither.h
+++ b/ac3dec/dither.h
@@ -22,16 +22,16 @@
  */
 
 
-extern uint_16 lfsr_state;
-extern const uint_16 dither_lut[256]; 
+extern uint16_t lfsr_state;
+extern const uint16_t dither_lut[256]; 
 
-static inline uint_16 dither_gen(void)
+static inline uint16_t dither_gen(void)
 {
-	sint_16 state;
+	int16_t state;
 
 	state = dither_lut[lfsr_state >> 8] ^ (lfsr_state << 8);
 	
-	lfsr_state = (uint_16) state;
+	lfsr_state = (uint16_t) state;
 
-	return ((state * (sint_32) (0.707106 * 256.0))>>8);
+	return ((state * (int32_t) (0.707106 * 256.0))>>8);
 }
diff --git a/ac3dec/downmix.c b/ac3dec/downmix.c
index 94bc51a8..8eac3e21 100644
--- a/ac3dec/downmix.c
+++ b/ac3dec/downmix.c
@@ -1,10 +1,7 @@
 /*
- *
- *  downmix.c
+ *  imdct.c
  *    
- *	Copyright (C) Aaron Holtzman - Sept 1999
- *
- *	Originally based on code by Yuqing Deng.
+ *	Copyright (C) Aaron Holtzman - May 1999
  *
  *  This file is part of ac3dec, a free Dolby AC-3 stream decoder.
  *	
@@ -28,401 +25,65 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
+#include <mm_accel.h>
+
 #include "ac3.h"
 #include "ac3_internal.h"
 
-
-#include "decode.h"
-#include "downmix.h"
 #include "debug.h"
+#include "downmix.h"
+#include "downmix_c.h"
+#include "downmix_i386.h"
+#ifdef HAVE_KNI
+#include "downmix_kni.h"
+#endif
 
+void (*downmix_3f_2r_to_2ch)(float *samples, dm_par_t * dm_par);
+void (*downmix_3f_1r_to_2ch)(float *samples, dm_par_t * dm_par);
+void (*downmix_2f_2r_to_2ch)(float *samples, dm_par_t * dm_par);
+void (*downmix_2f_1r_to_2ch)(float *samples, dm_par_t * dm_par);
+void (*downmix_3f_0r_to_2ch)(float *samples, dm_par_t * dm_par);
+void (*stream_sample_2ch_to_s16)(int16_t *s16_samples, float *left, float *right);
+void (*stream_sample_1ch_to_s16)(int16_t *s16_samples, float *center);
 
-//Pre-scaled downmix coefficients
-static float cmixlev_lut[4] = { 0.2928, 0.2468, 0.2071, 0.2468 };
-static float smixlev_lut[4] = { 0.2928, 0.2071, 0.0   , 0.2071 };
 
-static void 
-downmix_3f_2r_to_2ch(bsi_t* bsi, stream_samples_t samples,sint_16 *s16_samples)
+void downmix_init()
 {
-	uint_32 j;
-	float right_tmp;
-	float left_tmp;
-	float clev,slev;
-	float *centre = 0, *left = 0, *right = 0, *left_sur = 0, *right_sur = 0;
-
-	left      = samples[0];
-	centre    = samples[1];
-	right     = samples[2];
-	left_sur  = samples[3];
-	right_sur = samples[4];
-
-	clev = cmixlev_lut[bsi->cmixlev];
-	slev = smixlev_lut[bsi->surmixlev];
-
-#if defined DOLBY_SURROUND
-	for (j = 0; j < 256; j++) 
+#ifdef __i386__
+#ifdef HAVE_KNI
+	uint32_t accel = mm_accel ();
+
+// other dowmixing should go here too
+	if (accel & MM_ACCEL_X86_MMXEXT) {
+		dprintf("Using SSE for downmix\n");
+		downmix_3f_2r_to_2ch = downmix_3f_2r_to_2ch_kni;
+		downmix_2f_2r_to_2ch = downmix_2f_2r_to_2ch_kni;
+		downmix_3f_1r_to_2ch = downmix_3f_1r_to_2ch_kni;
+		downmix_2f_1r_to_2ch = downmix_2f_1r_to_2ch_kni;
+		downmix_3f_0r_to_2ch = downmix_3f_0r_to_2ch_kni;
+		stream_sample_2ch_to_s16 = stream_sample_2ch_to_s16_kni;
+		stream_sample_1ch_to_s16 = stream_sample_1ch_to_s16_kni;
+	} else if (accel & MM_ACCEL_X86_3DNOW) {
+	} else
+#endif
+#endif
 	{
-	        right_tmp = *left_sur++ + *right_sur++;
-		left_tmp  = 1.4142f * *left++  + *centre - right_tmp;
-		right_tmp += 1.4142f * *right++ + *centre++;
-	    
-	        s16_samples[j * 2 ]    = (sint_16) (left_tmp  * 16000.0f);
-	        s16_samples[j * 2 + 1] = (sint_16) (right_tmp * 16000.0f);
-	}
+		downmix_3f_2r_to_2ch = downmix_3f_2r_to_2ch_c;
+		downmix_2f_2r_to_2ch = downmix_2f_2r_to_2ch_c;
+		downmix_3f_1r_to_2ch = downmix_3f_1r_to_2ch_c;
+		downmix_2f_1r_to_2ch = downmix_2f_1r_to_2ch_c;
+		downmix_3f_0r_to_2ch = downmix_3f_0r_to_2ch_c;
+#ifdef __i386__
+#if 1
+		stream_sample_2ch_to_s16 = stream_sample_2ch_to_s16_c;
+		stream_sample_1ch_to_s16 = stream_sample_1ch_to_s16_c;
 #else
-	for (j = 0; j < 256; j++) 
-	{
-		left_tmp = 0.4142f * *left++  + clev * *centre   + slev * *left_sur++;
-		right_tmp= 0.4142f * *right++ + clev * *centre++ + slev * *right_sur++;
-
-		s16_samples[j * 2 ]    = (sint_16) (left_tmp  * 32767.0f);
-		s16_samples[j * 2 + 1] = (sint_16) (right_tmp * 32767.0f);
-	}
+		stream_sample_2ch_to_s16 = stream_sample_2ch_to_s16_i386;
+		stream_sample_1ch_to_s16 = stream_sample_1ch_to_s16_i386;
+#endif
+#else
+		stream_sample_2ch_to_s16 = stream_sample_2ch_to_s16_c;
+		stream_sample_1ch_to_s16 = stream_sample_1ch_to_s16_c;
 #endif
-}
-
-static void
-downmix_2f_2r_to_2ch(bsi_t* bsi, stream_samples_t samples,sint_16 *s16_samples)
-{
-	uint_32 j;
-	float right_tmp;
-	float left_tmp;
-	float slev;
-	float *left = 0, *right = 0, *left_sur = 0, *right_sur = 0;
-
-	left      = samples[0];
-	right     = samples[1];
-	left_sur  = samples[2];
-	right_sur = samples[3];
-
-	slev = smixlev_lut[bsi->surmixlev];
-
-	for (j = 0; j < 256; j++) 
-	{
-		left_tmp = 0.4142f * *left++  + slev * *left_sur++;
-		right_tmp= 0.4142f * *right++ + slev * *right_sur++;
-
-		s16_samples[j * 2 ]    = (sint_16) (left_tmp  * 32767.0f);
-		s16_samples[j * 2 + 1] = (sint_16) (right_tmp * 32767.0f);
-	}
-}
-
-static void
-downmix_3f_1r_to_2ch(bsi_t* bsi, stream_samples_t samples,sint_16 *s16_samples)
-{
-	uint_32 j;
-	float right_tmp;
-	float left_tmp;
-	float clev,slev;
-	float *centre = 0, *left = 0, *right = 0, *sur = 0;
-
-	left      = samples[0];
-	centre    = samples[1];
-	right     = samples[2];
-	//Mono surround
-	sur = samples[3];
-
-	clev = cmixlev_lut[bsi->cmixlev];
-	slev = smixlev_lut[bsi->surmixlev];
-
-	for (j = 0; j < 256; j++) 
-	{
-		left_tmp = 0.4142f * *left++  + clev * *centre++ + slev * *sur;
-		right_tmp= 0.4142f * *right++ + clev * *centre   + slev * *sur++;
-
-		s16_samples[j * 2 ]    = (sint_16) (left_tmp  * 32767.0f);
-		s16_samples[j * 2 + 1] = (sint_16) (right_tmp * 32767.0f);
-	}
-}
-
-
-static void
-downmix_2f_1r_to_2ch(bsi_t* bsi, stream_samples_t samples,sint_16 *s16_samples)
-{
-	uint_32 j;
-	float right_tmp;
-	float left_tmp;
-	float slev;
-	float *left = 0, *right = 0, *sur = 0;
-
-	left      = samples[0];
-	right     = samples[1];
-	//Mono surround
-	sur = samples[2];
-
-	slev = smixlev_lut[bsi->surmixlev];
-
-	for (j = 0; j < 256; j++) 
-	{
-		left_tmp = 0.4142f * *left++  + slev * *sur;
-		right_tmp= 0.4142f * *right++ + slev * *sur++;
-
-		s16_samples[j * 2 ]    = (sint_16) (left_tmp  * 32767.0f);
-		s16_samples[j * 2 + 1] = (sint_16) (right_tmp * 32767.0f);
-	}
-}
-
-static void
-downmix_3f_0r_to_2ch(bsi_t* bsi, stream_samples_t samples,sint_16 *s16_samples)
-{
-	uint_32 j;
-	float right_tmp;
-	float left_tmp;
-	float clev;
-	float *centre = 0, *left = 0, *right = 0;
-
-	left      = samples[0];
-	centre    = samples[1];
-	right     = samples[2];
-
-	clev = cmixlev_lut[bsi->cmixlev];
-
-	for (j = 0; j < 256; j++) 
-	{
-		left_tmp = 0.4142f * *left++  + clev * *centre; 
-		right_tmp= 0.4142f * *right++ + clev * *centre++;   
-
-		s16_samples[j * 2 ]    = (sint_16) (left_tmp  * 32767.0f);
-		s16_samples[j * 2 + 1] = (sint_16) (right_tmp * 32767.0f);
-	}
-}
-				
-static void
-downmix_2f_0r_to_2ch(bsi_t* bsi, stream_samples_t samples,sint_16 *s16_samples)
-{
-	uint_32 j;
-	float *left = 0, *right = 0;
-
-	left      = samples[0];
-	right     = samples[1];
-
-	for (j = 0; j < 256; j++) 
-	{
-		s16_samples[j * 2 ]    = (sint_16) (*left++  * 32767.0f);
-		s16_samples[j * 2 + 1] = (sint_16) (*right++ * 32767.0f);
-	}
-}
-
-static void
-downmix_1f_0r_to_2ch(float *centre,sint_16 *s16_samples)
-{
-	uint_32 j;
-	float tmp;
-
-	//Mono program!
-
-	for (j = 0; j < 256; j++) 
-	{
-		tmp =  32767.0f * 0.7071f * *centre++;
-
-		s16_samples[j * 2 ] = s16_samples[j * 2 + 1] = (sint_16) tmp;
-	}
-}
-
-//
-// Downmix into 2 or 4 channels  (4 ch isn't in quite yet)
-//
-// The downmix function names have the following format
-//
-// downmix_Xf_Yr_to_[2|4]ch[_dolby]
-//
-// where X        = number of front channels
-//       Y        = number of rear channels
-//       [2|4]    = number of output channels
-//       [_dolby] = with or without dolby surround mix
-//
-
-void downmix(bsi_t* bsi, stream_samples_t samples,sint_16 *s16_samples)
-{
-	if(bsi->acmod > 7)
-		dprintf("(downmix) invalid acmod number\n"); 
-
-	//
-	//There are two main cases, with or without Dolby Surround
-	//
-	if(ac3_config.flags & AC3_DOLBY_SURR_ENABLE)
-	{
-		fprintf(stderr,"Dolby Surround Mixes not currently enabled\n");
-		exit(1);
-	}
-
-	//Non-Dolby surround downmixes
-	switch(bsi->acmod)
-	{
-		// 3/2
-		case 7:
-			downmix_3f_2r_to_2ch(bsi,samples,s16_samples);
-		break;
-
-		// 2/2
-		case 6:
-			downmix_2f_2r_to_2ch(bsi,samples,s16_samples);
-		break;
-
-		// 3/1
-		case 5:
-			downmix_3f_1r_to_2ch(bsi,samples,s16_samples);
-		break;
-
-		// 2/1
-		case 4:
-			downmix_2f_1r_to_2ch(bsi,samples,s16_samples);
-		break;
-
-		// 3/0
-		case 3:
-			downmix_3f_0r_to_2ch(bsi,samples,s16_samples);
-		break;
-
-		case 2:
-			downmix_2f_0r_to_2ch(bsi,samples,s16_samples);
-		break;
-
-		// 1/0
-		case 1:
-			downmix_1f_0r_to_2ch(samples[0],s16_samples);
-		break;
-
-		// 1+1
-		case 0:
-			downmix_1f_0r_to_2ch(samples[ac3_config.dual_mono_ch_sel],s16_samples);
-		break;
 	}
 }
-
-
-
-#if 0 
-
-	//the dolby mixes lay here for the time being
-	switch(bsi->acmod)
-	{
-		// 3/2
-		case 7:
-			left      = samples[0];
-			centre    = samples[1];
-			right     = samples[2];
-			left_sur  = samples[3];
-			right_sur = samples[4];
-
-			for (j = 0; j < 256; j++) 
-			{
-				right_tmp = 0.2265f * *left_sur++ + 0.2265f * *right_sur++;
-				left_tmp  = -1 * right_tmp;
-				right_tmp += 0.3204f * *right++ + 0.2265f * *centre;
-				left_tmp  += 0.3204f * *left++  + 0.2265f * *centre++;
-
-				samples[1][j] = right_tmp;
-				samples[0][j] = left_tmp;
-			}
-
-		break;
-
-		// 2/2
-		case 6:
-			left      = samples[0];
-			right     = samples[1];
-			left_sur  = samples[2];
-			right_sur = samples[3];
-
-			for (j = 0; j < 256; j++) 
-			{
-				right_tmp = 0.2265f * *left_sur++ + 0.2265f * *right_sur++;
-				left_tmp  = -1 * right_tmp;
-				right_tmp += 0.3204f * *right++;
-				left_tmp  += 0.3204f * *left++ ;
-
-				samples[1][j] = right_tmp;
-				samples[0][j] = left_tmp;
-			}
-		break;
-
-		// 3/1
-		case 5:
-			left      = samples[0];
-			centre    = samples[1];
-			right     = samples[2];
-			//Mono surround
-			right_sur = samples[3];
-
-			for (j = 0; j < 256; j++) 
-			{
-				right_tmp =  0.2265f * *right_sur++;
-				left_tmp  = -1 * right_tmp;
-				right_tmp += 0.3204f * *right++ + 0.2265f * *centre;
-				left_tmp  += 0.3204f * *left++  + 0.2265f * *centre++;
-
-				samples[1][j] = right_tmp;
-				samples[0][j] = left_tmp;
-			}
-		break;
-
-		// 2/1
-		case 4:
-			left      = samples[0];
-			right     = samples[1];
-			//Mono surround
-			right_sur = samples[2];
-
-			for (j = 0; j < 256; j++) 
-			{
-				right_tmp =  0.2265f * *right_sur++;
-				left_tmp  = -1 * right_tmp;
-				right_tmp += 0.3204f * *right++; 
-				left_tmp  += 0.3204f * *left++;
-
-				samples[1][j] = right_tmp;
-				samples[0][j] = left_tmp;
-			}
-		break;
-
-		// 3/0
-		case 3:
-			left      = samples[0];
-			centre    = samples[1];
-			right     = samples[2];
-
-			for (j = 0; j < 256; j++) 
-			{
-				right_tmp = 0.3204f * *right++ + 0.2265f * *centre;
-				left_tmp  = 0.3204f * *left++  + 0.2265f * *centre++;
-
-				samples[1][j] = right_tmp;
-				samples[0][j] = left_tmp;
-			}
-		break;
-
-		// 2/0
-		case 2:
-		//Do nothing!
-		break;
-
-		// 1/0
-		case 1:
-			//Mono program!
-			right = samples[0];
-
-			for (j = 0; j < 256; j++) 
-			{
-				right_tmp = 0.7071f * *right++;
-
-				samples[1][j] = right_tmp;
-				samples[0][j] = right_tmp;
-			}
-			
-		break;
-
-		// 1+1
-		case 0:
-			//Dual mono, output selected by user
-			right = samples[ac3_config.dual_mono_ch_sel];
-
-			for (j = 0; j < 256; j++) 
-			{
-				right_tmp = 0.7071f * *right++;
-
-				samples[1][j] = right_tmp;
-				samples[0][j] = right_tmp;
-			}
-		break;
-#endif
diff --git a/ac3dec/downmix.h b/ac3dec/downmix.h
index 7537c622..dcfe1e12 100644
--- a/ac3dec/downmix.h
+++ b/ac3dec/downmix.h
@@ -25,4 +25,10 @@
  *
  */
 
-void downmix(bsi_t* bsi, stream_samples_t stream_samples,sint_16 *s16_samples);
+typedef struct dm_par_s {
+	float unit;  
+	float clev;  
+	float slev;          
+} dm_par_t;                                                                     
+
+void downmix_init();
diff --git a/ac3dec/downmix_c.c b/ac3dec/downmix_c.c
new file mode 100644
index 00000000..875d785e
--- /dev/null
+++ b/ac3dec/downmix_c.c
@@ -0,0 +1,161 @@
+/*
+ *  downmix_c.c
+ *    
+ *	Copyright (C) Aaron Holtzman - May 1999
+ *
+ *  This file is part of ac3dec, a free Dolby AC-3 stream decoder.
+ *	
+ *  ac3dec is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *   
+ *  ac3dec is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *   
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ *
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "ac3.h"
+#include "ac3_internal.h"
+
+#include "debug.h"
+#include "downmix.h"
+#include "downmix_c.h"
+
+
+void downmix_3f_2r_to_2ch_c (float *samples, dm_par_t *dm_par)
+{
+	int i;
+	float *left, *right, *center, *left_sur, *right_sur;
+	float left_tmp, right_tmp;
+
+	left = samples;
+	right = samples + 256 * 2;
+	center = samples + 256;
+	left_sur = samples + 256 * 3;
+	right_sur = samples + 256 * 4;
+
+	for (i=0; i < 256; i++) {
+#if defined DOLBY_SURROUND
+	  	left_tmp = dm_par->unit * *left  + dm_par->clev * *center - dm_par->slev * (*left_sur + *right_sur);
+	  	right_tmp= dm_par->unit * *right++ + dm_par->clev * *center + dm_par->slev * (*left_sur++ + *right_sur++);
+#else
+		left_tmp = dm_par->unit * *left  + dm_par->clev * *center + dm_par->slev * *left_sur++;
+		right_tmp= dm_par->unit * *right++ + dm_par->clev * *center + dm_par->slev * *right_sur++;
+#endif
+		*left++ = left_tmp;
+		*center++ = right_tmp;
+
+	}
+}
+
+
+void downmix_2f_2r_to_2ch_c (float *samples, dm_par_t *dm_par)
+{
+	int i;
+	float *left, *right, *left_sur, *right_sur;
+	float left_tmp, right_tmp;
+               
+	left = &samples[0];
+	right = &samples[256];
+	left_sur = &samples[512];
+	right_sur = &samples[768];
+
+	for (i = 0; i < 256; i++) {
+		left_tmp = dm_par->unit * *left  + dm_par->slev * *left_sur++;
+		right_tmp= dm_par->unit * *right + dm_par->slev * *right_sur++;
+		*left++ = left_tmp;
+		*right++ = right_tmp;
+	}
+}
+
+
+void downmix_3f_1r_to_2ch_c (float *samples, dm_par_t *dm_par)
+{
+	int i;
+	float *left, *right, *center, *right_sur;
+	float left_tmp, right_tmp;
+
+	left = &samples[0];
+	right = &samples[512];
+	center = &samples[256];
+	right_sur = &samples[768];
+
+	for (i = 0; i < 256; i++) {
+		left_tmp = dm_par->unit * *left  + dm_par->clev * *center  - dm_par->slev * *right_sur;
+		right_tmp= dm_par->unit * *right++ + dm_par->clev * *center + dm_par->slev * *right_sur++;
+		*left++ = left_tmp;
+		*center++ = right_tmp;
+	}
+}
+
+
+void downmix_2f_1r_to_2ch_c (float *samples, dm_par_t *dm_par)
+{
+	int i;
+	float *left, *right, *right_sur;
+	float left_tmp, right_tmp;
+
+	left = &samples[0];
+	right = &samples[256];
+	right_sur = &samples[512];
+
+	for (i = 0; i < 256; i++) {
+		left_tmp = dm_par->unit * *left  - dm_par->slev * *right_sur;
+		right_tmp= dm_par->unit * *right + dm_par->slev * *right_sur++;
+		*left++ = left_tmp;
+		*right++ = right_tmp;
+	}
+}
+
+
+void downmix_3f_0r_to_2ch_c (float *samples, dm_par_t *dm_par)
+{
+	int i;
+	float *left, *right, *center;
+	float left_tmp, right_tmp;
+
+	left = &samples[0];
+	center = &samples[256];
+	right = &samples[512];
+
+	for (i = 0; i < 256; i++) {
+		left_tmp = dm_par->unit * *left  + dm_par->clev * *center;
+		right_tmp= dm_par->unit * *right++ + dm_par->clev * *center;
+		*left++ = left_tmp;
+		*center++ = right_tmp;
+	}
+}
+
+
+void stream_sample_2ch_to_s16_c (int16_t *s16_samples, float *left, float *right)
+{
+	int i;
+
+	for (i=0; i < 256; i++) {
+		*s16_samples++ = (int16_t) *left++;
+		*s16_samples++ = (int16_t) *right++;
+	}
+}
+
+
+void stream_sample_1ch_to_s16_c (int16_t *s16_samples, float *center)
+{
+	int i;
+	float tmp;
+
+	for (i=0; i < 256; i++) {
+		*s16_samples++ = tmp = (int16_t) (0.7071f * *center++);
+		*s16_samples++ = tmp;
+	}
+}
diff --git a/ac3dec/downmix_c.h b/ac3dec/downmix_c.h
new file mode 100644
index 00000000..3643f65e
--- /dev/null
+++ b/ac3dec/downmix_c.h
@@ -0,0 +1,32 @@
+/*****
+*
+* This file is part of the OMS program.
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by 
+* the Free Software Foundation; either version 2, or (at your option)
+* any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; see the file COPYING.  If not, write to
+* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+*****/
+
+#ifndef __DOWNMIX_C_H__
+#define __DOWNMIX_C_H__
+
+void downmix_3f_2r_to_2ch_c(float *samples, dm_par_t * dm_par);
+void downmix_3f_1r_to_2ch_c(float *samples, dm_par_t * dm_par);
+void downmix_2f_2r_to_2ch_c(float *samples, dm_par_t * dm_par);
+void downmix_2f_1r_to_2ch_c(float *samples, dm_par_t * dm_par);
+void downmix_3f_0r_to_2ch_c(float *samples, dm_par_t * dm_par);            
+void stream_sample_2ch_to_s16_c(int16_t *s16_samples, float *left, float *right);
+void stream_sample_1ch_to_s16_c(int16_t *s16_samples, float *center);      
+
+#endif
diff --git a/ac3dec/downmix_i386.S b/ac3dec/downmix_i386.S
new file mode 100644
index 00000000..20f2e8c3
--- /dev/null
+++ b/ac3dec/downmix_i386.S
@@ -0,0 +1,92 @@
+/* This is basicly gcc generated.
+ * Only the floating point rounding mode loads and saves
+ * are removed in the stream_sample_to_s16 functions.
+ */                                                                            
+
+#ifdef __i386__
+
+	.file	"downmix.c"
+	.version	"01.01"
+gcc2_compiled.:
+.text
+	.align 4
+.globl stream_sample_2ch_to_s16_i386
+	.type	 stream_sample_2ch_to_s16_i386,@function
+stream_sample_2ch_to_s16_i386:
+	pushl %ebp
+	movl %esp,%ebp
+	subl $28,%esp
+	pushl %edi
+	pushl %esi
+	pushl %ebx
+	movl 8(%ebp),%edx
+	movl 12(%ebp),%ebx
+	movl 16(%ebp),%ecx
+	movl $255,%esi
+	.p2align 4,,7
+.L373:
+	flds (%ebx)
+	fistpl -8(%ebp)
+	movl -8(%ebp),%eax
+	movw %ax,(%edx)
+	addl $2,%edx
+	addl $4,%ebx
+	flds (%ecx)
+	fistpl -8(%ebp)
+	movl -8(%ebp),%eax
+	movw %ax,(%edx)
+	addl $4,%ecx
+	addl $2,%edx
+	decl %esi
+	jns .L373
+	popl %ebx
+	popl %esi
+	popl %edi
+	leave
+	ret
+.Lfe6:
+	.size	 stream_sample_2ch_to_s16_i386,.Lfe6-stream_sample_2ch_to_s16_i386
+.section	.rodata
+	.align 4
+.LC46:
+	.long 0x3f350481
+.text
+	.align 4
+.globl stream_sample_1ch_to_s16_i386
+	.type	 stream_sample_1ch_to_s16_i386,@function
+stream_sample_1ch_to_s16_i386:
+	pushl %ebp
+	movl %esp,%ebp
+	subl $16,%esp
+	pushl %esi
+	pushl %ebx
+	movl 8(%ebp),%edx
+	movl 12(%ebp),%ecx
+	flds .LC46
+	movl $255,%ebx
+	.p2align 4,,7
+.L379:
+	flds (%ecx)
+	fmul %st(1),%st
+	fistpl -8(%ebp)
+	movl -8(%ebp),%eax
+	movw %ax,-2(%ebp)
+	addl $4,%ecx
+	flds -2(%ebp)
+	fistpl -8(%ebp)
+	movl -8(%ebp),%eax
+	movw %ax,(%edx)
+	addl $2,%edx
+	movw %ax,(%edx)
+	addl $2,%edx
+	decl %ebx
+	jns .L379
+	fstp %st(0)
+	popl %ebx
+	popl %esi
+	leave
+	ret
+.Lfe7:
+	.size	 stream_sample_1ch_to_s16_i386,.Lfe7-stream_sample_1ch_to_s16_i386
+	.ident	"GCC: (GNU) 2.95.3 19991030 (prerelease)"
+#endif
diff --git a/ac3dec/downmix_i386.h b/ac3dec/downmix_i386.h
new file mode 100644
index 00000000..e34d7d6c
--- /dev/null
+++ b/ac3dec/downmix_i386.h
@@ -0,0 +1,27 @@
+/*****
+*
+* This file is part of the OMS program.
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by 
+* the Free Software Foundation; either version 2, or (at your option)
+* any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; see the file COPYING.  If not, write to
+* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+*****/
+
+#ifndef __DOWNMIX_I386_H__
+#define __DOWNMIX_I386_H__
+
+void stream_sample_2ch_to_s16_i386(int16_t *s16_samples, float *left, float *right);
+void stream_sample_1ch_to_s16_i386(int16_t *s16_samples, float *center);      
+
+#endif
diff --git a/ac3dec/downmix_kni.S b/ac3dec/downmix_kni.S
new file mode 100644
index 00000000..7df8c060
--- /dev/null
+++ b/ac3dec/downmix_kni.S
@@ -0,0 +1,396 @@
+/* 
+ *  downmix_kni.S
+ *
+ *  Copyright (C) Yuqing Deng <Yuqing_Deng@brown.edu> - October 2000
+ *
+ *
+ *  downmix_kni.S is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  downmix_kni.S is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifdef __i386__
+
+.section .rodata
+	.align 4
+sqrt2:	.float 0f0.7071068
+	.p2align 5,0,
+	
+	.section .text
+	
+	.align 4
+	.global downmix_3f_2r_to_2ch_kni
+	.type downmix_3f_2r_to_2ch_kni, @function
+
+downmix_3f_2r_to_2ch_kni:
+	pushl	%ebp
+	movl  	%esp, %ebp
+	
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+
+	movl 	8(%ebp), %eax /* samples[] */
+	movl	12(%ebp), %ebx /* &dm_par */
+	movl	$64, %ecx		/* loop counter */
+
+	movss	(%ebx), %xmm5	/* unit */
+	shufps	$0, %xmm5, %xmm5	/* unit | unit | unit | unit */
+
+	movss	4(%ebx), %xmm6		/* clev */
+	shufps	$0, %xmm6, %xmm6	/* clev | clev | clev | clev */
+
+	movss	8(%ebx), %xmm7		/* slev */
+	shufps	$0, %xmm7, %xmm7	/* slev | slev | slev | slev */
+
+.loop:	
+	movaps	(%eax), %xmm0 /*  left */
+	movaps	2048(%eax), %xmm1 /* right */
+	movaps 	1024(%eax), %xmm2	/* center */
+	mulps	%xmm5, %xmm0
+	mulps	%xmm5, %xmm1
+	
+	mulps	%xmm6, %xmm2
+	movaps	3072(%eax), %xmm3	/* leftsur */
+	movaps	4096(%eax), %xmm4	/* rithgsur */
+	addps	%xmm2, %xmm0
+	addps 	%xmm2, %xmm1
+
+	mulps	%xmm7, %xmm3
+	mulps	%xmm7, %xmm4
+	addps	%xmm3, %xmm0
+	addps	%xmm4, %xmm1
+
+	movaps	%xmm0, (%eax)
+	movaps	%xmm1, 1024(%eax)
+
+	addl	$16, %eax
+	decl 	%ecx
+	jnz	.loop
+
+	popl	%ecx
+	popl	%ebx
+	popl 	%eax
+
+	leave
+	ret
+	.p2align 4,,7
+
+	.global downmix_2f_2r_to_2ch_kni
+	.type downmix_2f_2r_to_2ch_kni, @function
+
+downmix_2f_2r_to_2ch_kni:
+	pushl %ebp
+	movl  %esp, %ebp
+
+	pushl %eax
+	pushl %ebx
+	pushl %ecx
+
+	movl 8(%ebp), %eax /* samples[] */
+	movl 12(%ebp), %ebx /* &dm_par */
+	movl $64, %ecx	/* loop counter */
+
+	movss (%ebx), %xmm5	/* unit */
+	shufps $0, %xmm5, %xmm5 /* unit | unit | unit | unit */
+
+	movss	8(%ebx), %xmm7		/* slev */
+	shufps	$0, %xmm7, %xmm7	/* slev | slev | slev | slev */
+
+.loop3:	
+	movaps	(%eax), %xmm0 /*  left */
+	movaps	1024(%eax), %xmm1 /* right */
+	movaps 	2048(%eax), %xmm3	/* leftsur */
+	mulps	%xmm5, %xmm0
+	mulps	%xmm5, %xmm1
+	
+	movaps	3072(%eax), %xmm4	/* rightsur */
+
+	mulps	%xmm7, %xmm3
+	mulps	%xmm7, %xmm4
+	addps	%xmm3, %xmm0
+	addps	%xmm4, %xmm1
+
+	movaps	%xmm0, (%eax)
+	movaps	%xmm1, 1024(%eax)
+
+	addl	$16, %eax
+	decl 	%ecx
+	jnz	.loop3
+
+	popl	%ecx
+	popl	%ebx
+	popl 	%eax
+
+	leave
+	ret
+	.p2align 4,,7
+	
+	.global downmix_3f_1r_to_2ch_kni
+	.type downmix_3f_1r_to_2ch_kni, @function
+
+downmix_3f_1r_to_2ch_kni:
+	pushl	%ebp
+	movl  	%esp, %ebp
+	
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+
+	movl 	8(%ebp), %eax /* samples[] */
+	movl	12(%ebp), %ebx /* &dm_par */
+	movl	$64, %ecx		/* loop counter */
+
+	movss	(%ebx), %xmm5	/* unit */
+	shufps	$0, %xmm5, %xmm5	/* unit | unit | unit | unit */
+
+	movss	4(%ebx), %xmm6		/* clev */
+	shufps	$0, %xmm6, %xmm6	/* clev | clev | clev | clev */
+
+	movss	8(%ebx), %xmm7		/* slev */
+	shufps	$0, %xmm7, %xmm7	/* slev | slev | slev | slev */
+
+.loop4:	
+	movaps	(%eax), %xmm0 /*  left */
+	movaps	2048(%eax), %xmm1 /* right */
+	movaps 	1024(%eax), %xmm2	/* center */
+	mulps	%xmm5, %xmm0
+	mulps	%xmm5, %xmm1
+	
+	mulps	%xmm6, %xmm2
+	movaps	3072(%eax), %xmm3	/* sur */
+
+	addps	%xmm2, %xmm0
+	mulps	%xmm7, %xmm3
+	
+	addps 	%xmm2, %xmm1
+
+	subps	%xmm3, %xmm0
+	addps	%xmm3, %xmm1
+
+	movaps	%xmm0, (%eax)
+	movaps	%xmm1, 1024(%eax)
+
+	addl	$16, %eax
+	decl 	%ecx
+	jnz	.loop4
+
+	popl	%ecx
+	popl	%ebx
+	popl 	%eax
+
+	leave
+	ret
+	.p2align 4,,7
+		
+	.global downmix_2f_1r_to_2ch_kni
+	.type downmix_2f_1r_to_2ch_kni, @function
+
+downmix_2f_1r_to_2ch_kni:
+	pushl	%ebp
+	movl  	%esp, %ebp
+	
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+
+	movl 	8(%ebp), %eax /* samples[] */
+	movl	12(%ebp), %ebx /* &dm_par */
+	movl	$64, %ecx		/* loop counter */
+
+	movss	(%ebx), %xmm5	/* unit */
+	shufps	$0, %xmm5, %xmm5	/* unit | unit | unit | unit */
+
+	movss	8(%ebx), %xmm7		/* slev */
+	shufps	$0, %xmm7, %xmm7	/* slev | slev | slev | slev */
+
+.loop5:	
+	movaps	(%eax), %xmm0 /*  left */
+	movaps	1024(%eax), %xmm1 /* right */
+	
+	mulps	%xmm5, %xmm0
+	mulps	%xmm5, %xmm1
+	
+	movaps	2048(%eax), %xmm3	/* sur */
+
+	mulps	%xmm7, %xmm3
+	
+	subps	%xmm3, %xmm0
+	addps	%xmm3, %xmm1
+
+	movaps	%xmm0, (%eax)
+	movaps	%xmm1, 1024(%eax)
+
+	addl	$16, %eax
+	decl 	%ecx
+	jnz	.loop5
+
+	popl	%ecx
+	popl	%ebx
+	popl 	%eax
+
+	leave
+	ret
+	.p2align 4,,7
+	
+	.global downmix_3f_0r_to_2ch_kni
+	.type downmix_3f_0r_to_2ch_kni, @function
+
+downmix_3f_0r_to_2ch_kni:
+	pushl	%ebp
+	movl  	%esp, %ebp
+	
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+
+	movl 	8(%ebp), %eax /* samples[] */
+	movl	12(%ebp), %ebx /* &dm_par */
+	movl	$64, %ecx		/* loop counter */
+
+	movss	(%ebx), %xmm5	/* unit */
+	shufps	$0, %xmm5, %xmm5	/* unit | unit | unit | unit */
+
+	movss	4(%ebx), %xmm6		/* clev */
+	shufps	$0, %xmm6, %xmm6	/* clev | clev | clev | clev */
+
+
+.loop6:	
+	movaps	(%eax), %xmm0 /*  left */
+	movaps	2048(%eax), %xmm1 /* right */
+	movaps 	1024(%eax), %xmm2	/* center */
+	mulps	%xmm5, %xmm0
+	mulps	%xmm5, %xmm1
+	
+	mulps	%xmm6, %xmm2
+
+	addps	%xmm2, %xmm0
+	
+	addps 	%xmm2, %xmm1
+
+	movaps	%xmm0, (%eax)
+	movaps	%xmm1, 1024(%eax)
+
+	addl	$16, %eax
+	decl 	%ecx
+	jnz	.loop6
+
+	popl	%ecx
+	popl	%ebx
+	popl 	%eax
+
+	leave
+	ret
+	.p2align 4,,7
+	
+	.global stream_sample_2ch_to_s16_kni
+	.type stream_sample_2ch_to_s16_kni, @function
+
+stream_sample_2ch_to_s16_kni:
+	pushl %ebp
+	movl  %esp, %ebp
+
+	pushl %eax
+	pushl %ebx
+	pushl %edx
+	pushl %ecx
+
+	movl 8(%ebp), %eax	/* s16_samples */
+	movl 12(%ebp), %ebx	/* left */
+	movl 16(%ebp), %edx	/* right */
+	movl $64, %ecx
+
+.loop1:
+	movaps (%ebx), %xmm0	/* l3 | l2 | l1 | l0 */
+	movaps (%edx), %xmm1	/* r3 | r2 | r1 | r0 */
+	movhlps %xmm0, %xmm2	/* l3 | l2 */
+	movhlps %xmm1, %xmm3	/* r3 | r2 */
+	unpcklps %xmm1, %xmm0	/* r1 | l1 | r0 | l0 */
+	unpcklps %xmm3, %xmm2	/* r3 | l3 | r2 | l2 */
+
+	cvtps2pi %xmm0, %mm0	/* r0 l0 --> mm0, int_32 */
+	movhlps %xmm0, %xmm0	
+	cvtps2pi %xmm0, %mm1	/* r1 l1 --> mm1, int_32 */
+
+	cvtps2pi %xmm2, %mm2	/* r2 l2 --> mm2, int_32 */
+	movhlps %xmm2, %xmm2
+	cvtps2pi %xmm2, %mm3	/* r3 l3 --> mm3, int_32 */
+	packssdw %mm1, %mm0	/* r1 l1 r0 l0 --> mm0, int_16 */
+	packssdw %mm3, %mm2	/* r3 l3 r2 l2 --> mm2, int_16 */
+
+	movq %mm0, (%eax)
+	movq %mm2, 8(%eax)
+	addl $16, %eax
+	addl $16, %ebx
+	addl $16, %edx
+
+	decl %ecx
+	jnz .loop1
+
+	popl %ecx
+	popl %edx
+	popl %ebx
+	popl %eax
+
+	emms
+
+	leave
+	ret
+	.p2align 4,,7
+	
+	.global stream_sample_1ch_to_s16_kni
+	.type stream_sample_1ch_to_s16_kni, @function
+
+stream_sample_1ch_to_s16_kni:
+	pushl %ebp
+	movl  %esp, %ebp
+
+	pushl %eax
+	pushl %ebx
+	pushl %ecx
+
+	movl $sqrt2, %eax
+	movss (%eax), %xmm7
+	movl 8(%ebp), %eax	/* s16_samples */
+	movl 12(%ebp), %ebx	/* left */
+	shufps $0, %xmm7, %xmm7
+	movl $64, %ecx
+
+.loop2:
+	movaps (%ebx), %xmm0	/* c3 | c2 | c1 | c0 */
+	mulps %xmm7, %xmm0
+	movhlps %xmm0, %xmm2	/* c3 | c2 */
+
+	cvtps2pi %xmm0, %mm0	/* c1 c0 --> mm0, int_32 */
+	cvtps2pi %xmm2, %mm1	/* c3 c2 --> mm1, int_32 */
+
+	packssdw %mm0, %mm0	/* c1 c1 c0 c0 --> mm0, int_16 */
+	packssdw %mm1, %mm1	/* c3 c3 c2 c2 --> mm1, int_16 */
+
+	movq %mm0, (%eax)
+	movq %mm1, 8(%eax)
+	addl $16, %eax
+	addl $16, %ebx
+
+	decl %ecx
+	jnz .loop2
+
+	popl %ecx
+	popl %ebx
+	popl %eax
+	
+	emms
+	leave
+	ret 
+#endif
diff --git a/ac3dec/downmix_kni.h b/ac3dec/downmix_kni.h
new file mode 100644
index 00000000..323f2a75
--- /dev/null
+++ b/ac3dec/downmix_kni.h
@@ -0,0 +1,32 @@
+/*****
+*
+* This file is part of the OMS program.
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by 
+* the Free Software Foundation; either version 2, or (at your option)
+* any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; see the file COPYING.  If not, write to
+* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+*****/
+
+#ifndef __DOWNMIX_KNI_H__
+#define __DOWNMIX_KNI_H__
+
+void downmix_3f_2r_to_2ch_kni(float *samples, dm_par_t * dm_par);
+void downmix_3f_1r_to_2ch_kni(float *samples, dm_par_t * dm_par);
+void downmix_2f_2r_to_2ch_kni(float *samples, dm_par_t * dm_par);
+void downmix_2f_1r_to_2ch_kni(float *samples, dm_par_t * dm_par);
+void downmix_3f_0r_to_2ch_kni(float *samples, dm_par_t * dm_par);
+void stream_sample_2ch_to_s16_kni(int16_t *s16_samples, float *left, float *right);
+void stream_sample_1ch_to_s16_kni(int16_t *s16_samples, float *center);
+
+#endif
diff --git a/ac3dec/exponent.c b/ac3dec/exponent.c
index 98111a5b..16fca465 100644
--- a/ac3dec/exponent.c
+++ b/ac3dec/exponent.c
@@ -28,41 +28,43 @@
 #include "ac3_internal.h"
 
 
-#include "decode.h"
 #include "exponent.h"
 
 
-static void exp_unpack_ch(uint_16 type,uint_16 expstr,uint_16 ngrps,uint_16 initial_exp, 
-		uint_16 exps[], uint_16 *dest);
+static inline void exp_unpack_ch(uint16_t type,uint16_t expstr,uint16_t ngrps,uint16_t initial_exp, uint16_t exps[], uint16_t *dest);
 
-void
-exponent_unpack( bsi_t *bsi, audblk_t *audblk)
+
+/**
+ *
+ **/
+
+void exponent_unpack( bsi_t *bsi, audblk_t *audblk)
 {
-	uint_16 i;
+	uint16_t i;
 
 	for(i=0; i< bsi->nfchans; i++)
-		exp_unpack_ch(UNPACK_FBW, audblk->chexpstr[i], audblk->nchgrps[i], audblk->exps[i][0], 
-				&audblk->exps[i][1], audblk->fbw_exp[i]);
+		exp_unpack_ch(UNPACK_FBW, audblk->chexpstr[i], audblk->nchgrps[i], audblk->exps[i][0], &audblk->exps[i][1], audblk->fbw_exp[i]);
 
 	if(audblk->cplinu)
-		exp_unpack_ch(UNPACK_CPL, audblk->cplexpstr, audblk->ncplgrps, audblk->cplabsexp << 1,	
-				audblk->cplexps, &audblk->cpl_exp[audblk->cplstrtmant]);
+		exp_unpack_ch(UNPACK_CPL, audblk->cplexpstr, audblk->ncplgrps, audblk->cplabsexp << 1, audblk->cplexps, &audblk->cpl_exp[audblk->cplstrtmant]);
 
 	if(bsi->lfeon)
-		exp_unpack_ch(UNPACK_LFE, audblk->lfeexpstr, 2, audblk->lfeexps[0], 
-				&audblk->lfeexps[1], audblk->lfe_exp);
+		exp_unpack_ch(UNPACK_LFE, audblk->lfeexpstr, 2, audblk->lfeexps[0], &audblk->lfeexps[1], audblk->lfe_exp);
 }
 
 
-static void
-exp_unpack_ch(uint_16 type,uint_16 expstr,uint_16 ngrps,uint_16 initial_exp, 
-		uint_16 exps[], uint_16 *dest)
+/**
+ *
+ **/
+
+static inline void exp_unpack_ch(uint16_t type,uint16_t expstr,uint16_t ngrps,uint16_t initial_exp, 
+		uint16_t exps[], uint16_t *dest)
 {
-	uint_16 i,j;
-	sint_16 exp_acc;
-	sint_16 exp_1,exp_2,exp_3;
+	uint16_t i,j;
+	int16_t exp_acc;
+	int16_t exp_1,exp_2,exp_3;
 
-	if(expstr == EXP_REUSE)
+	if (expstr == EXP_REUSE)
 		return;
 
 	/* Handle the initial absolute exponent */
@@ -75,8 +77,7 @@ exp_unpack_ch(uint_16 type,uint_16 expstr,uint_16 ngrps,uint_16 initial_exp,
 		dest[j++] = exp_acc;
 
 	/* Loop through the groups and fill the dest array appropriately */
-	for(i=0; i< ngrps; i++)
-	{
+	for(i=0; i< ngrps; i++) {
 		if(exps[i] > 124)
 			goto error;
 
@@ -86,8 +87,7 @@ exp_unpack_ch(uint_16 type,uint_16 expstr,uint_16 ngrps,uint_16 initial_exp,
 
 		exp_acc += (exp_1 - 2);
 
-		switch(expstr)
-		{
+		switch(expstr) {
 			case EXP_D45:
 				dest[j++] = exp_acc;
 				dest[j++] = exp_acc;
@@ -99,8 +99,7 @@ exp_unpack_ch(uint_16 type,uint_16 expstr,uint_16 ngrps,uint_16 initial_exp,
 
 		exp_acc += (exp_2 - 2);
 
-		switch(expstr)
-		{
+		switch(expstr) {
 			case EXP_D45:
 				dest[j++] = exp_acc;
 				dest[j++] = exp_acc;
@@ -112,8 +111,7 @@ exp_unpack_ch(uint_16 type,uint_16 expstr,uint_16 ngrps,uint_16 initial_exp,
 
 		exp_acc += (exp_3 - 2);
 
-		switch(expstr)
-		{
+		switch(expstr) {
 			case EXP_D45:
 				dest[j++] = exp_acc;
 				dest[j++] = exp_acc;
@@ -125,11 +123,10 @@ exp_unpack_ch(uint_16 type,uint_16 expstr,uint_16 ngrps,uint_16 initial_exp,
 	}	
 
 	return;
-
-			goto error;
 error:
-	if(!error_flag)
-		fprintf(stderr,"** Invalid exponent - skipping frame **\n");
-	error_flag = 1;
+#ifdef DEBUG
+	fprintf (stderr,"** Invalid exponent - skipping frame **\n");
+#endif
+	HANDLE_ERROR ();
 }
 
diff --git a/ac3dec/imdct.c b/ac3dec/imdct.c
index ae2794ed..c09ef413 100644
--- a/ac3dec/imdct.c
+++ b/ac3dec/imdct.c
@@ -28,75 +28,48 @@
 #include "ac3.h"
 #include "ac3_internal.h"
 
-
-#include "decode.h"
+#include "downmix.h"
 #include "imdct.h"
+#include "imdct_c.h"
+#ifdef HAVE_KNI
+#include "imdct_kni.h"
+#endif
+#include "srfft.h"
 
-void imdct_do_256(float data[],float delay[]);
-void imdct_do_512(float data[],float delay[]);
 
-typedef struct complex_s
-{
-	float real;
-	float imag;
-} complex_t;
+extern void (*downmix_3f_2r_to_2ch)(float *samples, dm_par_t * dm_par);
+extern void (*downmix_3f_1r_to_2ch)(float *samples, dm_par_t * dm_par);
+extern void (*downmix_2f_2r_to_2ch)(float *samples, dm_par_t * dm_par);
+extern void (*downmix_2f_1r_to_2ch)(float *samples, dm_par_t * dm_par);
+extern void (*downmix_3f_0r_to_2ch)(float *samples, dm_par_t * dm_par);
+
 
+extern void (*stream_sample_2ch_to_s16)(int16_t *s16_samples, float *left, float *right);
+extern void (*stream_sample_1ch_to_s16)(int16_t *s16_samples, float *center);
+
+void (*fft_64p) (complex_t *);
+void (*imdct_do_512) (float data[],float delay[]);
+void (*imdct_do_512_nol) (float data[], float delay[]);
+
+void imdct_do_256 (float data[],float delay[]);
 
 #define N 512
 
+/* static complex_t buf[128]; */
+//static complex_t buf[128] __attribute__((aligned(16)));
+complex_t buf[128] __attribute__((aligned(16)));
 
-/* 128 point bit-reverse LUT */
-static uint_8 bit_reverse_512[128] = {
-	0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70, 
-	0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78, 
-	0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74, 
-	0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c, 
-	0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72, 
-	0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a, 
-	0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76, 
-	0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e, 
-	0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71, 
-	0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79, 
-	0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75, 
-	0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d, 
-	0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73, 
-	0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b, 
-	0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77, 
-	0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};
-
-static uint_8 bit_reverse_256[64] = {
-	0x00, 0x20, 0x10, 0x30, 0x08, 0x28, 0x18, 0x38, 
-	0x04, 0x24, 0x14, 0x34, 0x0c, 0x2c, 0x1c, 0x3c, 
-	0x02, 0x22, 0x12, 0x32, 0x0a, 0x2a, 0x1a, 0x3a, 
-	0x06, 0x26, 0x16, 0x36, 0x0e, 0x2e, 0x1e, 0x3e, 
-	0x01, 0x21, 0x11, 0x31, 0x09, 0x29, 0x19, 0x39, 
-	0x05, 0x25, 0x15, 0x35, 0x0d, 0x2d, 0x1d, 0x3d, 
-	0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b, 
-	0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f};
-
-static complex_t buf[128];
-
-/* Twiddle factor LUT */
-static complex_t *w[7];
-static complex_t w_1[1];
-static complex_t w_2[2];
-static complex_t w_4[4];
-static complex_t w_8[8];
-static complex_t w_16[16];
-static complex_t w_32[32];
-static complex_t w_64[64];
+/* Delay buffer for time domain interleaving */
+static float delay[6][256];
+static float delay1[6][256];
 
 /* Twiddle factors for IMDCT */
-static float xcos1[128];
-static float xsin1[128];
 static float xcos2[64];
 static float xsin2[64];
 
-/* Delay buffer for time domain interleaving */
-static float delay[6][256];
-
 /* Windowing function for Modified DCT - Thank you acroread */
-static float window[] = {
+//static float window[] = {
+float window[] = {
 	0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
 	0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
 	0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
@@ -128,146 +101,116 @@ static float window[] = {
 	0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
 	0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
 	0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
-	1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000 };
-
+	1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000
+};
 
-static inline void swap_cmplx(complex_t *a, complex_t *b)
+//static const int pm128[128] =
+const int pm128[128] =
 {
-	complex_t tmp;
-
-	tmp = *a;
-	*a = *b;
-	*b = tmp;
-}
-
-
-
-static inline complex_t cmplx_mult(complex_t a, complex_t b)
+	0, 16, 32, 48, 64, 80,  96, 112,  8, 40, 72, 104, 24, 56,  88, 120,
+	4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44,  60, 76, 92, 108, 124,
+	2, 18, 34, 50, 66, 82,  98, 114, 10, 42, 74, 106, 26, 58,  90, 122,
+	6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62,  94, 126,
+	1, 17, 33, 49, 65, 81,  97, 113,  9, 41, 73, 105, 25, 57,  89, 121,
+	5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45,  61, 77, 93, 109, 125,
+	3, 19, 35, 51, 67, 83,  99, 115, 11, 43, 75, 107, 27, 59,  91, 123,
+	7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47,  63, 79, 95, 111, 127
+}; 
+
+static const int pm64[64] =
 {
-	complex_t ret;
-
-	ret.real = a.real * b.real - a.imag * b.imag;
-	ret.imag = a.real * b.imag + a.imag * b.real;
-
-	return ret;
-}
-
-void imdct_init(void)
-{
-	int i,k;
-	complex_t angle_step;
-	complex_t current_angle;
-
-	/* Twiddle factors to turn IFFT into IMDCT */
-	for( i=0; i < 128; i++)
-	{
-		xcos1[i] = -cos(2.0f * M_PI * (8*i+1)/(8*N)) ; 
-		xsin1[i] = -sin(2.0f * M_PI * (8*i+1)/(8*N)) ;
-	}
-	
-	/* More twiddle factors to turn IFFT into IMDCT */
-	for( i=0; i < 64; i++)
-	{
-		xcos2[i] = -cos(2.0f * M_PI * (8*i+1)/(4*N)) ; 
-		xsin2[i] = -sin(2.0f * M_PI * (8*i+1)/(4*N)) ;
-	}
-
-	/* Canonical twiddle factors for FFT */
-	w[0] = w_1;
-	w[1] = w_2;
-	w[2] = w_4;
-	w[3] = w_8;
-	w[4] = w_16;
-	w[5] = w_32;
-	w[6] = w_64;
-
-	for( i = 0; i < 7; i++)
-	{
-		angle_step.real = cos(-2.0 * M_PI / (1 << (i+1)));
-		angle_step.imag = sin(-2.0 * M_PI / (1 << (i+1)));
-
-		current_angle.real = 1.0;
-		current_angle.imag = 0.0;
-
-		for (k = 0; k < 1 << i; k++)
-		{
-			w[i][k] = current_angle;
-			current_angle = cmplx_mult(current_angle,angle_step);
-		}
+	0,  8, 16, 24, 32, 40, 48, 56,
+	4, 20, 36, 52, 12, 28, 44, 60,
+	2, 10, 18, 26, 34, 42, 50, 58,
+	6, 14, 22, 30, 38, 46, 54, 62,
+	1,  9, 17, 25, 33, 41, 49, 57,
+	5, 21, 37, 53, 13, 29, 45, 61,
+	3, 11, 19, 27, 35, 43, 51, 59,
+	7, 23, 39, 55, 15, 31, 47, 63
+};
+
+
+void imdct_init (void)
+ {
+	int i;
+	float scale = 255.99609372;
+
+#ifdef __i386__
+#ifdef HAVE_KNI
+	if (!imdct_init_kni ());
+	else
+#endif
+#endif
+	if (!imdct_init_c ());
+
+	// More twiddle factors to turn IFFT into IMDCT */
+	for (i=0; i < 64; i++) {
+		xcos2[i] = cos(2.0f * M_PI * (8*i+1)/(4*N)) * scale;
+		xsin2[i] = sin(2.0f * M_PI * (8*i+1)/(4*N)) * scale;
 	}
 }
 
-void
-imdct_do_512(float data[],float delay[])
+
+void imdct_do_256 (float data[],float delay[])
 {
-	int i,k;
-	int p,q;
-	int m;
-	int two_m;
-	int two_m_plus_one;
+	int i, j, k;
+	int p, q;
 
 	float tmp_a_i;
 	float tmp_a_r;
-	float tmp_b_i;
-	float tmp_b_r;
 
 	float *data_ptr;
 	float *delay_ptr;
 	float *window_ptr;
-	
-	//
-	// 512 IMDCT with source and dest data in 'data'
-	//
-	
-	// Pre IFFT complex multiply plus IFFT cmplx conjugate and bit reverse
-	// permutation
-	for( i=0; i < 128; i++)
-	{
-		k = bit_reverse_512[i];
-
-		/* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ 
-		buf[k].real =         (data[256-2*i-1] * xcos1[i])  -  (data[2*i]       * xsin1[i]);
-	  buf[k].imag = -1.0 * ((data[2*i]       * xcos1[i])  +  (data[256-2*i-1] * xsin1[i]));
-	}
 
-	// FFT Merge 
-	for (m=0; m < 7; m++)
-	{
-		if(m)
-			two_m = (1 << m);
-		else
-			two_m = 1;
-
-		two_m_plus_one = (1 << (m+1));
-
-		for(k = 0; k < two_m; k++)
-		{
-			for(i = 0; i < 128; i += two_m_plus_one)
-			{
-				p = k + i;
-				q = p + two_m;
-				tmp_a_r = buf[p].real;
-				tmp_a_i = buf[p].imag;
-				tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
-				tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
-				buf[p].real = tmp_a_r + tmp_b_r;
-				buf[p].imag =  tmp_a_i + tmp_b_i;
-				buf[q].real = tmp_a_r - tmp_b_r;
-				buf[q].imag =  tmp_a_i - tmp_b_i;
+	complex_t *buf1, *buf2;
 
-			}
-		}
+	buf1 = &buf[0];
+	buf2 = &buf[64];
+
+// Pre IFFT complex multiply plus IFFT complex conjugate
+	for (k=0; k<64; k++) { 
+		/* X1[k] = X[2*k]	*/
+		/* X2[k] = X[2*k+1]	*/
+
+		j = pm64[k];
+		p = 2 * (128-2*j-1);
+		q = 2 * (2 * j);
+
+		/* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */
+		buf1[k].re =         data[p] * xcos2[j] - data[q] * xsin2[j];
+		buf1[k].im = -1.0f * (data[q] * xcos2[j] + data[p] * xsin2[j]);
+		/* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */
+		buf2[k].re =          data[p + 1] * xcos2[j] - data[q + 1] * xsin2[j];
+		buf2[k].im = -1.0f * ( data[q + 1] * xcos2[j] + data[p + 1] * xsin2[j]);
 	}
 
-	/* Post IFFT complex multiply  plus IFFT complex conjugate*/
-	for( i=0; i < 128; i++)
-	{
-		/* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
-		tmp_a_r = buf[i].real;
-		tmp_a_i = buf[i].imag;
-		//Note that I flipped the signs on the imaginary ops to do the complex conj
-		buf[i].real =(tmp_a_r * xcos1[i])  +  (tmp_a_i  * xsin1[i]);
-	  buf[i].imag =(tmp_a_r * xsin1[i])  -  (tmp_a_i  * xcos1[i]);
+	fft_64p(&buf1[0]);
+	fft_64p(&buf2[0]);
+
+#ifdef DEBUG
+       //DEBUG FFT
+#if 0
+	printf ("Post FFT, buf1\n");
+	for (i=0; i < 64; i++)
+		printf("%d %f %f\n", i, buf_1[i].re, buf_1[i].im);
+	printf ("Post FFT, buf2\n");
+	for (i=0; i < 64; i++)
+		printf("%d %f %f\n", i, buf_2[i].re, buf_2[i].im);
+#endif
+#endif
+
+
+	// Post IFFT complex multiply 
+	for( i=0; i < 64; i++) {
+		tmp_a_r =  buf1[i].re;
+		tmp_a_i = -buf1[i].im;
+		buf1[i].re =(tmp_a_r * xcos2[i])  -  (tmp_a_i  * xsin2[i]);
+		buf1[i].im =(tmp_a_r * xsin2[i])  +  (tmp_a_i  * xcos2[i]);
+		tmp_a_r =  buf2[i].re;
+		tmp_a_i = -buf2[i].im;
+		buf2[i].re =(tmp_a_r * xcos2[i])  -  (tmp_a_i  * xsin2[i]);
+		buf2[i].im =(tmp_a_r * xsin2[i])  +  (tmp_a_i  * xcos2[i]);
 	}
 	
 	data_ptr = data;
@@ -275,164 +218,122 @@ imdct_do_512(float data[],float delay[])
 	window_ptr = window;
 
 	/* Window and convert to real valued signal */
-	for(i=0; i< 64; i++) 
-	{ 
-		*data_ptr++   = 2.0f * (-buf[64+i].imag   * *window_ptr++ + *delay_ptr++); 
-		*data_ptr++   = 2.0f * ( buf[64-i-1].real * *window_ptr++ + *delay_ptr++); 
+	for(i=0; i< 64; i++) { 
+		*data_ptr++  = -buf1[i].im      * *window_ptr++ + *delay_ptr++;
+		*data_ptr++  = buf1[64-i-1].re * *window_ptr++ + *delay_ptr++;
 	}
 
-	for(i=0; i< 64; i++) 
-	{ 
-		*data_ptr++  = 2.0f * (-buf[i].real       * *window_ptr++ + *delay_ptr++); 
-		*data_ptr++  = 2.0f * ( buf[128-i-1].imag * *window_ptr++ + *delay_ptr++); 
+	for(i=0; i< 64; i++) {
+		*data_ptr++  = -buf1[i].re      * *window_ptr++ + *delay_ptr++;
+		*data_ptr++  = buf1[64-i-1].im * *window_ptr++ + *delay_ptr++;
 	}
 	
-	/* The trailing edge of the window goes into the delay line */
 	delay_ptr = delay;
 
-	for(i=0; i< 64; i++) 
-	{ 
-		*delay_ptr++  = -buf[64+i].real   * *--window_ptr; 
-		*delay_ptr++  =  buf[64-i-1].imag * *--window_ptr; 
+	for(i=0; i< 64; i++) {
+		*delay_ptr++ = -buf2[i].re      * *--window_ptr;
+		*delay_ptr++ =  buf2[64-i-1].im * *--window_ptr;
 	}
 
-	for(i=0; i<64; i++) 
-	{
-		*delay_ptr++  =  buf[i].imag       * *--window_ptr; 
-		*delay_ptr++  = -buf[128-i-1].real * *--window_ptr; 
+	for(i=0; i< 64; i++) {
+		*delay_ptr++ =  buf2[i].im      * *--window_ptr;
+		*delay_ptr++ = -buf2[64-i-1].re * *--window_ptr;
 	}
 }
 
-void
-imdct_do_256(float data[],float delay[])
+
+/**
+ *
+ **/
+
+void imdct_do_256_nol (float data[], float delay[])
 {
-	int i,k;
-	int p,q;
-	int m;
-	int two_m;
-	int two_m_plus_one;
+	int i, j, k;
+	int p, q;
 
 	float tmp_a_i;
 	float tmp_a_r;
-	float tmp_b_i;
-	float tmp_b_r;
 
 	float *data_ptr;
 	float *delay_ptr;
 	float *window_ptr;
 
-	complex_t *buf_1, *buf_2;
-
-	buf_1 = &buf[0];
-	buf_2 = &buf[64];
-
-	// Pre IFFT complex multiply plus IFFT cmplx conjugate and bit reverse
-	// permutation
-	for(i=0; i<64; i++) 
-	{ 
-		/* X1[i] = X[2*i]  */
-		/* X2[i] = X[2*i+1]     */
-
-		k = bit_reverse_256[i];
-
-		p = 2 * (128-2*i-1);
-		q = 2 * (2 * i);
-
-		/* Z1[i] = (X1[128-2*i-1] + j * X1[2*i]) * (xcos2[i] + j * xsin2[i]); */ 
-		buf_1[k].real =         data[p] * xcos2[i] - data[q] * xsin2[i];
-	  buf_1[k].imag = -1.0f * (data[q] * xcos2[i] + data[p] * xsin2[i]); 
-		/* Z2[i] = (X2[128-2*i-1] + j * X2[2*i]) * (xcos2[i] + j * xsin2[i]); */ 
-		buf_2[k].real =          data[p + 1] * xcos2[i] - data[q + 1] * xsin2[i];
-	  buf_2[k].imag = -1.0f * ( data[q + 1] * xcos2[i] + data[p + 1] * xsin2[i]); 
-	}
-
-	// FFT Merge 
-	for (m=0; m < 6; m++)
-	{
-		two_m = (1 << m);
-		two_m_plus_one = (1 << (m+1));
-
-		if(m)
-			two_m = (1 << m);
-		else
-			two_m = 1;
-
-		for(k = 0; k < two_m; k++)
-		{
-			for(i = 0; i < 64; i += two_m_plus_one)
-			{
-				p = k + i;
-				q = p + two_m;
-				//Do block 1
-				tmp_a_r = buf_1[p].real;
-				tmp_a_i = buf_1[p].imag;
-				tmp_b_r = buf_1[q].real * w[m][k].real - buf_1[q].imag * w[m][k].imag;
-				tmp_b_i = buf_1[q].imag * w[m][k].real + buf_1[q].real * w[m][k].imag;
-				buf_1[p].real = tmp_a_r + tmp_b_r;
-				buf_1[p].imag =  tmp_a_i + tmp_b_i;
-				buf_1[q].real = tmp_a_r - tmp_b_r;
-				buf_1[q].imag =  tmp_a_i - tmp_b_i;
-
-				//Do block 2
-				tmp_a_r = buf_2[p].real;
-				tmp_a_i = buf_2[p].imag;
-				tmp_b_r = buf_2[q].real * w[m][k].real - buf_2[q].imag * w[m][k].imag;
-				tmp_b_i = buf_2[q].imag * w[m][k].real + buf_2[q].real * w[m][k].imag;
-				buf_2[p].real = tmp_a_r + tmp_b_r;
-				buf_2[p].imag =  tmp_a_i + tmp_b_i;
-				buf_2[q].real = tmp_a_r - tmp_b_r;
-				buf_2[q].imag =  tmp_a_i - tmp_b_i;
-
-			}
-		}
-	}
-
-	// Post IFFT complex multiply 
-	for( i=0; i < 64; i++)
-	{
-		//Note that I flipped the signs on the imaginary ops to do the complex conj
-		
-		/* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */ 
-		tmp_a_r = buf_1[i].real;
-		tmp_a_i = buf_1[i].imag;
-		buf_1[i].real =(tmp_a_r * xcos2[i])  +  (tmp_a_i  * xsin2[i]);
-	  buf_1[i].imag =(tmp_a_r * xsin2[i])  -  (tmp_a_i  * xcos2[i]);
-		/* y2[n] = z2[n] * (xcos2[n] + j * xsin2[n]) ; */ 
-		tmp_a_r = buf_2[i].real;
-		tmp_a_i = buf_2[i].imag;
-		buf_2[i].real =(tmp_a_r * xcos2[i])  +  (tmp_a_i  * xsin2[i]);
-	  buf_2[i].imag =(tmp_a_r * xsin2[i])  -  (tmp_a_i  * xcos2[i]);
-	}
-	
-	data_ptr = data;
-	delay_ptr = delay;
-	window_ptr = window;
-
-	/* Window and convert to real valued signal */
-	for(i=0; i< 64; i++) 
-	{ 
-		*data_ptr++  = 2.0f * (-buf_1[i].imag      * *window_ptr++ + *delay_ptr++);
-		*data_ptr++  = 2.0f * ( buf_1[64-i-1].real * *window_ptr++ + *delay_ptr++);
-	}
-
-	for(i=0; i< 64; i++) 
-	{
-		*data_ptr++  = 2.0f * (-buf_1[i].real      * *window_ptr++ + *delay_ptr++);
-		*data_ptr++  = 2.0f * ( buf_1[64-i-1].imag * *window_ptr++ + *delay_ptr++);
-	}
-	
-	delay_ptr = delay;
-
-	for(i=0; i< 64; i++) 
-	{
-		*delay_ptr++ = -buf_2[i].real      * *--window_ptr;
-		*delay_ptr++ =  buf_2[64-i-1].imag * *--window_ptr;
-	}
-
-	for(i=0; i< 64; i++) 
-	{
-		*delay_ptr++ =  buf_2[i].imag      * *--window_ptr;
-		*delay_ptr++ = -buf_2[64-i-1].real * *--window_ptr;
+	complex_t *buf1, *buf2;
+
+	buf1 = &buf[0];
+	buf2 = &buf[64];
+
+       /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
+	for(k=0; k<64; k++) {
+               /* X1[k] = X[2*k]  */
+               /* X2[k] = X[2*k+1]     */
+               j = pm64[k];
+               p = 2 * (128-2*j-1);
+               q = 2 * (2 * j);
+
+               /* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */
+               buf1[k].re =         data[p] * xcos2[j] - data[q] * xsin2[j];
+               buf1[k].im = -1.0f * (data[q] * xcos2[j] + data[p] * xsin2[j]);
+               /* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */
+               buf2[k].re =          data[p + 1] * xcos2[j] - data[q + 1] * xsin2[j];
+               buf2[k].im = -1.0f * ( data[q + 1] * xcos2[j] + data[p + 1] * xsin2[j]);
+       }
+
+
+       fft_64p(&buf1[0]);
+       fft_64p(&buf2[0]);
+
+#ifdef DEBUG
+	//DEBUG FFT
+#if 0
+       printf("Post FFT, buf1\n");
+       for (i=0; i < 64; i++)
+               printf("%d %f %f\n", i, buf_1[i].re, buf_1[i].im);
+       printf("Post FFT, buf2\n");
+       for (i=0; i < 64; i++)
+               printf("%d %f %f\n", i, buf_2[i].re, buf_2[i].im);
+#endif
+#endif
+
+       /* Post IFFT complex multiply */
+       for( i=0; i < 64; i++) {
+               /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
+               tmp_a_r =  buf1[i].re;
+               tmp_a_i = -buf1[i].im;
+               buf1[i].re =(tmp_a_r * xcos2[i])  -  (tmp_a_i  * xsin2[i]);
+               buf1[i].im =(tmp_a_r * xsin2[i])  +  (tmp_a_i  * xcos2[i]);
+               /* y2[n] = z2[n] * (xcos2[n] + j * xsin2[n]) ; */
+               tmp_a_r =  buf2[i].re;
+               tmp_a_i = -buf2[i].im;
+               buf2[i].re =(tmp_a_r * xcos2[i])  -  (tmp_a_i  * xsin2[i]);
+               buf2[i].im =(tmp_a_r * xsin2[i])  +  (tmp_a_i  * xcos2[i]);
+       }
+      
+       data_ptr = data;
+       delay_ptr = delay;
+       window_ptr = window;
+
+       /* Window and convert to real valued signal, no overlap */
+       for(i=0; i< 64; i++) {
+               *data_ptr++  = -buf1[i].im      * *window_ptr++;
+               *data_ptr++  = buf1[64-i-1].re * *window_ptr++;
+       }
+
+       for(i=0; i< 64; i++) {
+               *data_ptr++  = -buf1[i].re      * *window_ptr++ + *delay_ptr++;
+               *data_ptr++  = buf1[64-i-1].im * *window_ptr++ + *delay_ptr++;
+       }
+
+       delay_ptr = delay;
+
+       for(i=0; i< 64; i++) {
+               *delay_ptr++ = -buf2[i].re      * *--window_ptr;
+               *delay_ptr++ =  buf2[64-i-1].im * *--window_ptr;
+       }
+
+       for(i=0; i< 64; i++) {
+               *delay_ptr++ =  buf2[i].im      * *--window_ptr;
+               *delay_ptr++ = -buf2[64-i-1].re * *--window_ptr;
 	}
 }
 
@@ -440,29 +341,190 @@ imdct_do_256(float data[],float delay[])
 ///#include <sys/time.h>
 //FIXME remove
 
-void 
-imdct(bsi_t *bsi,audblk_t *audblk, stream_samples_t samples) {
+void imdct (bsi_t *bsi,audblk_t *audblk, stream_samples_t samples, int16_t *s16_samples, dm_par_t* dm_par)
+{
 	int i;
+	int doable = 0;
+	float *center=NULL, *left, *right, *left_sur, *right_sur;
+	float *delay_left, *delay_right;
+	float *delay1_left, *delay1_right, *delay1_center, *delay1_sr, *delay1_sl;
+	float right_tmp, left_tmp;
+	void (*do_imdct)(float data[], float deley[]);
+
+	// test if dm in frequency is doable
+	if (!(doable = audblk->blksw[0]))
+		do_imdct = imdct_do_512;
+	else
+		do_imdct = imdct_do_256;
+
+	// downmix in the frequency domain if all the channels
+	// use the same imdct
+	for (i=0; i < bsi->nfchans; i++) {
+		if (doable != audblk->blksw[i]) {
+			do_imdct = NULL;
+			break;
+		}
+	}
 
-	//handy timing code
-	//struct timeval start,end;
+	if (do_imdct) {
+		//dowmix first and imdct
+		switch(bsi->acmod) {
+		case 7:		// 3/2
+			downmix_3f_2r_to_2ch (samples[0], dm_par);
+			break;
+		case 6:		// 2/2
+			downmix_2f_2r_to_2ch (samples[0], dm_par);
+			break;
+		case 5:		// 3/1
+			downmix_3f_1r_to_2ch (samples[0], dm_par);
+			break;
+		case 4:		// 2/1
+			downmix_2f_1r_to_2ch (samples[0], dm_par);
+			break;
+		case 3:		// 3/0
+			downmix_3f_0r_to_2ch (samples[0], dm_par);
+			break;
+		case 2:
+			break;
+		default:	// 1/0
+			if (bsi->acmod == 1)
+				center = samples[0];
+			else if (bsi->acmod == 0)
+				center = samples[ac3_config.dual_mono_ch_sel];
+			do_imdct(center, delay[0]); // no downmix
+
+			stream_sample_1ch_to_s16 (s16_samples, center);
+
+			return;
+			//goto done;
+			break;
+		}
 
-	//gettimeofday(&start,0);
-	
-	for(i=0; i<bsi->nfchans;i++)
-	{
-		if(audblk->blksw[i])
-			imdct_do_256(samples[i],delay[i]);
-		else
-			imdct_do_512(samples[i],delay[i]);
+		do_imdct (samples[0], delay[0]);
+		do_imdct (samples[1], delay[1]);
+		stream_sample_2ch_to_s16(s16_samples, samples[0], samples[1]);
+
+	} else {		//imdct and then dowmix
+		// delay and samples should be saved and mixed
+		//fprintf(stderr, "time domain downmix\n");
+		for (i=0; i<bsi->nfchans; i++) {
+			if (audblk->blksw[i])
+				imdct_do_256_nol (samples[i],delay1[i]);
+			else
+				imdct_do_512_nol (samples[i],delay1[i]);
+		}
+
+		// mix the sample, overlap
+		switch(bsi->acmod) {
+		case 7:		// 3/2
+			left = samples[0];
+			center = samples[1];
+			right = samples[2];
+			left_sur = samples[3];
+			right_sur = samples[4];
+			delay_left = delay[0];
+			delay_right = delay[1];
+			delay1_left = delay1[0];
+			delay1_center = delay1[1];
+			delay1_right = delay1[2];
+			delay1_sl = delay1[3];
+			delay1_sr = delay1[4];
+
+			for (i = 0; i < 256; i++) {
+				left_tmp = dm_par->unit * *left++  + dm_par->clev * *center  + dm_par->slev * *left_sur++;
+				right_tmp= dm_par->unit * *right++ + dm_par->clev * *center++ + dm_par->slev * *right_sur++;
+				*s16_samples++ = (int16_t)(left_tmp + *delay_left);
+				*s16_samples++ = (int16_t)(right_tmp + *delay_right);
+				*delay_left++ = dm_par->unit * *delay1_left++  + dm_par->clev * *delay1_center  + dm_par->slev * *delay1_sl++;
+				*delay_right++ = dm_par->unit * *delay1_right++ + dm_par->clev * *center++ + dm_par->slev * *delay1_sr++;
+			}
+			break;
+		case 6:		// 2/2
+			left = samples[0];
+			right = samples[1];
+			left_sur = samples[2];
+			right_sur = samples[3];
+			delay_left = delay[0];
+			delay_right = delay[1];
+			delay1_left = delay1[0];
+			delay1_right = delay1[1];
+			delay1_sl = delay1[2];
+			delay1_sr = delay1[3];
+
+			for (i = 0; i < 256; i++) {
+				left_tmp = dm_par->unit * *left++  + dm_par->slev * *left_sur++;
+				right_tmp= dm_par->unit * *right++ + dm_par->slev * *right_sur++;
+				*s16_samples++ = (int16_t)(left_tmp + *delay_left);
+				*s16_samples++ = (int16_t)(right_tmp + *delay_right);
+				*delay_left++ = dm_par->unit * *delay1_left++  + dm_par->slev * *delay1_sl++;
+				*delay_right++ = dm_par->unit * *delay1_right++ + dm_par->slev * *delay1_sr++;
+			}
+			break;
+		case 5:		// 3/1
+			left = samples[0];
+			center = samples[1];
+			right = samples[2];
+			right_sur = samples[3];
+			delay_left = delay[0];
+			delay_right = delay[1];
+			delay1_left = delay1[0];
+			delay1_center = delay1[1];
+			delay1_right = delay1[2];
+			delay1_sl = delay1[3];
+
+			for (i = 0; i < 256; i++) {
+				left_tmp = dm_par->unit * *left++  + dm_par->clev * *center  - dm_par->slev * *right_sur;
+				right_tmp= dm_par->unit * *right++ + dm_par->clev * *center++ + dm_par->slev * *right_sur++;
+				*s16_samples++ = (int16_t)(left_tmp + *delay_left);
+				*s16_samples++ = (int16_t)(right_tmp + *delay_right);
+				*delay_left++ = dm_par->unit * *delay1_left++  + dm_par->clev * *delay1_center  + dm_par->slev * *delay1_sl;
+				*delay_right++ = dm_par->unit * *delay1_right++ + dm_par->clev * *center++ + dm_par->slev * *delay1_sl++;
+			}
+			break;
+		case 4:		// 2/1
+			left = samples[0];
+			right = samples[1];
+			right_sur = samples[2];
+			delay_left = delay[0];
+			delay_right = delay[1];
+			delay1_left = delay1[0];
+			delay1_right = delay1[1];
+			delay1_sl = delay1[2];
+
+			for (i = 0; i < 256; i++) {
+				left_tmp = dm_par->unit * *left++ - dm_par->slev * *right_sur;
+				right_tmp= dm_par->unit * *right++ + dm_par->slev * *right_sur++;
+				*s16_samples++ = (int16_t)(left_tmp + *delay_left);
+				*s16_samples++ = (int16_t)(right_tmp + *delay_right);
+				*delay_left++ = dm_par->unit * *delay1_left++ + dm_par->slev * *delay1_sl;
+				*delay_right++ = dm_par->unit * *delay1_right++ + dm_par->slev * *delay1_sl++;
+			}
+			break;
+		case 3:		// 3/0
+			left = samples[0];
+			center = samples[1];
+			right = samples[2];
+			delay_left = delay[0];
+			delay_right = delay[1];
+			delay1_left = delay1[0];
+			delay1_center = delay1[1];
+			delay1_right = delay1[2];
+
+			for (i = 0; i < 256; i++) {
+				left_tmp = dm_par->unit * *left++  + dm_par->clev * *center;
+				right_tmp= dm_par->unit * *right++ + dm_par->clev * *center++;
+				*s16_samples++ = (int16_t)(left_tmp + *delay_left);
+				*s16_samples++ = (int16_t)(right_tmp + *delay_right);
+				*delay_left++ = dm_par->unit * *delay1_left++  + dm_par->clev * *delay1_center;
+				*delay_right++ = dm_par->unit * *delay1_right++ + dm_par->clev * *center++;
+			}
+			break;
+		case 2:		// copy to output
+			for (i = 0; i < 256; i++) {
+				*s16_samples++ = (int16_t)samples[0][i];
+				*s16_samples++ = (int16_t)samples[1][i];
+			}
+			break;
+		}
 	}
-	//gettimeofday(&end,0);
-	//printf("imdct %ld us\n",(end.tv_sec - start.tv_sec) * 1000000 +
-        //end.tv_usec - start.tv_usec);
-
-	//XXX We don't bother with the IMDCT for the LFE as it's currently
-	//unused.
-	//if (bsi->lfeon)
-	//	imdct_do_512(coeffs->lfe,samples->channel[5],delay[5]);
-	//	
 }
diff --git a/ac3dec/imdct.h b/ac3dec/imdct.h
index 750aa879..8f10af00 100644
--- a/ac3dec/imdct.h
+++ b/ac3dec/imdct.h
@@ -22,5 +22,5 @@
  *
  */
 
-void imdct(bsi_t *bsi,audblk_t *audblk, stream_samples_t samples);
+void imdct (bsi_t *bsi,audblk_t *audblk, stream_samples_t samples, int16_t *s16_samples, dm_par_t *dm_par);
 void imdct_init(void);
diff --git a/ac3dec/imdct512_kni.S b/ac3dec/imdct512_kni.S
new file mode 100644
index 00000000..10b8de6f
--- /dev/null
+++ b/ac3dec/imdct512_kni.S
@@ -0,0 +1,548 @@
+/* 
+ *  imdct512_kni.S
+ *
+ *  Copyright (C) Yuqing Deng <Yuqing_Deng@brown.edu> - October 2000
+ *
+ *
+ *  imdct512_kni.S is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  imdct512_kni.S is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifdef __i386__
+
+.text
+	.align 4
+.global imdct512_pre_ifft_twiddle_kni
+	.type imdct512_pre_ifft_twiddle_kni, @function
+imdct512_pre_ifft_twiddle_kni:
+	
+	pushl %ebp
+	movl  %esp, %ebp
+	addl  $-4, %esp /* local variable, loop counter */
+	
+	pushl %eax
+	pushl %ebx
+	pushl %ecx
+	pushl %edx
+	pushl %edi
+	pushl %esi
+
+	movl 8(%ebp), %eax  	/* pmt */
+	movl 12(%ebp), %ebx	/* buf */
+	movl 16(%ebp), %ecx	/* data */
+	movl 20(%ebp), %edx 	/* xcos_sin_sse */
+	movl $64, -4(%ebp)
+	
+	
+.loop:
+	movl (%eax), %esi
+	movl 4(%eax), %edi
+	movss (%ecx, %esi, 8), %xmm1 /* 2j */
+	movss (%ecx, %edi, 8), %xmm3 /* 2(j+1) */
+
+	shll $1, %esi
+	shll $1, %edi
+
+	movaps (%edx, %esi, 8), %xmm0; /* -c_j | -s_j | -s_j | c_j */
+	movaps (%edx, %edi, 8), %xmm2; /* -c_j+1 | -s_j+1 | -s_j+1 | c_j+1 */
+
+	negl %esi
+	negl %edi
+
+	movss 1020(%ecx, %esi, 4), %xmm4 /* 255-2j */
+	addl $8, %eax
+	movss 1020(%ecx, %edi, 4), %xmm5 /* 255-2(j+1) */
+
+	shufps $0, %xmm1, %xmm4 /* 2j | 2j | 255-2j | 255-2j */
+	shufps $0, %xmm3, %xmm5 /* 2(j+1) | 2(j+1) | 255-2(j+1) | 255-2(j+1) */
+	mulps %xmm4, %xmm0
+	mulps %xmm5, %xmm2
+	movhlps %xmm0, %xmm1
+	movhlps %xmm2, %xmm3
+	addl $16, %ebx
+	addps %xmm1, %xmm0
+	addps %xmm3, %xmm2
+	movlhps %xmm2, %xmm0
+	movaps %xmm0, -16(%ebx)
+	decl -4(%ebp)
+     	jnz .loop
+
+	popl %esi
+	popl %edi
+	popl %edx
+	popl %ecx
+	popl %ebx
+	popl %eax
+
+	addl $4, %esp
+	popl %ebp
+
+	ret
+	.p2align 4,0
+
+.global imdct512_post_ifft_twiddle_kni
+	.type imdct512_post_ifft_twiddle_kni, @function
+imdct512_post_ifft_twiddle_kni:
+	
+	pushl %ebp
+	movl  %esp, %ebp
+	
+	pushl %eax
+	pushl %ebx
+	pushl %ecx
+
+	movl 8(%ebp), %eax /* buf[] */
+	movl 12(%ebp), %ebx /* xcos_sin_sse[] */
+	movl $32, %ecx /* loop counter */
+
+.loop1:
+	movaps	(%eax), %xmm0  /*  im1 | re1 | im0 | re0 */
+
+	movaps  (%ebx), %xmm2  /* -c | -s | -s | c */
+	movhlps  %xmm0, %xmm1  /* im1 | re1 */
+	movaps  16(%ebx), %xmm3  /* -c1 | -s1 | -s1 | c1 */
+
+	shufps $0x50, %xmm0, %xmm0 /* im0 | im0 | re0 | re0 */
+	shufps $0x50, %xmm1, %xmm1 /* im1 | im1 | re1 | re1 */
+
+	movaps  16(%eax), %xmm4 /* im3 | re3 | im2 | re2 */
+
+	shufps $0x27, %xmm2, %xmm2 /* c | -s | -s | -c */
+	movhlps  %xmm4, %xmm5 /* im3 | re3 */
+	shufps $0x27, %xmm3, %xmm3 /* c1 | -s1 | -s1 | -c1 */
+
+	movaps  32(%ebx), %xmm6   /* -c2 | -s2 | -s2 | c2 */
+	movaps  48(%ebx), %xmm7   /* -c3 | -s3 | -s3 | c3 */
+
+	shufps $0x50, %xmm4, %xmm4 /* im2 | im2 | re2 | re2 */
+	shufps $0x50, %xmm5, %xmm5 /* im3 | im3 | re3 | re3 */
+
+	mulps %xmm2, %xmm0
+	mulps %xmm3, %xmm1
+
+	shufps $0x27, %xmm6, %xmm6 /* c2 | -s2 | -s2 | -c2 */
+	shufps $0x27, %xmm7, %xmm7 /* c3 | -s3 | -s3 | -c3 */
+
+	movhlps %xmm0, %xmm2
+	movhlps %xmm1, %xmm3
+
+	mulps %xmm6, %xmm4
+	mulps %xmm7, %xmm5
+
+	addps %xmm2, %xmm0
+	addps %xmm3, %xmm1
+
+	movhlps %xmm4, %xmm6
+	movhlps %xmm5, %xmm7
+
+	addps %xmm6, %xmm4
+	addps %xmm7, %xmm5
+
+	movlhps %xmm1, %xmm0
+	movlhps %xmm5, %xmm4
+
+	movaps %xmm0, (%eax)
+	movaps %xmm4, 16(%eax)
+	addl $64, %ebx
+	addl $32, %eax
+	decl %ecx
+	jnz .loop1
+
+	popl %ecx
+	popl %ebx
+	popl %eax
+
+	leave
+	ret
+	.p2align 4,0
+
+.global imdct512_window_delay_kni
+	.type imdct512_window_delay_kni, @function
+imdct512_window_delay_kni:
+	
+	pushl %ebp
+	movl  %esp, %ebp
+	
+	pushl %eax
+	pushl %ebx
+	pushl %ecx
+	pushl %edx
+	pushl %esi
+	pushl %edi
+
+	movl 20(%ebp), %ebx /* delay */
+	movl 16(%ebp), %edx /* window */
+
+	movl 8(%ebp), %eax  /* buf */
+	movl $16, %ecx /* loop count */
+	leal 516(%eax), %esi /* buf[64].im */
+	leal 504(%eax), %edi /* buf[63].re */ 
+	movl 12(%ebp), %eax /* data */
+.first_128_samples:
+
+	movss (%esi), %xmm0
+	movss 8(%esi), %xmm2
+	movss (%edi), %xmm1
+	movss -8(%edi), %xmm3
+
+	movlhps %xmm2, %xmm0 /* 0.0 | im1 | 0.0 | im0 */
+	movlhps %xmm3, %xmm1 /* 0.0 | re1 | 0.0 | re0 */
+
+	movaps (%edx), %xmm4 /* w3 | w2 | w1 | w0 */
+	movaps (%ebx), %xmm5 /* d3 | d2 | d1 | d0 */
+	shufps $0xb1, %xmm1, %xmm1 /* re1 | 0.0 | re0 | 0.0 */
+
+	movss 16(%esi), %xmm6 /* im2 */
+	movss 24(%esi), %xmm7 /* im3 */
+	subps %xmm1, %xmm0  /* -re1 | im1 | -re0 | im0 */
+	movss -16(%edi), %xmm2 /* re2 */
+	movss -24(%edi), %xmm3 /* re3 */
+	mulps %xmm4, %xmm0
+	movlhps %xmm7, %xmm6 /* 0.0 | im3 | 0.0 | im2 */
+	movlhps %xmm3, %xmm2 /* 0.0 | re3 | 0.0 | re2 */
+	addps %xmm5, %xmm0
+	shufps $0xb1, %xmm2, %xmm2 /* re3 | 0.0 | re2 | 0.0 */
+	movaps 16(%edx), %xmm4 /* w7 | w6 | w5 | w4 */
+	movaps 16(%ebx), %xmm5 /* d7 | d6 | d5 | d4 */
+	subps %xmm2, %xmm6 /* -re3 | im3 | -re2 | im2 */
+	addl $32, %edx
+	movaps %xmm0, (%eax)
+	addl $32, %ebx
+	mulps %xmm4, %xmm6
+	addl $32, %esi
+	addl $32, %eax
+	addps %xmm5, %xmm6
+	addl $-32, %edi
+	movaps %xmm6, -16(%eax)
+	decl %ecx
+	jnz .first_128_samples
+
+	movl 8(%ebp), %esi  /* buf[0].re */
+	leal 1020(%esi), %edi /* buf[127].im */ 
+	movl $16, %ecx /* loop count */
+.second_128_samples:
+
+	movss (%esi), %xmm0 /* buf[i].re */
+	movss 8(%esi), %xmm2 /* re1 */
+	movss (%edi), %xmm1 /* buf[127-i].im */
+	movss -8(%edi), %xmm3 /* im1 */
+
+	movlhps %xmm2, %xmm0 /* 0.0 | re1 | 0.0 | re0 */
+	movlhps %xmm3, %xmm1 /* 0.0 | im1 | 0.0 | im1 */
+	
+	movaps (%edx), %xmm4 /* w3 | w2 | w1 | w0 */
+	movaps (%ebx), %xmm5 /* d3 | d2 | d1 | d0 */
+
+	shufps $0xb1, %xmm1, %xmm1 /* im1 | 0.0 | im0 | 0.0 */
+	movss 16(%esi), %xmm6 /* re2 */
+	movss 24(%esi), %xmm7 /* re3 */
+	movss -16(%edi), %xmm2 /* im2 */
+	movss -24(%edi), %xmm3 /* im3 */
+	subps %xmm1, %xmm0 /* -im1 | re1 | -im0 | re0 */
+	movlhps %xmm7, %xmm6 /* 0.0 | re3 | 0.0 | re2 */
+	movlhps %xmm3, %xmm2 /* 0.0 | im3 | 0.0 | im2 */
+	mulps %xmm4, %xmm0
+	shufps $0xb1, %xmm2, %xmm2 /* im3 | 0.0 | im2 | 0.0 */
+	movaps 16(%edx), %xmm4 /* w7 | w6 | w5 | w4 */
+	addl $32, %esi
+	subps %xmm2, %xmm6 /* -im3 | re3 | -im2 | re2 */
+	addps %xmm5, %xmm0
+	mulps %xmm4, %xmm6
+	addl $-32, %edi
+	movaps 16(%ebx), %xmm5 /* d7 | d6 | d5 | d4 */
+	movaps %xmm0, (%eax)
+	addps %xmm5, %xmm6
+	addl $32, %edx
+	addl $32, %eax
+	addl $32, %ebx
+	movaps %xmm6, -16(%eax)
+	decl %ecx
+	jnz .second_128_samples
+
+	movl 8(%ebp), %eax
+	leal 512(%eax), %esi /* buf[64].re */
+	leal 508(%eax), %edi /* buf[63].im */ 
+	movl $16, %ecx /* loop count */
+	movl 20(%ebp), %eax /* delay */
+.first_128_delay:
+
+	movss (%esi), %xmm0
+	movss 8(%esi), %xmm2
+	movss (%edi), %xmm1
+	movss -8(%edi), %xmm3
+
+	movlhps %xmm2, %xmm0 /* 0.0 | re1 | 0.0 | re0 */
+	movlhps %xmm3, %xmm1 /* 0.0 | im1 | 0.0 | im0 */
+
+	movaps -16(%edx), %xmm4 /* w3 | w2 | w1 | w0 */
+	shufps $0xb1, %xmm1, %xmm1 /* im1 | 0.0 | im0 | 0.0 */
+	movss 16(%esi), %xmm6 /* re2 */
+	movss 24(%esi), %xmm7 /* re3 */
+	movss -16(%edi), %xmm2 /* im2 */
+	movss -24(%edi), %xmm3 /* im3 */
+	subps %xmm1, %xmm0 /* -im1 | re1 | -im0 | re0 */
+	addl $-32, %edx
+	movlhps %xmm7, %xmm6 /* 0.0 | re3 | 0.0 | re2 */
+	movlhps %xmm3, %xmm2 /* 0.0 | im3 | 0.0 | im2 */
+	mulps %xmm4, %xmm0
+	movaps (%edx), %xmm5 /* w7 | w6 | w5 | w4 */
+	shufps $0xb1, %xmm2, %xmm2 /* im3 | 0.0 | im2 | 0.0 */
+	movaps %xmm0, (%eax)
+	addl $32, %esi
+	subps %xmm2, %xmm6 /* -im3 | re3 | -im2 | re2 */
+	addl $-32, %edi
+	mulps %xmm5, %xmm6
+	addl $32, %eax
+	movaps %xmm6, -16(%eax)
+	decl %ecx
+	jnz .first_128_delay
+
+	movl 8(%ebp), %ebx
+	leal 4(%ebx), %esi /* buf[0].im */
+	leal 1016(%ebx), %edi /* buf[127].re */ 
+	movl $16, %ecx /* loop count */
+.second_128_delay:
+
+	movss (%esi), %xmm0
+	movss 8(%esi), %xmm2
+	movss (%edi), %xmm1
+	movss -8(%edi), %xmm3
+
+	movlhps %xmm2, %xmm0 /* 0.0 | im1 | 0.0 | im0 */
+	movlhps %xmm3, %xmm1 /* 0.0 | re1 | 0.0 | re0 */
+
+	movaps -16(%edx), %xmm4 /* w3 | w2 | w1 | w0 */
+	shufps $0xb1, %xmm1, %xmm1 /* re1 | 0.0 | re0 | 0.0 */
+	movss 16(%esi), %xmm6 /* im2 */
+	movss 24(%esi), %xmm7 /* im3 */
+	movss -16(%edi), %xmm2 /* re2 */
+	movss -24(%edi), %xmm3 /* re3 */
+	subps %xmm0, %xmm1 /* re1 | -im1 | re0 | -im0 */
+	addl $-32, %edx
+	movlhps %xmm7, %xmm6 /* 0.0 | im3 | 0.0 | im2 */
+	movlhps %xmm3, %xmm2 /* 0.0 | re3 | 0.0 | re2 */
+	mulps %xmm4, %xmm1
+	movaps (%edx), %xmm5 /* w7 | w6 | w5 | w4 */
+	shufps $0xb1, %xmm2, %xmm2 /* re3 | 0.0 | re2 | 0.0 */
+	movaps %xmm1, (%eax)
+	addl $32, %esi
+	subps %xmm6, %xmm2 /* re | -im3 | re | -im2 */
+	addl $-32, %edi
+	mulps %xmm5, %xmm2
+	addl $32, %eax
+	movaps %xmm2, -16(%eax)
+	decl %ecx
+	jnz .second_128_delay
+
+	popl %edi
+	popl %esi
+	popl %edx
+	popl %ecx
+	popl %ebx
+	popl %eax
+	
+	leave
+	ret
+	.p2align 4,0
+	
+.global imdct512_window_delay_nol_kni
+	.type imdct512_window_delay_nol_kni, @function
+imdct512_window_delay_nol_kni:
+	
+	pushl %ebp
+	movl  %esp, %ebp
+	
+	pushl %eax
+	pushl %ebx
+	pushl %ecx
+	pushl %edx
+	pushl %esi
+	pushl %edi
+
+	/* movl 20(%ebp), %ebx delay */
+	movl 16(%ebp), %edx /* window */
+
+	movl 8(%ebp), %eax  /* buf */
+	movl $16, %ecx /* loop count */
+	leal 516(%eax), %esi /* buf[64].im */
+	leal 504(%eax), %edi /* buf[63].re */ 
+	movl 12(%ebp), %eax /* data */
+.first_128_sample:
+
+	movss (%esi), %xmm0
+	movss 8(%esi), %xmm2
+	movss (%edi), %xmm1
+	movss -8(%edi), %xmm3
+
+	movlhps %xmm2, %xmm0 /* 0.0 | im1 | 0.0 | im0 */
+	movlhps %xmm3, %xmm1 /* 0.0 | re1 | 0.0 | re0 */
+
+	movaps (%edx), %xmm4 /* w3 | w2 | w1 | w0 */
+	/* movaps (%ebx), %xmm5 d3 | d2 | d1 | d0 */
+	shufps $0xb1, %xmm1, %xmm1 /* re1 | 0.0 | re0 | 0.0 */
+
+	movss 16(%esi), %xmm6 /* im2 */
+	movss 24(%esi), %xmm7 /* im3 */
+	subps %xmm1, %xmm0  /* -re1 | im1 | -re0 | im0 */
+	movss -16(%edi), %xmm2 /* re2 */
+	movss -24(%edi), %xmm3 /* re3 */
+	mulps %xmm4, %xmm0
+	movlhps %xmm7, %xmm6 /* 0.0 | im3 | 0.0 | im2 */
+	movlhps %xmm3, %xmm2 /* 0.0 | re3 | 0.0 | re2 */
+	/* addps %xmm5, %xmm0 */
+	shufps $0xb1, %xmm2, %xmm2 /* re3 | 0.0 | re2 | 0.0 */
+	movaps 16(%edx), %xmm4 /* w7 | w6 | w5 | w4 */
+	/* movaps 16(%ebx), %xmm5  d7 | d6 | d5 | d4 */
+	subps %xmm2, %xmm6 /* -re3 | im3 | -re2 | im2 */
+	addl $32, %edx
+	movaps %xmm0, (%eax)
+	/* addl $32, %ebx */
+	mulps %xmm4, %xmm6
+	addl $32, %esi
+	addl $32, %eax
+	/* addps %xmm5, %xmm6 */
+	addl $-32, %edi
+	movaps %xmm6, -16(%eax)
+	decl %ecx
+	jnz .first_128_sample
+
+	movl 8(%ebp), %esi  /* buf[0].re */
+	leal 1020(%esi), %edi /* buf[127].im */ 
+	movl $16, %ecx /* loop count */
+.second_128_sample:
+
+	movss (%esi), %xmm0 /* buf[i].re */
+	movss 8(%esi), %xmm2 /* re1 */
+	movss (%edi), %xmm1 /* buf[127-i].im */
+	movss -8(%edi), %xmm3 /* im1 */
+
+	movlhps %xmm2, %xmm0 /* 0.0 | re1 | 0.0 | re0 */
+	movlhps %xmm3, %xmm1 /* 0.0 | im1 | 0.0 | im1 */
+	
+	movaps (%edx), %xmm4 /* w3 | w2 | w1 | w0 */
+	/* movaps (%ebx), %xmm5 d3 | d2 | d1 | d0 */
+
+	shufps $0xb1, %xmm1, %xmm1 /* im1 | 0.0 | im0 | 0.0 */
+	movss 16(%esi), %xmm6 /* re2 */
+	movss 24(%esi), %xmm7 /* re3 */
+	movss -16(%edi), %xmm2 /* im2 */
+	movss -24(%edi), %xmm3 /* im3 */
+	subps %xmm1, %xmm0 /* -im1 | re1 | -im0 | re0 */
+	movlhps %xmm7, %xmm6 /* 0.0 | re3 | 0.0 | re2 */
+	movlhps %xmm3, %xmm2 /* 0.0 | im3 | 0.0 | im2 */
+	mulps %xmm4, %xmm0
+	shufps $0xb1, %xmm2, %xmm2 /* im3 | 0.0 | im2 | 0.0 */
+	movaps 16(%edx), %xmm4 /* w7 | w6 | w5 | w4 */
+	addl $32, %esi
+	subps %xmm2, %xmm6 /* -im3 | re3 | -im2 | re2 */
+	/* addps %xmm5, %xmm0 */
+	mulps %xmm4, %xmm6
+	addl $-32, %edi
+	/* movaps 16(%ebx), %xmm5  d7 | d6 | d5 | d4 */
+	movaps %xmm0, (%eax)
+	/* addps %xmm5, %xmm6 */
+	addl $32, %edx
+	addl $32, %eax
+	/* addl $32, %ebx */
+	movaps %xmm6, -16(%eax)
+	decl %ecx
+	jnz .second_128_sample
+
+	movl 8(%ebp), %eax
+	leal 512(%eax), %esi /* buf[64].re */
+	leal 508(%eax), %edi /* buf[63].im */ 
+	movl $16, %ecx /* loop count */
+	movl 20(%ebp), %eax /* delay */
+.first_128_delays:
+
+	movss (%esi), %xmm0
+	movss 8(%esi), %xmm2
+	movss (%edi), %xmm1
+	movss -8(%edi), %xmm3
+
+	movlhps %xmm2, %xmm0 /* 0.0 | re1 | 0.0 | re0 */
+	movlhps %xmm3, %xmm1 /* 0.0 | im1 | 0.0 | im0 */
+
+	movaps -16(%edx), %xmm4 /* w3 | w2 | w1 | w0 */
+	shufps $0xb1, %xmm1, %xmm1 /* im1 | 0.0 | im0 | 0.0 */
+	movss 16(%esi), %xmm6 /* re2 */
+	movss 24(%esi), %xmm7 /* re3 */
+	movss -16(%edi), %xmm2 /* im2 */
+	movss -24(%edi), %xmm3 /* im3 */
+	subps %xmm1, %xmm0 /* -im1 | re1 | -im0 | re0 */
+	addl $-32, %edx
+	movlhps %xmm7, %xmm6 /* 0.0 | re3 | 0.0 | re2 */
+	movlhps %xmm3, %xmm2 /* 0.0 | im3 | 0.0 | im2 */
+	mulps %xmm4, %xmm0
+	movaps (%edx), %xmm5 /* w7 | w6 | w5 | w4 */
+	shufps $0xb1, %xmm2, %xmm2 /* im3 | 0.0 | im2 | 0.0 */
+	movaps %xmm0, (%eax)
+	addl $32, %esi
+	subps %xmm2, %xmm6 /* -im3 | re3 | -im2 | re2 */
+	addl $-32, %edi
+	mulps %xmm5, %xmm6
+	addl $32, %eax
+	movaps %xmm6, -16(%eax)
+	decl %ecx
+	jnz .first_128_delays
+
+	movl 8(%ebp), %ebx
+	leal 4(%ebx), %esi /* buf[0].im */
+	leal 1016(%ebx), %edi /* buf[127].re */ 
+	movl $16, %ecx /* loop count */
+.second_128_delays:
+
+	movss (%esi), %xmm0
+	movss 8(%esi), %xmm2
+	movss (%edi), %xmm1
+	movss -8(%edi), %xmm3
+
+	movlhps %xmm2, %xmm0 /* 0.0 | im1 | 0.0 | im0 */
+	movlhps %xmm3, %xmm1 /* 0.0 | re1 | 0.0 | re0 */
+
+	movaps -16(%edx), %xmm4 /* w3 | w2 | w1 | w0 */
+	shufps $0xb1, %xmm1, %xmm1 /* re1 | 0.0 | re0 | 0.0 */
+	movss 16(%esi), %xmm6 /* im2 */
+	movss 24(%esi), %xmm7 /* im3 */
+	movss -16(%edi), %xmm2 /* re2 */
+	movss -24(%edi), %xmm3 /* re3 */
+	subps %xmm0, %xmm1 /* re1 | -im1 | re0 | -im0 */
+	addl $-32, %edx
+	movlhps %xmm7, %xmm6 /* 0.0 | im3 | 0.0 | im2 */
+	movlhps %xmm3, %xmm2 /* 0.0 | re3 | 0.0 | re2 */
+	mulps %xmm4, %xmm1
+	movaps (%edx), %xmm5 /* w7 | w6 | w5 | w4 */
+	shufps $0xb1, %xmm2, %xmm2 /* re3 | 0.0 | re2 | 0.0 */
+	movaps %xmm1, (%eax)
+	addl $32, %esi
+	subps %xmm6, %xmm2 /* re | -im3 | re | -im2 */
+	addl $-32, %edi
+	mulps %xmm5, %xmm2
+	addl $32, %eax
+	movaps %xmm2, -16(%eax)
+	decl %ecx
+	jnz .second_128_delays
+
+	popl %edi
+	popl %esi
+	popl %edx
+	popl %ecx
+	popl %ebx
+	popl %eax
+	
+	leave
+	ret
+	.p2align 4,0
+#endif
diff --git a/ac3dec/imdct_c.c b/ac3dec/imdct_c.c
new file mode 100644
index 00000000..1f2bfe84
--- /dev/null
+++ b/ac3dec/imdct_c.c
@@ -0,0 +1,218 @@
+/* 
+ *  imdct.c
+ *
+ *	Copyright (C) Aaron Holtzman - May 1999
+ *
+ *  This file is part of ac3dec, a free Dolby AC-3 stream decoder.
+ *	
+ *  ac3dec is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *   
+ *  ac3dec is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *   
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ *
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "ac3.h"
+#include "ac3_internal.h"
+
+#include "downmix.h"
+#include "imdct_c.h"
+#include "srfft.h"
+
+
+#define N 512
+
+extern void (*imdct_do_512) (float data[],float delay[]);
+extern void (*imdct_do_512_nol) (float data[], float delay[]);
+extern void (*fft_64p) (complex_t *);
+
+extern const int pm128[];
+extern float window[];
+extern complex_t buf[128];
+
+extern void fft_64p_c (complex_t *);
+extern void fft_128p_c (complex_t *);
+
+static void imdct_do_512_c (float data[],float delay[]);
+static void imdct_do_512_nol_c (float data[], float delay[]);
+
+/* Twiddle factors for IMDCT */
+static float xcos1[128] __attribute__((aligned(16)));
+static float xsin1[128] __attribute__((aligned(16)));
+
+
+int imdct_init_c (void)
+{
+	int i;
+	float scale = 255.99609372;
+
+	imdct_do_512 = imdct_do_512_c;
+	imdct_do_512_nol = imdct_do_512_nol_c;
+	fft_64p = fft_64p_c;
+
+	/* Twiddle factors to turn IFFT into IMDCT */
+         
+	for (i=0; i < 128; i++) {
+		xcos1[i] = cos(2.0f * M_PI * (8*i+1)/(8*N)) * scale; 
+		xsin1[i] = sin(2.0f * M_PI * (8*i+1)/(8*N)) * scale;
+	}
+
+	return 0;
+}
+
+
+static void imdct_do_512_c (float data[], float delay[])
+{
+	int i, j;
+	float tmp_a_r, tmp_a_i;
+	float *data_ptr;
+	float *delay_ptr;
+	float *window_ptr;
+
+// 512 IMDCT with source and dest data in 'data'
+// Pre IFFT complex multiply plus IFFT complex conjugate
+
+	for( i=0; i < 128; i++) {
+		j = pm128[i];
+		//a = (data[256-2*j-1] - data[2*j]) * (xcos1[j] + xsin1[j]);
+		//c = data[2*j] * xcos1[j];
+		//b = data[256-2*j-1] * xsin1[j];
+		//buf1[i].re = a - b + c;
+		//buf1[i].im = b + c;
+		buf[i].re = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
+		buf[i].im = -1.0 * (data[2*j] * xcos1[j] + data[256-2*j-1] * xsin1[j]);
+	}
+
+	fft_128p_c (&buf[0]);
+
+// Post IFFT complex multiply  plus IFFT complex conjugate
+	for (i=0; i < 128; i++) {
+		tmp_a_r = buf[i].re;
+		tmp_a_i = buf[i].im;
+		//a = (tmp_a_r - tmp_a_i) * (xcos1[j] + xsin1[j]);
+		//b = tmp_a_r * xsin1[j];
+		//c = tmp_a_i * xcos1[j];
+		//buf[j].re = a - b + c;
+		//buf[j].im = b + c;
+		buf[i].re =(tmp_a_r * xcos1[i])  +  (tmp_a_i  * xsin1[i]);
+		buf[i].im =(tmp_a_r * xsin1[i])  -  (tmp_a_i  * xcos1[i]);
+	}
+
+	data_ptr = data;
+	delay_ptr = delay;
+	window_ptr = window;
+
+// Window and convert to real valued signal
+	for (i=0; i< 64; i++) {
+		*data_ptr++   = -buf[64+i].im   * *window_ptr++ + *delay_ptr++;
+		*data_ptr++   = buf[64-i-1].re * *window_ptr++ + *delay_ptr++;
+	}
+
+	for(i=0; i< 64; i++) {
+		*data_ptr++  = -buf[i].re       * *window_ptr++ + *delay_ptr++;
+		*data_ptr++  = buf[128-i-1].im * *window_ptr++ + *delay_ptr++;
+	}
+
+// The trailing edge of the window goes into the delay line
+	delay_ptr = delay;
+
+	for(i=0; i< 64; i++) {
+		*delay_ptr++  = -buf[64+i].re   * *--window_ptr;
+		*delay_ptr++  =  buf[64-i-1].im * *--window_ptr;
+	}
+
+	for(i=0; i<64; i++) {
+		*delay_ptr++  =  buf[i].im       * *--window_ptr;
+		*delay_ptr++  = -buf[128-i-1].re * *--window_ptr;
+	}
+}
+
+
+static void imdct_do_512_nol_c (float data[], float delay[])
+{
+	int i, j;
+
+	float tmp_a_i;
+	float tmp_a_r;
+
+	float *data_ptr;
+	float *delay_ptr;
+	float *window_ptr;
+
+        //
+        // 512 IMDCT with source and dest data in 'data'
+        //
+
+	// Pre IFFT complex multiply plus IFFT cmplx conjugate
+
+        for( i=0; i < 128; i++) {
+	/* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) */
+		j = pm128[i];
+	//a = (data[256-2*j-1] - data[2*j]) * (xcos1[j] + xsin1[j]);
+	//c = data[2*j] * xcos1[j];
+	//b = data[256-2*j-1] * xsin1[j];
+	//buf1[i].re = a - b + c;
+
+         //buf1[i].im = b + c;
+		buf[i].re = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
+		buf[i].im = -1.0 * (data[2*j] * xcos1[j] + data[256-2*j-1] * xsin1[j]);
+	}
+       
+	fft_128p_c (&buf[0]);
+
+       /* Post IFFT complex multiply  plus IFFT complex conjugate*/
+	for (i=0; i < 128; i++) {
+		/* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
+		/* int j1 = i; */
+		tmp_a_r = buf[i].re;
+		tmp_a_i = buf[i].im;
+		//a = (tmp_a_r - tmp_a_i) * (xcos1[j] + xsin1[j]);
+		//b = tmp_a_r * xsin1[j];
+		//c = tmp_a_i * xcos1[j];
+		//buf[j].re = a - b + c;
+		//buf[j].im = b + c;
+		buf[i].re =(tmp_a_r * xcos1[i])  +  (tmp_a_i  * xsin1[i]);
+		buf[i].im =(tmp_a_r * xsin1[i])  -  (tmp_a_i  * xcos1[i]);
+	}
+       
+	data_ptr = data;
+	delay_ptr = delay;
+	window_ptr = window;
+
+	/* Window and convert to real valued signal, no overlap here*/
+	for (i=0; i< 64; i++) { 
+		*data_ptr++   = -buf[64+i].im   * *window_ptr++; 
+		*data_ptr++   = buf[64-i-1].re * *window_ptr++; 
+	}
+
+	for(i=0; i< 64; i++) { 
+		*data_ptr++  = -buf[i].re       * *window_ptr++; 
+		*data_ptr++  = buf[128-i-1].im * *window_ptr++; 
+	}
+       
+	/* The trailing edge of the window goes into the delay line */
+	delay_ptr = delay;
+
+	for(i=0; i< 64; i++) { 
+		*delay_ptr++  = -buf[64+i].re   * *--window_ptr; 
+		*delay_ptr++  =  buf[64-i-1].im * *--window_ptr; 
+	}
+
+	for(i=0; i<64; i++) {
+		*delay_ptr++  =  buf[i].im       * *--window_ptr; 
+		*delay_ptr++  = -buf[128-i-1].re * *--window_ptr; 
+	}
+}
diff --git a/ac3dec/imdct_c.h b/ac3dec/imdct_c.h
new file mode 100644
index 00000000..d2c31b54
--- /dev/null
+++ b/ac3dec/imdct_c.h
@@ -0,0 +1,36 @@
+/*****
+*
+* This file is part of the OMS program.
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by 
+* the Free Software Foundation; either version 2, or (at your option)
+* any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; see the file COPYING.  If not, write to
+* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+*****/
+
+#ifndef __IMDCT_C_H__
+#define __IMDCT_C_H__
+
+#include "cmplx.h"
+
+int imdct_init_c (void);
+
+void fft_128p_c (complex_t *);
+void fft_64p_c (complex_t *);
+
+void imdct512_pre_ifft_twiddle_c (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse);
+void imdct512_post_ifft_twiddle_c (complex_t *buf, float *xcos_sin_sse);
+void imdct512_window_delay_c (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
+void imdct512_window_delay_nol_c (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
+
+#endif
diff --git a/ac3dec/imdct_kni.c b/ac3dec/imdct_kni.c
new file mode 100644
index 00000000..b44547a9
--- /dev/null
+++ b/ac3dec/imdct_kni.c
@@ -0,0 +1,103 @@
+/* 
+ *  imdct_kni.c
+ *
+ *	Copyright (C) Aaron Holtzman - May 1999
+ *
+ *  This file is part of ac3dec, a free Dolby AC-3 stream decoder.
+ *	
+ *  ac3dec is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *   
+ *  ac3dec is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *   
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ *
+ *
+ */
+
+#ifdef __i386__
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <mm_accel.h>
+#include "ac3.h"
+#include "ac3_internal.h"
+
+#include "downmix.h"
+#include "imdct_kni.h"
+#include "srfft.h"
+
+#define N 512
+
+/* Delay buffer for time domain interleaving */
+static float xcos_sin_sse[128 * 4] __attribute__((aligned(16)));
+
+extern void (*imdct_do_512) (float data[],float delay[]);
+extern void (*imdct_do_512_nol) (float data[], float delay[]);
+extern void (*fft_64p) (complex_t *);
+
+extern const int pm128[];
+extern float window[];
+extern complex_t buf[128];
+
+extern void fft_64p_kni (complex_t *);
+extern void fft_128p_kni (complex_t *);
+
+static void imdct_do_512_kni (float data[], float delay[]);
+static void imdct_do_512_nol_kni (float data[], float delay[]);
+
+
+int imdct_init_kni (void)
+{
+	uint32_t accel = mm_accel ();
+
+	if (accel & MM_ACCEL_X86_MMXEXT) {
+		int i;
+		float scale = 255.99609372;
+
+		fprintf (stderr, "Using SSE for IMDCT\n");
+		imdct_do_512 = imdct_do_512_kni;
+		imdct_do_512_nol = imdct_do_512_nol_kni;
+		fft_64p = fft_64p_kni;
+
+		for (i=0; i < 128; i++) {
+			float xcos_i = cos(2.0f * M_PI * (8*i+1)/(8*N)) * scale;
+                       	float xsin_i = sin(2.0f * M_PI * (8*i+1)/(8*N)) * scale;
+                       	xcos_sin_sse[i * 4] = xcos_i;
+                       	xcos_sin_sse[i * 4 + 1] = -xsin_i;
+                       	xcos_sin_sse[i * 4 + 2] = -xsin_i;
+                       	xcos_sin_sse[i * 4 + 3] = -xcos_i;
+               	}
+
+		return 0;
+	} else
+		return -1;
+}
+
+
+static void imdct_do_512_kni (float data[], float delay[])
+{
+	imdct512_pre_ifft_twiddle_kni (pm128, buf, data, xcos_sin_sse);
+	fft_128p_kni (buf);
+	imdct512_post_ifft_twiddle_kni (buf, xcos_sin_sse);
+	imdct512_window_delay_kni (buf, data, window, delay);
+}
+
+
+static void imdct_do_512_nol_kni (float data[], float delay[])
+{
+	imdct512_pre_ifft_twiddle_kni (pm128, buf, data, xcos_sin_sse);  
+	fft_128p_kni (buf);
+	imdct512_post_ifft_twiddle_kni (buf, xcos_sin_sse);
+	imdct512_window_delay_nol_kni (buf, data, window, delay);
+}
+
+#endif
diff --git a/ac3dec/imdct_kni.h b/ac3dec/imdct_kni.h
new file mode 100644
index 00000000..2193c075
--- /dev/null
+++ b/ac3dec/imdct_kni.h
@@ -0,0 +1,36 @@
+/*****
+*
+* This file is part of the OMS program.
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by 
+* the Free Software Foundation; either version 2, or (at your option)
+* any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; see the file COPYING.  If not, write to
+* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+*****/
+
+#ifndef __IMDCT_KNI_H__
+#define __IMDCT_KNI_H__
+
+#include "cmplx.h"
+
+int imdct_init_kni (void);
+
+void fft_128p_kni(complex_t *);
+void fft_64p_kni(complex_t *);
+
+void imdct512_pre_ifft_twiddle_kni(const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse);
+void imdct512_post_ifft_twiddle_kni(complex_t *buf, float *xcos_sin_sse);
+void imdct512_window_delay_kni(complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
+void imdct512_window_delay_nol_kni(complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
+
+#endif
diff --git a/ac3dec/mm_accel.h b/ac3dec/mm_accel.h
new file mode 100644
index 00000000..82dc5837
--- /dev/null
+++ b/ac3dec/mm_accel.h
@@ -0,0 +1,36 @@
+/*****
+*
+* This file is part of the OMS program.
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by 
+* the Free Software Foundation; either version 2, or (at your option)
+* any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; see the file COPYING.  If not, write to
+* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+*****/
+
+#ifndef __MM_ACCEL_H__
+#define __MM_ACCEL_H__
+
+#include <inttypes.h>
+
+// generic accelerations
+#define MM_ACCEL_MLIB		0x00000001
+
+// x86 accelerations
+#define MM_ACCEL_X86_MMX	0x80000000
+#define MM_ACCEL_X86_3DNOW	0x40000000
+#define MM_ACCEL_X86_MMXEXT	0x20000000
+
+uint32_t mm_accel (void);
+
+#endif
diff --git a/ac3dec/parse.c b/ac3dec/parse.c
index 3560bc5a..49ea57c6 100644
--- a/ac3dec/parse.c
+++ b/ac3dec/parse.c
@@ -31,15 +31,16 @@
 #include "bitstream.h"
 #include "stats.h"
 #include "debug.h"
+#include "crc.h"
 #include "parse.h"
 
 /* Misc LUT */
-static const uint_16 nfchans[8] = {2,1,2,3,3,4,4,5};
+static const uint16_t nfchans[8] = {2,1,2,3,3,4,4,5};
 
 struct frmsize_s
 {
-	uint_16 bit_rate;
-	uint_16 frm_size[3];
+	uint16_t bit_rate;
+	uint16_t frm_size[3];
 };
 
 static const struct frmsize_s frmsizecod_tbl[64] = 
@@ -85,8 +86,7 @@ static const struct frmsize_s frmsizecod_tbl[64] =
 };
 
 /* Parse a syncinfo structure, minus the sync word */
-void
-parse_syncinfo(syncinfo_t *syncinfo,uint_8 *data)
+void parse_syncinfo(syncinfo_t *syncinfo, uint8_t *data)
 {
 	//
 	// We need to read in the entire syncinfo struct (0x0b77 + 24 bits)
@@ -96,10 +96,8 @@ parse_syncinfo(syncinfo_t *syncinfo,uint_8 *data)
 	// Get the sampling rate 
 	syncinfo->fscod  = (data[2] >> 6) & 0x3;
 
-	if(syncinfo->fscod == 3)
-	{
+	if(syncinfo->fscod == 3) {
 		//invalid sampling rate code
-		error_flag = 1;	
 		return;
 	}
 	else if(syncinfo->fscod == 2)
@@ -119,15 +117,13 @@ parse_syncinfo(syncinfo_t *syncinfo,uint_8 *data)
 
 }
 
-/*
+
+/**
  * This routine fills a bsi struct from the AC3 stream
- */
+ **/
 
-void
-parse_bsi(bsi_t *bsi)
+void parse_bsi(bsi_t *bsi)
 {
-	uint_32 i;
-
 	/* Check the AC-3 version number */
 	bsi->bsid = bitstream_get(5);
 
@@ -159,25 +155,19 @@ parse_bsi(bsi_t *bsi)
 	bsi->dialnorm = bitstream_get(5);
 
 	/* Does compression gain exist? */
-	bsi->compre = bitstream_get(1);
-	if (bsi->compre)
-	{
+	if ((bsi->compre = bitstream_get(1))) {
 		/* Get compression gain */
 		bsi->compr = bitstream_get(8);
 	}
 
 	/* Does language code exist? */
-	bsi->langcode = bitstream_get(1);
-	if (bsi->langcode)
-	{
+	if ((bsi->langcode = bitstream_get(1))) {
 		/* Get langauge code */
 		bsi->langcod = bitstream_get(8);
 	}
 
 	/* Does audio production info exist? */
-	bsi->audprodie = bitstream_get(1);
-	if (bsi->audprodie)
-	{
+	if ((bsi->audprodie = bitstream_get(1))) {
 		/* Get mix level */
 		bsi->mixlevel = bitstream_get(5);
 
@@ -186,31 +176,24 @@ parse_bsi(bsi_t *bsi)
 	}
 
 	/* If we're in dual mono mode then get some extra info */
-	if (bsi->acmod ==0)
-	{
+	if (!bsi->acmod) {
 		/* Get the dialogue normalization level two */
 		bsi->dialnorm2 = bitstream_get(5);
 
 		/* Does compression gain two exist? */
-		bsi->compr2e = bitstream_get(1);
-		if (bsi->compr2e)
-		{
+		if ((bsi->compr2e = bitstream_get(1))) {
 			/* Get compression gain two */
 			bsi->compr2 = bitstream_get(8);
 		}
 
 		/* Does language code two exist? */
-		bsi->langcod2e = bitstream_get(1);
-		if (bsi->langcod2e)
-		{
+		if ((bsi->langcod2e = bitstream_get(1))) {
 			/* Get langauge code two */
 			bsi->langcod2 = bitstream_get(8);
 		}
 
 		/* Does audio production info two exist? */
-		bsi->audprodi2e = bitstream_get(1);
-		if (bsi->audprodi2e)
-		{
+		if ((bsi->audprodi2e = bitstream_get(1))) {
 			/* Get mix level two */
 			bsi->mixlevel2 = bitstream_get(5);
 
@@ -226,22 +209,17 @@ parse_bsi(bsi_t *bsi)
 	bsi->origbs = bitstream_get(1);
 	
 	/* Does timecode one exist? */
-	bsi->timecod1e = bitstream_get(1);
-
-	if(bsi->timecod1e)
+	if ((bsi->timecod1e = bitstream_get(1)))
 		bsi->timecod1 = bitstream_get(14);
 
 	/* Does timecode two exist? */
-	bsi->timecod2e = bitstream_get(1);
-
-	if(bsi->timecod2e)
+	if ((bsi->timecod2e = bitstream_get(1)))
 		bsi->timecod2 = bitstream_get(14);
 
 	/* Does addition info exist? */
-	bsi->addbsie = bitstream_get(1);
+	if ((bsi->addbsie = bitstream_get(1))) {
+		uint32_t i;
 
-	if(bsi->addbsie)
-	{
 		/* Get how much info is there */
 		bsi->addbsil = bitstream_get(6);
 
@@ -253,52 +231,41 @@ parse_bsi(bsi_t *bsi)
 	stats_print_bsi(bsi);
 }
 
+
 /* More pain inducing parsing */
-void
-parse_audblk(bsi_t *bsi,audblk_t *audblk)
+void parse_audblk(bsi_t *bsi,audblk_t *audblk)
 {
 	int i,j;
 
-	for (i=0;i < bsi->nfchans; i++)
-	{
+	for (i=0; i < bsi->nfchans; i++) {
 		/* Is this channel an interleaved 256 + 256 block ? */
 		audblk->blksw[i] = bitstream_get(1);
 	}
 
-	for (i=0;i < bsi->nfchans; i++)
-	{
+	for (i=0;i < bsi->nfchans; i++) {
 		/* Should we dither this channel? */
 		audblk->dithflag[i] = bitstream_get(1);
 	}
 
 	/* Does dynamic range control exist? */
-	audblk->dynrnge = bitstream_get(1);
-	if (audblk->dynrnge)
-	{
+	if ((audblk->dynrnge = bitstream_get(1))) {
 		/* Get dynamic range info */
 		audblk->dynrng = bitstream_get(8);
 	}
 
 	/* If we're in dual mono mode then get the second channel DR info */
-	if (bsi->acmod == 0)
-	{
+	if (bsi->acmod == 0) {
 		/* Does dynamic range control two exist? */
-		audblk->dynrng2e = bitstream_get(1);
-		if (audblk->dynrng2e)
-		{
+		if ((audblk->dynrng2e = bitstream_get(1))) {
 			/* Get dynamic range info */
 			audblk->dynrng2 = bitstream_get(8);
 		}
 	}
 
 	/* Does coupling strategy exist? */
-	audblk->cplstre = bitstream_get(1);
-	if (audblk->cplstre)
-	{
+	if ((audblk->cplstre = bitstream_get(1))) {
 		/* Is coupling turned on? */
-		audblk->cplinu = bitstream_get(1);
-		if(audblk->cplinu)
-		{
+		if ((audblk->cplinu = bitstream_get(1))) {
 			for(i=0;i < bsi->nfchans; i++)
 				audblk->chincpl[i] = bitstream_get(1);
 			if(bsi->acmod == 0x2)
@@ -315,30 +282,23 @@ parse_audblk(bsi_t *bsi,audblk_t *audblk)
 			 * band */
 			audblk->ncplbnd = audblk->ncplsubnd; 
 
-			for(i=1; i< audblk->ncplsubnd; i++)
-			{
+			for(i=1; i< audblk->ncplsubnd; i++) {
 				audblk->cplbndstrc[i] = bitstream_get(1);
 				audblk->ncplbnd -= audblk->cplbndstrc[i];
 			}
 		}
 	}
 
-	if(audblk->cplinu)
-	{
+	if(audblk->cplinu) {
 		/* Loop through all the channels and get their coupling co-ords */	
-		for(i=0;i < bsi->nfchans;i++)
-		{
+		for(i=0;i < bsi->nfchans;i++) {
 			if(!audblk->chincpl[i])
 				continue;
 
 			/* Is there new coupling co-ordinate info? */
-			audblk->cplcoe[i] = bitstream_get(1);
-
-			if(audblk->cplcoe[i])
-			{
+			if ((audblk->cplcoe[i] = bitstream_get(1))) {
 				audblk->mstrcplco[i] = bitstream_get(2); 
-				for(j=0;j < audblk->ncplbnd; j++)
-				{
+				for(j=0;j < audblk->ncplbnd; j++) {
 					audblk->cplcoexp[i][j] = bitstream_get(4); 
 					audblk->cplcomant[i][j] = bitstream_get(4); 
 				}
@@ -347,8 +307,7 @@ parse_audblk(bsi_t *bsi,audblk_t *audblk)
 
 		/* If we're in dual mono mode, there's going to be some phase info */
 		if( (bsi->acmod == 0x2) && audblk->phsflginu && 
-				(audblk->cplcoe[0] || audblk->cplcoe[1]))
-		{
+				(audblk->cplcoe[0] || audblk->cplcoe[1])) {
 			for(j=0;j < audblk->ncplbnd; j++)
 				audblk->phsflg[j] = bitstream_get(1); 
 
@@ -356,23 +315,17 @@ parse_audblk(bsi_t *bsi,audblk_t *audblk)
 	}
 
 	/* If we're in dual mono mode, there may be a rematrix strategy */
-	if(bsi->acmod == 0x2)
-	{
-		audblk->rematstr = bitstream_get(1);
-		if(audblk->rematstr)
-		{
-			if (audblk->cplinu == 0) 
-			{ 
+	if(bsi->acmod == 0x2) {
+		if ((audblk->rematstr = bitstream_get(1))) {
+			if (!audblk->cplinu) { 
 				for(i = 0; i < 4; i++) 
 					audblk->rematflg[i] = bitstream_get(1);
 			}
-			if((audblk->cplbegf > 2) && audblk->cplinu) 
-			{
+			if((audblk->cplbegf > 2) && audblk->cplinu) {
 				for(i = 0; i < 4; i++) 
 					audblk->rematflg[i] = bitstream_get(1);
 			}
-			if((audblk->cplbegf <= 2) && audblk->cplinu) 
-			{ 
+			if((audblk->cplbegf <= 2) && audblk->cplinu) { 
 				for(i = 0; i < 3; i++) 
 					audblk->rematflg[i] = bitstream_get(1);
 			} 
@@ -383,8 +336,7 @@ parse_audblk(bsi_t *bsi,audblk_t *audblk)
 		}
 	}
 
-	if (audblk->cplinu)
-	{
+	if (audblk->cplinu) {
 		/* Get the coupling channel exponent strategy */
 		audblk->cplexpstr = bitstream_get(2);
 		audblk->ncplgrps = (audblk->cplendmant - audblk->cplstrtmant) / 
@@ -399,18 +351,13 @@ parse_audblk(bsi_t *bsi,audblk_t *audblk)
 		audblk->lfeexpstr = bitstream_get(1);
 
 	/* Determine the bandwidths of all the fbw channels */
-	for(i = 0; i < bsi->nfchans; i++) 
-	{ 
-		uint_16 grp_size;
-
-		if(audblk->chexpstr[i] != EXP_REUSE) 
-		{ 
-			if (audblk->cplinu && audblk->chincpl[i]) 
-			{
+	for(i = 0; i < bsi->nfchans; i++) { 
+		uint16_t grp_size;
+
+		if(audblk->chexpstr[i] != EXP_REUSE) { 
+			if (audblk->cplinu && audblk->chincpl[i]) {
 				audblk->endmant[i] = audblk->cplstrtmant;
-			}
-			else
-			{
+			} else {
 				audblk->chbwcod[i] = bitstream_get(6); 
 				audblk->endmant[i] = ((audblk->chbwcod[i] + 12) * 3) + 37;
 			}
@@ -422,18 +369,15 @@ parse_audblk(bsi_t *bsi,audblk_t *audblk)
 	}
 
 	/* Get the coupling exponents if they exist */
-	if(audblk->cplinu && (audblk->cplexpstr != EXP_REUSE))
-	{
+	if(audblk->cplinu && (audblk->cplexpstr != EXP_REUSE)) {
 		audblk->cplabsexp = bitstream_get(4);
 		for(i=0;i< audblk->ncplgrps;i++)
 			audblk->cplexps[i] = bitstream_get(7);
 	}
 
 	/* Get the fwb channel exponents */
-	for(i=0;i < bsi->nfchans; i++)
-	{
-		if(audblk->chexpstr[i] != EXP_REUSE)
-		{
+	for(i=0;i < bsi->nfchans; i++) {
+		if(audblk->chexpstr[i] != EXP_REUSE) {
 			audblk->exps[i][0] = bitstream_get(4);			
 			for(j=1;j<=audblk->nchgrps[i];j++)
 				audblk->exps[i][j] = bitstream_get(7);
@@ -442,8 +386,7 @@ parse_audblk(bsi_t *bsi,audblk_t *audblk)
 	}
 
 	/* Get the lfe channel exponents */
-	if(bsi->lfeon && (audblk->lfeexpstr != EXP_REUSE))
-	{
+	if(bsi->lfeon && (audblk->lfeexpstr != EXP_REUSE)) {
 		audblk->lfeexps[0] = bitstream_get(4);
 		audblk->lfeexps[1] = bitstream_get(7);
 		audblk->lfeexps[2] = bitstream_get(7);
@@ -452,8 +395,7 @@ parse_audblk(bsi_t *bsi,audblk_t *audblk)
 	/* Get the parametric bit allocation parameters */
 	audblk->baie = bitstream_get(1);
 
-	if(audblk->baie)
-	{
+	if(audblk->baie) {
 		audblk->sdcycod = bitstream_get(2);
 		audblk->fdcycod = bitstream_get(2);
 		audblk->sgaincod = bitstream_get(2);
@@ -464,23 +406,19 @@ parse_audblk(bsi_t *bsi,audblk_t *audblk)
 	/* Get the SNR off set info if it exists */
 	audblk->snroffste = bitstream_get(1);
 
-	if(audblk->snroffste)
-	{
+	if(audblk->snroffste) {
 		audblk->csnroffst = bitstream_get(6);
 
-		if(audblk->cplinu)
-		{
+		if(audblk->cplinu) {
 			audblk->cplfsnroffst = bitstream_get(4);
 			audblk->cplfgaincod = bitstream_get(3);
 		}
 
-		for(i = 0;i < bsi->nfchans; i++)
-		{
+		for(i = 0;i < bsi->nfchans; i++) {
 			audblk->fsnroffst[i] = bitstream_get(4);
 			audblk->fgaincod[i] = bitstream_get(3);
 		}
-		if(bsi->lfeon)
-		{
+		if(bsi->lfeon) {
 
 			audblk->lfefsnroffst = bitstream_get(4);
 			audblk->lfefgaincod = bitstream_get(3);
@@ -488,12 +426,10 @@ parse_audblk(bsi_t *bsi,audblk_t *audblk)
 	}
 
 	/* Get coupling leakage info if it exists */
-	if(audblk->cplinu)
-	{
+	if(audblk->cplinu) {
 		audblk->cplleake = bitstream_get(1);	
 		
-		if(audblk->cplleake)
-		{
+		if(audblk->cplleake) {
 			audblk->cplfleak = bitstream_get(3);
 			audblk->cplsleak = bitstream_get(3);
 		}
@@ -502,32 +438,26 @@ parse_audblk(bsi_t *bsi,audblk_t *audblk)
 	/* Get the delta bit alloaction info */
 	audblk->deltbaie = bitstream_get(1);	
 	
-	if(audblk->deltbaie)
-	{
+	if(audblk->deltbaie) {
 		if(audblk->cplinu)
 			audblk->cpldeltbae = bitstream_get(2);
 
 		for(i = 0;i < bsi->nfchans; i++)
 			audblk->deltbae[i] = bitstream_get(2);
 
-		if (audblk->cplinu && (audblk->cpldeltbae == DELTA_BIT_NEW))
-		{
+		if (audblk->cplinu && (audblk->cpldeltbae == DELTA_BIT_NEW)) {
 			audblk->cpldeltnseg = bitstream_get(3);
-			for(i = 0;i < audblk->cpldeltnseg + 1; i++)
-			{
+			for(i = 0;i < audblk->cpldeltnseg + 1; i++) {
 				audblk->cpldeltoffst[i] = bitstream_get(5);
 				audblk->cpldeltlen[i] = bitstream_get(4);
 				audblk->cpldeltba[i] = bitstream_get(3);
 			}
 		}
 
-		for(i = 0;i < bsi->nfchans; i++)
-		{
-			if (audblk->deltbae[i] == DELTA_BIT_NEW)
-			{
+		for(i = 0;i < bsi->nfchans; i++) {
+			if (audblk->deltbae[i] == DELTA_BIT_NEW) {
 				audblk->deltnseg[i] = bitstream_get(3);
-				for(j = 0; j < audblk->deltnseg[i] + 1; j++)
-				{
+				for(j = 0; j < audblk->deltnseg[i] + 1; j++) {
 					audblk->deltoffst[i][j] = bitstream_get(5);
 					audblk->deltlen[i][j] = bitstream_get(4);
 					audblk->deltba[i][j] = bitstream_get(3);
@@ -537,61 +467,15 @@ parse_audblk(bsi_t *bsi,audblk_t *audblk)
 	}
 
 	/* Check to see if there's any dummy info to get */
-	if((audblk->skiple =  bitstream_get(1)))
-	{
-		uint_16 skip_data;
+	if((audblk->skiple =  bitstream_get(1))) {
+		uint16_t skip_data;
 
 		audblk->skipl = bitstream_get(9);
-		//XXX remove
-		//fprintf(stderr,"(parse) skipping %d bytes\n",audblk->skipl);
 
-		for(i = 0; i < audblk->skipl ; i++)
-		{
+		for (i = 0; i < audblk->skipl; i++) {
 			skip_data = bitstream_get(8);
-			//XXX remove
-			//fprintf(stderr,"skipped data %2x\n",skip_data);
-			//if(skip_data != 0)
-			//{	
-				//dprintf("(parse) Invalid skipped data %2x\n",skip_data);
-				//exit(1);
-			//}
 		}
 	}
 
 	stats_print_audblk(bsi,audblk);
 }
-
-void
-parse_auxdata(syncinfo_t *syncinfo)
-{
-	//FIXME keep this now that we don't really need it?
-#if 0
-	int i;
-	int skip_length;
-	uint_16 crc;
-	uint_16 auxdatae;
-
-	skip_length = (syncinfo->frame_size * 16)  - bitstream_get_total_bits() - 17 - 1;
-
-	//XXX remove
-	//dprintf("(auxdata) skipping %d auxbits\n",skip_length);
-	
-	for(i=0; i <  skip_length; i++)
-		//printf("Skipped bit %i\n",(uint_16)bitstream_get(1));
-		bitstream_get(1);
-
-	//get the auxdata exists bit
-	auxdatae = bitstream_get(1);	
-
-	//XXX remove
-	//dprintf("auxdatae = %i\n",auxdatae);
-
-	//Skip the CRC reserved bit
-	bitstream_get(1);
-
-	//Get the crc
-	crc = bitstream_get(16);
-#endif
-}
-
-
diff --git a/ac3dec/parse.h b/ac3dec/parse.h
index cdaee66f..6264fea1 100644
--- a/ac3dec/parse.h
+++ b/ac3dec/parse.h
@@ -21,8 +21,6 @@
  *
  */
 
-void parse_syncinfo(syncinfo_t *syncinfo,uint_8 *data);
+void parse_syncinfo(syncinfo_t *syncinfo,uint8_t *data);
 void parse_audblk(bsi_t *bsi,audblk_t *audblk);
 void parse_bsi(bsi_t *bsi);
-void parse_auxdata(syncinfo_t *syncinfo);
-
diff --git a/ac3dec/rematrix.c b/ac3dec/rematrix.c
index caa70943..f1df19b1 100644
--- a/ac3dec/rematrix.c
+++ b/ac3dec/rematrix.c
@@ -28,52 +28,60 @@
 #include "ac3_internal.h"
 
 
-#include "decode.h"
 #include "rematrix.h"
 
+
 struct rematrix_band_s
 {
-	uint_32 start;
-	uint_32 end;
+	uint32_t start;
+	uint32_t end;
+} rematrix_band[] = {
+	{13, 24},
+	{25, 36},
+	{37, 60},
+	{61, 252}
 };
 
-struct rematrix_band_s rematrix_band[] = { {13,24}, {25,36}, {37 ,60}, {61,252}};
 
-static inline uint_32 min(uint_32 a,uint_32 b);
+/**
+ *
+ **/
 
-static inline uint_32
-min(uint_32 a,uint_32 b)
+inline uint32_t min (uint32_t a, uint32_t b)
 {
-	return (a < b ? a : b);
+	return (a < b) ? a : b;
 }
 
-/* This routine simply does stereo rematixing for the 2 channel 
- * stereo mode */
-void rematrix(audblk_t *audblk, stream_samples_t samples)
+
+/**
+ * This routine simply does stereo remartixing for the 2 channel 
+ * stereo mode
+ **/
+
+void rematrix (audblk_t *audblk, stream_samples_t samples)
 {
-	uint_32 num_bands;
-	uint_32 start;
-	uint_32 end;
-	uint_32 i,j;
-	float left,right;
+	uint32_t num_bands;
+	uint32_t start;
+	uint32_t end;
+	int i,j;
 
-	if(!audblk->cplinu || audblk->cplbegf > 2)
+	if (!audblk->cplinu || audblk->cplbegf > 2)
 		num_bands = 4;
 	else if (audblk->cplbegf > 0)
 		num_bands = 3;
 	else
 		num_bands = 2;
 
-	for(i=0;i < num_bands; i++)
-	{
-		if(!audblk->rematflg[i])
+	for (i=0; i < num_bands; i++) {
+		if (!audblk->rematflg[i])
 			continue;
 
 		start = rematrix_band[i].start;
-		end = min(rematrix_band[i].end ,12 * audblk->cplbegf + 36);
+		end = min (rematrix_band[i].end ,12 * audblk->cplbegf + 36);
 	
-		for(j=start;j < end; j++)
-		{
+		for (j=start;j < end; j++) {
+			float left,right;
+
 			left  = samples[0][j] + samples[1][j];
 			right = samples[0][j] - samples[1][j];
 			samples[0][j] = left;
diff --git a/ac3dec/sanity_check.c b/ac3dec/sanity_check.c
index 461f20ea..d475de14 100644
--- a/ac3dec/sanity_check.c
+++ b/ac3dec/sanity_check.c
@@ -28,8 +28,11 @@
 #include "sanity_check.h"
 
 
-void 
-sanity_check_init(syncinfo_t *syncinfo, bsi_t *bsi, audblk_t *audblk)
+/**
+ *
+ **/
+
+void sanity_check_init(syncinfo_t *syncinfo, bsi_t *bsi, audblk_t *audblk)
 {
 	syncinfo->magic = AC3_MAGIC_NUMBER;
 	bsi->magic = AC3_MAGIC_NUMBER;
@@ -38,94 +41,84 @@ sanity_check_init(syncinfo_t *syncinfo, bsi_t *bsi, audblk_t *audblk)
 	audblk->magic3 = AC3_MAGIC_NUMBER;
 }
 
-void 
-sanity_check(syncinfo_t *syncinfo, bsi_t *bsi, audblk_t *audblk)
+
+/**
+ *
+ **/
+
+int sanity_check(syncinfo_t *syncinfo, bsi_t *bsi, audblk_t *audblk)
 {
 	int i;
 
-	if(syncinfo->magic != AC3_MAGIC_NUMBER)
-	{
+	if(syncinfo->magic != AC3_MAGIC_NUMBER) {
 		fprintf(stderr,"\n** Sanity check failed -- syncinfo magic number **");
-		error_flag = 1;
+		return -1;
 	}
 	
-	if(bsi->magic != AC3_MAGIC_NUMBER)
-	{
+	if(bsi->magic != AC3_MAGIC_NUMBER) {
 		fprintf(stderr,"\n** Sanity check failed -- bsi magic number **");
-		error_flag = 1;
+		return -1;
 	}
 
-	if(audblk->magic1 != AC3_MAGIC_NUMBER)
-	{
+	if(audblk->magic1 != AC3_MAGIC_NUMBER) {
 		fprintf(stderr,"\n** Sanity check failed -- audblk magic number 1 **"); 
-		error_flag = 1;
+		return -1;
 	}
 
-	if(audblk->magic2 != AC3_MAGIC_NUMBER)
-	{
+	if(audblk->magic2 != AC3_MAGIC_NUMBER) {
 		fprintf(stderr,"\n** Sanity check failed -- audblk magic number 2 **"); 
-		error_flag = 1;
+		return -1;
 	}
 
-	if(audblk->magic3 != AC3_MAGIC_NUMBER)
-	{
+	if(audblk->magic3 != AC3_MAGIC_NUMBER) {
 		fprintf(stderr,"\n** Sanity check failed -- audblk magic number 3 **"); 
-		error_flag = 1;
+		return -1;
 	}
 
-	for(i = 0;i < 5 ; i++)
-	{
+	for(i = 0;i < 5 ; i++) {
 		if (audblk->fbw_exp[i][255] !=0 || audblk->fbw_exp[i][254] !=0 || 
-				audblk->fbw_exp[i][253] !=0)
-		{
+				audblk->fbw_exp[i][253] !=0) {
 			fprintf(stderr,"\n** Sanity check failed -- fbw_exp out of bounds **"); 
-			error_flag = 1;
+			return -1;
 		}
 
 		if (audblk->fbw_bap[i][255] !=0 || audblk->fbw_bap[i][254] !=0 || 
-				audblk->fbw_bap[i][253] !=0)
-		{
+				audblk->fbw_bap[i][253] !=0) {
 			fprintf(stderr,"\n** Sanity check failed -- fbw_bap out of bounds **"); 
-			error_flag = 1;
+			return -1;
 		}
 
 	}
 
 	if (audblk->cpl_exp[255] !=0 || audblk->cpl_exp[254] !=0 || 
-			audblk->cpl_exp[253] !=0)
-	{
+			audblk->cpl_exp[253] !=0) {
 		fprintf(stderr,"\n** Sanity check failed -- cpl_exp out of bounds **"); 
-		error_flag = 1;
+		return -1;
 	}
 
 	if (audblk->cpl_bap[255] !=0 || audblk->cpl_bap[254] !=0 || 
-			audblk->cpl_bap[253] !=0)
-	{
+			audblk->cpl_bap[253] !=0) {
 		fprintf(stderr,"\n** Sanity check failed -- cpl_bap out of bounds **"); 
-		error_flag = 1;
+		return -1;
 	}
 
-	if (audblk->cplmant[255] !=0 || audblk->cplmant[254] !=0 || 
-			audblk->cplmant[253] !=0)
-	{
+	if (audblk->cpl_flt[255] !=0 || audblk->cpl_flt[254] !=0 || 
+			audblk->cpl_flt[253] !=0) {
 		fprintf(stderr,"\n** Sanity check failed -- cpl_mant out of bounds **"); 
-		error_flag = 1;
+		return -1;
 	}
 
-	if ((audblk->cplinu == 1) && (audblk->cplbegf > (audblk->cplendf+2)))
-	{
+	if ((audblk->cplinu == 1) && (audblk->cplbegf > (audblk->cplendf+2))) {
 		fprintf(stderr,"\n** Sanity check failed -- cpl params inconsistent **"); 
-		error_flag = 1;
+		return -1;
 	}
 
-	for(i=0; i < bsi->nfchans; i++)
-	{
-		if((audblk->chincpl[i] == 0) && (audblk->chbwcod[i] > 60))
-		{
+	for(i=0; i < bsi->nfchans; i++) {
+		if((audblk->chincpl[i] == 0) && (audblk->chbwcod[i] > 60)) {
 			fprintf(stderr,"\n** Sanity check failed -- chbwcod too big **"); 
-			error_flag = 1;
+			return -1;
 		}
 	}
 
-	return;
+	return 0;
 }	
diff --git a/ac3dec/sanity_check.h b/ac3dec/sanity_check.h
index c4ca63a2..ead9399c 100644
--- a/ac3dec/sanity_check.h
+++ b/ac3dec/sanity_check.h
@@ -23,5 +23,5 @@
 
 #define AC3_MAGIC_NUMBER 0xdeadbeef
 
-void sanity_check_init(syncinfo_t *syncinfo, bsi_t *bsi, audblk_t *audblk);
-void sanity_check(syncinfo_t *syncinfo, bsi_t *bsi, audblk_t *audblk);
+void sanity_check_init (syncinfo_t *syncinfo, bsi_t *bsi, audblk_t *audblk);
+int sanity_check (syncinfo_t *syncinfo, bsi_t *bsi, audblk_t *audblk);
diff --git a/ac3dec/srfft.c b/ac3dec/srfft.c
new file mode 100644
index 00000000..4cbb6295
--- /dev/null
+++ b/ac3dec/srfft.c
@@ -0,0 +1,305 @@
+/* 
+ *  srfft.c
+ *
+ *  Copyright (C) Yuqing Deng <Yuqing_Deng@brown.edu> - April 2000
+ *
+ *  64 and 128 point split radix fft for ac3dec
+ *
+ *  The algorithm is desribed in the book:
+ *  "Computational Frameworks of the Fast Fourier Transform".
+ *
+ *  The ideas and the the organization of code borrowed from djbfft written by
+ *  D. J. Bernstein <djb@cr.py.to>.  djbff can be found at 
+ *  http://cr.yp.to/djbfft.html.
+ *
+ *  srfft.c is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  srfft.c is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <stdio.h>
+
+#include "srfft.h"
+#include "srfftp.h"
+
+void fft_8 (complex_t *x);
+
+void fft_4(complex_t *x)
+{
+  /* delta_p = 1 here */
+  /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4} 
+   */
+
+  register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i;
+  
+  yt_r = x[0].re;
+  yb_r = yt_r - x[2].re;
+  yt_r += x[2].re;
+
+  u_r = x[1].re;
+  vi_i = x[3].re - u_r;
+  u_r += x[3].re;
+  
+  u_i = x[1].im;
+  vi_r = u_i - x[3].im;
+  u_i += x[3].im;
+
+  yt_i = yt_r;
+  yt_i += u_r;
+  x[0].re = yt_i;
+  yt_r -= u_r;
+  x[2].re = yt_r;
+  yt_i = yb_r;
+  yt_i += vi_r;
+  x[1].re = yt_i;
+  yb_r -= vi_r;
+  x[3].re = yb_r;
+
+  yt_i = x[0].im;
+  yb_i = yt_i - x[2].im;
+  yt_i += x[2].im;
+
+  yt_r = yt_i;
+  yt_r += u_i;
+  x[0].im = yt_r;
+  yt_i -= u_i;
+  x[2].im = yt_i;
+  yt_r = yb_i;
+  yt_r += vi_i;
+  x[1].im = yt_r;
+  yb_i -= vi_i;
+  x[3].im = yb_i;
+}
+
+
+void fft_8 (complex_t *x)
+{
+  /* delta_p = diag{1, sqrt(i)} here */
+  /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8} 
+   */
+  register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i;
+  
+  wT1_r = x[1].re;
+  wT1_i = x[1].im;
+  wB1_r = x[3].re;
+  wB1_i = x[3].im;
+
+  x[1] = x[2];
+  x[2] = x[4];
+  x[3] = x[6];
+  fft_4(&x[0]);
+
+  
+  /* x[0] x[4] */
+  wT2_r = x[5].re;
+  wT2_r += x[7].re;
+  wT2_r += wT1_r;
+  wT2_r += wB1_r;
+  wT2_i = wT2_r;
+  wT2_r += x[0].re;
+  wT2_i = x[0].re - wT2_i;
+  x[0].re = wT2_r;
+  x[4].re = wT2_i;
+
+  wT2_i = x[5].im;
+  wT2_i += x[7].im;
+  wT2_i += wT1_i;
+  wT2_i += wB1_i;
+  wT2_r = wT2_i;
+  wT2_r += x[0].im;
+  wT2_i = x[0].im - wT2_i;
+  x[0].im = wT2_r;
+  x[4].im = wT2_i;
+  
+  /* x[2] x[6] */
+  wT2_r = x[5].im;
+  wT2_r -= x[7].im;
+  wT2_r += wT1_i;
+  wT2_r -= wB1_i;
+  wT2_i = wT2_r;
+  wT2_r += x[2].re;
+  wT2_i = x[2].re - wT2_i;
+  x[2].re = wT2_r;
+  x[6].re = wT2_i;
+
+  wT2_i = x[5].re;
+  wT2_i -= x[7].re;
+  wT2_i += wT1_r;
+  wT2_i -= wB1_r;
+  wT2_r = wT2_i;
+  wT2_r += x[2].im;
+  wT2_i = x[2].im - wT2_i;
+  x[2].im = wT2_i;
+  x[6].im = wT2_r;
+  
+
+  /* x[1] x[5] */
+  wT2_r = wT1_r;
+  wT2_r += wB1_i;
+  wT2_r -= x[5].re;
+  wT2_r -= x[7].im;
+  wT2_i = wT1_i;
+  wT2_i -= wB1_r;
+  wT2_i -= x[5].im;
+  wT2_i += x[7].re;
+
+  wB2_r = wT2_r;
+  wB2_r += wT2_i;
+  wT2_i -= wT2_r;
+  wB2_r *= HSQRT2;
+  wT2_i *= HSQRT2;
+  wT2_r = wB2_r;
+  wB2_r += x[1].re;
+  wT2_r =  x[1].re - wT2_r;
+
+  wB2_i = x[5].re;
+  x[1].re = wB2_r;
+  x[5].re = wT2_r;
+
+  wT2_r = wT2_i;
+  wT2_r += x[1].im;
+  wT2_i = x[1].im - wT2_i;
+  wB2_r = x[5].im;
+  x[1].im = wT2_r;
+  x[5].im = wT2_i;
+
+  /* x[3] x[7] */
+  wT1_r -= wB1_i;
+  wT1_i += wB1_r;
+  wB1_r = wB2_i - x[7].im;
+  wB1_i = wB2_r + x[7].re;
+  wT1_r -= wB1_r;
+  wT1_i -= wB1_i;
+  wB1_r = wT1_r + wT1_i;
+  wB1_r *= HSQRT2;
+  wT1_i -= wT1_r;
+  wT1_i *= HSQRT2;
+  wB2_r = x[3].re;
+  wB2_i = wB2_r + wT1_i;
+  wB2_r -= wT1_i;
+  x[3].re = wB2_i;
+  x[7].re = wB2_r;
+  wB2_i = x[3].im;
+  wB2_r = wB2_i + wB1_r;
+  wB2_i -= wB1_r;
+  x[3].im = wB2_i;
+  x[7].im = wB2_r;
+}
+
+
+void fft_asmb(int k, complex_t *x, complex_t *wTB,
+	     const complex_t *d, const complex_t *d_3)
+{
+  register complex_t  *x2k, *x3k, *x4k, *wB;
+  register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
+
+  x2k = x + 2 * k;
+  x3k = x2k + 2 * k;
+  x4k = x3k + 2 * k;
+  wB = wTB + 2 * k;
+  
+  TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]);
+  TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]);
+  
+  --k;
+  for(;;) {
+     TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]);
+     TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]);
+     if (!--k) break;
+     x += 2;
+     x2k += 2;
+     x3k += 2;
+     x4k += 2;
+     d += 2;
+     d_3 += 2;
+     wTB += 2;
+     wB += 2;
+  }
+ 
+}
+
+void fft_asmb16(complex_t *x, complex_t *wTB)
+{
+  register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
+  int k = 2;
+
+  /* transform x[0], x[8], x[4], x[12] */
+  TRANSZERO(x[0],x[4],x[8],x[12]);
+
+  /* transform x[1], x[9], x[5], x[13] */
+  TRANS(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]);
+
+  /* transform x[2], x[10], x[6], x[14] */
+  TRANSHALF_16(x[2],x[6],x[10],x[14]);
+
+  /* transform x[3], x[11], x[7], x[15] */
+  TRANS(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]);
+
+} 
+
+
+void fft_64p_c (complex_t *a)
+{
+  fft_8(&a[0]); fft_4(&a[8]); fft_4(&a[12]);
+  fft_asmb16(&a[0], &a[8]);
+  
+  fft_8(&a[16]), fft_8(&a[24]);
+  fft_asmb(4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
+
+  fft_8(&a[32]); fft_4(&a[40]); fft_4(&a[44]);
+  fft_asmb16(&a[32], &a[40]);
+
+  fft_8(&a[48]); fft_4(&a[56]); fft_4(&a[60]);
+  fft_asmb16(&a[48], &a[56]);
+
+  fft_asmb(8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
+}
+
+
+void fft_128p_c (complex_t *a)
+{
+  fft_8(&a[0]); fft_4(&a[8]); fft_4(&a[12]);
+  fft_asmb16(&a[0], &a[8]);
+  
+  fft_8(&a[16]), fft_8(&a[24]);
+  fft_asmb(4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
+
+  fft_8(&a[32]); fft_4(&a[40]); fft_4(&a[44]);
+  fft_asmb16(&a[32], &a[40]);
+
+  fft_8(&a[48]); fft_4(&a[56]); fft_4(&a[60]);
+  fft_asmb16(&a[48], &a[56]);
+
+  fft_asmb(8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
+
+  fft_8(&a[64]); fft_4(&a[72]); fft_4(&a[76]);
+  /* fft_16(&a[64]); */
+  fft_asmb16(&a[64], &a[72]);
+
+  fft_8(&a[80]); fft_8(&a[88]);
+  
+  /* fft_32(&a[64]); */
+  fft_asmb(4, &a[64], &a[80],&delta32[0], &delta32_3[0]);
+
+  fft_8(&a[96]); fft_4(&a[104]), fft_4(&a[108]);
+  /* fft_16(&a[96]); */
+  fft_asmb16(&a[96], &a[104]);
+
+  fft_8(&a[112]), fft_8(&a[120]);
+  /* fft_32(&a[96]); */
+  fft_asmb(4, &a[96], &a[112], &delta32[0], &delta32_3[0]);
+  
+  /* fft_128(&a[0]); */
+  fft_asmb(16, &a[0], &a[64], &delta128[0], &delta128_3[0]);
+}
diff --git a/ac3dec/srfft.h b/ac3dec/srfft.h
new file mode 100644
index 00000000..7916092a
--- /dev/null
+++ b/ac3dec/srfft.h
@@ -0,0 +1,39 @@
+/* 
+ *  srfft.h
+ *
+ *  Copyright (C) Yuqing Deng <Yuqing_Deng@brown.edu> - April 2000
+ *
+ *  64 and 128 point split radix fft for ac3dec
+ *
+ *  The algorithm is desribed in the book:
+ *  "Computational Frameworks of the Fast Fourier Transform".
+ *
+ *  The ideas and the the organization of code borrowed from djbfft written by
+ *  D. J. Bernstein <djb@cr.py.to>.  djbff can be found at 
+ *  http://cr.yp.to/djbfft.html.
+ *
+ *  srfft.h is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  srfft.h is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef SRFFT_H__
+#define SRFFT_H__
+
+#include "cmplx.h"
+
+void fft_64p_c (complex_t *x);
+void fft_128p_c (complex_t *x);
+
+#endif /* SRFFT_H__ */
diff --git a/ac3dec/srfft_kni.S b/ac3dec/srfft_kni.S
new file mode 100644
index 00000000..a42a41b1
--- /dev/null
+++ b/ac3dec/srfft_kni.S
@@ -0,0 +1,289 @@
+/* 
+ *  srfft_kni.S
+ *
+ *  Copyright (C) Yuqing Deng <Yuqing_Deng@brown.edu> - October 2000
+ *
+ *
+ *  srfft_kni.S is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  srfft_kni.S is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifdef __i386__
+
+.section	.rodata
+	.align 16
+hsqrt2:	 .float 0f0.707106781188
+	 .float 0f0.707106781188
+	 .float 0f-0.707106781188
+	 .float 0f-0.707106781188
+C_1:	 .float 0f-1.0
+	 .float 0f1.0
+	 .float 0f-1.0
+	 .float 0f1.0
+	
+.text
+	.align 4
+.global fft_4_kni
+	.type fft_4_kni, @function
+fft_4_kni:
+	pushl %ebp
+	movl %esp, %ebp
+	movl 8(%ebp), %eax		/* complex_t * */
+	
+	movaps (%eax), %xmm0		/* x[1] | x[0] */
+	movaps 16(%eax), %xmm2		/* x[3] | x[2] */
+	movaps %xmm0, %xmm1		/* x[1] | x[0] */
+	addps %xmm2, %xmm0		/* x[1] + x[3] | x[0] + x[2] */
+	subps %xmm2, %xmm1		/* x[1] - x[3] | x[0] - x[2] */
+	xorps %xmm6, %xmm6
+	movhlps %xmm1, %xmm4		/* x[1] - x[3] */
+	movhlps %xmm0, %xmm3		/* x[1] + x[3] */
+	subss %xmm4, %xmm6		/* -(x[1] - x[3]).re */
+	movlhps %xmm1, %xmm0		/* x[0] - x[2] | x[0] + x[2] */
+	movss %xmm6, %xmm4		/* (x[1] - x[3]).im | (x[3]-x[1]).re */
+	movaps %xmm0, %xmm2		/* x[0] - x[2] | x[0] + x[2] */
+	shufps $0x14, %xmm4, %xmm3	/* -i*(x[2] - x[3] | x[2] + x[3] */
+	addps %xmm3, %xmm0
+	subps %xmm3, %xmm2
+	movaps %xmm0, (%eax)
+	movaps %xmm2, 16(%eax)
+
+	leave
+	ret
+	
+
+	.align 4
+.global fft_8_kni
+	.type fft_8_kni, @function
+fft_8_kni:
+	pushl %ebp
+	movl %esp, %ebp
+	movl 8(%ebp), %eax		/* complext_t */
+
+	pushl %ebx
+	movlps (%eax), %xmm0		/* x[0] */
+	movlps 32(%eax), %xmm1		/* x[4] */
+	movhps 16(%eax), %xmm0		/* x[2] | x[0] */
+	movhps 48(%eax), %xmm1		/* x[6] | x[4] */
+	movaps %xmm0, %xmm2		/* x[2] | x[0] */
+	xorps %xmm3, %xmm3
+	addps %xmm1, %xmm0		/* x[2] + x[6] | x[0] + x[4] */
+	subps %xmm1, %xmm2		/* x[2] - x[6] | x[0] - x[4] */
+	movhlps %xmm0, %xmm5		/* x[2] + x[6] */
+	movhlps %xmm2, %xmm4
+	movlhps %xmm2, %xmm0		/* x[0] - x[4] | x[0] + x[4] */
+	subss %xmm4, %xmm3		/* -(x[2]-x[6]).re */
+	movaps %xmm0, %xmm7		/* x[0] - x[4] | x[0] + x[4] */
+	movss %xmm3, %xmm4		/* (x[2]-x[6]).im | -(x[2]-x[6]).re */
+	movlps 8(%eax), %xmm1		/* x[1] */
+	shufps $0x14, %xmm4, %xmm5	/* -i*(x[2] - x[6]) | x[2] + x[6] */
+
+	addps %xmm5, %xmm0		/* yt */
+	subps %xmm5, %xmm7		/* yb */
+
+	movhps 24(%eax), %xmm1		/* x[3] | x[1] */
+	movl $hsqrt2, %ebx
+	movlps 40(%eax), %xmm2		/* x[5] */
+	movhps 56(%eax), %xmm2		/* /x[7] | x[5] */
+	movaps %xmm1, %xmm3		/* x[3] | x[1] */
+	addps %xmm2, %xmm1		/* x[3] + x[7] | x[1] + x[5] */
+	subps %xmm2, %xmm3		/* x[3] - x[7] | x[1] - x[5] */
+	movaps (%ebx), %xmm4		/* -1/sqrt2 | -1/sqrt2 | 1/sqrt2 | 1/sqrt2 */
+	movaps %xmm3, %xmm6		/* x[3] - x[7] | x[1] - x[5] */
+	mulps %xmm4, %xmm3
+	shufps $0xc8, %xmm4, %xmm4	/* -1/sqrt2 | 1/sqrt2 | -1/sqrt2 | 1/sqrt2 */
+	shufps $0xb1, %xmm6, %xmm6
+	mulps %xmm4, %xmm6
+	addps %xmm3, %xmm6		/* (-1-i)/sqrt2 * (x[3]-x[7]) | (1-i)/sqrt2 * (x[1] - x[5] */
+	movhlps %xmm1, %xmm5		/* x[3] + x[7] */
+	movlhps %xmm6, %xmm1		/* (1+i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
+	shufps $0xe4, %xmm6, %xmm5	/* (-1-i)/sqrt2 * (x[3]-x[7]) | x[3]+x[7] */
+	movaps %xmm1, %xmm3		/* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
+	movl $C_1, %ebx
+	addps %xmm5, %xmm1		/* u */
+	subps %xmm5, %xmm3		/* v */
+	movaps %xmm0, %xmm2		/* yb */
+	movaps %xmm7, %xmm4		/* yt */
+	movaps (%ebx), %xmm5
+	mulps %xmm5, %xmm3
+	addps %xmm1, %xmm0		/* yt + u */
+	subps %xmm1, %xmm2		/* yt - u */
+	shufps $0xb1, %xmm3, %xmm3		/* -i * v */
+	movaps %xmm0, (%eax)
+	movaps %xmm2, 32(%eax)
+	addps %xmm3, %xmm4		/* yb - i*v */
+	subps %xmm3, %xmm7		/* yb + i*v */
+	movaps %xmm4, 16(%eax)
+	movaps %xmm7, 48(%eax)
+
+	popl %ebx
+	leave
+	ret
+
+	.align 4
+.global fft_asmb_kni 
+	.type fft_asmb, @function
+fft_asmb_kni:
+	pushl %ebp
+	movl %esp, %ebp
+
+	subl $4, %esp
+		
+	pushl %eax
+	pushl %ebx
+	pushl %ecx
+	pushl %edx
+	pushl %esi
+	pushl %edi
+
+	movl 8(%ebp),  %ecx		/* k */
+	movl 12(%ebp), %eax		/* x */
+	movl %ecx, -4(%ebp)		/* k */
+	movl 16(%ebp), %ebx		/* wT */
+	movl 20(%ebp), %edx		/* d */
+	movl 24(%ebp), %esi		/* d3 */
+	shll $4, %ecx			/* 16k */
+	addl $8, %edx
+	leal (%eax, %ecx, 2), %edi
+	addl $8, %esi
+	
+	/* TRANSZERO and TRANS */
+	movaps (%eax), %xmm0		/* x[1] | x[0] */
+	movaps (%ebx), %xmm1		/* wT[1] | wT[0] */
+	movaps (%ebx, %ecx), %xmm2	/* wB[1] | wB[0] */
+	movlps (%edx), %xmm3		/* d */
+	movlps (%esi), %xmm4		/* d3 */
+	movhlps %xmm1, %xmm5		/* wT[1] */
+	movhlps %xmm2, %xmm6		/* wB[1] */
+	shufps $0x50, %xmm3, %xmm3	/* d[1].im | d[1].im | d[1].re | d[1].re */
+	shufps $0x50, %xmm4, %xmm4	/* d3[1].im | d3[1].im | d3[i].re | d3[i].re */
+	movlhps %xmm5, %xmm5		/* wT[1] | wT[1] */
+	movlhps %xmm6, %xmm6		/* wB[1] | wB[1] */
+	mulps %xmm3, %xmm5
+	mulps %xmm4, %xmm6
+	movhlps %xmm5, %xmm7		/* wT[1].im * d[1].im | wT[1].re * d[1].im */
+	movlhps %xmm6, %xmm5		/* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */
+	shufps $0xb1, %xmm6, %xmm7	/* wB[1].re * d3[1].im | wB[i].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */
+	movl $C_1, %edi
+	movaps (%edi), %xmm4
+	mulps %xmm4, %xmm7
+	addps %xmm7, %xmm5		/* wB[1] * d3[1] | wT[1] * d[1] */
+	movlhps %xmm5, %xmm1		/* d[1] * wT[1] | wT[0] */
+	shufps $0xe4, %xmm5, %xmm2	/* d3[1] * wB[1] | wB[0] */
+	movaps %xmm1, %xmm3		/* d[1] * wT[1] | wT[0] */
+	leal (%eax, %ecx, 2), %edi
+	addps %xmm2, %xmm1		/* u */
+	subps %xmm2, %xmm3		/* v */
+	mulps %xmm4, %xmm3
+	movaps (%eax, %ecx), %xmm5	/* xk[1] | xk[0] */
+	shufps $0xb1, %xmm3, %xmm3	/* -i * v */
+	movaps %xmm0, %xmm2		/* x[1] | x[0] */
+	movaps %xmm5, %xmm6		/* xk[1] | xk[0] */
+	addps %xmm1, %xmm0
+	subps %xmm1, %xmm2
+	addps %xmm3, %xmm5
+	subps %xmm3, %xmm6
+	movaps %xmm0, (%eax)
+	movaps %xmm2, (%edi)
+	movaps %xmm5, (%eax, %ecx)
+	movaps %xmm6, (%edi, %ecx)
+	addl $16, %eax
+	addl $16, %ebx
+	addl $8, %edx
+	addl $8, %esi
+	decl -4(%ebp)
+
+.loop:
+	movaps (%ebx), %xmm0		/* wT[1] | wT[0] */
+	movaps (%edx), %xmm1		/* d[1] | d[0] */
+
+	movaps (%ebx, %ecx), %xmm4	/* wB[1] | wB[0] */
+	movaps (%esi), %xmm5		/* d3[1] | d3[0] */
+
+	movhlps %xmm0, %xmm2		/* wT[1] */
+	movhlps %xmm1, %xmm3		/* d[1] */
+
+	movhlps %xmm4, %xmm6		/* wB[1] */
+	movhlps %xmm5, %xmm7		/* d3[1] */
+
+	shufps $0x50, %xmm1, %xmm1	/* d[0].im | d[0].im | d[0].re | d[0].re */
+	shufps $0x50, %xmm3, %xmm3	/* d[1].im | d[1].im | d[1].re | d[1].re */
+
+	movlhps %xmm0, %xmm0		/* wT[0] | wT[0] */
+	shufps $0x50, %xmm5, %xmm5	/* d3[0].im | d3[0].im | d3[0].re | d3[0].re */
+	movlhps %xmm2, %xmm2		/* wT[1] | wT[1] */
+	shufps $0x50, %xmm7, %xmm7	/* d3[1].im | d3[1].im | d3[1].re | d3[1].re */
+
+	mulps %xmm1, %xmm0		/* d[0].im * wT[0].im | d[0].im * wT[0].re | d[0].re * wT[0].im | d[0].re * wT[0].re */
+	mulps %xmm3, %xmm2		/* d[1].im * wT[1].im | d[1].im * wT[1].re | d[1].re * wT[1].im | d[1].re * wT[1].re */
+	movlhps %xmm4, %xmm4		/* wB[0] | wB[0] */
+	movlhps %xmm6, %xmm6		/* wB[1] | wB[1] */
+
+	movhlps %xmm0, %xmm1		/* d[0].im * wT[0].im | d[0].im * wT[0].re */
+	movlhps %xmm2, %xmm0		/* d[1].re * wT[1].im | d[1].re * wT[1].re | d[0].re * wT[0].im | d[0].re * wT[0].re */
+	mulps %xmm5, %xmm4		/* wB[0].im * d3[0].im | wB[0].re * d3[0].im | wB[0].im * d3[0].re | wB[0].re * d3[0].re */
+	mulps %xmm7, %xmm6		/* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */
+	shufps $0xb1, %xmm2, %xmm1	/* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */
+	movl $C_1, %edi
+	movaps (%edi), %xmm3		/* 1.0 | -1.0 | 1.0 | -1.0 */
+
+	movhlps %xmm4, %xmm5		/* wB[0].im * d3[0].im | wB[0].re * d3[0].im */
+	mulps %xmm3, %xmm1		/* d[1].im * wT[1].re | -d[1].im * wT[1].im | d[0].im * wT[0].re | -d[0].im * wT[0].im */
+	movlhps %xmm6, %xmm4		/* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wB[0].im * d3[0].re | wB[0].im * d3[0].re */
+	addps %xmm1, %xmm0		/* wT[1] * d[1] | wT[0] * d[0] */
+
+	shufps $0xb1, %xmm6, %xmm5	/* wB[1].re * d3[1].im | wB[1].im * d3[1].im | wB[0].re * d3[0].im | wB[0].im * d3[0].im */
+	mulps %xmm3, %xmm5		/* wB[1].re * d3[1].im | -wB[1].im * d3[1].im | wB[0].re * d3[0].im | -wB[0].im * d3[0].im */
+	addps %xmm5, %xmm4		/* wB[1] * d3[1] | wB[0] * d3[0] */
+
+	movaps %xmm0, %xmm1		/* wT[1] * d[1] | wT[0] * d[0] */
+	addps %xmm4, %xmm0		/* u */
+	subps %xmm4, %xmm1		/* v */
+	movaps (%eax), %xmm6		/* x[1] | x[0] */
+	leal (%eax, %ecx, 2), %edi
+	mulps %xmm3, %xmm1
+	addl $16, %ebx
+	addl $16, %esi
+	shufps $0xb1, %xmm1, %xmm1	/* -i * v */
+	movaps (%eax, %ecx), %xmm7	/* xk[1] | xk[0] */
+	movaps %xmm6, %xmm2
+	movaps %xmm7, %xmm4
+	addps %xmm0, %xmm6
+	subps %xmm0, %xmm2
+	movaps %xmm6, (%eax)
+	movaps %xmm2, (%edi)
+	addps %xmm1, %xmm7
+	subps %xmm1, %xmm4
+	addl $16, %edx
+	movaps %xmm7, (%eax, %ecx)
+	movaps %xmm4, (%edi, %ecx)
+
+	addl $16, %eax
+	decl -4(%ebp)
+	jnz .loop
+
+.end:		
+	popl %edi	
+	popl %esi
+	popl %edx
+	popl %ecx
+	popl %ebx
+	popl %eax
+	
+	addl $4, %esp
+		
+	leave
+	ret
+#endif
diff --git a/ac3dec/srfft_kni.h b/ac3dec/srfft_kni.h
new file mode 100644
index 00000000..6dc468e3
--- /dev/null
+++ b/ac3dec/srfft_kni.h
@@ -0,0 +1,30 @@
+/*****
+*
+* This file is part of the OMS program.
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by 
+* the Free Software Foundation; either version 2, or (at your option)
+* any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; see the file COPYING.  If not, write to
+* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+*****/
+
+#ifndef __SRFF_KNI__
+#define __SRFF_KNI__
+
+#include "cmplx.h"
+
+void fft_4_kni (complex_t *a);
+void fft_8_kni (complex_t *a);
+void fft_asmb_kni (int, complex_t*, complex_t *, complex_t *, complex_t*);
+
+#endif
diff --git a/ac3dec/srfft_kni_c.c b/ac3dec/srfft_kni_c.c
new file mode 100644
index 00000000..d461110f
--- /dev/null
+++ b/ac3dec/srfft_kni_c.c
@@ -0,0 +1,93 @@
+/* 
+ *  srfft_kni.c
+ *
+ *  Copyright (C) Yuqing Deng <Yuqing_Deng@brown.edu> - April 2000
+ *
+ *  64 and 128 point split radix fft for ac3dec
+ *
+ *  The algorithm is desribed in the book:
+ *  "Computational Frameworks of the Fast Fourier Transform".
+ *
+ *  The ideas and the the organization of code borrowed from djbfft written by
+ *  D. J. Bernstein <djb@cr.py.to>.  djbff can be found at 
+ *  http://cr.yp.to/djbfft.html.
+ *
+ *  srfft.c is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  srfft.c is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifdef __i386__
+
+#include <stdio.h>
+
+#include "srfft_kni.h"
+#include "srfftp.h"
+
+void fft_64p_kni(complex_t *a)
+{
+	fft_8_kni(&a[0]); fft_4_kni(&a[8]); fft_4_kni(&a[12]);
+	fft_asmb_kni(2, &a[0], &a[8], &delta16[0], &delta16_3[0]);
+  
+	fft_8_kni(&a[16]), fft_8_kni(&a[24]);
+	fft_asmb_kni(4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
+
+	fft_8_kni(&a[32]); fft_4_kni(&a[40]); fft_4_kni(&a[44]);
+	fft_asmb_kni(2, &a[32], &a[40], &delta16[0], &delta16_3[0]);
+
+	fft_8_kni(&a[48]); fft_4_kni(&a[56]); fft_4_kni(&a[60]);
+	fft_asmb_kni(2, &a[48], &a[56], &delta16[0], &delta16_3[0]);
+
+	fft_asmb_kni(8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
+}
+
+
+void fft_128p_kni(complex_t *a)
+{
+	fft_8_kni(&a[0]); fft_4_kni(&a[8]); fft_4_kni(&a[12]);
+	fft_asmb_kni(2, &a[0], &a[8], &delta16[0], &delta16_3[0]);
+  
+	fft_8_kni(&a[16]), fft_8_kni(&a[24]);
+	fft_asmb_kni(4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
+
+	fft_8_kni(&a[32]); fft_4_kni(&a[40]); fft_4_kni(&a[44]);
+	fft_asmb_kni(2, &a[32], &a[40], &delta16[0], &delta16_3[0]);
+
+	fft_8_kni(&a[48]); fft_4_kni(&a[56]); fft_4_kni(&a[60]);
+	fft_asmb_kni(2, &a[48], &a[56], &delta16[0], &delta16_3[0]);
+
+	fft_asmb_kni(8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
+
+	fft_8_kni(&a[64]); fft_4_kni(&a[72]); fft_4_kni(&a[76]);
+	/* fft_16(&a[64]); */
+	fft_asmb_kni(2, &a[64], &a[72], &delta16[0], &delta16_3[0]);
+
+	fft_8_kni(&a[80]); fft_8_kni(&a[88]);
+  
+	/* fft_32(&a[64]); */
+	fft_asmb_kni(4, &a[64], &a[80],&delta32[0], &delta32_3[0]);
+
+	fft_8_kni(&a[96]); fft_4_kni(&a[104]), fft_4_kni(&a[108]);
+	/* fft_16(&a[96]); */
+	fft_asmb_kni(2, &a[96], &a[104], &delta16[0], &delta16_3[0]);
+
+	fft_8_kni(&a[112]), fft_8_kni(&a[120]);
+	/* fft_32(&a[96]); */
+	fft_asmb_kni(4, &a[96], &a[112], &delta32[0], &delta32_3[0]);
+  
+	/* fft_128(&a[0]); */
+	fft_asmb_kni(16, &a[0], &a[64], &delta128[0], &delta128_3[0]);
+}
+
+#endif
diff --git a/ac3dec/srfftp.h b/ac3dec/srfftp.h
new file mode 100644
index 00000000..6f447153
--- /dev/null
+++ b/ac3dec/srfftp.h
@@ -0,0 +1,305 @@
+
+/* 
+ *  srfftp.h
+ *
+ *  Copyright (C) Yuqing Deng <Yuqing_Deng@brown.edu> - April 2000
+ *
+ *  64 and 128 point split radix fft for ac3dec
+ *
+ *  The algorithm is desribed in the book:
+ *  "Computational Frameworks of the Fast Fourier Transform".
+ *
+ *  The ideas and the the organization of code borrowed from djbfft written by
+ *  D. J. Bernstein <djb@cr.py.to>.  djbff can be found at 
+ *  http://cr.yp.to/djbfft.html.
+ *
+ *  srfftp.h is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  srfftp.h is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef SRFFTP_H__
+#define SRFFTP_H__
+
+#include "cmplx.h"
+
+static complex_t delta16[4] = 
+ { {1.00000000000000,  0.00000000000000},
+   {0.92387953251129, -0.38268343236509},
+   {0.70710678118655, -0.70710678118655},
+   {0.38268343236509, -0.92387953251129}};
+
+static complex_t delta16_3[4] = 
+ { {1.00000000000000,  0.00000000000000},
+   {0.38268343236509, -0.92387953251129},
+   {-0.70710678118655, -0.70710678118655},
+   {-0.92387953251129, 0.38268343236509}};
+
+static complex_t delta32[8] = 
+ { {1.00000000000000,  0.00000000000000},
+   {0.98078528040323, -0.19509032201613},
+   {0.92387953251129, -0.38268343236509},
+   {0.83146961230255, -0.55557023301960},
+   {0.70710678118655, -0.70710678118655},
+   {0.55557023301960, -0.83146961230255},
+   {0.38268343236509, -0.92387953251129},
+   {0.19509032201613, -0.98078528040323}};
+
+static complex_t delta32_3[8] = 
+ { {1.00000000000000,  0.00000000000000},
+   {0.83146961230255, -0.55557023301960},
+   {0.38268343236509, -0.92387953251129},
+   {-0.19509032201613, -0.98078528040323},
+   {-0.70710678118655, -0.70710678118655},
+   {-0.98078528040323, -0.19509032201613},
+   {-0.92387953251129, 0.38268343236509},
+   {-0.55557023301960, 0.83146961230255}};
+
+static complex_t delta64[16] = 
+ { {1.00000000000000,  0.00000000000000},
+   {0.99518472667220, -0.09801714032956},
+   {0.98078528040323, -0.19509032201613},
+   {0.95694033573221, -0.29028467725446},
+   {0.92387953251129, -0.38268343236509},
+   {0.88192126434836, -0.47139673682600},
+   {0.83146961230255, -0.55557023301960},
+   {0.77301045336274, -0.63439328416365},
+   {0.70710678118655, -0.70710678118655},
+   {0.63439328416365, -0.77301045336274},
+   {0.55557023301960, -0.83146961230255},
+   {0.47139673682600, -0.88192126434835},
+   {0.38268343236509, -0.92387953251129},
+   {0.29028467725446, -0.95694033573221},
+   {0.19509032201613, -0.98078528040323},
+   {0.09801714032956, -0.99518472667220}};
+
+static complex_t delta64_3[16] = 
+ { {1.00000000000000,  0.00000000000000},
+   {0.95694033573221, -0.29028467725446},
+   {0.83146961230255, -0.55557023301960},
+   {0.63439328416365, -0.77301045336274},
+   {0.38268343236509, -0.92387953251129},
+   {0.09801714032956, -0.99518472667220},
+   {-0.19509032201613, -0.98078528040323},
+   {-0.47139673682600, -0.88192126434836},
+   {-0.70710678118655, -0.70710678118655},
+   {-0.88192126434835, -0.47139673682600},
+   {-0.98078528040323, -0.19509032201613},
+   {-0.99518472667220, 0.09801714032956},
+   {-0.92387953251129, 0.38268343236509},
+   {-0.77301045336274, 0.63439328416365},
+   {-0.55557023301960, 0.83146961230255},
+   {-0.29028467725446, 0.95694033573221}};
+
+static complex_t delta128[32] = 
+ { {1.00000000000000,  0.00000000000000},
+   {0.99879545620517, -0.04906767432742},
+   {0.99518472667220, -0.09801714032956},
+   {0.98917650996478, -0.14673047445536},
+   {0.98078528040323, -0.19509032201613},
+   {0.97003125319454, -0.24298017990326},
+   {0.95694033573221, -0.29028467725446},
+   {0.94154406518302, -0.33688985339222},
+   {0.92387953251129, -0.38268343236509},
+   {0.90398929312344, -0.42755509343028},
+   {0.88192126434836, -0.47139673682600},
+   {0.85772861000027, -0.51410274419322},
+   {0.83146961230255, -0.55557023301960},
+   {0.80320753148064, -0.59569930449243},
+   {0.77301045336274, -0.63439328416365},
+   {0.74095112535496, -0.67155895484702},
+   {0.70710678118655, -0.70710678118655},
+   {0.67155895484702, -0.74095112535496},
+   {0.63439328416365, -0.77301045336274},
+   {0.59569930449243, -0.80320753148064},
+   {0.55557023301960, -0.83146961230255},
+   {0.51410274419322, -0.85772861000027},
+   {0.47139673682600, -0.88192126434835},
+   {0.42755509343028, -0.90398929312344},
+   {0.38268343236509, -0.92387953251129},
+   {0.33688985339222, -0.94154406518302},
+   {0.29028467725446, -0.95694033573221},
+   {0.24298017990326, -0.97003125319454},
+   {0.19509032201613, -0.98078528040323},
+   {0.14673047445536, -0.98917650996478},
+   {0.09801714032956, -0.99518472667220},
+   {0.04906767432742, -0.99879545620517}};
+
+static complex_t delta128_3[32] = 
+ { {1.00000000000000,  0.00000000000000},
+   {0.98917650996478, -0.14673047445536},
+   {0.95694033573221, -0.29028467725446},
+   {0.90398929312344, -0.42755509343028},
+   {0.83146961230255, -0.55557023301960},
+   {0.74095112535496, -0.67155895484702},
+   {0.63439328416365, -0.77301045336274},
+   {0.51410274419322, -0.85772861000027},
+   {0.38268343236509, -0.92387953251129},
+   {0.24298017990326, -0.97003125319454},
+   {0.09801714032956, -0.99518472667220},
+   {-0.04906767432742, -0.99879545620517},
+   {-0.19509032201613, -0.98078528040323},
+   {-0.33688985339222, -0.94154406518302},
+   {-0.47139673682600, -0.88192126434836},
+   {-0.59569930449243, -0.80320753148065},
+   {-0.70710678118655, -0.70710678118655},
+   {-0.80320753148065, -0.59569930449243},
+   {-0.88192126434835, -0.47139673682600},
+   {-0.94154406518302, -0.33688985339222},
+   {-0.98078528040323, -0.19509032201613},
+   {-0.99879545620517, -0.04906767432742},
+   {-0.99518472667220, 0.09801714032956},
+   {-0.97003125319454, 0.24298017990326},
+   {-0.92387953251129, 0.38268343236509},
+   {-0.85772861000027, 0.51410274419322},
+   {-0.77301045336274, 0.63439328416365},
+   {-0.67155895484702, 0.74095112535496},
+   {-0.55557023301960, 0.83146961230255},
+   {-0.42755509343028, 0.90398929312344},
+   {-0.29028467725446, 0.95694033573221},
+   {-0.14673047445536, 0.98917650996478}};
+
+#define HSQRT2 0.707106781188;
+
+#define TRANSZERO(A0,A4,A8,A12) { \
+  u_r = wTB[0].re; \
+  v_i = u_r - wTB[k*2].re; \
+  u_r += wTB[k*2].re; \
+  u_i = wTB[0].im; \
+  v_r = wTB[k*2].im - u_i; \
+  u_i += wTB[k*2].im; \
+  a_r = A0.re; \
+  a_i = A0.im; \
+  a1_r = a_r; \
+  a1_r += u_r; \
+  A0.re = a1_r; \
+  a_r -= u_r; \
+  A8.re = a_r; \
+  a1_i = a_i; \
+  a1_i += u_i; \
+  A0.im = a1_i; \
+  a_i -= u_i; \
+  A8.im = a_i; \
+  a1_r = A4.re; \
+  a1_i = A4.im; \
+  a_r = a1_r; \
+  a_r -= v_r; \
+  A4.re = a_r; \
+  a1_r += v_r; \
+  A12.re = a1_r; \
+  a_i = a1_i; \
+  a_i -= v_i; \
+  A4.im = a_i; \
+  a1_i += v_i; \
+  A12.im = a1_i; \
+  }
+
+#define TRANSHALF_16(A2,A6,A10,A14) {\
+  u_r = wTB[2].re; \
+  a_r = u_r; \
+  u_i = wTB[2].im; \
+  u_r += u_i; \
+  u_i -= a_r; \
+  a_r = wTB[6].re; \
+  a1_r = a_r; \
+  a_i = wTB[6].im; \
+  a_r = a_i - a_r; \
+  a_i += a1_r; \
+  v_i = u_r - a_r; \
+  u_r += a_r; \
+  v_r = u_i + a_i; \
+  u_i -= a_i; \
+  v_i *= HSQRT2; \
+  v_r *= HSQRT2; \
+  u_r *= HSQRT2; \
+  u_i *= HSQRT2; \
+  a_r = A2.re; \
+  a_i = A2.im; \
+  a1_r = a_r; \
+  a1_r += u_r; \
+  A2.re = a1_r; \
+  a_r -= u_r; \
+  A10.re = a_r; \
+  a1_i = a_i; \
+  a1_i += u_i; \
+  A2.im = a1_i; \
+  a_i -= u_i; \
+  A10.im = a_i; \
+  a1_r = A6.re; \
+  a1_i = A6.im;  \
+  a_r = a1_r; \
+  a1_r += v_r; \
+  A6.re = a1_r; \
+  a_r -= v_r; \
+  A14.re = a_r; \
+  a_i = a1_i; \
+  a1_i -= v_i; \
+  A6.im = a1_i; \
+  a_i += v_i; \
+  A14.im = a_i; \
+  }
+
+#define TRANS(A1,A5,A9,A13,WT,WB,D,D3) { \
+  u_r = WT.re; \
+  a_r = u_r; \
+  a_r *= D.im; \
+  u_r *= D.re; \
+  a_i = WT.im; \
+  a1_i = a_i; \
+  a1_i *= D.re; \
+  a_i *= D.im; \
+  u_r -= a_i; \
+  u_i = a_r; \
+  u_i += a1_i; \
+  a_r = WB.re; \
+  a1_r = a_r; \
+  a1_r *= D3.re; \
+  a_r *= D3.im; \
+  a_i = WB.im; \
+  a1_i = a_i; \
+  a_i *= D3.re; \
+  a1_i *= D3.im; \
+  a1_r -= a1_i; \
+  a_r += a_i; \
+  v_i = u_r - a1_r; \
+  u_r += a1_r; \
+  v_r = a_r - u_i; \
+  u_i += a_r; \
+  a_r = A1.re; \
+  a_i = A1.im; \
+  a1_r = a_r; \
+  a1_r += u_r; \
+  A1.re = a1_r; \
+  a_r -= u_r; \
+  A9.re = a_r; \
+  a1_i = a_i; \
+  a1_i += u_i; \
+  A1.im = a1_i; \
+  a_i -= u_i; \
+  A9.im = a_i; \
+  a1_r = A5.re; \
+  a1_i = A5.im;  \
+  a_r = a1_r; \
+  a1_r -= v_r; \
+  A5.re = a1_r; \
+  a_r += v_r; \
+  A13.re = a_r; \
+  a_i = a1_i; \
+  a1_i -= v_i; \
+  A5.im = a1_i; \
+  a_i += v_i; \
+  A13.im = a_i; \
+  }
+
+#endif
diff --git a/ac3dec/stats.c b/ac3dec/stats.c
index cfeb453f..4bdad09a 100644
--- a/ac3dec/stats.c
+++ b/ac3dec/stats.c
@@ -28,16 +28,16 @@
 #include "ac3_internal.h"
 
 
-#include "decode.h"
 #include "stats.h"
 #include "debug.h"
 
-
+#if !defined (__GNUC__) || defined (DEBUG)
 static const char *service_ids[8] = 
 {
 	"CM","ME","VI","HI",
 	 "D", "C","E", "VO"
 };
+#endif
 
 struct mixlev_s
 {
@@ -77,9 +77,10 @@ static const char *language[128] =
 	"Bengali", "Belorussian", "Bambora", "Azerbijani", "Assamese", "Armenian", "Arabic", "Amharic"
 };
 
+
 void stats_print_banner(syncinfo_t *syncinfo,bsi_t *bsi)
 {
-	fprintf(stdout,"ac3dec-0.6.2-cvs (C) 2000 Aaron Holtzman (aholtzma@ess.engr.uvic.ca)\n");
+  //	fprintf(stdout,PACKAGE"-"VERSION" (C) 2000 Aaron Holtzman (aholtzma@ess.engr.uvic.ca)\n");
 
 	fprintf(stdout,"%d.%d Mode ",bsi->nfchans,bsi->lfeon);
 	fprintf(stdout,"%2.1f KHz",syncinfo->sampling_rate * 1e-3);
@@ -87,8 +88,7 @@ void stats_print_banner(syncinfo_t *syncinfo,bsi_t *bsi)
 	if (bsi->langcode && (bsi->langcod < 128))
 		fprintf(stdout,"%s ", language[bsi->langcod]);
 
-	switch(bsi->bsmod)
-	{
+	switch(bsi->bsmod) {
 		case 0:
 			fprintf(stdout,"Complete Main Audio Service");
 			break;
@@ -116,12 +116,12 @@ void stats_print_banner(syncinfo_t *syncinfo,bsi_t *bsi)
 	fprintf(stdout,"\n");
 }
 
-void stats_print_syncinfo(syncinfo_t *syncinfo)
+
+void stats_print_syncinfo (syncinfo_t *syncinfo)
 {
 	dprintf("(syncinfo) ");
 	
-	switch (syncinfo->fscod)
-	{
+	switch (syncinfo->fscod) {
 		case 2:
 			dprintf("32 KHz   ");
 			break;
@@ -140,9 +140,9 @@ void stats_print_syncinfo(syncinfo_t *syncinfo)
 			syncinfo->frame_size);
 
 }
+
 	
-void stats_print_bsi(bsi_t *bsi)
-{
+void stats_print_bsi(bsi_t *bsi) {
 	dprintf("(bsi) ");
 	dprintf("%s",service_ids[bsi->bsmod]);
 	dprintf(" %d.%d Mode ",bsi->nfchans,bsi->lfeon);
@@ -154,11 +154,11 @@ void stats_print_bsi(bsi_t *bsi)
 
 }
 
+
 char *exp_strat_tbl[4] = {"R   ","D15 ","D25 ","D45 "};
 
-void stats_print_audblk(bsi_t *bsi,audblk_t *audblk)
-{
-	uint_32 i;
+void stats_print_audblk(bsi_t *bsi,audblk_t *audblk) {
+	uint32_t i;
 
 	dprintf("(audblk) ");
 	dprintf("%s ",audblk->cplinu ? "cpl on " : "cpl off");