From 2bb2802458a4d057b6cda5d0c81df856f78eb2e1 Mon Sep 17 00:00:00 2001
From: Aikku93 <aik@aol.com.au>
Date: Wed, 5 Oct 2022 18:57:09 +1100
Subject: [PATCH] SPU: Logic re-write

---
 desmume/src/NDSSystem.cpp                  |    9 +-
 desmume/src/NDSSystem.h                    |    6 +-
 desmume/src/SPU.cpp                        | 1229 ++++++++++++--------
 desmume/src/SPU.h                          |   49 +-
 desmume/src/frontend/windows/soundView.cpp |   24 +-
 5 files changed, 764 insertions(+), 553 deletions(-)

diff --git a/desmume/src/NDSSystem.cpp b/desmume/src/NDSSystem.cpp
index 58cbb57d9..99c8f3cf5 100644
--- a/desmume/src/NDSSystem.cpp
+++ b/desmume/src/NDSSystem.cpp
@@ -1422,9 +1422,12 @@ static void execHardware_hblank()
 
 	//emulation housekeeping. for some reason we always do this at hblank,
 	//even though it sounds more reasonable to do it at hstart
-	SPU_Emulate_core();
-	driver->AVI_SoundUpdate(SPU_core->outbuf,spu_core_samples);
-	WAV_WavSoundUpdate(SPU_core->outbuf,spu_core_samples);
+	int coreSamples = SPU_Emulate_core(355*6); // HLine = 355 dots @ 6c/dot
+	if(coreSamples)
+	{
+		driver->AVI_SoundUpdate(SPU_core->outbuf,coreSamples);
+		WAV_WavSoundUpdate(SPU_core->outbuf,coreSamples);
+	}
 }
 
 static void execHardware_hstart_vblankEnd()
diff --git a/desmume/src/NDSSystem.h b/desmume/src/NDSSystem.h
index f2af566d4..4dbbd779d 100644
--- a/desmume/src/NDSSystem.h
+++ b/desmume/src/NDSSystem.h
@@ -530,6 +530,7 @@ extern struct TCommonSettings
 		, spuInterpolationMode(2)
 		, manualBackupType(0)
 		, autodetectBackupMethod(0)
+		, spu_muteChannels(0)
 		, spu_captureMuted(false)
 		, spu_advanced(true)
 		, StylusPressure(50)
@@ -543,9 +544,6 @@ extern struct TCommonSettings
 		strcpy(ARM7BIOS, "biosnds7.bin");
 		strcpy(ExtFirmwarePath, "firmware.bin");
 
-		for(int i=0;i<16;i++)
-			spu_muteChannels[i] = false;
-
 		for(int g=0;g<2;g++)
 			for(int x=0;x<5;x++)
 				dispLayers[g][x]=true;
@@ -652,7 +650,7 @@ extern struct TCommonSettings
 	int SPU_sync_mode;
 	int SPU_sync_method;
 
-	bool spu_muteChannels[16];
+	u16  spu_muteChannels;
 	bool spu_captureMuted;
 	bool spu_advanced;
 
diff --git a/desmume/src/SPU.cpp b/desmume/src/SPU.cpp
index 6c15e71cc..1d03e4296 100644
--- a/desmume/src/SPU.cpp
+++ b/desmume/src/SPU.cpp
@@ -48,7 +48,14 @@ static inline s16 read16(u32 addr) { return (s16)_MMU_read16<ARMCPU_ARM7,MMU_AT_
 static inline u8 read08(u32 addr) { return _MMU_read08<ARMCPU_ARM7,MMU_AT_DEBUG>(addr); }
 static inline s8 read_s8(u32 addr) { return (s8)_MMU_read08<ARMCPU_ARM7,MMU_AT_DEBUG>(addr); }
 
-#define K_ADPCM_LOOPING_RECOVERY_INDEX 99999
+// Disabling capture when _currentSNDCore == SNDDummy can increase
+// performance by disabling all mixing, but could potentially cause
+// problems if the software is relying on the capture output. By
+// default, we disable all mixing only when capture isn't running,
+// as this is guaranteed to be safe.
+#define ENABLE_DUMMY_SPU_CAPTURE 1
+
+#define K_ADPCM_LOOPING_RECOVERY_INDEX 255
 
 #define CATMULLROM_INTERPOLATION_RESOLUTION_BITS 11
 #define CATMULLROM_INTERPOLATION_RESOLUTION (1<<CATMULLROM_INTERPOLATION_RESOLUTION_BITS)
@@ -56,12 +63,8 @@ static inline s8 read_s8(u32 addr) { return (s8)_MMU_read08<ARMCPU_ARM7,MMU_AT_D
 #define COSINE_INTERPOLATION_RESOLUTION_BITS 13
 #define COSINE_INTERPOLATION_RESOLUTION (1<<COSINE_INTERPOLATION_RESOLUTION_BITS)
 
-#define SPUCHAN_PCM16B_AT(x) ((u32)(x) % SPUINTERPOLATION_TAPS)
-
-//#ifdef FASTBUILD
-	#undef FORCEINLINE
-	#define FORCEINLINE
-//#endif
+#define SPUCHAN_PCM16B_AT(x)    ((x) & (SPUCHAN_PCM16B_SIZE -1))
+#define SPUCAPTURE_PCM16B_AT(x) ((x) & (SPUCAPTURE_FIFO_SIZE-1))
 
 //static ISynchronizingAudioBuffer* _currentSynchronizer = metaspu_construct(ESynchMethod_Z);
 static ISynchronizingAudioBuffer* _currentSynchronizer = metaspu_construct(ESynchMethod_N);
@@ -108,12 +111,9 @@ static u8 precalcindextbl[89][8];
 static u16 catmullrom_lut[CATMULLROM_INTERPOLATION_RESOLUTION][4];
 static u16 cos_lut[COSINE_INTERPOLATION_RESOLUTION];
 
-static const double ARM7_CLOCK = 33513982;
+static const u32 ARM7_CLOCK = 33513982;
 
-static const double samples_per_hline = (DESMUME_SAMPLE_RATE / 59.8261f) / 263.0f;
-
-static double _samples = 0;
-int spu_core_samples = 0;
+static u32 _spu_core_cyclesCounter = 0;
 
 template<typename T>
 static FORCEINLINE T MinMax(T val, T min, T max)
@@ -211,19 +211,19 @@ int SPU_Init(int coreid, int newBufferSizeBytes)
 		// If we wanted to, we could stick entirely to integer maths
 		// here, but I doubt it's worth the hassle.
 		double x = i / (double)CATMULLROM_INTERPOLATION_RESOLUTION;
-		double a = x*(x*(-x + 2) - 1);
+		double a = x*(x*(x - 2) + 1);
 		double b = x*x*(3*x - 5) + 2;
 		double c = x*(x*(-3*x + 4) + 1);
-		double d = x*x*(x - 1);
-		catmullrom_lut[i][0] = (u16)floor((1u<<15) * -0.5*a);
-		catmullrom_lut[i][1] = (u16)floor((1u<<15) *  0.5*b);
-		catmullrom_lut[i][2] = (u16)floor((1u<<15) *  0.5*c);
-		catmullrom_lut[i][3] = (u16)floor((1u<<15) * -0.5*d);
+		double d = x*x*(1 - x);
+		catmullrom_lut[i][0] = (u16)floor((double)(1<<15) * 0.5*a);
+		catmullrom_lut[i][1] = (u16)floor((double)(1<<15) * 0.5*b);
+		catmullrom_lut[i][2] = (u16)floor((double)(1<<15) * 0.5*c);
+		catmullrom_lut[i][3] = (u16)floor((double)(1<<15) * 0.5*d);
 	}
 	for (size_t i = 0; i < COSINE_INTERPOLATION_RESOLUTION; i++)
-		cos_lut[i] = (u16)floor((1u<<16) * ((1.0 - cos(((double)i/(double)COSINE_INTERPOLATION_RESOLUTION) * M_PI)) * 0.5));
+		cos_lut[i] = (u16)floor((double)(1<<16) * ((1.0 - cos(((double)i/(double)COSINE_INTERPOLATION_RESOLUTION) * M_PI)) * 0.5));
 
-	SPU_core = new SPU_struct((int)ceil(samples_per_hline));
+	SPU_core = new SPU_struct();
 	SPU_Reset();
 
 	//create adpcm decode accelerator lookups
@@ -285,7 +285,7 @@ void SPU_SetSynchMode(int mode, int method)
 		
 	if (_currentSynchMode == ESynchMode_DualSynchAsynch)
 	{
-		SPU_user = new SPU_struct(_currentBufferSize);
+		SPU_user = new SPU_struct();
 		SPU_CloneUser();
 	}
 }
@@ -327,15 +327,14 @@ void SPU_Reset(void)
 	for (i = 0x400; i < 0x51D; i++)
 		T1WriteByte(MMU.ARM7_REG, i, 0);
 
-	_samples = 0;
+	_spu_core_cyclesCounter = 0;
 }
 
 //------------------------------------------
 
 void SPU_struct::reset()
 {
-	memset(sndbuf,0,bufsize*2*4);
-	memset(outbuf,0,bufsize*2*2);
+	if(outbuf) memset(outbuf,0,bufsize*sizeof(s16)*2); // the constructor calls reset() while outbuf is still NULL; memset(NULL,..) is UB even for 0 bytes
 
 	memset((void *)channels, 0, sizeof(channel_struct) * 16);
 
@@ -347,22 +346,33 @@ void SPU_struct::reset()
 	}
 }
 
-SPU_struct::SPU_struct(int buffersize)
-	: bufpos(0)
-	, buflength(0)
-	, sndbuf(0)
-	, outbuf(0)
-	, bufsize(buffersize)
+void SPU_struct::resizeBuffer(int buffersize)
 {
-	sndbuf = new s32[buffersize*2];
-	outbuf = new s16[buffersize*2];
+	if(outbuf) delete[] outbuf;
+	outbuf = new s16[(size_t)buffersize*2];
+	bufsize = buffersize;
+}
+
+SPU_struct::SPU_struct()
+	: outbuf(NULL)
+	, bufsize(0)
+{
+	// mixdata[] must be able to contain:
+	// struct {
+	//   s32 mixbuf     [N][2]
+	//   s32 mutedmixbuf[N][2]
+	//   s16 capbuf     [N][2]
+	//   s16 chanbuf    [N][2]
+	// };
+	// where N is at most SPUCAPTURE_FIFO_SIZE
+	mixdata = new s32[SPUCAPTURE_FIFO_SIZE * (sizeof(s32)+sizeof(s32)+sizeof(s16)+sizeof(s16))*2 / sizeof(s32)];
 	reset();
 }
 
 SPU_struct::~SPU_struct()
 {
-	if(sndbuf) delete[] sndbuf;
-	if(outbuf) delete[] outbuf;
+	if(mixdata) delete[] mixdata;
+	if(outbuf)  delete[] outbuf;
 }
 
 void SPU_DeInit(void)
@@ -383,12 +393,13 @@ void SPU_struct::ShutUp()
 		 channels[i].status = CHANSTAT_STOPPED;
 }
 
-static FORCEINLINE void adjust_channel_timer(channel_struct *chan)
+/*FORCEINLINE*/ static void adjust_channel_timer(channel_struct *chan)
 {
 	//  ARM7_CLOCK / (DESMUME_SAMPLE_RATE*2) / (2^16 - Timer)
 	// = ARM7_CLOCK / (DESMUME_SAMPLE_RATE*2 * (2^16 - Timer))
-	// ... and then round up for good measure
-	u64 sampinc = ((u32)ARM7_CLOCK*(1ull << 32) - 1) / (DESMUME_SAMPLE_RATE * 2ull * (0x10000 - chan->timer)) + 1;
+	// Make sure to round DOWN: we'd rather lag behind than
+	// run ahead, since running ahead causes synchronization issues
+	u64 sampinc = (ARM7_CLOCK*(1ull << 32)) / (DESMUME_SAMPLE_RATE * 2ull * (0x10000 - chan->timer));
 	chan->sampincInt = (u32)(sampinc >> 32), chan->sampincFrac = (u32)sampinc;
 }
 
@@ -418,15 +429,14 @@ void SPU_struct::KeyOn(int channel)
 {
 	channel_struct &thischan = channels[channel];
 	thischan.status = CHANSTAT_PLAY;
-
 	thischan.totlength = thischan.length + thischan.loopstart;
+	thischan.totlength_shifted = thischan.totlength << format_shift[thischan.format];
+	thischan.sampcntFrac = 0;
 	adjust_channel_timer(&thischan);
 
 	thischan.pcm16bOffs = 0;
-	for(int i=0;i<SPUINTERPOLATION_TAPS;i++)
-	{
+	for(int i=0; i < SPUCHAN_PCM16B_SIZE; i++)
 		thischan.pcm16b[i] = 0;
-	}
 
 	//printf("keyon %d totlength:%d\n",channel,thischan.totlength);
 
@@ -440,30 +450,28 @@ void SPU_struct::KeyOn(int channel)
 	case 0: // 8-bit
 	//	thischan.loopstart = thischan.loopstart << 2;
 	//	thischan.length = (thischan.length << 2) + thischan.loopstart;
-		thischan.sampcntFrac = 0, thischan.sampcntInt = -3;
+		thischan.sampcntInt = -3;
 		break;
 	case 1: // 16-bit
 	//	thischan.loopstart = thischan.loopstart << 1;
 	//	thischan.length = (thischan.length << 1) + thischan.loopstart;
-		thischan.sampcntFrac = 0, thischan.sampcntInt = -3;
+		thischan.sampcntInt = -3;
 		break;
 	case 2: // ADPCM
 		thischan.pcm16b[0] = (s16)read16(thischan.addr);
 		thischan.index = read08(thischan.addr + 2) & 0x7F;
-		thischan.sampcntFrac = 0, thischan.sampcntInt = -3;
+		thischan.sampcntInt = -3;
 		thischan.loop_index = K_ADPCM_LOOPING_RECOVERY_INDEX;
 	//	thischan.loopstart = thischan.loopstart << 3;
 	//	thischan.length = (thischan.length << 3) + thischan.loopstart;
 		break;
 	case 3: // PSG
-		thischan.sampcntFrac = 0, thischan.sampcntInt = -1;
+		thischan.sampcntInt = -1;
 		thischan.x = 0x7FFF;
 		break;
 	default: break;
 	}
 
-	thischan.totlength_shifted = thischan.totlength << format_shift[thischan.format];
-
 	if(thischan.format != 3)
 	{
 		if(thischan.totlength_shifted == 0)
@@ -759,14 +767,25 @@ void SPU_struct::ProbeCapture(int which)
 		return;
 	}
 
+	// Original notes on the reasoning behind a FIFO for capture:
+	//so, this is a little strange. why go through a fifo?
+	//it seems that some games will set up a reverb effect by capturing
+	//to the nearly same address as playback, but ahead by a couple.
+	//So, playback will always end up being what was captured a couple of samples ago.
+	//This system counts on playback always having read ahead 16 samples.
+	//In that case, playback will end up being what was processed at one entire buffer length ago,
+	//since the 16 samples would have read ahead before they got captured over
+
+	//It's actually the source channels which should have a fifo, but we are
+	//not going to take the hit in speed and complexity. Save it for a future rewrite.
+	//Instead, what we do here is delay the capture by 16 samples to create a similar effect.
+	//Subjectively, it seems to be working.
 	REGS::CAP &cap = regs.cap[which];
 	cap.runtime.running = 1;
-	cap.runtime.curdad = cap.dad;
+	cap.runtime.dad = cap.dad;
 	u32 len = cap.len;
 	if(len==0) len=1;
-	cap.runtime.maxdad = cap.dad + len*4;
-	cap.runtime.sampcntFrac = cap.runtime.sampcntInt = 0;
-	cap.runtime.fifo.reset();
+	cap.runtime.sampcntFrac = 0, cap.runtime.sampcntInt = -SPUCAPTURE_FIFO_SIZE;
 }
 
 void SPU_struct::WriteByte(u32 addr, u8 val)
@@ -1043,7 +1062,8 @@ void SPU_struct::WriteLong(u32 addr, u32 val)
 
 //////////////////////////////////////////////////////////////////////////////
 
-template<SPUInterpolationMode INTERPOLATE_MODE> static FORCEINLINE s32 Interpolate(const s16 *pcm16b, u8 pcm16bOffs, u32 subPos)
+template<SPUInterpolationMode INTERPOLATE_MODE>
+FORCEINLINE static s16 Interpolate(const s16 *pcm16b, u8 pcm16bOffs, u32 subPos)
 {
 	switch (INTERPOLATE_MODE)
 	{
@@ -1051,12 +1071,20 @@ template<SPUInterpolationMode INTERPOLATE_MODE> static FORCEINLINE s32 Interpola
 		{
 			// Catmull-Rom spline
 			// Delay: 2 samples, Maximum gain: 1.25
+			// NOTE: Ideally, we would just re-scale the resampling
+			// kernel to have a maximum gain of 1.0. However, this
+			// would mean reducing the output volume, which can then
+			// go on to make feedback capture (ie. echo effects)
+			// decay abnormally quickly. Since Catmull-Rom is more
+			// of a 'luxury' thing, we should be able to use MinMax
+			// since if the user is using this interpolation method,
+			// there's likely enough processing power to handle it.
 			s32 a = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 3)];
 			s32 b = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 2)];
 			s32 c = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 1)];
 			s32 d = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 0)];
 			const u16 *w = catmullrom_lut[subPos >> (32 - CATMULLROM_INTERPOLATION_RESOLUTION_BITS)];
-			return (-a*(s32)w[0] + b*(s32)w[1] + c*(s32)w[2] - d*(s32)w[3]) >> 15;
+			return (s16)MinMax((-a*(s32)w[0] + b*(s32)w[1] + c*(s32)w[2] - d*(s32)w[3]) >> 15, -0x8000, +0x7FFF);
 		}
 
 		case SPUInterpolation_Cosine:
@@ -1065,10 +1093,13 @@ template<SPUInterpolationMode INTERPOLATE_MODE> static FORCEINLINE s32 Interpola
 			// ratio2 = (1 - cos(ratio * M_PI)) / 2
 			// sampleI = sampleA * (1 - ratio2) + sampleB * ratio2
 			// Delay: 1 sample, Maximum gain: 1.0
+			// NOTE: Always cast the result to s16. (b-a) can
+			// overflow, but a+(b-a)*subPos can't. So we might
+			// have garbage in the upper 16 bits.
 			s32 a = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 1)];
 			s32 b = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 0)];
 			s32 subPos16 = (s32)cos_lut[subPos >> (32 - COSINE_INTERPOLATION_RESOLUTION_BITS)];
-			return a + ((b - a)*subPos16 >> 16);
+			return (s16)(a + (((b - a)*subPos16) >> 16));
 		}
 
 		case SPUInterpolation_Linear:
@@ -1076,10 +1107,11 @@ template<SPUInterpolationMode INTERPOLATE_MODE> static FORCEINLINE s32 Interpola
 			// Linear Interpolation Formula:
 			// sampleI = sampleA * (1 - ratio) + sampleB * ratio
 			// Delay: 1 sample, Maximum gain: 1.0
+			// NOTE: Always cast the result to s16 (see above).
 			s32 a = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 1)];
 			s32 b = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 0)];
 			s32 subPos16 = subPos >> (32 - 16);
-			return a + ((b - a)*subPos16 >> 16);
+			return (s16)(a + (((b - a)*subPos16) >> 16));
 		}
 
 		default:
@@ -1088,41 +1120,43 @@ template<SPUInterpolationMode INTERPOLATE_MODE> static FORCEINLINE s32 Interpola
 	}
 }
 
-static FORCEINLINE s32 Fetch8BitData(channel_struct *chan, s32 pos)
+FORCEINLINE static s16 Fetch8BitData(channel_struct *chan, s32 pos)
 {
 	if(pos < 0) return 0;
 
-	return read_s8(chan->addr + pos*1) << 8;
+	return (s16)read_s8(chan->addr + pos*1) << 8;
 }
 
-static FORCEINLINE s32 Fetch16BitData(channel_struct *chan, s32 pos)
+FORCEINLINE static s16 Fetch16BitData(channel_struct *chan, s32 pos)
 {
 	if(pos < 0) return 0;
 
 	return read16(chan->addr + pos*2);
 }
 
-static FORCEINLINE s32 FetchADPCMData(channel_struct *chan, s32 pos)
+// NOTE: The decoding state is updated during this function call
+FORCEINLINE static s16 FetchADPCMData(channel_struct *chan, s32 pos)
 {
 	if(pos < 8) return 0;
 
 	s16 last = chan->pcm16b[SPUCHAN_PCM16B_AT(chan->pcm16bOffs)];
 
-	if(pos == (chan->loopstart<<3)) {
-		//if(chan->loop_index != K_ADPCM_LOOPING_RECOVERY_INDEX) printf("over-snagging\n");
+	// Stash loop sample and index
+	// This saves having to decode to the loop point every time
+	if(pos == ((s32)chan->loopstart<<3)) {
 		chan->loop_pcm16b = last;
 		chan->loop_index = chan->index;
 	}
 	
-	const u32 shift    = (pos&1) * 4;
-	const u32 data4bit = ((u32)read08(chan->addr + (pos>>1))) >> shift;
+	const u8 shift    = ((u8)pos&1) * 4;
+	const u8 data4bit = read08(chan->addr + (pos>>1)) >> shift;
 	const s32 diff = precalcdifftbl [chan->index][data4bit & 0xF];
 	chan->index    = precalcindextbl[chan->index][data4bit & 0x7];
 
-	return MinMax(last + diff, -0x8000, 0x7FFF);
+	return (s16)MinMax(last + diff, -0x8000, 0x7FFF);
 }
 
-static FORCEINLINE s32 FetchPSGData(channel_struct *chan, s32 pos)
+FORCEINLINE static s16 FetchPSGData(channel_struct *chan, s32 pos)
 {
 	if(pos < 0 || chan->num < 8) return 0;
 
@@ -1130,7 +1164,7 @@ static FORCEINLINE s32 FetchPSGData(channel_struct *chan, s32 pos)
 	if(chan->num < 14)
 	{
 		// Doing this avoids using a LUT
-		return ((pos%8u) > chan->waveduty) ? (-0x7FFF) : (+0x7FFF);
+		return (((u8)pos%8u) > chan->waveduty) ? (-0x7FFF) : (+0x7FFF);
 	}
 	else
 	{
@@ -1149,43 +1183,22 @@ static FORCEINLINE s32 FetchPSGData(channel_struct *chan, s32 pos)
 
 //////////////////////////////////////////////////////////////////////////////
 
-static FORCEINLINE void MixL(SPU_struct* SPU, channel_struct *chan, s32 data)
-{
-	data = spumuldiv7(data, chan->vol) >> volume_shift[chan->volumeDiv];
-	SPU->sndbuf[SPU->bufpos<<1] += data;
-}
-
-static FORCEINLINE void MixR(SPU_struct* SPU, channel_struct *chan, s32 data)
+// Returns false when the channel needs to stop
+// NOTE: Assumes channel has already reached the end of playback
+template<int FORMAT>
+/*FORCEINLINE*/ static bool TestForLoop(channel_struct *chan, s32 *pos, s32 totalLength)
 {
-	data = spumuldiv7(data, chan->vol) >> volume_shift[chan->volumeDiv];
-	SPU->sndbuf[(SPU->bufpos<<1)+1] += data;
-}
-
-static FORCEINLINE void MixLR(SPU_struct* SPU, channel_struct *chan, s32 data)
-{
-	data = spumuldiv7(data, chan->vol) >> volume_shift[chan->volumeDiv];
-	SPU->sndbuf[SPU->bufpos<<1] += spumuldiv7(data, 127 - chan->pan);
-	SPU->sndbuf[(SPU->bufpos<<1)+1] += spumuldiv7(data, chan->pan);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-
-template<int FORMAT> static FORCEINLINE void TestForLoop(SPU_struct *SPU, channel_struct *chan)
-{
-	// Do nothing if we haven't reached the end
-	if(chan->sampcntInt < chan->totlength_shifted) return;
-
 	// Kill the channel if we don't repeat
 	if(chan->repeat != 1)
 	{
-		SPU->KeyOff(chan->num);
-		SPU->bufpos = SPU->buflength;
-		return;
+		//SPU->KeyOff(chan->num); // Inlining this avoids having to pass an SPU pointer around
+		chan->status = CHANSTAT_STOPPED;
+		return false;
 	}
 
 	// Wrap sampcnt
-	u32 loopSize = chan->totlength_shifted - (chan->loopstart << format_shift[FORMAT]);
-	do chan->sampcntInt -= loopSize; while(chan->sampcntInt >= chan->totlength_shifted)
+	u32 loopSize = totalLength - (chan->loopstart << format_shift[FORMAT]);
+	do *pos -= loopSize; while(*pos >= totalLength);
 
 	// ADPCM needs special handling
 	if(FORMAT == 2)
@@ -1194,7 +1207,7 @@ template<int FORMAT> static FORCEINLINE void TestForLoop(SPU_struct *SPU, channe
 		// smaller values (0..3 words) are causing hang-ups 
 		// (busy bit remains set infinite, but no sound output occurs).
 		// fix: 7th Dragon (JP) - http://sourceforge.net/p/desmume/bugs/1357/
-		if (chan->totlength < 4) return;
+		if (totalLength < (4 << format_shift[FORMAT])) return true;
 
 		// Fetch loop sample and index, and get the "new" current decoding position
 		s32 curpos;
@@ -1217,314 +1230,257 @@ template<int FORMAT> static FORCEINLINE void TestForLoop(SPU_struct *SPU, channe
 		// Decode until we reach the target position
 		// This is really only used for fast seeking (ie. SNDDummy
 		// and loop reset), but makes the code much cleaner.
-		while(curpos < chan->sampcntInt)
+		while(curpos < *pos)
 		{
 			*pcm16Dst = FetchADPCMData(chan, curpos);
 			curpos++;
 		}
 	}
+	return true;
 }
 
-template<int CHANNELS> FORCEINLINE static void SPU_Mix(SPU_struct* SPU, channel_struct *chan, s32 data)
+//////////////////////////////////////////////////////////////////////////////
+
+//WORK
+template<int CHANNELS, int FORMAT, SPUInterpolationMode INTERPOLATE_MODE> 
+static void __SPU_GenerateChanData(channel_struct* const chan, s16 *chanbuf, int length)
 {
-	switch(CHANNELS)
+	s32 totalLength = chan->totlength_shifted;
+
+	if (!CHANNELS)
 	{
-		case 0: MixL(SPU, chan, data); break;
-		case 1: MixLR(SPU, chan, data); break;
-		case 2: MixR(SPU, chan, data); break;
-		default: break;
+		// When we aren't mixing at all, take a much faster path where
+		// we simply update sampcnt. This can glitch interpolation for
+		// up to SPUCHAN_PCM16B_SIZE source samples (since we're not
+		// updating chan->pcm16b[]), but this glitching should really
+		// only show up when switching from Dual SPU to Sync mode, or
+		// when switching from SNDDummy core to an actual output core,
+		// and only for non-ADPCM sources (ADPCM needs to decode data
+		// all the time, so we keep pcm16b[] filled correctly anyway).
+		s32 cursampcntInt = chan->sampcntInt;
+		s64 newsampcnt  = (chan->sampcntFrac | (s64)   cursampcntInt<<32);
+		    newsampcnt += (chan->sampincFrac | (u64)chan->sampincInt<<32) * length;
+		s32 newsampcntInt = (s32)(newsampcnt >> 32);
+		if(FORMAT == 2 && newsampcntInt <= totalLength)
+		{
+			// We won't go past the end, so decode until reaching the target position
+			while(cursampcntInt < newsampcntInt)
+			{
+				s16 data = FetchADPCMData(chan, cursampcntInt);
+				chan->pcm16bOffs++;
+				chan->pcm16b[SPUCHAN_PCM16B_AT(chan->pcm16bOffs)] = data;
+				cursampcntInt++;
+			}
+		}
+		else if(FORMAT != 3 && newsampcntInt >= totalLength) TestForLoop<FORMAT>(chan, &newsampcntInt, totalLength);
+		chan->sampcntFrac = (u32)newsampcnt;
+		chan->sampcntInt  = newsampcntInt;
+		return;
 	}
-	SPU->lastdata = data;
-}
 
-//WORK
-template<int FORMAT, SPUInterpolationMode INTERPOLATE_MODE, int CHANNELS> 
-	FORCEINLINE static void ____SPU_ChanUpdate(SPU_struct* const SPU, channel_struct* const chan)
-{
-	for (; SPU->bufpos < SPU->buflength; SPU->bufpos++)
+	// chan->vol is .7fxp, plus .4fxp for chan->volumeDiv (total .11fxp)
+	// chan->pan is .7fxp
+	// This gives us .18fxp, but we need at most .16fxp, so we shift down.
+	s32 vol_shifted   = spumuladjust7(chan->vol);
+	    vol_shifted <<= 4;
+	    vol_shifted >>= volume_shift[chan->volumeDiv];
+	s32 vol_left      = spumuladjust7(127 - chan->pan);
+	    vol_left     *= vol_shifted;
+	    vol_left    >>= 2; // .16fxp
+	s32 vol_right     = spumuladjust7(chan->pan);
+	    vol_right    *= vol_shifted;
+	    vol_right   >>= 2; // .16fxp
+
+	// Start mixing loop
+	u32 sampcntFrac = chan->sampcntFrac;
+	s32 sampcntInt  = chan->sampcntInt;
+	do
 	{
 		// Advance sampcnt one sample at a time. This is
 		// needed to keep pcm16b[] filled for interpolation.
-		u32 nSamplesToSkip = chan->sampincInt + AddAndReturnCarry(&chan->sampcntFrac, chan->sampincFrac);
+		u32 nSamplesToSkip = chan->sampincInt + AddAndReturnCarry(&sampcntFrac, chan->sampincFrac);
 		while(nSamplesToSkip--)
 		{
+			// If channel stops, fill the rest of the buffer with 0
+			if(FORMAT != 3 && sampcntInt >= totalLength && !TestForLoop<FORMAT>(chan, &sampcntInt, totalLength))
+			{
+				memset(chanbuf, 0, length*sizeof(s16)*2);
+				return;
+			}
+
 			s16 data = 0;
-			s32 pos = chan->sampcntInt;
 			switch(FORMAT)
 			{
-				case 0: data = Fetch8BitData (chan, pos); break;
-				case 1: data = Fetch16BitData(chan, pos); break;
-				case 2: data = FetchADPCMData(chan, pos); break;
-				case 3: data = FetchPSGData  (chan, pos); break;
-				default: break;
+				case 0: data = Fetch8BitData (chan, sampcntInt); break;
+				case 1: data = Fetch16BitData(chan, sampcntInt); break;
+				case 2: data = FetchADPCMData(chan, sampcntInt); break;
+				case 3: data = FetchPSGData  (chan, sampcntInt); break;
 			}
 			chan->pcm16bOffs++;
 			chan->pcm16b[SPUCHAN_PCM16B_AT(chan->pcm16bOffs)] = data;
-
-			chan->sampcntInt++;
-			if (FORMAT != 3) TestForLoop<FORMAT>(SPU, chan);
+			sampcntInt++;
 		}
 
-		if(CHANNELS != -1)
+		// Because chanbuf[] is aligned to at least 32 bits, we can
+		// cheat and store a hard-panned sample by writing as 32bit
+		s16 sample, sampleL = 0, sampleR = 0; // <- Clearing these to 0 shuts the compiler up
+		sample = Interpolate<INTERPOLATE_MODE>(chan->pcm16b, chan->pcm16bOffs, sampcntFrac);
+		if(CHANNELS & (1<<0)) sampleL = (s16)(sample * vol_left  >> 16);
+		if(CHANNELS & (1<<1)) sampleR = (s16)(sample * vol_right >> 16);
+		switch(CHANNELS)
 		{
-			s32 data = Interpolate<INTERPOLATE_MODE>(chan->pcm16b, chan->pcm16bOffs, chan->sampcntFrac);
-			SPU_Mix<CHANNELS>(SPU, chan, data);
+			case (1<<0)|(0<<1): // left only: one 32-bit store of {sampleL, 0}
+#ifdef MSB_FIRST
+				*(u32*)chanbuf = (u32)sampleL << 16;
+#else
+				*(u32*)chanbuf = (u32)(u16)sampleL; // (u16) first: a negative s16 must not sign-extend into the R half
+#endif
+				break;
+			case (0<<0)|(1<<1): // right only: one 32-bit store of {0, sampleR}
+#ifdef MSB_FIRST
+				*(u32*)chanbuf = (u32)(u16)sampleR; // (u16) first: a negative s16 must not sign-extend into the L half
+#else
+				*(u32*)chanbuf = (u32)sampleR << 16;
+#endif
+				break;
+			case (1<<0)|(1<<1):
+				chanbuf[0] = sampleL;
+				chanbuf[1] = sampleR;
+				break;
 		}
+		chanbuf += 2;
+	} while(--length);
+	chan->sampcntFrac = sampcntFrac;
+	chan->sampcntInt  = sampcntInt;
+}
+
+// Outputs {L,R} into chanbuf[]
+// Assumes chanbuf[] is always aligned to at least 32 bits
+FORCEINLINE static void _SPU_GenerateChanData(bool actuallyMix, channel_struct* const chan, s16 *chanbuf, int length)
+{
+	typedef void (*_SPU_GenerateChanData_Func_t)(channel_struct* const chan, s16 *chanbuf, int length);
+
+	// This looks insane and pointless, but compilers generate
+	// a massive if/elseif block in place of something like this,
+	// since they don't know the range of values we use.
+	// Note that we use SPUInterpolation_None in the case of PSG
+	// channels, as we don't want to interpolate the raw samples.
+	// We also use SPUInterpolation_None with actuallyMix==false,
+	// so that we avoid instantiating the exact same code under
+	// a new template instance.
+	// The table is accessed as: FuncTable[INTERPOLATE_MODE][FORMAT][CHANNELS]
+#define __GENERATE_FUNCTABLE(CHANNELS, FORMAT, INTERPOLATE_MODE) \
+	__SPU_GenerateChanData<CHANNELS, FORMAT, INTERPOLATE_MODE>
+#define _GENERATE_FUNCTABLE(FORMAT, INTERPOLATE_MODE) \
+	{ \
+		__GENERATE_FUNCTABLE((0<<0 | 0<<1), FORMAT, SPUInterpolation_None), \
+		__GENERATE_FUNCTABLE((1<<0 | 0<<1), FORMAT, INTERPOLATE_MODE), \
+		__GENERATE_FUNCTABLE((0<<0 | 1<<1), FORMAT, INTERPOLATE_MODE), \
+		__GENERATE_FUNCTABLE((1<<0 | 1<<1), FORMAT, INTERPOLATE_MODE), \
 	}
-}
-
-template<int FORMAT, SPUInterpolationMode INTERPOLATE_MODE> 
-	FORCEINLINE static void ___SPU_ChanUpdate(const bool actuallyMix, SPU_struct* const SPU, channel_struct* const chan)
-{
-	if(!actuallyMix)
-		____SPU_ChanUpdate<FORMAT,INTERPOLATE_MODE,-1>(SPU,chan);
-	else if (chan->pan == 0)
-		____SPU_ChanUpdate<FORMAT,INTERPOLATE_MODE,0>(SPU,chan);
-	else if (chan->pan == 127)
-		____SPU_ChanUpdate<FORMAT,INTERPOLATE_MODE,2>(SPU,chan);
-	else
-		____SPU_ChanUpdate<FORMAT,INTERPOLATE_MODE,1>(SPU,chan);
-}
-
-template<SPUInterpolationMode INTERPOLATE_MODE> 
-	FORCEINLINE static void __SPU_ChanUpdate(const bool actuallyMix, SPU_struct* const SPU, channel_struct* const chan)
-{
-	// NOTE: PSG doesn't use interpolation, or it would try to
-	// interpolate between the raw sample points (very bad)
-	switch(chan->format)
-	{
-		case 0: ___SPU_ChanUpdate<0,INTERPOLATE_MODE>(actuallyMix, SPU, chan); break;
-		case 1: ___SPU_ChanUpdate<1,INTERPOLATE_MODE>(actuallyMix, SPU, chan); break;
-		case 2: ___SPU_ChanUpdate<2,INTERPOLATE_MODE>(actuallyMix, SPU, chan); break;
-		case 3: ___SPU_ChanUpdate<3,SPUInterpolation_None>(actuallyMix, SPU, chan); break;
-		default: assert(false);
+#define GENERATE_FUNCTABLE(INTERPOLATE_MODE) \
+	{ \
+		_GENERATE_FUNCTABLE(0, INTERPOLATE_MODE), \
+		_GENERATE_FUNCTABLE(1, INTERPOLATE_MODE), \
+		_GENERATE_FUNCTABLE(2, INTERPOLATE_MODE), \
+		_GENERATE_FUNCTABLE(3, SPUInterpolation_None), \
 	}
-}
-
-FORCEINLINE static void _SPU_ChanUpdate(const bool actuallyMix, SPU_struct* const SPU, channel_struct* const chan)
-{
-	switch(CommonSettings.spuInterpolationMode)
+	static const _SPU_GenerateChanData_Func_t FuncTable[4][4][4] =
 	{
-	case SPUInterpolation_None:       __SPU_ChanUpdate<SPUInterpolation_None>(actuallyMix, SPU, chan); break;
-	case SPUInterpolation_Linear:     __SPU_ChanUpdate<SPUInterpolation_Linear>(actuallyMix, SPU, chan); break;
-	case SPUInterpolation_Cosine:     __SPU_ChanUpdate<SPUInterpolation_Cosine>(actuallyMix, SPU, chan); break;
-	case SPUInterpolation_CatmullRom: __SPU_ChanUpdate<SPUInterpolation_CatmullRom>(actuallyMix, SPU, chan); break;
-	default: assert(false);
-	}
-}
-
-//ENTERNEW
-static void SPU_MixAudio_Advanced(bool actuallyMix, SPU_struct *SPU, int length)
-{
-	//the advanced spu function correctly handles all sound control mixing options, as well as capture
-	//this code is not entirely optimal, as it relies on sort of manhandling the core mixing functions
-	//in order to get the results it needs.
-
-	//THIS IS MAX HACKS!!!!
-	//AND NEEDS TO BE REWRITTEN ALONG WITH THE DEEPEST PARTS OF THE SPU
-	//ONCE WE KNOW THAT IT WORKS
-	
-	//BIAS gets ignored since our spu is still not bit perfect,
-	//and it doesnt matter for purposes of capture
-
-	//-----------DEBUG CODE
-	bool skipcap = false;
-	//-----------------
-
-	s32 samp0[2] = {0,0};
-	
-	//believe it or not, we are going to do this one sample at a time.
-	//like i said, it is slower.
-	for (int samp = 0; samp < length; samp++)
+		GENERATE_FUNCTABLE(SPUInterpolation_None),
+		GENERATE_FUNCTABLE(SPUInterpolation_Linear),
+		GENERATE_FUNCTABLE(SPUInterpolation_Cosine),
+		GENERATE_FUNCTABLE(SPUInterpolation_CatmullRom),
+	};
+#undef GENERATE_FUNCTABLE
+#undef _GENERATE_FUNCTABLE
+#undef __GENERATE_FUNCTABLE
+
+	const _SPU_GenerateChanData_Func_t *Funcs = FuncTable[CommonSettings.spuInterpolationMode][chan->format];
+	     if(!actuallyMix)     Funcs[0](chan, chanbuf, length);
+	else if(chan->pan == 0)   Funcs[1](chan, chanbuf, length);
+	else if(chan->pan == 127) Funcs[2](chan, chanbuf, length);
+	else                      Funcs[3](chan, chanbuf, length);
+}
+
+template<int CAP_BITS, bool USE_SRCBUF>
+/*FORCEINLINE*/ static bool __SPU_WriteCapture(SPU_struct::REGS::CAP& cap, const channel_struct& srcChan, const s16 *srcBuf, int length)
+{
+	s32 capLen_shifted = (cap.len ? cap.len : 1) * (32 / CAP_BITS); // LEN==0 acts as 1 word (as ProbeCapture assumes); a 0 length would never wrap pos
+	SPU_struct::REGS::CAP::Runtime& runtime = cap.runtime;
+	s32 pos = runtime.sampcntInt;
+	do
 	{
-		SPU->sndbuf[0] = 0;
-		SPU->sndbuf[1] = 0;
-		SPU->buflength = 1;
-
-		s32 capmix[2] = {0,0};
-		s32 mix[2] = {0,0};
-		s32 chanout[16];
-		s32 submix[32];
-
-		//generate each channel, and helpfully mix it at the same time
-		for (int i = 0; i < 16; i++)
+		s16 sample = USE_SRCBUF ? (*srcBuf) : 0;
+		u32 nSamplesToProcess = srcChan.sampincInt + AddAndReturnCarry(&runtime.sampcntFrac, srcChan.sampincFrac);
+		while(nSamplesToProcess--)
 		{
-			channel_struct *chan = &SPU->channels[i];
-
-			if (chan->status == CHANSTAT_PLAY)
+			if(pos >= capLen_shifted)
 			{
-				SPU->bufpos = 0;
-
-				bool bypass = false;
-				if (i==1 && SPU->regs.ctl_ch1bypass) bypass=true;
-				if (i==3 && SPU->regs.ctl_ch3bypass) bypass=true;
-
-
-				//output to mixer unless we are bypassed.
-				//dont output to mixer if the user muted us
-				bool outputToMix = true;
-				if (CommonSettings.spu_muteChannels[i]) outputToMix = false;
-				if (bypass) outputToMix = false;
-				bool outputToCap = outputToMix;
-				if (CommonSettings.spu_captureMuted && !bypass) outputToCap = true;
-
-				//channels 1 and 3 should probably always generate their audio
-				//internally at least, just in case they get used by the spu output
-				bool domix = outputToCap || outputToMix || i==1 || i==3;
-
-				//clear the output buffer since this is where _SPU_ChanUpdate wants to accumulate things
-				SPU->sndbuf[0] = SPU->sndbuf[1] = 0;
-
-				//get channel's next output sample.
-				_SPU_ChanUpdate(domix, SPU, chan);
-				chanout[i] = SPU->lastdata >> volume_shift[chan->volumeDiv];
-
-				//save the panned results
-				submix[i*2] = SPU->sndbuf[0];
-				submix[i*2+1] = SPU->sndbuf[1];
-
-				//send sample to our capture mix
-				if (outputToCap)
+				if(cap.oneshot)
 				{
-					capmix[0] += submix[i*2];
-					capmix[1] += submix[i*2+1];
+					cap.active = runtime.running = 0;
+					return false;
 				}
+				pos -= capLen_shifted;
+			}
 
-				//send sample to our main mixer
-				if (outputToMix)
+			s16 *data = &runtime.pcm16b[SPUCAPTURE_PCM16B_AT(runtime.pcm16bOffs)];
+			if(pos >= 0)
+			{
+				if (CAP_BITS == 8)
 				{
-					mix[0] += submix[i*2];
-					mix[1] += submix[i*2+1];
+					_MMU_write08<ARMCPU_ARM7,MMU_AT_DMA>(runtime.dad + pos*sizeof(s8), (u8)(*data >> 8));
+				}
+				else
+				{
+					_MMU_write16<ARMCPU_ARM7,MMU_AT_DMA>(runtime.dad + pos*sizeof(s16), (u16)(*data));
 				}
 			}
-			else 
-			{
-				chanout[i] = 0;
-				submix[i*2] = 0;
-				submix[i*2+1] = 0;
-			}
-		} //foreach channel
-
-		s32 mixout[2] = {mix[0],mix[1]};
-		s32 capmixout[2] = {capmix[0],capmix[1]};
-		s32 sndout[2];
-		s32 capout[2];
-
-		//create SPU output
-		switch (SPU->regs.ctl_left)
-		{
-			case SPU_struct::REGS::LOM_LEFT_MIXER: sndout[0] = mixout[0]; break;
-			case SPU_struct::REGS::LOM_CH1: sndout[0] = submix[1*2+0]; break;
-			case SPU_struct::REGS::LOM_CH3: sndout[0] = submix[3*2+0]; break;
-			case SPU_struct::REGS::LOM_CH1_PLUS_CH3: sndout[0] = submix[1*2+0] + submix[3*2+0]; break;
-			default: break;
-		}
-		switch (SPU->regs.ctl_right)
-		{
-			case SPU_struct::REGS::ROM_RIGHT_MIXER: sndout[1] = mixout[1]; break;
-			case SPU_struct::REGS::ROM_CH1: sndout[1] = submix[1*2+1]; break;
-			case SPU_struct::REGS::ROM_CH3: sndout[1] = submix[3*2+1]; break;
-			case SPU_struct::REGS::ROM_CH1_PLUS_CH3: sndout[1] = submix[1*2+1] + submix[3*2+1]; break;
-			default: break;
+			*data = sample;
+			runtime.pcm16bOffs++;
+			pos++;
 		}
 
+		// srcBuf[] stores two samples per time unit
+		// Either {Ch0[+Ch1],Ch2[+Ch3]}, or {LMix,RMix}
+		if(USE_SRCBUF) srcBuf += 2;
+	} while(--length);
+	runtime.sampcntInt = pos;
+	return true;
+}
 
-		//generate capture output ("capture bugs" from gbatek are not emulated)
-		if (SPU->regs.cap[0].source == 0)
-			capout[0] = capmixout[0]; //cap0 = L-mix
-		else if (SPU->regs.cap[0].add)
-			capout[0] = chanout[0] + chanout[1]; //cap0 = ch0+ch1
-		else capout[0] = chanout[0]; //cap0 = ch0
-
-		if (SPU->regs.cap[1].source == 0)
-			capout[1] = capmixout[1]; //cap1 = R-mix
-		else if (SPU->regs.cap[1].add)
-			capout[1] = chanout[2] + chanout[3]; //cap1 = ch2+ch3
-		else capout[1] = chanout[2]; //cap1 = ch2
-
-		capout[0] = MinMax(capout[0],-0x8000,0x7FFF);
-		capout[1] = MinMax(capout[1],-0x8000,0x7FFF);
+// Forwards capture output to the 8-bit or 16-bit writer, as selected by cap.bits8.
+// Returns false if capture has stopped
+template<bool USE_SRCBUF>
+FORCEINLINE static bool _SPU_WriteCapture(SPU_struct::REGS::CAP& cap, const channel_struct& srcChan, const s16 *srcBuf, int length)
+{
+	// Both instantiations take the same arguments, so a conditional expression suffices.
+	return cap.bits8
+		? __SPU_WriteCapture< 8,USE_SRCBUF>(cap, srcChan, srcBuf, length)
+		: __SPU_WriteCapture<16,USE_SRCBUF>(cap, srcChan, srcBuf, length);
+}
 
-		//write the output sample where it is supposed to go
-		if (samp == 0)
+// Advances capture unit destination by 'length' output samples without writing anything (a one-shot capture reaching its end is stopped; otherwise the position wraps)
+/*FORCEINLINE*/ static void _SPU_SeekCapture(SPU_struct::REGS::CAP& cap, const channel_struct& srcChan, int length)
+{
+	s32 capLen_shifted = cap.len * (cap.bits8 ? (32/8) : (32/16)); // cap.len scaled by the number of 8-bit or 16-bit units per 32 bits
+	SPU_struct::REGS::CAP::Runtime& runtime = cap.runtime;
+	s64 pos64  = (runtime.sampcntFrac | (s64)runtime.sampcntInt<<32); // 32.32 fixed point: integer part in the high word, fraction in the low word
+	    pos64 += (srcChan.sampincFrac | (u64)srcChan.sampincInt<<32) * length; // step by the source channel's increment, once per output sample
+	runtime.sampcntFrac = (u32)pos64;
+	runtime.sampcntInt  = (s32)(pos64 >> 32);
+	if(runtime.sampcntInt >= capLen_shifted)
+	{
+		if(cap.oneshot)
 		{
-			samp0[0] = sndout[0];
-			samp0[1] = sndout[1];
+			cap.active = runtime.running = 0;
 		}
 		else
 		{
-			SPU->sndbuf[samp*2+0] = sndout[0];
-			SPU->sndbuf[samp*2+1] = sndout[1];
+			if(capLen_shifted > 0) runtime.sampcntInt %= capLen_shifted; // Wrap into [0, capLen_shifted); guarded because a zero length cannot be wrapped (repeated subtraction of 0 would spin forever)
 		}
-
-		for (int capchan = 0; capchan < 2; capchan++)
-		{
-			SPU_struct::REGS::CAP& cap = SPU->regs.cap[capchan];
-			channel_struct& srcChan = SPU->channels[1 + 2 * capchan];
-			if (SPU->regs.cap[capchan].runtime.running)
-			{
-				u32 nSamplesToProcess = srcChan.sampincInt + AddAndReturnCarry(&cap.runtime.sampcntFrac, srcChan.sampincFrac);
-				cap.runtime.sampcntInt += nSamplesToProcess;
-				while(nSamplesToProcess--)
-				{
-					//so, this is a little strange. why go through a fifo?
-					//it seems that some games will set up a reverb effect by capturing
-					//to the nearly same address as playback, but ahead by a couple.
-					//So, playback will always end up being what was captured a couple of samples ago.
-					//This system counts on playback always having read ahead 16 samples.
-					//In that case, playback will end up being what was processed at one entire buffer length ago,
-					//since the 16 samples would have read ahead before they got captured over
-
-					//It's actually the source channels which should have a fifo, but we are
-					//not going to take the hit in speed and complexity. Save it for a future rewrite.
-					//Instead, what we do here is delay the capture by 16 samples to create a similar effect.
-					//Subjectively, it seems to be working.
-
-					//Don't do anything until the fifo is filled, so as to delay it
-					if (cap.runtime.fifo.size < 16)
-					{
-						cap.runtime.fifo.enqueue(capout[capchan]);
-						continue;
-					}
-
-					//(actually capture sample from fifo instead of most recently generated)
-					u32 multiplier;
-					s32 sample = cap.runtime.fifo.dequeue();
-					cap.runtime.fifo.enqueue(capout[capchan]);
-
-					//static FILE* fp = NULL;
-					//if(!fp) fp = fopen("d:\\capout.raw","wb");
-					//fwrite(&sample,2,1,fp);
-					
-					if (cap.bits8)
-					{
-						s8 sample8 = sample >> 8;
-						if (skipcap) _MMU_write08<1,MMU_AT_DMA>(cap.runtime.curdad,0);
-						else _MMU_write08<1,MMU_AT_DMA>(cap.runtime.curdad,sample8);
-						cap.runtime.curdad++;
-						multiplier = 4;
-					}
-					else
-					{
-						s16 sample16 = sample;
-						if (skipcap) _MMU_write16<1,MMU_AT_DMA>(cap.runtime.curdad,0);
-						else _MMU_write16<1,MMU_AT_DMA>(cap.runtime.curdad,sample16);
-						cap.runtime.curdad+=2;
-						multiplier = 2;
-					}
-
-					if (cap.runtime.curdad >= cap.runtime.maxdad)
-					{
-						cap.runtime.curdad = cap.dad;
-						cap.runtime.sampcntInt -= cap.len*multiplier;
-					}
-				} //sampinc loop
-			} //if capchan running
-		} //capchan loop
-	} //main sample loop
-
-	SPU->sndbuf[0] = samp0[0];
-	SPU->sndbuf[1] = samp0[1];
+	}
 }
 
 //ENTER
@@ -1532,8 +1488,8 @@ static void SPU_MixAudio(bool actuallyMix, SPU_struct *SPU, int length)
 {
 	if (actuallyMix)
 	{
-		memset(SPU->sndbuf, 0, length*4*2);
-		memset(SPU->outbuf, 0, length*2*2);
+		if(SPU->bufsize < length) SPU->resizeBuffer(length);
+		memset(SPU->outbuf, 0, length*sizeof(s16)*2);
 	}
 
 	//we used to use master enable here, and do nothing if audio is disabled.
@@ -1542,133 +1498,366 @@ static void SPU_MixAudio(bool actuallyMix, SPU_struct *SPU, int length)
 	//is this still a good idea? zeroing the capture buffers is important...
 	if(!SPU->regs.masteren) return;
 
-	bool advanced = CommonSettings.spu_advanced;
-
-	//branch here so that slow computers don't have to take the advanced (slower) codepath.
-	//it remainds to be seen exactly how much slower it is
-	//if it isnt much slower then we should refactor everything to be simpler, once it is working
-	if (advanced && SPU == SPU_core)
+	// We used to branch into advanced/non-advanced mode here.
+	// Hopefully, the current code is good enough to avoid the need now...
+
+	/************************************************/
+
+	// Overall flow:
+	//  For each channel:
+	//    Generate L/R sample data into chanbuf[]
+	//    If not bypassed:
+	//      If not muted:
+	//        Mix chanbuf[] into mixbuf[]
+	//      Else if capturing muted channels:
+	//        Mix chanbuf[] into mutedmixbuf[]
+	//    If capturing from channels:
+	//      Copy/mix chanbuf[] into capbuf[]
+	//    If not playing from mixer:
+	//      Copy/mix chanbuf[] into outbuf[]
+	//  If capturing from channels:
+	//    Output capbuf[] to capture units
+	//  If playing from mixer:
+	//    Output mixbuf[] to outbuf[]
+	//  If capturing from mixer:
+	//    If capturing muted channels:
+	//      Output mixbuf[]+mutedmixbuf[] to capture units
+	//    Else
+	//      Output mixbuf[] to capture units
+	
+	//we used to bail out if speakers were disabled.
+	//this is technically wrong. sound may still be captured, or something.
+	//in all likelihood, any game doing this probably master disabled the SPU also
+	//so, optimization of this case is probably not necessary.
+	//later, we'll just silence the output
+	bool speakersOn = T1ReadWord(MMU.ARM7_REG, 0x304) & 0x01;
+
+	// Translate the mixer and capture states.
+	// This should improve the code generation so that
+	// it doesn't have to reference a lot of memory and
+	// can instead just bitwise-test as needed.
+	//  -bypassMixer controls whether chanbuf[] should NOT be added to mixbuf[]
+	//  -capbufFlags0/1 controls the following:
+	//    -The least-significant bit enabled will store UN-PANNED chanbuf[] to capbuf[]
+	//    -All other bits will add UN-PANNED chanbuf[] to capbuf[]
+	//    -If all zero, capture is either disabled or comes from the mixer
+	//  -outbufFlagsL/R controls the following:
+	//    -The least-significant bit enabled will store PANNED chanbuf[] to outbuf[]
+	//    -All other bits will add PANNED chanbuf[] to outbuf[]
+	//    -If all zero, output comes from the mixer
+	enum
 	{
-		SPU_MixAudio_Advanced(actuallyMix, SPU, length);
-	}
-	else
+		CAPSRC_NONE,
+		CAPSRC_MIXER, // Capture mixer output
+		CAPSRC_CHAN,  // Capture channel 0/2 output
+		CAPSRC_MIXED, // Capture channel 0+1/2+3 output (buggy on hardware)
+	};
+	u8   bypassMixer   = 0;
+	u16  chanMuteFlags = CommonSettings.spu_muteChannels;
+	u8   capbufFlags0  = 0;
+	u8   capbufFlags1  = 0;
+	u8   outbufFlagsL  = 0;
+	u8   outbufFlagsR  = 0;
+	u8   cap0Src       = CAPSRC_NONE;
+	u8   cap1Src       = CAPSRC_NONE;
+	u8  *mixdata       = (u8*)SPU->mixdata;
+	s32 *mixbuf        = NULL;
+	s32 *mutedmixbuf   = NULL;
+	s16 *capbuf        = NULL;
+	s16 *chanbuf       = NULL;
+	s16  *outbuf       = SPU->outbuf;
+	s32   masterVol    = spumuladjust7(SPU->regs.mastervol);
+	int mixdataClearSizeBytes = 0;
+	if(actuallyMix)
 	{
-		//non-advanced mode
-		for (int i = 0; i < 16; i++)
-		{
-			channel_struct *chan = &SPU->channels[i];
+		if(SPU->regs.ctl_ch1bypass) bypassMixer |= (1 << 1);
+		if(SPU->regs.ctl_ch3bypass) bypassMixer |= (1 << 3);
 
-			if (chan->status != CHANSTAT_PLAY)
-				continue;
+		// Translate capture state
+		if(SPU->regs.cap[0].runtime.running)
+		{
+			if(SPU->regs.cap[0].source == 0) cap0Src = CAPSRC_MIXER;
+			else if(SPU->regs.cap[0].add) cap0Src = CAPSRC_MIXED;
+			else cap0Src = CAPSRC_CHAN;
+		}
+		if(SPU->regs.cap[1].runtime.running)
+		{
+			if(SPU->regs.cap[1].source == 0) cap1Src = CAPSRC_MIXER;
+			else if(SPU->regs.cap[1].add) cap1Src = CAPSRC_MIXED;
+			else cap1Src = CAPSRC_CHAN;
+		}
+		if(cap0Src == CAPSRC_CHAN || cap0Src == CAPSRC_MIXED) capbufFlags0 |= (1 << 0);
+		if(                          cap0Src == CAPSRC_MIXED) capbufFlags0 |= (1 << 1);
+		if(cap1Src == CAPSRC_CHAN || cap1Src == CAPSRC_MIXED) capbufFlags1 |= (1 << 2);
+		if(                          cap1Src == CAPSRC_MIXED) capbufFlags1 |= (1 << 3);
+		bool isCapturing  = (cap0Src != CAPSRC_NONE) || (cap1Src != CAPSRC_NONE);
+		bool captureMuted = isCapturing && CommonSettings.spu_captureMuted;
+		if(!captureMuted)
+		{
+			capbufFlags0 &= ~chanMuteFlags;
+			capbufFlags1 &= ~chanMuteFlags;
+		}
 
-			SPU->bufpos = 0;
-			SPU->buflength = length;
+		// Translate outputs
+		switch(SPU->regs.ctl_left)
+		{
+			case SPU_struct::REGS::LOM_CH1:
+				outbufFlagsL = (1 << 1);
+				break;
+			case SPU_struct::REGS::LOM_CH3:
+				outbufFlagsL = (1 << 3);
+				break;
+			case SPU_struct::REGS::LOM_CH1_PLUS_CH3:
+				outbufFlagsL = (1 << 1) | (1 << 3);
+				break;
+		}
+		switch(SPU->regs.ctl_right)
+		{
+			case SPU_struct::REGS::ROM_CH1:
+				outbufFlagsR = (1 << 1);
+				break;
+			case SPU_struct::REGS::ROM_CH3:
+				outbufFlagsR = (1 << 3);
+				break;
+			case SPU_struct::REGS::ROM_CH1_PLUS_CH3:
+				outbufFlagsR = (1 << 1) | (1 << 3);
+				break;
+		}
 
-			// Mix audio
-			_SPU_ChanUpdate(!CommonSettings.spu_muteChannels[i] && actuallyMix, SPU, chan);
+		// Generate mixing pointers
+		// This setup is so we can clear everything in a single memset() call
+		// PONDER: Can we put these on the stack?
+		// ie. u8 mixdata[FIFO_SIZE * (sizeof(s32)+sizeof(s32)+sizeof(s16)+sizeof(s16))*2]
+		int mixBufSize = MIN(length, SPUCAPTURE_FIFO_SIZE);
+		u8 *nextdata = mixdata;
+		if(actuallyMix)  mixbuf      = (s32*)nextdata, nextdata += mixBufSize * sizeof(s32)*2;
+		if(captureMuted) mutedmixbuf = (s32*)nextdata, nextdata += mixBufSize * sizeof(s32)*2;
+		if(isCapturing)  capbuf      = (s16*)nextdata, nextdata += mixBufSize * sizeof(s16)*2;
+		if(actuallyMix)  chanbuf     = (s16*)nextdata; // <- Do not increment nextData
+		mixdataClearSizeBytes = nextdata - mixdata;
+	}
+	else
+	{
+		// If we end up here, we're either mixing the core SPU while
+		// in dual SPU mode (meaning we shouldn't output data from
+		// the capture unit, as we'll do this in the user SPU), or
+		// the output core is SNDDummy (with the capture units either
+		// not running, or disabled via ENABLE_DUMMY_SPU_CAPTURE==0).
+		// In the former case, we can just seek the capture position,
+		// but in the latter case, we must output silence to avoid
+		// potentially leaving the capture buffers filled with garbage.
+		bool captureZeros = (SPU_SoundCore() == &SNDDummy);
+		if(SPU->regs.cap[0].runtime.running)
+		{
+			if(captureZeros) _SPU_WriteCapture<false>(SPU->regs.cap[0], SPU->channels[1], NULL, length);
+			else             _SPU_SeekCapture        (SPU->regs.cap[0], SPU->channels[1],       length);
+		}
+		if(SPU->regs.cap[1].runtime.running)
+		{
+			if(captureZeros) _SPU_WriteCapture<false>(SPU->regs.cap[1], SPU->channels[3], NULL, length);
+			else             _SPU_SeekCapture        (SPU->regs.cap[1], SPU->channels[3],       length);
 		}
+	}
 
-		//zero out capture buffers - effectively transform no-advanced-spu-emulation to capturing-zeroes
-		//this is needed so when the option is changed (or a state with a different setting is loaded)
-		//this code is bulkier and slower than it might otherwise be to reduce the chance of bugs 
-		//IDEALLY the non-advanced codepath would be removed (while the advanced codepath was optimized and improved)
-		//and this code would disappear, to be replaced with code more capable of emitting zeroes at the opportune time.
-		for (int capchan = 0; capchan < 2; capchan++)
+	while(length)
+	{
+		if(mixdataClearSizeBytes) memset(mixdata, 0, mixdataClearSizeBytes);
+
+		// We can only process at most SPUCAPTURE_FIFO_SIZE samples
+		// per mixing batch, in case the capture buffers wrap around.
+		// Technically, we could actually check if this is needed at
+		// all, but this should work well enough as is.
+		int thisLength = MIN(length, SPUCAPTURE_FIFO_SIZE);
+		length -= thisLength;
+
+		// Process each channel in turn
+		// Note that we are using unsigned overflow to avoid counting
+		// directly, as we need to keep track of the bit index anyway
+		channel_struct *chan = SPU->channels;
+		for(u16 chanBit=1; chanBit != 0; chan++, chanBit <<= 1)
 		{
-			SPU_struct::REGS::CAP& cap = SPU->regs.cap[capchan];
-			channel_struct& srcChan = SPU->channels[1 + 2 * capchan];
-			if (cap.runtime.running)
+			if (chan->status != CHANSTAT_PLAY) continue;
+		
+			// Generate data into chanbuf[]
+			// NOTE: If actuallyMix==false, the channel is updated but no data is generated.
+			_SPU_GenerateChanData(actuallyMix, chan, chanbuf, thisLength);
+			if(!actuallyMix) continue;
+
+			// Bypass means we must NOT mix this channel into mixbuf[] OR mutedmixbuf[]
+			if((bypassMixer & chanBit) == 0)
 			{
-				for (int samp = 0; samp < length; samp++)
+				s32 *mixtarget = ((chanMuteFlags & chanBit) == 0) ? mixbuf : mutedmixbuf;
+				if(mixtarget)
 				{
-					u32 nSamplesToProcess = srcChan.sampincInt + AddAndReturnCarry(&cap.runtime.sampcntFrac, srcChan.sampincFrac);
-					cap.runtime.sampcntInt += nSamplesToProcess;
-					while (nSamplesToProcess--)
-					{
-						if (cap.bits8)
-						{
-							_MMU_write08<1,MMU_AT_DMA>(cap.runtime.curdad,0);
-							cap.runtime.curdad++;
-						}
-						else
-						{
-							_MMU_write16<1,MMU_AT_DMA>(cap.runtime.curdad,0);
-							cap.runtime.curdad+=2;
-						}
-
-						if (cap.runtime.curdad >= cap.runtime.maxdad)
-						{
-							cap.runtime.curdad = cap.dad;
-							cap.runtime.sampcntInt -= cap.len*(cap.bits8?4:2);
-						}
-					}
+					for(int n=0; n < thisLength*2; n++) mixtarget[n] += chanbuf[n];
 				}
 			}
-		}
-	} //non-advanced branch
 
-	//we used to bail out if speakers were disabled.
-	//this is technically wrong. sound may still be captured, or something.
-	//in all likelihood, any game doing this probably master disabled the SPU also
-	//so, optimization of this case is probably not necessary.
-	//later, we'll just silence the output
-	bool speakers = T1ReadWord(MMU.ARM7_REG, 0x304) & 0x01;
+			// Generate outputs for channel capture
+			// Yes, we have to undo the panning here, but that's fine.
+			// Incidentally, this emulates the ch(a)+ch(b) overflow bug
+			if((capbufFlags0 & chanBit) != 0)
+			{
+				if((capbufFlags0 & (chanBit-1)) == 0)
+					for(int n=0; n < thisLength; n++)
+						capbuf[n*2+0]  = chanbuf[n*2+0] + chanbuf[n*2+1];
+				else
+					for(int n=0; n < thisLength; n++)
+						capbuf[n*2+0] += chanbuf[n*2+0] + chanbuf[n*2+1];
+			}
+			if((capbufFlags1 & chanBit) != 0)
+			{
+				if((capbufFlags1 & (chanBit-1)) == 0)
+					for(int n=0; n < thisLength; n++)
+						capbuf[n*2+1]  = chanbuf[n*2+0] + chanbuf[n*2+1];
+				else
+					for(int n=0; n < thisLength; n++)
+						capbuf[n*2+1] += chanbuf[n*2+0] + chanbuf[n*2+1];
+			}
 
-	u8 vol = SPU->regs.mastervol;
+			// If speakers are turned off or the channel is muted, we can skip
+			// setting outbuf[]. Note that if the channel is muted and we are
+			// generating outbuf[] from chanbuf[], outbuf[] must already have
+			// been cleared to silence.
+			if(!speakersOn || (chanMuteFlags & chanBit) != 0) continue;
 
-	// convert from 32-bit->16-bit
-	if (actuallyMix && speakers)
-		for (int i = 0; i < length*2; i++)
+			// Set outbuf[] from chanbuf[] when L/R source is not the mixer
+			// Note that Ch1+Ch3 mode clips as intended; only capture has overflow bugs
+			if((outbufFlagsL & chanBit) != 0)
+			{
+				if((outbufFlagsL & (chanBit-1)) == 0)
+					for(int n=0; n < thisLength; n++)
+						outbuf[n*2+0] =                        (chanbuf[n*2+0] * masterVol >> 7);
+				else
+					for(int n=0; n < thisLength; n++)
+						outbuf[n*2+0] = MinMax(outbuf[n*2+0] + (chanbuf[n*2+0] * masterVol >> 7), -0x8000, +0x7FFF);
+			}
+			if((outbufFlagsR & chanBit) != 0)
+			{
+				if((outbufFlagsR & (chanBit-1)) == 0)
+					for(int n=0; n < thisLength; n++)
+						outbuf[n*2+1] =                        (chanbuf[n*2+1] * masterVol >> 7);
+				else
+					for(int n=0; n < thisLength; n++)
+						outbuf[n*2+1] = MinMax(outbuf[n*2+1] + (chanbuf[n*2+1] * masterVol >> 7), -0x8000, +0x7FFF);
+			}
+		}
+
+		// Generate mixer output to outbuf[]
+		if(mixbuf && speakersOn)
 		{
-			// Apply Master Volume
-			SPU->sndbuf[i] = spumuldiv7(SPU->sndbuf[i], vol);
-			s16 outsample = MinMax(SPU->sndbuf[i],-0x8000,0x7FFF);
-			SPU->outbuf[i] = outsample;
+			if(outbufFlagsL == 0)
+			{
+				for(int n=0; n < thisLength; n++)
+					outbuf[n*2+0] = MinMax(mixbuf[n*2+0] * masterVol >> 7, -0x8000, +0x7FFF);
+			}
+			if(outbufFlagsR == 0)
+			{
+				for(int n=0; n < thisLength; n++)
+					outbuf[n*2+1] = MinMax(mixbuf[n*2+1] * masterVol >> 7, -0x8000, +0x7FFF);
+			}
 		}
 
+		// Generate final capture output
+		if(cap0Src != CAPSRC_NONE)
+		{
+			if(cap0Src == CAPSRC_MIXER)
+			{
+				if(mutedmixbuf)
+					for(int n=0; n < thisLength; n++)
+						capbuf[n*2+0] = MinMax(mixbuf[n*2+0] + mutedmixbuf[n*2+0], -0x8000, +0x7FFF);
+				else
+					for(int n=0; n < thisLength; n++)
+						capbuf[n*2+0] = MinMax(mixbuf[n*2+0],                      -0x8000, +0x7FFF);
+			}
+			bool run = _SPU_WriteCapture<true>(SPU->regs.cap[0], SPU->channels[1], capbuf, thisLength);
+			if(!run) cap0Src = CAPSRC_NONE, capbufFlags0 = 0;
+		}
+		if(cap1Src != CAPSRC_NONE)
+		{
+			if(cap1Src == CAPSRC_MIXER)
+			{
+				if(mutedmixbuf)
+					for(int n=0; n < thisLength; n++)
+						capbuf[n*2+1] = MinMax(mixbuf[n*2+1] + mutedmixbuf[n*2+1], -0x8000, +0x7FFF);
+				else
+					for(int n=0; n < thisLength; n++)
+						capbuf[n*2+1] = MinMax(mixbuf[n*2+1],                      -0x8000, +0x7FFF);
+			}
+			bool run = _SPU_WriteCapture<true>(SPU->regs.cap[1], SPU->channels[3], capbuf+1, thisLength);
+			if(!run) cap1Src = CAPSRC_NONE, capbufFlags1 = 0;
+		}
 
+		// Advance buffer
+		outbuf += (size_t)thisLength*2;
+	}
 }
 
 //////////////////////////////////////////////////////////////////////////////
 
 
-//emulates one hline of the cpu core.
-//this will produce a variable number of samples, calculated to keep a 44100hz output
-//in sync with the emulator framerate
-void SPU_Emulate_core()
+// Advances the core SPU by the specified number of ARM7 cycles and passes its output buffer to the sound core.
+// Returns the number of samples processed to sync to DESMUME_SAMPLE_RATE (0 when the update was postponed).
+int SPU_Emulate_core(u32 numberOfARM7Cycles)
 {
-	bool needToMix = true;
 	SoundInterface_struct *soundProcessor = SPU_SoundCore();
-	
-	_samples += samples_per_hline;
-	spu_core_samples = (int)(_samples);
-	_samples -= spu_core_samples;
-	
+
 	// We don't need to mix audio for Dual Synch/Asynch mode since we do this
 	// later in SPU_Emulate_user(). Disable mixing here to speed up processing.
-	// However, recording still needs to mix the audio, so make sure we're also
-	// not recording before we disable mixing.
-	if ( _currentSynchMode == ESynchMode_DualSynchAsynch &&
-		!(driver->AVI_IsRecording() || driver->WAV_IsRecording()) )
+	// If we are outputting to the dummy core, we can disable all mixing if
+	// we are not capturing the output at all, increasing performance.
+	bool needToMix = false;
+	if(soundProcessor != &SNDDummy)
+		needToMix = (_currentSynchMode != ESynchMode_DualSynchAsynch);
+#if ENABLE_DUMMY_SPU_CAPTURE
+	else if(SPU_core->regs.cap[0].runtime.running || SPU_core->regs.cap[1].runtime.running)
+		needToMix = true;
+#endif
+
+	// If we are recording, we will need to mix the core SPU regardless of anything else
+	// NOTE: Technically, we should be checking wavWriter.mode==WAVMODE_CORE, but that
+	// is only enabled with DEVELOPER_MENU_ITEMS, and we won't break anything anyway.
+	needToMix = needToMix || driver->AVI_IsRecording() || driver->WAV_IsRecording();
+
+	// NOTE: We used to keep a double-type counter here, and pre-divided by
+	// ARM7_CLOCK. This is probably enough for most cases, but for the sake
+	// of perfect accuracy (at least in regards to this), we use a cycles
+	// counter instead here, and figure out the sample count from there.
+	int samplesToMix;
 	{
-		needToMix = false;
+		// minMixSize is the minimum number of samples to accumulate before
+		// updating; a larger value reduces the overhead of the update
+		// routines at the cost of synchronicity (higher mixing latency).
+		// NOTE: minMixSize must be <= 128, or else _spu_core_cyclesCounter
+		// would cause 32bit overflow if we postpone mixing for long enough.
+		// We could use a 64bit counter instead, but 128 samples should be plenty.
+		static const u32 doMix_minMixSize = 1; // <- Setting this too high can break streams, so keep at minimum
+		static const u32 noMix_minMixSize = 64;
+		u64 minDeltaCycles = (u64)(needToMix ? doMix_minMixSize : noMix_minMixSize) * ARM7_CLOCK; // threshold, in the same (cycles * sample rate) units as the counter
+		u64 cycles64 = _spu_core_cyclesCounter + (u64)numberOfARM7Cycles*DESMUME_SAMPLE_RATE; // carried-over remainder plus this call's cycles scaled by the sample rate
+		if(cycles64 < minDeltaCycles)
+		{
+			_spu_core_cyclesCounter = (u32)cycles64; // below the threshold: keep accumulating and do nothing this call
+			return 0;
+		}
+		samplesToMix            = (int)(cycles64 / ARM7_CLOCK); // whole samples elapsed
+		_spu_core_cyclesCounter = (u32)(cycles64 % ARM7_CLOCK); // remainder carried over to the next call
 	}
 	
-	SPU_MixAudio(needToMix, SPU_core, spu_core_samples);
+	SPU_MixAudio(needToMix, SPU_core, samplesToMix); // NOTE(review): SPU_MixAudio only calls resizeBuffer() when mixing; with needToMix==false samplesToMix is >= 64, so confirm outbuf is large enough for the FetchSamples calls below
 	
-	if (soundProcessor == NULL)
+	if (soundProcessor != NULL)
 	{
-		return;
-	}
-	
-	if (soundProcessor->FetchSamples != NULL)
-	{
-		soundProcessor->FetchSamples(SPU_core->outbuf, spu_core_samples, _currentSynchMode, _currentSynchronizer);
-	}
-	else
-	{
-		SPU_DefaultFetchSamples(SPU_core->outbuf, spu_core_samples, _currentSynchMode, _currentSynchronizer);
+		if (soundProcessor->FetchSamples != NULL)
+		{
+			soundProcessor->FetchSamples(SPU_core->outbuf, samplesToMix, _currentSynchMode, _currentSynchronizer);
+		}
+		else
+		{
+			SPU_DefaultFetchSamples(SPU_core->outbuf, samplesToMix, _currentSynchMode, _currentSynchronizer);
+		}
 	}
+
+	return samplesToMix;
 }
 
 void SPU_Emulate_user(bool mix)
@@ -1687,16 +1876,16 @@ void SPU_Emulate_user(bool mix)
 	// Check to see how many free samples are available.
 	// If there are some, fill up the output buffer.
 	freeSampleCount = soundProcessor->GetAudioSpace();
-	if (freeSampleCount == 0)
-	{
-		return;
-	}
 	
 	//printf("mix %i samples\n", audiosize);
 	if (freeSampleCount > _currentBufferSize)
 	{
 		freeSampleCount = _currentBufferSize;
 	}
+	if (freeSampleCount == 0)
+	{
+		return;
+	}
 	
 	// If needed, resize the post-process buffer to guarantee that
 	// we can store all the sound data.
@@ -1877,7 +2066,7 @@ void WavWriter::update(void* soundData, int numSamples)
 {
 	if(!spufp) return;
 	//TODO - big endian for the s16 samples??
-	size_t elems_written = fwrite(soundData, numSamples*2, 2, spufp);
+	size_t elems_written = fwrite(soundData, sizeof(s16)*2, numSamples, spufp);
 }
 
 bool WavWriter::isRecording() const
@@ -1929,14 +2118,14 @@ void WAV_WavSoundUpdate(void* soundData, int numSamples, WAVMode mode)
 void spu_savestate(EMUFILE &os)
 {
 	//version
-	os.write_32LE(7);
+	os.write_32LE(8);
 
 	SPU_struct *spu = SPU_core;
 
+	os.write_u8(SPUCHAN_PCM16B_SIZE);
 	for (int j = 0; j < 16; j++)
 	{
 		channel_struct &chan = spu->channels[j];
-		os.write_32LE(chan.num);
 		os.write_u8(chan.vol);
 		os.write_u8(chan.volumeDiv);
 		os.write_u8(chan.hold);
@@ -1952,15 +2141,13 @@ void spu_savestate(EMUFILE &os)
 		os.write_32LE(chan.length);
 		os.write_32LE(chan.sampcntFrac);
 		os.write_32LE(chan.sampcntInt);
-		os.write_32LE(chan.sampincFrac);
-		os.write_32LE(chan.sampincInt);
-		for (int i = 0; i < SPUINTERPOLATION_TAPS; i++) os.write_16LE(chan.pcm16b[i]);
-		os.write_32LE(chan.index);
+		for (int i = 0; i < SPUCHAN_PCM16B_SIZE; i++) os.write_16LE(chan.pcm16b[i]);
+		os.write_u8(chan.index);
 		os.write_16LE(chan.x);
 		os.write_u8(chan.keyon);
 	}
 
-	os.write_doubleLE(_samples);
+	os.write_32LE(_spu_core_cyclesCounter);
 
 	os.write_u8(spu->regs.mastervol);
 	os.write_u8(spu->regs.ctl_left);
@@ -1980,23 +2167,22 @@ void spu_savestate(EMUFILE &os)
 		os.write_32LE(spu->regs.cap[i].dad);
 		os.write_16LE(spu->regs.cap[i].len);
 		os.write_u8(spu->regs.cap[i].runtime.running);
-		os.write_32LE(spu->regs.cap[i].runtime.curdad);
-		os.write_32LE(spu->regs.cap[i].runtime.maxdad);
+		os.write_32LE(spu->regs.cap[i].runtime.dad);
 		os.write_32LE(spu->regs.cap[i].runtime.sampcntFrac);
 		os.write_32LE(spu->regs.cap[i].runtime.sampcntInt);
 	}
 
+	os.write_u8(SPUCAPTURE_FIFO_SIZE);
 	for (int i = 0; i < 2; i++)
-		spu->regs.cap[i].runtime.fifo.save(os);
+	{
+		os.write_u8(spu->regs.cap[i].runtime.pcm16bOffs);
+		for (int n = 0; n < SPUCAPTURE_FIFO_SIZE; n++)
+			os.write_16LE(spu->regs.cap[i].runtime.pcm16b[n]);
+	}
 }
 
 bool spu_loadstate(EMUFILE &is, int size)
 {
-	//note! if we load a state created with advanced spu logic on a system without it,
-	//there's a high likelihood of captured data existing.
-	//this would get played back forever without being replaced by captured data.
-	//it's been solved by capturing zeroes though even when advanced spu logic is disabled.
-	
 	//read version
 	u32 version;
 	if (is.read_32LE(version) != 1) return false;
@@ -2004,10 +2190,11 @@ bool spu_loadstate(EMUFILE &is, int size)
 	SPU_struct *spu = SPU_core;
 	reconstruct(&SPU_core->regs);
 
+	int pcm16bSz_Chan = (version >= 8) ? (int)is.read_u8() : 4;
 	for (int j = 0; j < 16; j++)
 	{
 		channel_struct &chan = spu->channels[j];
-		is.read_32LE(chan.num);
+		if(version < 8) is.read_32LE(chan.num); else chan.num = j;
 		is.read_u8(chan.vol);
 		is.read_u8(chan.volumeDiv);
 		if (chan.volumeDiv == 4) chan.volumeDiv = 3;
@@ -2017,7 +2204,7 @@ bool spu_loadstate(EMUFILE &is, int size)
 		is.read_u8(chan.repeat);
 		is.read_u8(chan.format);
 		is.read_u8(chan.status);
-		if (version >= 7) is.read_u8(chan.pcm16bOffs); else chan.pcm16bOffs = 0;
+		if (version >= 7) chan.pcm16bOffs = SPUCHAN_PCM16B_AT(is.read_u8());
 		is.read_32LE(chan.addr);
 		is.read_16LE(chan.timer);
 		is.read_16LE(chan.loopstart);
@@ -2027,54 +2214,45 @@ bool spu_loadstate(EMUFILE &is, int size)
 		if(version >= 7) {
 			is.read_32LE(chan.sampcntFrac);
 			is.read_32LE(chan.sampcntInt);
-			is.read_32LE(chan.sampincFrac);
-			is.read_32LE(chan.sampincInt);
+			if(version < 8) is.fseek(8, SEEK_CUR); // chan.sampincFrac (LE32), chan.sampincInt (LE32)
 		}
-		else if (version >= 2)
+		else /*if (version >= 2)*/ // <- This check (and its broken else clause) was never needed
 		{
-			double temp;
-			s64 temp2;
-			is.read_doubleLE(temp); temp2 = (s64)(temp * (1ll << 32));
-			chan.sampcntFrac = (u32)temp2;
-			chan.sampcntInt  = (s32)(temp2 >> 32);
-			is.read_doubleLE(temp); temp2 = (u64)(temp * (1ull << 32)); // Intentionally unsigned
-			chan.sampincFrac = (u32)temp2;
-			chan.sampincInt  = (u32)(temp2 >> 32);
-		}
-		else
-		{
-			// FIXME
-			// What even is supposed to be happening here?
-			// sampcnt and sampinc were double type before
-			// I even made any changes, so this is broken.
-			chan.sampcntFrac = 0;
-			is.read_32LE(chan.sampcntInt);
-			chan.sampincFrac = 0;
-			is.read_32LE(chan.sampincInt);
+			s64 temp = (s64)(is.read_doubleLE() * (double)(1ll << 32));
+			chan.sampcntFrac = (u32)temp;
+			chan.sampcntInt  = (s32)(temp >> 32);
+			is.fseek(8, SEEK_CUR); // chan.sampinc (LEdouble)
 		}
 		if (version >= 7) {
-			for (int i = 0; i < SPUINTERPOLATION_TAPS; i++) is.read_16LE(chan.pcm16b[i]);
+			for (int i = 0; i < pcm16bSz_Chan; i++) is.read_16LE(chan.pcm16b[SPUCHAN_PCM16B_AT(i)]);
 		}
 		else
 		{
 			is.fseek(4, SEEK_CUR);        // chan.lastsampcnt (LE32)
 			is.read_16LE(chan.pcm16b[0]); // chan.pcm16b
-			is.fseek(2, SEEK_CUR);        // chan.pcm16b_last
+			is.fseek(2, SEEK_CUR);        // chan.pcm16b_last (LE16)
 		}
-		is.read_32LE(chan.index);
+		chan.index = (version >= 8) ? is.read_u8() : (u8)is.read_s32LE();
 		is.read_16LE(chan.x);
 		if (version < 7) is.fseek(2, SEEK_CUR); // chan.psgnoise_last (LE16)
 
 		if (version >= 4)
 			is.read_u8(chan.keyon);
 
+		// Because we don't save sampinc, we need to recalculate it
+		adjust_channel_timer(&chan);
+
 		//hopefully trigger a recovery of the adpcm looping system
 		chan.loop_index = K_ADPCM_LOOPING_RECOVERY_INDEX;
 	}
 
-	if (version >= 2)
+	if (version >= 8)
 	{
-		is.read_doubleLE(_samples);
+		is.read_32LE(_spu_core_cyclesCounter);
+	}
+	else if (version >= 2)
+	{
+		_spu_core_cyclesCounter = (u32)(is.read_doubleLE() * ARM7_CLOCK); // _samples (doubleLE)
 	}
 
 	if (version >= 4)
@@ -2100,27 +2278,52 @@ bool spu_loadstate(EMUFILE &is, int size)
 			is.read_32LE(spu->regs.cap[i].dad);
 			is.read_16LE(spu->regs.cap[i].len);
 			is.read_u8(spu->regs.cap[i].runtime.running);
-			is.read_32LE(spu->regs.cap[i].runtime.curdad);
-			is.read_32LE(spu->regs.cap[i].runtime.maxdad);
+			if (version >= 8) is.read_32LE(spu->regs.cap[i].runtime.dad); // runtime base address; the cap[i].dad register was already loaded above
+			else {
+				is.fseek(4, SEEK_CUR); // regs.cap[i].runtime.curdad (LE32)
+				is.read_32LE(spu->regs.cap[i].runtime.dad); // regs.cap[i].runtime.maxdad
+				spu->regs.cap[i].runtime.dad -= spu->regs.cap[i].len*4; // old maxdad minus len*4 bytes gives the base address
+			}
 			if (version >= 7) {
 				is.read_32LE(spu->regs.cap[i].runtime.sampcntFrac);
 				is.read_32LE(spu->regs.cap[i].runtime.sampcntInt);
 			}
 			else
 			{
-				double temp;
-				u64 temp2;
-				is.read_doubleLE(temp); temp2 = (u64)(temp * (1ull << 32));
-				spu->regs.cap[i].runtime.sampcntFrac = (u32)temp2;
-				spu->regs.cap[i].runtime.sampcntInt  = (u32)(temp2 >> 32);
+				s64 temp = (s64)(is.read_doubleLE() * (double)(1ll << 32));
+				spu->regs.cap[i].runtime.sampcntFrac = (u32)temp;
+				spu->regs.cap[i].runtime.sampcntInt  = (s32)(temp >> 32);
+			}
+			if(version <= 7)
+			{
+				// Before, sampcnt incremented "as expected" and the FIFO
+				// delay was implemented within the SPUFifo construct.
+				// Now, though, we create the delay by setting sampcnt to
+				// -FIFO_SIZE on starting capture, so account for this here.
+				spu->regs.cap[i].runtime.sampcntInt -= SPUCAPTURE_FIFO_SIZE;
 			}
 		}
 	}
 
-	if (version >= 6)
-		for (int i=0;i<2;i++) spu->regs.cap[i].runtime.fifo.load(is);
-	else
-		for (int i=0;i<2;i++) spu->regs.cap[i].runtime.fifo.reset();
+	int pcm16bSz_Capture = (version >= 8) ? (int)is.read_u8() : 16;
+	if (version >= 8)
+		for (int i=0;i<2;i++)
+		{
+			spu->regs.cap[i].runtime.pcm16bOffs = SPUCAPTURE_PCM16B_AT(is.read_u8());
+			for (int n = 0; n < pcm16bSz_Capture; n++)
+				is.read_16LE(spu->regs.cap[i].runtime.pcm16b[SPUCAPTURE_PCM16B_AT(n)]);
+		}
+	else if (version >= 6)
+		for (int i=0;i<2;i++)
+		{
+			// Setting pcm16bOffs to -fifo.size ensures that we always
+			// fill at the correct offset relative to the FIFO queue size
+			SPUFifo fifo;
+			fifo.load(is);
+			spu->regs.cap[i].runtime.pcm16bOffs = (u8)(-fifo.size);
+			for (int n = 0; n < 16; n++)
+				spu->regs.cap[i].runtime.pcm16b[SPUCAPTURE_PCM16B_AT(n)] = fifo.dequeue();
+		}
 
 	//older versions didnt store a mastervol; 
 	//we must reload this or else games will start silent
diff --git a/desmume/src/SPU.h b/desmume/src/SPU.h
index 5ba6d1c96..7ab7a470f 100644
--- a/desmume/src/SPU.h
+++ b/desmume/src/SPU.h
@@ -36,12 +36,16 @@ class EMUFILE;
 #define CHANSTAT_STOPPED          0
 #define CHANSTAT_PLAY             1
 
-#define SPUINTERPOLATION_TAPS 4 // Must be at least 4 for Catmull-Rom interpolation
+#define SPUCHAN_PCM16B_SIZE   4 // Must be 2^n, and at least 4 for Catmull-Rom interpolation
+#define SPUCAPTURE_FIFO_SIZE 16 // Must be 2^n
 
-//who made these static? theyre used in multiple places.
-FORCEINLINE s32 spumuldiv7(s32 val, u8 multiplier) {
-	assert(multiplier <= 127);
-	return (multiplier == 127) ? val : ((val * multiplier) >> 7);
+// Maps 127 to 128 so that the maximum value acts as exactly 128/128; smaller values pass through unchanged. Needed for volume/pan/etc. calculations
+template<typename T>
+FORCEINLINE T spumuladjust7(T x)
+{
+	// x must not exceed 127 (asserted below), so only x == 127 gets +1. Using >= can result in better code on some platforms
+	assert(x <= 127);
+	return x + (x >= (T)127);
 }
 
 enum SPUInterpolationMode
@@ -95,6 +99,7 @@ struct channel_struct
 						sampcntInt(0),
 						sampincFrac(0),
 						sampincInt(0),
+						pcm16b(),
 						loop_pcm16b(0),
 						index(0),
 						loop_index(0),
@@ -121,11 +126,11 @@ struct channel_struct
    s32 sampcntInt;
    u32 sampincFrac;
    u32 sampincInt;
-   s16 pcm16b[SPUINTERPOLATION_TAPS];
+   s16 pcm16b[SPUCHAN_PCM16B_SIZE];
    // ADPCM specific
    s16 loop_pcm16b;
-   s32 index;
-   int loop_index;
+   u8  index;
+   u8  loop_index;
    // PSG noise
    u16 x;
 };
@@ -146,12 +151,9 @@ class SPUFifo
 class SPU_struct
 {
 public:
-	SPU_struct(int buffersize);
-   u32 bufpos;
-   u32 buflength;
-   s32 *sndbuf;
-   s32 lastdata; //the last sample that a channel generated
-   s16 *outbuf;
+	SPU_struct();
+   s32 *mixdata; // Mixing buffers
+   s16 *outbuf;  // Device output source (L,R)
    u32 bufsize;
    channel_struct channels[16];
 
@@ -192,19 +194,22 @@ class SPU_struct
 		   u16 len;
 		   struct Runtime {
 			   Runtime()
-				   : running(0), curdad(0), maxdad(0)
+				   : running(0), pcm16bOffs(0), dad(0), len(0), sampcntFrac(0), sampcntInt(0), pcm16b()
 			   {}
+
 			   u8 running;
-			   u32 curdad;
-			   u32 maxdad;
+			   u8 pcm16bOffs; // Offset into pcm16b[]; spu_loadstate describes it as the FIFO fill offset
+			   u32 dad; // Base address; soundView shows dad + sampcntInt * (1 or 2 bytes per sample) as the current address
+			   u32 len; // NOTE(review): presumably the capture length latched for the running capture - confirm in SPU.cpp
 			   u32 sampcntFrac;
-			   u32 sampcntInt;
-			   SPUFifo fifo;
+			   s32 sampcntInt; // Signed: set to -SPUCAPTURE_FIFO_SIZE when capture starts, to create the FIFO delay
+			   s16 pcm16b[SPUCAPTURE_FIFO_SIZE]; // Sample buffer that replaces the old SPUFifo member
 		   } runtime;
 	   } cap[2];
    } regs;
 
    void reset();
+   void resizeBuffer(int buffersize);
    ~SPU_struct();
    void KeyOff(int channel);
    void KeyOn(int channel);
@@ -223,7 +228,6 @@ class SPU_struct
 };
 
 extern SPU_struct *SPU_core, *SPU_user;
-extern int spu_core_samples;
 
 int SPU_ChangeSoundCore(int coreid, int newBufferSizeBytes);
 SoundInterface_struct *SPU_SoundCore();
@@ -236,7 +240,7 @@ void SPU_SetSynchMode(int mode, int method);
 void SPU_ClearOutputBuffer(void);
 void SPU_Reset(void);
 void SPU_DeInit(void);
-void SPU_KeyOn(int channel);
+
 static FORCEINLINE void SPU_WriteByte(u32 addr, u8 val)
 {
 	addr &= 0xFFF;
@@ -264,7 +268,8 @@ static FORCEINLINE void SPU_WriteLong(u32 addr, u32 val)
 static FORCEINLINE u8 SPU_ReadByte(u32 addr) { return SPU_core->ReadByte(addr & 0x0FFF); }
 static FORCEINLINE u16 SPU_ReadWord(u32 addr) { return SPU_core->ReadWord(addr & 0x0FFF); }
 static FORCEINLINE u32 SPU_ReadLong(u32 addr) { return SPU_core->ReadLong(addr & 0x0FFF); }
-void SPU_Emulate_core(void);
+
+int SPU_Emulate_core(u32 numberOfARM7Cycles); // Result is used by the caller as the sample count for SPU_core->outbuf (0 = nothing to forward)
 void SPU_Emulate_user(bool mix = true);
 void SPU_DefaultFetchSamples(s16 *sampleBuffer, size_t sampleCount, ESynchMode synchMode, ISynchronizingAudioBuffer *theSynchronizer);
 size_t SPU_DefaultPostProcessSamples(s16 *postProcessBuffer, size_t requestedSampleCount, ESynchMode synchMode, ISynchronizingAudioBuffer *theSynchronizer);
diff --git a/desmume/src/frontend/windows/soundView.cpp b/desmume/src/frontend/windows/soundView.cpp
index 786b789a1..d0e9a1424 100644
--- a/desmume/src/frontend/windows/soundView.cpp
+++ b/desmume/src/frontend/windows/soundView.cpp
@@ -141,7 +141,7 @@ void SoundView_Refresh(bool forceRedraw)
 		InvalidateRect(GetDlgItem(hDlg, IDC_SOUND0PANBAR+chanId), NULL, FALSE);
 		if(thischan.status != CHANSTAT_STOPPED)
 		{
-			volBar[chan] = spumuldiv7(128, thischan.vol) >> volume_shift[thischan.volumeDiv];
+			volBar[chan] = spumuladjust7(thischan.vol) >> volume_shift[thischan.volumeDiv];
 			InvalidateRect(GetDlgItem(hDlg, IDC_SOUND0VOLBAR+chanId), NULL, FALSE);
 
 			if(SoundView_Data->volModeAlternate) 
@@ -274,7 +274,7 @@ void SoundView_Refresh(bool forceRedraw)
 		sprintf(buf,"%08X",cap0.len);
 		SetDlgItemText(hDlg,IDC_CAP0_LEN,buf);
 
-		sprintf(buf,"%08X",cap0.runtime.curdad);
+		sprintf(buf,"%08X",cap0.runtime.dad+cap0.runtime.sampcntInt*(cap0.bits8 ? 1 : 2));
 		SetDlgItemText(hDlg,IDC_CAP0_CURDAD,buf);
 
 		memcpy(&oldCap[0], &cap0, sizeof(SPU_struct::REGS::CAP));
@@ -306,7 +306,7 @@ void SoundView_Refresh(bool forceRedraw)
 		sprintf(buf,"%08X",cap1.len);
 		SetDlgItemText(hDlg,IDC_CAP1_LEN,buf);
 
-		sprintf(buf,"%08X",cap1.runtime.curdad);
+		sprintf(buf,"%08X",cap1.runtime.dad+cap1.runtime.sampcntInt*(cap1.bits8 ? 1 : 2));
 		SetDlgItemText(hDlg,IDC_CAP1_CURDAD,buf);
 
 		memcpy(&oldCap[1], &cap1, sizeof(SPU_struct::REGS::CAP));
@@ -319,19 +319,23 @@ void SoundView_Refresh(bool forceRedraw)
 static void updateMute_toSettings(HWND hDlg, int chan)
 {
 	for(int chanId = 0; chanId < 8; chanId++)
-		CommonSettings.spu_muteChannels[chanId+chanOfs()] = IsDlgButtonChecked(hDlg, IDC_SOUND0MUTE+chanId) == BST_CHECKED;
+	{
+		const u16 chanMask = 1 << (chanId+chanOfs()); // mask bit for the channel shown in this row
+		const bool isMuted = IsDlgButtonChecked(hDlg, IDC_SOUND0MUTE+chanId) == BST_CHECKED;
+		CommonSettings.spu_muteChannels = isMuted ? (CommonSettings.spu_muteChannels | chanMask) : (CommonSettings.spu_muteChannels & ~chanMask);
+	}
 }
 
 static void updateMute_allFromSettings(HWND hDlg)
 {
 	for(int chanId = 0; chanId < 16; chanId++)
-		CheckDlgItem(hDlg,IDC_SOUND0MUTE+chanId,CommonSettings.spu_muteChannels[chanId]);
+		CheckDlgItem(hDlg,IDC_SOUND0MUTE+chanId,((CommonSettings.spu_muteChannels >> chanId) & 1) != 0); // one mute bit per channel
 }
 
 static void updateMute_fromSettings(HWND hDlg)
 {
 	for(int chanId = 0; chanId < 8; chanId++)
-		CheckDlgItem(hDlg,IDC_SOUND0MUTE+chanId,CommonSettings.spu_muteChannels[chanId+chanOfs()]);
+		CheckDlgItem(hDlg,IDC_SOUND0MUTE+chanId,((CommonSettings.spu_muteChannels >> (chanId+chanOfs())) & 1) != 0); // bit of the channel shown in this row
 }
 static void SoundView_SwitchChanOfs(SoundView_DataStruct *data)
 {
@@ -435,7 +439,7 @@ static INT_PTR CALLBACK SoundView_DlgProc(HWND hDlg, UINT uMsg, WPARAM wParam, L
 			}
 
 			for(int chanId = 0; chanId < 8; chanId++) {
-				if(CommonSettings.spu_muteChannels[chanId])
+				if((CommonSettings.spu_muteChannels & (1<<chanId)) != 0)
 					SendDlgItemMessage(hDlg, IDC_SOUND0MUTE+chanId, BM_SETCHECK, TRUE, 0);
 			}
 
@@ -483,14 +487,12 @@ static INT_PTR CALLBACK SoundView_DlgProc(HWND hDlg, UINT uMsg, WPARAM wParam, L
 			CommonSettings.spu_captureMuted = IsDlgButtonChecked(hDlg,IDC_SOUND_CAPTURE_MUTED) != 0;
 			return 1;
 		case IDC_SOUND_UNMUTE_ALL:
-			for(int i=0;i<16;i++) CommonSettings.spu_muteChannels[i] = false;
+			CommonSettings.spu_muteChannels = 0;
 			updateMute_allFromSettings(hDlg);
 			return 1;
 		case IDC_SOUND_ANALYZE_CAP:
 			printf("WTF\n");
-			for(int i=0;i<16;i++) CommonSettings.spu_muteChannels[i] = true;
-			CommonSettings.spu_muteChannels[1] = false;
-			CommonSettings.spu_muteChannels[3] = false;
+			CommonSettings.spu_muteChannels = (u16)(~0) &~ ((1 << 1) | (1 << 3));
 			CommonSettings.spu_captureMuted = true;
 			updateMute_allFromSettings(hDlg);
 			CheckDlgItem(hDlg,IDC_SOUND_CAPTURE_MUTED,CommonSettings.spu_captureMuted);