1 files changed, 2560 insertions, 0 deletions
diff --git a/src/libs/tiny/tinysound.h b/src/libs/tiny/tinysound.h
new file mode 100644
index 0000000..41d547d
--- /dev/null
+++ b/src/libs/tiny/tinysound.h
@@ -0,0 +1,2560 @@
+/*
+tinysound.h - v1.07
+
+Summary:
+tinysound is a C API for loading, playing, looping, panning and fading mono
+and stereo sounds. This means tinysound imparts no external DLLs or large
+libraries that adversely affect shipping size. tinysound can also run on
+Windows XP since DirectSound ships with all recent versions of Windows.
+tinysound implements a custom SSE2 mixer by explicitly locking and unlocking
+portions of an internal sound buffer. tinysound uses CoreAudio for Apple
+machines (like OSX and iOS). SDL is used for all other platforms. Define
+TS_FORCE_SDL before defining TS_IMPLEMENTATION in order to force the use of
+SDL.
+
+Revision history:
+1.0  (06/04/2016) initial release
+1.01 (06/06/2016) load WAV from memory
+separate portable and OS-specific code in tsMix
+fixed bug causing audio glitches when sounds ended
+added stb_vorbis loaders + demo example
+1.02 (06/08/2016) error checking + strings in vorbis loaders
+SSE2 implementation of mixer
+fix typos on docs/comments
+corrected volume bug introduced in 1.01
+1.03 (07/05/2016) size calculation helper (to know size of sound in
+bytes on the heap) tsSoundSize
+1.04 (12/06/2016) merged in Aaron Balint's contributions
+SFFT and pitch functions from Stephan M. Bernsee
+tsMix can run on its own thread with tsSpawnMixThread
+updated documentation, typo fixes
+fixed typo in malloc16 that caused heap corruption
+1.05 (12/08/2016) tsStopAllSounds, suggested by Aaron Balint
+1.06 (02/17/2017) port to CoreAudio for Apple machines
+1.07 (06/18/2017) SIMD the pitch shift code; swapped out old Bernsee
+code for a new re-write, updated docs as necessary,
+support for compiling as .c and .cpp on Windows,
+port for SDL (for Linux, or any other platform).
+Special thanks to DexP of github for 90% of the work
+on the SDL port!
+*/
+
+/*
+Contributors:
+Aaron Balint      1.04 - real time pitch
+1.04 - separate thread for tsMix
+1.04 - bugfix, removed extra free16 call for second channel
+DeXP              1.07 - initial work on SDL port
+*/
+
+/*
+To create implementation (the function definitions)
+#define TS_IMPLEMENTATION
+in *one* C/CPP file (translation unit) that includes this file
+
+DOCUMENTATION (very quick intro):
+1. create context
+2. load sounds from disk into memory
+3. play sounds
+4. free context
+
+1. tsContext* ctx = tsMakeContext( hwnd, frequency, latency, seconds, N );
+2. tsLoadedSound loaded = tsLoadWAV( "path_to_file/filename.wav" );
+   tsPlaySoundDef def = tsMakeDef( &loaded );
+3. tsPlaySound( ctx, def );
+4. tsShutdownContext( ctx );
+
+DOCUMENTATION (longer introduction):
+tinysound consists of tsLoadedSounds, tsPlayingSounds and the tsContext.
+The tsContext encapsulates an OS sound API, as well as buffers + settings.
+tsLoadedSound holds raw samples of a sound. tsPlayingSound is an instance
+of a tsLoadedSound that represents a sound that can be played through the
+tsContext.
+
+There are two main versions of the API, the low-level and the high-level
+API. The low-level API does not manage any memory for tsPlayingSounds. The
+high level api holds a memory pool of playing sounds.
+
+High-level API:
+First create a context and pass in non-zero to the final parameter. This
+final parameter controls how large of a memory pool to use for tsPlayingSounds.
+Here's an example where N is the size of the internal pool:
+
+tsContext* ctx = tsMakeContext( hwnd, frequency, latency, seconds, N );
+
+We create tsPlayingSounds indirectly with tsPlayDef structs. tsPlayDef is a
+POD struct so feel free to make them straight on the stack. The tsPlayDef
+sets up initialization parameters. Here's an example to load a wav and
+play it:
+
+tsLoadedSound loaded = tsLoadWAV( "path_to_file/filename.wav" );
+tsPlaySoundDef def = tsMakeDef( &loaded );
+tsPlayingSound* sound = tsPlaySound( ctx, def );
+
+The same def can be used to play as many sounds as desired (even simultaneously)
+as long as the context playing sound pool is large enough.
+
+Low-level API:
+First create a context and pass 0 in the final parameter (0 here means
+the context will *not* allocate a tsPlayingSound memory pool):
+
+tsContext* ctx = tsMakeContext( hwnd, frequency, latency, seconds, 0 );
+
+parameters:
+hwnd           --  HWND, handle to window (on OSX just pass in 0)
+frequency      --  int, represents Hz frequency rate in which samples are played
+latency        --  int, estimated latency in Hz from PlaySound call to speaker output
+seconds        --  int, number of seconds of samples internal buffers can hold
+0 (last param) --  int, number of elements in tsPlayingSound pool
+
+We create a tsPlayingSound like so:
+tsLoadedSound loaded = tsLoadWAV( "path_to_file/filename.wav" );
+tsPlayingSound playing_sound = tsMakePlayingSound( &loaded );
+
+Then to play the sound we do:
+tsInsertSound( ctx, &playing_sound );
+
+The above tsInsertSound function call will place playing_sound into
+a singly-linked list inside the context. The context will remove
+the sound from its internal list when it finishes playing.
+
+WARNING: The high-level API cannot be mixed with the low-level API. If you
+try then the internal code will assert and crash. Pick one and stick with it.
+Usually the high-level API will be used, but if someone is *really* picky about
+their memory usage, or wants more control, the low-level API can be used.
+
+Here is the Low-Level API:
+tsPlayingSound tsMakePlayingSound( tsLoadedSound* loaded );
+void tsInsertSound( tsContext* ctx, tsPlayingSound* sound );
+
+Here is the High-Level API:
+tsPlayingSound* tsPlaySound( tsContext* ctx, tsPlaySoundDef def );
+tsPlaySoundDef tsMakeDef( tsLoadedSound* sound );
+void tsStopAllSounds( tsContext* ctx );
+
+Be sure to link against dsound.dll (or dsound.lib) on Windows.
+
+Read the rest of the header for specific details on all available functions
+and struct types.
+*/
+
+/*
+Known Limitations:
+
+* PCM mono/stereo are the only formats the LoadWAV function supports. I don't
+guarantee it will work for all kinds of wav files, but it certainly does for the common
+kind (and can be changed fairly easily if someone wanted to extend it).
+* Only supports 16 bits per sample.
+* Mixer does not do any fancy clipping. The algorithm is to convert all 16 bit samples
+to float, mix all samples, and write back to audio API as 16 bit integers. In
+practice this works very well and clipping is not often a big problem.
+* I'm not super familiar with good ways to avoid the DirectSound play cursor from going
+past the write cursor. To mitigate this pass in a larger number to tsMakeContext's 4th
+parameter (buffer scale in seconds).
+* Pitch shifting code is pretty darn expensive. This is due to the use of a Fast Fourier Transform
+routine. The pitch shifting itself is written in rather efficient SIMD using SSE2 intrinsics,
+but the FFT routine is very basic. FFT is a big bottleneck for pitch shifting. There is a
+TODO optimization listed in this file for the FFT routine, but it's fairly low priority;
+optimizing FFT routines is difficult and requires a lot of specialized knowledge.
+*/
+
+/*
+FAQ
+Q : Why DirectSound instead of (insert API here) on Windows?
+A : Casey Muratori documented DS on Handmade Hero, other APIs do not have such good docs. DS has
+shipped on Windows XP all the way through Windows 10 -- using this header effectively intro-
+duces zero dependencies for the foreseeable future. The DS API itself is sane enough to quickly
+implement needed features, and users won't hear the difference between various APIs. Latency is
+not that great with DS but it is shippable. Additionally, many other APIs will in the end speak
+to Windows through the DS API.
+
+Q : Why not include Linux support?
+A : There have been a couple requests for ALSA support on Linux. For now the only option is to use
+SDL backend, which can indirectly support ALSA. SDL is used only in a very low-level manner;
+to get sound samples to the sound card via callback, so there shouldn't be much in the way of
+considering SDL a good option for "name your flavor" of Linux backend.
+
+Q : I would like to use my own memory management, how can I achieve this?
+A : This header makes a couple uses of malloc/free, and malloc16/free16. Simply find these bits
+and replace them with your own memory allocation routines. They can be wrapped up into a macro,
+or call your own functions directly -- it's up to you. Generally these functions allocate fairly
+large chunks of memory, and not very often (if at all), with one exception: tsSetPitch is a very
+expensive routine and requires frequent dynamic memory management.
+*/
+
+/*
+Some past discussion threads:
+https://www.reddit.com/r/gamedev/comments/6i39j2/tinysound_the_cutest_library_to_get_audio_into/
+https://www.reddit.com/r/gamedev/comments/4ml6l9/tinysound_singlefile_c_audio_library/
+https://forums.tigsource.com/index.php?topic=58706.0
+*/
+
+#if !defined( TINYSOUND_H )
+
+#define TS_WINDOWS    1
+#define TS_MAC        2
+#define TS_UNIX        3
+#define TS_SDL        4
+
+#if defined( _WIN32 )
+#define TS_PLATFORM TS_WINDOWS
+#elif defined( __APPLE__ )
+#define TS_PLATFORM TS_MAC
+#else
+#define TS_PLATFORM TS_SDL
+
+// please note TS_UNIX is not directly supported
+// instead, unix-style OSes are encouraged to use SDL
+// see: https://www.libsdl.org/
+
+#endif
+
+// Use TS_FORCE_SDL to override the above macros and use
+// the SDL port.
+#ifdef TS_FORCE_SDL
+
+#undef TS_PLATFORM
+#define TS_PLATFORM TS_SDL
+
+#endif
+
+#include <stdint.h>
+
+// read this in the event of tsLoadWAV/tsLoadOGG errors
+// also read this in the event of certain errors from tsMakeContext
+extern const char* g_tsErrorReason;
+
+// stores a loaded sound in memory
+typedef struct
+{
+    // Number of samples per channel (the loaders divide the byte size of the
+    // sample data by channel_count * 2).
+    int sample_count;
+    // 1 for mono, 2 for stereo.
+    int channel_count;
+    // De-interleaved samples converted to float and stored as 16-byte aligned
+    // groups of four. channels[ 0 ] owns the allocation; for stereo sounds
+    // channels[ 1 ] points into that same allocation, for mono it is null.
+    // channels[ 0 ] is null when loading failed.
+    void* channels[2];
+} tsLoadedSound;
+
+struct tsPitchData;
+typedef struct tsPitchData tsPitchData;
+
+// represents an instance of a tsLoadedSound, can be played through the tsContext
+typedef struct tsPlayingSound
+{
+    // Non-zero while the sound is playing; read by tsIsActive, cleared by
+    // tsStopSound.
+    int active;
+    // Set by tsPauseSound (1 for paused).
+    int paused;
+    // Set by tsLoopSound (0 for no loop).
+    int looped;
+    // Left / right channel volume, set by tsSetVolume (never negative).
+    float volume0;
+    float volume1;
+    // Left / right pan weights; tsSetPan stores (1 - pan) and pan.
+    float pan0;
+    float pan1;
+    // Pitch factor set by tsSetPitch; 1.0f means unchanged.
+    float pitch;
+    // Per-channel pitch shifting state; null until needed.
+    // NOTE(review): allocation and release happen outside this part of the
+    // file (see tsRemoveFilter) -- confirm ownership there.
+    tsPitchData* pitch_filter[2];
+    // Starts at 0 in tsMakePlayingSound; presumably the playback cursor in
+    // samples -- confirm against tsMix.
+    int sample_index;
+    // The sample data this instance plays; not owned by the playing sound.
+    tsLoadedSound* loaded_sound;
+    // Link used for the context's singly-linked lists of playing sounds.
+    struct tsPlayingSound* next;
+} tsPlayingSound;
+
+// holds audio API info and other info
+struct tsContext;
+typedef struct tsContext tsContext;
+
+// The returned struct will contain a null pointer in tsLoadedSound::channels[ 0 ]
+// in the case of errors. Read g_tsErrorReason string for details on what happened.
+// Calls tsReadMemWAV internally.
+tsLoadedSound tsLoadWAV(const char* path);
+
+// Reads a WAV file from memory. Still allocates memory for the tsLoadedSound since
+// WAV format will interlace stereo, and we need separate data streams to do SIMD
+// properly.
+void tsReadMemWAV(const void* memory, tsLoadedSound* sound);
+
+// If stb_vorbis was included *before* tinysound go ahead and create
+// some functions for dealing with OGG files.
+#ifdef STB_VORBIS_INCLUDE_STB_VORBIS_H
+void tsReadMemOGG(const void* memory, int length, int* sample_rate, tsLoadedSound* sound);
+tsLoadedSound tsLoadOGG(const char* path, int* sample_rate);
+#endif
+
+// Uses free16 (aligned free, implemented later in this file) to free up both of
+// the channels stored within sound
+void tsFreeSound(tsLoadedSound* sound);
+
+// Returns the size, in bytes, of all heap-allocated memory for this particular
+// loaded sound
+int tsSoundSize(tsLoadedSound* sound);
+
+// playing_pool_count -- 0 to setup low-level API, non-zero to size the internal
+// memory pool for tsPlayingSound instances
+tsContext* tsMakeContext(void* hwnd, unsigned play_frequency_in_Hz, int latency_factor_in_Hz, int num_buffered_seconds, int playing_pool_count);
+void tsShutdownContext(tsContext* ctx);
+
+// Call tsSpawnMixThread once to setup a separate thread for the context to run
+// upon. The separate thread will continually call tsMix and perform mixing
+// operations.
+void tsSpawnMixThread(tsContext* ctx);
+
+// Use tsThreadSleepDelay to specify a custom sleep delay time.
+// A sleep will occur after each call to tsMix. By default YieldProcessor
+// is used, and no sleep occurs. Use a sleep delay to conserve CPU bandwidth.
+// A recommended sleep time is a little less than 1/2 your predicted 1/FPS.
+// 60 fps is 16 ms, so about 1-5 should work well in most cases.
+void tsThreadSleepDelay(tsContext* ctx, int milliseconds);
+
+// Call this manually, once per game tick recommended, if you haven't ever
+// called tsSpawnMixThread. Otherwise the thread will call tsMix itself.
+// NOTE: the declaration below takes only the context. The text that follows
+// describes a num_samples_to_write argument that is not part of this
+// signature: it is not used on Windows; on Mac it is used to push samples
+// into a circular buffer while CoreAudio simultaneously pulls samples off of
+// the buffer, and should be computed each update tick as
+// delta_time * play_frequency_in_Hz + 1.
+void tsMix(tsContext* ctx);
+
+// All of the functions in this next section should only be called if tsIsActive
+// returns true. Calling them otherwise probably won't do anything bad, but it
+// won't do anything at all. If a sound is active it resides in the context's
+// internal list of playing sounds.
+int tsIsActive(tsPlayingSound* sound);
+
+// Flags sound for removal. Upon next tsMix call will remove sound from playing
+// list. If high-level API used sound is placed onto the internal free list.
+void tsStopSound(tsPlayingSound* sound);
+
+void tsLoopSound(tsPlayingSound* sound, int zero_for_no_loop);
+void tsPauseSound(tsPlayingSound* sound, int one_for_paused);
+
+// lerp from 0 to 1, 0 full left, 1 full right
+void tsSetPan(tsPlayingSound* sound, float pan);
+
+// explicitly set volume of each channel. Can be used as panning (but it's
+// recommended to use the tsSetPan function for panning).
+void tsSetVolume(tsPlayingSound* sound, float volume_left, float volume_right);
+
+// Change pitch (not duration) of sound. pitch = 0.5f for one octave lower, pitch = 2.0f for one octave higher.
+// pitch at 1.0f applies no change. pitch settings farther away from 1.0f create more distortion and lower
+// the output sample quality. pitch can be adjusted in real-time for doppler effects and the like. Going beyond
+// 0.5f and 2.0f may require some tweaking the pitch shifting parameters, and is not recommended.
+
+// Additional important information about performance: This function
+// is quite expensive -- you have been warned! Try it out and be aware of how much CPU consumption it uses.
+// To avoid destroying the originally loaded sound samples, tsSetPitch will do a one-time allocation to copy
+// sound samples into a new buffer. The new buffer contains the pitch adjusted samples, and these will be played
+// through tsMix. This lets the pitch be modulated at run-time, but requires dynamically allocated memory. The
+// memory is freed once the sound finishes playing. If a one-time pitch adjustment is desired, for performance
+// reasons please consider doing an off-line pitch adjustment manually as a pre-processing step for your sounds.
+// Also, consider changing malloc16 and free16 to match your custom memory allocation needs. Try adjusting
+// TS_PITCH_QUALITY (must be a power of two) and see how this affects your performance.
+void tsSetPitch(tsPlayingSound* sound, float pitch);
+
+// Delays sound before actually playing it. Requires context to be passed in
+// since there's a conversion from seconds to samples per second.
+// If one were so inclined another version could be implemented like:
+// void tsSetDelay( tsPlayingSound* sound, float delay, int samples_per_second )
+void tsSetDelay(tsContext* ctx, tsPlayingSound* sound, float delay_in_seconds);
+
+// Portable sleep function
+void tsSleep(int milliseconds);
+
+// LOW-LEVEL API
+tsPlayingSound tsMakePlayingSound(tsLoadedSound* loaded);
+void tsInsertSound(tsContext* ctx, tsPlayingSound* sound);
+
+// HIGH-LEVEL API
+// Initialization parameters for a sound started through tsPlaySound. Fill one
+// in with tsMakeDef and then override individual fields as needed.
+// NOTE(review): the field meanings below are inferred from the matching
+// tsSet*/tsPauseSound/tsLoopSound functions -- confirm against tsPlaySound.
+typedef struct
+{
+    int paused;          // start paused when non-zero
+    int looped;          // loop when non-zero
+    float volume_left;   // left channel volume
+    float volume_right;  // right channel volume
+    float pan;           // 0 full left, 1 full right
+    float pitch;         // 1.0f for unchanged pitch
+    float delay;         // presumably seconds, as in tsSetDelay
+    tsLoadedSound* loaded; // sample data to play
+} tsPlaySoundDef;
+
+tsPlayingSound* tsPlaySound(tsContext* ctx, tsPlaySoundDef def);
+tsPlaySoundDef tsMakeDef(tsLoadedSound* sound);
+void tsStopAllSounds(tsContext* ctx);
+
+#define TINYSOUND_H
+#endif
+
+#ifdef TS_IMPLEMENTATION
+
+// Silences MSVC's deprecation warnings for fopen and friends. Guarded so a
+// definition already supplied by the including project is not redefined.
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+#include <stdlib.h>    // malloc, free
+#include <stdio.h>    // fopen, fclose
+#include <string.h>    // memcmp, memset, memcpy
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#if TS_PLATFORM == TS_WINDOWS
+
+#include <dsound.h>
+#undef PlaySound
+
+#if defined( _MSC_VER )
+#pragma comment( lib, "dsound.lib" )
+#endif
+
+#elif TS_PLATFORM == TS_MAC
+
+#include <CoreAudio/CoreAudio.h>
+#include <AudioUnit/AudioUnit.h>
+#include <pthread.h>
+#include <mach/mach_time.h>
+
+#else
+
+#include "SDL2/SDL.h"
+
+#endif
+
+// When X is false, stores the message Y in g_tsErrorReason and jumps to the
+// enclosing function's ts_err label. Every function using TS_CHECK must
+// therefore define a ts_err label.
+#define TS_CHECK( X, Y ) do { if ( !(X) ) { g_tsErrorReason = Y; goto ts_err; } } while ( 0 )
+// Hard stop used by TS_ASSERT: a trap on clang/Mac, a deliberate null write
+// everywhere else.
+#if TS_PLATFORM == TS_MAC && defined( __clang__ )
+#define TS_ASSERT_INTERNAL __builtin_trap( )
+#else
+#define TS_ASSERT_INTERNAL *(int*)0 = 0
+#endif
+#define TS_ASSERT( X ) do { if ( !(X) ) TS_ASSERT_INTERNAL; } while ( 0 )
+// Rounds X up to a multiple of Y (Y must be a power of two).
+// NOTE: X is not parenthesised, so the (size_t) cast binds only to the first
+// operand of an expression argument. malloc16 passes `p + 1` with a void* p
+// and relies on this; do not add parentheses without changing that call.
+#define TS_ALIGN( X, Y ) ((((size_t)X) + ((Y) - 1)) & ~((Y) - 1))
+// Rounds X down to a multiple of Y (Y must be a power of two).
+#define TS_TRUNC( X, Y ) ((size_t)(X) & ~((Y) - 1))
+
+const char* g_tsErrorReason;
+
+// Reads the entire file at `path` into a buffer obtained from malloc.
+//
+// path -- file to read (opened in binary mode)
+// size -- optional; receives the number of bytes read, or 0 on failure
+//
+// Returns the buffer (caller frees it with free), or 0 when the file cannot
+// be opened, is empty, is larger than an int can describe, cannot be
+// allocated for, or cannot be read completely.
+static void* tsReadFileToMemory(const char* path, int* size)
+{
+    void* data = 0;
+    long length = 0;
+    FILE* fp = fopen(path, "rb");
+
+    if (size) *size = 0;
+    if (!fp) return 0;
+
+    // ftell reports -1 on failure, which the range check below rejects.
+    if (fseek(fp, 0, SEEK_END) == 0) length = ftell(fp);
+
+    if (length > 0 && length <= 0x7fffffffL && fseek(fp, 0, SEEK_SET) == 0)
+    {
+        data = malloc((size_t)length);
+
+        // A short read would hand the parsers a partly uninitialized buffer,
+        // so treat it as a failure.
+        if (data && fread(data, 1, (size_t)length, fp) != (size_t)length)
+        {
+            free(data);
+            data = 0;
+        }
+    }
+
+    fclose(fp);
+
+    if (data && size) *size = (int)length;
+    return data;
+}
+
+// Returns 1 when the first four bytes at `memory` match the four-character
+// code `CC`, and 0 otherwise.
+static int tsFourCC(const char* CC, void* memory)
+{
+    return memcmp(CC, memory, 4) == 0 ? 1 : 0;
+}
+
+// Given a pointer to a RIFF chunk header (4-byte id followed by a 4-byte
+// size), returns a pointer to the header of the chunk that follows it.
+// Chunk payloads are padded to an even number of bytes.
+static char* tsNext(char* data)
+{
+    uint32_t size;
+
+    // memcpy instead of a cast-and-dereference: the header is not guaranteed
+    // to be 4-byte aligned inside the caller's buffer.
+    memcpy(&size, data + 4, sizeof(size));
+    size = (size + 1) & ~(uint32_t)1;
+    return data + 8 + size;
+}
+
+// Allocates `size` bytes aligned to 16 bytes. The distance back to the start
+// of the underlying malloc block (1..16) is stored in the byte just before
+// the returned pointer so free16 can recover it. Returns 0 when malloc fails.
+static void* malloc16(size_t size)
+{
+    char* raw = (char*)malloc(size + 16);
+    if (!raw) return 0;
+
+    unsigned char misalign = (size_t)raw & 15;
+
+    // Always move forward by at least one byte so there is room for the
+    // bookkeeping byte, even when malloc's result is already aligned.
+    char* aligned = raw + (16 - misalign);
+    aligned[-1] = 16 - misalign;
+    TS_ASSERT(!((size_t)aligned & 15));
+    return (void*)aligned;
+}
+
+// Releases a pointer obtained from malloc16. Null pointers are ignored.
+static void free16(void* p)
+{
+    if (p)
+    {
+        // The byte before the aligned pointer holds the distance back to the
+        // start of the block malloc returned.
+        char* aligned = (char*)p;
+        size_t shift = (size_t)aligned[-1];
+        free(aligned - shift);
+    }
+}
+
+// Writes the final, possibly partial, group of four samples into a[ i ].
+//
+// a       -- destination array of 4-float vectors
+// i       -- index of the vector to write
+// j       -- index of the first remaining sample in `samples`
+// samples -- contiguous 16-bit source samples
+// offset  -- sample_count & 3: 1..3 valid samples remain (unused lanes are
+//            zeroed), 0 means a full group of four
+//
+// _mm_setr_ps places its first argument in lane 0, so the samples keep the
+// same lane order as vectors loaded from a float array with _mm_load_ps.
+// (_mm_set_ps takes its arguments highest lane first and would reverse them.)
+static void tsLastElement(__m128* a, int i, int j, int16_t* samples, int offset)
+{
+    switch (offset)
+    {
+    case 1:
+        a[i] = _mm_setr_ps((float)samples[j], 0.0f, 0.0f, 0.0f);
+        break;
+
+    case 2:
+        a[i] = _mm_setr_ps((float)samples[j], (float)samples[j + 1], 0.0f, 0.0f);
+        break;
+
+    case 3:
+        a[i] = _mm_setr_ps((float)samples[j], (float)samples[j + 1], (float)samples[j + 2], 0.0f);
+        break;
+
+    case 0:
+        a[i] = _mm_setr_ps((float)samples[j], (float)samples[j + 1], (float)samples[j + 2], (float)samples[j + 3]);
+        break;
+    }
+}
+
+// Parses a 16-bit PCM mono or stereo WAV image held in `memory` and fills in
+// `sound` with de-interleaved float samples (one 16-byte aligned allocation,
+// released with tsFreeSound).
+//
+// memory -- start of the WAV file image; the "fmt " chunk must be the first
+//           chunk and the "data" chunk must follow it directly
+// sound  -- receives the result; zeroed on failure, with the reason stored
+//           in g_tsErrorReason
+void tsReadMemWAV(const void* memory, tsLoadedSound* sound)
+{
+#pragma pack( push, 1 )
+    typedef struct
+    {
+        uint16_t wFormatTag;
+        uint16_t nChannels;
+        uint32_t nSamplesPerSec;
+        uint32_t nAvgBytesPerSec;
+        uint16_t nBlockAlign;
+        uint16_t wBitsPerSample;
+        uint16_t cbSize;
+        uint16_t wValidBitsPerSample;
+        uint32_t dwChannelMask;
+        uint8_t SubFormat[18];
+    } Fmt;
+#pragma pack( pop )
+
+    // Every local is declared before the first TS_CHECK so that no goto to
+    // ts_err jumps over an initializer.
+    char* data = (char*)memory;
+    Fmt fmt;
+    uint32_t fmt_bytes = 0;
+    uint32_t data_bytes = 0;
+    int sample_count = 0;
+    int wide_count = 0;
+    int16_t* samples = 0;
+    __m128* a = 0;
+    __m128* b = 0;
+    float lanes[4];
+
+    TS_CHECK(data, "Unable to read input file (file doesn't exist, or could not allocate heap memory.");
+    TS_CHECK(tsFourCC("RIFF", data), "Incorrect file header; is this a WAV file?");
+    TS_CHECK(tsFourCC("WAVE", data + 8), "Incorrect file header; is this a WAV file?");
+
+    data += 12;
+
+    TS_CHECK(tsFourCC("fmt ", data), "fmt chunk not found.");
+    memcpy(&fmt_bytes, data + 4, sizeof(fmt_bytes));
+    TS_CHECK(fmt_bytes >= 16, "fmt chunk is too small.");
+
+    // A plain PCM fmt chunk is only 16 bytes long, so copy just the fields
+    // that are validated below instead of the whole extensible layout.
+    memset(&fmt, 0, sizeof(fmt));
+    memcpy(&fmt, data + 8, 16);
+    TS_CHECK(fmt.wFormatTag == 1, "Only PCM WAV files are supported.");
+    TS_CHECK(fmt.nChannels == 1 || fmt.nChannels == 2, "Only mono or stereo supported (too many channels detected).");
+    TS_CHECK(fmt.wBitsPerSample == 16, "Only 16 bits per sample supported.");
+    TS_CHECK(fmt.nBlockAlign == fmt.nChannels * 2, "implementation error");
+
+    data = tsNext(data);
+    TS_CHECK(tsFourCC("data", data), "data chunk not found.");
+    memcpy(&data_bytes, data + 4, sizeof(data_bytes));
+    sample_count = (int)(data_bytes / (fmt.nChannels * sizeof(uint16_t)));
+
+    // An empty data chunk would make the tail handling write before the
+    // start of the sample buffer.
+    TS_CHECK(sample_count > 0, "data chunk contains no samples.");
+
+    wide_count = (sample_count + 3) / 4;
+    samples = (int16_t*)(data + 8);
+
+    if (fmt.nChannels == 1)
+    {
+        a = (__m128*)malloc16(wide_count * sizeof(__m128));
+        TS_CHECK(a, "Unable to allocate memory for sound samples.");
+
+        for (int i = 0, j = 0; i < wide_count - 1; ++i, j += 4)
+        {
+            lanes[0] = (float)samples[j];
+            lanes[1] = (float)samples[j + 1];
+            lanes[2] = (float)samples[j + 2];
+            lanes[3] = (float)samples[j + 3];
+            a[i] = _mm_loadu_ps(lanes);
+        }
+
+        tsLastElement(a, wide_count - 1, (wide_count - 1) * 4, samples, sample_count & 3);
+    }
+
+    else
+    {
+        // One allocation holds both channels: left first, then right.
+        a = (__m128*)malloc16(wide_count * sizeof(__m128) * 2);
+        TS_CHECK(a, "Unable to allocate memory for sound samples.");
+        b = a + wide_count;
+
+        // Source frames are interleaved (L R L R ...), so frame f of channel
+        // c lives at samples[ f * 2 + c ]. Lanes past the last frame are
+        // zero-filled.
+        for (int i = 0; i < wide_count; ++i)
+        {
+            for (int c = 0; c < 2; ++c)
+            {
+                for (int k = 0; k < 4; ++k)
+                {
+                    int frame = i * 4 + k;
+                    lanes[k] = frame < sample_count ? (float)samples[frame * 2 + c] : 0.0f;
+                }
+
+                (c ? b : a)[i] = _mm_loadu_ps(lanes);
+            }
+        }
+    }
+
+    sound->sample_count = sample_count;
+    sound->channel_count = fmt.nChannels;
+    sound->channels[0] = a;
+    sound->channels[1] = b;
+    return;
+
+ts_err:
+    // Clear the caller's struct (not the local pointer) so that
+    // channels[ 0 ] == 0 reliably signals failure.
+    memset(sound, 0, sizeof(*sound));
+}
+
+// Loads the WAV file at `path`. On failure the returned struct has a null
+// channels[ 0 ] and g_tsErrorReason describes the problem.
+tsLoadedSound tsLoadWAV(const char* path)
+{
+    tsLoadedSound result = { 0 };
+    void* file_bytes = tsReadFileToMemory(path, 0);
+
+    // tsReadMemWAV reports a null buffer itself, so no check is needed here.
+    tsReadMemWAV(file_bytes, &result);
+    free(file_bytes);
+
+    return result;
+}
+
+// If stb_vorbis was included *before* tinysound go ahead and create
+// some functions for dealing with OGG files.
+#ifdef STB_VORBIS_INCLUDE_STB_VORBIS_H
+// Decodes an OGG Vorbis file image with stb_vorbis and fills in `sound` with
+// de-interleaved float samples (one 16-byte aligned allocation, released
+// with tsFreeSound).
+//
+// memory      -- start of the OGG file image
+// length      -- size of the image in bytes
+// sample_rate -- receives the sample rate reported by stb_vorbis
+// sound       -- receives the result; zeroed on failure, with the reason
+//                stored in g_tsErrorReason
+void tsReadMemOGG(const void* memory, int length, int* sample_rate, tsLoadedSound* sound)
+{
+    // Every local is declared before the first TS_CHECK so that no goto to
+    // ts_err jumps over an initializer.
+    int16_t* samples = 0;
+    int channel_count = 0;
+    int sample_count = 0;
+    int wide_count = 0;
+    __m128* a = 0;
+    __m128* b = 0;
+    float lanes[4];
+
+    sample_count = stb_vorbis_decode_memory((const unsigned char*)memory, length, &channel_count, sample_rate, &samples);
+
+    TS_CHECK(sample_count > 0, "stb_vorbis_decode_memory failed. Make sure your file exists and is a valid OGG file.");
+    TS_CHECK(channel_count == 1 || channel_count == 2, "Unsupported channel count.");
+
+    wide_count = (sample_count + 3) / 4;
+
+    if (channel_count == 1)
+    {
+        a = (__m128*)malloc16(wide_count * sizeof(__m128));
+        TS_CHECK(a, "Unable to allocate memory for sound samples.");
+
+        for (int i = 0, j = 0; i < wide_count - 1; ++i, j += 4)
+        {
+            lanes[0] = (float)samples[j];
+            lanes[1] = (float)samples[j + 1];
+            lanes[2] = (float)samples[j + 2];
+            lanes[3] = (float)samples[j + 3];
+            a[i] = _mm_loadu_ps(lanes);
+        }
+
+        tsLastElement(a, wide_count - 1, (wide_count - 1) * 4, samples, sample_count & 3);
+    }
+
+    else
+    {
+        // One allocation holds both channels: left first, then right.
+        a = (__m128*)malloc16(wide_count * sizeof(__m128) * 2);
+        TS_CHECK(a, "Unable to allocate memory for sound samples.");
+        b = a + wide_count;
+
+        // Decoded frames are interleaved (L R L R ...), so frame f of
+        // channel c lives at samples[ f * 2 + c ]. Lanes past the last frame
+        // are zero-filled.
+        for (int i = 0; i < wide_count; ++i)
+        {
+            for (int c = 0; c < 2; ++c)
+            {
+                for (int k = 0; k < 4; ++k)
+                {
+                    int frame = i * 4 + k;
+                    lanes[k] = frame < sample_count ? (float)samples[frame * 2 + c] : 0.0f;
+                }
+
+                (c ? b : a)[i] = _mm_loadu_ps(lanes);
+            }
+        }
+    }
+
+    sound->sample_count = sample_count;
+    sound->channel_count = channel_count;
+    sound->channels[0] = a;
+    sound->channels[1] = b;
+    free(samples);
+    return;
+
+ts_err:
+    free(samples);
+    memset(sound, 0, sizeof(tsLoadedSound));
+}
+
+// Loads the OGG Vorbis file at `path`; `sample_rate` receives the rate
+// reported by the decoder. On failure the returned struct is zeroed and
+// g_tsErrorReason describes the problem.
+tsLoadedSound tsLoadOGG(const char* path, int* sample_rate)
+{
+    tsLoadedSound result;
+    int byte_count;
+    void* file_bytes = tsReadFileToMemory(path, &byte_count);
+
+    // tsReadMemOGG always writes `result`, either with samples or zeroed.
+    tsReadMemOGG(file_bytes, byte_count, sample_rate, &result);
+    free(file_bytes);
+
+    return result;
+}
+#endif
+
+// Releases the sample memory owned by `sound` and zeroes the struct.
+void tsFreeSound(tsLoadedSound* sound)
+{
+    // channels[ 1 ] points into the channels[ 0 ] allocation, so a single
+    // free16 releases both channels.
+    void* block = sound->channels[0];
+    free16(block);
+    memset(sound, 0, sizeof(*sound));
+}
+
+// Returns the number of bytes of heap sample storage held by `sound`.
+// Samples are stored as floats in groups of four per channel, so the count
+// is padded up to a multiple of four samples. The 16 bytes of alignment
+// slack added by malloc16 are not included.
+int tsSoundSize(tsLoadedSound* sound)
+{
+    int wide_count = (sound->sample_count + 3) / 4;
+    return (int)(wide_count * sizeof(__m128) * sound->channel_count);
+}
+
+// Builds a playing-sound instance for `loaded` with default settings:
+// inactive, not paused, not looped, full volume, centered pan, unchanged
+// pitch, and positioned at the first sample. (Low-level API.)
+tsPlayingSound tsMakePlayingSound(tsLoadedSound* loaded)
+{
+    tsPlayingSound result;
+
+    // source and list linkage
+    result.loaded_sound = loaded;
+    result.next = 0;
+    result.sample_index = 0;
+
+    // playback state
+    result.active = 0;
+    result.paused = 0;
+    result.looped = 0;
+
+    // mix parameters
+    result.volume0 = 1.0f;
+    result.volume1 = 1.0f;
+    result.pan0 = 0.5f;
+    result.pan1 = 0.5f;
+
+    // pitch shifting is off until tsSetPitch is called
+    result.pitch = 1.0f;
+    result.pitch_filter[0] = 0;
+    result.pitch_filter[1] = 0;
+
+    return result;
+}
+
+// Returns the sound's active flag (non-zero while it is playing).
+int tsIsActive(tsPlayingSound* sound)
+{
+    int is_active = sound->active;
+    return is_active;
+}
+
+// Clears the sound's active flag, marking it as stopped.
+void tsStopSound(tsPlayingSound* sound)
+{
+    (*sound).active = 0;
+}
+
+// Stores the loop flag: 0 disables looping, any other value enables it.
+void tsLoopSound(tsPlayingSound* sound, int zero_for_no_loop)
+{
+    (*sound).looped = zero_for_no_loop;
+}
+
+// Stores the pause flag: 1 pauses the sound, 0 resumes it.
+void tsPauseSound(tsPlayingSound* sound, int one_for_paused)
+{
+    (*sound).paused = one_for_paused;
+}
+
+// Sets the stereo pan. `pan` is clamped to [0, 1]: 0 is full left, 1 is full
+// right. The left weight becomes 1 - pan and the right weight becomes pan.
+void tsSetPan(tsPlayingSound* sound, float pan)
+{
+    float clamped = pan;
+
+    if (clamped > 1.0f) clamped = 1.0f;
+    else if (clamped < 0.0f) clamped = 0.0f;
+
+    sound->pan1 = clamped;
+    sound->pan0 = 1.0f - clamped;
+}
+
+// Stores the requested pitch factor on the sound (1.0f leaves the pitch
+// unchanged). The value is not validated here.
+void tsSetPitch(tsPlayingSound* sound, float pitch)
+{
+    (*sound).pitch = pitch;
+}
+
+// Sets the left and right channel volumes. Negative values are raised to 0;
+// there is no upper limit.
+void tsSetVolume(tsPlayingSound* sound, float volume_left, float volume_right)
+{
+    float left = volume_left;
+    float right = volume_right;
+
+    if (left < 0.0f) left = 0.0f;
+    if (right < 0.0f) right = 0.0f;
+
+    sound->volume0 = left;
+    sound->volume1 = right;
+}
+
+static void tsRemoveFilter(tsPlayingSound* playing);
+
+#if TS_PLATFORM == TS_WINDOWS
+
+// Portable sleep, Windows flavor: blocks the calling thread via Sleep.
+void tsSleep(int milliseconds)
+{
+    int duration_ms = milliseconds;
+    Sleep(duration_ms);
+}
+
+// DirectSound flavor of the context. tsMakeContext allocates it together
+// with the mix buffers and the optional playing-sound pool in a single
+// malloc block; the pointers below point into that block.
+struct tsContext
+{
+    // play frequency divided by the latency factor, rounded up to a multiple of 4
+    unsigned latency_samples;
+    // starts at 0; NOTE(review): presumably the running write position used
+    // by tsMix -- confirm there
+    unsigned running_index;
+    // play frequency in Hz
+    int Hz;
+    // bytes per output frame: two 16-bit channels
+    int bps;
+    // size in bytes of the secondary DirectSound buffer
+    int buffer_size;
+    // element count of each of floatA, floatB and samples
+    int wide_count;
+    // head of the singly-linked list of playing sounds
+    tsPlayingSound* playing;
+    // 16-byte aligned float mix buffers, one per output channel
+    __m128* floatA;
+    __m128* floatB;
+    // integer sample buffer, placed after floatB
+    __m128i* samples;
+    // high-level API only: pool storage and its free list (both 0 otherwise)
+    tsPlayingSound* playing_pool;
+    tsPlayingSound* playing_free;
+
+    // platform specific stuff
+    LPDIRECTSOUND dsound;
+    LPDIRECTSOUNDBUFFER buffer;
+    LPDIRECTSOUNDBUFFER primary;
+
+    // data for tsMix thread, enable these with tsSpawnMixThread
+    // critical_section is only initialized once separate_thread is set
+    CRITICAL_SECTION critical_section;
+    int separate_thread;
+    // the mix thread loops while this is non-zero
+    int running;
+    // 0 makes the mix thread yield instead of sleeping
+    int sleep_milliseconds;
+};
+
+// Tears down a context: deletes the critical section when a mix thread was
+// spawned, releases the DirectSound objects, removes the pitch filter of
+// every sound still in the playing list, and frees the context block.
+static void tsReleaseContext(tsContext* ctx)
+{
+    if (ctx->separate_thread)
+    {
+        DeleteCriticalSection(&ctx->critical_section);
+    }
+
+    // COM calls are spelled differently in C and C++.
+#ifdef __cplusplus
+    ctx->buffer->Release();
+    ctx->primary->Release();
+    ctx->dsound->Release();
+#else
+    ctx->buffer->lpVtbl->Release(ctx->buffer);
+    ctx->primary->lpVtbl->Release(ctx->primary);
+    ctx->dsound->lpVtbl->Release(ctx->dsound);
+#endif
+
+    for (tsPlayingSound* it = ctx->playing; it; it = it->next)
+    {
+        tsRemoveFilter(it);
+    }
+
+    free(ctx);
+}
+
+// Entry point of the mix thread started by tsSpawnMixThread. Calls tsMix
+// until ctx->running is cleared, sleeping or yielding between calls, then
+// clears ctx->separate_thread to signal that the thread has finished.
+static DWORD WINAPI tsCtxThread(LPVOID lpParameter)
+{
+    tsContext* context = (tsContext*)lpParameter;
+
+    while (context->running)
+    {
+        tsMix(context);
+
+        if (!context->sleep_milliseconds)
+        {
+            YieldProcessor();
+        }
+        else
+        {
+            tsSleep(context->sleep_milliseconds);
+        }
+    }
+
+    context->separate_thread = 0;
+    return 0;
+}
+
+// Enters the context's critical section. Does nothing when no mix thread is
+// in use, since the critical section is not initialized in that case.
+static void tsLock(tsContext* ctx)
+{
+    if (!ctx->separate_thread) return;
+    EnterCriticalSection(&ctx->critical_section);
+}
+
+// Leaves the context's critical section. Does nothing when no mix thread is
+// in use.
+static void tsUnlock(tsContext* ctx)
+{
+    if (!ctx->separate_thread) return;
+    LeaveCriticalSection(&ctx->critical_section);
+}
+
+// Creates a DirectSound-backed context.
+//
+// hwnd                  -- HWND of the application window (must not be null)
+// play_frequency_in_Hz  -- output sample rate
+// latency_factor_in_Hz  -- play frequency is divided by this to get the
+//                          latency in samples (must not be 0)
+// num_buffered_seconds  -- seconds of audio the internal buffers can hold
+// playing_pool_count    -- 0 for the low-level API, otherwise the number of
+//                          tsPlayingSound instances in the internal pool
+//
+// Returns the context, or 0 on failure with the reason in g_tsErrorReason.
+// On failure every DirectSound object created so far is released.
+tsContext* tsMakeContext(void* hwnd, unsigned play_frequency_in_Hz, int latency_factor_in_Hz, int num_buffered_seconds, int playing_pool_count)
+{
+    // Every local is declared before the first TS_CHECK so that no goto to
+    // ts_err jumps over an initializer, and so that ts_err can tell which
+    // objects have been created.
+    int bps = sizeof(INT16) * 2;
+    int buffer_size = play_frequency_in_Hz * bps * num_buffered_seconds;
+    int sample_count = play_frequency_in_Hz * num_buffered_seconds;
+    int wide_count = (int)TS_ALIGN(sample_count, 4);
+    int pool_size = playing_pool_count * sizeof(tsPlayingSound);
+    int mix_buffers_size = sizeof(__m128) * wide_count * 2;
+    int sample_buffer_size = sizeof(__m128i) * wide_count;
+    tsContext* ctx = 0;
+    WAVEFORMATEX format = { 0 };
+    DSBUFFERDESC bufdesc = { 0 };
+    LPDIRECTSOUND dsound = 0;
+    LPDIRECTSOUNDBUFFER primary_buffer = 0;
+    LPDIRECTSOUNDBUFFER secondary_buffer = 0;
+    HRESULT res;
+
+    TS_CHECK(hwnd, "Invalid hwnd passed to tsMakeContext.");
+    TS_CHECK(latency_factor_in_Hz != 0, "Invalid latency factor passed to tsMakeContext.");
+
+    res = DirectSoundCreate(0, &dsound, 0);
+    TS_CHECK(res == DS_OK, "DirectSoundCreate failed");
+#ifdef __cplusplus
+    res = dsound->SetCooperativeLevel((HWND)hwnd, DSSCL_PRIORITY);
+#else
+    res = dsound->lpVtbl->SetCooperativeLevel(dsound, (HWND)hwnd, DSSCL_PRIORITY);
+#endif
+    TS_CHECK(res == DS_OK, "Failed to set cooperative level");
+
+    bufdesc.dwSize = sizeof(bufdesc);
+    bufdesc.dwFlags = DSBCAPS_PRIMARYBUFFER;
+#ifdef __cplusplus
+    res = dsound->CreateSoundBuffer(&bufdesc, &primary_buffer, 0);
+#else
+    res = dsound->lpVtbl->CreateSoundBuffer(dsound, &bufdesc, &primary_buffer, 0);
+#endif
+    TS_CHECK(res == DS_OK, "Failed to create primary sound buffer");
+
+    format.wFormatTag = WAVE_FORMAT_PCM;
+    format.nChannels = 2;
+    format.nSamplesPerSec = play_frequency_in_Hz;
+    format.wBitsPerSample = 16;
+    format.nBlockAlign = (format.nChannels * format.wBitsPerSample) / 8;
+    format.nAvgBytesPerSec = format.nSamplesPerSec * format.nBlockAlign;
+    format.cbSize = 0;
+#ifdef __cplusplus
+    res = primary_buffer->SetFormat(&format);
+#else
+    res = primary_buffer->lpVtbl->SetFormat(primary_buffer, &format);
+#endif
+    TS_CHECK(res == DS_OK, "Failed to set format on primary buffer");
+
+    bufdesc.dwSize = sizeof(bufdesc);
+    bufdesc.dwFlags = 0;
+    bufdesc.dwBufferBytes = buffer_size;
+    bufdesc.lpwfxFormat = &format;
+#ifdef __cplusplus
+    res = dsound->CreateSoundBuffer(&bufdesc, &secondary_buffer, 0);
+#else
+    res = dsound->lpVtbl->CreateSoundBuffer(dsound, &bufdesc, &secondary_buffer, 0);
+#endif
+    TS_CHECK(res == DS_OK, "Failed to set format on secondary buffer");
+
+    // One block holds the context, both float mix buffers, the integer
+    // sample buffer, 16 bytes of alignment slack and the optional pool.
+    ctx = (tsContext*)malloc(sizeof(tsContext) + mix_buffers_size + sample_buffer_size + 16 + pool_size);
+    TS_CHECK(ctx, "Unable to allocate memory for the context.");
+
+    ctx->latency_samples = (unsigned)TS_ALIGN(play_frequency_in_Hz / latency_factor_in_Hz, 4);
+    ctx->running_index = 0;
+    ctx->Hz = play_frequency_in_Hz;
+    ctx->bps = bps;
+    ctx->buffer_size = buffer_size;
+    ctx->wide_count = wide_count;
+    ctx->dsound = dsound;
+    ctx->buffer = secondary_buffer;
+    ctx->primary = primary_buffer;
+    ctx->playing = 0;
+    ctx->floatA = (__m128*)(ctx + 1);
+    ctx->floatA = (__m128*)TS_ALIGN(ctx->floatA, 16);
+    TS_ASSERT(!((size_t)ctx->floatA & 15));
+    ctx->floatB = ctx->floatA + wide_count;
+    ctx->samples = (__m128i*)ctx->floatB + wide_count;
+    ctx->running = 1;
+    ctx->separate_thread = 0;
+    ctx->sleep_milliseconds = 0;
+
+    if (playing_pool_count)
+    {
+        // Thread every pool entry onto the free list.
+        ctx->playing_pool = (tsPlayingSound*)(ctx->samples + wide_count);
+        for (int i = 0; i < playing_pool_count - 1; ++i)
+            ctx->playing_pool[i].next = ctx->playing_pool + i + 1;
+        ctx->playing_pool[playing_pool_count - 1].next = 0;
+        ctx->playing_free = ctx->playing_pool;
+    }
+
+    else
+    {
+        ctx->playing_pool = 0;
+        ctx->playing_free = 0;
+    }
+
+    return ctx;
+
+ts_err:
+    // Release whatever was created before the failure, newest first.
+#ifdef __cplusplus
+    if (secondary_buffer) secondary_buffer->Release();
+    if (primary_buffer) primary_buffer->Release();
+    if (dsound) dsound->Release();
+#else
+    if (secondary_buffer) secondary_buffer->lpVtbl->Release(secondary_buffer);
+    if (primary_buffer) primary_buffer->lpVtbl->Release(primary_buffer);
+    if (dsound) dsound->lpVtbl->Release(dsound);
+#endif
+    free(ctx);
+    return 0;
+}
+
+// Starts a background thread that repeatedly calls tsMix for this context.
+// Does nothing if the context already has a mix thread. If the thread cannot
+// be created the context is left in single-threaded mode (separate_thread == 0)
+// so that tsShutdownContext does not wait forever for a thread that never ran.
+void tsSpawnMixThread(tsContext* ctx)
+{
+    if (ctx->separate_thread) return;
+    InitializeCriticalSectionAndSpinCount(&ctx->critical_section, 0x00000400);
+
+    // The flag must be set before the thread starts so tsLock/tsUnlock inside
+    // tsMix use the critical section from the first iteration.
+    ctx->separate_thread = 1;
+
+    HANDLE thread = CreateThread(0, 0, tsCtxThread, ctx, 0, 0);
+    if (!thread)
+    {
+        ctx->separate_thread = 0;
+        DeleteCriticalSection(&ctx->critical_section);
+        g_tsErrorReason = "Failed to create mix thread";
+        return;
+    }
+
+    // The handle is never waited on; closing it does not stop the thread.
+    CloseHandle(thread);
+}
+
+#elif TS_PLATFORM == TS_MAC
+
+// Suspends the calling thread for the given number of milliseconds.
+// The value is converted to microseconds and handed to usleep.
+void tsSleep(int milliseconds)
+{
+    usleep(milliseconds * 1000);
+}
+
+// Mixer state for the CoreAudio (Apple) backend. tsMakeContext allocates this
+// struct, both float mix buffers, the sample ring buffer and the optional
+// playing-sound pool with a single malloc.
+struct tsContext
+{
+    unsigned latency_samples; // number of samples tsMix tries to keep queued in the ring buffer
+    unsigned index0; // read position in the ring buffer, in samples
+    unsigned index1; // write position in the ring buffer, in samples
+    int Hz; // playback frequency passed to tsMakeContext
+    int bps; // bytes per interleaved sample (two 16-bit values)
+    int wide_count; // number of 4-wide SIMD elements in each buffer
+    int sample_count; // ring buffer capacity in samples (wide_count * 4)
+    tsPlayingSound* playing; // head of the linked list of sounds being mixed
+    __m128* floatA; // 16-byte aligned float mix buffer (first output channel)
+    __m128* floatB; // float mix buffer (second output channel), follows floatA
+    __m128i* samples; // ring buffer of packed 16-bit interleaved output samples
+    tsPlayingSound* playing_pool; // pool storage for the high-level API, 0 when no pool was requested
+    tsPlayingSound* playing_free; // free list threaded through playing_pool
+
+    // platform specific stuff
+    AudioComponentInstance inst; // CoreAudio output unit that pulls data through tsMemcpyToCA
+
+    // data for tsMix thread, enable these with tsSpawnMixThread
+    pthread_t thread;
+    pthread_mutex_t mutex; // only initialized once tsSpawnMixThread has been called
+    int separate_thread; // non-zero while the mix thread exists; cleared by the thread on exit
+    int running; // cleared by tsShutdownContext to make the mix thread leave its loop
+    int sleep_milliseconds; // per-iteration sleep of the mix thread, 0 means yield instead
+};
+
+// Stops and disposes the CoreAudio output unit, frees the pitch filters of
+// every sound still in the playing list and releases the context memory.
+static void tsReleaseContext(tsContext* ctx)
+{
+    if (ctx->separate_thread)
+    {
+        pthread_mutex_destroy(&ctx->mutex);
+    }
+
+    AudioOutputUnitStop(ctx->inst);
+    AudioUnitUninitialize(ctx->inst);
+    AudioComponentInstanceDispose(ctx->inst);
+
+    for (tsPlayingSound* sound = ctx->playing; sound; sound = sound->next)
+    {
+        tsRemoveFilter(sound);
+    }
+
+    free(ctx);
+}
+
+// Entry point of the mix thread: keeps calling tsMix until ctx->running is
+// cleared, then resets separate_thread so tsShutdownContext can proceed.
+static void* tsCtxThread(void* udata)
+{
+    tsContext* ctx = (tsContext*)udata;
+
+    for (;;)
+    {
+        if (!ctx->running) break;
+
+        tsMix(ctx);
+
+        // Without a configured delay just give up the rest of the time slice.
+        if (ctx->sleep_milliseconds == 0) pthread_yield_np();
+        else tsSleep(ctx->sleep_milliseconds);
+    }
+
+    ctx->separate_thread = 0;
+    pthread_exit(0);
+    return 0;
+}
+
+// Takes the context mutex, but only when a mix thread has been spawned.
+static void tsLock(tsContext* ctx)
+{
+    if (!ctx->separate_thread) return;
+    pthread_mutex_lock(&ctx->mutex);
+}
+
+// Releases the context mutex, but only when a mix thread has been spawned.
+static void tsUnlock(tsContext* ctx)
+{
+    if (!ctx->separate_thread) return;
+    pthread_mutex_unlock(&ctx->mutex);
+}
+
+// Render callback registered with the output unit in tsMakeContext; udata is the tsContext.
+static OSStatus tsMemcpyToCA(void* udata, AudioUnitRenderActionFlags* ioActionFlags, const AudioTimeStamp* inTimeStamp, UInt32 inBusNumber, UInt32 inNumberFrames, AudioBufferList* ioData);
+
+// Creates a tinysound context that plays through the default CoreAudio output unit.
+//
+// unused                 ignored on this platform
+// play_frequency_in_Hz   sample rate of the 16-bit stereo output stream
+// latency_factor_in_Hz   latency_samples is play_frequency_in_Hz / latency_factor_in_Hz, aligned to 4
+// num_buffered_seconds   sizes the mix buffers and the sample ring buffer
+// playing_pool_count     number of tsPlayingSound slots to pool, 0 for no pool
+//
+// Returns the new context, or 0 on failure. On failure the audio unit
+// instance (if one was created) is released again.
+tsContext* tsMakeContext(void* unused, unsigned play_frequency_in_Hz, int latency_factor_in_Hz, int num_buffered_seconds, int playing_pool_count)
+{
+    (void)unused;
+    int bps = sizeof(uint16_t) * 2;
+
+    AudioComponentDescription comp_desc = { 0 };
+    comp_desc.componentType = kAudioUnitType_Output;
+    comp_desc.componentSubType = kAudioUnitSubType_DefaultOutput;
+    comp_desc.componentFlags = 0;
+    comp_desc.componentFlagsMask = 0;
+    comp_desc.componentManufacturer = kAudioUnitManufacturer_Apple;
+
+    AudioComponent comp = AudioComponentFindNext(NULL, &comp_desc);
+    if (!comp)
+    {
+        g_tsErrorReason = "Failed to create output unit from AudioComponentFindNext.";
+        return 0;
+    }
+
+    AudioStreamBasicDescription stream_desc = { 0 };
+    stream_desc.mSampleRate = (double)play_frequency_in_Hz;
+    stream_desc.mFormatID = kAudioFormatLinearPCM;
+    stream_desc.mFormatFlags = kAudioFormatFlagIsSignedInteger | kAudioFormatFlagsNativeEndian | kAudioFormatFlagIsPacked;
+    stream_desc.mFramesPerPacket = 1;
+    stream_desc.mChannelsPerFrame = 2;
+    stream_desc.mBitsPerChannel = sizeof(uint16_t) * 8;
+    stream_desc.mBytesPerPacket = bps;
+    stream_desc.mBytesPerFrame = bps;
+    stream_desc.mReserved = 0;
+
+    AudioComponentInstance inst;
+    OSStatus ret;
+    AURenderCallbackStruct input;
+
+    // Track how far the audio unit got so the error path can undo exactly that much.
+    int inst_created = 0;
+    int inst_initialized = 0;
+
+    ret = AudioComponentInstanceNew(comp, &inst);
+    inst_created = (ret == noErr);
+
+    // All locals are declared before the first TS_CHECK so no jump to ts_err skips a declaration.
+    int sample_count = play_frequency_in_Hz * num_buffered_seconds;
+    int latency_count = (unsigned)TS_ALIGN(play_frequency_in_Hz / latency_factor_in_Hz, 4);
+    TS_ASSERT(sample_count > latency_count);
+    int wide_count = (int)TS_ALIGN(sample_count, 4) / 4;
+    int pool_size = playing_pool_count * sizeof(tsPlayingSound);
+    int mix_buffers_size = sizeof(__m128) * wide_count * 2;
+    int sample_buffer_size = sizeof(__m128i) * wide_count;
+
+    // One allocation holds the context, both float mix buffers, the sample ring
+    // buffer, 16 bytes of alignment slack and the playing-sound pool.
+    tsContext* ctx = (tsContext*)malloc(sizeof(tsContext) + mix_buffers_size + sample_buffer_size + 16 + pool_size);
+    TS_CHECK(ret == noErr, "AudioComponentInstanceNew failed");
+    TS_CHECK(ctx != NULL, "Can't create audio context");
+    ctx->latency_samples = latency_count;
+    ctx->index0 = 0;
+    ctx->index1 = 0;
+    ctx->Hz = play_frequency_in_Hz;
+    ctx->bps = bps;
+    ctx->wide_count = wide_count;
+    ctx->sample_count = wide_count * 4;
+    ctx->inst = inst;
+    ctx->playing = 0;
+    ctx->floatA = (__m128*)(ctx + 1);
+    ctx->floatA = (__m128*)TS_ALIGN(ctx->floatA, 16);
+    TS_ASSERT(!((size_t)ctx->floatA & 15));
+    ctx->floatB = ctx->floatA + wide_count;
+    ctx->samples = (__m128i*)ctx->floatB + wide_count;
+    ctx->running = 1;
+    ctx->separate_thread = 0;
+    ctx->sleep_milliseconds = 0;
+
+    // Finish building the context before the output unit is started, so the
+    // render callback never observes a half-initialized context.
+    if (playing_pool_count)
+    {
+        ctx->playing_pool = (tsPlayingSound*)(ctx->samples + wide_count);
+        for (int i = 0; i < playing_pool_count - 1; ++i)
+            ctx->playing_pool[i].next = ctx->playing_pool + i + 1;
+        ctx->playing_pool[playing_pool_count - 1].next = 0;
+        ctx->playing_free = ctx->playing_pool;
+    }
+
+    else
+    {
+        ctx->playing_pool = 0;
+        ctx->playing_free = 0;
+    }
+
+    ret = AudioUnitSetProperty(inst, kAudioUnitProperty_StreamFormat, kAudioUnitScope_Input, 0, &stream_desc, sizeof(stream_desc));
+    TS_CHECK(ret == noErr, "Failed to set stream forat");
+
+    input.inputProc = tsMemcpyToCA;
+    input.inputProcRefCon = ctx;
+    ret = AudioUnitSetProperty(inst, kAudioUnitProperty_SetRenderCallback, kAudioUnitScope_Input, 0, &input, sizeof(input));
+    TS_CHECK(ret == noErr, "AudioUnitSetProperty failed");
+
+    ret = AudioUnitInitialize(inst);
+    TS_CHECK(ret == noErr, "Couldn't initialize output unit");
+    inst_initialized = 1;
+
+    ret = AudioOutputUnitStart(inst);
+    TS_CHECK(ret == noErr, "Couldn't start output unit");
+
+    return ctx;
+
+ts_err:
+    if (inst_created)
+    {
+        if (inst_initialized) AudioUnitUninitialize(inst);
+        AudioComponentInstanceDispose(inst);
+    }
+    free(ctx);
+    return 0;
+}
+
+// Starts a background thread that repeatedly calls tsMix for this context.
+// Does nothing if the context already has a mix thread. If the mutex or the
+// thread cannot be created the context stays in single-threaded mode
+// (separate_thread == 0) so tsShutdownContext does not wait for a thread
+// that never ran.
+void tsSpawnMixThread(tsContext* ctx)
+{
+    if (ctx->separate_thread) return;
+
+    if (pthread_mutex_init(&ctx->mutex, 0) != 0)
+    {
+        g_tsErrorReason = "Failed to create mix thread mutex";
+        return;
+    }
+
+    // The flag must be set before the thread starts so tsLock/tsUnlock inside
+    // tsMix use the mutex from the first iteration.
+    ctx->separate_thread = 1;
+
+    if (pthread_create(&ctx->thread, 0, tsCtxThread, ctx) != 0)
+    {
+        ctx->separate_thread = 0;
+        pthread_mutex_destroy(&ctx->mutex);
+        g_tsErrorReason = "Failed to create mix thread";
+        return;
+    }
+
+    // Shutdown polls separate_thread instead of joining, so detach the thread
+    // to let its resources be reclaimed when it exits.
+    pthread_detach(ctx->thread);
+}
+
+#else
+
+// Suspends the calling thread for the given number of milliseconds via SDL_Delay.
+void tsSleep(int milliseconds)
+{
+    SDL_Delay(milliseconds);
+}
+
+// Mixer state for the SDL backend. tsMakeContext allocates this struct, both
+// float mix buffers, the sample ring buffer and the optional playing-sound
+// pool with a single malloc.
+struct tsContext
+{
+    unsigned latency_samples; // number of samples tsMix tries to keep queued in the ring buffer
+    unsigned index0; // read position in the ring buffer, in samples
+    unsigned index1; // write position in the ring buffer, in samples
+    unsigned running_index; // NOTE(review): not assigned by this platform's tsMakeContext - confirm it is still needed
+    int Hz; // playback frequency passed to tsMakeContext
+    int bps; // bytes per interleaved sample (two 16-bit values)
+    int buffer_size; // NOTE(review): not assigned by this platform's tsMakeContext - confirm it is still needed
+    int wide_count; // number of 4-wide SIMD elements in each buffer
+    int sample_count; // ring buffer capacity in samples (wide_count * 4)
+    tsPlayingSound* playing; // head of the linked list of sounds being mixed
+    __m128* floatA; // 16-byte aligned float mix buffer (first output channel)
+    __m128* floatB; // float mix buffer (second output channel), follows floatA
+    __m128i* samples; // ring buffer of packed 16-bit interleaved output samples
+    tsPlayingSound* playing_pool; // pool storage for the high-level API, 0 when no pool was requested
+    tsPlayingSound* playing_free; // free list threaded through playing_pool
+
+    // data for tsMix thread, enable these with tsSpawnMixThread
+    SDL_Thread* thread;
+    SDL_mutex* mutex; // only created once tsSpawnMixThread has been called
+    int separate_thread; // non-zero while the mix thread exists; cleared by the thread on exit
+    int running; // cleared by tsShutdownContext to make the mix thread leave its loop
+    int sleep_milliseconds; // per-iteration sleep of the mix thread, 0 means sleep 1 ms
+};
+
+// Frees the pitch filters of every sound still in the playing list, closes
+// the SDL audio device and releases the context memory.
+static void tsReleaseContext(tsContext* ctx)
+{
+    if (ctx->separate_thread)
+    {
+        SDL_DestroyMutex(ctx->mutex);
+    }
+
+    for (tsPlayingSound* sound = ctx->playing; sound; sound = sound->next)
+    {
+        tsRemoveFilter(sound);
+    }
+
+    SDL_CloseAudio();
+    free(ctx);
+}
+
+// Entry point of the mix thread: keeps calling tsMix until ctx->running is
+// cleared, then resets separate_thread so tsShutdownContext can proceed.
+int tsCtxThread(void* udata)
+{
+    tsContext* ctx = (tsContext*)udata;
+
+    for (;;)
+    {
+        if (!ctx->running) break;
+
+        tsMix(ctx);
+
+        // Sleep for the configured delay, or 1 ms when none was configured.
+        tsSleep(ctx->sleep_milliseconds ? ctx->sleep_milliseconds : 1);
+    }
+
+    ctx->separate_thread = 0;
+    return 0;
+}
+
+// Takes the context mutex, but only when a mix thread has been spawned.
+static void tsLock(tsContext* ctx)
+{
+    if (!ctx->separate_thread) return;
+    SDL_LockMutex(ctx->mutex);
+}
+
+// Releases the context mutex, but only when a mix thread has been spawned.
+static void tsUnlock(tsContext* ctx)
+{
+    if (!ctx->separate_thread) return;
+    SDL_UnlockMutex(ctx->mutex);
+}
+
+// SDL audio callback installed by tsMakeContext; udata is the tsContext.
+// Declared static to match its definition, otherwise the two declarations
+// disagree on linkage and the file does not compile as C.
+static void tsSDL_AudioCallback(void* udata, Uint8* stream, int len);
+
+// Creates a tinysound context that plays 16-bit stereo audio through SDL.
+//
+// unused                 ignored on this platform
+// play_frequency_in_Hz   sample rate requested from SDL
+// latency_factor_in_Hz   latency_samples is play_frequency_in_Hz / latency_factor_in_Hz, aligned to 4
+// num_buffered_seconds   sizes the mix buffers and the sample ring buffer
+// playing_pool_count     number of tsPlayingSound slots to pool, 0 for no pool
+//
+// Returns the new context, or 0 when SDL audio or the allocation fails.
+tsContext* tsMakeContext(void* unused, unsigned play_frequency_in_Hz, int latency_factor_in_Hz, int num_buffered_seconds, int playing_pool_count)
+{
+    (void)unused;
+
+    // Work out the size of every region of the single allocation up front.
+    // All locals are declared before the first TS_CHECK.
+    int bps = sizeof(uint16_t) * 2;
+    int requested_samples = play_frequency_in_Hz * num_buffered_seconds;
+    int latency = (unsigned)TS_ALIGN(play_frequency_in_Hz / latency_factor_in_Hz, 4);
+    TS_ASSERT(requested_samples > latency);
+    int simd_count = (int)TS_ALIGN(requested_samples, 4) / 4;
+    int pool_bytes = playing_pool_count * sizeof(tsPlayingSound);
+    int mix_bytes = sizeof(__m128) * simd_count * 2;
+    int ring_bytes = sizeof(__m128i) * simd_count;
+    tsContext* ctx = 0;
+    SDL_AudioSpec spec;
+    int ret = SDL_Init(SDL_INIT_AUDIO);
+    TS_CHECK(ret >= 0, "Can't init SDL audio");
+
+    // Context, both float mix buffers, the ring buffer, 16 bytes of alignment
+    // slack and the playing-sound pool live in one block.
+    ctx = (tsContext*)malloc(sizeof(tsContext) + mix_bytes + ring_bytes + 16 + pool_bytes);
+    TS_CHECK(ctx != NULL, "Can't create audio context");
+
+    ctx->Hz = play_frequency_in_Hz;
+    ctx->bps = bps;
+    ctx->latency_samples = latency;
+    ctx->wide_count = simd_count;
+    ctx->sample_count = simd_count * 4;
+    ctx->index0 = 0;
+    ctx->index1 = 0;
+    ctx->playing = 0;
+
+    // The buffers start right behind the struct, bumped to a 16-byte boundary.
+    ctx->floatA = (__m128*)(ctx + 1);
+    ctx->floatA = (__m128*)TS_ALIGN(ctx->floatA, 16);
+    TS_ASSERT(!((size_t)ctx->floatA & 15));
+    ctx->floatB = ctx->floatA + simd_count;
+    ctx->samples = (__m128i*)ctx->floatB + simd_count;
+
+    ctx->running = 1;
+    ctx->separate_thread = 0;
+    ctx->sleep_milliseconds = 0;
+
+    SDL_memset(&spec, 0, sizeof(spec));
+    spec.freq = play_frequency_in_Hz;
+    spec.format = AUDIO_S16SYS;
+    spec.channels = 2; /* 1 = mono, 2 = stereo */
+    spec.samples = 1024;
+    spec.callback = tsSDL_AudioCallback;
+    spec.userdata = ctx;
+    ret = SDL_OpenAudio(&spec, NULL);
+    TS_CHECK(ret >= 0, "Can't open SDL audio");
+    SDL_PauseAudio(0);
+
+    if (!playing_pool_count)
+    {
+        ctx->playing_pool = 0;
+        ctx->playing_free = 0;
+    }
+
+    else
+    {
+        // Chain the pool slots into a singly linked free list.
+        int last = playing_pool_count - 1;
+        ctx->playing_pool = (tsPlayingSound*)(ctx->samples + simd_count);
+        for (int slot = 0; slot < last; ++slot)
+            ctx->playing_pool[slot].next = &ctx->playing_pool[slot + 1];
+        ctx->playing_pool[last].next = 0;
+        ctx->playing_free = ctx->playing_pool;
+    }
+
+    return ctx;
+
+ts_err:
+    free(ctx);
+    return 0;
+}
+
+// Starts a background thread that repeatedly calls tsMix for this context.
+// Does nothing if the context already has a mix thread. If the mutex or the
+// thread cannot be created the context stays in single-threaded mode
+// (separate_thread == 0) so tsShutdownContext does not wait for a thread
+// that never ran.
+void tsSpawnMixThread(tsContext* ctx)
+{
+    if (ctx->separate_thread) return;
+
+    ctx->mutex = SDL_CreateMutex();
+    if (!ctx->mutex)
+    {
+        g_tsErrorReason = "Failed to create mix thread mutex";
+        return;
+    }
+
+    // The flag must be set before the thread starts so tsLock/tsUnlock inside
+    // tsMix use the mutex from the first iteration.
+    ctx->separate_thread = 1;
+
+    ctx->thread = SDL_CreateThread(&tsCtxThread, "TinySoundThread", ctx);
+    if (!ctx->thread)
+    {
+        ctx->separate_thread = 0;
+        SDL_DestroyMutex(ctx->mutex);
+        ctx->mutex = 0;
+        g_tsErrorReason = "Failed to create mix thread";
+    }
+}
+
+#endif
+
+#if TS_PLATFORM == TS_SDL || TS_PLATFORM == TS_MAC
+
+// Number of samples currently stored in the ring buffer, i.e. the distance
+// from the read index (index0) forward to the write index (index1).
+static int tsSamplesWritten(tsContext* ctx)
+{
+    int read_pos = ctx->index0;
+    int write_pos = ctx->index1;
+
+    // The writer has wrapped past the end while the reader has not.
+    if (read_pos > write_pos) return ctx->sample_count - read_pos + write_pos;
+
+    return write_pos - read_pos;
+}
+
+// Number of samples of free space in the ring buffer, i.e. the distance from
+// the write index (index1) forward to the read index (index0). Equal indices
+// are treated as an empty buffer, so the whole capacity is reported free.
+static int tsSamplesUnwritten(tsContext* ctx)
+{
+    int read_pos = ctx->index0;
+    int write_pos = ctx->index1;
+
+    // The writer is behind the reader: only the gap between them is free.
+    if (read_pos > write_pos) return read_pos - write_pos;
+
+    return ctx->sample_count - write_pos + read_pos;
+}
+
+// How many samples tsMix should produce now: enough to bring the queued
+// amount up to latency_samples, limited by the free space in the ring buffer.
+static int tsSamplesToMix(tsContext* ctx)
+{
+    int target = ctx->latency_samples;
+    int shortfall = target - tsSamplesWritten(ctx);
+    TS_ASSERT(shortfall >= 0);
+
+    if (shortfall == 0) return 0;
+
+    int room = tsSamplesUnwritten(ctx);
+    return room <= shortfall ? room : shortfall;
+}
+
+// Conversions between interleaved sample counts and byte counts.
+// Both macros read ctx->bps, so a variable named ctx must be in scope where they are used.
+#define TS_SAMPLES_TO_BYTES( interleaved_sample_count ) ((interleaved_sample_count) * ctx->bps)
+#define TS_BYTES_TO_SAMPLES( byte_count ) ((byte_count) / ctx->bps)
+
+// Appends up to `size` bytes of interleaved samples from `data` to the ring
+// buffer at the write index (index1), wrapping to the start of the buffer
+// when the end is reached. If the buffer has less free space than requested,
+// only the samples that fit are copied and the rest is dropped.
+static void tsPushBytes(tsContext* ctx, void* data, int size)
+{
+    int index0 = ctx->index0;
+    int index1 = ctx->index1;
+    int samples = TS_BYTES_TO_SAMPLES(size);
+    int sample_count = ctx->sample_count;
+
+    int unwritten = tsSamplesUnwritten(ctx);
+    if (unwritten < samples) samples = unwritten;
+
+    // Apply the clamp to the byte count as well. Without this the copies and
+    // the index update below still use the unclamped size and can run past
+    // the free region (or the end of the buffer).
+    size = TS_SAMPLES_TO_BYTES(samples);
+
+    int can_overflow = index0 <= index1;
+    int would_overflow = index1 + samples > sample_count;
+
+    if (can_overflow && would_overflow)
+    {
+        // Split the copy: fill up to the end of the buffer, then continue at the start.
+        int first_size = TS_SAMPLES_TO_BYTES(sample_count - index1);
+        int second_size = size - first_size;
+        memcpy((char*)ctx->samples + TS_SAMPLES_TO_BYTES(index1), data, first_size);
+        memcpy(ctx->samples, (char*)data + first_size, second_size);
+        ctx->index1 = TS_BYTES_TO_SAMPLES(second_size);
+    }
+
+    else
+    {
+        memcpy((char*)ctx->samples + TS_SAMPLES_TO_BYTES(index1), data, size);
+        ctx->index1 += TS_BYTES_TO_SAMPLES(size);
+    }
+}
+
+// Copies up to `size` bytes of interleaved samples from the ring buffer into
+// `dst`, starting at the read index (index0) and advancing it.
+// Returns the number of bytes that could NOT be supplied because the ring
+// buffer held less than `size` bytes; the callers zero-fill that tail of dst.
+static int tsPullBytes(tsContext* ctx, void* dst, int size)
+{
+    int index0 = ctx->index0;
+    int index1 = ctx->index1;
+    int allowed_size = TS_SAMPLES_TO_BYTES(tsSamplesWritten(ctx));
+    int zeros = 0;
+
+    // Never hand out more than what has been written.
+    if (allowed_size < size)
+    {
+        zeros = size - allowed_size;
+        size = allowed_size;
+    }
+
+    // Data is contiguous between the read and write index.
+    if (index1 >= index0)
+    {
+        memcpy(dst, ((char*)ctx->samples) + TS_SAMPLES_TO_BYTES(index0), size);
+        ctx->index0 += TS_BYTES_TO_SAMPLES(size);
+    }
+
+    // Data wraps: read up to the end of the buffer, then continue from the start.
+    else
+    {
+        int first_size = TS_SAMPLES_TO_BYTES(ctx->sample_count) - TS_SAMPLES_TO_BYTES(index0);
+        if (first_size > size) first_size = size;
+        int second_size = size - first_size;
+        memcpy(dst, ((char*)ctx->samples) + TS_SAMPLES_TO_BYTES(index0), first_size);
+        memcpy(((char*)dst) + first_size, ctx->samples, second_size);
+        // If anything was read from the start of the buffer the read index now
+        // lives there, otherwise it simply moved forward within the tail.
+        if (second_size) ctx->index0 = TS_BYTES_TO_SAMPLES(second_size);
+        else ctx->index0 += TS_BYTES_TO_SAMPLES(first_size);
+    }
+
+    return zeros;
+}
+
+#endif
+
+// Stops the mix thread (if one was spawned), waits for it to finish and then
+// releases the context through the platform's tsReleaseContext.
+// NOTE(review): the mix thread clears separate_thread before this function
+// calls tsReleaseContext, so the "if (ctx->separate_thread)" mutex cleanup in
+// tsReleaseContext looks like it can never run - confirm.
+void tsShutdownContext(tsContext* ctx)
+{
+    if (ctx->separate_thread)
+    {
+        // Ask the mix thread to leave its loop.
+        tsLock(ctx);
+        ctx->running = 0;
+        tsUnlock(ctx);
+    }
+
+    // The thread clears separate_thread as its last action.
+    for (; ctx->separate_thread; )
+    {
+        tsSleep(1);
+    }
+
+    tsReleaseContext(ctx);
+}
+
+// Sets how long the mix thread sleeps between calls to tsMix, in milliseconds.
+// The value is read by tsCtxThread on every loop iteration.
+void tsThreadSleepDelay(tsContext* ctx, int milliseconds)
+{
+    ctx->sleep_milliseconds = milliseconds;
+}
+
+// Low-level API: links a caller-owned tsPlayingSound into the context's
+// playing list and marks it active. A sound that is already active is left alone.
+void tsInsertSound(tsContext* ctx, tsPlayingSound* sound)
+{
+    // This function is only valid when tsMakeContext was given a
+    // playing_pool_count of zero. A non-zero count means the context manages
+    // the tsPlayingSound memory itself (high-level API), and caller-owned
+    // sounds must not be mixed into that scheme (see top of this header for
+    // documentation details).
+    TS_ASSERT(ctx->playing_pool == 0);
+
+    if (!sound->active)
+    {
+        tsLock(ctx);
+        sound->active = 1;
+        sound->next = ctx->playing;
+        ctx->playing = sound;
+        tsUnlock(ctx);
+    }
+}
+
+// Delays the start of a sound by storing a negative sample_index: the delay
+// is converted to samples at the context's frequency, aligned to a multiple
+// of 4 and negated.
+// NOTE: negative delay_in_seconds values are clamped to 0.
+void tsSetDelay(tsContext* ctx, tsPlayingSound* sound, float delay_in_seconds)
+{
+    float seconds = delay_in_seconds < 0.0f ? 0.0f : delay_in_seconds;
+    int delay_samples = (int)(seconds * (float)ctx->Hz);
+    sound->sample_index = -(int)TS_ALIGN(delay_samples, 4);
+}
+
+// Builds a tsPlaySoundDef for the given loaded sound with default settings:
+// not paused, not looped, full volume on both channels, centered pan,
+// unmodified pitch and no delay.
+tsPlaySoundDef tsMakeDef(tsLoadedSound* sound)
+{
+    tsPlaySoundDef def;
+
+    def.loaded = sound;
+
+    // playback state
+    def.paused = 0;
+    def.looped = 0;
+    def.delay = 0.0f;
+
+    // levels
+    def.volume_left = 1.0f;
+    def.volume_right = 1.0f;
+    def.pan = 0.5f;
+    def.pitch = 1.0f;
+
+    return def;
+}
+
+// High-level API: takes a tsPlayingSound from the context's free list,
+// configures it from `def` and links it into the playing list.
+// Returns the playing sound, or 0 when the free list is empty (pool exhausted
+// or no pool was requested in tsMakeContext).
+tsPlayingSound* tsPlaySound(tsContext* ctx, tsPlaySoundDef def)
+{
+    tsLock(ctx);
+
+    tsPlayingSound* playing = ctx->playing_free;
+    if (!playing)
+    {
+        // Release the lock before bailing out, otherwise the mix thread (and
+        // any later API call) would block forever.
+        tsUnlock(ctx);
+        return 0;
+    }
+
+    ctx->playing_free = playing->next;
+    *playing = tsMakePlayingSound(def.loaded);
+    playing->active = 1;
+    playing->paused = def.paused;
+    playing->looped = def.looped;
+    tsSetVolume(playing, def.volume_left, def.volume_right);
+    tsSetPan(playing, def.pan);
+    tsSetPitch(playing, def.pitch);
+    tsSetDelay(ctx, playing, def.delay);
+    playing->next = ctx->playing;
+    ctx->playing = playing;
+
+    tsUnlock(ctx);
+
+    return playing;
+}
+
+// High-level API: stops every playing sound and returns its slot to the
+// context's free list.
+void tsStopAllSounds(tsContext* ctx)
+{
+    // This is apart of the high level API, not the low level API.
+    // If using the low level API you must write your own function to
+    // stop playing all sounds. The sounds are pushed onto the pool's free
+    // list below, so a pool must exist.
+    TS_ASSERT(ctx->playing_pool != 0);
+
+    // The mix thread walks and edits ctx->playing under this lock as well.
+    tsLock(ctx);
+
+    tsPlayingSound* sound = ctx->playing;
+    ctx->playing = 0;
+
+    while (sound)
+    {
+        tsPlayingSound* next = sound->next;
+
+        // Mirror what tsMix does when it removes a sound: mark it inactive
+        // and free its pitch filters before the slot is reused.
+        sound->active = 0;
+        tsRemoveFilter(sound);
+
+        sound->next = ctx->playing_free;
+        ctx->playing_free = sound;
+        sound = next;
+    }
+
+    tsUnlock(ctx);
+}
+
+#if TS_PLATFORM == TS_WINDOWS
+
+// Works out which part of the DirectSound buffer should be filled next.
+// *byte_to_lock receives the byte offset where writing resumes (derived from
+// running_index), *bytes_to_write the number of bytes from there up to a
+// target placed latency_samples past DirectSound's write cursor.
+static void tsPosition(tsContext* ctx, int* byte_to_lock, int* bytes_to_write)
+{
+    DWORD play_pos;
+    DWORD write_pos;
+#ifdef __cplusplus
+    HRESULT hr = ctx->buffer->GetCurrentPosition(&play_pos, &write_pos);
+#else
+    HRESULT hr = ctx->buffer->lpVtbl->GetCurrentPosition(ctx->buffer, &play_pos, &write_pos);
+#endif
+    TS_ASSERT(hr == DS_OK);
+
+    // Resume where the previous write stopped.
+    DWORD start = (ctx->running_index * ctx->bps) % ctx->buffer_size;
+
+    // Aim latency_samples past the write cursor, aligned to 16 bytes.
+    DWORD goal = (write_pos + ctx->latency_samples * ctx->bps) % ctx->buffer_size;
+    goal = (DWORD)TS_ALIGN(goal, 16);
+
+    // Distance from start to goal, going around the end of the circular
+    // buffer when the goal lies before the start.
+    DWORD length = (start > goal) ? (ctx->buffer_size - start) + goal : goal - start;
+
+    *byte_to_lock = start;
+    *bytes_to_write = length;
+}
+
+// Copies mixed 16-bit interleaved samples into the DirectSound buffer.
+// Locks bytes_to_write bytes starting at byte_to_lock (DirectSound may hand
+// back two regions), copies into both regions, unlocks, and advances
+// ctx->running_index by the number of samples copied. If the lock fails, even
+// after one Restore attempt for a lost buffer, nothing is written.
+static void tsMemcpyToDS(tsContext* ctx, int16_t* samples, int byte_to_lock, int bytes_to_write)
+{
+    // copy mixer buffers to direct sound
+    void* region1;
+    DWORD size1;
+    void* region2;
+    DWORD size2;
+#ifdef __cplusplus
+    HRESULT hr = ctx->buffer->Lock(byte_to_lock, bytes_to_write, &region1, &size1, &region2, &size2, 0);
+
+    if (hr == DSERR_BUFFERLOST)
+    {
+        ctx->buffer->Restore();
+        hr = ctx->buffer->Lock(byte_to_lock, bytes_to_write, &region1, &size1, &region2, &size2, 0);
+    }
+#else
+    HRESULT hr = ctx->buffer->lpVtbl->Lock(ctx->buffer, byte_to_lock, bytes_to_write, &region1, &size1, &region2, &size2, 0);
+
+    if (hr == DSERR_BUFFERLOST)
+    {
+        ctx->buffer->lpVtbl->Restore(ctx->buffer);
+        hr = ctx->buffer->lpVtbl->Lock(ctx->buffer, byte_to_lock, bytes_to_write, &region1, &size1, &region2, &size2, 0);
+    }
+#endif
+
+    if (!SUCCEEDED(hr))
+        return;
+
+    // First locked region.
+    unsigned running_index = ctx->running_index;
+    INT16* sample1 = (INT16*)region1;
+    DWORD sample1_count = size1 / ctx->bps;
+    memcpy(sample1, samples, sample1_count * sizeof(INT16) * 2);
+    samples += sample1_count * 2;
+    running_index += sample1_count;
+
+    // Second locked region (size2 is used as returned by Lock).
+    INT16* sample2 = (INT16*)region2;
+    DWORD sample2_count = size2 / ctx->bps;
+    memcpy(sample2, samples, sample2_count * sizeof(INT16) * 2);
+    samples += sample2_count * 2;
+    running_index += sample2_count;
+
+#ifdef __cplusplus
+    ctx->buffer->Unlock(region1, size1, region2, size2);
+#else
+    ctx->buffer->lpVtbl->Unlock(ctx->buffer, region1, size1, region2, size2);
+#endif
+    ctx->running_index = running_index;
+
+    // meager hack to fill out sound buffer before playing
+    // NOTE(review): `first` is a function-level static, so it is shared by all
+    // contexts; Play is only ever issued for the first buffer that gets here.
+    // Confirm whether a second context (or a re-created one) is expected to work.
+    static int first;
+    if (!first)
+    {
+#ifdef __cplusplus
+        ctx->buffer->Play(0, 0, DSBPLAY_LOOPING);
+#else
+        ctx->buffer->lpVtbl->Play(ctx->buffer, 0, 0, DSBPLAY_LOOPING);
+#endif
+        first = 1;
+    }
+}
+
+#elif TS_PLATFORM == TS_MAC
+
+// CoreAudio render callback: fills the single output buffer from the
+// context's ring buffer and zeroes whatever part could not be supplied.
+// udata is the tsContext registered in tsMakeContext. Always returns noErr.
+static OSStatus tsMemcpyToCA(void* udata, AudioUnitRenderActionFlags* ioActionFlags, const AudioTimeStamp* inTimeStamp, UInt32 inBusNumber, UInt32 inNumberFrames, AudioBufferList* ioData)
+{
+    tsContext* ctx = (tsContext*)udata;
+    AudioBuffer* out = ioData->mBuffers;
+    int frame_bytes = ctx->bps;
+    int requested_frames = inNumberFrames;
+    int total_bytes = out->mDataByteSize;
+
+    // The stream was configured as one interleaved stereo buffer.
+    TS_ASSERT(ioData->mNumberBuffers == 1);
+    TS_ASSERT(out->mNumberChannels == 2);
+    TS_ASSERT(total_bytes == requested_frames * frame_bytes);
+
+    // tsPullBytes reports how many trailing bytes it could not provide.
+    int silent_bytes = tsPullBytes(ctx, out->mData, total_bytes);
+    char* tail = ((char*)out->mData) + (total_bytes - silent_bytes);
+    memset(tail, 0, silent_bytes);
+
+    return noErr;
+}
+
+#elif TS_PLATFORM == TS_SDL
+
+// SDL audio callback: fills `stream` with `len` bytes from the context's ring
+// buffer and zeroes whatever part could not be supplied.
+// udata is the tsContext registered in tsMakeContext.
+static void tsSDL_AudioCallback(void* udata, Uint8* stream, int len)
+{
+    tsContext* ctx = (tsContext*)udata;
+
+    // tsPullBytes reports how many trailing bytes it could not provide.
+    int silent_bytes = tsPullBytes(ctx, stream, len);
+    Uint8* tail = stream + (len - silent_bytes);
+    memset(tail, 0, silent_bytes);
+}
+
+#endif
+
+// Pitch shifts num_samples_to_process samples from indata. tsMix reads the
+// result from (*pitch_filter)->pitch_shifted_output_samples after the call.
+static void tsPitchShift(float pitchShift, int num_samples_to_process, float sampleRate, float* indata, tsPitchData** pitch_filter);
+
+// Pitch processing tunables
+// TS_MAX_FRAME_LENGTH is the largest number of samples tsMix will pitch shift in one call.
+#define TS_MAX_FRAME_LENGTH 4096
+#define TS_PITCH_FRAME_SIZE 512
+#define TS_PITCH_QUALITY 8
+
+// internals (derived from the tunables above)
+#define TS_STEPSIZE (TS_PITCH_FRAME_SIZE / TS_PITCH_QUALITY)
+#define TS_OVERLAP (TS_PITCH_FRAME_SIZE - TS_STEPSIZE)
+#define TS_EXPECTED_FREQUENCY (2.0f * 3.14159265359f * (float)TS_STEPSIZE / (float)TS_PITCH_FRAME_SIZE)
+
+// TODO:
+// Use a memory pool for these things. For now they are just malloc16'd/free16'd
+// Not high priority to use a pool, since pitch shifting is already really expensive,
+// and cost of malloc is dwarfed. But would be a nice-to-have for potential memory
+// fragmentation issues.
+//
+// Per-channel working state of the pitch shifter. One instance is kept per
+// channel in tsPlayingSound::pitch_filter and freed by tsRemoveFilter.
+typedef struct tsPitchData
+{
+    // Output of the last tsPitchShift call; tsMix mixes from this buffer instead of the source samples.
+    float pitch_shifted_output_samples[TS_MAX_FRAME_LENGTH];
+    float in_FIFO[TS_STEPSIZE + TS_PITCH_FRAME_SIZE];
+    float out_FIFO[TS_STEPSIZE + TS_PITCH_FRAME_SIZE];
+    float fft_data[2 * TS_PITCH_FRAME_SIZE];
+    float previous_phase[TS_PITCH_FRAME_SIZE / 2 + 4];
+    float sum_phase[TS_PITCH_FRAME_SIZE / 2 + 4];
+    float window_accumulator[TS_STEPSIZE + TS_PITCH_FRAME_SIZE];
+    float freq[TS_PITCH_FRAME_SIZE];
+    float mag[TS_PITCH_FRAME_SIZE];
+    float pitch_shift_workspace[TS_PITCH_FRAME_SIZE];
+    int index;
+} tsPitchData;
+
+// Frees both per-channel pitch filters of a playing sound (if allocated) and
+// clears the pointers so the call can safely be repeated.
+static void tsRemoveFilter(tsPlayingSound* playing)
+{
+    for (int channel = 0; channel < 2; ++channel)
+    {
+        if (!playing->pitch_filter[channel]) continue;
+
+        free16(playing->pitch_filter[channel]);
+        playing->pitch_filter[channel] = 0;
+    }
+}
+
+// Mixes one batch of audio and hands it to the platform layer.
+//
+// Steps, all performed while holding the context lock:
+//   1. ask the platform layer how many samples are needed right now
+//   2. clear the two float mix buffers (floatA / floatB, one per output channel)
+//   3. add every active, unpaused playing sound into the buffers, applying
+//      volume, pan, start delay and optional pitch shifting; finished or
+//      inactive sounds are unlinked (and returned to the pool if one exists)
+//   4. convert the floats to packed, interleaved 16-bit samples and write
+//      them to DirectSound (Windows) or push them into the ring buffer
+//      consumed by the audio callback (Mac / SDL)
+void tsMix(tsContext* ctx)
+{
+    tsLock(ctx);
+
+#if TS_PLATFORM == TS_WINDOWS
+
+    int byte_to_lock;
+    int bytes_to_write;
+    tsPosition(ctx, &byte_to_lock, &bytes_to_write);
+
+    if (!bytes_to_write) goto unlock;
+    int samples_to_write = bytes_to_write / ctx->bps;
+
+#elif TS_PLATFORM == TS_MAC || TS_PLATFORM == TS_SDL
+
+    int samples_to_write = tsSamplesToMix(ctx);
+    if (!samples_to_write) goto unlock;
+    int bytes_to_write = samples_to_write * ctx->bps;
+
+#else
+#endif
+
+    // clear mixer buffers
+    // (samples are processed four at a time, so the count must be a multiple of 4)
+    int wide_count = samples_to_write / 4;
+    TS_ASSERT(!(samples_to_write & 3));
+
+    __m128* floatA = ctx->floatA;
+    __m128* floatB = ctx->floatB;
+    __m128 zero = _mm_set1_ps(0.0f);
+
+    for (int i = 0; i < wide_count; ++i)
+    {
+        floatA[i] = zero;
+        floatB[i] = zero;
+    }
+
+    // mix all playing sounds into the mixer buffers
+    // ptr always points at the link that refers to the current sound, so a
+    // sound can be unlinked by overwriting *ptr.
+    tsPlayingSound** ptr = &ctx->playing;
+    while (*ptr)
+    {
+        tsPlayingSound* playing = *ptr;
+        tsLoadedSound* loaded = playing->loaded_sound;
+        __m128* cA = (__m128*)loaded->channels[0];
+        __m128* cB = (__m128*)loaded->channels[1];
+
+        // Attempted to play a sound with no audio.
+        // Make sure the audio file was loaded properly. Check for
+        // error messages in g_tsErrorReason.
+        TS_ASSERT(cA);
+
+        // Never mix past the end of the sound.
+        int mix_count = samples_to_write;
+        int offset = playing->sample_index;
+        int remaining = loaded->sample_count - offset;
+        if (remaining < mix_count) mix_count = remaining;
+        TS_ASSERT(remaining > 0);
+
+        float vA0 = playing->volume0 * playing->pan0;
+        float vB0 = playing->volume1 * playing->pan1;
+        __m128 vA = _mm_set1_ps(vA0);
+        __m128 vB = _mm_set1_ps(vB0);
+
+        // skip sound if it's delay is longer than mix_count and
+        // handle various delay cases
+        // (a negative sample_index means the sound has not started yet, see tsSetDelay)
+        int delay_offset = 0;
+        if (offset < 0)
+        {
+            int samples_till_positive = -offset;
+            int mix_leftover = mix_count - samples_till_positive;
+
+            if (mix_leftover <= 0)
+            {
+                playing->sample_index += mix_count;
+                goto get_next_playing_sound;
+            }
+
+            else
+            {
+                offset = 0;
+                delay_offset = samples_till_positive;
+                mix_count = mix_leftover;
+            }
+        }
+        TS_ASSERT(!(delay_offset & 3));
+
+        // immediately remove any inactive elements
+        if (!playing->active || !ctx->running)
+            goto remove;
+
+        // skip all paused sounds
+        if (playing->paused)
+            goto get_next_playing_sound;
+
+        // SIMD offets
+        int mix_wide = (int)TS_ALIGN(mix_count, 4) / 4;
+        int offset_wide = (int)TS_TRUNC(offset, 4) / 4;
+        int delay_wide = (int)TS_ALIGN(delay_offset, 4) / 4;
+
+        // use tsPitchShift to on-the-fly pitch shift some samples
+        // only call this function if the user set a custom pitch value
+        if (playing->pitch != 1.0f)
+        {
+            int sample_count = (mix_wide - 2 * delay_wide) * 4;
+            int falling_behind = sample_count > TS_MAX_FRAME_LENGTH;
+
+            // TS_MAX_FRAME_LENGTH represents max samples we can pitch shift in one go. In the event
+            // that this process takes longer than the time required to play the actual sound, just
+            // fall back to the original sound (non-pitch shifted). This will sound very ugly. To
+            // prevent falling behind, make sure not to pitch shift too many sounds at once. Try tweaking
+            // TS_PITCH_QUALITY to make it lower (must be a power of 2).
+            if (!falling_behind)
+            {
+                tsPitchShift(playing->pitch, sample_count, (float)ctx->Hz, (float*)(cA + delay_wide + offset_wide), playing->pitch_filter);
+                cA = (__m128 *)playing->pitch_filter[0]->pitch_shifted_output_samples;
+
+                if (loaded->channel_count == 2)
+                {
+                    tsPitchShift(playing->pitch, sample_count, (float)ctx->Hz, (float*)(cB + delay_wide + offset_wide), playing->pitch_filter + 1);
+                    cB = (__m128 *)playing->pitch_filter[1]->pitch_shifted_output_samples;
+                }
+
+                // The shifted output starts at index 0 of the filter's buffer,
+                // so cancel out the loop's starting index below.
+                offset_wide = -delay_wide;
+            }
+        }
+
+        // apply volume, load samples into float buffers
+        // NOTE(review): when delay_wide != 0 the loops below cover
+        // [delay_wide, mix_wide - delay_wide) although mix_count was already
+        // reduced by the delay above - confirm this is the intended range.
+        switch (loaded->channel_count)
+        {
+        case 1:
+            // mono source: the same samples feed both output channels
+            for (int i = delay_wide; i < mix_wide - delay_wide; ++i)
+            {
+                __m128 A = cA[i + offset_wide];
+                __m128 B = _mm_mul_ps(A, vB);
+                A = _mm_mul_ps(A, vA);
+                floatA[i] = _mm_add_ps(floatA[i], A);
+                floatB[i] = _mm_add_ps(floatB[i], B);
+            }
+            break;
+
+        case 2:
+        {
+            for (int i = delay_wide; i < mix_wide - delay_wide; ++i)
+            {
+                __m128 A = cA[i + offset_wide];
+                __m128 B = cB[i + offset_wide];
+
+                A = _mm_mul_ps(A, vA);
+                B = _mm_mul_ps(B, vB);
+                floatA[i] = _mm_add_ps(floatA[i], A);
+                floatB[i] = _mm_add_ps(floatB[i], B);
+            }
+        }    break;
+        }
+
+        // playing list logic
+        playing->sample_index += mix_count;
+        if (playing->sample_index == loaded->sample_count)
+        {
+            if (playing->looped)
+            {
+                playing->sample_index = 0;
+                goto get_next_playing_sound;
+            }
+
+        remove:
+            // unlink the sound and reset it so it can be played again
+            playing->sample_index = 0;
+            *ptr = (*ptr)->next;
+            playing->next = 0;
+            playing->active = 0;
+
+            tsRemoveFilter(playing);
+
+            // if using high-level API manage the tsPlayingSound memory ourselves
+            if (ctx->playing_pool)
+            {
+                playing->next = ctx->playing_free;
+                ctx->playing_free = playing;
+            }
+
+            // we already incremented next pointer, so don't do it again
+            continue;
+        }
+
+    get_next_playing_sound:
+        if (*ptr) ptr = &(*ptr)->next;
+        else break;
+    }
+
+    // load all floats into 16 bit packed interleaved samples
+#if TS_PLATFORM == TS_WINDOWS
+
+    __m128i* samples = ctx->samples;
+    for (int i = 0; i < wide_count; ++i)
+    {
+        // convert to 32-bit ints, interleave A/B, then pack to 16 bit with saturation
+        __m128i a = _mm_cvtps_epi32(floatA[i]);
+        __m128i b = _mm_cvtps_epi32(floatB[i]);
+        __m128i a0b0a1b1 = _mm_unpacklo_epi32(a, b);
+        __m128i a2b2a3b3 = _mm_unpackhi_epi32(a, b);
+        samples[i] = _mm_packs_epi32(a0b0a1b1, a2b2a3b3);
+    }
+    tsMemcpyToDS(ctx, (int16_t*)samples, byte_to_lock, bytes_to_write);
+
+#elif TS_PLATFORM == TS_MAC || TS_PLATFORM == TS_SDL
+
+    // Since the ctx->samples array is already in use as a ring buffer
+    // reusing floatA to store output is a good way to temporarly store
+    // the final samples. Then a single ring buffer push can be used
+    // afterwards. Pretty hacky, but whatever :)
+    __m128i* samples = (__m128i*)floatA;
+    memset(samples, 0, sizeof(__m128i) * wide_count);
+    for (int i = 0; i < wide_count; ++i)
+    {
+        __m128i a = _mm_cvtps_epi32(floatA[i]);
+        __m128i b = _mm_cvtps_epi32(floatB[i]);
+        __m128i a0b0a1b1 = _mm_unpacklo_epi32(a, b);
+        __m128i a2b2a3b3 = _mm_unpackhi_epi32(a, b);
+        samples[i] = _mm_packs_epi32(a0b0a1b1, a2b2a3b3);
+    }
+    tsPushBytes(ctx, samples, bytes_to_write);
+
+#else
+#endif
+
+unlock:
+    tsUnlock(ctx);
+}
+
+// TODO:
+// Try this optimization out (2N POINT REAL FFT USING AN N POINT COMPLEX FFT)
+// http://www.fftguru.com/fftguru.com.tutorial2.pdf
+
+#include <math.h>
+
+// Reverses the order of the 32 bits in x (bit 0 becomes bit 31 and so on) by
+// swapping neighbouring groups of 1, 2, 4, 8 and finally 16 bits.
+static uint32_t tsRev32(uint32_t x)
+{
+    x = ((x >> 1) & 0x55555555) | ((x & 0x55555555) << 1);
+    x = ((x >> 2) & 0x33333333) | ((x & 0x33333333) << 2);
+    x = ((x >> 4) & 0x0F0F0F0F) | ((x & 0x0F0F0F0F) << 4);
+    x = ((x >> 8) & 0x00FF00FF) | ((x & 0x00FF00FF) << 8);
+    return (x << 16) | (x >> 16);
+}
+
+// Returns the number of set bits in x.
+static uint32_t tsPopCount(uint32_t x)
+{
+    // Fold the bit counts into 2-bit, then 4-bit, then 8-bit fields.
+    x -= (x >> 1) & 0x55555555;
+    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
+    x = (x + (x >> 4)) & 0x0F0F0F0F;
+
+    // Sum the four byte counts into the low byte; the total is at most 32.
+    x += x >> 8;
+    x += x >> 16;
+    return x & 0x0000003F;
+}
+
+// Returns the index of the highest set bit of x (floor of log2 for x > 0),
+// and 0 for x == 0.
+static uint32_t tsLog2(uint32_t x)
+{
+    // Smear the highest set bit into every lower position...
+    x |= x >> 1;
+    x |= x >> 2;
+    x |= x >> 4;
+    x |= x >> 8;
+    x |= x >> 16;
+
+    // ...then count the bits below the highest one.
+    return tsPopCount(x >> 1);
+}
+
+// In-place radix-2 FFT over split real / imaginary arrays.
+// x     - real parts, count floats, overwritten with the transformed values
+// y     - imaginary parts, count floats, overwritten with the transformed values
+// count - number of complex points; must be a power of 2
+// sign  - 1.0f runs the forward transform and scales the output by 1 / count,
+//         -1.0f runs the inverse transform and leaves the output unscaled
+static void tsFFT(float* x, float* y, int count, float sign)
+{
+    int stage_count = (int)tsLog2((uint32_t)count);
+
+    // Bit reversal permutation: exchange every element with the one whose
+    // index is the bit-reversed form of its own index (over stage_count bits).
+    // Only exchanging when i is the smaller index handles each pair once.
+    for (int i = 1; i < count - 1; i++)
+    {
+        uint32_t partner = tsRev32((uint32_t)i) >> (32 - stage_count);
+        if (i >= (int)partner) continue;
+
+        float real_i = x[i];
+        float imag_i = y[i];
+        x[i] = x[partner];
+        y[i] = y[partner];
+        x[partner] = real_i;
+        y[partner] = imag_i;
+    }
+
+    // One pass per butterfly stage. half_span is the distance between the two
+    // inputs of a butterfly and doubles on every pass (1, 2, 4, ...).
+    for (int stage = 0, span = 1; stage < stage_count; ++stage)
+    {
+        int half_span = span;
+        span <<= 1;
+
+        // (ur, ui) is the running twiddle factor and starts at angle zero:
+        // cos( 0 ) = 1, sin( 0 ) = 0. (wr, wi) is the rotation applied to it
+        // after each row; sign picks the rotation direction.
+        float ur = 1.0f;
+        float ui = 0;
+        float step_angle = 3.14159265359f / (float)half_span;
+        float wr = cosf(step_angle);
+        float wi = -sign * sinf(step_angle);
+
+        for (int row = 0; row < half_span; ++row)
+        {
+            // every butterfly in this row shares the same twiddle factor
+            for (int lo = row; lo < count; lo += span)
+            {
+                int hi = lo + half_span;
+                float hi_re = x[hi];
+                float hi_im = y[hi];
+                float lo_re = x[lo];
+                float lo_im = y[lo];
+
+                // complex multiply of the upper input by the twiddle factor
+                float tr = ur * hi_re - ui * hi_im;
+                float ti = ur * hi_im + ui * hi_re;
+
+                x[hi] = lo_re - tr;
+                y[hi] = lo_im - ti;
+                x[lo] = lo_re + tr;
+                y[lo] = lo_im + ti;
+            }
+
+            // Advance (ur, ui) by one planar rotation instead of calling
+            // cosf / sinf again; this keeps the trig calls in the outer loop
+            // at the price of rounding error that accumulates across rows.
+            float next_ur = ur * wr - ui * wi;
+            ui = ur * wi + ui * wr;
+            ur = next_ur;
+        }
+    }
+
+    // only the forward transform is normalized
+    if (!(sign > 0)) return;
+
+    float scale = 1.0f / (float)count;
+    for (int i = 0; i < count; i++)
+    {
+        x[i] *= scale;
+        y[i] *= scale;
+    }
+}
+/* Compiler-specific pieces used to declare the 16 byte aligned constant
+ * tables that the SSE2 math routines in this file load from.
+ *   TS_ALIGN16_0 - alignment specifier written in front of the declaration
+ *   TS_ALIGN16_1 - alignment specifier written after the declarator
+ *   TS_SELECTANY - qualifier prefix that carries the selectany attribute
+ * Each branch leaves one of the two alignment macros empty, so a declaration
+ * that uses both ends up with exactly one alignment specifier.
+ */
+#ifdef _MSC_VER
+// MSVC branch: __declspec spelling, alignment goes in front
+#define TS_ALIGN16_0 __declspec( align( 16 ) )
+#define TS_ALIGN16_1
+#define TS_SELECTANY extern const __declspec( selectany )
+
+#else
+// every other compiler: __attribute__ spelling, alignment goes at the end
+#define TS_ALIGN16_0
+#define TS_ALIGN16_1 __attribute__( (aligned( 16 )) )
+#define TS_SELECTANY const __attribute__( (selectany) )
+
+#endif
+// SSE2 trig funcs from https://github.com/to-miz/sse_mathfun_extension/
+// _PS_CONST( Name, Val ): defines _ps_Name, four floats that all hold Val.
+#define _PS_CONST( Name, Val ) \
+    TS_SELECTANY TS_ALIGN16_0 float _ps_##Name[ 4 ] TS_ALIGN16_1 = { Val, Val, Val, Val }
+// _PS_CONST_TYPE( Name, Type, Val ): like _PS_CONST, but the four elements have type Type.
+#define _PS_CONST_TYPE( Name, Type, Val ) \
+    TS_SELECTANY TS_ALIGN16_0 Type _ps_##Name[ 4 ] TS_ALIGN16_1 = { Val, Val, Val, Val }
+// _PI32_CONST( Name, Val ): defines _pi32_Name, four ints that all hold Val.
+#define _PI32_CONST( Name, Val ) \
+    TS_SELECTANY TS_ALIGN16_0 int _pi32_##Name[ 4 ] TS_ALIGN16_1 = { Val, Val, Val, Val }
+/* Constant tables for the SSE2 routines below. Every table repeats one value
+ * in all four lanes; the routines read them as *(__m128*)_ps_Name or
+ * *(__m128i*)_pi32_Name.
+ */
+_PS_CONST_TYPE(sign_mask, int, (int)0x80000000); // only the top bit of each lane set
+_PS_CONST_TYPE(inv_sign_mask, int, (int)~0x80000000); // every bit except the top one
+// float tables
+_PS_CONST(atanrange_hi, 2.414213562373095f); // upper range-reduction threshold in _mm_atan_ps
+_PS_CONST(atanrange_lo, 0.4142135623730950f); // lower range-reduction threshold in _mm_atan_ps
+_PS_CONST(cephes_PIO2F, 1.5707963267948966192f); // pi / 2
+_PS_CONST(cephes_PIO4F, 0.7853981633974483096f); // pi / 4
+_PS_CONST(1, 1.0f);
+_PS_CONST(0p5, 0.5f);
+_PS_CONST(0, 0);
+_PS_CONST(sincof_p0, -1.9515295891E-4f); // sincof_p0..p2: sine polynomial in _mm_sincos_ps
+_PS_CONST(sincof_p1, 8.3321608736E-3f);
+_PS_CONST(sincof_p2, -1.6666654611E-1f);
+_PS_CONST(atancof_p0, 8.05374449538e-2f); // atancof_p0..p3: polynomial in _mm_atan_ps
+_PS_CONST(atancof_p1, 1.38776856032E-1f);
+_PS_CONST(atancof_p2, 1.99777106478E-1f);
+_PS_CONST(atancof_p3, 3.33329491539E-1f);
+_PS_CONST(cephes_PIF, 3.141592653589793238f); // pi
+_PS_CONST(cephes_2PIF, 2.0f * 3.141592653589793238f); // 2 * pi
+_PS_CONST(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
+_PS_CONST(minus_cephes_DP1, -0.78515625f); // DP1 + DP2 + DP3 add up to pi / 4, split in
+_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f); // three parts for the range reduction in
+_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8f); // _mm_sincos_ps; stored negated
+_PS_CONST(coscof_p0, 2.443315711809948E-005f); // coscof_p0..p2: cosine polynomial in _mm_sincos_ps
+_PS_CONST(coscof_p1, -1.388731625493765E-003f);
+_PS_CONST(coscof_p2, 4.166664568298827E-002f);
+_PS_CONST(frame_size, (float)TS_PITCH_FRAME_SIZE); // divisor used by tsVonHann4
+// integer tables, used by _mm_sincos_ps to test and clear bits of the octant index
+_PI32_CONST(1, 1);
+_PI32_CONST(inv1, ~1);
+_PI32_CONST(2, 2);
+_PI32_CONST(4, 4);
+
+// Four-lane arctangent, result in radians.
+// The magnitude of each lane is first mapped onto a small argument:
+//   |x| > atanrange_hi                  ->  -1 / |x|,              base angle pi/2
+//   atanrange_lo < |x| <= atanrange_hi  ->  (|x| - 1) / (|x| + 1), base angle pi/4
+//   otherwise                           ->  |x| itself,            base angle 0
+// A polynomial in the reduced argument is then added to the base angle and
+// the sign of the input is put back on the result.
+static __m128 _mm_atan_ps(__m128 x)
+{
+    __m128 sign_mask = *(__m128*)_ps_sign_mask;
+    __m128 one = *(__m128*)_ps_1;
+
+    // split the input into sign and magnitude
+    __m128 sign = _mm_and_ps(x, sign_mask);
+    __m128 ax = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
+
+    // classify every lane into one of the three ranges
+    __m128 is_large = _mm_cmpgt_ps(ax, *(__m128*)_ps_atanrange_hi);
+    __m128 above_lo = _mm_cmpgt_ps(ax, *(__m128*)_ps_atanrange_lo);
+    __m128 is_medium = _mm_andnot_ps(is_large, above_lo);
+    __m128 is_reduced = _mm_or_ps(is_large, is_medium);
+
+    // candidate reduced arguments for the large and the medium range
+    __m128 large_arg = _mm_xor_ps(_mm_div_ps(one, ax), sign_mask);
+    __m128 medium_arg = _mm_div_ps(_mm_sub_ps(ax, one), _mm_add_ps(ax, one));
+
+    // per lane: pick the candidate matching the range, or keep |x| as is
+    __m128 reduced = _mm_or_ps(_mm_and_ps(is_medium, medium_arg), _mm_and_ps(is_large, large_arg));
+    __m128 arg = _mm_or_ps(reduced, _mm_andnot_ps(is_reduced, ax));
+
+    // base angle that belongs to the chosen range
+    __m128 base = _mm_or_ps(_mm_and_ps(is_large, *(__m128*)_ps_cephes_PIO2F), _mm_and_ps(is_medium, *(__m128*)_ps_cephes_PIO4F));
+
+    // poly = arg + arg * zz * (((p0 * zz - p1) * zz + p2) * zz - p3)
+    __m128 zz = _mm_mul_ps(arg, arg);
+    __m128 poly = _mm_mul_ps(*(__m128*)_ps_atancof_p0, zz);
+    poly = _mm_sub_ps(poly, *(__m128*)_ps_atancof_p1);
+    poly = _mm_mul_ps(poly, zz);
+    poly = _mm_add_ps(poly, *(__m128*)_ps_atancof_p2);
+    poly = _mm_mul_ps(poly, zz);
+    poly = _mm_sub_ps(poly, *(__m128*)_ps_atancof_p3);
+    poly = _mm_mul_ps(poly, zz);
+    poly = _mm_mul_ps(poly, arg);
+    poly = _mm_add_ps(poly, arg);
+
+    // base angle plus polynomial, with the input's sign restored
+    return _mm_xor_ps(_mm_add_ps(base, poly), sign);
+}
+
+// Four-lane atan2( y, x ), result in radians. Each lane falls into one case:
+//   y == 0 and x >= 0  ->  0 (this includes x == 0, y == 0)
+//   y == 0 and x <  0  ->  pi
+//   x == 0 and y != 0  ->  +pi/2 or -pi/2, following the sign of y
+//   otherwise          ->  atan( y / x ), shifted by +pi or -pi when x < 0
+// The x == 0, y == 0 lane used to come out as NaN, because the NaN produced
+// by 0 / 0 was never masked away; it is now forced to 0.
+static __m128 _mm_atan2_ps(__m128 y, __m128 x)
+{
+    __m128 zero = *(__m128*)_ps_0;
+    __m128 sign_mask = *(__m128*)_ps_sign_mask;
+    __m128 pi = *(__m128*)_ps_cephes_PIF;
+
+    __m128 x_eq_0 = _mm_cmpeq_ps(x, zero);
+    __m128 x_gt_0 = _mm_cmpgt_ps(x, zero);
+    __m128 x_lt_0 = _mm_cmplt_ps(x, zero);
+    __m128 y_eq_0 = _mm_cmpeq_ps(y, zero);
+    __m128 y_lt_0 = _mm_cmplt_ps(y, zero);
+
+    // lanes whose answer is exactly 0: y == 0 with x == 0 or x > 0
+    __m128 zero_mask = _mm_or_ps(_mm_and_ps(x_eq_0, y_eq_0), _mm_and_ps(y_eq_0, x_gt_0));
+
+    // lanes on the y axis (x == 0, y != 0): pi/2 carrying the sign of y
+    __m128 pio2_mask = _mm_andnot_ps(y_eq_0, x_eq_0);
+    __m128 pio2_result = _mm_xor_ps(*(__m128*)_ps_cephes_PIO2F, _mm_and_ps(y_lt_0, sign_mask));
+    pio2_result = _mm_and_ps(pio2_mask, pio2_result);
+
+    // lanes on the negative x axis (y == 0, x < 0): pi.
+    // x == 0 is deliberately excluded so that the origin stays in the zero case.
+    __m128 pi_mask = _mm_and_ps(y_eq_0, x_lt_0);
+    __m128 pi_result = _mm_and_ps(pi_mask, pi);
+
+    // quadrant correction for x < 0: +pi when y >= 0, -pi when y < 0
+    __m128 offset_sign = _mm_and_ps(_mm_and_ps(x_lt_0, y_lt_0), sign_mask);
+    __m128 offset = _mm_and_ps(x_lt_0, _mm_xor_ps(pi, offset_sign));
+
+    // general case
+    __m128 atan_result = _mm_add_ps(_mm_atan_ps(_mm_div_ps(y, x)), offset);
+
+    // Discard the general result wherever a special case applies. y / x is
+    // infinite or NaN in those lanes, so it must not leak into the OR below.
+    atan_result = _mm_andnot_ps(pio2_mask, atan_result);
+    atan_result = _mm_andnot_ps(zero_mask, atan_result);
+
+    // at most one distinct value per lane survives, so OR merges the cases
+    __m128 result = _mm_or_ps(pio2_result, atan_result);
+    return _mm_or_ps(result, pi_result);
+}
+/* Computes the sine and the cosine of the four floats in x in one go; the
+ * sines are stored through s and the cosines through c.
+ * Outline: |x| is scaled by 4/Pi to get an index j that counts Pi/4 steps, j
+ * is rounded up to an even value, and j * Pi/4 is subtracted from |x| in three
+ * parts. A cosine-style and a sine-style polynomial are both evaluated on the
+ * reduced value; bits of j decide which polynomial feeds which output and
+ * which sign each output receives.
+ */
+static void _mm_sincos_ps(__m128 x, __m128 *s, __m128 *c)
+{
+    __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+    __m128i emm0, emm2, emm4;
+    sign_bit_sin = x;
+    /* take the absolute value */
+    x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
+    /* extract the sign bit (upper one); it is applied to the sine output at the end */
+    sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask);
+
+    /* scale by 4/Pi */
+    y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);
+
+    /* store the integer part of y in emm2 */
+    emm2 = _mm_cvttps_epi32(y);
+
+    /* j=(j+1) & (~1) (see the cephes sources): rounds j up to the next even
+       value, so the reduction below always removes a multiple of Pi/2 */
+    emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
+    emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
+    y = _mm_cvtepi32_ps(emm2);
+
+    /* keep an untouched copy of j for the cosine sign computed further down */
+    emm4 = emm2;
+
+    /* get the swap sign flag for the sine: bit 2 of j moved up into the
+       float sign bit position (bit 31) */
+    emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
+    emm0 = _mm_slli_epi32(emm0, 29);
+    __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
+
+    /* get the polynomial selection mask for the sine: all ones in the lanes
+       where (j & 2) == 0 */
+    emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
+    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+    __m128 poly_mask = _mm_castsi128_ps(emm2);
+
+    /* The magic pass: "Extended precision modular arithmetic"
+    x = ((x - y * DP1) - y * DP2) - y * DP3;
+    the three constants are stored negated, hence the additions below */
+    xmm1 = *(__m128*)_ps_minus_cephes_DP1;
+    xmm2 = *(__m128*)_ps_minus_cephes_DP2;
+    xmm3 = *(__m128*)_ps_minus_cephes_DP3;
+    xmm1 = _mm_mul_ps(y, xmm1);
+    xmm2 = _mm_mul_ps(y, xmm2);
+    xmm3 = _mm_mul_ps(y, xmm3);
+    x = _mm_add_ps(x, xmm1);
+    x = _mm_add_ps(x, xmm2);
+    x = _mm_add_ps(x, xmm3);
+
+    /* sign flag for the cosine: bit 2 of ~(j - 2), moved up into bit 31 */
+    emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2);
+    emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4);
+    emm4 = _mm_slli_epi32(emm4, 29);
+    __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
+
+    /* final sine sign = sign of the input combined with the swap flag */
+    sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+
+    /* Evaluate the first polynomial on the reduced x, with z = x * x:
+       y = 1 - z / 2 + z * z * ((coscof_p0 * z + coscof_p1) * z + coscof_p2) */
+    __m128 z = _mm_mul_ps(x, x);
+    y = *(__m128*)_ps_coscof_p0;
+
+    y = _mm_mul_ps(y, z);
+    y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
+    y = _mm_mul_ps(y, z);
+    y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
+    y = _mm_mul_ps(y, z);
+    y = _mm_mul_ps(y, z);
+    __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
+    y = _mm_sub_ps(y, tmp);
+    y = _mm_add_ps(y, *(__m128*)_ps_1);
+
+    /* Evaluate the second polynomial on the same reduced x:
+       y2 = x + x * z * ((sincof_p0 * z + sincof_p1) * z + sincof_p2) */
+
+    __m128 y2 = *(__m128*)_ps_sincof_p0;
+    y2 = _mm_mul_ps(y2, z);
+    y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
+    y2 = _mm_mul_ps(y2, z);
+    y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
+    y2 = _mm_mul_ps(y2, z);
+    y2 = _mm_mul_ps(y2, x);
+    y2 = _mm_add_ps(y2, x);
+
+    /* select the correct result from the two polynomials: where poly_mask is
+       set the sine output takes y2 and the cosine output takes y, elsewhere
+       the two are exchanged */
+    xmm3 = poly_mask;
+    __m128 ysin2 = _mm_and_ps(xmm3, y2);
+    __m128 ysin1 = _mm_andnot_ps(xmm3, y);
+    y2 = _mm_sub_ps(y2, ysin2);
+    y = _mm_sub_ps(y, ysin1);
+
+    /* xmm1 = value for the sine output, xmm2 = value for the cosine output */
+    xmm1 = _mm_add_ps(ysin1, ysin2);
+    xmm2 = _mm_add_ps(y, y2);
+
+    /* update the sign */
+    *s = _mm_xor_ps(xmm1, sign_bit_sin);
+    *c = _mm_xor_ps(xmm2, sign_bit_cos);
+}
+
+// Bitwise select: every result bit comes from b where the matching bit of
+// mask is 1 and from a where it is 0.
+static __m128i select_si(__m128i a, __m128i b, __m128i mask)
+{
+    __m128i from_a = _mm_andnot_si128(mask, a);
+    __m128i from_b = _mm_and_si128(mask, b);
+    return _mm_or_si128(from_a, from_b);
+}
+
+#define tsVonHann( i ) (-0.5f * cosf( 2.0f * 3.14159265359f * (float)(i) / (float)TS_PITCH_FRAME_SIZE ) + 0.5f)
+
+// Returns the von Hann window weights for samples 4 * i through 4 * i + 3 of
+// a TS_PITCH_FRAME_SIZE long frame, lowest sample index in lane 0:
+//   0.5 - 0.5 * cos( 2 * pi * n / TS_PITCH_FRAME_SIZE )
+static __m128 tsVonHann4(int i)
+{
+    int first = i * 4;
+    __m128 angle = _mm_set_ps((float)(first + 3), (float)(first + 2), (float)(first + 1), (float)first);
+    angle = _mm_div_ps(_mm_mul_ps(*(__m128*)_ps_cephes_2PIF, angle), *(__m128*)_ps_frame_size);
+
+    // The cosine is taken lane by lane with cosf on purpose. The SSE cosine
+    // from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h and
+    // _mm_sincos_ps were both tried here and seemed to cause audio popping,
+    // possibly from precision trouble with large arguments. Four scalar calls
+    // are fine because this function is not a bottleneck.
+    float lanes[4];
+    _mm_storeu_ps(lanes, angle);
+    for (int lane = 0; lane < 4; ++lane)
+        lanes[lane] = cosf(lanes[lane]);
+    __m128 c = _mm_loadu_ps(lanes);
+
+    return _mm_add_ps(_mm_mul_ps(_mm_set_ps1(-0.5f), c), _mm_set_ps1(0.5f));
+}
+
+// Analysis and synthesis steps learned from Bernsee's wonderful blog post:
+// http://blogs.zynaptiq.com/bernsee/pitch-shifting-using-the-ft/
+
+// Multiplies count floats starting at data by scale, in place. Uses unaligned
+// SSE loads/stores for whole groups of four and a scalar loop for the rest,
+// so data needs no particular alignment and count may be any value >= 0.
+static void tsScaleSamples(float* data, int count, float scale)
+{
+    __m128 scale4 = _mm_set_ps1(scale);
+    int i = 0;
+    for (; i + 4 <= count; i += 4)
+        _mm_storeu_ps(data + i, _mm_mul_ps(_mm_loadu_ps(data + i), scale4));
+    for (; i < count; ++i)
+        data[i] *= scale;
+}
+
+// Pitch shifts a run of samples with a phase vocoder.
+// pitchShift             - factor applied to the measured bin frequencies; bin k
+//                          is moved to bin (int)(k * pitchShift)
+// num_samples_to_process - number of floats to read from indata; must not
+//                          exceed TS_MAX_FRAME_LENGTH
+// sampleRate             - sample rate used to convert bins to frequencies
+// indata                 - input samples; they are divided by 32768 on the way
+//                          in, and the output is multiplied by 32768 again
+// pitch_filter           - persistent state. When *pitch_filter is NULL a
+//                          zeroed tsPitchData is allocated with malloc16 and
+//                          stored there. If that allocation fails the function
+//                          returns without processing anything and
+//                          *pitch_filter stays NULL.
+// The shifted samples are written to (*pitch_filter)->pitch_shifted_output_samples.
+static void tsPitchShift(float pitchShift, int num_samples_to_process, float sampleRate, float* indata, tsPitchData** pitch_filter)
+{
+    TS_ASSERT(num_samples_to_process <= TS_MAX_FRAME_LENGTH);
+
+    // make sure compiler didn't do anything weird with the member
+    // offsets of tsPitchData. All arrays must be 16 byte aligned
+    TS_ASSERT(!((size_t)&(((tsPitchData*)0)->pitch_shifted_output_samples) & 15));
+    TS_ASSERT(!((size_t)&(((tsPitchData*)0)->fft_data) & 15));
+    TS_ASSERT(!((size_t)&(((tsPitchData*)0)->previous_phase) & 15));
+    TS_ASSERT(!((size_t)&(((tsPitchData*)0)->sum_phase) & 15));
+    TS_ASSERT(!((size_t)&(((tsPitchData*)0)->window_accumulator) & 15));
+    TS_ASSERT(!((size_t)&(((tsPitchData*)0)->freq) & 15));
+    TS_ASSERT(!((size_t)&(((tsPitchData*)0)->mag) & 15));
+    TS_ASSERT(!((size_t)&(((tsPitchData*)0)->pitch_shift_workspace) & 15));
+
+    tsPitchData* pf;
+
+    if (*pitch_filter == NULL)
+    {
+        pf = (tsPitchData*)malloc16(sizeof(tsPitchData));
+        // out of memory: nothing can be processed, leave *pitch_filter NULL
+        if (pf == NULL) return;
+        memset(pf, 0, sizeof(tsPitchData));
+        *pitch_filter = pf;
+    }
+    else
+    {
+        pf = *pitch_filter;
+    }
+
+    float freqPerBin = sampleRate / (float)TS_PITCH_FRAME_SIZE;
+    __m128 freq_per_bin = _mm_set_ps1(freqPerBin);
+    __m128 pi = *(__m128*)_ps_cephes_PIF;
+    __m128 two_pi = *(__m128*)_ps_cephes_2PIF;
+    __m128 pitch_quality = _mm_set_ps1((float)TS_PITCH_QUALITY);
+    // output attenuation applied during overlap-add; constant for every frame
+    __m128 synthesis_divisor = _mm_div_ps(pitch_quality, _mm_set_ps1(8.0f));
+    float* out_samples = pf->pitch_shifted_output_samples;
+    if (pf->index == 0) pf->index = TS_OVERLAP;
+
+    while (num_samples_to_process > 0)
+    {
+        // move as many samples as fit before the input FIFO holds a full frame
+        int copy_count = TS_PITCH_FRAME_SIZE - pf->index;
+        if (num_samples_to_process < copy_count) copy_count = num_samples_to_process;
+
+        memcpy(pf->in_FIFO + pf->index, indata, sizeof(float) * copy_count);
+        memcpy(out_samples, pf->out_FIFO + pf->index - TS_OVERLAP, sizeof(float) * copy_count);
+
+        // Scale the freshly copied input down by 32768 and the outgoing
+        // samples back up by 32768. 1 / 32768 is a power of two, so the
+        // multiplication gives the same values as dividing by 32768.
+        // The helper copes with any count and alignment; the previous
+        // head / body / tail split miscounted whenever copy_count was not a
+        // multiple of 4, touching the wrong samples and advancing the
+        // cursors by the wrong amount.
+        tsScaleSamples(pf->in_FIFO + pf->index, copy_count, 1.0f / 32768.0f);
+        tsScaleSamples(out_samples, copy_count, 32768.0f);
+
+        num_samples_to_process -= copy_count;
+        pf->index += copy_count;
+        indata += copy_count;
+        out_samples += copy_count;
+
+        if (pf->index >= TS_PITCH_FRAME_SIZE)
+        {
+            pf->index = TS_OVERLAP;
+
+            // window the input frame into the real half of fft_data
+            {
+                __m128* fft_data = (__m128*)pf->fft_data;
+                __m128* in_FIFO = (__m128*)pf->in_FIFO;
+
+                for (int k = 0; k < TS_PITCH_FRAME_SIZE / 4; k++)
+                {
+                    __m128 von_hann = tsVonHann4(k);
+                    __m128 sample = in_FIFO[k];
+                    __m128 windowed_sample = _mm_mul_ps(sample, von_hann);
+                    fft_data[k] = windowed_sample;
+                }
+            }
+
+            // imaginary half starts out as zero, then run the forward transform
+            memset(pf->fft_data + TS_PITCH_FRAME_SIZE, 0, TS_PITCH_FRAME_SIZE * sizeof(float));
+            tsFFT(pf->fft_data, pf->fft_data + TS_PITCH_FRAME_SIZE, TS_PITCH_FRAME_SIZE, 1.0f);
+
+            // analysis: magnitude and estimated true frequency of every bin
+            {
+                __m128* fft_data = (__m128*)pf->fft_data;
+                __m128* previous_phase = (__m128*)pf->previous_phase;
+                __m128* magnitudes = (__m128*)pf->mag;
+                __m128* frequencies = (__m128*)pf->freq;
+                int simd_count = (TS_PITCH_FRAME_SIZE / 2) / 4;
+
+                // k <= simd_count: one extra group so bin FRAME_SIZE / 2 is included
+                for (int k = 0; k <= simd_count; k++)
+                {
+                    __m128 real = fft_data[k];
+                    __m128 imag = fft_data[(TS_PITCH_FRAME_SIZE / 4) + k];
+                    __m128 overlap_phase = _mm_set_ps((float)(k * 4 + 3) * TS_EXPECTED_FREQUENCY, (float)(k * 4 + 2) * TS_EXPECTED_FREQUENCY, (float)(k * 4 + 1) * TS_EXPECTED_FREQUENCY, (float)(k * 4) * TS_EXPECTED_FREQUENCY);
+                    __m128 k4 = _mm_set_ps((float)(k * 4 + 3), (float)(k * 4 + 2), (float)(k * 4 + 1), (float)(k * 4));
+
+                    __m128 mag = _mm_mul_ps(_mm_set_ps1(2.0f), _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(real, real), _mm_mul_ps(imag, imag))));
+                    __m128 phase = _mm_atan2_ps(imag, real);
+                    __m128 phase_dif = _mm_sub_ps(phase, previous_phase[k]);
+
+                    previous_phase[k] = phase;
+                    // remove the phase advance expected for this bin
+                    phase_dif = _mm_sub_ps(phase_dif, overlap_phase);
+
+                    // map delta phase into +/- pi interval:
+                    // qpd = trunc( phase_dif / pi ), pushed away from zero to
+                    // the next even value, then qpd * pi is subtracted
+                    __m128i qpd = _mm_cvttps_epi32(_mm_div_ps(phase_dif, pi));
+                    __m128i zero = _mm_setzero_si128();
+                    __m128i ltzero_mask = _mm_cmplt_epi32(qpd, zero);
+                    __m128i ones_bit = _mm_and_si128(qpd, _mm_set1_epi32(1));
+                    __m128i neg_qpd = _mm_sub_epi32(qpd, ones_bit);
+                    __m128i pos_qpd = _mm_add_epi32(qpd, ones_bit);
+                    qpd = select_si(pos_qpd, neg_qpd, ltzero_mask);
+                    __m128 pi_range_offset = _mm_mul_ps(pi, _mm_cvtepi32_ps(qpd));
+                    phase_dif = _mm_sub_ps(phase_dif, pi_range_offset);
+
+                    // deviation from the bin centre, in bins, then in Hz
+                    __m128 deviation = _mm_div_ps(_mm_mul_ps(pitch_quality, phase_dif), two_pi);
+                    __m128 true_freq_estimated = _mm_add_ps(_mm_mul_ps(k4, freq_per_bin), _mm_mul_ps(deviation, freq_per_bin));
+
+                    magnitudes[k] = mag;
+                    frequencies[k] = true_freq_estimated;
+                }
+            }
+
+            // actual pitch shifting work
+            // shift frequencies into workspace
+            // The synthesis loop below reads FRAME_SIZE / 2 + 4 floats from the
+            // workspace, so clear all of them; clearing only FRAME_SIZE / 2
+            // left values from the previous frame in the last lanes.
+            memset(pf->pitch_shift_workspace, 0, (TS_PITCH_FRAME_SIZE / 2 + 4) * sizeof(float));
+            for (int k = 0; k <= TS_PITCH_FRAME_SIZE / 2; k++)
+            {
+                int index = (int)(k * pitchShift);
+                // index >= 0 keeps a negative pitchShift from writing out of bounds
+                if (index >= 0 && index <= TS_PITCH_FRAME_SIZE / 2)
+                    pf->pitch_shift_workspace[index] = pf->freq[k] * pitchShift;
+            }
+
+            // swap buffers around to reuse old pf->freq buffer as the new workspace
+            float* frequencies = pf->pitch_shift_workspace;
+            float* pitch_shift_workspace = pf->freq;
+            float* magnitudes = pf->mag;
+
+            // shift magnitudes into workspace
+            memset(pitch_shift_workspace, 0, TS_PITCH_FRAME_SIZE * sizeof(float));
+            for (int k = 0; k <= TS_PITCH_FRAME_SIZE / 2; k++)
+            {
+                int index = (int)(k * pitchShift);
+                if (index >= 0 && index <= TS_PITCH_FRAME_SIZE / 2)
+                    pitch_shift_workspace[index] += magnitudes[k];
+            }
+
+            // track where the shifted magnitudes are
+            magnitudes = pitch_shift_workspace;
+
+            // synthesis: turn shifted magnitude / frequency back into real / imaginary
+            {
+                __m128* magnitudes4 = (__m128*)magnitudes;
+                __m128* frequencies4 = (__m128*)frequencies;
+                __m128* fft_data = (__m128*)pf->fft_data;
+                __m128* sum_phase = (__m128*)pf->sum_phase;
+                int simd_count = (TS_PITCH_FRAME_SIZE / 2) / 4;
+
+                for (int k = 0; k <= simd_count; k++)
+                {
+                    __m128 mag = magnitudes4[k];
+                    __m128 freq = frequencies4[k];
+                    __m128 freq_per_bin_k = _mm_set_ps((float)(k * 4 + 3) * freqPerBin, (float)(k * 4 + 2) * freqPerBin, (float)(k * 4 + 1) * freqPerBin, (float)(k * 4) * freqPerBin);
+
+                    // frequency -> deviation from the bin centre, in bins
+                    freq = _mm_sub_ps(freq, freq_per_bin_k);
+                    freq = _mm_div_ps(freq, freq_per_bin);
+
+                    // deviation -> phase increment for this frame
+                    freq = _mm_mul_ps(two_pi, freq);
+                    freq = _mm_div_ps(freq, pitch_quality);
+
+                    // add back the phase advance expected for this bin
+                    __m128 overlap_phase = _mm_set_ps((float)(k * 4 + 3) * TS_EXPECTED_FREQUENCY, (float)(k * 4 + 2) * TS_EXPECTED_FREQUENCY, (float)(k * 4 + 1) * TS_EXPECTED_FREQUENCY, (float)(k * 4) * TS_EXPECTED_FREQUENCY);
+                    freq = _mm_add_ps(freq, overlap_phase);
+
+                    // accumulate into the running phase of the bin
+                    __m128 phase = sum_phase[k];
+                    phase = _mm_add_ps(phase, freq);
+                    sum_phase[k] = phase;
+
+                    __m128 c, s;
+                    _mm_sincos_ps(phase, &s, &c);
+                    __m128 real = _mm_mul_ps(mag, c);
+                    __m128 imag = _mm_mul_ps(mag, s);
+
+                    fft_data[k] = real;
+                    fft_data[(TS_PITCH_FRAME_SIZE / 4) + k] = imag;
+                }
+            }
+
+            // NOTE(review): in this split layout fft_data + TS_PITCH_FRAME_SIZE
+            // is the imaginary half, so this loop clears imaginary bins
+            // 2 .. FRAME_SIZE - 3 and leaves the upper real bins untouched.
+            // It looks like a carry-over from an interleaved real / imaginary
+            // layout, where the same bounds would clear the bins above
+            // FRAME_SIZE / 2. Left as is because changing it alters the audible
+            // output and gain; confirm against the reference algorithm.
+            for (int k = TS_PITCH_FRAME_SIZE + 2; k < 2 * TS_PITCH_FRAME_SIZE - 2; ++k)
+                pf->fft_data[k] = 0;
+
+            tsFFT(pf->fft_data, pf->fft_data + TS_PITCH_FRAME_SIZE, TS_PITCH_FRAME_SIZE, -1);
+
+            // window the real part again and overlap-add it into the accumulator
+            {
+                __m128* fft_data = (__m128*)pf->fft_data;
+                __m128* window_accumulator = (__m128*)pf->window_accumulator;
+
+                for (int k = 0; k < TS_PITCH_FRAME_SIZE / 4; ++k)
+                {
+                    __m128 von_hann = tsVonHann4(k);
+                    __m128 fft_data_segment = fft_data[k];
+                    __m128 accumulator_segment = window_accumulator[k];
+                    fft_data_segment = _mm_mul_ps(von_hann, fft_data_segment);
+                    fft_data_segment = _mm_div_ps(fft_data_segment, synthesis_divisor);
+                    accumulator_segment = _mm_add_ps(accumulator_segment, fft_data_segment);
+                    window_accumulator[k] = accumulator_segment;
+                }
+            }
+
+            // hand the finished step to the output FIFO, then slide the
+            // accumulator and the input FIFO forward by one step
+            memcpy(pf->out_FIFO, pf->window_accumulator, TS_STEPSIZE * sizeof(float));
+            memmove(pf->window_accumulator, pf->window_accumulator + TS_STEPSIZE, TS_PITCH_FRAME_SIZE * sizeof(float));
+            memmove(pf->in_FIFO, pf->in_FIFO + TS_STEPSIZE, TS_OVERLAP * sizeof(float));
+        }
+    }
+}
+
+/*
+zlib license:
+
+Copyright (c) 2017 Randy Gaul http://www.randygaul.net
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from
+the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not
+be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#endif