1 files changed, 0 insertions, 2560 deletions
diff --git a/src/libs/tiny/tinysound.h b/src/libs/tiny/tinysound.h
deleted file mode 100644
index 41d547d..0000000
--- a/src/libs/tiny/tinysound.h
+++ /dev/null
@@ -1,2560 +0,0 @@
-/*
-tinysound.h - v1.07
-
-Summary:
-tinysound is a C API for loading, playing, looping, panning and fading mono
-and stero sounds. This means tinysound imparts no external DLLs or large
-libraries that adversely effect shipping size. tinysound can also run on
-Windows XP since DirectSound ships with all recent versions of Windows.
-tinysound implements a custom SSE2 mixer by explicitly locking and unlocking
-portions of an internal. tinysound uses CoreAudio for Apple machines (like
-OSX and iOS). SDL is used for all other platforms. Define TS_FORCE_SDL
-before placaing the TS_IMPLEMENTATION in order to force the use of SDL.
-
-Revision history:
-1.0  (06/04/2016) initial release
-1.01 (06/06/2016) load WAV from memory
-separate portable and OS-specific code in tsMix
-fixed bug causing audio glitches when sounds ended
-added stb_vorbis loaders + demo example
-1.02 (06/08/2016) error checking + strings in vorbis loaders
-SSE2 implementation of mixer
-fix typos on docs/comments
-corrected volume bug introduced in 1.01
-1.03 (07/05/2016) size calculation helper (to know size of sound in
-bytes on the heap) tsSoundSize
-1.04 (12/06/2016) merged in Aaron Balint's contributions
-SFFT and pitch functions from Stephan M. Bernsee
-tsMix can run on its own thread with tsSpawnMixThread
-updated documentation, typo fixes
-fixed typo in malloc16 that caused heap corruption
-1.05 (12/08/2016) tsStopAllSounds, suggested by Aaron Balint
-1.06 (02/17/2017) port to CoreAudio for Apple machines
-1.07 (06/18/2017) SIMD the pitch shift code; swapped out old Bernsee
-code for a new re-write, updated docs as necessary,
-support for compiling as .c and .cpp on Windows,
-port for SDL (for Linux, or any other platform).
-Special thanks to DexP of github for 90% of the work
-on the SDL port!
-*/
-
-/*
-Contributors:
-Aaron Balint      1.04 - real time pitch
-1.04 - separate thread for tsMix
-1.04 - bugfix, removed extra free16 call for second channel
-DeXP              1.07 - initial work on SDL port
-*/
-
-/*
-To create implementation (the function definitions)
-#define TS_IMPLEMENTATION
-in *one* C/CPP file (translation unit) that includes this file
-
-DOCUMENTATION (very quick intro):
-1. create context
-2. load sounds from disk into memory
-3. play sounds
-4. free context
-
-1. tsContext* ctx = tsMakeContext( hwnd, frequency, latency, seconds, N );
-2. tsPlaySoundDef def = tsMakeDef( &tsLoadWAV( "path_to_file/filename.wav" ) );
-3. tsPlaySound( ctx, def );
-4. tsShutdownContext( ctx );
-
-DOCUMENTATION (longer introduction):
-tinysound consists of tsLoadedSounds, tsPlayingSounds and the tsContext.
-The tsContext encapsulates an OS sound API, as well as buffers + settings.
-tsLoadedSound holds raw samples of a sound. tsPlayingSound is an instance
-of a tsLoadedSound that represents a sound that can be played through the
-tsContext.
-
-There are two main versions of the API, the low-level and the high-level
-API. The low-level API does not manage any memory for tsPlayingSounds. The
-high level api holds a memory pool of playing sounds.
-
-High-level API:
-First create a context and pass in non-zero to the final parameter. This
-final parameter controls how large of a memory pool to use for tsPlayingSounds.
-Here's an example where N is the size of the internal pool:
-
-tsContext* ctx = tsMakeContext( hwnd, frequency, latency, seconds, N );
-
-We create tsPlayingSounds indirectly with tsPlayDef structs. tsPlayDef is a
-POD struct so feel free to make them straight on the stack. The tsPlayDef
-sets up initialization parameters. Here's an example to load a wav and
-play it:
-
-tsLoadedSound loaded = tsLoadWAV( "path_to_file/filename.wav" );
-tsPlaySoundDef def = tsMakeDef( &loaded );
-tsPlayingSound* sound = tsPlaySound( ctx, def );
-
-The same def can be used to play as many sounds as desired (even simultaneously)
-as long as the context playing sound pool is large enough.
-
-Low-level API:
-First create a context and pass 0 in the final parameter (0 here means
-the context will *not* allocate a tsPlayingSound memory pool):
-
-tsContext* ctx = tsMakeContext( hwnd, frequency, latency, seconds, 0 );
-
-parameters:
-hwnd           --  HWND, handle to window (on OSX just pass in 0)
-frequency      --  int, represents Hz frequency rate in which samples are played
-latency        --  int, estimated latency in Hz from PlaySound call to speaker output
-seconds        --  int, number of second of samples internal buffers can hold
-0 (last param) --  int, number of elements in tsPlayingSound pool
-
-We create a tsPlayingSound like so:
-tsLoadedSound loaded = tsLoadWAV( "path_to_file/filename.wav" );
-tsPlayingSound playing_sound = tsMakePlayingSound( &loaded );
-
-Then to play the sound we do:
-tsInsertSound( ctx, &playing_sound );
-
-The above tsInsertSound function call will place playing_sound into
-a singly-linked list inside the context. The context will remove
-the sound from its internal list when it finishes playing.
-
-WARNING: The high-level API cannot be mixed with the low-level API. If you
-try then the internal code will assert and crash. Pick one and stick with it.
-Usually he high-level API will be used, but if someone is *really* picky about
-their memory usage, or wants more control, the low-level API can be used.
-
-Here is the Low-Level API:
-tsPlayingSound tsMakePlayingSound( tsLoadedSound* loaded );
-void tsInsertSound( tsContext* ctx, tsPlayingSound* sound );
-
-Here is the High-Level API:
-tsPlayingSound* tsPlaySound( tsContext* ctx, tsPlaySoundDef def );
-tsPlaySoundDef tsMakeDef( tsLoadedSound* sound );
-void tsStopAllSounds( tsContext( ctx );
-
-Be sure to link against dsound.dll (or dsound.lib) on Windows.
-
-Read the rest of the header for specific details on all available functions
-and struct types.
-*/
-
-/*
-Known Limitations:
-
-* PCM mono/stereo format is the only formats the LoadWAV function supports. I don't
-guarantee it will work for all kinds of wav files, but it certainly does for the common
-kind (and can be changed fairly easily if someone wanted to extend it).
-* Only supports 16 bits per sample.
-* Mixer does not do any fancy clipping. The algorithm is to convert all 16 bit samples
-to float, mix all samples, and write back to audio API as 16 bit integers. In
-practice this works very well and clipping is not often a big problem.
-* I'm not super familiar with good ways to avoid the DirectSound play cursor from going
-past the write cursor. To mitigate this pass in a larger number to tsMakeContext's 4th
-parameter (buffer scale in seconds).
-* Pitch shifting code is pretty darn expensive. This is due to the use of a Fast Fourier Transform
-routine. The pitch shifting itself is written in rather efficient SIMD using SSE2 intrinsics,
-but the FFT routine is very basic. FFT is a big bottleneck for pitch shifting. There is a
-TODO optimization listed in this file for the FFT routine, but it's fairly low priority;
-optimizing FFT routines is difficult and requires a lot of specialized knowledge.
-*/
-
-/*
-FAQ
-Q : Why DirectSound instead of (insert API here) on Windows?
-A : Casey Muratori documented DS on Handmade Hero, other APIs do not have such good docs. DS has
-shipped on Windows XP all the way through Windows 10 -- using this header effectively intro-
-duces zero dependencies for the foreseeable future. The DS API itself is sane enough to quickly
-implement needed features, and users won't hear the difference between various APIs. Latency is
-not that great with DS but it is shippable. Additionally, many other APIs will in the end speak
-to Windows through the DS API.
-
-Q : Why not include Linux support?
-A : There have been a couple requests for ALSA support on Linux. For now the only option is to use
-SDL backend, which can indirectly support ALSA. SDL is used only in a very low-level manner;
-to get sound samples to the sound card via callback, so there shouldn't be much in the way of
-considering SDL a good option for "name your flavor" of Linux backend.
-
-Q : I would like to use my own memory management, how can I achieve this?
-A : This header makes a couple uses of malloc/free, and malloc16/free16. Simply find these bits
-and replace them with your own memory allocation routines. They can be wrapped up into a macro,
-or call your own functions directly -- it's up to you. Generally these functions allocate fairly
-large chunks of memory, and not very often (if at all), with one exception: tsSetPitch is a very
-expensive routine and requires frequent dynamic memory management.
-*/
-
-/*
-Some past discussion threads:
-https://www.reddit.com/r/gamedev/comments/6i39j2/tinysound_the_cutest_library_to_get_audio_into/
-https://www.reddit.com/r/gamedev/comments/4ml6l9/tinysound_singlefile_c_audio_library/
-https://forums.tigsource.com/index.php?topic=58706.0
-*/
-
-#if !defined( TINYSOUND_H )
-
-#define TS_WINDOWS    1
-#define TS_MAC        2
-#define TS_UNIX        3
-#define TS_SDL        4
-
-#if defined( _WIN32 )
-#define TS_PLATFORM TS_WINDOWS
-#elif defined( __APPLE__ )
-#define TS_PLATFORM TS_MAC
-#else
-#define TS_PLATFORM TS_SDL
-
-// please note TS_UNIX is not directly support
-// instead, unix-style OSes are encouraged to use SDL
-// see: https://www.libsdl.org/
-
-#endif
-
-// Use TS_FORCE_SDL to override the above macros and use
-// the SDL port.
-#ifdef TS_FORCE_SDL
-
-#undef TS_PLATFORM
-#define TS_PLATFORM TS_SDL
-
-#endif
-
-#include <stdint.h>
-
-// read this in the event of tsLoadWAV/tsLoadOGG errors
-// also read this in the event of certain errors from tsMakeContext
-extern const char* g_tsErrorReason;
-
-// stores a loaded sound in memory
-typedef struct
-{
-    int sample_count;
-    int channel_count;
-    void* channels[2];
-} tsLoadedSound;
-
-struct tsPitchData;
-typedef struct tsPitchData tsPitchData;
-
-// represents an instance of a tsLoadedSound, can be played through the tsContext
-typedef struct tsPlayingSound
-{
-    int active;
-    int paused;
-    int looped;
-    float volume0;
-    float volume1;
-    float pan0;
-    float pan1;
-    float pitch;
-    tsPitchData* pitch_filter[2];
-    int sample_index;
-    tsLoadedSound* loaded_sound;
-    struct tsPlayingSound* next;
-} tsPlayingSound;
-
-// holds audio API info and other info
-struct tsContext;
-typedef struct tsContext tsContext;
-
-// The returned struct will contain a null pointer in tsLoadedSound::channel[ 0 ]
-// in the case of errors. Read g_tsErrorReason string for details on what happened.
-// Calls tsReadMemWAV internally.
-tsLoadedSound tsLoadWAV(const char* path);
-
-// Reads a WAV file from memory. Still allocates memory for the tsLoadedSound since
-// WAV format will interlace stereo, and we need separate data streams to do SIMD
-// properly.
-void tsReadMemWAV(const void* memory, tsLoadedSound* sound);
-
-// If stb_vorbis was included *before* tinysound go ahead and create
-// some functions for dealing with OGG files.
-#ifdef STB_VORBIS_INCLUDE_STB_VORBIS_H
-void tsReadMemOGG(const void* memory, int length, int* sample_rate, tsLoadedSound* sound);
-tsLoadedSound tsLoadOGG(const char* path, int* sample_rate);
-#endif
-
-// Uses free16 (aligned free, implemented later in this file) to free up both of
-// the channels stored within sound
-void tsFreeSound(tsLoadedSound* sound);
-
-// Returns the size, in bytes, of all heap-allocated memory for this particular
-// loaded sound
-int tsSoundSize(tsLoadedSound* sound);
-
-// playing_pool_count -- 0 to setup low-level API, non-zero to size the internal
-// memory pool for tsPlayingSound instances
-tsContext* tsMakeContext(void* hwnd, unsigned play_frequency_in_Hz, int latency_factor_in_Hz, int num_buffered_seconds, int playing_pool_count);
-void tsShutdownContext(tsContext* ctx);
-
-// Call tsSpawnMixThread once to setup a separate thread for the context to run
-// upon. The separate thread will continually call tsMix and perform mixing
-// operations.
-void tsSpawnMixThread(tsContext* ctx);
-
-// Use tsThreadSleepDelay to specify a custom sleep delay time.
-// A sleep will occur after each call to tsMix. By default YieldProcessor
-// is used, and no sleep occurs. Use a sleep delay to conserve CPU bandwidth.
-// A recommended sleep time is a little less than 1/2 your predicted 1/FPS.
-// 60 fps is 16 ms, so about 1-5 should work well in most cases.
-void tsThreadSleepDelay(tsContext* ctx, int milliseconds);
-
-// Call this manually, once per game tick recommended, if you haven't ever
-// called tsSpawnMixThread. Otherwise the thread will call tsMix itself.
-// num_samples_to_write is not used on Windows. On Mac it is used to push
-// samples into a circular buffer while CoreAudio simultaneously pulls samples
-// off of the buffer. num_samples_to_write should be computed each update tick
-// as delta_time * play_frequency_in_Hz + 1.
-void tsMix(tsContext* ctx);
-
-// All of the functions in this next section should only be called if tsIsActive
-// returns true. Calling them otherwise probably won't do anything bad, but it
-// won't do anything at all. If a sound is active it resides in the context's
-// internal list of playing sounds.
-int tsIsActive(tsPlayingSound* sound);
-
-// Flags sound for removal. Upon next tsMix call will remove sound from playing
-// list. If high-level API used sound is placed onto the internal free list.
-void tsStopSound(tsPlayingSound* sound);
-
-void tsLoopSound(tsPlayingSound* sound, int zero_for_no_loop);
-void tsPauseSound(tsPlayingSound* sound, int one_for_paused);
-
-// lerp from 0 to 1, 0 full left, 1 full right
-void tsSetPan(tsPlayingSound* sound, float pan);
-
-// explicitly set volume of each channel. Can be used as panning (but it's
-// recommended to use the tsSetPan function for panning).
-void tsSetVolume(tsPlayingSound* sound, float volume_left, float volume_right);
-
-// Change pitch (not duration) of sound. pitch = 0.5f for one octave lower, pitch = 2.0f for one octave higher.
-// pitch at 1.0f applies no change. pitch settings farther away from 1.0f create more distortion and lower
-// the output sample quality. pitch can be adjusted in real-time for doppler effects and the like. Going beyond
-// 0.5f and 2.0f may require some tweaking the pitch shifting parameters, and is not recommended.
-
-// Additional important information about performance: This function
-// is quite expensive -- you have been warned! Try it out and be aware of how much CPU consumption it uses.
-// To avoid destroying the originally loaded sound samples, tsSetPitch will do a one-time allocation to copy
-// sound samples into a new buffer. The new buffer contains the pitch adjusted samples, and these will be played
-// through tsMix. This lets the pitch be modulated at run-time, but requires dynamically allocated memory. The
-// memory is freed once the sound finishes playing. If a one-time pitch adjustment is desired, for performance
-// reasons please consider doing an off-line pitch adjustment manually as a pre-processing step for your sounds.
-// Also, consider changing malloc16 and free16 to match your custom memory allocation needs. Try adjusting
-// TS_PITCH_QUALITY (must be a power of two) and see how this affects your performance.
-void tsSetPitch(tsPlayingSound* sound, float pitch);
-
-// Delays sound before actually playing it. Requires context to be passed in
-// since there's a conversion from seconds to samples per second.
-// If one were so inclined another version could be implemented like:
-// void tsSetDelay( tsPlayingSound* sound, float delay, int samples_per_second )
-void tsSetDelay(tsContext* ctx, tsPlayingSound* sound, float delay_in_seconds);
-
-// Portable sleep function
-void tsSleep(int milliseconds);
-
-// LOW-LEVEL API
-tsPlayingSound tsMakePlayingSound(tsLoadedSound* loaded);
-void tsInsertSound(tsContext* ctx, tsPlayingSound* sound);
-
-// HIGH-LEVEL API
-typedef struct
-{
-    int paused;
-    int looped;
-    float volume_left;
-    float volume_right;
-    float pan;
-    float pitch;
-    float delay;
-    tsLoadedSound* loaded;
-} tsPlaySoundDef;
-
-tsPlayingSound* tsPlaySound(tsContext* ctx, tsPlaySoundDef def);
-tsPlaySoundDef tsMakeDef(tsLoadedSound* sound);
-void tsStopAllSounds(tsContext* ctx);
-
-#define TINYSOUND_H
-#endif
-
-#ifdef TS_IMPLEMENTATION
-
-#define _CRT_SECURE_NO_WARNINGS FUCK_YOU
-#include <stdlib.h>    // malloc, free
-#include <stdio.h>    // fopen, fclose
-#include <string.h>    // memcmp, memset, memcpy
-#include <xmmintrin.h>
-#include <emmintrin.h>
-
-#if TS_PLATFORM == TS_WINDOWS
-
-#include <dsound.h>
-#undef PlaySound
-
-#if defined( _MSC_VER )
-#pragma comment( lib, "dsound.lib" )
-#endif
-
-#elif TS_PLATFORM == TS_MAC
-
-#include <CoreAudio/CoreAudio.h>
-#include <AudioUnit/AudioUnit.h>
-#include <pthread.h>
-#include <mach/mach_time.h>
-
-#else
-
-#include "SDL2/SDL.h"
-
-#endif
-
-#define TS_CHECK( X, Y ) do { if ( !(X) ) { g_tsErrorReason = Y; goto ts_err; } } while ( 0 )
-#if TS_PLATFORM == TS_MAC && defined( __clang__ )
-#define TS_ASSERT_INTERNAL __builtin_trap( )
-#else
-#define TS_ASSERT_INTERNAL *(int*)0 = 0
-#endif
-#define TS_ASSERT( X ) do { if ( !(X) ) TS_ASSERT_INTERNAL; } while ( 0 )
-#define TS_ALIGN( X, Y ) ((((size_t)X) + ((Y) - 1)) & ~((Y) - 1))
-#define TS_TRUNC( X, Y ) ((size_t)(X) & ~((Y) - 1))
-
-const char* g_tsErrorReason;
-
-static void* tsReadFileToMemory(const char* path, int* size)
-{
-    void* data = 0;
-    FILE* fp = fopen(path, "rb");
-    int sizeNum = 0;
-
-    if (fp)
-    {
-        fseek(fp, 0, SEEK_END);
-        sizeNum = (int)ftell(fp);
-        fseek(fp, 0, SEEK_SET);
-        data = malloc(sizeNum);
-        fread(data, sizeNum, 1, fp);
-        fclose(fp);
-    }
-
-    if (size) *size = sizeNum;
-    return data;
-}
-
-static int tsFourCC(const char* CC, void* memory)
-{
-    if (!memcmp(CC, memory, 4)) return 1;
-    return 0;
-}
-
-static char* tsNext(char* data)
-{
-    uint32_t size = *(uint32_t*)(data + 4);
-    size = (size + 1) & ~1;
-    return data + 8 + size;
-}
-
-static void* malloc16(size_t size)
-{
-    void* p = malloc(size + 16);
-    if (!p) return 0;
-    unsigned char offset = (size_t)p & 15;
-    p = (void*)TS_ALIGN(p + 1, 16);
-    *((char*)p - 1) = 16 - offset;
-    TS_ASSERT(!((size_t)p & 15));
-    return p;
-}
-
-static void free16(void* p)
-{
-    if (!p) return;
-    free((char*)p - (size_t)*((char*)p - 1));
-}
-
-static void tsLastElement(__m128* a, int i, int j, int16_t* samples, int offset)
-{
-    switch (offset)
-    {
-    case 1:
-        a[i] = _mm_set_ps(samples[j], 0.0f, 0.0f, 0.0f);
-        break;
-
-    case 2:
-        a[i] = _mm_set_ps(samples[j], samples[j + 1], 0.0f, 0.0f);
-        break;
-
-    case 3:
-        a[i] = _mm_set_ps(samples[j], samples[j + 1], samples[j + 2], 0.0f);
-        break;
-
-    case 0:
-        a[i] = _mm_set_ps(samples[j], samples[j + 1], samples[j + 2], samples[j + 3]);
-        break;
-    }
-}
-
-void tsReadMemWAV(const void* memory, tsLoadedSound* sound)
-{
-#pragma pack( push, 1 )
-    typedef struct
-    {
-        uint16_t wFormatTag;
-        uint16_t nChannels;
-        uint32_t nSamplesPerSec;
-        uint32_t nAvgBytesPerSec;
-        uint16_t nBlockAlign;
-        uint16_t wBitsPerSample;
-        uint16_t cbSize;
-        uint16_t wValidBitsPerSample;
-        uint32_t dwChannelMask;
-        uint8_t SubFormat[18];
-    } Fmt;
-#pragma pack( pop )
-
-    char* data = (char*)memory;
-    TS_CHECK(data, "Unable to read input file (file doesn't exist, or could not allocate heap memory.");
-    TS_CHECK(tsFourCC("RIFF", data), "Incorrect file header; is this a WAV file?");
-    TS_CHECK(tsFourCC("WAVE", data + 8), "Incorrect file header; is this a WAV file?");
-
-    data += 12;
-
-    TS_CHECK(tsFourCC("fmt ", data), "fmt chunk not found.");
-    Fmt fmt;
-    fmt = *(Fmt*)(data + 8);
-    TS_CHECK(fmt.wFormatTag == 1, "Only PCM WAV files are supported.");
-    TS_CHECK(fmt.nChannels == 1 || fmt.nChannels == 2, "Only mono or stereo supported (too many channels detected).");
-    TS_CHECK(fmt.wBitsPerSample == 16, "Only 16 bits per sample supported.");
-    TS_CHECK(fmt.nBlockAlign == fmt.nChannels * 2, "implementation error");
-
-    data = tsNext(data);
-    TS_CHECK(tsFourCC("data", data), "data chunk not found.");
-    int sample_size = *((uint32_t*)(data + 4));
-    int sample_count = sample_size / (fmt.nChannels * sizeof(uint16_t));
-    sound->sample_count = sample_count;
-    sound->channel_count = fmt.nChannels;
-
-    int wide_count = (int)TS_ALIGN(sample_count, 4);
-    wide_count /= 4;
-    int wide_offset = sample_count & 3;
-    int16_t* samples = (int16_t*)(data + 8);
-    float* sample = (float*)alloca(sizeof(float) * 4 + 16);
-    sample = (float*)TS_ALIGN(sample, 16);
-
-    switch (sound->channel_count)
-    {
-    case 1:
-    {
-        sound->channels[0] = malloc16(wide_count * sizeof(__m128));
-        sound->channels[1] = 0;
-        __m128* a = (__m128*)sound->channels[0];
-
-        for (int i = 0, j = 0; i < wide_count - 1; ++i, j += 4)
-        {
-            sample[0] = (float)samples[j];
-            sample[1] = (float)samples[j + 1];
-            sample[2] = (float)samples[j + 2];
-            sample[3] = (float)samples[j + 3];
-            a[i] = _mm_load_ps(sample);
-        }
-
-        tsLastElement(a, wide_count - 1, (wide_count - 1) * 4, samples, wide_offset);
-    }    break;
-
-    case 2:
-    {
-        __m128* a = (__m128*)malloc16(wide_count * sizeof(__m128) * 2);
-        __m128* b = a + wide_count;
-
-        for (int i = 0, j = 0; i < wide_count - 1; ++i, j += 8)
-        {
-            sample[0] = (float)samples[j];
-            sample[1] = (float)samples[j + 2];
-            sample[2] = (float)samples[j + 4];
-            sample[3] = (float)samples[j + 6];
-            a[i] = _mm_load_ps(sample);
-
-            sample[0] = (float)samples[j + 1];
-            sample[1] = (float)samples[j + 3];
-            sample[2] = (float)samples[j + 5];
-            sample[3] = (float)samples[j + 7];
-            b[i] = _mm_load_ps(sample);
-        }
-
-        tsLastElement(a, wide_count - 1, (wide_count - 1) * 4, samples, wide_offset);
-        tsLastElement(b, wide_count - 1, (wide_count - 1) * 4 + 4, samples, wide_offset);
-        sound->channels[0] = a;
-        sound->channels[1] = b;
-    }    break;
-
-    default:
-        TS_CHECK(0, "unsupported channel count (only support mono and stereo).");
-    }
-
-    return;
-
-ts_err:
-    memset(&sound, 0, sizeof(sound));
-}
-
-tsLoadedSound tsLoadWAV(const char* path)
-{
-    tsLoadedSound sound = { 0 };
-    char* wav = (char*)tsReadFileToMemory(path, 0);
-    tsReadMemWAV(wav, &sound);
-    free(wav);
-    return sound;
-}
-
-// If stb_vorbis was included *before* tinysound go ahead and create
-// some functions for dealing with OGG files.
-#ifdef STB_VORBIS_INCLUDE_STB_VORBIS_H
-void tsReadMemOGG(const void* memory, int length, int* sample_rate, tsLoadedSound* sound)
-{
-    int16_t* samples = 0;
-    int channel_count;
-    int sample_count = stb_vorbis_decode_memory((const unsigned char*)memory, length, &channel_count, sample_rate, &samples);
-
-    TS_CHECK(sample_count > 0, "stb_vorbis_decode_memory failed. Make sure your file exists and is a valid OGG file.");
-
-    int wide_count = (int)TS_ALIGN(sample_count, 4) / 4;
-    int wide_offset = sample_count & 3;
-    float* sample = (float*)alloca(sizeof(float) * 4 + 16);
-    sample = (float*)TS_ALIGN(sample, 16);
-    __m128* a;
-    __m128* b;
-
-    switch (channel_count)
-    {
-    case 1:
-    {
-        a = (__m128*)malloc16(wide_count * sizeof(__m128));
-        b = 0;
-
-        for (int i = 0, j = 0; i < wide_count - 1; ++i, j += 4)
-        {
-            sample[0] = (float)samples[j];
-            sample[1] = (float)samples[j + 1];
-            sample[2] = (float)samples[j + 2];
-            sample[3] = (float)samples[j + 3];
-            a[i] = _mm_load_ps(sample);
-        }
-
-        tsLastElement(a, wide_count - 1, (wide_count - 1) * 4, samples, wide_offset);
-    }    break;
-
-    case 2:
-        a = (__m128*)malloc16(wide_count * sizeof(__m128) * 2);
-        b = a + wide_count;
-
-        for (int i = 0, j = 0; i < wide_count - 1; ++i, j += 8)
-        {
-            sample[0] = (float)samples[j];
-            sample[1] = (float)samples[j + 2];
-            sample[2] = (float)samples[j + 4];
-            sample[3] = (float)samples[j + 6];
-            a[i] = _mm_load_ps(sample);
-
-            sample[0] = (float)samples[j + 1];
-            sample[1] = (float)samples[j + 3];
-            sample[2] = (float)samples[j + 5];
-            sample[3] = (float)samples[j + 7];
-            b[i] = _mm_load_ps(sample);
-        }
-
-        tsLastElement(a, wide_count - 1, (wide_count - 1) * 4, samples, wide_offset);
-        tsLastElement(b, wide_count - 1, (wide_count - 1) * 4 + 4, samples, wide_offset);
-        break;
-
-    default:
-        TS_CHECK(0, "Unsupported channel count.");
-    }
-
-    sound->sample_count = sample_count;
-    sound->channel_count = channel_count;
-    sound->channels[0] = a;
-    sound->channels[1] = b;
-    free(samples);
-    return;
-
-ts_err:
-    free(samples);
-    memset(sound, 0, sizeof(tsLoadedSound));
-}
-
-tsLoadedSound tsLoadOGG(const char* path, int* sample_rate)
-{
-    int length;
-    void* memory = tsReadFileToMemory(path, &length);
-    tsLoadedSound sound;
-    tsReadMemOGG(memory, length, sample_rate, &sound);
-    free(memory);
-
-    return sound;
-}
-#endif
-
-void tsFreeSound(tsLoadedSound* sound)
-{
-    free16(sound->channels[0]);
-    memset(sound, 0, sizeof(tsLoadedSound));
-}
-
-int tsSoundSize(tsLoadedSound* sound)
-{
-    return sound->sample_count * sound->channel_count * sizeof(uint16_t);
-}
-
-tsPlayingSound tsMakePlayingSound(tsLoadedSound* loaded)
-{
-    tsPlayingSound playing;
-    playing.active = 0;
-    playing.paused = 0;
-    playing.looped = 0;
-    playing.volume0 = 1.0f;
-    playing.volume1 = 1.0f;
-    playing.pan0 = 0.5f;
-    playing.pan1 = 0.5f;
-    playing.pitch = 1.0f;
-    playing.pitch_filter[0] = 0;
-    playing.pitch_filter[1] = 0;
-    playing.sample_index = 0;
-    playing.loaded_sound = loaded;
-    playing.next = 0;
-    return playing;
-}
-
-int tsIsActive(tsPlayingSound* sound)
-{
-    return sound->active;
-}
-
-void tsStopSound(tsPlayingSound* sound)
-{
-    sound->active = 0;
-}
-
-void tsLoopSound(tsPlayingSound* sound, int zero_for_no_loop)
-{
-    sound->looped = zero_for_no_loop;
-}
-
-void tsPauseSound(tsPlayingSound* sound, int one_for_paused)
-{
-    sound->paused = one_for_paused;
-}
-
-void tsSetPan(tsPlayingSound* sound, float pan)
-{
-    if (pan > 1.0f) pan = 1.0f;
-    else if (pan < 0.0f) pan = 0.0f;
-    float left = 1.0f - pan;
-    float right = pan;
-    sound->pan0 = left;
-    sound->pan1 = right;
-}
-
-void tsSetPitch(tsPlayingSound* sound, float pitch)
-{
-    sound->pitch = pitch;
-}
-
-void tsSetVolume(tsPlayingSound* sound, float volume_left, float volume_right)
-{
-    if (volume_left < 0.0f) volume_left = 0.0f;
-    if (volume_right < 0.0f) volume_right = 0.0f;
-    sound->volume0 = volume_left;
-    sound->volume1 = volume_right;
-}
-
-static void tsRemoveFilter(tsPlayingSound* playing);
-
-#if TS_PLATFORM == TS_WINDOWS
-
-void tsSleep(int milliseconds)
-{
-    Sleep(milliseconds);
-}
-
-struct tsContext
-{
-    unsigned latency_samples;
-    unsigned running_index;
-    int Hz;
-    int bps;
-    int buffer_size;
-    int wide_count;
-    tsPlayingSound* playing;
-    __m128* floatA;
-    __m128* floatB;
-    __m128i* samples;
-    tsPlayingSound* playing_pool;
-    tsPlayingSound* playing_free;
-
-    // platform specific stuff
-    LPDIRECTSOUND dsound;
-    LPDIRECTSOUNDBUFFER buffer;
-    LPDIRECTSOUNDBUFFER primary;
-
-    // data for tsMix thread, enable these with tsSpawnMixThread
-    CRITICAL_SECTION critical_section;
-    int separate_thread;
-    int running;
-    int sleep_milliseconds;
-};
-
-static void tsReleaseContext(tsContext* ctx)
-{
-    if (ctx->separate_thread)    DeleteCriticalSection(&ctx->critical_section);
-#ifdef __cplusplus
-    ctx->buffer->Release();
-    ctx->primary->Release();
-    ctx->dsound->Release();
-#else
-    ctx->buffer->lpVtbl->Release(ctx->buffer);
-    ctx->primary->lpVtbl->Release(ctx->primary);
-    ctx->dsound->lpVtbl->Release(ctx->dsound);
-#endif
-    tsPlayingSound* playing = ctx->playing;
-    while (playing)
-    {
-        tsRemoveFilter(playing);
-        playing = playing->next;
-    }
-    free(ctx);
-}
-
-static DWORD WINAPI tsCtxThread(LPVOID lpParameter)
-{
-    tsContext* ctx = (tsContext*)lpParameter;
-
-    while (ctx->running)
-    {
-        tsMix(ctx);
-        if (ctx->sleep_milliseconds) tsSleep(ctx->sleep_milliseconds);
-        else YieldProcessor();
-    }
-
-    ctx->separate_thread = 0;
-    return 0;
-}
-
-static void tsLock(tsContext* ctx)
-{
-    if (ctx->separate_thread) EnterCriticalSection(&ctx->critical_section);
-}
-
-static void tsUnlock(tsContext* ctx)
-{
-    if (ctx->separate_thread) LeaveCriticalSection(&ctx->critical_section);
-}
-
-tsContext* tsMakeContext(void* hwnd, unsigned play_frequency_in_Hz, int latency_factor_in_Hz, int num_buffered_seconds, int playing_pool_count)
-{
-    int bps = sizeof(INT16) * 2;
-    int buffer_size = play_frequency_in_Hz * bps * num_buffered_seconds;
-    tsContext* ctx = 0;
-    WAVEFORMATEX format = { 0 };
-    DSBUFFERDESC bufdesc = { 0 };
-    LPDIRECTSOUND dsound;
-
-    TS_CHECK(hwnd, "Invalid hwnd passed to tsMakeContext.");
-
-    HRESULT res = DirectSoundCreate(0, &dsound, 0);
-    TS_CHECK(res == DS_OK, "DirectSoundCreate failed");
-#ifdef __cplusplus
-    dsound->SetCooperativeLevel((HWND)hwnd, DSSCL_PRIORITY);
-#else
-    dsound->lpVtbl->SetCooperativeLevel(dsound, (HWND)hwnd, DSSCL_PRIORITY);
-#endif
-    bufdesc.dwSize = sizeof(bufdesc);
-    bufdesc.dwFlags = DSBCAPS_PRIMARYBUFFER;
-
-    LPDIRECTSOUNDBUFFER primary_buffer;
-#ifdef __cplusplus
-    res = dsound->CreateSoundBuffer(&bufdesc, &primary_buffer, 0);
-#else
-    res = dsound->lpVtbl->CreateSoundBuffer(dsound, &bufdesc, &primary_buffer, 0);
-#endif
-    TS_CHECK(res == DS_OK, "Failed to create primary sound buffer");
-
-    format.wFormatTag = WAVE_FORMAT_PCM;
-    format.nChannels = 2;
-    format.nSamplesPerSec = play_frequency_in_Hz;
-    format.wBitsPerSample = 16;
-    format.nBlockAlign = (format.nChannels * format.wBitsPerSample) / 8;
-    format.nAvgBytesPerSec = format.nSamplesPerSec * format.nBlockAlign;
-    format.cbSize = 0;
-#ifdef __cplusplus
-    res = primary_buffer->SetFormat(&format);
-#else
-    res = primary_buffer->lpVtbl->SetFormat(primary_buffer, &format);
-#endif
-    TS_CHECK(res == DS_OK, "Failed to set format on primary buffer");
-
-    LPDIRECTSOUNDBUFFER secondary_buffer;
-    bufdesc.dwSize = sizeof(bufdesc);
-    bufdesc.dwFlags = 0;
-    bufdesc.dwBufferBytes = buffer_size;
-    bufdesc.lpwfxFormat = &format;
-#ifdef __cplusplus
-    res = dsound->CreateSoundBuffer(&bufdesc, &secondary_buffer, 0);
-#else
-    res = dsound->lpVtbl->CreateSoundBuffer(dsound, &bufdesc, &secondary_buffer, 0);
-#endif
-    TS_CHECK(res == DS_OK, "Failed to set format on secondary buffer");
-
-    int sample_count = play_frequency_in_Hz * num_buffered_seconds;
-    int wide_count = (int)TS_ALIGN(sample_count, 4);
-    int pool_size = playing_pool_count * sizeof(tsPlayingSound);
-    int mix_buffers_size = sizeof(__m128) * wide_count * 2;
-    int sample_buffer_size = sizeof(__m128i) * wide_count;
-    ctx = (tsContext*)malloc(sizeof(tsContext) + mix_buffers_size + sample_buffer_size + 16 + pool_size);
-    ctx->latency_samples = (unsigned)TS_ALIGN(play_frequency_in_Hz / latency_factor_in_Hz, 4);
-    ctx->running_index = 0;
-    ctx->Hz = play_frequency_in_Hz;
-    ctx->bps = bps;
-    ctx->buffer_size = buffer_size;
-    ctx->wide_count = wide_count;
-    ctx->dsound = dsound;
-    ctx->buffer = secondary_buffer;
-    ctx->primary = primary_buffer;
-    ctx->playing = 0;
-    ctx->floatA = (__m128*)(ctx + 1);
-    ctx->floatA = (__m128*)TS_ALIGN(ctx->floatA, 16);
-    TS_ASSERT(!((size_t)ctx->floatA & 15));
-    ctx->floatB = ctx->floatA + wide_count;
-    ctx->samples = (__m128i*)ctx->floatB + wide_count;
-    ctx->running = 1;
-    ctx->separate_thread = 0;
-    ctx->sleep_milliseconds = 0;
-
-    if (playing_pool_count)
-    {
-        ctx->playing_pool = (tsPlayingSound*)(ctx->samples + wide_count);
-        for (int i = 0; i < playing_pool_count - 1; ++i)
-            ctx->playing_pool[i].next = ctx->playing_pool + i + 1;
-        ctx->playing_pool[playing_pool_count - 1].next = 0;
-        ctx->playing_free = ctx->playing_pool;
-    }
-
-    else
-    {
-        ctx->playing_pool = 0;
-        ctx->playing_free = 0;
-    }
-
-    return ctx;
-
-ts_err:
-    free(ctx);
-    return 0;
-}
-
-void tsSpawnMixThread(tsContext* ctx)
-{
-    if (ctx->separate_thread) return;
-    InitializeCriticalSectionAndSpinCount(&ctx->critical_section, 0x00000400);
-    ctx->separate_thread = 1;
-    CreateThread(0, 0, tsCtxThread, ctx, 0, 0);
-}
-
-#elif TS_PLATFORM == TS_MAC
-
-void tsSleep(int milliseconds)
-{
-    usleep(milliseconds * 1000);
-}
-
-struct tsContext
-{
-    unsigned latency_samples;
-    unsigned index0; // read
-    unsigned index1; // write
-    int Hz;
-    int bps;
-    int wide_count;
-    int sample_count;
-    tsPlayingSound* playing;
-    __m128* floatA;
-    __m128* floatB;
-    __m128i* samples;
-    tsPlayingSound* playing_pool;
-    tsPlayingSound* playing_free;
-
-    // platform specific stuff
-    AudioComponentInstance inst;
-
-    // data for tsMix thread, enable these with tsSpawnMixThread
-    pthread_t thread;
-    pthread_mutex_t mutex;
-    int separate_thread;
-    int running;
-    int sleep_milliseconds;
-};
-
-static void tsReleaseContext(tsContext* ctx)
-{
-    if (ctx->separate_thread)    pthread_mutex_destroy(&ctx->mutex);
-    AudioOutputUnitStop(ctx->inst);
-    AudioUnitUninitialize(ctx->inst);
-    AudioComponentInstanceDispose(ctx->inst);
-    tsPlayingSound* playing = ctx->playing;
-    while (playing)
-    {
-        tsRemoveFilter(playing);
-        playing = playing->next;
-    }
-    free(ctx);
-}
-
-static void* tsCtxThread(void* udata)
-{
-    tsContext* ctx = (tsContext*)udata;
-
-    while (ctx->running)
-    {
-        tsMix(ctx);
-        if (ctx->sleep_milliseconds) tsSleep(ctx->sleep_milliseconds);
-        else pthread_yield_np();
-    }
-
-    ctx->separate_thread = 0;
-    pthread_exit(0);
-    return 0;
-}
-
-static void tsLock(tsContext* ctx)
-{
-    if (ctx->separate_thread) pthread_mutex_lock(&ctx->mutex);
-}
-
-static void tsUnlock(tsContext* ctx)
-{
-    if (ctx->separate_thread) pthread_mutex_unlock(&ctx->mutex);
-}
-
-static OSStatus tsMemcpyToCA(void* udata, AudioUnitRenderActionFlags* ioActionFlags, const AudioTimeStamp* inTimeStamp, UInt32 inBusNumber, UInt32 inNumberFrames, AudioBufferList* ioData);
-
-tsContext* tsMakeContext(void* unused, unsigned play_frequency_in_Hz, int latency_factor_in_Hz, int num_buffered_seconds, int playing_pool_count)
-{
-    int bps = sizeof(uint16_t) * 2;
-
-    AudioComponentDescription comp_desc = { 0 };
-    comp_desc.componentType = kAudioUnitType_Output;
-    comp_desc.componentSubType = kAudioUnitSubType_DefaultOutput;
-    comp_desc.componentFlags = 0;
-    comp_desc.componentFlagsMask = 0;
-    comp_desc.componentManufacturer = kAudioUnitManufacturer_Apple;
-
-    AudioComponent comp = AudioComponentFindNext(NULL, &comp_desc);
-    if (!comp)
-    {
-        g_tsErrorReason = "Failed to create output unit from AudioComponentFindNext.";
-        return 0;
-    }
-
-    AudioStreamBasicDescription stream_desc = { 0 };
-    stream_desc.mSampleRate = (double)play_frequency_in_Hz;
-    stream_desc.mFormatID = kAudioFormatLinearPCM;
-    stream_desc.mFormatFlags = kAudioFormatFlagIsSignedInteger | kAudioFormatFlagsNativeEndian | kAudioFormatFlagIsPacked;
-    stream_desc.mFramesPerPacket = 1;
-    stream_desc.mChannelsPerFrame = 2;
-    stream_desc.mBitsPerChannel = sizeof(uint16_t) * 8;
-    stream_desc.mBytesPerPacket = bps;
-    stream_desc.mBytesPerFrame = bps;
-    stream_desc.mReserved = 0;
-
-    AudioComponentInstance inst;
-    OSStatus ret;
-    AURenderCallbackStruct input;
-
-    ret = AudioComponentInstanceNew(comp, &inst);
-
-    int sample_count = play_frequency_in_Hz * num_buffered_seconds;
-    int latency_count = (unsigned)TS_ALIGN(play_frequency_in_Hz / latency_factor_in_Hz, 4);
-    TS_ASSERT(sample_count > latency_count);
-    int wide_count = (int)TS_ALIGN(sample_count, 4) / 4;
-    int pool_size = playing_pool_count * sizeof(tsPlayingSound);
-    int mix_buffers_size = sizeof(__m128) * wide_count * 2;
-    int sample_buffer_size = sizeof(__m128i) * wide_count;
-    tsContext* ctx = (tsContext*)malloc(sizeof(tsContext) + mix_buffers_size + sample_buffer_size + 16 + pool_size);
-    TS_CHECK(ret == noErr, "AudioComponentInstanceNew failed");
-    ctx->latency_samples = latency_count;
-    ctx->index0 = 0;
-    ctx->index1 = 0;
-    ctx->Hz = play_frequency_in_Hz;
-    ctx->bps = bps;
-    ctx->wide_count = wide_count;
-    ctx->sample_count = wide_count * 4;
-    ctx->inst = inst;
-    ctx->playing = 0;
-    ctx->floatA = (__m128*)(ctx + 1);
-    ctx->floatA = (__m128*)TS_ALIGN(ctx->floatA, 16);
-    TS_ASSERT(!((size_t)ctx->floatA & 15));
-    ctx->floatB = ctx->floatA + wide_count;
-    ctx->samples = (__m128i*)ctx->floatB + wide_count;
-    ctx->running = 1;
-    ctx->separate_thread = 0;
-    ctx->sleep_milliseconds = 0;
-
-    ret = AudioUnitSetProperty(inst, kAudioUnitProperty_StreamFormat, kAudioUnitScope_Input, 0, &stream_desc, sizeof(stream_desc));
-    TS_CHECK(ret == noErr, "Failed to set stream forat");
-
-    input.inputProc = tsMemcpyToCA;
-    input.inputProcRefCon = ctx;
-    ret = AudioUnitSetProperty(inst, kAudioUnitProperty_SetRenderCallback, kAudioUnitScope_Input, 0, &input, sizeof(input));
-    TS_CHECK(ret == noErr, "AudioUnitSetProperty failed");
-
-    ret = AudioUnitInitialize(inst);
-    TS_CHECK(ret == noErr, "Couldn't initialize output unit");
-
-    ret = AudioOutputUnitStart(inst);
-    TS_CHECK(ret == noErr, "Couldn't start output unit");
-
-    if (playing_pool_count)
-    {
-        ctx->playing_pool = (tsPlayingSound*)(ctx->samples + wide_count);
-        for (int i = 0; i < playing_pool_count - 1; ++i)
-            ctx->playing_pool[i].next = ctx->playing_pool + i + 1;
-        ctx->playing_pool[playing_pool_count - 1].next = 0;
-        ctx->playing_free = ctx->playing_pool;
-    }
-
-    else
-    {
-        ctx->playing_pool = 0;
-        ctx->playing_free = 0;
-    }
-
-    return ctx;
-
-ts_err:
-    free(ctx);
-    return 0;
-}
-
-void tsSpawnMixThread(tsContext* ctx)
-{
-    if (ctx->separate_thread) return;
-    pthread_mutex_init(&ctx->mutex, 0);
-    ctx->separate_thread = 1;
-    pthread_create(&ctx->thread, 0, tsCtxThread, ctx);
-}
-
-#else
-
-void tsSleep(int milliseconds)
-{
-    SDL_Delay(milliseconds);
-}
-
-struct tsContext
-{
-    unsigned latency_samples;
-    unsigned index0; // read
-    unsigned index1; // write
-    unsigned running_index;
-    int Hz;
-    int bps;
-    int buffer_size;
-    int wide_count;
-    int sample_count;
-    tsPlayingSound* playing;
-    __m128* floatA;
-    __m128* floatB;
-    __m128i* samples;
-    tsPlayingSound* playing_pool;
-    tsPlayingSound* playing_free;
-
-    // data for tsMix thread, enable these with tsSpawnMixThread
-    SDL_Thread* thread;
-    SDL_mutex* mutex;
-    int separate_thread;
-    int running;
-    int sleep_milliseconds;
-};
-
-static void tsReleaseContext(tsContext* ctx)
-{
-    if (ctx->separate_thread)    SDL_DestroyMutex(ctx->mutex);
-    tsPlayingSound* playing = ctx->playing;
-    while (playing)
-    {
-        tsRemoveFilter(playing);
-        playing = playing->next;
-    }
-    SDL_CloseAudio();
-    free(ctx);
-}
-
-int tsCtxThread(void* udata)
-{
-    tsContext* ctx = (tsContext*)udata;
-
-    while (ctx->running)
-    {
-        tsMix(ctx);
-        if (ctx->sleep_milliseconds) tsSleep(ctx->sleep_milliseconds);
-        else tsSleep(1);
-    }
-
-    ctx->separate_thread = 0;
-    return 0;
-}
-
-static void tsLock(tsContext* ctx)
-{
-    if (ctx->separate_thread) SDL_LockMutex(ctx->mutex);
-}
-
-static void tsUnlock(tsContext* ctx)
-{
-    if (ctx->separate_thread) SDL_UnlockMutex(ctx->mutex);
-}
-
-void tsSDL_AudioCallback(void* udata, Uint8* stream, int len);
-
-tsContext* tsMakeContext(void* unused, unsigned play_frequency_in_Hz, int latency_factor_in_Hz, int num_buffered_seconds, int playing_pool_count)
-{
-    (void)unused;
-    int bps = sizeof(uint16_t) * 2;
-    int sample_count = play_frequency_in_Hz * num_buffered_seconds;
-    int latency_count = (unsigned)TS_ALIGN(play_frequency_in_Hz / latency_factor_in_Hz, 4);
-    TS_ASSERT(sample_count > latency_count);
-    int wide_count = (int)TS_ALIGN(sample_count, 4) / 4;
-    int pool_size = playing_pool_count * sizeof(tsPlayingSound);
-    int mix_buffers_size = sizeof(__m128) * wide_count * 2;
-    int sample_buffer_size = sizeof(__m128i) * wide_count;
-    tsContext* ctx = 0;
-    SDL_AudioSpec wanted;
-    int ret = SDL_Init(SDL_INIT_AUDIO);
-    TS_CHECK(ret >= 0, "Can't init SDL audio");
-
-    ctx = (tsContext*)malloc(sizeof(tsContext) + mix_buffers_size + sample_buffer_size + 16 + pool_size);
-    TS_CHECK(ctx != NULL, "Can't create audio context");
-    ctx->latency_samples = latency_count;
-    ctx->index0 = 0;
-    ctx->index1 = 0;
-    ctx->Hz = play_frequency_in_Hz;
-    ctx->bps = bps;
-    ctx->wide_count = wide_count;
-    ctx->sample_count = wide_count * 4;
-    ctx->playing = 0;
-    ctx->floatA = (__m128*)(ctx + 1);
-    ctx->floatA = (__m128*)TS_ALIGN(ctx->floatA, 16);
-    TS_ASSERT(!((size_t)ctx->floatA & 15));
-    ctx->floatB = ctx->floatA + wide_count;
-    ctx->samples = (__m128i*)ctx->floatB + wide_count;
-    ctx->running = 1;
-    ctx->separate_thread = 0;
-    ctx->sleep_milliseconds = 0;
-
-    SDL_memset(&wanted, 0, sizeof(wanted));
-    wanted.freq = play_frequency_in_Hz;
-    wanted.format = AUDIO_S16SYS;
-    wanted.channels = 2; /* 1 = mono, 2 = stereo */
-    wanted.samples = 1024;
-    wanted.callback = tsSDL_AudioCallback;
-    wanted.userdata = ctx;
-    ret = SDL_OpenAudio(&wanted, NULL);
-    TS_CHECK(ret >= 0, "Can't open SDL audio");
-    SDL_PauseAudio(0);
-
-    if (playing_pool_count)
-    {
-        ctx->playing_pool = (tsPlayingSound*)(ctx->samples + wide_count);
-        for (int i = 0; i < playing_pool_count - 1; ++i)
-            ctx->playing_pool[i].next = ctx->playing_pool + i + 1;
-        ctx->playing_pool[playing_pool_count - 1].next = 0;
-        ctx->playing_free = ctx->playing_pool;
-    }
-
-    else
-    {
-        ctx->playing_pool = 0;
-        ctx->playing_free = 0;
-    }
-
-    return ctx;
-
-ts_err:
-    if (ctx) free(ctx);
-    return 0;
-}
-
-void tsSpawnMixThread(tsContext* ctx)
-{
-    if (ctx->separate_thread) return;
-    ctx->mutex = SDL_CreateMutex();
-    ctx->separate_thread = 1;
-    ctx->thread = SDL_CreateThread(&tsCtxThread, "TinySoundThread", ctx);
-}
-
-#endif
-
-#if TS_PLATFORM == TS_SDL || TS_PLATFORM == TS_MAC
-
-static int tsSamplesWritten(tsContext* ctx)
-{
-    int index0 = ctx->index0;
-    int index1 = ctx->index1;
-    if (index0 <= index1) return index1 - index0;
-    else return ctx->sample_count - index0 + index1;
-}
-
-static int tsSamplesUnwritten(tsContext* ctx)
-{
-    int index0 = ctx->index0;
-    int index1 = ctx->index1;
-    if (index0 <= index1) return ctx->sample_count - index1 + index0;
-    else return index0 - index1;
-}
-
-static int tsSamplesToMix(tsContext* ctx)
-{
-    int lat = ctx->latency_samples;
-    int written = tsSamplesWritten(ctx);
-    int dif = lat - written;
-    TS_ASSERT(dif >= 0);
-    if (dif)
-    {
-        int unwritten = tsSamplesUnwritten(ctx);
-        return dif < unwritten ? dif : unwritten;
-    }
-    return 0;
-}
-
-#define TS_SAMPLES_TO_BYTES( interleaved_sample_count ) ((interleaved_sample_count) * ctx->bps)
-#define TS_BYTES_TO_SAMPLES( byte_count ) ((byte_count) / ctx->bps)
-
-static void tsPushBytes(tsContext* ctx, void* data, int size)
-{
-    int index0 = ctx->index0;
-    int index1 = ctx->index1;
-    int samples = TS_BYTES_TO_SAMPLES(size);
-    int sample_count = ctx->sample_count;
-
-    int unwritten = tsSamplesUnwritten(ctx);
-    if (unwritten < samples) samples = unwritten;
-    int can_overflow = index0 <= index1;
-    int would_overflow = index1 + samples > sample_count;
-
-    if (can_overflow && would_overflow)
-    {
-        int first_size = TS_SAMPLES_TO_BYTES(sample_count - index1);
-        int second_size = size - first_size;
-        memcpy((char*)ctx->samples + TS_SAMPLES_TO_BYTES(index1), data, first_size);
-        memcpy(ctx->samples, (char*)data + first_size, second_size);
-        ctx->index1 = TS_BYTES_TO_SAMPLES(second_size);
-    }
-
-    else
-    {
-        memcpy((char*)ctx->samples + TS_SAMPLES_TO_BYTES(index1), data, size);
-        ctx->index1 += TS_BYTES_TO_SAMPLES(size);
-    }
-}
-
-static int tsPullBytes(tsContext* ctx, void* dst, int size)
-{
-    int index0 = ctx->index0;
-    int index1 = ctx->index1;
-    int allowed_size = TS_SAMPLES_TO_BYTES(tsSamplesWritten(ctx));
-    int zeros = 0;
-
-    if (allowed_size < size)
-    {
-        zeros = size - allowed_size;
-        size = allowed_size;
-    }
-
-    if (index1 >= index0)
-    {
-        memcpy(dst, ((char*)ctx->samples) + TS_SAMPLES_TO_BYTES(index0), size);
-        ctx->index0 += TS_BYTES_TO_SAMPLES(size);
-    }
-
-    else
-    {
-        int first_size = TS_SAMPLES_TO_BYTES(ctx->sample_count) - TS_SAMPLES_TO_BYTES(index0);
-        if (first_size > size) first_size = size;
-        int second_size = size - first_size;
-        memcpy(dst, ((char*)ctx->samples) + TS_SAMPLES_TO_BYTES(index0), first_size);
-        memcpy(((char*)dst) + first_size, ctx->samples, second_size);
-        if (second_size) ctx->index0 = TS_BYTES_TO_SAMPLES(second_size);
-        else ctx->index0 += TS_BYTES_TO_SAMPLES(first_size);
-    }
-
-    return zeros;
-}
-
-#endif
-
-void tsShutdownContext(tsContext* ctx)
-{
-    if (ctx->separate_thread)
-    {
-        tsLock(ctx);
-        ctx->running = 0;
-        tsUnlock(ctx);
-    }
-
-    while (ctx->separate_thread) tsSleep(1);
-    tsReleaseContext(ctx);
-}
-
-void tsThreadSleepDelay(tsContext* ctx, int milliseconds)
-{
-    ctx->sleep_milliseconds = milliseconds;
-}
-
-void tsInsertSound(tsContext* ctx, tsPlayingSound* sound)
-{
-    // Cannot use tsPlayingSound if tsMakeContext was passed non-zero for playing_pool_count
-    // since non-zero playing_pool_count means the context is doing some memory-management
-    // for a playing sound pool. InsertSound assumes the pool does not exist, and is apart
-    // of the lower-level API (see top of this header for documentation details).
-    TS_ASSERT(ctx->playing_pool == 0);
-
-    if (sound->active) return;
-    tsLock(ctx);
-    sound->next = ctx->playing;
-    ctx->playing = sound;
-    sound->active = 1;
-    tsUnlock(ctx);
-}
-
-// NOTE: does not allow delay_in_seconds to be negative (clamps at 0)
-void tsSetDelay(tsContext* ctx, tsPlayingSound* sound, float delay_in_seconds)
-{
-    if (delay_in_seconds < 0.0f) delay_in_seconds = 0.0f;
-    sound->sample_index = (int)(delay_in_seconds * (float)ctx->Hz);
-    sound->sample_index = -(int)TS_ALIGN(sound->sample_index, 4);
-}
-
-tsPlaySoundDef tsMakeDef(tsLoadedSound* sound)
-{
-    tsPlaySoundDef def;
-    def.paused = 0;
-    def.looped = 0;
-    def.volume_left = 1.0f;
-    def.volume_right = 1.0f;
-    def.pan = 0.5f;
-    def.pitch = 1.0f;
-    def.delay = 0.0f;
-    def.loaded = sound;
-    return def;
-}
-
-tsPlayingSound* tsPlaySound(tsContext* ctx, tsPlaySoundDef def)
-{
-    tsLock(ctx);
-
-    tsPlayingSound* playing = ctx->playing_free;
-    if (!playing) return 0;
-    ctx->playing_free = playing->next;
-    *playing = tsMakePlayingSound(def.loaded);
-    playing->active = 1;
-    playing->paused = def.paused;
-    playing->looped = def.looped;
-    tsSetVolume(playing, def.volume_left, def.volume_right);
-    tsSetPan(playing, def.pan);
-    tsSetPitch(playing, def.pitch);
-    tsSetDelay(ctx, playing, def.delay);
-    playing->next = ctx->playing;
-    ctx->playing = playing;
-
-    tsUnlock(ctx);
-
-    return playing;
-}
-
-void tsStopAllSounds(tsContext* ctx)
-{
-    // This is apart of the high level API, not the low level API.
-    // If using the low level API you must write your own function to
-    // stop playing all sounds.
-    TS_ASSERT(ctx->playing_pool == 0);
-
-    tsPlayingSound* sound = ctx->playing;
-    ctx->playing = 0;
-
-    while (sound)
-    {
-        tsPlayingSound* next = sound->next;
-        sound->next = ctx->playing_free;
-        ctx->playing_free = sound;
-        sound = next;
-    }
-}
-
-#if TS_PLATFORM == TS_WINDOWS
-
-static void tsPosition(tsContext* ctx, int* byte_to_lock, int* bytes_to_write)
-{
-    // compute bytes to be written to direct sound
-    DWORD play_cursor;
-    DWORD write_cursor;
-#ifdef __cplusplus
-    HRESULT hr = ctx->buffer->GetCurrentPosition(&play_cursor, &write_cursor);
-#else
-    HRESULT hr = ctx->buffer->lpVtbl->GetCurrentPosition(ctx->buffer, &play_cursor, &write_cursor);
-#endif
-    TS_ASSERT(hr == DS_OK);
-
-    DWORD lock = (ctx->running_index * ctx->bps) % ctx->buffer_size;
-    DWORD target_cursor = (write_cursor + ctx->latency_samples * ctx->bps) % ctx->buffer_size;
-    target_cursor = (DWORD)TS_ALIGN(target_cursor, 16);
-    DWORD write;
-
-    if (lock > target_cursor)
-    {
-        write = (ctx->buffer_size - lock) + target_cursor;
-    }
-
-    else
-    {
-        write = target_cursor - lock;
-    }
-
-    *byte_to_lock = lock;
-    *bytes_to_write = write;
-}
-
-static void tsMemcpyToDS(tsContext* ctx, int16_t* samples, int byte_to_lock, int bytes_to_write)
-{
-    // copy mixer buffers to direct sound
-    void* region1;
-    DWORD size1;
-    void* region2;
-    DWORD size2;
-#ifdef __cplusplus
-    HRESULT hr = ctx->buffer->Lock(byte_to_lock, bytes_to_write, &region1, &size1, &region2, &size2, 0);
-
-    if (hr == DSERR_BUFFERLOST)
-    {
-        ctx->buffer->Restore();
-        hr = ctx->buffer->Lock(byte_to_lock, bytes_to_write, &region1, &size1, &region2, &size2, 0);
-    }
-#else
-    HRESULT hr = ctx->buffer->lpVtbl->Lock(ctx->buffer, byte_to_lock, bytes_to_write, &region1, &size1, &region2, &size2, 0);
-
-    if (hr == DSERR_BUFFERLOST)
-    {
-        ctx->buffer->lpVtbl->Restore(ctx->buffer);
-        hr = ctx->buffer->lpVtbl->Lock(ctx->buffer, byte_to_lock, bytes_to_write, &region1, &size1, &region2, &size2, 0);
-    }
-#endif
-
-    if (!SUCCEEDED(hr))
-        return;
-
-    unsigned running_index = ctx->running_index;
-    INT16* sample1 = (INT16*)region1;
-    DWORD sample1_count = size1 / ctx->bps;
-    memcpy(sample1, samples, sample1_count * sizeof(INT16) * 2);
-    samples += sample1_count * 2;
-    running_index += sample1_count;
-
-    INT16* sample2 = (INT16*)region2;
-    DWORD sample2_count = size2 / ctx->bps;
-    memcpy(sample2, samples, sample2_count * sizeof(INT16) * 2);
-    samples += sample2_count * 2;
-    running_index += sample2_count;
-
-#ifdef __cplusplus
-    ctx->buffer->Unlock(region1, size1, region2, size2);
-#else
-    ctx->buffer->lpVtbl->Unlock(ctx->buffer, region1, size1, region2, size2);
-#endif
-    ctx->running_index = running_index;
-
-    // meager hack to fill out sound buffer before playing
-    static int first;
-    if (!first)
-    {
-#ifdef __cplusplus
-        ctx->buffer->Play(0, 0, DSBPLAY_LOOPING);
-#else
-        ctx->buffer->lpVtbl->Play(ctx->buffer, 0, 0, DSBPLAY_LOOPING);
-#endif
-        first = 1;
-    }
-}
-
-#elif TS_PLATFORM == TS_MAC
-
-static OSStatus tsMemcpyToCA(void* udata, AudioUnitRenderActionFlags* ioActionFlags, const AudioTimeStamp* inTimeStamp, UInt32 inBusNumber, UInt32 inNumberFrames, AudioBufferList* ioData)
-{
-    tsContext* ctx = (tsContext*)udata;
-    int bps = ctx->bps;
-    int samples_requested_to_consume = inNumberFrames;
-    AudioBuffer* buffer = ioData->mBuffers;
-
-    TS_ASSERT(ioData->mNumberBuffers == 1);
-    TS_ASSERT(buffer->mNumberChannels == 2);
-    int byte_size = buffer->mDataByteSize;
-    TS_ASSERT(byte_size == samples_requested_to_consume * bps);
-
-    int zero_bytes = tsPullBytes(ctx, buffer->mData, byte_size);
-    memset(((char*)buffer->mData) + (byte_size - zero_bytes), 0, zero_bytes);
-
-    return noErr;
-}
-
-#elif TS_PLATFORM == TS_SDL
-
-static void tsSDL_AudioCallback(void* udata, Uint8* stream, int len)
-{
-    tsContext* ctx = (tsContext*)udata;
-    int zero_bytes = tsPullBytes(ctx, stream, len);
-    memset(stream + (len - zero_bytes), 0, zero_bytes);
-}
-
-#endif
-
-static void tsPitchShift(float pitchShift, int num_samples_to_process, float sampleRate, float* indata, tsPitchData** pitch_filter);
-
-// Pitch processing tunables
-#define TS_MAX_FRAME_LENGTH 4096
-#define TS_PITCH_FRAME_SIZE 512
-#define TS_PITCH_QUALITY 8
-
-// interals
-#define TS_STEPSIZE (TS_PITCH_FRAME_SIZE / TS_PITCH_QUALITY)
-#define TS_OVERLAP (TS_PITCH_FRAME_SIZE - TS_STEPSIZE)
-#define TS_EXPECTED_FREQUENCY (2.0f * 3.14159265359f * (float)TS_STEPSIZE / (float)TS_PITCH_FRAME_SIZE)
-
-// TODO:
-// Use a memory pool for these things. For now they are just malloc16'd/free16'd
-// Not high priority to use a pool, since pitch shifting is already really expensive,
-// and cost of malloc is dwarfed. But would be a nice-to-have for potential memory
-// fragmentation issues.
-typedef struct tsPitchData
-{
-    float pitch_shifted_output_samples[TS_MAX_FRAME_LENGTH];
-    float in_FIFO[TS_STEPSIZE + TS_PITCH_FRAME_SIZE];
-    float out_FIFO[TS_STEPSIZE + TS_PITCH_FRAME_SIZE];
-    float fft_data[2 * TS_PITCH_FRAME_SIZE];
-    float previous_phase[TS_PITCH_FRAME_SIZE / 2 + 4];
-    float sum_phase[TS_PITCH_FRAME_SIZE / 2 + 4];
-    float window_accumulator[TS_STEPSIZE + TS_PITCH_FRAME_SIZE];
-    float freq[TS_PITCH_FRAME_SIZE];
-    float mag[TS_PITCH_FRAME_SIZE];
-    float pitch_shift_workspace[TS_PITCH_FRAME_SIZE];
-    int index;
-} tsPitchData;
-
-static void tsRemoveFilter(tsPlayingSound* playing)
-{
-    for (int i = 0; i < 2; i++)
-    {
-        if (playing->pitch_filter[i])
-        {
-            free16(playing->pitch_filter[i]);
-            playing->pitch_filter[i] = 0;
-        }
-    }
-}
-
-void tsMix(tsContext* ctx)
-{
-    tsLock(ctx);
-
-#if TS_PLATFORM == TS_WINDOWS
-
-    int byte_to_lock;
-    int bytes_to_write;
-    tsPosition(ctx, &byte_to_lock, &bytes_to_write);
-
-    if (!bytes_to_write) goto unlock;
-    int samples_to_write = bytes_to_write / ctx->bps;
-
-#elif TS_PLATFORM == TS_MAC || TS_PLATFORM == TS_SDL
-
-    int samples_to_write = tsSamplesToMix(ctx);
-    if (!samples_to_write) goto unlock;
-    int bytes_to_write = samples_to_write * ctx->bps;
-
-#else
-#endif
-
-    // clear mixer buffers
-    int wide_count = samples_to_write / 4;
-    TS_ASSERT(!(samples_to_write & 3));
-
-    __m128* floatA = ctx->floatA;
-    __m128* floatB = ctx->floatB;
-    __m128 zero = _mm_set1_ps(0.0f);
-
-    for (int i = 0; i < wide_count; ++i)
-    {
-        floatA[i] = zero;
-        floatB[i] = zero;
-    }
-
-    // mix all playing sounds into the mixer buffers
-    tsPlayingSound** ptr = &ctx->playing;
-    while (*ptr)
-    {
-        tsPlayingSound* playing = *ptr;
-        tsLoadedSound* loaded = playing->loaded_sound;
-        __m128* cA = (__m128*)loaded->channels[0];
-        __m128* cB = (__m128*)loaded->channels[1];
-
-        // Attempted to play a sound with no audio.
-        // Make sure the audio file was loaded properly. Check for
-        // error messages in g_tsErrorReason.
-        TS_ASSERT(cA);
-
-        int mix_count = samples_to_write;
-        int offset = playing->sample_index;
-        int remaining = loaded->sample_count - offset;
-        if (remaining < mix_count) mix_count = remaining;
-        TS_ASSERT(remaining > 0);
-
-        float vA0 = playing->volume0 * playing->pan0;
-        float vB0 = playing->volume1 * playing->pan1;
-        __m128 vA = _mm_set1_ps(vA0);
-        __m128 vB = _mm_set1_ps(vB0);
-
-        // skip sound if it's delay is longer than mix_count and
-        // handle various delay cases
-        int delay_offset = 0;
-        if (offset < 0)
-        {
-            int samples_till_positive = -offset;
-            int mix_leftover = mix_count - samples_till_positive;
-
-            if (mix_leftover <= 0)
-            {
-                playing->sample_index += mix_count;
-                goto get_next_playing_sound;
-            }
-
-            else
-            {
-                offset = 0;
-                delay_offset = samples_till_positive;
-                mix_count = mix_leftover;
-            }
-        }
-        TS_ASSERT(!(delay_offset & 3));
-
-        // immediately remove any inactive elements
-        if (!playing->active || !ctx->running)
-            goto remove;
-
-        // skip all paused sounds
-        if (playing->paused)
-            goto get_next_playing_sound;
-
-        // SIMD offets
-        int mix_wide = (int)TS_ALIGN(mix_count, 4) / 4;
-        int offset_wide = (int)TS_TRUNC(offset, 4) / 4;
-        int delay_wide = (int)TS_ALIGN(delay_offset, 4) / 4;
-
-        // use tsPitchShift to on-the-fly pitch shift some samples
-        // only call this function if the user set a custom pitch value
-        if (playing->pitch != 1.0f)
-        {
-            int sample_count = (mix_wide - 2 * delay_wide) * 4;
-            int falling_behind = sample_count > TS_MAX_FRAME_LENGTH;
-
-            // TS_MAX_FRAME_LENGTH represents max samples we can pitch shift in one go. In the event
-            // that this process takes longer than the time required to play the actual sound, just
-            // fall back to the original sound (non-pitch shifted). This will sound very ugly. To
-            // prevent falling behind, make sure not to pitch shift too many sounds at once. Try tweaking
-            // TS_PITCH_QUALITY to make it lower (must be a power of 2).
-            if (!falling_behind)
-            {
-                tsPitchShift(playing->pitch, sample_count, (float)ctx->Hz, (float*)(cA + delay_wide + offset_wide), playing->pitch_filter);
-                cA = (__m128 *)playing->pitch_filter[0]->pitch_shifted_output_samples;
-
-                if (loaded->channel_count == 2)
-                {
-                    tsPitchShift(playing->pitch, sample_count, (float)ctx->Hz, (float*)(cB + delay_wide + offset_wide), playing->pitch_filter + 1);
-                    cB = (__m128 *)playing->pitch_filter[1]->pitch_shifted_output_samples;
-                }
-
-                offset_wide = -delay_wide;
-            }
-        }
-
-        // apply volume, load samples into float buffers
-        switch (loaded->channel_count)
-        {
-        case 1:
-            for (int i = delay_wide; i < mix_wide - delay_wide; ++i)
-            {
-                __m128 A = cA[i + offset_wide];
-                __m128 B = _mm_mul_ps(A, vB);
-                A = _mm_mul_ps(A, vA);
-                floatA[i] = _mm_add_ps(floatA[i], A);
-                floatB[i] = _mm_add_ps(floatB[i], B);
-            }
-            break;
-
-        case 2:
-        {
-            for (int i = delay_wide; i < mix_wide - delay_wide; ++i)
-            {
-                __m128 A = cA[i + offset_wide];
-                __m128 B = cB[i + offset_wide];
-
-                A = _mm_mul_ps(A, vA);
-                B = _mm_mul_ps(B, vB);
-                floatA[i] = _mm_add_ps(floatA[i], A);
-                floatB[i] = _mm_add_ps(floatB[i], B);
-            }
-        }    break;
-        }
-
-        // playing list logic
-        playing->sample_index += mix_count;
-        if (playing->sample_index == loaded->sample_count)
-        {
-            if (playing->looped)
-            {
-                playing->sample_index = 0;
-                goto get_next_playing_sound;
-            }
-
-        remove:
-            playing->sample_index = 0;
-            *ptr = (*ptr)->next;
-            playing->next = 0;
-            playing->active = 0;
-
-            tsRemoveFilter(playing);
-
-            // if using high-level API manage the tsPlayingSound memory ourselves
-            if (ctx->playing_pool)
-            {
-                playing->next = ctx->playing_free;
-                ctx->playing_free = playing;
-            }
-
-            // we already incremented next pointer, so don't do it again
-            continue;
-        }
-
-    get_next_playing_sound:
-        if (*ptr) ptr = &(*ptr)->next;
-        else break;
-    }
-
-    // load all floats into 16 bit packed interleaved samples
-#if TS_PLATFORM == TS_WINDOWS
-
-    __m128i* samples = ctx->samples;
-    for (int i = 0; i < wide_count; ++i)
-    {
-        __m128i a = _mm_cvtps_epi32(floatA[i]);
-        __m128i b = _mm_cvtps_epi32(floatB[i]);
-        __m128i a0b0a1b1 = _mm_unpacklo_epi32(a, b);
-        __m128i a2b2a3b3 = _mm_unpackhi_epi32(a, b);
-        samples[i] = _mm_packs_epi32(a0b0a1b1, a2b2a3b3);
-    }
-    tsMemcpyToDS(ctx, (int16_t*)samples, byte_to_lock, bytes_to_write);
-
-#elif TS_PLATFORM == TS_MAC || TS_PLATFORM == TS_SDL
-
-    // Since the ctx->samples array is already in use as a ring buffer
-    // reusing floatA to store output is a good way to temporarly store
-    // the final samples. Then a single ring buffer push can be used
-    // afterwards. Pretty hacky, but whatever :)
-    __m128i* samples = (__m128i*)floatA;
-    memset(samples, 0, sizeof(__m128i) * wide_count);
-    for (int i = 0; i < wide_count; ++i)
-    {
-        __m128i a = _mm_cvtps_epi32(floatA[i]);
-        __m128i b = _mm_cvtps_epi32(floatB[i]);
-        __m128i a0b0a1b1 = _mm_unpacklo_epi32(a, b);
-        __m128i a2b2a3b3 = _mm_unpackhi_epi32(a, b);
-        samples[i] = _mm_packs_epi32(a0b0a1b1, a2b2a3b3);
-    }
-    tsPushBytes(ctx, samples, bytes_to_write);
-
-#else
-#endif
-
-unlock:
-    tsUnlock(ctx);
-}
-
-// TODO:
-// Try this optimization out (2N POINT REAL FFT USING AN N POINT COMPLEX FFT)
-// http://www.fftguru.com/fftguru.com.tutorial2.pdf
-
-#include <math.h>
-
-static uint32_t tsRev32(uint32_t x)
-{
-    uint32_t a = ((x & 0xAAAAAAAA) >> 1) | ((x & 0x55555555) << 1);
-    uint32_t b = ((a & 0xCCCCCCCC) >> 2) | ((a & 0x33333333) << 2);
-    uint32_t c = ((b & 0xF0F0F0F0) >> 4) | ((b & 0x0F0F0F0F) << 4);
-    uint32_t d = ((c & 0xFF00FF00) >> 8) | ((c & 0x00FF00FF) << 8);
-    return (d >> 16) | (d << 16);
-}
-
-static uint32_t tsPopCount(uint32_t x)
-{
-    uint32_t a = x - ((x >> 1) & 0x55555555);
-    uint32_t b = (((a >> 2) & 0x33333333) + (a & 0x33333333));
-    uint32_t c = (((b >> 4) + b) & 0x0F0F0F0F);
-    uint32_t d = c + (c >> 8);
-    uint32_t e = d + (d >> 16);
-    uint32_t f = e & 0x0000003F;
-    return f;
-}
-
-static uint32_t tsLog2(uint32_t x)
-{
-    uint32_t a = x | (x >> 1);
-    uint32_t b = a | (a >> 2);
-    uint32_t c = b | (b >> 4);
-    uint32_t d = c | (c >> 8);
-    uint32_t e = d | (d >> 16);
-    uint32_t f = e >> 1;
-    return tsPopCount(f);
-}
-
-// x contains real inputs
-// y contains imaginary inputs
-// count must be a power of 2
-// sign must be 1.0 (forward transform) or -1.0f (inverse transform)
-static void tsFFT(float* x, float* y, int count, float sign)
-{
-    int exponent = (int)tsLog2((uint32_t)count);
-
-    // bit reversal stage
-    // swap all elements with their bit reversed index within the
-    // lowest level of the Cooley-Tukey recursion tree
-    for (int i = 1; i < count - 1; i++)
-    {
-        uint32_t j = tsRev32((uint32_t)i);
-        j >>= (32 - exponent);
-        if (i < (int)j)
-        {
-            float tx = x[i];
-            float ty = y[i];
-            x[i] = x[j];
-            y[i] = y[j];
-            x[j] = tx;
-            y[j] = ty;
-        }
-    }
-
-    // for each recursive iteration
-    for (int iter = 0, L = 1; iter < exponent; ++iter)
-    {
-        int Ls = L;
-        L <<= 1;
-        float ur = 1.0f; // cos( pi / 2 )
-        float ui = 0;    // sin( pi / 2 )
-        float arg = 3.14159265359f / (float)Ls;
-        float wr = cosf(arg);
-        float wi = -sign * sinf(arg);
-
-        // rows in DFT submatrix
-        for (int j = 0; j < Ls; ++j)
-        {
-            // do butterflies upon DFT row elements
-            for (int i = j; i < count; i += L)
-            {
-                int index = i + Ls;
-                float x_index = x[index];
-                float y_index = y[index];
-                float x_i = x[i];
-                float y_i = y[i];
-
-                float tr = ur * x_index - ui * y_index;
-                float ti = ur * y_index + ui * x_index;
-                float x_low = x_i - tr;
-                float x_high = x_i + tr;
-                float y_low = y_i - ti;
-                float y_high = y_i + ti;
-
-                x[index] = x_low;
-                y[index] = y_low;
-                x[i] = x_high;
-                y[i] = y_high;
-            }
-
-            // Rotate u1 and u2 via Givens rotations (2d planar rotation).
-            // This keeps cos/sin calls in the outermost loop.
-            // Floating point error is scaled proportionally to Ls.
-            float t = ur * wr - ui * wi;
-            ui = ur * wi + ui * wr;
-            ur = t;
-        }
-    }
-
-    // scale factor for forward transform
-    if (sign > 0)
-    {
-        float inv_count = 1.0f / (float)count;
-        for (int i = 0; i < count; i++)
-        {
-            x[i] *= inv_count;
-            y[i] *= inv_count;
-        }
-    }
-}
-
-#ifdef _MSC_VER
-
-#define TS_ALIGN16_0 __declspec( align( 16 ) )
-#define TS_ALIGN16_1
-#define TS_SELECTANY extern const __declspec( selectany )
-
-#else
-
-#define TS_ALIGN16_0
-#define TS_ALIGN16_1 __attribute__( (aligned( 16 )) )
-#define TS_SELECTANY const __attribute__( (selectany) )
-
-#endif
-
-// SSE2 trig funcs from https://github.com/to-miz/sse_mathfun_extension/
-#define _PS_CONST( Name, Val ) \
-    TS_SELECTANY TS_ALIGN16_0 float _ps_##Name[ 4 ] TS_ALIGN16_1 = { Val, Val, Val, Val }
-
-#define _PS_CONST_TYPE( Name, Type, Val ) \
-    TS_SELECTANY TS_ALIGN16_0 Type _ps_##Name[ 4 ] TS_ALIGN16_1 = { Val, Val, Val, Val }
-
-#define _PI32_CONST( Name, Val ) \
-    TS_SELECTANY TS_ALIGN16_0 int _pi32_##Name[ 4 ] TS_ALIGN16_1 = { Val, Val, Val, Val }
-
-_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
-_PS_CONST_TYPE(inv_sign_mask, int, (int)~0x80000000);
-
-_PS_CONST(atanrange_hi, 2.414213562373095f);
-_PS_CONST(atanrange_lo, 0.4142135623730950f);
-_PS_CONST(cephes_PIO2F, 1.5707963267948966192f);
-_PS_CONST(cephes_PIO4F, 0.7853981633974483096f);
-_PS_CONST(1, 1.0f);
-_PS_CONST(0p5, 0.5f);
-_PS_CONST(0, 0);
-_PS_CONST(sincof_p0, -1.9515295891E-4f);
-_PS_CONST(sincof_p1, 8.3321608736E-3f);
-_PS_CONST(sincof_p2, -1.6666654611E-1f);
-_PS_CONST(atancof_p0, 8.05374449538e-2f);
-_PS_CONST(atancof_p1, 1.38776856032E-1f);
-_PS_CONST(atancof_p2, 1.99777106478E-1f);
-_PS_CONST(atancof_p3, 3.33329491539E-1f);
-_PS_CONST(cephes_PIF, 3.141592653589793238f);
-_PS_CONST(cephes_2PIF, 2.0f * 3.141592653589793238f);
-_PS_CONST(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
-_PS_CONST(minus_cephes_DP1, -0.78515625f);
-_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f);
-_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8f);
-_PS_CONST(coscof_p0, 2.443315711809948E-005f);
-_PS_CONST(coscof_p1, -1.388731625493765E-003f);
-_PS_CONST(coscof_p2, 4.166664568298827E-002f);
-_PS_CONST(frame_size, (float)TS_PITCH_FRAME_SIZE);
-
-_PI32_CONST(1, 1);
-_PI32_CONST(inv1, ~1);
-_PI32_CONST(2, 2);
-_PI32_CONST(4, 4);
-
-static __m128 _mm_atan_ps(__m128 x)
-{
-    __m128 sign_bit, y;
-
-    sign_bit = x;
-    /* take the absolute value */
-    x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
-    /* extract the sign bit (upper one) */
-    sign_bit = _mm_and_ps(sign_bit, *(__m128*)_ps_sign_mask);
-
-    /* range reduction, init x and y depending on range */
-    /* x > 2.414213562373095 */
-    __m128 cmp0 = _mm_cmpgt_ps(x, *(__m128*)_ps_atanrange_hi);
-    /* x > 0.4142135623730950 */
-    __m128 cmp1 = _mm_cmpgt_ps(x, *(__m128*)_ps_atanrange_lo);
-
-    /* x > 0.4142135623730950 && !( x > 2.414213562373095 ) */
-    __m128 cmp2 = _mm_andnot_ps(cmp0, cmp1);
-
-    /* -( 1.0/x ) */
-    __m128 y0 = _mm_and_ps(cmp0, *(__m128*)_ps_cephes_PIO2F);
-    __m128 x0 = _mm_div_ps(*(__m128*)_ps_1, x);
-    x0 = _mm_xor_ps(x0, *(__m128*)_ps_sign_mask);
-
-    __m128 y1 = _mm_and_ps(cmp2, *(__m128*)_ps_cephes_PIO4F);
-    /* (x-1.0)/(x+1.0) */
-    __m128 x1_o = _mm_sub_ps(x, *(__m128*)_ps_1);
-    __m128 x1_u = _mm_add_ps(x, *(__m128*)_ps_1);
-    __m128 x1 = _mm_div_ps(x1_o, x1_u);
-
-    __m128 x2 = _mm_and_ps(cmp2, x1);
-    x0 = _mm_and_ps(cmp0, x0);
-    x2 = _mm_or_ps(x2, x0);
-    cmp1 = _mm_or_ps(cmp0, cmp2);
-    x2 = _mm_and_ps(cmp1, x2);
-    x = _mm_andnot_ps(cmp1, x);
-    x = _mm_or_ps(x2, x);
-
-    y = _mm_or_ps(y0, y1);
-
-    __m128 zz = _mm_mul_ps(x, x);
-    __m128 acc = *(__m128*)_ps_atancof_p0;
-    acc = _mm_mul_ps(acc, zz);
-    acc = _mm_sub_ps(acc, *(__m128*)_ps_atancof_p1);
-    acc = _mm_mul_ps(acc, zz);
-    acc = _mm_add_ps(acc, *(__m128*)_ps_atancof_p2);
-    acc = _mm_mul_ps(acc, zz);
-    acc = _mm_sub_ps(acc, *(__m128*)_ps_atancof_p3);
-    acc = _mm_mul_ps(acc, zz);
-    acc = _mm_mul_ps(acc, x);
-    acc = _mm_add_ps(acc, x);
-    y = _mm_add_ps(y, acc);
-
-    /* update the sign */
-    y = _mm_xor_ps(y, sign_bit);
-
-    return y;
-}
-
-static __m128 _mm_atan2_ps(__m128 y, __m128 x)
-{
-    __m128 x_eq_0 = _mm_cmpeq_ps(x, *(__m128*)_ps_0);
-    __m128 x_gt_0 = _mm_cmpgt_ps(x, *(__m128*)_ps_0);
-    __m128 x_le_0 = _mm_cmple_ps(x, *(__m128*)_ps_0);
-    __m128 y_eq_0 = _mm_cmpeq_ps(y, *(__m128*)_ps_0);
-    __m128 x_lt_0 = _mm_cmplt_ps(x, *(__m128*)_ps_0);
-    __m128 y_lt_0 = _mm_cmplt_ps(y, *(__m128*)_ps_0);
-
-    __m128 zero_mask = _mm_and_ps(x_eq_0, y_eq_0);
-    __m128 zero_mask_other_case = _mm_and_ps(y_eq_0, x_gt_0);
-    zero_mask = _mm_or_ps(zero_mask, zero_mask_other_case);
-
-    __m128 pio2_mask = _mm_andnot_ps(y_eq_0, x_eq_0);
-    __m128 pio2_mask_sign = _mm_and_ps(y_lt_0, *(__m128*)_ps_sign_mask);
-    __m128 pio2_result = *(__m128*)_ps_cephes_PIO2F;
-    pio2_result = _mm_xor_ps(pio2_result, pio2_mask_sign);
-    pio2_result = _mm_and_ps(pio2_mask, pio2_result);
-
-    __m128 pi_mask = _mm_and_ps(y_eq_0, x_le_0);
-    __m128 pi = *(__m128*)_ps_cephes_PIF;
-    __m128 pi_result = _mm_and_ps(pi_mask, pi);
-
-    __m128 swap_sign_mask_offset = _mm_and_ps(x_lt_0, y_lt_0);
-    swap_sign_mask_offset = _mm_and_ps(swap_sign_mask_offset, *(__m128*)_ps_sign_mask);
-
-    __m128 offset0 = _mm_setzero_ps();
-    __m128 offset1 = *(__m128*)_ps_cephes_PIF;
-    offset1 = _mm_xor_ps(offset1, swap_sign_mask_offset);
-
-    __m128 offset = _mm_andnot_ps(x_lt_0, offset0);
-    offset = _mm_and_ps(x_lt_0, offset1);
-
-    __m128 arg = _mm_div_ps(y, x);
-    __m128 atan_result = _mm_atan_ps(arg);
-    atan_result = _mm_add_ps(atan_result, offset);
-
-    /* select between zero_result, pio2_result and atan_result */
-
-    __m128 result = _mm_andnot_ps(zero_mask, pio2_result);
-    atan_result = _mm_andnot_ps(pio2_mask, atan_result);
-    atan_result = _mm_andnot_ps(pio2_mask, atan_result);
-    result = _mm_or_ps(result, atan_result);
-    result = _mm_or_ps(result, pi_result);
-
-    return result;
-}
-
-static void _mm_sincos_ps(__m128 x, __m128 *s, __m128 *c)
-{
-    __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
-    __m128i emm0, emm2, emm4;
-    sign_bit_sin = x;
-    /* take the absolute value */
-    x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
-    /* extract the sign bit (upper one) */
-    sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask);
-
-    /* scale by 4/Pi */
-    y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);
-
-    /* store the integer part of y in emm2 */
-    emm2 = _mm_cvttps_epi32(y);
-
-    /* j=(j+1) & (~1) (see the cephes sources) */
-    emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
-    emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
-    y = _mm_cvtepi32_ps(emm2);
-
-    emm4 = emm2;
-
-    /* get the swap sign flag for the sine */
-    emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
-    emm0 = _mm_slli_epi32(emm0, 29);
-    __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
-
-    /* get the polynom selection mask for the sine*/
-    emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
-    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-    __m128 poly_mask = _mm_castsi128_ps(emm2);
-
-    /* The magic pass: "Extended precision modular arithmetic"
-    x = ((x - y * DP1) - y * DP2) - y * DP3; */
-    xmm1 = *(__m128*)_ps_minus_cephes_DP1;
-    xmm2 = *(__m128*)_ps_minus_cephes_DP2;
-    xmm3 = *(__m128*)_ps_minus_cephes_DP3;
-    xmm1 = _mm_mul_ps(y, xmm1);
-    xmm2 = _mm_mul_ps(y, xmm2);
-    xmm3 = _mm_mul_ps(y, xmm3);
-    x = _mm_add_ps(x, xmm1);
-    x = _mm_add_ps(x, xmm2);
-    x = _mm_add_ps(x, xmm3);
-
-    emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2);
-    emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4);
-    emm4 = _mm_slli_epi32(emm4, 29);
-    __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
-
-    sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
-
-
-    /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-    __m128 z = _mm_mul_ps(x, x);
-    y = *(__m128*)_ps_coscof_p0;
-
-    y = _mm_mul_ps(y, z);
-    y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
-    y = _mm_mul_ps(y, z);
-    y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
-    y = _mm_mul_ps(y, z);
-    y = _mm_mul_ps(y, z);
-    __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
-    y = _mm_sub_ps(y, tmp);
-    y = _mm_add_ps(y, *(__m128*)_ps_1);
-
-    /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-
-    __m128 y2 = *(__m128*)_ps_sincof_p0;
-    y2 = _mm_mul_ps(y2, z);
-    y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
-    y2 = _mm_mul_ps(y2, z);
-    y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
-    y2 = _mm_mul_ps(y2, z);
-    y2 = _mm_mul_ps(y2, x);
-    y2 = _mm_add_ps(y2, x);
-
-    /* select the correct result from the two polynoms */
-    xmm3 = poly_mask;
-    __m128 ysin2 = _mm_and_ps(xmm3, y2);
-    __m128 ysin1 = _mm_andnot_ps(xmm3, y);
-    y2 = _mm_sub_ps(y2, ysin2);
-    y = _mm_sub_ps(y, ysin1);
-
-    xmm1 = _mm_add_ps(ysin1, ysin2);
-    xmm2 = _mm_add_ps(y, y2);
-
-    /* update the sign */
-    *s = _mm_xor_ps(xmm1, sign_bit_sin);
-    *c = _mm_xor_ps(xmm2, sign_bit_cos);
-}
-
-static __m128i select_si(__m128i a, __m128i b, __m128i mask)
-{
-    return _mm_xor_si128(a, _mm_and_si128(mask, _mm_xor_si128(b, a)));
-}
-
-#define tsVonHann( i ) (-0.5f * cosf( 2.0f * 3.14159265359f * (float)(i) / (float)TS_PITCH_FRAME_SIZE ) + 0.5f)
-
-static __m128 tsVonHann4(int i)
-{
-    __m128 k4 = _mm_set_ps((float)(i * 4 + 3), (float)(i * 4 + 2), (float)(i * 4 + 1), (float)(i * 4));
-    k4 = _mm_mul_ps(*(__m128*)_ps_cephes_2PIF, k4);
-    k4 = _mm_div_ps(k4, *(__m128*)_ps_frame_size);
-
-    // Seems like _mm_cos_ps and _mm_sincos_ps was causing some audio popping...
-    // I'm not really skilled enough to fix it, but feel free to try: http://gruntthepeon.free.fr/ssemath/sse_mathfun.h
-    // My guess is some large negative or positive values were causing some
-    // precision trouble. In this case manually calling 4 cosines is not
-    // really a big deal, since this function is not a bottleneck.
-
-#if 0
-    __m128 c = _mm_cos_ps(k4);
-#elif 0
-    __m128 s, c;
-    _mm_sincos_ps(k4, &s, &c);
-#else
-    __m128 c = k4;
-    float* cf = (float*)&c;
-    cf[0] = cosf(cf[0]);
-    cf[1] = cosf(cf[1]);
-    cf[2] = cosf(cf[2]);
-    cf[3] = cosf(cf[3]);
-#endif
-
-    __m128 von_hann = _mm_add_ps(_mm_mul_ps(_mm_set_ps1(-0.5f), c), _mm_set_ps1(0.5f));
-    return von_hann;
-}
-
-// Analysis and synthesis steps learned from Bernsee's wonderful blog post:
-// http://blogs.zynaptiq.com/bernsee/pitch-shifting-using-the-ft/
-static void tsPitchShift(float pitchShift, int num_samples_to_process, float sampleRate, float* indata, tsPitchData** pitch_filter)
-{
-    TS_ASSERT(num_samples_to_process <= TS_MAX_FRAME_LENGTH);
-
-    // make sure compiler didn't do anything weird with the member
-    // offsets of tsPitchData. All arrays must be 16 byte aligned
-    TS_ASSERT(!((size_t)&(((tsPitchData*)0)->pitch_shifted_output_samples) & 15));
-    TS_ASSERT(!((size_t)&(((tsPitchData*)0)->fft_data) & 15));
-    TS_ASSERT(!((size_t)&(((tsPitchData*)0)->previous_phase) & 15));
-    TS_ASSERT(!((size_t)&(((tsPitchData*)0)->sum_phase) & 15));
-    TS_ASSERT(!((size_t)&(((tsPitchData*)0)->window_accumulator) & 15));
-    TS_ASSERT(!((size_t)&(((tsPitchData*)0)->freq) & 15));
-    TS_ASSERT(!((size_t)&(((tsPitchData*)0)->mag) & 15));
-    TS_ASSERT(!((size_t)&(((tsPitchData*)0)->pitch_shift_workspace) & 15));
-
-    tsPitchData* pf;
-
-    if (*pitch_filter == NULL)
-    {
-        pf = (tsPitchData*)malloc16(sizeof(tsPitchData));
-        memset(pf, 0, sizeof(tsPitchData));
-        *pitch_filter = pf;
-    }
-    else
-    {
-        pf = *pitch_filter;
-    }
-
-    float freqPerBin = sampleRate / (float)TS_PITCH_FRAME_SIZE;
-    __m128 freq_per_bin = _mm_set_ps1(sampleRate / (float)TS_PITCH_FRAME_SIZE);
-    __m128 pi = *(__m128*)_ps_cephes_PIF;
-    __m128 two_pi = *(__m128*)_ps_cephes_2PIF;
-    __m128 pitch_quality = _mm_set_ps1((float)TS_PITCH_QUALITY);
-    float* out_samples = pf->pitch_shifted_output_samples;
-    if (pf->index == 0) pf->index = TS_OVERLAP;
-
-    while (num_samples_to_process)
-    {
-        int copy_count = TS_PITCH_FRAME_SIZE - pf->index;
-        if (num_samples_to_process < copy_count) copy_count = num_samples_to_process;
-
-        memcpy(pf->in_FIFO + pf->index, indata, sizeof(float) * copy_count);
-        memcpy(out_samples, pf->out_FIFO + pf->index - TS_OVERLAP, sizeof(float) * copy_count);
-
-        int start_index = pf->index;
-        int offset = start_index & 3;
-        start_index += 4 - offset;
-
-        for (int i = 0; i < offset; ++i)
-            pf->in_FIFO[pf->index + i] /= 32768.0f;
-
-        int extra = copy_count & 3;
-        copy_count = copy_count / 4 - extra;
-        __m128* in_FIFO = (__m128*)(pf->in_FIFO + pf->index + offset);
-        TS_ASSERT(!((size_t)in_FIFO & 15));
-        __m128 int16_max = _mm_set_ps1(32768.0f);
-
-        for (int i = 0; i < copy_count; ++i)
-        {
-            __m128 val = in_FIFO[i];
-            __m128 div = _mm_div_ps(val, int16_max);
-            in_FIFO[i] = div;
-        }
-
-        for (int i = 0, copy_count4 = copy_count * 4; i < extra; ++i)
-        {
-            int index = copy_count4 + i;
-            pf->in_FIFO[pf->index + index] /= 32768.0f;
-        }
-
-        TS_ASSERT(!((size_t)out_samples & 15));
-        __m128* out_samples4 = (__m128*)out_samples;
-        for (int i = 0; i < copy_count; ++i)
-        {
-            __m128 val = out_samples4[i];
-            __m128 mul = _mm_mul_ps(val, int16_max);
-            out_samples4[i] = mul;
-        }
-
-        for (int i = 0, copy_count4 = copy_count * 4; i < extra; ++i)
-        {
-            int index = copy_count4 + i;
-            out_samples[index] *= 32768.0f;
-        }
-
-        copy_count = copy_count * 4 + extra;
-        num_samples_to_process -= copy_count;
-        pf->index += copy_count;
-        indata += copy_count;
-        out_samples += copy_count;
-
-        if (pf->index >= TS_PITCH_FRAME_SIZE)
-        {
-            pf->index = TS_OVERLAP;
-            {
-                __m128* fft_data = (__m128*)pf->fft_data;
-                __m128* in_FIFO = (__m128*)pf->in_FIFO;
-
-                for (int k = 0; k < TS_PITCH_FRAME_SIZE / 4; k++)
-                {
-                    __m128 von_hann = tsVonHann4(k);
-                    __m128 sample = in_FIFO[k];
-                    __m128 windowed_sample = _mm_mul_ps(sample, von_hann);
-                    fft_data[k] = windowed_sample;
-                }
-            }
-
-            memset(pf->fft_data + TS_PITCH_FRAME_SIZE, 0, TS_PITCH_FRAME_SIZE * sizeof(float));
-            tsFFT(pf->fft_data, pf->fft_data + TS_PITCH_FRAME_SIZE, TS_PITCH_FRAME_SIZE, 1.0f);
-
-            {
-                __m128* fft_data = (__m128*)pf->fft_data;
-                __m128* previous_phase = (__m128*)pf->previous_phase;
-                __m128* magnitudes = (__m128*)pf->mag;
-                __m128* frequencies = (__m128*)pf->freq;
-                int simd_count = (TS_PITCH_FRAME_SIZE / 2) / 4;
-
-                for (int k = 0; k <= simd_count; k++)
-                {
-                    __m128 real = fft_data[k];
-                    __m128 imag = fft_data[(TS_PITCH_FRAME_SIZE / 4) + k];
-                    __m128 overlap_phase = _mm_set_ps((float)(k * 4 + 3) * TS_EXPECTED_FREQUENCY, (float)(k * 4 + 2) * TS_EXPECTED_FREQUENCY, (float)(k * 4 + 1) * TS_EXPECTED_FREQUENCY, (float)(k * 4) * TS_EXPECTED_FREQUENCY);
-                    __m128 k4 = _mm_set_ps((float)(k * 4 + 3), (float)(k * 4 + 2), (float)(k * 4 + 1), (float)(k * 4));
-
-                    __m128 mag = _mm_mul_ps(_mm_set_ps1(2.0f), _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(real, real), _mm_mul_ps(imag, imag))));
-                    __m128 phase = _mm_atan2_ps(imag, real);
-                    __m128 phase_dif = _mm_sub_ps(phase, previous_phase[k]);
-
-                    previous_phase[k] = phase;
-                    phase_dif = _mm_sub_ps(phase_dif, overlap_phase);
-
-                    // map delta phase into +/- pi interval
-                    __m128i qpd = _mm_cvttps_epi32(_mm_div_ps(phase_dif, pi));
-                    __m128i zero = _mm_setzero_si128();
-                    __m128i ltzero_mask = _mm_cmplt_epi32(qpd, zero);
-                    __m128i ones_bit = _mm_and_si128(qpd, _mm_set1_epi32(1));
-                    __m128i neg_qpd = _mm_sub_epi32(qpd, ones_bit);
-                    __m128i pos_qpd = _mm_add_epi32(qpd, ones_bit);
-                    qpd = select_si(pos_qpd, neg_qpd, ltzero_mask);
-                    __m128 pi_range_offset = _mm_mul_ps(pi, _mm_cvtepi32_ps(qpd));
-                    phase_dif = _mm_sub_ps(phase_dif, pi_range_offset);
-
-                    __m128 deviation = _mm_div_ps(_mm_mul_ps(_mm_set_ps1((float)TS_PITCH_QUALITY), phase_dif), two_pi);
-                    __m128 true_freq_estimated = _mm_add_ps(_mm_mul_ps(k4, freq_per_bin), _mm_mul_ps(deviation, freq_per_bin));
-
-                    magnitudes[k] = mag;
-                    frequencies[k] = true_freq_estimated;
-                }
-            }
-
-            // actual pitch shifting work
-            // shift frequencies into workspace
-            memset(pf->pitch_shift_workspace, 0, (TS_PITCH_FRAME_SIZE / 2) * sizeof(float));
-            for (int k = 0; k <= TS_PITCH_FRAME_SIZE / 2; k++)
-            {
-                int index = (int)(k * pitchShift);
-                if (index <= TS_PITCH_FRAME_SIZE / 2)
-                    pf->pitch_shift_workspace[index] = pf->freq[k] * pitchShift;
-            }
-
-            // swap buffers around to reuse old pf->preq buffer as the new workspace
-            float* frequencies = pf->pitch_shift_workspace;
-            float* pitch_shift_workspace = pf->freq;
-            float* magnitudes = pf->mag;
-
-            // shift magnitudes into workspace
-            memset(pitch_shift_workspace, 0, TS_PITCH_FRAME_SIZE * sizeof(float));
-            for (int k = 0; k <= TS_PITCH_FRAME_SIZE / 2; k++)
-            {
-                int index = (int)(k * pitchShift);
-                if (index <= TS_PITCH_FRAME_SIZE / 2)
-                    pitch_shift_workspace[index] += magnitudes[k];
-            }
-
-            // track where the shifted magnitudes are
-            magnitudes = pitch_shift_workspace;
-
-            {
-                __m128* magnitudes4 = (__m128*)magnitudes;
-                __m128* frequencies4 = (__m128*)frequencies;
-                __m128* fft_data = (__m128*)pf->fft_data;
-                __m128* sum_phase = (__m128*)pf->sum_phase;
-                int simd_count = (TS_PITCH_FRAME_SIZE / 2) / 4;
-
-                for (int k = 0; k <= simd_count; k++)
-                {
-                    __m128 mag = magnitudes4[k];
-                    __m128 freq = frequencies4[k];
-                    __m128 freq_per_bin_k = _mm_set_ps((float)(k * 4 + 3) * freqPerBin, (float)(k * 4 + 2) * freqPerBin, (float)(k * 4 + 1) * freqPerBin, (float)(k * 4) * freqPerBin);
-
-                    freq = _mm_sub_ps(freq, freq_per_bin_k);
-                    freq = _mm_div_ps(freq, freq_per_bin);
-
-                    freq = _mm_mul_ps(two_pi, freq);
-                    freq = _mm_div_ps(freq, pitch_quality);
-
-                    __m128 overlap_phase = _mm_set_ps((float)(k * 4 + 3) * TS_EXPECTED_FREQUENCY, (float)(k * 4 + 2) * TS_EXPECTED_FREQUENCY, (float)(k * 4 + 1) * TS_EXPECTED_FREQUENCY, (float)(k * 4) * TS_EXPECTED_FREQUENCY);
-                    freq = _mm_add_ps(freq, overlap_phase);
-
-                    __m128 phase = sum_phase[k];
-                    phase = _mm_add_ps(phase, freq);
-                    sum_phase[k] = phase;
-
-                    __m128 c, s;
-                    _mm_sincos_ps(phase, &s, &c);
-                    __m128 real = _mm_mul_ps(mag, c);
-                    __m128 imag = _mm_mul_ps(mag, s);
-
-                    fft_data[k] = real;
-                    fft_data[(TS_PITCH_FRAME_SIZE / 4) + k] = imag;
-                }
-            }
-
-            for (int k = TS_PITCH_FRAME_SIZE + 2; k < 2 * TS_PITCH_FRAME_SIZE - 2; ++k)
-                pf->fft_data[k] = 0;
-
-            tsFFT(pf->fft_data, pf->fft_data + TS_PITCH_FRAME_SIZE, TS_PITCH_FRAME_SIZE, -1);
-
-            {
-                __m128* fft_data = (__m128*)pf->fft_data;
-                __m128* window_accumulator = (__m128*)pf->window_accumulator;
-
-                for (int k = 0; k < TS_PITCH_FRAME_SIZE / 4; ++k)
-                {
-                    __m128 von_hann = tsVonHann4(k);
-                    __m128 fft_data_segment = fft_data[k];
-                    __m128 accumulator_segment = window_accumulator[k];
-                    __m128 divisor = _mm_div_ps(pitch_quality, _mm_set_ps1(8.0f));
-                    fft_data_segment = _mm_mul_ps(von_hann, fft_data_segment);
-                    fft_data_segment = _mm_div_ps(fft_data_segment, divisor);
-                    accumulator_segment = _mm_add_ps(accumulator_segment, fft_data_segment);
-                    window_accumulator[k] = accumulator_segment;
-                }
-            }
-
-            memcpy(pf->out_FIFO, pf->window_accumulator, TS_STEPSIZE * sizeof(float));
-            memmove(pf->window_accumulator, pf->window_accumulator + TS_STEPSIZE, TS_PITCH_FRAME_SIZE * sizeof(float));
-            memmove(pf->in_FIFO, pf->in_FIFO + TS_STEPSIZE, TS_OVERLAP * sizeof(float));
-        }
-    }
-}
-
-/*
-zlib license:
-
-Copyright (c) 2017 Randy Gaul http://www.randygaul.net
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from
-the use of this software.
-
-Permission is granted to anyone to use this software for any purpose,
-including commercial applications, and to alter it and redistribute it
-freely, subject to the following restrictions:
-1. The origin of this software must not be misrepresented; you must not
-claim that you wrote the original software. If you use this software
-in a product, an acknowledgment in the product documentation would be
-appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not
-be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-#endif