diff options
Diffstat (limited to 'src/libs/tiny/tinysound.h')
-rw-r--r-- | src/libs/tiny/tinysound.h | 2560 |
1 files changed, 0 insertions, 2560 deletions
diff --git a/src/libs/tiny/tinysound.h b/src/libs/tiny/tinysound.h deleted file mode 100644 index 41d547d..0000000 --- a/src/libs/tiny/tinysound.h +++ /dev/null @@ -1,2560 +0,0 @@ -/* -tinysound.h - v1.07 - -Summary: -tinysound is a C API for loading, playing, looping, panning and fading mono -and stereo sounds. This means tinysound imparts no external DLLs or large -libraries that adversely affect shipping size. tinysound can also run on -Windows XP since DirectSound ships with all recent versions of Windows. -tinysound implements a custom SSE2 mixer by explicitly locking and unlocking -portions of an internal sound buffer. tinysound uses CoreAudio for Apple machines (like -OSX and iOS). SDL is used for all other platforms. Define TS_FORCE_SDL -before placing the TS_IMPLEMENTATION in order to force the use of SDL. - -Revision history: -1.0 (06/04/2016) initial release -1.01 (06/06/2016) load WAV from memory -separate portable and OS-specific code in tsMix -fixed bug causing audio glitches when sounds ended -added stb_vorbis loaders + demo example -1.02 (06/08/2016) error checking + strings in vorbis loaders -SSE2 implementation of mixer -fix typos on docs/comments -corrected volume bug introduced in 1.01 -1.03 (07/05/2016) size calculation helper (to know size of sound in -bytes on the heap) tsSoundSize -1.04 (12/06/2016) merged in Aaron Balint's contributions -SFFT and pitch functions from Stephan M. Bernsee -tsMix can run on its own thread with tsSpawnMixThread -updated documentation, typo fixes -fixed typo in malloc16 that caused heap corruption -1.05 (12/08/2016) tsStopAllSounds, suggested by Aaron Balint -1.06 (02/17/2017) port to CoreAudio for Apple machines -1.07 (06/18/2017) SIMD the pitch shift code; swapped out old Bernsee -code for a new re-write, updated docs as necessary, -support for compiling as .c and .cpp on Windows, -port for SDL (for Linux, or any other platform). -Special thanks to DeXP of github for 90% of the work -on the SDL port! 
-*/ - -/* -Contributors: -Aaron Balint 1.04 - real time pitch -1.04 - separate thread for tsMix -1.04 - bugfix, removed extra free16 call for second channel -DeXP 1.07 - initial work on SDL port -*/ - -/* -To create implementation (the function definitions) -#define TS_IMPLEMENTATION -in *one* C/CPP file (translation unit) that includes this file - -DOCUMENTATION (very quick intro): -1. create context -2. load sounds from disk into memory -3. play sounds -4. free context - -1. tsContext* ctx = tsMakeContext( hwnd, frequency, latency, seconds, N ); -2. tsLoadedSound loaded = tsLoadWAV( "path_to_file/filename.wav" ); tsPlaySoundDef def = tsMakeDef( &loaded ); -3. tsPlaySound( ctx, def ); -4. tsShutdownContext( ctx ); - -DOCUMENTATION (longer introduction): -tinysound consists of tsLoadedSounds, tsPlayingSounds and the tsContext. -The tsContext encapsulates an OS sound API, as well as buffers + settings. -tsLoadedSound holds raw samples of a sound. tsPlayingSound is an instance -of a tsLoadedSound that represents a sound that can be played through the -tsContext. - -There are two main versions of the API, the low-level and the high-level -API. The low-level API does not manage any memory for tsPlayingSounds. The -high-level API holds a memory pool of playing sounds. - -High-level API: -First create a context and pass in non-zero to the final parameter. This -final parameter controls how large of a memory pool to use for tsPlayingSounds. -Here's an example where N is the size of the internal pool: - -tsContext* ctx = tsMakeContext( hwnd, frequency, latency, seconds, N ); - -We create tsPlayingSounds indirectly with tsPlaySoundDef structs. tsPlaySoundDef is a -POD struct so feel free to make them straight on the stack. The tsPlaySoundDef -sets up initialization parameters. 
Here's an example to load a wav and -play it: - -tsLoadedSound loaded = tsLoadWAV( "path_to_file/filename.wav" ); -tsPlaySoundDef def = tsMakeDef( &loaded ); -tsPlayingSound* sound = tsPlaySound( ctx, def ); - -The same def can be used to play as many sounds as desired (even simultaneously) -as long as the context playing sound pool is large enough. - -Low-level API: -First create a context and pass 0 in the final parameter (0 here means -the context will *not* allocate a tsPlayingSound memory pool): - -tsContext* ctx = tsMakeContext( hwnd, frequency, latency, seconds, 0 ); - -parameters: -hwnd -- HWND, handle to window (on OSX just pass in 0) -frequency -- int, represents Hz frequency rate at which samples are played -latency -- int, estimated latency in Hz from PlaySound call to speaker output -seconds -- int, number of seconds of samples internal buffers can hold -0 (last param) -- int, number of elements in tsPlayingSound pool - -We create a tsPlayingSound like so: -tsLoadedSound loaded = tsLoadWAV( "path_to_file/filename.wav" ); -tsPlayingSound playing_sound = tsMakePlayingSound( &loaded ); - -Then to play the sound we do: -tsInsertSound( ctx, &playing_sound ); - -The above tsInsertSound function call will place playing_sound into -a singly-linked list inside the context. The context will remove -the sound from its internal list when it finishes playing. - -WARNING: The high-level API cannot be mixed with the low-level API. If you -try then the internal code will assert and crash. Pick one and stick with it. -Usually the high-level API will be used, but if someone is *really* picky about -their memory usage, or wants more control, the low-level API can be used. 
- -Here is the Low-Level API: -tsPlayingSound tsMakePlayingSound( tsLoadedSound* loaded ); -void tsInsertSound( tsContext* ctx, tsPlayingSound* sound ); - -Here is the High-Level API: -tsPlayingSound* tsPlaySound( tsContext* ctx, tsPlaySoundDef def ); -tsPlaySoundDef tsMakeDef( tsLoadedSound* sound ); -void tsStopAllSounds( tsContext* ctx ); - -Be sure to link against dsound.dll (or dsound.lib) on Windows. - -Read the rest of the header for specific details on all available functions -and struct types. -*/ - -/* -Known Limitations: - -* PCM mono/stereo are the only formats the tsLoadWAV function supports. I don't -guarantee it will work for all kinds of wav files, but it certainly does for the common -kind (and can be changed fairly easily if someone wanted to extend it). -* Only supports 16 bits per sample. -* Mixer does not do any fancy clipping. The algorithm is to convert all 16 bit samples -to float, mix all samples, and write back to audio API as 16 bit integers. In -practice this works very well and clipping is not often a big problem. -* I'm not super familiar with good ways to keep the DirectSound play cursor from going -past the write cursor. To mitigate this pass in a larger number to tsMakeContext's 4th -parameter (buffer scale in seconds). -* Pitch shifting code is pretty darn expensive. This is due to the use of a Fast Fourier Transform -routine. The pitch shifting itself is written in rather efficient SIMD using SSE2 intrinsics, -but the FFT routine is very basic. FFT is a big bottleneck for pitch shifting. There is a -TODO optimization listed in this file for the FFT routine, but it's fairly low priority; -optimizing FFT routines is difficult and requires a lot of specialized knowledge. -*/ - -/* -FAQ -Q : Why DirectSound instead of (insert API here) on Windows? -A : Casey Muratori documented DS on Handmade Hero, other APIs do not have such good docs. 
DS has -shipped on Windows XP all the way through Windows 10 -- using this header effectively intro- -duces zero dependencies for the foreseeable future. The DS API itself is sane enough to quickly -implement needed features, and users won't hear the difference between various APIs. Latency is -not that great with DS but it is shippable. Additionally, many other APIs will in the end speak -to Windows through the DS API. - -Q : Why not include Linux support? -A : There have been a couple requests for ALSA support on Linux. For now the only option is to use -SDL backend, which can indirectly support ALSA. SDL is used only in a very low-level manner; -to get sound samples to the sound card via callback, so there shouldn't be much in the way of -considering SDL a good option for "name your flavor" of Linux backend. - -Q : I would like to use my own memory management, how can I achieve this? -A : This header makes a couple uses of malloc/free, and malloc16/free16. Simply find these bits -and replace them with your own memory allocation routines. They can be wrapped up into a macro, -or call your own functions directly -- it's up to you. Generally these functions allocate fairly -large chunks of memory, and not very often (if at all), with one exception: tsSetPitch is a very -expensive routine and requires frequent dynamic memory management. 
-*/ - -/* -Some past discussion threads: -https://www.reddit.com/r/gamedev/comments/6i39j2/tinysound_the_cutest_library_to_get_audio_into/ -https://www.reddit.com/r/gamedev/comments/4ml6l9/tinysound_singlefile_c_audio_library/ -https://forums.tigsource.com/index.php?topic=58706.0 -*/ - -#if !defined( TINYSOUND_H ) - -#define TS_WINDOWS 1 -#define TS_MAC 2 -#define TS_UNIX 3 -#define TS_SDL 4 - -#if defined( _WIN32 ) -#define TS_PLATFORM TS_WINDOWS -#elif defined( __APPLE__ ) -#define TS_PLATFORM TS_MAC -#else -#define TS_PLATFORM TS_SDL - -// please note TS_UNIX is not directly supported -// instead, unix-style OSes are encouraged to use SDL -// see: https://www.libsdl.org/ - -#endif - -// Use TS_FORCE_SDL to override the above macros and use -// the SDL port. -#ifdef TS_FORCE_SDL - -#undef TS_PLATFORM -#define TS_PLATFORM TS_SDL - -#endif - -#include <stdint.h> - -// read this in the event of tsLoadWAV/tsLoadOGG errors -// also read this in the event of certain errors from tsMakeContext -extern const char* g_tsErrorReason; - -// stores a loaded sound in memory -typedef struct -{ - int sample_count; - int channel_count; - void* channels[2]; -} tsLoadedSound; - -struct tsPitchData; -typedef struct tsPitchData tsPitchData; - -// represents an instance of a tsLoadedSound, can be played through the tsContext -typedef struct tsPlayingSound -{ - int active; - int paused; - int looped; - float volume0; - float volume1; - float pan0; - float pan1; - float pitch; - tsPitchData* pitch_filter[2]; - int sample_index; - tsLoadedSound* loaded_sound; - struct tsPlayingSound* next; -} tsPlayingSound; - -// holds audio API info and other info -struct tsContext; -typedef struct tsContext tsContext; - -// The returned struct will contain a null pointer in tsLoadedSound::channels[ 0 ] -// in the case of errors. Read g_tsErrorReason string for details on what happened. -// Calls tsReadMemWAV internally. -tsLoadedSound tsLoadWAV(const char* path); - -// Reads a WAV file from memory. 
Still allocates memory for the tsLoadedSound since -// WAV format will interleave stereo, and we need separate data streams to do SIMD -// properly. -void tsReadMemWAV(const void* memory, tsLoadedSound* sound); - -// If stb_vorbis was included *before* tinysound go ahead and create -// some functions for dealing with OGG files. -#ifdef STB_VORBIS_INCLUDE_STB_VORBIS_H -void tsReadMemOGG(const void* memory, int length, int* sample_rate, tsLoadedSound* sound); -tsLoadedSound tsLoadOGG(const char* path, int* sample_rate); -#endif - -// Uses free16 (aligned free, implemented later in this file) to free up both of -// the channels stored within sound -void tsFreeSound(tsLoadedSound* sound); - -// Returns the size, in bytes, of all heap-allocated memory for this particular -// loaded sound -// NOTE(review): the implementation computes sample_count * channel_count * sizeof(uint16_t), -// while the loaders store samples as floats -- confirm which size is intended. -int tsSoundSize(tsLoadedSound* sound); - -// playing_pool_count -- 0 to set up low-level API, non-zero to size the internal -// memory pool for tsPlayingSound instances -tsContext* tsMakeContext(void* hwnd, unsigned play_frequency_in_Hz, int latency_factor_in_Hz, int num_buffered_seconds, int playing_pool_count); -void tsShutdownContext(tsContext* ctx); - -// Call tsSpawnMixThread once to set up a separate thread for the context to run -// upon. The separate thread will continually call tsMix and perform mixing -// operations. -void tsSpawnMixThread(tsContext* ctx); - -// Use tsThreadSleepDelay to specify a custom sleep delay time. -// A sleep will occur after each call to tsMix. By default YieldProcessor -// is used, and no sleep occurs. Use a sleep delay to conserve CPU bandwidth. -// A recommended sleep time is a little less than 1/2 your predicted 1/FPS. -// 60 fps is 16 ms, so about 1-5 should work well in most cases. -void tsThreadSleepDelay(tsContext* ctx, int milliseconds); - -// Call this manually, once per game tick recommended, if you haven't ever -// called tsSpawnMixThread. Otherwise the thread will call tsMix itself. -// num_samples_to_write is not used on Windows. 
On Mac it is used to push -// samples into a circular buffer while CoreAudio simultaneously pulls samples -// off of the buffer. num_samples_to_write should be computed each update tick -// as delta_time * play_frequency_in_Hz + 1. -// NOTE(review): tsMix as declared below takes only ctx; the num_samples_to_write -// text above does not match this signature -- confirm whether it is stale. -void tsMix(tsContext* ctx); - -// All of the functions in this next section should only be called if tsIsActive -// returns true. Calling them otherwise probably won't do anything bad, but it -// won't do anything at all. If a sound is active it resides in the context's -// internal list of playing sounds. -int tsIsActive(tsPlayingSound* sound); - -// Flags sound for removal. Upon next tsMix call will remove sound from playing -// list. If the high-level API is used, the sound is placed onto the internal free list. -void tsStopSound(tsPlayingSound* sound); - -// Store the given flag in the sound's looped / paused field respectively. -void tsLoopSound(tsPlayingSound* sound, int zero_for_no_loop); -void tsPauseSound(tsPlayingSound* sound, int one_for_paused); - -// lerp from 0 to 1, 0 full left, 1 full right -// (values outside [0, 1] are clamped to that range) -void tsSetPan(tsPlayingSound* sound, float pan); - -// explicitly set volume of each channel. Can be used as panning (but it's -// recommended to use the tsSetPan function for panning). -// Negative volumes are clamped to 0. -void tsSetVolume(tsPlayingSound* sound, float volume_left, float volume_right); - -// Change pitch (not duration) of sound. pitch = 0.5f for one octave lower, pitch = 2.0f for one octave higher. -// pitch at 1.0f applies no change. pitch settings farther away from 1.0f create more distortion and lower -// the output sample quality. pitch can be adjusted in real-time for doppler effects and the like. Going beyond -// 0.5f and 2.0f may require some tweaking of the pitch shifting parameters, and is not recommended. - -// Additional important information about performance: This function -// is quite expensive -- you have been warned! Try it out and be aware of how much CPU consumption it uses. -// To avoid destroying the originally loaded sound samples, tsSetPitch will do a one-time allocation to copy -// sound samples into a new buffer. 
The new buffer contains the pitch adjusted samples, and these will be played -// through tsMix. This lets the pitch be modulated at run-time, but requires dynamically allocated memory. The -// memory is freed once the sound finishes playing. If a one-time pitch adjustment is desired, for performance -// reasons please consider doing an off-line pitch adjustment manually as a pre-processing step for your sounds. -// Also, consider changing malloc16 and free16 to match your custom memory allocation needs. Try adjusting -// TS_PITCH_QUALITY (must be a power of two) and see how this affects your performance. -void tsSetPitch(tsPlayingSound* sound, float pitch); - -// Delays sound before actually playing it. Requires context to be passed in -// since there's a conversion from seconds to samples per second. -// If one were so inclined another version could be implemented like: -// void tsSetDelay( tsPlayingSound* sound, float delay, int samples_per_second ) -void tsSetDelay(tsContext* ctx, tsPlayingSound* sound, float delay_in_seconds); - -// Portable sleep function -void tsSleep(int milliseconds); - -// LOW-LEVEL API -tsPlayingSound tsMakePlayingSound(tsLoadedSound* loaded); -void tsInsertSound(tsContext* ctx, tsPlayingSound* sound); - -// HIGH-LEVEL API -typedef struct -{ - int paused; - int looped; - float volume_left; - float volume_right; - float pan; - float pitch; - float delay; - tsLoadedSound* loaded; -} tsPlaySoundDef; - -tsPlayingSound* tsPlaySound(tsContext* ctx, tsPlaySoundDef def); -tsPlaySoundDef tsMakeDef(tsLoadedSound* sound); -void tsStopAllSounds(tsContext* ctx); - -#define TINYSOUND_H -#endif - -#ifdef TS_IMPLEMENTATION - -#define _CRT_SECURE_NO_WARNINGS FUCK_YOU -#include <stdlib.h> // malloc, free -#include <stdio.h> // fopen, fclose -#include <string.h> // memcmp, memset, memcpy -#include <xmmintrin.h> -#include <emmintrin.h> - -#if TS_PLATFORM == TS_WINDOWS - -#include <dsound.h> -#undef PlaySound - -#if defined( _MSC_VER ) -#pragma comment( lib, 
"dsound.lib" ) -#endif - -#elif TS_PLATFORM == TS_MAC - -#include <CoreAudio/CoreAudio.h> -#include <AudioUnit/AudioUnit.h> -#include <pthread.h> -#include <mach/mach_time.h> - -#else - -#include "SDL2/SDL.h" - -#endif - -#define TS_CHECK( X, Y ) do { if ( !(X) ) { g_tsErrorReason = Y; goto ts_err; } } while ( 0 ) -#if TS_PLATFORM == TS_MAC && defined( __clang__ ) -#define TS_ASSERT_INTERNAL __builtin_trap( ) -#else -#define TS_ASSERT_INTERNAL *(int*)0 = 0 -#endif -#define TS_ASSERT( X ) do { if ( !(X) ) TS_ASSERT_INTERNAL; } while ( 0 ) -#define TS_ALIGN( X, Y ) ((((size_t)X) + ((Y) - 1)) & ~((Y) - 1)) -#define TS_TRUNC( X, Y ) ((size_t)(X) & ~((Y) - 1)) - -const char* g_tsErrorReason; - -static void* tsReadFileToMemory(const char* path, int* size) -{ - void* data = 0; - FILE* fp = fopen(path, "rb"); - int sizeNum = 0; - - if (fp) - { - fseek(fp, 0, SEEK_END); - sizeNum = (int)ftell(fp); - fseek(fp, 0, SEEK_SET); - data = malloc(sizeNum); - fread(data, sizeNum, 1, fp); - fclose(fp); - } - - if (size) *size = sizeNum; - return data; -} - -static int tsFourCC(const char* CC, void* memory) -{ - if (!memcmp(CC, memory, 4)) return 1; - return 0; -} - -static char* tsNext(char* data) -{ - uint32_t size = *(uint32_t*)(data + 4); - size = (size + 1) & ~1; - return data + 8 + size; -} - -static void* malloc16(size_t size) -{ - void* p = malloc(size + 16); - if (!p) return 0; - unsigned char offset = (size_t)p & 15; - p = (void*)TS_ALIGN(p + 1, 16); - *((char*)p - 1) = 16 - offset; - TS_ASSERT(!((size_t)p & 15)); - return p; -} - -static void free16(void* p) -{ - if (!p) return; - free((char*)p - (size_t)*((char*)p - 1)); -} - -static void tsLastElement(__m128* a, int i, int j, int16_t* samples, int offset) -{ - switch (offset) - { - case 1: - a[i] = _mm_set_ps(samples[j], 0.0f, 0.0f, 0.0f); - break; - - case 2: - a[i] = _mm_set_ps(samples[j], samples[j + 1], 0.0f, 0.0f); - break; - - case 3: - a[i] = _mm_set_ps(samples[j], samples[j + 1], samples[j + 2], 0.0f); - break; - 
- case 0: - a[i] = _mm_set_ps(samples[j], samples[j + 1], samples[j + 2], samples[j + 3]); - break; - } -} - -void tsReadMemWAV(const void* memory, tsLoadedSound* sound) -{ -#pragma pack( push, 1 ) - typedef struct - { - uint16_t wFormatTag; - uint16_t nChannels; - uint32_t nSamplesPerSec; - uint32_t nAvgBytesPerSec; - uint16_t nBlockAlign; - uint16_t wBitsPerSample; - uint16_t cbSize; - uint16_t wValidBitsPerSample; - uint32_t dwChannelMask; - uint8_t SubFormat[18]; - } Fmt; -#pragma pack( pop ) - - char* data = (char*)memory; - TS_CHECK(data, "Unable to read input file (file doesn't exist, or could not allocate heap memory."); - TS_CHECK(tsFourCC("RIFF", data), "Incorrect file header; is this a WAV file?"); - TS_CHECK(tsFourCC("WAVE", data + 8), "Incorrect file header; is this a WAV file?"); - - data += 12; - - TS_CHECK(tsFourCC("fmt ", data), "fmt chunk not found."); - Fmt fmt; - fmt = *(Fmt*)(data + 8); - TS_CHECK(fmt.wFormatTag == 1, "Only PCM WAV files are supported."); - TS_CHECK(fmt.nChannels == 1 || fmt.nChannels == 2, "Only mono or stereo supported (too many channels detected)."); - TS_CHECK(fmt.wBitsPerSample == 16, "Only 16 bits per sample supported."); - TS_CHECK(fmt.nBlockAlign == fmt.nChannels * 2, "implementation error"); - - data = tsNext(data); - TS_CHECK(tsFourCC("data", data), "data chunk not found."); - int sample_size = *((uint32_t*)(data + 4)); - int sample_count = sample_size / (fmt.nChannels * sizeof(uint16_t)); - sound->sample_count = sample_count; - sound->channel_count = fmt.nChannels; - - int wide_count = (int)TS_ALIGN(sample_count, 4); - wide_count /= 4; - int wide_offset = sample_count & 3; - int16_t* samples = (int16_t*)(data + 8); - float* sample = (float*)alloca(sizeof(float) * 4 + 16); - sample = (float*)TS_ALIGN(sample, 16); - - switch (sound->channel_count) - { - case 1: - { - sound->channels[0] = malloc16(wide_count * sizeof(__m128)); - sound->channels[1] = 0; - __m128* a = (__m128*)sound->channels[0]; - - for (int i = 0, j = 
0; i < wide_count - 1; ++i, j += 4) - { - sample[0] = (float)samples[j]; - sample[1] = (float)samples[j + 1]; - sample[2] = (float)samples[j + 2]; - sample[3] = (float)samples[j + 3]; - a[i] = _mm_load_ps(sample); - } - - tsLastElement(a, wide_count - 1, (wide_count - 1) * 4, samples, wide_offset); - } break; - - case 2: - { - __m128* a = (__m128*)malloc16(wide_count * sizeof(__m128) * 2); - __m128* b = a + wide_count; - - for (int i = 0, j = 0; i < wide_count - 1; ++i, j += 8) - { - sample[0] = (float)samples[j]; - sample[1] = (float)samples[j + 2]; - sample[2] = (float)samples[j + 4]; - sample[3] = (float)samples[j + 6]; - a[i] = _mm_load_ps(sample); - - sample[0] = (float)samples[j + 1]; - sample[1] = (float)samples[j + 3]; - sample[2] = (float)samples[j + 5]; - sample[3] = (float)samples[j + 7]; - b[i] = _mm_load_ps(sample); - } - - tsLastElement(a, wide_count - 1, (wide_count - 1) * 4, samples, wide_offset); - tsLastElement(b, wide_count - 1, (wide_count - 1) * 4 + 4, samples, wide_offset); - sound->channels[0] = a; - sound->channels[1] = b; - } break; - - default: - TS_CHECK(0, "unsupported channel count (only support mono and stereo)."); - } - - return; - -ts_err: - memset(&sound, 0, sizeof(sound)); -} - -tsLoadedSound tsLoadWAV(const char* path) -{ - tsLoadedSound sound = { 0 }; - char* wav = (char*)tsReadFileToMemory(path, 0); - tsReadMemWAV(wav, &sound); - free(wav); - return sound; -} - -// If stb_vorbis was included *before* tinysound go ahead and create -// some functions for dealing with OGG files. -#ifdef STB_VORBIS_INCLUDE_STB_VORBIS_H -void tsReadMemOGG(const void* memory, int length, int* sample_rate, tsLoadedSound* sound) -{ - int16_t* samples = 0; - int channel_count; - int sample_count = stb_vorbis_decode_memory((const unsigned char*)memory, length, &channel_count, sample_rate, &samples); - - TS_CHECK(sample_count > 0, "stb_vorbis_decode_memory failed. 
Make sure your file exists and is a valid OGG file."); - - int wide_count = (int)TS_ALIGN(sample_count, 4) / 4; - int wide_offset = sample_count & 3; - float* sample = (float*)alloca(sizeof(float) * 4 + 16); - sample = (float*)TS_ALIGN(sample, 16); - __m128* a; - __m128* b; - - switch (channel_count) - { - case 1: - { - a = (__m128*)malloc16(wide_count * sizeof(__m128)); - b = 0; - - for (int i = 0, j = 0; i < wide_count - 1; ++i, j += 4) - { - sample[0] = (float)samples[j]; - sample[1] = (float)samples[j + 1]; - sample[2] = (float)samples[j + 2]; - sample[3] = (float)samples[j + 3]; - a[i] = _mm_load_ps(sample); - } - - tsLastElement(a, wide_count - 1, (wide_count - 1) * 4, samples, wide_offset); - } break; - - case 2: - a = (__m128*)malloc16(wide_count * sizeof(__m128) * 2); - b = a + wide_count; - - for (int i = 0, j = 0; i < wide_count - 1; ++i, j += 8) - { - sample[0] = (float)samples[j]; - sample[1] = (float)samples[j + 2]; - sample[2] = (float)samples[j + 4]; - sample[3] = (float)samples[j + 6]; - a[i] = _mm_load_ps(sample); - - sample[0] = (float)samples[j + 1]; - sample[1] = (float)samples[j + 3]; - sample[2] = (float)samples[j + 5]; - sample[3] = (float)samples[j + 7]; - b[i] = _mm_load_ps(sample); - } - - tsLastElement(a, wide_count - 1, (wide_count - 1) * 4, samples, wide_offset); - tsLastElement(b, wide_count - 1, (wide_count - 1) * 4 + 4, samples, wide_offset); - break; - - default: - TS_CHECK(0, "Unsupported channel count."); - } - - sound->sample_count = sample_count; - sound->channel_count = channel_count; - sound->channels[0] = a; - sound->channels[1] = b; - free(samples); - return; - -ts_err: - free(samples); - memset(sound, 0, sizeof(tsLoadedSound)); -} - -tsLoadedSound tsLoadOGG(const char* path, int* sample_rate) -{ - int length; - void* memory = tsReadFileToMemory(path, &length); - tsLoadedSound sound; - tsReadMemOGG(memory, length, sample_rate, &sound); - free(memory); - - return sound; -} -#endif - -void tsFreeSound(tsLoadedSound* sound) -{ 
- free16(sound->channels[0]); - memset(sound, 0, sizeof(tsLoadedSound)); -} - -int tsSoundSize(tsLoadedSound* sound) -{ - return sound->sample_count * sound->channel_count * sizeof(uint16_t); -} - -tsPlayingSound tsMakePlayingSound(tsLoadedSound* loaded) -{ - tsPlayingSound playing; - playing.active = 0; - playing.paused = 0; - playing.looped = 0; - playing.volume0 = 1.0f; - playing.volume1 = 1.0f; - playing.pan0 = 0.5f; - playing.pan1 = 0.5f; - playing.pitch = 1.0f; - playing.pitch_filter[0] = 0; - playing.pitch_filter[1] = 0; - playing.sample_index = 0; - playing.loaded_sound = loaded; - playing.next = 0; - return playing; -} - -int tsIsActive(tsPlayingSound* sound) -{ - return sound->active; -} - -void tsStopSound(tsPlayingSound* sound) -{ - sound->active = 0; -} - -void tsLoopSound(tsPlayingSound* sound, int zero_for_no_loop) -{ - sound->looped = zero_for_no_loop; -} - -void tsPauseSound(tsPlayingSound* sound, int one_for_paused) -{ - sound->paused = one_for_paused; -} - -void tsSetPan(tsPlayingSound* sound, float pan) -{ - if (pan > 1.0f) pan = 1.0f; - else if (pan < 0.0f) pan = 0.0f; - float left = 1.0f - pan; - float right = pan; - sound->pan0 = left; - sound->pan1 = right; -} - -void tsSetPitch(tsPlayingSound* sound, float pitch) -{ - sound->pitch = pitch; -} - -void tsSetVolume(tsPlayingSound* sound, float volume_left, float volume_right) -{ - if (volume_left < 0.0f) volume_left = 0.0f; - if (volume_right < 0.0f) volume_right = 0.0f; - sound->volume0 = volume_left; - sound->volume1 = volume_right; -} - -static void tsRemoveFilter(tsPlayingSound* playing); - -#if TS_PLATFORM == TS_WINDOWS - -void tsSleep(int milliseconds) -{ - Sleep(milliseconds); -} - -struct tsContext -{ - unsigned latency_samples; - unsigned running_index; - int Hz; - int bps; - int buffer_size; - int wide_count; - tsPlayingSound* playing; - __m128* floatA; - __m128* floatB; - __m128i* samples; - tsPlayingSound* playing_pool; - tsPlayingSound* playing_free; - - // platform specific stuff 
- LPDIRECTSOUND dsound; - LPDIRECTSOUNDBUFFER buffer; - LPDIRECTSOUNDBUFFER primary; - - // data for tsMix thread, enable these with tsSpawnMixThread - CRITICAL_SECTION critical_section; - int separate_thread; - int running; - int sleep_milliseconds; -}; - -static void tsReleaseContext(tsContext* ctx) -{ - if (ctx->separate_thread) DeleteCriticalSection(&ctx->critical_section); -#ifdef __cplusplus - ctx->buffer->Release(); - ctx->primary->Release(); - ctx->dsound->Release(); -#else - ctx->buffer->lpVtbl->Release(ctx->buffer); - ctx->primary->lpVtbl->Release(ctx->primary); - ctx->dsound->lpVtbl->Release(ctx->dsound); -#endif - tsPlayingSound* playing = ctx->playing; - while (playing) - { - tsRemoveFilter(playing); - playing = playing->next; - } - free(ctx); -} - -static DWORD WINAPI tsCtxThread(LPVOID lpParameter) -{ - tsContext* ctx = (tsContext*)lpParameter; - - while (ctx->running) - { - tsMix(ctx); - if (ctx->sleep_milliseconds) tsSleep(ctx->sleep_milliseconds); - else YieldProcessor(); - } - - ctx->separate_thread = 0; - return 0; -} - -static void tsLock(tsContext* ctx) -{ - if (ctx->separate_thread) EnterCriticalSection(&ctx->critical_section); -} - -static void tsUnlock(tsContext* ctx) -{ - if (ctx->separate_thread) LeaveCriticalSection(&ctx->critical_section); -} - -tsContext* tsMakeContext(void* hwnd, unsigned play_frequency_in_Hz, int latency_factor_in_Hz, int num_buffered_seconds, int playing_pool_count) -{ - int bps = sizeof(INT16) * 2; - int buffer_size = play_frequency_in_Hz * bps * num_buffered_seconds; - tsContext* ctx = 0; - WAVEFORMATEX format = { 0 }; - DSBUFFERDESC bufdesc = { 0 }; - LPDIRECTSOUND dsound; - - TS_CHECK(hwnd, "Invalid hwnd passed to tsMakeContext."); - - HRESULT res = DirectSoundCreate(0, &dsound, 0); - TS_CHECK(res == DS_OK, "DirectSoundCreate failed"); -#ifdef __cplusplus - dsound->SetCooperativeLevel((HWND)hwnd, DSSCL_PRIORITY); -#else - dsound->lpVtbl->SetCooperativeLevel(dsound, (HWND)hwnd, DSSCL_PRIORITY); -#endif - 
bufdesc.dwSize = sizeof(bufdesc); - bufdesc.dwFlags = DSBCAPS_PRIMARYBUFFER; - - LPDIRECTSOUNDBUFFER primary_buffer; -#ifdef __cplusplus - res = dsound->CreateSoundBuffer(&bufdesc, &primary_buffer, 0); -#else - res = dsound->lpVtbl->CreateSoundBuffer(dsound, &bufdesc, &primary_buffer, 0); -#endif - TS_CHECK(res == DS_OK, "Failed to create primary sound buffer"); - - format.wFormatTag = WAVE_FORMAT_PCM; - format.nChannels = 2; - format.nSamplesPerSec = play_frequency_in_Hz; - format.wBitsPerSample = 16; - format.nBlockAlign = (format.nChannels * format.wBitsPerSample) / 8; - format.nAvgBytesPerSec = format.nSamplesPerSec * format.nBlockAlign; - format.cbSize = 0; -#ifdef __cplusplus - res = primary_buffer->SetFormat(&format); -#else - res = primary_buffer->lpVtbl->SetFormat(primary_buffer, &format); -#endif - TS_CHECK(res == DS_OK, "Failed to set format on primary buffer"); - - LPDIRECTSOUNDBUFFER secondary_buffer; - bufdesc.dwSize = sizeof(bufdesc); - bufdesc.dwFlags = 0; - bufdesc.dwBufferBytes = buffer_size; - bufdesc.lpwfxFormat = &format; -#ifdef __cplusplus - res = dsound->CreateSoundBuffer(&bufdesc, &secondary_buffer, 0); -#else - res = dsound->lpVtbl->CreateSoundBuffer(dsound, &bufdesc, &secondary_buffer, 0); -#endif - TS_CHECK(res == DS_OK, "Failed to set format on secondary buffer"); - - int sample_count = play_frequency_in_Hz * num_buffered_seconds; - int wide_count = (int)TS_ALIGN(sample_count, 4); - int pool_size = playing_pool_count * sizeof(tsPlayingSound); - int mix_buffers_size = sizeof(__m128) * wide_count * 2; - int sample_buffer_size = sizeof(__m128i) * wide_count; - ctx = (tsContext*)malloc(sizeof(tsContext) + mix_buffers_size + sample_buffer_size + 16 + pool_size); - ctx->latency_samples = (unsigned)TS_ALIGN(play_frequency_in_Hz / latency_factor_in_Hz, 4); - ctx->running_index = 0; - ctx->Hz = play_frequency_in_Hz; - ctx->bps = bps; - ctx->buffer_size = buffer_size; - ctx->wide_count = wide_count; - ctx->dsound = dsound; - ctx->buffer = 
secondary_buffer; - ctx->primary = primary_buffer; - ctx->playing = 0; - ctx->floatA = (__m128*)(ctx + 1); - ctx->floatA = (__m128*)TS_ALIGN(ctx->floatA, 16); - TS_ASSERT(!((size_t)ctx->floatA & 15)); - ctx->floatB = ctx->floatA + wide_count; - ctx->samples = (__m128i*)ctx->floatB + wide_count; - ctx->running = 1; - ctx->separate_thread = 0; - ctx->sleep_milliseconds = 0; - - if (playing_pool_count) - { - ctx->playing_pool = (tsPlayingSound*)(ctx->samples + wide_count); - for (int i = 0; i < playing_pool_count - 1; ++i) - ctx->playing_pool[i].next = ctx->playing_pool + i + 1; - ctx->playing_pool[playing_pool_count - 1].next = 0; - ctx->playing_free = ctx->playing_pool; - } - - else - { - ctx->playing_pool = 0; - ctx->playing_free = 0; - } - - return ctx; - -ts_err: - free(ctx); - return 0; -} - -void tsSpawnMixThread(tsContext* ctx) -{ - if (ctx->separate_thread) return; - InitializeCriticalSectionAndSpinCount(&ctx->critical_section, 0x00000400); - ctx->separate_thread = 1; - CreateThread(0, 0, tsCtxThread, ctx, 0, 0); -} - -#elif TS_PLATFORM == TS_MAC - -void tsSleep(int milliseconds) -{ - usleep(milliseconds * 1000); -} - -struct tsContext -{ - unsigned latency_samples; - unsigned index0; // read - unsigned index1; // write - int Hz; - int bps; - int wide_count; - int sample_count; - tsPlayingSound* playing; - __m128* floatA; - __m128* floatB; - __m128i* samples; - tsPlayingSound* playing_pool; - tsPlayingSound* playing_free; - - // platform specific stuff - AudioComponentInstance inst; - - // data for tsMix thread, enable these with tsSpawnMixThread - pthread_t thread; - pthread_mutex_t mutex; - int separate_thread; - int running; - int sleep_milliseconds; -}; - -static void tsReleaseContext(tsContext* ctx) -{ - if (ctx->separate_thread) pthread_mutex_destroy(&ctx->mutex); - AudioOutputUnitStop(ctx->inst); - AudioUnitUninitialize(ctx->inst); - AudioComponentInstanceDispose(ctx->inst); - tsPlayingSound* playing = ctx->playing; - while (playing) - { - 
tsRemoveFilter(playing); - playing = playing->next; - } - free(ctx); -} - -static void* tsCtxThread(void* udata) -{ - tsContext* ctx = (tsContext*)udata; - - while (ctx->running) - { - tsMix(ctx); - if (ctx->sleep_milliseconds) tsSleep(ctx->sleep_milliseconds); - else pthread_yield_np(); - } - - ctx->separate_thread = 0; - pthread_exit(0); - return 0; -} - -static void tsLock(tsContext* ctx) -{ - if (ctx->separate_thread) pthread_mutex_lock(&ctx->mutex); -} - -static void tsUnlock(tsContext* ctx) -{ - if (ctx->separate_thread) pthread_mutex_unlock(&ctx->mutex); -} - -static OSStatus tsMemcpyToCA(void* udata, AudioUnitRenderActionFlags* ioActionFlags, const AudioTimeStamp* inTimeStamp, UInt32 inBusNumber, UInt32 inNumberFrames, AudioBufferList* ioData); - -tsContext* tsMakeContext(void* unused, unsigned play_frequency_in_Hz, int latency_factor_in_Hz, int num_buffered_seconds, int playing_pool_count) -{ - int bps = sizeof(uint16_t) * 2; - - AudioComponentDescription comp_desc = { 0 }; - comp_desc.componentType = kAudioUnitType_Output; - comp_desc.componentSubType = kAudioUnitSubType_DefaultOutput; - comp_desc.componentFlags = 0; - comp_desc.componentFlagsMask = 0; - comp_desc.componentManufacturer = kAudioUnitManufacturer_Apple; - - AudioComponent comp = AudioComponentFindNext(NULL, &comp_desc); - if (!comp) - { - g_tsErrorReason = "Failed to create output unit from AudioComponentFindNext."; - return 0; - } - - AudioStreamBasicDescription stream_desc = { 0 }; - stream_desc.mSampleRate = (double)play_frequency_in_Hz; - stream_desc.mFormatID = kAudioFormatLinearPCM; - stream_desc.mFormatFlags = kAudioFormatFlagIsSignedInteger | kAudioFormatFlagsNativeEndian | kAudioFormatFlagIsPacked; - stream_desc.mFramesPerPacket = 1; - stream_desc.mChannelsPerFrame = 2; - stream_desc.mBitsPerChannel = sizeof(uint16_t) * 8; - stream_desc.mBytesPerPacket = bps; - stream_desc.mBytesPerFrame = bps; - stream_desc.mReserved = 0; - - AudioComponentInstance inst; - OSStatus ret; - 
AURenderCallbackStruct input; - - ret = AudioComponentInstanceNew(comp, &inst); - - int sample_count = play_frequency_in_Hz * num_buffered_seconds; - int latency_count = (unsigned)TS_ALIGN(play_frequency_in_Hz / latency_factor_in_Hz, 4); - TS_ASSERT(sample_count > latency_count); - int wide_count = (int)TS_ALIGN(sample_count, 4) / 4; - int pool_size = playing_pool_count * sizeof(tsPlayingSound); - int mix_buffers_size = sizeof(__m128) * wide_count * 2; - int sample_buffer_size = sizeof(__m128i) * wide_count; - tsContext* ctx = (tsContext*)malloc(sizeof(tsContext) + mix_buffers_size + sample_buffer_size + 16 + pool_size); - TS_CHECK(ret == noErr, "AudioComponentInstanceNew failed"); - ctx->latency_samples = latency_count; - ctx->index0 = 0; - ctx->index1 = 0; - ctx->Hz = play_frequency_in_Hz; - ctx->bps = bps; - ctx->wide_count = wide_count; - ctx->sample_count = wide_count * 4; - ctx->inst = inst; - ctx->playing = 0; - ctx->floatA = (__m128*)(ctx + 1); - ctx->floatA = (__m128*)TS_ALIGN(ctx->floatA, 16); - TS_ASSERT(!((size_t)ctx->floatA & 15)); - ctx->floatB = ctx->floatA + wide_count; - ctx->samples = (__m128i*)ctx->floatB + wide_count; - ctx->running = 1; - ctx->separate_thread = 0; - ctx->sleep_milliseconds = 0; - - ret = AudioUnitSetProperty(inst, kAudioUnitProperty_StreamFormat, kAudioUnitScope_Input, 0, &stream_desc, sizeof(stream_desc)); - TS_CHECK(ret == noErr, "Failed to set stream forat"); - - input.inputProc = tsMemcpyToCA; - input.inputProcRefCon = ctx; - ret = AudioUnitSetProperty(inst, kAudioUnitProperty_SetRenderCallback, kAudioUnitScope_Input, 0, &input, sizeof(input)); - TS_CHECK(ret == noErr, "AudioUnitSetProperty failed"); - - ret = AudioUnitInitialize(inst); - TS_CHECK(ret == noErr, "Couldn't initialize output unit"); - - ret = AudioOutputUnitStart(inst); - TS_CHECK(ret == noErr, "Couldn't start output unit"); - - if (playing_pool_count) - { - ctx->playing_pool = (tsPlayingSound*)(ctx->samples + wide_count); - for (int i = 0; i < 
playing_pool_count - 1; ++i) - ctx->playing_pool[i].next = ctx->playing_pool + i + 1; - ctx->playing_pool[playing_pool_count - 1].next = 0; - ctx->playing_free = ctx->playing_pool; - } - - else - { - ctx->playing_pool = 0; - ctx->playing_free = 0; - } - - return ctx; - -ts_err: - free(ctx); - return 0; -} - -void tsSpawnMixThread(tsContext* ctx) -{ - if (ctx->separate_thread) return; - pthread_mutex_init(&ctx->mutex, 0); - ctx->separate_thread = 1; - pthread_create(&ctx->thread, 0, tsCtxThread, ctx); -} - -#else - -void tsSleep(int milliseconds) -{ - SDL_Delay(milliseconds); -} - -struct tsContext -{ - unsigned latency_samples; - unsigned index0; // read - unsigned index1; // write - unsigned running_index; - int Hz; - int bps; - int buffer_size; - int wide_count; - int sample_count; - tsPlayingSound* playing; - __m128* floatA; - __m128* floatB; - __m128i* samples; - tsPlayingSound* playing_pool; - tsPlayingSound* playing_free; - - // data for tsMix thread, enable these with tsSpawnMixThread - SDL_Thread* thread; - SDL_mutex* mutex; - int separate_thread; - int running; - int sleep_milliseconds; -}; - -static void tsReleaseContext(tsContext* ctx) -{ - if (ctx->separate_thread) SDL_DestroyMutex(ctx->mutex); - tsPlayingSound* playing = ctx->playing; - while (playing) - { - tsRemoveFilter(playing); - playing = playing->next; - } - SDL_CloseAudio(); - free(ctx); -} - -int tsCtxThread(void* udata) -{ - tsContext* ctx = (tsContext*)udata; - - while (ctx->running) - { - tsMix(ctx); - if (ctx->sleep_milliseconds) tsSleep(ctx->sleep_milliseconds); - else tsSleep(1); - } - - ctx->separate_thread = 0; - return 0; -} - -static void tsLock(tsContext* ctx) -{ - if (ctx->separate_thread) SDL_LockMutex(ctx->mutex); -} - -static void tsUnlock(tsContext* ctx) -{ - if (ctx->separate_thread) SDL_UnlockMutex(ctx->mutex); -} - -void tsSDL_AudioCallback(void* udata, Uint8* stream, int len); - -tsContext* tsMakeContext(void* unused, unsigned play_frequency_in_Hz, int latency_factor_in_Hz, 
int num_buffered_seconds, int playing_pool_count) -{ - (void)unused; - int bps = sizeof(uint16_t) * 2; - int sample_count = play_frequency_in_Hz * num_buffered_seconds; - int latency_count = (unsigned)TS_ALIGN(play_frequency_in_Hz / latency_factor_in_Hz, 4); - TS_ASSERT(sample_count > latency_count); - int wide_count = (int)TS_ALIGN(sample_count, 4) / 4; - int pool_size = playing_pool_count * sizeof(tsPlayingSound); - int mix_buffers_size = sizeof(__m128) * wide_count * 2; - int sample_buffer_size = sizeof(__m128i) * wide_count; - tsContext* ctx = 0; - SDL_AudioSpec wanted; - int ret = SDL_Init(SDL_INIT_AUDIO); - TS_CHECK(ret >= 0, "Can't init SDL audio"); - - ctx = (tsContext*)malloc(sizeof(tsContext) + mix_buffers_size + sample_buffer_size + 16 + pool_size); - TS_CHECK(ctx != NULL, "Can't create audio context"); - ctx->latency_samples = latency_count; - ctx->index0 = 0; - ctx->index1 = 0; - ctx->Hz = play_frequency_in_Hz; - ctx->bps = bps; - ctx->wide_count = wide_count; - ctx->sample_count = wide_count * 4; - ctx->playing = 0; - ctx->floatA = (__m128*)(ctx + 1); - ctx->floatA = (__m128*)TS_ALIGN(ctx->floatA, 16); - TS_ASSERT(!((size_t)ctx->floatA & 15)); - ctx->floatB = ctx->floatA + wide_count; - ctx->samples = (__m128i*)ctx->floatB + wide_count; - ctx->running = 1; - ctx->separate_thread = 0; - ctx->sleep_milliseconds = 0; - - SDL_memset(&wanted, 0, sizeof(wanted)); - wanted.freq = play_frequency_in_Hz; - wanted.format = AUDIO_S16SYS; - wanted.channels = 2; /* 1 = mono, 2 = stereo */ - wanted.samples = 1024; - wanted.callback = tsSDL_AudioCallback; - wanted.userdata = ctx; - ret = SDL_OpenAudio(&wanted, NULL); - TS_CHECK(ret >= 0, "Can't open SDL audio"); - SDL_PauseAudio(0); - - if (playing_pool_count) - { - ctx->playing_pool = (tsPlayingSound*)(ctx->samples + wide_count); - for (int i = 0; i < playing_pool_count - 1; ++i) - ctx->playing_pool[i].next = ctx->playing_pool + i + 1; - ctx->playing_pool[playing_pool_count - 1].next = 0; - ctx->playing_free = 
ctx->playing_pool; - } - - else - { - ctx->playing_pool = 0; - ctx->playing_free = 0; - } - - return ctx; - -ts_err: - if (ctx) free(ctx); - return 0; -} - -void tsSpawnMixThread(tsContext* ctx) -{ - if (ctx->separate_thread) return; - ctx->mutex = SDL_CreateMutex(); - ctx->separate_thread = 1; - ctx->thread = SDL_CreateThread(&tsCtxThread, "TinySoundThread", ctx); -} - -#endif - -#if TS_PLATFORM == TS_SDL || TS_PLATFORM == TS_MAC - -static int tsSamplesWritten(tsContext* ctx) -{ - int index0 = ctx->index0; - int index1 = ctx->index1; - if (index0 <= index1) return index1 - index0; - else return ctx->sample_count - index0 + index1; -} - -static int tsSamplesUnwritten(tsContext* ctx) -{ - int index0 = ctx->index0; - int index1 = ctx->index1; - if (index0 <= index1) return ctx->sample_count - index1 + index0; - else return index0 - index1; -} - -static int tsSamplesToMix(tsContext* ctx) -{ - int lat = ctx->latency_samples; - int written = tsSamplesWritten(ctx); - int dif = lat - written; - TS_ASSERT(dif >= 0); - if (dif) - { - int unwritten = tsSamplesUnwritten(ctx); - return dif < unwritten ? 
dif : unwritten; - } - return 0; -} - -#define TS_SAMPLES_TO_BYTES( interleaved_sample_count ) ((interleaved_sample_count) * ctx->bps) -#define TS_BYTES_TO_SAMPLES( byte_count ) ((byte_count) / ctx->bps) - -static void tsPushBytes(tsContext* ctx, void* data, int size) -{ - int index0 = ctx->index0; - int index1 = ctx->index1; - int samples = TS_BYTES_TO_SAMPLES(size); - int sample_count = ctx->sample_count; - - int unwritten = tsSamplesUnwritten(ctx); - if (unwritten < samples) samples = unwritten; - int can_overflow = index0 <= index1; - int would_overflow = index1 + samples > sample_count; - - if (can_overflow && would_overflow) - { - int first_size = TS_SAMPLES_TO_BYTES(sample_count - index1); - int second_size = size - first_size; - memcpy((char*)ctx->samples + TS_SAMPLES_TO_BYTES(index1), data, first_size); - memcpy(ctx->samples, (char*)data + first_size, second_size); - ctx->index1 = TS_BYTES_TO_SAMPLES(second_size); - } - - else - { - memcpy((char*)ctx->samples + TS_SAMPLES_TO_BYTES(index1), data, size); - ctx->index1 += TS_BYTES_TO_SAMPLES(size); - } -} - -static int tsPullBytes(tsContext* ctx, void* dst, int size) -{ - int index0 = ctx->index0; - int index1 = ctx->index1; - int allowed_size = TS_SAMPLES_TO_BYTES(tsSamplesWritten(ctx)); - int zeros = 0; - - if (allowed_size < size) - { - zeros = size - allowed_size; - size = allowed_size; - } - - if (index1 >= index0) - { - memcpy(dst, ((char*)ctx->samples) + TS_SAMPLES_TO_BYTES(index0), size); - ctx->index0 += TS_BYTES_TO_SAMPLES(size); - } - - else - { - int first_size = TS_SAMPLES_TO_BYTES(ctx->sample_count) - TS_SAMPLES_TO_BYTES(index0); - if (first_size > size) first_size = size; - int second_size = size - first_size; - memcpy(dst, ((char*)ctx->samples) + TS_SAMPLES_TO_BYTES(index0), first_size); - memcpy(((char*)dst) + first_size, ctx->samples, second_size); - if (second_size) ctx->index0 = TS_BYTES_TO_SAMPLES(second_size); - else ctx->index0 += TS_BYTES_TO_SAMPLES(first_size); - } - - return zeros; 
-} - -#endif - -void tsShutdownContext(tsContext* ctx) -{ - if (ctx->separate_thread) - { - tsLock(ctx); - ctx->running = 0; - tsUnlock(ctx); - } - - while (ctx->separate_thread) tsSleep(1); - tsReleaseContext(ctx); -} - -void tsThreadSleepDelay(tsContext* ctx, int milliseconds) -{ - ctx->sleep_milliseconds = milliseconds; -} - -void tsInsertSound(tsContext* ctx, tsPlayingSound* sound) -{ - // Cannot use tsPlayingSound if tsMakeContext was passed non-zero for playing_pool_count - // since non-zero playing_pool_count means the context is doing some memory-management - // for a playing sound pool. InsertSound assumes the pool does not exist, and is apart - // of the lower-level API (see top of this header for documentation details). - TS_ASSERT(ctx->playing_pool == 0); - - if (sound->active) return; - tsLock(ctx); - sound->next = ctx->playing; - ctx->playing = sound; - sound->active = 1; - tsUnlock(ctx); -} - -// NOTE: does not allow delay_in_seconds to be negative (clamps at 0) -void tsSetDelay(tsContext* ctx, tsPlayingSound* sound, float delay_in_seconds) -{ - if (delay_in_seconds < 0.0f) delay_in_seconds = 0.0f; - sound->sample_index = (int)(delay_in_seconds * (float)ctx->Hz); - sound->sample_index = -(int)TS_ALIGN(sound->sample_index, 4); -} - -tsPlaySoundDef tsMakeDef(tsLoadedSound* sound) -{ - tsPlaySoundDef def; - def.paused = 0; - def.looped = 0; - def.volume_left = 1.0f; - def.volume_right = 1.0f; - def.pan = 0.5f; - def.pitch = 1.0f; - def.delay = 0.0f; - def.loaded = sound; - return def; -} - -tsPlayingSound* tsPlaySound(tsContext* ctx, tsPlaySoundDef def) -{ - tsLock(ctx); - - tsPlayingSound* playing = ctx->playing_free; - if (!playing) return 0; - ctx->playing_free = playing->next; - *playing = tsMakePlayingSound(def.loaded); - playing->active = 1; - playing->paused = def.paused; - playing->looped = def.looped; - tsSetVolume(playing, def.volume_left, def.volume_right); - tsSetPan(playing, def.pan); - tsSetPitch(playing, def.pitch); - tsSetDelay(ctx, 
playing, def.delay); - playing->next = ctx->playing; - ctx->playing = playing; - - tsUnlock(ctx); - - return playing; -} - -void tsStopAllSounds(tsContext* ctx) -{ - // This is apart of the high level API, not the low level API. - // If using the low level API you must write your own function to - // stop playing all sounds. - TS_ASSERT(ctx->playing_pool == 0); - - tsPlayingSound* sound = ctx->playing; - ctx->playing = 0; - - while (sound) - { - tsPlayingSound* next = sound->next; - sound->next = ctx->playing_free; - ctx->playing_free = sound; - sound = next; - } -} - -#if TS_PLATFORM == TS_WINDOWS - -static void tsPosition(tsContext* ctx, int* byte_to_lock, int* bytes_to_write) -{ - // compute bytes to be written to direct sound - DWORD play_cursor; - DWORD write_cursor; -#ifdef __cplusplus - HRESULT hr = ctx->buffer->GetCurrentPosition(&play_cursor, &write_cursor); -#else - HRESULT hr = ctx->buffer->lpVtbl->GetCurrentPosition(ctx->buffer, &play_cursor, &write_cursor); -#endif - TS_ASSERT(hr == DS_OK); - - DWORD lock = (ctx->running_index * ctx->bps) % ctx->buffer_size; - DWORD target_cursor = (write_cursor + ctx->latency_samples * ctx->bps) % ctx->buffer_size; - target_cursor = (DWORD)TS_ALIGN(target_cursor, 16); - DWORD write; - - if (lock > target_cursor) - { - write = (ctx->buffer_size - lock) + target_cursor; - } - - else - { - write = target_cursor - lock; - } - - *byte_to_lock = lock; - *bytes_to_write = write; -} - -static void tsMemcpyToDS(tsContext* ctx, int16_t* samples, int byte_to_lock, int bytes_to_write) -{ - // copy mixer buffers to direct sound - void* region1; - DWORD size1; - void* region2; - DWORD size2; -#ifdef __cplusplus - HRESULT hr = ctx->buffer->Lock(byte_to_lock, bytes_to_write, ®ion1, &size1, ®ion2, &size2, 0); - - if (hr == DSERR_BUFFERLOST) - { - ctx->buffer->Restore(); - hr = ctx->buffer->Lock(byte_to_lock, bytes_to_write, ®ion1, &size1, ®ion2, &size2, 0); - } -#else - HRESULT hr = ctx->buffer->lpVtbl->Lock(ctx->buffer, byte_to_lock, 
bytes_to_write, ®ion1, &size1, ®ion2, &size2, 0); - - if (hr == DSERR_BUFFERLOST) - { - ctx->buffer->lpVtbl->Restore(ctx->buffer); - hr = ctx->buffer->lpVtbl->Lock(ctx->buffer, byte_to_lock, bytes_to_write, ®ion1, &size1, ®ion2, &size2, 0); - } -#endif - - if (!SUCCEEDED(hr)) - return; - - unsigned running_index = ctx->running_index; - INT16* sample1 = (INT16*)region1; - DWORD sample1_count = size1 / ctx->bps; - memcpy(sample1, samples, sample1_count * sizeof(INT16) * 2); - samples += sample1_count * 2; - running_index += sample1_count; - - INT16* sample2 = (INT16*)region2; - DWORD sample2_count = size2 / ctx->bps; - memcpy(sample2, samples, sample2_count * sizeof(INT16) * 2); - samples += sample2_count * 2; - running_index += sample2_count; - -#ifdef __cplusplus - ctx->buffer->Unlock(region1, size1, region2, size2); -#else - ctx->buffer->lpVtbl->Unlock(ctx->buffer, region1, size1, region2, size2); -#endif - ctx->running_index = running_index; - - // meager hack to fill out sound buffer before playing - static int first; - if (!first) - { -#ifdef __cplusplus - ctx->buffer->Play(0, 0, DSBPLAY_LOOPING); -#else - ctx->buffer->lpVtbl->Play(ctx->buffer, 0, 0, DSBPLAY_LOOPING); -#endif - first = 1; - } -} - -#elif TS_PLATFORM == TS_MAC - -static OSStatus tsMemcpyToCA(void* udata, AudioUnitRenderActionFlags* ioActionFlags, const AudioTimeStamp* inTimeStamp, UInt32 inBusNumber, UInt32 inNumberFrames, AudioBufferList* ioData) -{ - tsContext* ctx = (tsContext*)udata; - int bps = ctx->bps; - int samples_requested_to_consume = inNumberFrames; - AudioBuffer* buffer = ioData->mBuffers; - - TS_ASSERT(ioData->mNumberBuffers == 1); - TS_ASSERT(buffer->mNumberChannels == 2); - int byte_size = buffer->mDataByteSize; - TS_ASSERT(byte_size == samples_requested_to_consume * bps); - - int zero_bytes = tsPullBytes(ctx, buffer->mData, byte_size); - memset(((char*)buffer->mData) + (byte_size - zero_bytes), 0, zero_bytes); - - return noErr; -} - -#elif TS_PLATFORM == TS_SDL - -static void 
tsSDL_AudioCallback(void* udata, Uint8* stream, int len) -{ - tsContext* ctx = (tsContext*)udata; - int zero_bytes = tsPullBytes(ctx, stream, len); - memset(stream + (len - zero_bytes), 0, zero_bytes); -} - -#endif - -static void tsPitchShift(float pitchShift, int num_samples_to_process, float sampleRate, float* indata, tsPitchData** pitch_filter); - -// Pitch processing tunables -#define TS_MAX_FRAME_LENGTH 4096 -#define TS_PITCH_FRAME_SIZE 512 -#define TS_PITCH_QUALITY 8 - -// interals -#define TS_STEPSIZE (TS_PITCH_FRAME_SIZE / TS_PITCH_QUALITY) -#define TS_OVERLAP (TS_PITCH_FRAME_SIZE - TS_STEPSIZE) -#define TS_EXPECTED_FREQUENCY (2.0f * 3.14159265359f * (float)TS_STEPSIZE / (float)TS_PITCH_FRAME_SIZE) - -// TODO: -// Use a memory pool for these things. For now they are just malloc16'd/free16'd -// Not high priority to use a pool, since pitch shifting is already really expensive, -// and cost of malloc is dwarfed. But would be a nice-to-have for potential memory -// fragmentation issues. 
-typedef struct tsPitchData -{ - float pitch_shifted_output_samples[TS_MAX_FRAME_LENGTH]; - float in_FIFO[TS_STEPSIZE + TS_PITCH_FRAME_SIZE]; - float out_FIFO[TS_STEPSIZE + TS_PITCH_FRAME_SIZE]; - float fft_data[2 * TS_PITCH_FRAME_SIZE]; - float previous_phase[TS_PITCH_FRAME_SIZE / 2 + 4]; - float sum_phase[TS_PITCH_FRAME_SIZE / 2 + 4]; - float window_accumulator[TS_STEPSIZE + TS_PITCH_FRAME_SIZE]; - float freq[TS_PITCH_FRAME_SIZE]; - float mag[TS_PITCH_FRAME_SIZE]; - float pitch_shift_workspace[TS_PITCH_FRAME_SIZE]; - int index; -} tsPitchData; - -static void tsRemoveFilter(tsPlayingSound* playing) -{ - for (int i = 0; i < 2; i++) - { - if (playing->pitch_filter[i]) - { - free16(playing->pitch_filter[i]); - playing->pitch_filter[i] = 0; - } - } -} - -void tsMix(tsContext* ctx) -{ - tsLock(ctx); - -#if TS_PLATFORM == TS_WINDOWS - - int byte_to_lock; - int bytes_to_write; - tsPosition(ctx, &byte_to_lock, &bytes_to_write); - - if (!bytes_to_write) goto unlock; - int samples_to_write = bytes_to_write / ctx->bps; - -#elif TS_PLATFORM == TS_MAC || TS_PLATFORM == TS_SDL - - int samples_to_write = tsSamplesToMix(ctx); - if (!samples_to_write) goto unlock; - int bytes_to_write = samples_to_write * ctx->bps; - -#else -#endif - - // clear mixer buffers - int wide_count = samples_to_write / 4; - TS_ASSERT(!(samples_to_write & 3)); - - __m128* floatA = ctx->floatA; - __m128* floatB = ctx->floatB; - __m128 zero = _mm_set1_ps(0.0f); - - for (int i = 0; i < wide_count; ++i) - { - floatA[i] = zero; - floatB[i] = zero; - } - - // mix all playing sounds into the mixer buffers - tsPlayingSound** ptr = &ctx->playing; - while (*ptr) - { - tsPlayingSound* playing = *ptr; - tsLoadedSound* loaded = playing->loaded_sound; - __m128* cA = (__m128*)loaded->channels[0]; - __m128* cB = (__m128*)loaded->channels[1]; - - // Attempted to play a sound with no audio. - // Make sure the audio file was loaded properly. Check for - // error messages in g_tsErrorReason. 
- TS_ASSERT(cA); - - int mix_count = samples_to_write; - int offset = playing->sample_index; - int remaining = loaded->sample_count - offset; - if (remaining < mix_count) mix_count = remaining; - TS_ASSERT(remaining > 0); - - float vA0 = playing->volume0 * playing->pan0; - float vB0 = playing->volume1 * playing->pan1; - __m128 vA = _mm_set1_ps(vA0); - __m128 vB = _mm_set1_ps(vB0); - - // skip sound if it's delay is longer than mix_count and - // handle various delay cases - int delay_offset = 0; - if (offset < 0) - { - int samples_till_positive = -offset; - int mix_leftover = mix_count - samples_till_positive; - - if (mix_leftover <= 0) - { - playing->sample_index += mix_count; - goto get_next_playing_sound; - } - - else - { - offset = 0; - delay_offset = samples_till_positive; - mix_count = mix_leftover; - } - } - TS_ASSERT(!(delay_offset & 3)); - - // immediately remove any inactive elements - if (!playing->active || !ctx->running) - goto remove; - - // skip all paused sounds - if (playing->paused) - goto get_next_playing_sound; - - // SIMD offets - int mix_wide = (int)TS_ALIGN(mix_count, 4) / 4; - int offset_wide = (int)TS_TRUNC(offset, 4) / 4; - int delay_wide = (int)TS_ALIGN(delay_offset, 4) / 4; - - // use tsPitchShift to on-the-fly pitch shift some samples - // only call this function if the user set a custom pitch value - if (playing->pitch != 1.0f) - { - int sample_count = (mix_wide - 2 * delay_wide) * 4; - int falling_behind = sample_count > TS_MAX_FRAME_LENGTH; - - // TS_MAX_FRAME_LENGTH represents max samples we can pitch shift in one go. In the event - // that this process takes longer than the time required to play the actual sound, just - // fall back to the original sound (non-pitch shifted). This will sound very ugly. To - // prevent falling behind, make sure not to pitch shift too many sounds at once. Try tweaking - // TS_PITCH_QUALITY to make it lower (must be a power of 2). 
- if (!falling_behind) - { - tsPitchShift(playing->pitch, sample_count, (float)ctx->Hz, (float*)(cA + delay_wide + offset_wide), playing->pitch_filter); - cA = (__m128 *)playing->pitch_filter[0]->pitch_shifted_output_samples; - - if (loaded->channel_count == 2) - { - tsPitchShift(playing->pitch, sample_count, (float)ctx->Hz, (float*)(cB + delay_wide + offset_wide), playing->pitch_filter + 1); - cB = (__m128 *)playing->pitch_filter[1]->pitch_shifted_output_samples; - } - - offset_wide = -delay_wide; - } - } - - // apply volume, load samples into float buffers - switch (loaded->channel_count) - { - case 1: - for (int i = delay_wide; i < mix_wide - delay_wide; ++i) - { - __m128 A = cA[i + offset_wide]; - __m128 B = _mm_mul_ps(A, vB); - A = _mm_mul_ps(A, vA); - floatA[i] = _mm_add_ps(floatA[i], A); - floatB[i] = _mm_add_ps(floatB[i], B); - } - break; - - case 2: - { - for (int i = delay_wide; i < mix_wide - delay_wide; ++i) - { - __m128 A = cA[i + offset_wide]; - __m128 B = cB[i + offset_wide]; - - A = _mm_mul_ps(A, vA); - B = _mm_mul_ps(B, vB); - floatA[i] = _mm_add_ps(floatA[i], A); - floatB[i] = _mm_add_ps(floatB[i], B); - } - } break; - } - - // playing list logic - playing->sample_index += mix_count; - if (playing->sample_index == loaded->sample_count) - { - if (playing->looped) - { - playing->sample_index = 0; - goto get_next_playing_sound; - } - - remove: - playing->sample_index = 0; - *ptr = (*ptr)->next; - playing->next = 0; - playing->active = 0; - - tsRemoveFilter(playing); - - // if using high-level API manage the tsPlayingSound memory ourselves - if (ctx->playing_pool) - { - playing->next = ctx->playing_free; - ctx->playing_free = playing; - } - - // we already incremented next pointer, so don't do it again - continue; - } - - get_next_playing_sound: - if (*ptr) ptr = &(*ptr)->next; - else break; - } - - // load all floats into 16 bit packed interleaved samples -#if TS_PLATFORM == TS_WINDOWS - - __m128i* samples = ctx->samples; - for (int i = 0; i < 
wide_count; ++i) - { - __m128i a = _mm_cvtps_epi32(floatA[i]); - __m128i b = _mm_cvtps_epi32(floatB[i]); - __m128i a0b0a1b1 = _mm_unpacklo_epi32(a, b); - __m128i a2b2a3b3 = _mm_unpackhi_epi32(a, b); - samples[i] = _mm_packs_epi32(a0b0a1b1, a2b2a3b3); - } - tsMemcpyToDS(ctx, (int16_t*)samples, byte_to_lock, bytes_to_write); - -#elif TS_PLATFORM == TS_MAC || TS_PLATFORM == TS_SDL - - // Since the ctx->samples array is already in use as a ring buffer - // reusing floatA to store output is a good way to temporarly store - // the final samples. Then a single ring buffer push can be used - // afterwards. Pretty hacky, but whatever :) - __m128i* samples = (__m128i*)floatA; - memset(samples, 0, sizeof(__m128i) * wide_count); - for (int i = 0; i < wide_count; ++i) - { - __m128i a = _mm_cvtps_epi32(floatA[i]); - __m128i b = _mm_cvtps_epi32(floatB[i]); - __m128i a0b0a1b1 = _mm_unpacklo_epi32(a, b); - __m128i a2b2a3b3 = _mm_unpackhi_epi32(a, b); - samples[i] = _mm_packs_epi32(a0b0a1b1, a2b2a3b3); - } - tsPushBytes(ctx, samples, bytes_to_write); - -#else -#endif - -unlock: - tsUnlock(ctx); -} - -// TODO: -// Try this optimization out (2N POINT REAL FFT USING AN N POINT COMPLEX FFT) -// http://www.fftguru.com/fftguru.com.tutorial2.pdf - -#include <math.h> - -static uint32_t tsRev32(uint32_t x) -{ - uint32_t a = ((x & 0xAAAAAAAA) >> 1) | ((x & 0x55555555) << 1); - uint32_t b = ((a & 0xCCCCCCCC) >> 2) | ((a & 0x33333333) << 2); - uint32_t c = ((b & 0xF0F0F0F0) >> 4) | ((b & 0x0F0F0F0F) << 4); - uint32_t d = ((c & 0xFF00FF00) >> 8) | ((c & 0x00FF00FF) << 8); - return (d >> 16) | (d << 16); -} - -static uint32_t tsPopCount(uint32_t x) -{ - uint32_t a = x - ((x >> 1) & 0x55555555); - uint32_t b = (((a >> 2) & 0x33333333) + (a & 0x33333333)); - uint32_t c = (((b >> 4) + b) & 0x0F0F0F0F); - uint32_t d = c + (c >> 8); - uint32_t e = d + (d >> 16); - uint32_t f = e & 0x0000003F; - return f; -} - -static uint32_t tsLog2(uint32_t x) -{ - uint32_t a = x | (x >> 1); - uint32_t b = a | (a >> 
2); - uint32_t c = b | (b >> 4); - uint32_t d = c | (c >> 8); - uint32_t e = d | (d >> 16); - uint32_t f = e >> 1; - return tsPopCount(f); -} - -// x contains real inputs -// y contains imaginary inputs -// count must be a power of 2 -// sign must be 1.0 (forward transform) or -1.0f (inverse transform) -static void tsFFT(float* x, float* y, int count, float sign) -{ - int exponent = (int)tsLog2((uint32_t)count); - - // bit reversal stage - // swap all elements with their bit reversed index within the - // lowest level of the Cooley-Tukey recursion tree - for (int i = 1; i < count - 1; i++) - { - uint32_t j = tsRev32((uint32_t)i); - j >>= (32 - exponent); - if (i < (int)j) - { - float tx = x[i]; - float ty = y[i]; - x[i] = x[j]; - y[i] = y[j]; - x[j] = tx; - y[j] = ty; - } - } - - // for each recursive iteration - for (int iter = 0, L = 1; iter < exponent; ++iter) - { - int Ls = L; - L <<= 1; - float ur = 1.0f; // cos( pi / 2 ) - float ui = 0; // sin( pi / 2 ) - float arg = 3.14159265359f / (float)Ls; - float wr = cosf(arg); - float wi = -sign * sinf(arg); - - // rows in DFT submatrix - for (int j = 0; j < Ls; ++j) - { - // do butterflies upon DFT row elements - for (int i = j; i < count; i += L) - { - int index = i + Ls; - float x_index = x[index]; - float y_index = y[index]; - float x_i = x[i]; - float y_i = y[i]; - - float tr = ur * x_index - ui * y_index; - float ti = ur * y_index + ui * x_index; - float x_low = x_i - tr; - float x_high = x_i + tr; - float y_low = y_i - ti; - float y_high = y_i + ti; - - x[index] = x_low; - y[index] = y_low; - x[i] = x_high; - y[i] = y_high; - } - - // Rotate u1 and u2 via Givens rotations (2d planar rotation). - // This keeps cos/sin calls in the outermost loop. - // Floating point error is scaled proportionally to Ls. 
- float t = ur * wr - ui * wi; - ui = ur * wi + ui * wr; - ur = t; - } - } - - // scale factor for forward transform - if (sign > 0) - { - float inv_count = 1.0f / (float)count; - for (int i = 0; i < count; i++) - { - x[i] *= inv_count; - y[i] *= inv_count; - } - } -} - -#ifdef _MSC_VER - -#define TS_ALIGN16_0 __declspec( align( 16 ) ) -#define TS_ALIGN16_1 -#define TS_SELECTANY extern const __declspec( selectany ) - -#else - -#define TS_ALIGN16_0 -#define TS_ALIGN16_1 __attribute__( (aligned( 16 )) ) -#define TS_SELECTANY const __attribute__( (selectany) ) - -#endif - -// SSE2 trig funcs from https://github.com/to-miz/sse_mathfun_extension/ -#define _PS_CONST( Name, Val ) \ - TS_SELECTANY TS_ALIGN16_0 float _ps_##Name[ 4 ] TS_ALIGN16_1 = { Val, Val, Val, Val } - -#define _PS_CONST_TYPE( Name, Type, Val ) \ - TS_SELECTANY TS_ALIGN16_0 Type _ps_##Name[ 4 ] TS_ALIGN16_1 = { Val, Val, Val, Val } - -#define _PI32_CONST( Name, Val ) \ - TS_SELECTANY TS_ALIGN16_0 int _pi32_##Name[ 4 ] TS_ALIGN16_1 = { Val, Val, Val, Val } - -_PS_CONST_TYPE(sign_mask, int, (int)0x80000000); -_PS_CONST_TYPE(inv_sign_mask, int, (int)~0x80000000); - -_PS_CONST(atanrange_hi, 2.414213562373095f); -_PS_CONST(atanrange_lo, 0.4142135623730950f); -_PS_CONST(cephes_PIO2F, 1.5707963267948966192f); -_PS_CONST(cephes_PIO4F, 0.7853981633974483096f); -_PS_CONST(1, 1.0f); -_PS_CONST(0p5, 0.5f); -_PS_CONST(0, 0); -_PS_CONST(sincof_p0, -1.9515295891E-4f); -_PS_CONST(sincof_p1, 8.3321608736E-3f); -_PS_CONST(sincof_p2, -1.6666654611E-1f); -_PS_CONST(atancof_p0, 8.05374449538e-2f); -_PS_CONST(atancof_p1, 1.38776856032E-1f); -_PS_CONST(atancof_p2, 1.99777106478E-1f); -_PS_CONST(atancof_p3, 3.33329491539E-1f); -_PS_CONST(cephes_PIF, 3.141592653589793238f); -_PS_CONST(cephes_2PIF, 2.0f * 3.141592653589793238f); -_PS_CONST(cephes_FOPI, 1.27323954473516f); // 4 / M_PI -_PS_CONST(minus_cephes_DP1, -0.78515625f); -_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f); -_PS_CONST(minus_cephes_DP3, 
-3.77489497744594108e-8f); -_PS_CONST(coscof_p0, 2.443315711809948E-005f); -_PS_CONST(coscof_p1, -1.388731625493765E-003f); -_PS_CONST(coscof_p2, 4.166664568298827E-002f); -_PS_CONST(frame_size, (float)TS_PITCH_FRAME_SIZE); - -_PI32_CONST(1, 1); -_PI32_CONST(inv1, ~1); -_PI32_CONST(2, 2); -_PI32_CONST(4, 4); - -static __m128 _mm_atan_ps(__m128 x) -{ - __m128 sign_bit, y; - - sign_bit = x; - /* take the absolute value */ - x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask); - /* extract the sign bit (upper one) */ - sign_bit = _mm_and_ps(sign_bit, *(__m128*)_ps_sign_mask); - - /* range reduction, init x and y depending on range */ - /* x > 2.414213562373095 */ - __m128 cmp0 = _mm_cmpgt_ps(x, *(__m128*)_ps_atanrange_hi); - /* x > 0.4142135623730950 */ - __m128 cmp1 = _mm_cmpgt_ps(x, *(__m128*)_ps_atanrange_lo); - - /* x > 0.4142135623730950 && !( x > 2.414213562373095 ) */ - __m128 cmp2 = _mm_andnot_ps(cmp0, cmp1); - - /* -( 1.0/x ) */ - __m128 y0 = _mm_and_ps(cmp0, *(__m128*)_ps_cephes_PIO2F); - __m128 x0 = _mm_div_ps(*(__m128*)_ps_1, x); - x0 = _mm_xor_ps(x0, *(__m128*)_ps_sign_mask); - - __m128 y1 = _mm_and_ps(cmp2, *(__m128*)_ps_cephes_PIO4F); - /* (x-1.0)/(x+1.0) */ - __m128 x1_o = _mm_sub_ps(x, *(__m128*)_ps_1); - __m128 x1_u = _mm_add_ps(x, *(__m128*)_ps_1); - __m128 x1 = _mm_div_ps(x1_o, x1_u); - - __m128 x2 = _mm_and_ps(cmp2, x1); - x0 = _mm_and_ps(cmp0, x0); - x2 = _mm_or_ps(x2, x0); - cmp1 = _mm_or_ps(cmp0, cmp2); - x2 = _mm_and_ps(cmp1, x2); - x = _mm_andnot_ps(cmp1, x); - x = _mm_or_ps(x2, x); - - y = _mm_or_ps(y0, y1); - - __m128 zz = _mm_mul_ps(x, x); - __m128 acc = *(__m128*)_ps_atancof_p0; - acc = _mm_mul_ps(acc, zz); - acc = _mm_sub_ps(acc, *(__m128*)_ps_atancof_p1); - acc = _mm_mul_ps(acc, zz); - acc = _mm_add_ps(acc, *(__m128*)_ps_atancof_p2); - acc = _mm_mul_ps(acc, zz); - acc = _mm_sub_ps(acc, *(__m128*)_ps_atancof_p3); - acc = _mm_mul_ps(acc, zz); - acc = _mm_mul_ps(acc, x); - acc = _mm_add_ps(acc, x); - y = _mm_add_ps(y, acc); - - /* update the 
sign */ - y = _mm_xor_ps(y, sign_bit); - - return y; -} - -static __m128 _mm_atan2_ps(__m128 y, __m128 x) -{ - __m128 x_eq_0 = _mm_cmpeq_ps(x, *(__m128*)_ps_0); - __m128 x_gt_0 = _mm_cmpgt_ps(x, *(__m128*)_ps_0); - __m128 x_le_0 = _mm_cmple_ps(x, *(__m128*)_ps_0); - __m128 y_eq_0 = _mm_cmpeq_ps(y, *(__m128*)_ps_0); - __m128 x_lt_0 = _mm_cmplt_ps(x, *(__m128*)_ps_0); - __m128 y_lt_0 = _mm_cmplt_ps(y, *(__m128*)_ps_0); - - __m128 zero_mask = _mm_and_ps(x_eq_0, y_eq_0); - __m128 zero_mask_other_case = _mm_and_ps(y_eq_0, x_gt_0); - zero_mask = _mm_or_ps(zero_mask, zero_mask_other_case); - - __m128 pio2_mask = _mm_andnot_ps(y_eq_0, x_eq_0); - __m128 pio2_mask_sign = _mm_and_ps(y_lt_0, *(__m128*)_ps_sign_mask); - __m128 pio2_result = *(__m128*)_ps_cephes_PIO2F; - pio2_result = _mm_xor_ps(pio2_result, pio2_mask_sign); - pio2_result = _mm_and_ps(pio2_mask, pio2_result); - - __m128 pi_mask = _mm_and_ps(y_eq_0, x_le_0); - __m128 pi = *(__m128*)_ps_cephes_PIF; - __m128 pi_result = _mm_and_ps(pi_mask, pi); - - __m128 swap_sign_mask_offset = _mm_and_ps(x_lt_0, y_lt_0); - swap_sign_mask_offset = _mm_and_ps(swap_sign_mask_offset, *(__m128*)_ps_sign_mask); - - __m128 offset0 = _mm_setzero_ps(); - __m128 offset1 = *(__m128*)_ps_cephes_PIF; - offset1 = _mm_xor_ps(offset1, swap_sign_mask_offset); - - __m128 offset = _mm_andnot_ps(x_lt_0, offset0); - offset = _mm_and_ps(x_lt_0, offset1); - - __m128 arg = _mm_div_ps(y, x); - __m128 atan_result = _mm_atan_ps(arg); - atan_result = _mm_add_ps(atan_result, offset); - - /* select between zero_result, pio2_result and atan_result */ - - __m128 result = _mm_andnot_ps(zero_mask, pio2_result); - atan_result = _mm_andnot_ps(pio2_mask, atan_result); - atan_result = _mm_andnot_ps(pio2_mask, atan_result); - result = _mm_or_ps(result, atan_result); - result = _mm_or_ps(result, pi_result); - - return result; -} - -static void _mm_sincos_ps(__m128 x, __m128 *s, __m128 *c) -{ - __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; - __m128i emm0, 
emm2, emm4; - sign_bit_sin = x; - /* take the absolute value */ - x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask); - /* extract the sign bit (upper one) */ - sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask); - - /* scale by 4/Pi */ - y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI); - - /* store the integer part of y in emm2 */ - emm2 = _mm_cvttps_epi32(y); - - /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1); - emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1); - y = _mm_cvtepi32_ps(emm2); - - emm4 = emm2; - - /* get the swap sign flag for the sine */ - emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4); - emm0 = _mm_slli_epi32(emm0, 29); - __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0); - - /* get the polynom selection mask for the sine*/ - emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2); - emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); - __m128 poly_mask = _mm_castsi128_ps(emm2); - - /* The magic pass: "Extended precision modular arithmetic" - x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(__m128*)_ps_minus_cephes_DP1; - xmm2 = *(__m128*)_ps_minus_cephes_DP2; - xmm3 = *(__m128*)_ps_minus_cephes_DP3; - xmm1 = _mm_mul_ps(y, xmm1); - xmm2 = _mm_mul_ps(y, xmm2); - xmm3 = _mm_mul_ps(y, xmm3); - x = _mm_add_ps(x, xmm1); - x = _mm_add_ps(x, xmm2); - x = _mm_add_ps(x, xmm3); - - emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2); - emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4); - emm4 = _mm_slli_epi32(emm4, 29); - __m128 sign_bit_cos = _mm_castsi128_ps(emm4); - - sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); - - - /* Evaluate the first polynom (0 <= x <= Pi/4) */ - __m128 z = _mm_mul_ps(x, x); - y = *(__m128*)_ps_coscof_p0; - - y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1); - y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2); - y = _mm_mul_ps(y, z); - y = _mm_mul_ps(y, z); - __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5); - y = _mm_sub_ps(y, tmp); - y = 
_mm_add_ps(y, *(__m128*)_ps_1); - - /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - - __m128 y2 = *(__m128*)_ps_sincof_p0; - y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1); - y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2); - y2 = _mm_mul_ps(y2, z); - y2 = _mm_mul_ps(y2, x); - y2 = _mm_add_ps(y2, x); - - /* select the correct result from the two polynoms */ - xmm3 = poly_mask; - __m128 ysin2 = _mm_and_ps(xmm3, y2); - __m128 ysin1 = _mm_andnot_ps(xmm3, y); - y2 = _mm_sub_ps(y2, ysin2); - y = _mm_sub_ps(y, ysin1); - - xmm1 = _mm_add_ps(ysin1, ysin2); - xmm2 = _mm_add_ps(y, y2); - - /* update the sign */ - *s = _mm_xor_ps(xmm1, sign_bit_sin); - *c = _mm_xor_ps(xmm2, sign_bit_cos); -} - -static __m128i select_si(__m128i a, __m128i b, __m128i mask) -{ - return _mm_xor_si128(a, _mm_and_si128(mask, _mm_xor_si128(b, a))); -} - -#define tsVonHann( i ) (-0.5f * cosf( 2.0f * 3.14159265359f * (float)(i) / (float)TS_PITCH_FRAME_SIZE ) + 0.5f) - -static __m128 tsVonHann4(int i) -{ - __m128 k4 = _mm_set_ps((float)(i * 4 + 3), (float)(i * 4 + 2), (float)(i * 4 + 1), (float)(i * 4)); - k4 = _mm_mul_ps(*(__m128*)_ps_cephes_2PIF, k4); - k4 = _mm_div_ps(k4, *(__m128*)_ps_frame_size); - - // Seems like _mm_cos_ps and _mm_sincos_ps was causing some audio popping... - // I'm not really skilled enough to fix it, but feel free to try: http://gruntthepeon.free.fr/ssemath/sse_mathfun.h - // My guess is some large negative or positive values were causing some - // precision trouble. In this case manually calling 4 cosines is not - // really a big deal, since this function is not a bottleneck. 
- -#if 0 - __m128 c = _mm_cos_ps(k4); -#elif 0 - __m128 s, c; - _mm_sincos_ps(k4, &s, &c); -#else - __m128 c = k4; - float* cf = (float*)&c; - cf[0] = cosf(cf[0]); - cf[1] = cosf(cf[1]); - cf[2] = cosf(cf[2]); - cf[3] = cosf(cf[3]); -#endif - - __m128 von_hann = _mm_add_ps(_mm_mul_ps(_mm_set_ps1(-0.5f), c), _mm_set_ps1(0.5f)); - return von_hann; -} - -// Analysis and synthesis steps learned from Bernsee's wonderful blog post: -// http://blogs.zynaptiq.com/bernsee/pitch-shifting-using-the-ft/ -static void tsPitchShift(float pitchShift, int num_samples_to_process, float sampleRate, float* indata, tsPitchData** pitch_filter) -{ - TS_ASSERT(num_samples_to_process <= TS_MAX_FRAME_LENGTH); - - // make sure compiler didn't do anything weird with the member - // offsets of tsPitchData. All arrays must be 16 byte aligned - TS_ASSERT(!((size_t)&(((tsPitchData*)0)->pitch_shifted_output_samples) & 15)); - TS_ASSERT(!((size_t)&(((tsPitchData*)0)->fft_data) & 15)); - TS_ASSERT(!((size_t)&(((tsPitchData*)0)->previous_phase) & 15)); - TS_ASSERT(!((size_t)&(((tsPitchData*)0)->sum_phase) & 15)); - TS_ASSERT(!((size_t)&(((tsPitchData*)0)->window_accumulator) & 15)); - TS_ASSERT(!((size_t)&(((tsPitchData*)0)->freq) & 15)); - TS_ASSERT(!((size_t)&(((tsPitchData*)0)->mag) & 15)); - TS_ASSERT(!((size_t)&(((tsPitchData*)0)->pitch_shift_workspace) & 15)); - - tsPitchData* pf; - - if (*pitch_filter == NULL) - { - pf = (tsPitchData*)malloc16(sizeof(tsPitchData)); - memset(pf, 0, sizeof(tsPitchData)); - *pitch_filter = pf; - } - else - { - pf = *pitch_filter; - } - - float freqPerBin = sampleRate / (float)TS_PITCH_FRAME_SIZE; - __m128 freq_per_bin = _mm_set_ps1(sampleRate / (float)TS_PITCH_FRAME_SIZE); - __m128 pi = *(__m128*)_ps_cephes_PIF; - __m128 two_pi = *(__m128*)_ps_cephes_2PIF; - __m128 pitch_quality = _mm_set_ps1((float)TS_PITCH_QUALITY); - float* out_samples = pf->pitch_shifted_output_samples; - if (pf->index == 0) pf->index = TS_OVERLAP; - - while (num_samples_to_process) - { - int 
copy_count = TS_PITCH_FRAME_SIZE - pf->index; - if (num_samples_to_process < copy_count) copy_count = num_samples_to_process; - - memcpy(pf->in_FIFO + pf->index, indata, sizeof(float) * copy_count); - memcpy(out_samples, pf->out_FIFO + pf->index - TS_OVERLAP, sizeof(float) * copy_count); - - int start_index = pf->index; - int offset = start_index & 3; - start_index += 4 - offset; - - for (int i = 0; i < offset; ++i) - pf->in_FIFO[pf->index + i] /= 32768.0f; - - int extra = copy_count & 3; - copy_count = copy_count / 4 - extra; - __m128* in_FIFO = (__m128*)(pf->in_FIFO + pf->index + offset); - TS_ASSERT(!((size_t)in_FIFO & 15)); - __m128 int16_max = _mm_set_ps1(32768.0f); - - for (int i = 0; i < copy_count; ++i) - { - __m128 val = in_FIFO[i]; - __m128 div = _mm_div_ps(val, int16_max); - in_FIFO[i] = div; - } - - for (int i = 0, copy_count4 = copy_count * 4; i < extra; ++i) - { - int index = copy_count4 + i; - pf->in_FIFO[pf->index + index] /= 32768.0f; - } - - TS_ASSERT(!((size_t)out_samples & 15)); - __m128* out_samples4 = (__m128*)out_samples; - for (int i = 0; i < copy_count; ++i) - { - __m128 val = out_samples4[i]; - __m128 mul = _mm_mul_ps(val, int16_max); - out_samples4[i] = mul; - } - - for (int i = 0, copy_count4 = copy_count * 4; i < extra; ++i) - { - int index = copy_count4 + i; - out_samples[index] *= 32768.0f; - } - - copy_count = copy_count * 4 + extra; - num_samples_to_process -= copy_count; - pf->index += copy_count; - indata += copy_count; - out_samples += copy_count; - - if (pf->index >= TS_PITCH_FRAME_SIZE) - { - pf->index = TS_OVERLAP; - { - __m128* fft_data = (__m128*)pf->fft_data; - __m128* in_FIFO = (__m128*)pf->in_FIFO; - - for (int k = 0; k < TS_PITCH_FRAME_SIZE / 4; k++) - { - __m128 von_hann = tsVonHann4(k); - __m128 sample = in_FIFO[k]; - __m128 windowed_sample = _mm_mul_ps(sample, von_hann); - fft_data[k] = windowed_sample; - } - } - - memset(pf->fft_data + TS_PITCH_FRAME_SIZE, 0, TS_PITCH_FRAME_SIZE * sizeof(float)); - 
tsFFT(pf->fft_data, pf->fft_data + TS_PITCH_FRAME_SIZE, TS_PITCH_FRAME_SIZE, 1.0f); - - { - __m128* fft_data = (__m128*)pf->fft_data; - __m128* previous_phase = (__m128*)pf->previous_phase; - __m128* magnitudes = (__m128*)pf->mag; - __m128* frequencies = (__m128*)pf->freq; - int simd_count = (TS_PITCH_FRAME_SIZE / 2) / 4; - - for (int k = 0; k <= simd_count; k++) - { - __m128 real = fft_data[k]; - __m128 imag = fft_data[(TS_PITCH_FRAME_SIZE / 4) + k]; - __m128 overlap_phase = _mm_set_ps((float)(k * 4 + 3) * TS_EXPECTED_FREQUENCY, (float)(k * 4 + 2) * TS_EXPECTED_FREQUENCY, (float)(k * 4 + 1) * TS_EXPECTED_FREQUENCY, (float)(k * 4) * TS_EXPECTED_FREQUENCY); - __m128 k4 = _mm_set_ps((float)(k * 4 + 3), (float)(k * 4 + 2), (float)(k * 4 + 1), (float)(k * 4)); - - __m128 mag = _mm_mul_ps(_mm_set_ps1(2.0f), _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(real, real), _mm_mul_ps(imag, imag)))); - __m128 phase = _mm_atan2_ps(imag, real); - __m128 phase_dif = _mm_sub_ps(phase, previous_phase[k]); - - previous_phase[k] = phase; - phase_dif = _mm_sub_ps(phase_dif, overlap_phase); - - // map delta phase into +/- pi interval - __m128i qpd = _mm_cvttps_epi32(_mm_div_ps(phase_dif, pi)); - __m128i zero = _mm_setzero_si128(); - __m128i ltzero_mask = _mm_cmplt_epi32(qpd, zero); - __m128i ones_bit = _mm_and_si128(qpd, _mm_set1_epi32(1)); - __m128i neg_qpd = _mm_sub_epi32(qpd, ones_bit); - __m128i pos_qpd = _mm_add_epi32(qpd, ones_bit); - qpd = select_si(pos_qpd, neg_qpd, ltzero_mask); - __m128 pi_range_offset = _mm_mul_ps(pi, _mm_cvtepi32_ps(qpd)); - phase_dif = _mm_sub_ps(phase_dif, pi_range_offset); - - __m128 deviation = _mm_div_ps(_mm_mul_ps(_mm_set_ps1((float)TS_PITCH_QUALITY), phase_dif), two_pi); - __m128 true_freq_estimated = _mm_add_ps(_mm_mul_ps(k4, freq_per_bin), _mm_mul_ps(deviation, freq_per_bin)); - - magnitudes[k] = mag; - frequencies[k] = true_freq_estimated; - } - } - - // actual pitch shifting work - // shift frequencies into workspace - memset(pf->pitch_shift_workspace, 0, 
(TS_PITCH_FRAME_SIZE / 2) * sizeof(float)); - for (int k = 0; k <= TS_PITCH_FRAME_SIZE / 2; k++) - { - int index = (int)(k * pitchShift); - if (index <= TS_PITCH_FRAME_SIZE / 2) - pf->pitch_shift_workspace[index] = pf->freq[k] * pitchShift; - } - - // swap buffers around to reuse old pf->preq buffer as the new workspace - float* frequencies = pf->pitch_shift_workspace; - float* pitch_shift_workspace = pf->freq; - float* magnitudes = pf->mag; - - // shift magnitudes into workspace - memset(pitch_shift_workspace, 0, TS_PITCH_FRAME_SIZE * sizeof(float)); - for (int k = 0; k <= TS_PITCH_FRAME_SIZE / 2; k++) - { - int index = (int)(k * pitchShift); - if (index <= TS_PITCH_FRAME_SIZE / 2) - pitch_shift_workspace[index] += magnitudes[k]; - } - - // track where the shifted magnitudes are - magnitudes = pitch_shift_workspace; - - { - __m128* magnitudes4 = (__m128*)magnitudes; - __m128* frequencies4 = (__m128*)frequencies; - __m128* fft_data = (__m128*)pf->fft_data; - __m128* sum_phase = (__m128*)pf->sum_phase; - int simd_count = (TS_PITCH_FRAME_SIZE / 2) / 4; - - for (int k = 0; k <= simd_count; k++) - { - __m128 mag = magnitudes4[k]; - __m128 freq = frequencies4[k]; - __m128 freq_per_bin_k = _mm_set_ps((float)(k * 4 + 3) * freqPerBin, (float)(k * 4 + 2) * freqPerBin, (float)(k * 4 + 1) * freqPerBin, (float)(k * 4) * freqPerBin); - - freq = _mm_sub_ps(freq, freq_per_bin_k); - freq = _mm_div_ps(freq, freq_per_bin); - - freq = _mm_mul_ps(two_pi, freq); - freq = _mm_div_ps(freq, pitch_quality); - - __m128 overlap_phase = _mm_set_ps((float)(k * 4 + 3) * TS_EXPECTED_FREQUENCY, (float)(k * 4 + 2) * TS_EXPECTED_FREQUENCY, (float)(k * 4 + 1) * TS_EXPECTED_FREQUENCY, (float)(k * 4) * TS_EXPECTED_FREQUENCY); - freq = _mm_add_ps(freq, overlap_phase); - - __m128 phase = sum_phase[k]; - phase = _mm_add_ps(phase, freq); - sum_phase[k] = phase; - - __m128 c, s; - _mm_sincos_ps(phase, &s, &c); - __m128 real = _mm_mul_ps(mag, c); - __m128 imag = _mm_mul_ps(mag, s); - - fft_data[k] = real; - 
fft_data[(TS_PITCH_FRAME_SIZE / 4) + k] = imag; - } - } - - for (int k = TS_PITCH_FRAME_SIZE + 2; k < 2 * TS_PITCH_FRAME_SIZE - 2; ++k) - pf->fft_data[k] = 0; - - tsFFT(pf->fft_data, pf->fft_data + TS_PITCH_FRAME_SIZE, TS_PITCH_FRAME_SIZE, -1); - - { - __m128* fft_data = (__m128*)pf->fft_data; - __m128* window_accumulator = (__m128*)pf->window_accumulator; - - for (int k = 0; k < TS_PITCH_FRAME_SIZE / 4; ++k) - { - __m128 von_hann = tsVonHann4(k); - __m128 fft_data_segment = fft_data[k]; - __m128 accumulator_segment = window_accumulator[k]; - __m128 divisor = _mm_div_ps(pitch_quality, _mm_set_ps1(8.0f)); - fft_data_segment = _mm_mul_ps(von_hann, fft_data_segment); - fft_data_segment = _mm_div_ps(fft_data_segment, divisor); - accumulator_segment = _mm_add_ps(accumulator_segment, fft_data_segment); - window_accumulator[k] = accumulator_segment; - } - } - - memcpy(pf->out_FIFO, pf->window_accumulator, TS_STEPSIZE * sizeof(float)); - memmove(pf->window_accumulator, pf->window_accumulator + TS_STEPSIZE, TS_PITCH_FRAME_SIZE * sizeof(float)); - memmove(pf->in_FIFO, pf->in_FIFO + TS_STEPSIZE, TS_OVERLAP * sizeof(float)); - } - } -} - -/* -zlib license: - -Copyright (c) 2017 Randy Gaul http://www.randygaul.net - -This software is provided 'as-is', without any express or implied warranty. -In no event will the authors be held liable for any damages arising from -the use of this software. - -Permission is granted to anyone to use this software for any purpose, -including commercial applications, and to alter it and redistribute it -freely, subject to the following restrictions: -1. The origin of this software must not be misrepresented; you must not -claim that you wrote the original software. If you use this software -in a product, an acknowledgment in the product documentation would be -appreciated but is not required. -2. Altered source versions must be plainly marked as such, and must not -be misrepresented as being the original software. -3. 
This notice may not be removed or altered from any source distribution. -*/ - -#endif |