6 files changed, 1884 insertions, 0 deletions
diff --git a/source/3rd-party/SDL2/src/video/yuv2rgb/LICENSE b/source/3rd-party/SDL2/src/video/yuv2rgb/LICENSE
new file mode 100644
index 0000000..a76efd7
--- /dev/null
+++ b/source/3rd-party/SDL2/src/video/yuv2rgb/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2016, Adrien Descamps
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of yuv2rgb nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/source/3rd-party/SDL2/src/video/yuv2rgb/README.md b/source/3rd-party/SDL2/src/video/yuv2rgb/README.md
new file mode 100644
index 0000000..21191e9
--- /dev/null
+++ b/source/3rd-party/SDL2/src/video/yuv2rgb/README.md
@@ -0,0 +1,63 @@
+From: https://github.com/descampsa/yuv2rgb
+# yuv2rgb
+C library for fast image conversion between yuv420p and rgb24.
+
+This is a simple library for optimized image conversion between YUV420p and rgb24.
+It was done mainly as an exercise to learn to use sse intrinsics, so there may still be room for optimization.
+
+For each conversion, an optimized standard C function and two SSE functions (one for aligned and one for unaligned memory) are implemented.
+The sse version requires only SSE2, which is available on any reasonably recent CPU.
+The library also supports the three most common YUV (YCbCr, to be correct) color spaces (see comments in code), and others can be added easily.
+
+There is a simple test program that converts a raw YUV file to RGB ppm format and measures the computation time.
+Optionally, it also compares the result and computation time with the ffmpeg implementation (that uses MMX), and with the IPP functions.
+
+To compile, simply do :
+
+    mkdir build
+    cd build
+    cmake -DCMAKE_BUILD_TYPE=Release ..
+    make
+
+The test program only supports raw YUV files for the YUV420 format, and ppm for the RGB24 format.
+To generate a raw yuv file, you can use avconv:
+
+    avconv -i example.jpg -c:v rawvideo -pix_fmt yuv420p example.yuv
+
+To generate the rgb file, you can use the ImageMagick convert program:
+
+    convert example.jpg example.ppm
+
+Then, for YUV420 to RGB24 conversion, use the test program like that:
+
+    ./test_yuv_rgb yuv2rgb image.yuv 4096 2160 image
+  
+The second and third parameters are image width and height (that are needed because not available in the raw YUV file), and fourth parameter is the output filename template (several output files will be generated, named for example output_sse.ppm, output_av.ppm, etc.)
+
+Similarly, for RGB24 to YUV420 conversion:
+
+    ./test_yuv_rgb rgb2yuv image.ppm image
+
+On my computer, the test program on a 4K image gives the following results for yuv2rgb:
+
+    Time will be measured in each configuration for 100 iterations...
+    Processing time (std) : 2.630193 sec
+    Processing time (sse2_unaligned) : 0.704394 sec
+    Processing time (ffmpeg_unaligned) : 1.221432 sec
+    Processing time (ipp_unaligned) : 0.636274 sec
+    Processing time (sse2_aligned) : 0.606648 sec
+    Processing time (ffmpeg_aligned) : 1.227100 sec
+    Processing time (ipp_aligned) : 0.636951 sec
+
+And for rgb2yuv:
+
+    Time will be measured in each configuration for 100 iterations...
+    Processing time (std) : 2.588675 sec
+    Processing time (sse2_unaligned) : 0.676625 sec
+    Processing time (ffmpeg_unaligned) : 3.385816 sec
+    Processing time (ipp_unaligned) : 0.593890 sec
+    Processing time (sse2_aligned) : 0.640630 sec
+    Processing time (ffmpeg_aligned) : 3.397952 sec
+    Processing time (ipp_aligned) : 0.579043 sec
+
+configuration : gcc 4.9.2, swscale 3.0.0, IPP 9.0.1, intel i7-5500U
diff --git a/source/3rd-party/SDL2/src/video/yuv2rgb/yuv_rgb.c b/source/3rd-party/SDL2/src/video/yuv2rgb/yuv_rgb.c
new file mode 100644
index 0000000..891dae2
--- /dev/null
+++ b/source/3rd-party/SDL2/src/video/yuv2rgb/yuv_rgb.c
@@ -0,0 +1,687 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+#include "../../SDL_internal.h"
+
+#include "yuv_rgb.h"
+
+#include "SDL_cpuinfo.h"
+/*#include <x86intrin.h>*/
+
+#define PRECISION 6
+#define PRECISION_FACTOR (1<<PRECISION)
+
+typedef struct
+{
+	uint8_t y_shift;      // offset added to Y after the matrix product (see the equation below)
+	int16_t matrix[3][3]; // RGB->YUV coefficients in fixed point, i.e. scaled by PRECISION_FACTOR
+} RGB2YUVParam;
+// |Y|   |y_shift|                        |matrix[0][0] matrix[0][1] matrix[0][2]|   |R|
+// |U| = |  128  | + 1/PRECISION_FACTOR * |matrix[1][0] matrix[1][1] matrix[1][2]| * |G|
+// |V|   |  128  |                        |matrix[2][0] matrix[2][1] matrix[2][2]|   |B|
+
+typedef struct
+{
+	uint8_t y_shift;    // offset subtracted from Y before scaling (see the matrix below)
+	int16_t y_factor;   // Y coefficient, shared by R, G and B (all factors are scaled by PRECISION_FACTOR)
+	int16_t v_r_factor; // V coefficient for R
+	int16_t u_g_factor; // U coefficient for G
+	int16_t v_g_factor; // V coefficient for G
+	int16_t u_b_factor; // U coefficient for B
+} YUV2RGBParam;
+// |R|                        |y_factor      0       v_r_factor|   |Y-y_shift|
+// |G| = 1/PRECISION_FACTOR * |y_factor  u_g_factor  v_g_factor| * |  U-128  |
+// |B|                        |y_factor  u_b_factor      0     |   |  V-128  |
+
+#define V(value) ((int16_t)(((value)*PRECISION_FACTOR)+0.5)) /* fixed-point encode: scale by PRECISION_FACTOR and round to nearest; valid for non-negative values only, negate the result for negative coefficients */
+
+// for ITU-T T.871, values can be found in section 7
+// for ITU-R BT.601-7 values are derived from equations in sections 2.5.1-2.5.3, assuming RGB is encoded using full range ([0-1]<->[0-255])
+// for ITU-R BT.709-6 values are derived from equations in sections 3.2-3.4, assuming RGB is encoded using full range ([0-1]<->[0-255])
+// all values are rounded to the fourth decimal
+
+static const YUV2RGBParam YUV2RGB[3] = { // one entry per YCbCrType value, in enum order (presumably indexed by yuv_type in the included converter headers)
+	// ITU-T T.871 (JPEG): no luma offset (y_shift 0)
+	{/*.y_shift=*/ 0, /*.y_factor=*/ V(1.0), /*.v_r_factor=*/ V(1.402), /*.u_g_factor=*/ -V(0.3441), /*.v_g_factor=*/ -V(0.7141), /*.u_b_factor=*/ V(1.772)},
+	// ITU-R BT.601-7: luma offset 16
+	{/*.y_shift=*/ 16, /*.y_factor=*/ V(1.1644), /*.v_r_factor=*/ V(1.596), /*.u_g_factor=*/ -V(0.3918), /*.v_g_factor=*/ -V(0.813), /*.u_b_factor=*/ V(2.0172)},
+	// ITU-R BT.709-6: luma offset 16
+	{/*.y_shift=*/ 16, /*.y_factor=*/ V(1.1644), /*.v_r_factor=*/ V(1.7927), /*.u_g_factor=*/ -V(0.2132), /*.v_g_factor=*/ -V(0.5329), /*.u_b_factor=*/ V(2.1124)}
+};
+
+static const RGB2YUVParam RGB2YUV[3] = { // one entry per YCbCrType value, in enum order; read as RGB2YUV[yuv_type] by the rgb24_yuv420_* functions
+	// ITU-T T.871 (JPEG): no luma offset (y_shift 0); matrix rows are Y, U, V and columns are R, G, B
+	{/*.y_shift=*/ 0, /*.matrix=*/ {{V(0.299), V(0.587), V(0.114)}, {-V(0.1687), -V(0.3313), V(0.5)}, {V(0.5), -V(0.4187), -V(0.0813)}}},
+	// ITU-R BT.601-7: luma offset 16
+	{/*.y_shift=*/ 16, /*.matrix=*/ {{V(0.2568), V(0.5041), V(0.0979)}, {-V(0.1482), -V(0.291), V(0.4392)}, {V(0.4392), -V(0.3678), -V(0.0714)}}},
+	// ITU-R BT.709-6: luma offset 16
+	{/*.y_shift=*/ 16, /*.matrix=*/ {{V(0.1826), V(0.6142), V(0.062)}, {-V(0.1006), -V(0.3386), V(0.4392)}, {V(0.4392), -V(0.3989), -V(0.0403)}}}
+};
+
+/* The various layouts of YUV data we support (one of these is assigned to YUV_FORMAT before each converter header is included) */
+#define YUV_FORMAT_420	1
+#define YUV_FORMAT_422	2
+#define YUV_FORMAT_NV12	3
+
+/* The various formats of RGB pixel that we support (one of these is assigned to RGB_FORMAT before each converter header is included) */
+#define RGB_FORMAT_RGB565	1
+#define RGB_FORMAT_RGB24	2
+#define RGB_FORMAT_RGBA		3
+#define RGB_FORMAT_BGRA		4
+#define RGB_FORMAT_ARGB		5
+#define RGB_FORMAT_ABGR		6
+
+// Divide a fixed-point value by PRECISION_FACTOR and saturate the result to the [0:255] interval.
+// The table covers [-128*PRECISION_FACTOR:384*PRECISION_FACTOR); values outside it are saturated explicitly instead of indexing past the table.
+static uint8_t clampU8(int32_t v)
+{
+	static const uint8_t lut[512] = 
+	{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,
+	47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,
+	91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,
+	126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,
+	159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
+	192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,
+	225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,
+	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
+	};
+	return lut[(v < -128*PRECISION_FACTOR) ? 0 : ((v >= 384*PRECISION_FACTOR) ? 511 : ((v+128*PRECISION_FACTOR)>>PRECISION))]; /* range is checked on v first, so the shifted operand is never negative */
+}
+
+/* Converter instantiations: each STD_FUNCTION_NAME / YUV_FORMAT / RGB_FORMAT triple followed by the #include emits one yuv->rgb function; the header presumably #undefs the three macros, since they are redefined for every variant -- TODO confirm in yuv_rgb_std_func.h. */
+#define STD_FUNCTION_NAME	yuv420_rgb565_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGB565
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv420_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv420_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv420_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv420_argb_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv420_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv422_rgb565_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_RGB565
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv422_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv422_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv422_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv422_argb_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuv422_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuvnv12_rgb565_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_RGB565
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuvnv12_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuvnv12_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuvnv12_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuvnv12_argb_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#include "yuv_rgb_std_func.h"
+
+#define STD_FUNCTION_NAME	yuvnv12_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#include "yuv_rgb_std_func.h"
+// Convert a packed RGB24 image to planar YUV 4:2:0 (standard C implementation).
+// width/height are in pixels; RGB_stride, Y_stride and UV_stride are the byte distances between consecutive rows.
+// Pixels are handled in 2x2 blocks: one Y sample per pixel, one U and one V sample (mean of the four pixels) per block.
+// An odd last row/column is left untouched, and width<2 or height<2 converts nothing: the loop bounds
+// are written as x+1<width / y+1<height because width-1 / height-1 would wrap around for 0 in uint32_t.
+// yuv_type selects the coefficient set in RGB2YUV[].
+void rgb24_yuv420_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *RGB, uint32_t RGB_stride, 
+	uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, 
+	YCbCrType yuv_type)
+{
+	const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
+	const int32_t y_offset = (param->y_shift)<<PRECISION; // luma offset in fixed point
+	uint32_t x, y;
+	for(y=0; y+1<height; y+=2)
+	{
+		const uint8_t *rgb_ptr1=RGB+y*RGB_stride,
+			*rgb_ptr2=RGB+(y+1)*RGB_stride;
+		uint8_t *y_ptr1=Y+y*Y_stride,
+			*y_ptr2=Y+(y+1)*Y_stride,
+			*u_ptr=U+(y/2)*UV_stride,
+			*v_ptr=V+(y/2)*UV_stride;
+		
+		for(x=0; x+1<width; x+=2)
+		{
+			// compute yuv for the four pixels, u and v values are summed
+			int32_t y_tmp, u_tmp, v_tmp;
+			y_tmp = param->matrix[0][0]*rgb_ptr1[0] + param->matrix[0][1]*rgb_ptr1[1] + param->matrix[0][2]*rgb_ptr1[2];
+			u_tmp = param->matrix[1][0]*rgb_ptr1[0] + param->matrix[1][1]*rgb_ptr1[1] + param->matrix[1][2]*rgb_ptr1[2];
+			v_tmp = param->matrix[2][0]*rgb_ptr1[0] + param->matrix[2][1]*rgb_ptr1[1] + param->matrix[2][2]*rgb_ptr1[2];
+			y_ptr1[0]=clampU8(y_tmp+y_offset);
+			y_tmp = param->matrix[0][0]*rgb_ptr1[3] + param->matrix[0][1]*rgb_ptr1[4] + param->matrix[0][2]*rgb_ptr1[5];
+			u_tmp += param->matrix[1][0]*rgb_ptr1[3] + param->matrix[1][1]*rgb_ptr1[4] + param->matrix[1][2]*rgb_ptr1[5];
+			v_tmp += param->matrix[2][0]*rgb_ptr1[3] + param->matrix[2][1]*rgb_ptr1[4] + param->matrix[2][2]*rgb_ptr1[5];
+			y_ptr1[1]=clampU8(y_tmp+y_offset);
+			y_tmp = param->matrix[0][0]*rgb_ptr2[0] + param->matrix[0][1]*rgb_ptr2[1] + param->matrix[0][2]*rgb_ptr2[2];
+			u_tmp += param->matrix[1][0]*rgb_ptr2[0] + param->matrix[1][1]*rgb_ptr2[1] + param->matrix[1][2]*rgb_ptr2[2];
+			v_tmp += param->matrix[2][0]*rgb_ptr2[0] + param->matrix[2][1]*rgb_ptr2[1] + param->matrix[2][2]*rgb_ptr2[2];
+			y_ptr2[0]=clampU8(y_tmp+y_offset);
+			y_tmp = param->matrix[0][0]*rgb_ptr2[3] + param->matrix[0][1]*rgb_ptr2[4] + param->matrix[0][2]*rgb_ptr2[5];
+			u_tmp += param->matrix[1][0]*rgb_ptr2[3] + param->matrix[1][1]*rgb_ptr2[4] + param->matrix[1][2]*rgb_ptr2[5];
+			v_tmp += param->matrix[2][0]*rgb_ptr2[3] + param->matrix[2][1]*rgb_ptr2[4] + param->matrix[2][2]*rgb_ptr2[5];
+			y_ptr2[1]=clampU8(y_tmp+y_offset);
+			// mean of the four chroma sums, re-centred on 128
+			u_ptr[0] = clampU8(u_tmp/4+(128<<PRECISION));
+			v_ptr[0] = clampU8(v_tmp/4+(128<<PRECISION));
+			// advance to the next 2x2 block
+			rgb_ptr1 += 6;
+			rgb_ptr2 += 6;
+			y_ptr1 += 2;
+			y_ptr2 += 2;
+			u_ptr += 1;
+			v_ptr += 1;
+		}
+	}
+}
+
+#ifdef __SSE2__
+/* SSE2 converter instantiations, same scheme as the _std variants: SSE_ALIGNED is defined before the #include for the *_sse functions and omitted for the *_sseu ones (presumably aligned vs. unaligned memory access); STD_FUNCTION_NAME presumably names the scalar fallback -- TODO confirm in yuv_rgb_sse_func.h. */
+#define SSE_FUNCTION_NAME	yuv420_rgb565_sse
+#define STD_FUNCTION_NAME	yuv420_rgb565_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGB565
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_rgb565_sseu
+#define STD_FUNCTION_NAME	yuv420_rgb565_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGB565
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_rgb24_sse
+#define STD_FUNCTION_NAME	yuv420_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_rgb24_sseu
+#define STD_FUNCTION_NAME	yuv420_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_rgba_sse
+#define STD_FUNCTION_NAME	yuv420_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_rgba_sseu
+#define STD_FUNCTION_NAME	yuv420_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_bgra_sse
+#define STD_FUNCTION_NAME	yuv420_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_bgra_sseu
+#define STD_FUNCTION_NAME	yuv420_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_argb_sse
+#define STD_FUNCTION_NAME	yuv420_argb_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_argb_sseu
+#define STD_FUNCTION_NAME	yuv420_argb_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_abgr_sse
+#define STD_FUNCTION_NAME	yuv420_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv420_abgr_sseu
+#define STD_FUNCTION_NAME	yuv420_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_rgb565_sse
+#define STD_FUNCTION_NAME	yuv422_rgb565_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_RGB565
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_rgb565_sseu
+#define STD_FUNCTION_NAME	yuv422_rgb565_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_RGB565
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_rgb24_sse
+#define STD_FUNCTION_NAME	yuv422_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_rgb24_sseu
+#define STD_FUNCTION_NAME	yuv422_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_rgba_sse
+#define STD_FUNCTION_NAME	yuv422_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_rgba_sseu
+#define STD_FUNCTION_NAME	yuv422_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_bgra_sse
+#define STD_FUNCTION_NAME	yuv422_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_bgra_sseu
+#define STD_FUNCTION_NAME	yuv422_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_argb_sse
+#define STD_FUNCTION_NAME	yuv422_argb_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_argb_sseu
+#define STD_FUNCTION_NAME	yuv422_argb_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_abgr_sse
+#define STD_FUNCTION_NAME	yuv422_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuv422_abgr_sseu
+#define STD_FUNCTION_NAME	yuv422_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_422
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_rgb565_sse
+#define STD_FUNCTION_NAME	yuvnv12_rgb565_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_RGB565
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_rgb565_sseu
+#define STD_FUNCTION_NAME	yuvnv12_rgb565_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_RGB565
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_rgb24_sse
+#define STD_FUNCTION_NAME	yuvnv12_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_rgb24_sseu
+#define STD_FUNCTION_NAME	yuvnv12_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_rgba_sse
+#define STD_FUNCTION_NAME	yuvnv12_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_rgba_sseu
+#define STD_FUNCTION_NAME	yuvnv12_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_bgra_sse
+#define STD_FUNCTION_NAME	yuvnv12_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_bgra_sseu
+#define STD_FUNCTION_NAME	yuvnv12_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_argb_sse
+#define STD_FUNCTION_NAME	yuvnv12_argb_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_argb_sseu
+#define STD_FUNCTION_NAME	yuvnv12_argb_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_abgr_sse
+#define STD_FUNCTION_NAME	yuvnv12_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#define SSE_ALIGNED
+#include "yuv_rgb_sse_func.h"
+
+#define SSE_FUNCTION_NAME	yuvnv12_abgr_sseu
+#define STD_FUNCTION_NAME	yuvnv12_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_NV12
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#include "yuv_rgb_sse_func.h"
+
+/* UNPACK_RGB24_32 separates six registers of packed RGB24 bytes (RGB1-RGB3 and RGB4-RGB6, loaded from two image rows by RGB2YUV_32) into the per-channel registers R1/R2, G1/G2, B1/B2 by alternating the two byte-unpack steps below; RGB1-RGB6 are overwritten as scratch. */
+#define UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+R1 = _mm_unpacklo_epi8(RGB1, RGB4); \
+R2 = _mm_unpackhi_epi8(RGB1, RGB4); \
+G1 = _mm_unpacklo_epi8(RGB2, RGB5); \
+G2 = _mm_unpackhi_epi8(RGB2, RGB5); \
+B1 = _mm_unpacklo_epi8(RGB3, RGB6); \
+B2 = _mm_unpackhi_epi8(RGB3, RGB6);
+
+#define UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+RGB1 = _mm_unpacklo_epi8(R1, G2); \
+RGB2 = _mm_unpackhi_epi8(R1, G2); \
+RGB3 = _mm_unpacklo_epi8(R2, B1); \
+RGB4 = _mm_unpackhi_epi8(R2, B1); \
+RGB5 = _mm_unpacklo_epi8(G1, B2); \
+RGB6 = _mm_unpackhi_epi8(G1, B2); \
+
+#define UNPACK_RGB24_32(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
+
+#define RGB2YUV_16(R, G, B, Y, U, V) /* R, G, B: eight 16-bit lanes each; Y, U, V receive eight 16-bit results, not yet packed to bytes; reads param from the expansion site */ \
+Y = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[0][0])), \
+		_mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[0][1]))); \
+Y = _mm_add_epi16(Y, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[0][2]))); \
+Y = _mm_add_epi16(Y, _mm_set1_epi16((param->y_shift)<<PRECISION)); /* add the luma offset in fixed point */ \
+Y = _mm_srai_epi16(Y, PRECISION); /* drop the PRECISION fractional bits */ \
+U = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[1][0])), \
+		_mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[1][1]))); \
+U = _mm_add_epi16(U, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[1][2]))); \
+U = _mm_add_epi16(U, _mm_set1_epi16(128<<PRECISION)); /* centre chroma on 128 */ \
+U = _mm_srai_epi16(U, PRECISION); \
+V = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[2][0])), \
+		_mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[2][1]))); \
+V = _mm_add_epi16(V, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[2][2]))); \
+V = _mm_add_epi16(V, _mm_set1_epi16(128<<PRECISION)); /* centre chroma on 128 */ \
+V = _mm_srai_epi16(V, PRECISION);
+/* RGB2YUV_32 converts one 32x2 pixel tile: it loads 96 bytes of RGB24 from each of rgb_ptr1/rgb_ptr2, stores 32 Y bytes to each of y_ptr1/y_ptr2 and 16 U / 16 V bytes (averaged over 2x2 pixels with _mm_avg_epu8) to u_ptr/v_ptr. It expects param, LOAD_SI128 and SAVE_SI128 to be defined at the expansion site and declares its own locals, so expand it at most once per scope. */
+#define RGB2YUV_32 \
+	__m128i r1, r2, b1, b2, g1, g2; \
+	__m128i r_16, g_16, b_16; \
+	__m128i y1_16, y2_16, u1_16, u2_16, v1_16, v2_16, y, u1, u2, v1, v2, u1_tmp, u2_tmp, v1_tmp, v2_tmp; \
+	__m128i rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1)), \
+		rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+16)), \
+		rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+32)), \
+		rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2)), \
+		rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+16)), \
+		rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+32)); \
+	/* unpack rgb24 data to r, g and b data in separate channels*/ \
+	UNPACK_RGB24_32(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, r1, r2, g1, g2, b1, b2) \
+	/* process pixels of first line */ \
+	r_16 = _mm_unpacklo_epi8(r1, _mm_setzero_si128()); \
+	g_16 = _mm_unpacklo_epi8(g1, _mm_setzero_si128()); \
+	b_16 = _mm_unpacklo_epi8(b1, _mm_setzero_si128()); \
+	RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
+	r_16 = _mm_unpackhi_epi8(r1, _mm_setzero_si128()); \
+	g_16 = _mm_unpackhi_epi8(g1, _mm_setzero_si128()); \
+	b_16 = _mm_unpackhi_epi8(b1, _mm_setzero_si128()); \
+	RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
+	y = _mm_packus_epi16(y1_16, y2_16); \
+	u1 = _mm_packus_epi16(u1_16, u2_16); \
+	v1 = _mm_packus_epi16(v1_16, v2_16); \
+	/* save Y values */ \
+	SAVE_SI128((__m128i*)(y_ptr1), y); \
+	/* process pixels of second line */ \
+	r_16 = _mm_unpacklo_epi8(r2, _mm_setzero_si128()); \
+	g_16 = _mm_unpacklo_epi8(g2, _mm_setzero_si128()); \
+	b_16 = _mm_unpacklo_epi8(b2, _mm_setzero_si128()); \
+	RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
+	r_16 = _mm_unpackhi_epi8(r2, _mm_setzero_si128()); \
+	g_16 = _mm_unpackhi_epi8(g2, _mm_setzero_si128()); \
+	b_16 = _mm_unpackhi_epi8(b2, _mm_setzero_si128()); \
+	RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
+	y = _mm_packus_epi16(y1_16, y2_16); \
+	u2 = _mm_packus_epi16(u1_16, u2_16); \
+	v2 = _mm_packus_epi16(v1_16, v2_16); \
+	/* save Y values */ \
+	SAVE_SI128((__m128i*)(y_ptr2), y); \
+	/* vertical subsampling of u/v values */ \
+	u1_tmp = _mm_avg_epu8(u1, u2); \
+	v1_tmp = _mm_avg_epu8(v1, v2); \
+	/* do the same again with next data */ \
+	rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1+48)); \
+	rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+64)); \
+	rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+80)); \
+	rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2+48)); \
+	rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+64)); \
+	rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+80)); \
+	/* unpack rgb24 data to r, g and b data in separate channels*/ \
+	UNPACK_RGB24_32(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, r1, r2, g1, g2, b1, b2) \
+	/* process pixels of first line */ \
+	r_16 = _mm_unpacklo_epi8(r1, _mm_setzero_si128()); \
+	g_16 = _mm_unpacklo_epi8(g1, _mm_setzero_si128()); \
+	b_16 = _mm_unpacklo_epi8(b1, _mm_setzero_si128()); \
+	RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
+	r_16 = _mm_unpackhi_epi8(r1, _mm_setzero_si128()); \
+	g_16 = _mm_unpackhi_epi8(g1, _mm_setzero_si128()); \
+	b_16 = _mm_unpackhi_epi8(b1, _mm_setzero_si128()); \
+	RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
+	y = _mm_packus_epi16(y1_16, y2_16); \
+	u1 = _mm_packus_epi16(u1_16, u2_16); \
+	v1 = _mm_packus_epi16(v1_16, v2_16); \
+	/* save Y values */ \
+	SAVE_SI128((__m128i*)(y_ptr1+16), y); \
+	/* process pixels of second line */ \
+	r_16 = _mm_unpacklo_epi8(r2, _mm_setzero_si128()); \
+	g_16 = _mm_unpacklo_epi8(g2, _mm_setzero_si128()); \
+	b_16 = _mm_unpacklo_epi8(b2, _mm_setzero_si128()); \
+	RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
+	r_16 = _mm_unpackhi_epi8(r2, _mm_setzero_si128()); \
+	g_16 = _mm_unpackhi_epi8(g2, _mm_setzero_si128()); \
+	b_16 = _mm_unpackhi_epi8(b2, _mm_setzero_si128()); \
+	RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
+	y = _mm_packus_epi16(y1_16, y2_16); \
+	u2 = _mm_packus_epi16(u1_16, u2_16); \
+	v2 = _mm_packus_epi16(v1_16, v2_16); \
+	/* save Y values */ \
+	SAVE_SI128((__m128i*)(y_ptr2+16), y); \
+	/* vertical subsampling of u/v values */ \
+	u2_tmp = _mm_avg_epu8(u1, u2); \
+	v2_tmp = _mm_avg_epu8(v1, v2); \
+	/* horizontal subsampling of u/v values */ \
+	u1 = _mm_packus_epi16(_mm_srl_epi16(u1_tmp, _mm_cvtsi32_si128(8)), _mm_srl_epi16(u2_tmp, _mm_cvtsi32_si128(8))); \
+	v1 = _mm_packus_epi16(_mm_srl_epi16(v1_tmp, _mm_cvtsi32_si128(8)), _mm_srl_epi16(v2_tmp, _mm_cvtsi32_si128(8))); \
+	u2 = _mm_packus_epi16(_mm_and_si128(u1_tmp, _mm_set1_epi16(0xFF)), _mm_and_si128(u2_tmp, _mm_set1_epi16(0xFF))); \
+	v2 = _mm_packus_epi16(_mm_and_si128(v1_tmp, _mm_set1_epi16(0xFF)), _mm_and_si128(v2_tmp, _mm_set1_epi16(0xFF))); \
+	u1 = _mm_avg_epu8(u1, u2); \
+	v1 = _mm_avg_epu8(v1, v2); \
+	SAVE_SI128((__m128i*)(u_ptr), u1); \
+	SAVE_SI128((__m128i*)(v_ptr), v1);
+// Convert packed RGB24 to planar YUV 4:2:0 with SSE2, for 16-byte aligned buffers.
+// Uses _mm_load_si128/_mm_stream_si128, so RGB, Y, U, V and every row start (base + n*stride) must be 16-byte aligned.
+// Only complete 32x2 pixel tiles are converted: the last width%32 columns and an odd last row are left untouched.
+// The loop bounds are written as additions so that width<32 or height<2 converts nothing instead of wrapping around in uint32_t.
+void rgb24_yuv420_sse(uint32_t width, uint32_t height, 
+	const uint8_t *RGB, uint32_t RGB_stride, 
+	uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, 
+	YCbCrType yuv_type)
+{
+	#define LOAD_SI128 _mm_load_si128
+	#define SAVE_SI128 _mm_stream_si128
+	const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
+	uint32_t xpos, ypos;
+	for(ypos=0; ypos+1<height; ypos+=2)
+	{
+		const uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
+			*rgb_ptr2=RGB+(ypos+1)*RGB_stride;
+		uint8_t *y_ptr1=Y+ypos*Y_stride,
+			*y_ptr2=Y+(ypos+1)*Y_stride,
+			*u_ptr=U+(ypos/2)*UV_stride,
+			*v_ptr=V+(ypos/2)*UV_stride;
+		for(xpos=0; xpos+32<=width; xpos+=32)
+		{
+			RGB2YUV_32
+			// step to the next tile: 32 RGB24 pixels = 96 bytes, 32 Y samples, 16 U and 16 V samples
+			rgb_ptr1+=96;
+			rgb_ptr2+=96;
+			y_ptr1+=32;
+			y_ptr2+=32;
+			u_ptr+=16; 
+			v_ptr+=16;
+		}
+	}
+	#undef LOAD_SI128
+	#undef SAVE_SI128
+}
+// Convert packed RGB24 to planar YUV 4:2:0 with SSE2, for buffers of any alignment.
+// Uses the unaligned _mm_loadu_si128/_mm_storeu_si128 accessors, so no alignment is required of the pointers or strides.
+// Only complete 32x2 pixel tiles are converted: the last width%32 columns and an odd last row are left untouched.
+// The loop bounds are written as additions so that width<32 or height<2 converts nothing instead of wrapping around in uint32_t.
+void rgb24_yuv420_sseu(uint32_t width, uint32_t height, 
+	const uint8_t *RGB, uint32_t RGB_stride, 
+	uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, 
+	YCbCrType yuv_type)
+{
+	#define LOAD_SI128 _mm_loadu_si128
+	#define SAVE_SI128 _mm_storeu_si128
+	const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
+	uint32_t xpos, ypos;
+	for(ypos=0; ypos+1<height; ypos+=2)
+	{
+		const uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
+			*rgb_ptr2=RGB+(ypos+1)*RGB_stride;
+		uint8_t *y_ptr1=Y+ypos*Y_stride,
+			*y_ptr2=Y+(ypos+1)*Y_stride,
+			*u_ptr=U+(ypos/2)*UV_stride,
+			*v_ptr=V+(ypos/2)*UV_stride;
+		for(xpos=0; xpos+32<=width; xpos+=32)
+		{
+			RGB2YUV_32
+			// step to the next tile: 32 RGB24 pixels = 96 bytes, 32 Y samples, 16 U and 16 V samples
+			rgb_ptr1+=96;
+			rgb_ptr2+=96;
+			y_ptr1+=32;
+			y_ptr2+=32;
+			u_ptr+=16; 
+			v_ptr+=16;
+		}
+	}
+	#undef LOAD_SI128
+	#undef SAVE_SI128
+}
+
+
+#endif //__SSE2__
+
diff --git a/source/3rd-party/SDL2/src/video/yuv2rgb/yuv_rgb.h b/source/3rd-party/SDL2/src/video/yuv2rgb/yuv_rgb.h
new file mode 100644
index 0000000..81d97eb
--- /dev/null
+++ b/source/3rd-party/SDL2/src/video/yuv2rgb/yuv_rgb.h
@@ -0,0 +1,381 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+
+// Provide optimized functions to convert images from 8bits yuv420 to rgb24 format
+
+// There are a few slightly different variations of the YCbCr color space with different parameters that 
+// change the conversion matrix.
+// The three most common YCbCr color spaces, defined by the BT.601, BT.709 and JPEG standards, are implemented here.
+// See the respective standards for details
+// The matrix values used are derived from http://www.equasys.de/colorconversion.html
+
+// YUV420 is stored as three separate channels, with U and V (Cb and Cr) subsampled by a factor of 2.
+// For conversion from yuv to rgb, no interpolation is done, and the same UV values are used for 4 rgb pixels. This
+// is suboptimal for image quality, but by far the fastest method.
+
+// For all methods, width and height should be even; if not, the last row/column of the result image won't be affected.
+// For the rgb24_yuv420 sse methods, if the width is not divisible by 32, the last (width%32) pixels of each line won't be affected; the yuv to rgb sse methods convert those pixels with the standard c implementation.
+
+#include "SDL_stdinc.h"
+/*#include <stdint.h>*/
+
+typedef enum /* selects the YCbCr variant; the value is used as an index into the RGB2YUV / YUV2RGB parameter tables */
+{
+	YCBCR_JPEG, /* parameters of the JPEG standard */
+	YCBCR_601, /* parameters of BT.601 */
+	YCBCR_709 /* parameters of BT.709 */
+} YCbCrType;
+
+// yuv to rgb, standard c implementation
+void yuv420_rgb565_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_rgb24_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_rgba_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_bgra_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_argb_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_abgr_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_rgb565_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_rgb24_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_rgba_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_bgra_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_argb_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_abgr_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_rgb565_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_rgb24_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_rgba_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_bgra_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_argb_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_abgr_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+// yuv to rgb, sse implementation
+// pointers must be 16 byte aligned, and strides must be divisible by 16
+void yuv420_rgb565_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_rgb24_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_rgba_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_bgra_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_argb_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_abgr_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_rgb565_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_rgb24_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_rgba_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_bgra_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_argb_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_abgr_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_rgb565_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_rgb24_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_rgba_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_bgra_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_argb_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_abgr_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+// yuv to rgb, sse implementation
+// pointers do not need to be 16 byte aligned
+void yuv420_rgb565_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_rgb24_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_rgba_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_bgra_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_argb_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv420_abgr_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_rgb565_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_rgb24_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_rgba_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_bgra_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_argb_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuv422_abgr_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_rgb565_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_rgb24_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_rgba_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_bgra_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_argb_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+void yuvnv12_abgr_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	uint8_t *rgb, uint32_t rgb_stride, 
+	YCbCrType yuv_type);
+
+
+// rgb to yuv, standard c implementation
+void rgb24_yuv420_std(
+	uint32_t width, uint32_t height, 
+	const uint8_t *rgb, uint32_t rgb_stride, 
+	uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	YCbCrType yuv_type);
+
+// rgb to yuv, sse implementation
+// pointers must be 16 byte aligned, and strides must be divisible by 16
+void rgb24_yuv420_sse(
+	uint32_t width, uint32_t height, 
+	const uint8_t *rgb, uint32_t rgb_stride, 
+	uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	YCbCrType yuv_type);
+
+// rgb to yuv, sse implementation
+// pointers do not need to be 16 byte aligned
+void rgb24_yuv420_sseu(
+	uint32_t width, uint32_t height, 
+	const uint8_t *rgb, uint32_t rgb_stride, 
+	uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
+	YCbCrType yuv_type);
+
diff --git a/source/3rd-party/SDL2/src/video/yuv2rgb/yuv_rgb_sse_func.h b/source/3rd-party/SDL2/src/video/yuv2rgb/yuv_rgb_sse_func.h
new file mode 100644
index 0000000..f81140e
--- /dev/null
+++ b/source/3rd-party/SDL2/src/video/yuv2rgb/yuv_rgb_sse_func.h
@@ -0,0 +1,498 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+
+/* You need to define the following macros before including this file:
+	SSE_FUNCTION_NAME
+	STD_FUNCTION_NAME
+	YUV_FORMAT
+	RGB_FORMAT
+*/
+/* You may define the following macro, which affects generated code:
+	SSE_ALIGNED
+*/
+
+#ifdef SSE_ALIGNED
+/* Unaligned instructions seem faster, even on aligned data? The aligned load / streaming store pair is therefore kept only as the disabled block below, and both branches currently select the unaligned intrinsics. */
+/*
+#define LOAD_SI128 _mm_load_si128
+#define SAVE_SI128 _mm_stream_si128
+*/
+#define LOAD_SI128 _mm_loadu_si128
+#define SAVE_SI128 _mm_storeu_si128
+#else
+#define LOAD_SI128 _mm_loadu_si128
+#define SAVE_SI128 _mm_storeu_si128
+#endif
+/* UV2RGB_16: from 8 U and 8 V 16-bit lanes, compute the chroma terms of R (V*v_r_factor), G (U*u_g_factor + V*v_g_factor) and B (U*u_b_factor), then duplicate every lane so that R1/G1/B1 and R2/G2/B2 hold one term per pixel for 16 pixels. Needs param and the r_tmp/g_tmp/b_tmp temporaries at the expansion site. */
+#define UV2RGB_16(U,V,R1,G1,B1,R2,G2,B2) \
+	r_tmp = _mm_mullo_epi16(V, _mm_set1_epi16(param->v_r_factor)); \
+	g_tmp = _mm_add_epi16( \
+		_mm_mullo_epi16(U, _mm_set1_epi16(param->u_g_factor)), \
+		_mm_mullo_epi16(V, _mm_set1_epi16(param->v_g_factor))); \
+	b_tmp = _mm_mullo_epi16(U, _mm_set1_epi16(param->u_b_factor)); \
+	R1 = _mm_unpacklo_epi16(r_tmp, r_tmp); \
+	G1 = _mm_unpacklo_epi16(g_tmp, g_tmp); \
+	B1 = _mm_unpacklo_epi16(b_tmp, b_tmp); \
+	R2 = _mm_unpackhi_epi16(r_tmp, r_tmp); \
+	G2 = _mm_unpackhi_epi16(g_tmp, g_tmp); \
+	B2 = _mm_unpackhi_epi16(b_tmp, b_tmp); \
+
+#define ADD_Y2RGB_16(Y1,Y2,R1,G1,B1,R2,G2,B2) \
+	Y1 = _mm_mullo_epi16(_mm_sub_epi16(Y1, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \
+	Y2 = _mm_mullo_epi16(_mm_sub_epi16(Y2, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \
+	/* Y1/Y2 now hold (Y - y_shift) * y_factor and are overwritten; add them to the chroma terms in R/G/B (in place) and shift right arithmetically by PRECISION */ \
+	R1 = _mm_srai_epi16(_mm_add_epi16(R1, Y1), PRECISION); \
+	G1 = _mm_srai_epi16(_mm_add_epi16(G1, Y1), PRECISION); \
+	B1 = _mm_srai_epi16(_mm_add_epi16(B1, Y1), PRECISION); \
+	R2 = _mm_srai_epi16(_mm_add_epi16(R2, Y2), PRECISION); \
+	G2 = _mm_srai_epi16(_mm_add_epi16(G2, Y2), PRECISION); \
+	B2 = _mm_srai_epi16(_mm_add_epi16(B2, Y2), PRECISION); \
+
+#define PACK_RGB565_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4) \
+{ \
+	__m128i red_mask, tmp1, tmp2, tmp3, tmp4; \
+	/* Pack 32 pixels of 8-bit R/G/B (two vectors per channel) into four vectors of 16-bit pixels: red goes to the high byte and is masked to its top 5 bits, green is reduced to 6 bits and moved to bits 5-10, blue is reduced to 5 bits in bits 0-4 */ \
+	red_mask = _mm_set1_epi16((short)0xF800); \
+	RGB1 = _mm_and_si128(_mm_unpacklo_epi8(_mm_setzero_si128(), R1), red_mask); \
+	RGB2 = _mm_and_si128(_mm_unpackhi_epi8(_mm_setzero_si128(), R1), red_mask); \
+	RGB3 = _mm_and_si128(_mm_unpacklo_epi8(_mm_setzero_si128(), R2), red_mask); \
+	RGB4 = _mm_and_si128(_mm_unpackhi_epi8(_mm_setzero_si128(), R2), red_mask); \
+	tmp1 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G1, _mm_setzero_si128()), 2), 5); \
+	tmp2 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G1, _mm_setzero_si128()), 2), 5); \
+	tmp3 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G2, _mm_setzero_si128()), 2), 5); \
+	tmp4 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G2, _mm_setzero_si128()), 2), 5); \
+	RGB1 = _mm_or_si128(RGB1, tmp1); \
+	RGB2 = _mm_or_si128(RGB2, tmp2); \
+	RGB3 = _mm_or_si128(RGB3, tmp3); \
+	RGB4 = _mm_or_si128(RGB4, tmp4); \
+	tmp1 = _mm_srli_epi16(_mm_unpacklo_epi8(B1, _mm_setzero_si128()), 3); \
+	tmp2 = _mm_srli_epi16(_mm_unpackhi_epi8(B1, _mm_setzero_si128()), 3); \
+	tmp3 = _mm_srli_epi16(_mm_unpacklo_epi8(B2, _mm_setzero_si128()), 3); \
+	tmp4 = _mm_srli_epi16(_mm_unpackhi_epi8(B2, _mm_setzero_si128()), 3); \
+	RGB1 = _mm_or_si128(RGB1, tmp1); \
+	RGB2 = _mm_or_si128(RGB2, tmp2); \
+	RGB3 = _mm_or_si128(RGB3, tmp3); \
+	RGB4 = _mm_or_si128(RGB4, tmp4); \
+}
+/* PACK_RGB24_32: each step splits every 16-bit lane of six registers into its low and high bytes and repacks them; STEP1 reads R/G/B and writes RGB1..RGB6, STEP2 reads RGB1..RGB6 and writes back into R/G/B (clobbering them). Five chained steps leave the result in RGB1..RGB6 - presumably the byte-interleaved RGB24 layout for 32 pixels (NOTE(review): the shuffle network is not derived here). */
+#define PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+RGB1 = _mm_packus_epi16(_mm_and_si128(R1,_mm_set1_epi16(0xFF)), _mm_and_si128(R2,_mm_set1_epi16(0xFF))); \
+RGB2 = _mm_packus_epi16(_mm_and_si128(G1,_mm_set1_epi16(0xFF)), _mm_and_si128(G2,_mm_set1_epi16(0xFF))); \
+RGB3 = _mm_packus_epi16(_mm_and_si128(B1,_mm_set1_epi16(0xFF)), _mm_and_si128(B2,_mm_set1_epi16(0xFF))); \
+RGB4 = _mm_packus_epi16(_mm_srli_epi16(R1,8), _mm_srli_epi16(R2,8)); \
+RGB5 = _mm_packus_epi16(_mm_srli_epi16(G1,8), _mm_srli_epi16(G2,8)); \
+RGB6 = _mm_packus_epi16(_mm_srli_epi16(B1,8), _mm_srli_epi16(B2,8)); \
+
+#define PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+R1 = _mm_packus_epi16(_mm_and_si128(RGB1,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB2,_mm_set1_epi16(0xFF))); \
+R2 = _mm_packus_epi16(_mm_and_si128(RGB3,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB4,_mm_set1_epi16(0xFF))); \
+G1 = _mm_packus_epi16(_mm_and_si128(RGB5,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB6,_mm_set1_epi16(0xFF))); \
+G2 = _mm_packus_epi16(_mm_srli_epi16(RGB1,8), _mm_srli_epi16(RGB2,8)); \
+B1 = _mm_packus_epi16(_mm_srli_epi16(RGB3,8), _mm_srli_epi16(RGB4,8)); \
+B2 = _mm_packus_epi16(_mm_srli_epi16(RGB5,8), _mm_srli_epi16(RGB6,8)); \
+
+#define PACK_RGB24_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
+
+#define PACK_RGBA_32(R1, R2, G1, G2, B1, B2, A1, A2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, RGB7, RGB8) \
+{ \
+	__m128i lo_ab, hi_ab, lo_gr, hi_gr; \
+	/* Interleave four 8-bit channels into 4-byte pixels stored in the byte order A, B, G, R; the first 16 pixels (R1/G1/B1/A1) fill RGB1..RGB4, the next 16 (R2/G2/B2/A2) fill RGB5..RGB8 */ \
+	lo_ab = _mm_unpacklo_epi8( A1, B1 ); \
+	hi_ab = _mm_unpackhi_epi8( A1, B1 ); \
+	lo_gr = _mm_unpacklo_epi8( G1, R1 ); \
+	hi_gr = _mm_unpackhi_epi8( G1, R1 ); \
+	RGB1 = _mm_unpacklo_epi16( lo_ab, lo_gr ); \
+	RGB2 = _mm_unpackhi_epi16( lo_ab, lo_gr ); \
+	RGB3 = _mm_unpacklo_epi16( hi_ab, hi_gr ); \
+	RGB4 = _mm_unpackhi_epi16( hi_ab, hi_gr ); \
+\
+	lo_ab = _mm_unpacklo_epi8( A2, B2 ); \
+	hi_ab = _mm_unpackhi_epi8( A2, B2 ); \
+	lo_gr = _mm_unpacklo_epi8( G2, R2 ); \
+	hi_gr = _mm_unpackhi_epi8( G2, R2 ); \
+	RGB5 = _mm_unpacklo_epi16( lo_ab, lo_gr ); \
+	RGB6 = _mm_unpackhi_epi16( lo_ab, lo_gr ); \
+	RGB7 = _mm_unpacklo_epi16( hi_ab, hi_gr ); \
+	RGB8 = _mm_unpackhi_epi16( hi_ab, hi_gr ); \
+}
+
+#if RGB_FORMAT == RGB_FORMAT_RGB565
+/* PACK_PIXEL declares the rgb_N output vectors and fills them from the 8-bit channel vectors r_8_xy/g_8_xy/b_8_xy: the first row (.._11, .._12) goes to the lower-numbered half, the second row (.._21, .._22) to the upper half. The 32-bit formats all reuse PACK_RGBA_32 with permuted arguments and a constant 0xFF alpha. */
+#define PACK_PIXEL \
+	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
+	\
+	PACK_RGB565_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4) \
+	\
+	PACK_RGB565_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_5, rgb_6, rgb_7, rgb_8) \
+
+#elif RGB_FORMAT == RGB_FORMAT_RGB24
+
+#define PACK_PIXEL \
+	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6; \
+	__m128i rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12; \
+	\
+	PACK_RGB24_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6) \
+	\
+	PACK_RGB24_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12) \
+
+#elif RGB_FORMAT == RGB_FORMAT_RGBA
+
+#define PACK_PIXEL \
+	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
+	__m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
+	__m128i a = _mm_set1_epi8((char)0xFF); \
+	\
+	PACK_RGBA_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
+	\
+	PACK_RGBA_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
+
+#elif RGB_FORMAT == RGB_FORMAT_BGRA
+
+#define PACK_PIXEL \
+	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
+	__m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
+	__m128i a = _mm_set1_epi8((char)0xFF); \
+	\
+	PACK_RGBA_32(b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
+	\
+	PACK_RGBA_32(b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
+
+#elif RGB_FORMAT == RGB_FORMAT_ARGB
+
+#define PACK_PIXEL \
+	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
+	__m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
+	__m128i a = _mm_set1_epi8((char)0xFF); \
+	\
+	PACK_RGBA_32(a, a, r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
+	\
+	PACK_RGBA_32(a, a, r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
+
+#elif RGB_FORMAT == RGB_FORMAT_ABGR
+
+#define PACK_PIXEL \
+	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
+	__m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
+	__m128i a = _mm_set1_epi8((char)0xFF); \
+	\
+	PACK_RGBA_32(a, a, b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
+	\
+	PACK_RGBA_32(a, a, b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
+
+#else
+#error PACK_PIXEL unimplemented
+#endif
+
+#if RGB_FORMAT == RGB_FORMAT_RGB565
+/* SAVE_LINE1 / SAVE_LINE2 store the vectors produced by PACK_PIXEL for the first / second row of the 32-pixel group at rgb_ptr1 / rgb_ptr2: 64 bytes per row for RGB565, 96 for RGB24 and 128 for the 32-bit formats. */
+#define SAVE_LINE1 \
+	SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
+
+#define SAVE_LINE2 \
+	SAVE_SI128((__m128i*)(rgb_ptr2), rgb_5); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_6); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_7); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_8); \
+
+#elif RGB_FORMAT == RGB_FORMAT_RGB24
+
+#define SAVE_LINE1 \
+	SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6); \
+
+#define SAVE_LINE2 \
+	SAVE_SI128((__m128i*)(rgb_ptr2), rgb_7); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_8); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_9); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_10); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_11); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_12); \
+
+#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
+      RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR
+
+#define SAVE_LINE1 \
+	SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+96), rgb_7); \
+	SAVE_SI128((__m128i*)(rgb_ptr1+112), rgb_8); \
+
+#define SAVE_LINE2 \
+	SAVE_SI128((__m128i*)(rgb_ptr2), rgb_9); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_10); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_11); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_12); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_13); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_14); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+96), rgb_15); \
+	SAVE_SI128((__m128i*)(rgb_ptr2+112), rgb_16); \
+
+#else
+#error SAVE_LINE unimplemented
+#endif
+
+#if YUV_FORMAT == YUV_FORMAT_420
+/* READ_Y(y_ptr) loads 16 luma bytes into y; READ_UV loads 16 U and 16 V bytes into u and v (y, u, v, u_ptr and v_ptr come from the expansion site). 420: samples are contiguous. 422: keeps the low byte of every 16-bit lane for Y (32 bytes read) and of every 32-bit lane for U/V (64 bytes read each). NV12: Y is contiguous, U/V keep the low byte of every 16-bit lane (32 bytes read each). */
+#define READ_Y(y_ptr) \
+	y = LOAD_SI128((const __m128i*)(y_ptr)); \
+
+#define READ_UV	\
+	u = LOAD_SI128((const __m128i*)(u_ptr)); \
+	v = LOAD_SI128((const __m128i*)(v_ptr)); \
+
+#elif YUV_FORMAT == YUV_FORMAT_422
+
+#define READ_Y(y_ptr) \
+{ \
+	__m128i y1, y2; \
+	y1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(y_ptr)), 8), 8); \
+	y2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(y_ptr+16)), 8), 8); \
+	y = _mm_packus_epi16(y1, y2); \
+}
+
+#define READ_UV	\
+{ \
+	__m128i u1, u2, u3, u4, v1, v2, v3, v4; \
+	u1 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr)), 24), 24); \
+	u2 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+16)), 24), 24); \
+	u3 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+32)), 24), 24); \
+	u4 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+48)), 24), 24); \
+	u = _mm_packus_epi16(_mm_packs_epi32(u1, u2), _mm_packs_epi32(u3, u4)); \
+	v1 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr)), 24), 24); \
+	v2 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+16)), 24), 24); \
+	v3 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+32)), 24), 24); \
+	v4 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+48)), 24), 24); \
+	v = _mm_packus_epi16(_mm_packs_epi32(v1, v2), _mm_packs_epi32(v3, v4)); \
+}
+
+#elif YUV_FORMAT == YUV_FORMAT_NV12
+
+#define READ_Y(y_ptr) \
+	y = LOAD_SI128((const __m128i*)(y_ptr)); \
+
+#define READ_UV	\
+{ \
+	__m128i u1, u2, v1, v2; \
+	u1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(u_ptr)), 8), 8); \
+	u2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(u_ptr+16)), 8), 8); \
+	u = _mm_packus_epi16(u1, u2); \
+	v1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(v_ptr)), 8), 8); \
+	v2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(v_ptr+16)), 8), 8); \
+	v = _mm_packus_epi16(v1, v2); \
+}
+
+#else
+#error READ_UV unimplemented
+#endif
+/* YUV2RGB_32: convert 32 pixels of the rows at y_ptr1 and y_ptr2. Reads 16 U and 16 V samples once (READ_UV); for each half of 16 pixels the chroma terms are computed once and reused for both rows. Results are saturated to 8 bits into r_8_RH / g_8_RH / b_8_RH (R = row 1|2, H = half 1|2). Declares its own locals; needs param, y_ptr1, y_ptr2, u_ptr, v_ptr and y_pixel_stride at the expansion site. Both rows are always read. */
+#define YUV2RGB_32 \
+	__m128i r_tmp, g_tmp, b_tmp; \
+	__m128i r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2; \
+	__m128i r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2; \
+	__m128i y_16_1, y_16_2; \
+	__m128i y, u, v, u_16, v_16; \
+    __m128i r_8_11, g_8_11, b_8_11, r_8_21, g_8_21, b_8_21; \
+    __m128i r_8_12, g_8_12, b_8_12, r_8_22, g_8_22, b_8_22; \
+	\
+	READ_UV \
+	\
+	/* process first 16 pixels of first line */\
+	u_16 = _mm_unpacklo_epi8(u, _mm_setzero_si128()); \
+	v_16 = _mm_unpacklo_epi8(v, _mm_setzero_si128()); \
+	u_16 = _mm_add_epi16(u_16, _mm_set1_epi16(-128)); \
+	v_16 = _mm_add_epi16(v_16, _mm_set1_epi16(-128)); \
+	\
+	UV2RGB_16(u_16, v_16, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
+	r_uv_16_1=r_16_1; g_uv_16_1=g_16_1; b_uv_16_1=b_16_1; \
+	r_uv_16_2=r_16_2; g_uv_16_2=g_16_2; b_uv_16_2=b_16_2; \
+	\
+	READ_Y(y_ptr1) \
+	y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
+	y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
+	\
+	ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
+	\
+	r_8_11 = _mm_packus_epi16(r_16_1, r_16_2); \
+	g_8_11 = _mm_packus_epi16(g_16_1, g_16_2); \
+	b_8_11 = _mm_packus_epi16(b_16_1, b_16_2); \
+	\
+	/* process first 16 pixels of second line */\
+	r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \
+	r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \
+	\
+	READ_Y(y_ptr2) \
+	y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
+	y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
+	\
+	ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
+	\
+	r_8_21 = _mm_packus_epi16(r_16_1, r_16_2); \
+	g_8_21 = _mm_packus_epi16(g_16_1, g_16_2); \
+	b_8_21 = _mm_packus_epi16(b_16_1, b_16_2); \
+	\
+	/* process last 16 pixels of first line */\
+	u_16 = _mm_unpackhi_epi8(u, _mm_setzero_si128()); \
+	v_16 = _mm_unpackhi_epi8(v, _mm_setzero_si128()); \
+	u_16 = _mm_add_epi16(u_16, _mm_set1_epi16(-128)); \
+	v_16 = _mm_add_epi16(v_16, _mm_set1_epi16(-128)); \
+	\
+	UV2RGB_16(u_16, v_16, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
+	r_uv_16_1=r_16_1; g_uv_16_1=g_16_1; b_uv_16_1=b_16_1; \
+	r_uv_16_2=r_16_2; g_uv_16_2=g_16_2; b_uv_16_2=b_16_2; \
+	\
+	READ_Y(y_ptr1+16*y_pixel_stride) \
+	y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
+	y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
+	\
+	ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
+	\
+	r_8_12 = _mm_packus_epi16(r_16_1, r_16_2); \
+	g_8_12 = _mm_packus_epi16(g_16_1, g_16_2); \
+	b_8_12 = _mm_packus_epi16(b_16_1, b_16_2); \
+	\
+	/* process last 16 pixels of second line */\
+	r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \
+	r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \
+	\
+	READ_Y(y_ptr2+16*y_pixel_stride) \
+	y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
+	y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
+	\
+	ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
+	\
+	r_8_22 = _mm_packus_epi16(r_16_1, r_16_2); \
+	g_8_22 = _mm_packus_epi16(g_16_1, g_16_2); \
+	b_8_22 = _mm_packus_epi16(b_16_1, b_16_2); \
+	\
+
+/* Template body: convert a YUV image (layout chosen by YUV_FORMAT) to RGB (layout chosen by RGB_FORMAT) with SSE2, 32 pixels per step; the width%32 right-hand columns and an odd last row are delegated to STD_FUNCTION_NAME. */
+void SSE_FUNCTION_NAME(uint32_t width, uint32_t height, 
+	const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, 
+	uint8_t *RGB, uint32_t RGB_stride, 
+	YCbCrType yuv_type)
+{
+	const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]); /* coefficients of the selected YCbCr variant */
+#if YUV_FORMAT == YUV_FORMAT_420
+	const int y_pixel_stride = 1; /* bytes between consecutive Y samples */
+	const int uv_pixel_stride = 1; /* bytes between consecutive U (or V) samples */
+	const int uv_x_sample_interval = 2; /* pixels sharing one chroma sample horizontally */
+	const int uv_y_sample_interval = 2; /* rows sharing one chroma row */
+#elif YUV_FORMAT == YUV_FORMAT_422
+	const int y_pixel_stride = 2;
+	const int uv_pixel_stride = 4;
+	const int uv_x_sample_interval = 2;
+	const int uv_y_sample_interval = 1;
+#elif YUV_FORMAT == YUV_FORMAT_NV12
+	const int y_pixel_stride = 1;
+	const int uv_pixel_stride = 2;
+	const int uv_x_sample_interval = 2;
+	const int uv_y_sample_interval = 2;
+#endif
+#if RGB_FORMAT == RGB_FORMAT_RGB565
+	const int rgb_pixel_stride = 2; /* bytes per output pixel */
+#elif RGB_FORMAT == RGB_FORMAT_RGB24
+	const int rgb_pixel_stride = 3;
+#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
+      RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR
+	const int rgb_pixel_stride = 4;
+#else
+#error Unknown RGB pixel size
+#endif
+
+	if (width >= 32) { /* this guard also keeps the unsigned width-31 below from wrapping around */
+		uint32_t xpos, ypos;
+		for(ypos=0; ypos+(uv_y_sample_interval-1)<height; ypos+=uv_y_sample_interval) /* written without a subtraction so that height==0 cannot wrap around */
+		{
+			const uint8_t *y_ptr1=Y+ypos*Y_stride,
+				*y_ptr2=Y+(ypos+(uv_y_sample_interval-1))*Y_stride, /* second row of the pair; aliases the current row when rows are processed singly, because YUV2RGB_32 always reads it and must not run past the last row */
+				*u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride,
+				*v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride;
+			
+			uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
+				*rgb_ptr2=RGB+(ypos+(uv_y_sample_interval-1))*RGB_stride; /* only written when a second row exists (SAVE_LINE2 below) */
+			
+			for(xpos=0; xpos<(width-31); xpos+=32)
+			{
+				YUV2RGB_32
+				{
+					PACK_PIXEL
+					SAVE_LINE1
+					if (uv_y_sample_interval > 1)
+					{
+						SAVE_LINE2
+					}
+				}
+
+				y_ptr1+=32*y_pixel_stride;
+				y_ptr2+=32*y_pixel_stride;
+				u_ptr+=32*uv_pixel_stride/uv_x_sample_interval;
+				v_ptr+=32*uv_pixel_stride/uv_x_sample_interval;
+				rgb_ptr1+=32*rgb_pixel_stride;
+				rgb_ptr2+=32*rgb_pixel_stride;
+			}
+		}
+
+		/* Catch the last line, if needed: with an odd height the paired loop stops at ypos == height-1 */
+		if (uv_y_sample_interval == 2 && ypos == (height-1))
+		{
+			const uint8_t *y_ptr=Y+ypos*Y_stride,
+				*u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride,
+				*v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride;
+			
+			uint8_t *rgb_ptr=RGB+ypos*RGB_stride;
+
+			STD_FUNCTION_NAME(width, 1, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
+		}
+	}
+
+	/* Catch the right column, if needed (this is the whole image when width < 32) */
+	{
+		uint32_t converted = (width & ~31u); /* number of columns handled by the SSE loop */
+		if (converted != width)
+		{
+			const uint8_t *y_ptr=Y+converted*y_pixel_stride,
+				*u_ptr=U+converted*uv_pixel_stride/uv_x_sample_interval,
+				*v_ptr=V+converted*uv_pixel_stride/uv_x_sample_interval;
+			
+			uint8_t *rgb_ptr=RGB+converted*rgb_pixel_stride;
+
+			STD_FUNCTION_NAME(width-converted, height, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
+		}
+	}
+}
+
+#undef SSE_FUNCTION_NAME
+#undef STD_FUNCTION_NAME
+#undef YUV_FORMAT
+#undef RGB_FORMAT
+#undef SSE_ALIGNED
+#undef LOAD_SI128
+#undef SAVE_SI128
+#undef UV2RGB_16
+#undef ADD_Y2RGB_16
+#undef PACK_RGB24_32_STEP1
+#undef PACK_RGB24_32_STEP2
+#undef PACK_RGB24_32
+#undef PACK_RGBA_32
+#undef PACK_PIXEL
+#undef SAVE_LINE1
+#undef SAVE_LINE2
+#undef READ_Y
+#undef READ_UV
+#undef YUV2RGB_32
diff --git a/source/3rd-party/SDL2/src/video/yuv2rgb/yuv_rgb_std_func.h b/source/3rd-party/SDL2/src/video/yuv2rgb/yuv_rgb_std_func.h
new file mode 100644
index 0000000..f0ab5c6
--- /dev/null
+++ b/source/3rd-party/SDL2/src/video/yuv2rgb/yuv_rgb_std_func.h
@@ -0,0 +1,228 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+
+/* You need to define the following macros before including this file:
+	STD_FUNCTION_NAME
+	YUV_FORMAT
+	RGB_FORMAT
+*/
+
+#if RGB_FORMAT == RGB_FORMAT_RGB565 /* PACK_PIXEL(rgb_ptr): store one RGB_FORMAT pixel at rgb_ptr, then step rgb_ptr past it */
+/* 16-bit word: R masked into bits 15..11, G into bits 10..5, B shifted down into the low bits (clampU8 is assumed to return 0..255) */
+#define PACK_PIXEL(rgb_ptr) do { \
+	*(Uint16 *)(rgb_ptr) = \
+		((((Uint16)clampU8(y_tmp+r_tmp)) << 8 ) & 0xF800) | \
+		((((Uint16)clampU8(y_tmp+g_tmp)) << 3) & 0x07E0) | \
+		(((Uint16)clampU8(y_tmp+b_tmp)) >> 3); \
+	(rgb_ptr) += 2; \
+} while (0)
+#elif RGB_FORMAT == RGB_FORMAT_RGB24
+/* three bytes, stored in the order R, G, B */
+#define PACK_PIXEL(rgb_ptr) do { \
+	(rgb_ptr)[0] = clampU8(y_tmp+r_tmp); \
+	(rgb_ptr)[1] = clampU8(y_tmp+g_tmp); \
+	(rgb_ptr)[2] = clampU8(y_tmp+b_tmp); \
+	(rgb_ptr) += 3; \
+} while (0)
+#elif RGB_FORMAT == RGB_FORMAT_RGBA
+/* 32-bit word: R << 24, G << 16, B << 8, low byte set to 0xFF (the A of RGBA) */
+#define PACK_PIXEL(rgb_ptr) do { \
+	*(Uint32 *)(rgb_ptr) = \
+		(((Uint32)clampU8(y_tmp+r_tmp)) << 24) | \
+		(((Uint32)clampU8(y_tmp+g_tmp)) << 16) | \
+		(((Uint32)clampU8(y_tmp+b_tmp)) << 8) | \
+		0x000000FF; \
+	(rgb_ptr) += 4; \
+} while (0)
+#elif RGB_FORMAT == RGB_FORMAT_BGRA
+/* 32-bit word: B << 24, G << 16, R << 8, low byte set to 0xFF (the A of BGRA) */
+#define PACK_PIXEL(rgb_ptr) do { \
+	*(Uint32 *)(rgb_ptr) = \
+		(((Uint32)clampU8(y_tmp+b_tmp)) << 24) | \
+		(((Uint32)clampU8(y_tmp+g_tmp)) << 16) | \
+		(((Uint32)clampU8(y_tmp+r_tmp)) << 8) | \
+		0x000000FF; \
+	(rgb_ptr) += 4; \
+} while (0)
+#elif RGB_FORMAT == RGB_FORMAT_ARGB
+/* 32-bit word: top byte set to 0xFF (the A of ARGB), R << 16, G << 8, B in the low byte */
+#define PACK_PIXEL(rgb_ptr) do { \
+	*(Uint32 *)(rgb_ptr) = \
+		0xFF000000 | \
+		(((Uint32)clampU8(y_tmp+r_tmp)) << 16) | \
+		(((Uint32)clampU8(y_tmp+g_tmp)) << 8) | \
+		(((Uint32)clampU8(y_tmp+b_tmp)) << 0); \
+	(rgb_ptr) += 4; \
+} while (0)
+#elif RGB_FORMAT == RGB_FORMAT_ABGR
+/* 32-bit word: top byte set to 0xFF (the A of ABGR), B << 16, G << 8, R in the low byte */
+#define PACK_PIXEL(rgb_ptr) do { \
+	*(Uint32 *)(rgb_ptr) = \
+		0xFF000000 | \
+		(((Uint32)clampU8(y_tmp+b_tmp)) << 16) | \
+		(((Uint32)clampU8(y_tmp+g_tmp)) << 8) | \
+		(((Uint32)clampU8(y_tmp+r_tmp)) << 0); \
+	(rgb_ptr) += 4; \
+} while (0)
+#else
+#error PACK_PIXEL unimplemented
+#endif /* every variant reads y_tmp, r_tmp, g_tmp and b_tmp from the expansion site and expands to exactly one statement */
+
+
+void STD_FUNCTION_NAME( /* Plain C (no SIMD) conversion of one YUV image, laid out per YUV_FORMAT, into RGB_FORMAT pixels */
+	uint32_t width, uint32_t height, /* image size in pixels; a zero in either dimension converts nothing */
+	const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, /* first sample of each component and byte offsets between rows */
+	uint8_t *RGB, uint32_t RGB_stride, /* destination buffer and byte offset between its rows */
+	YCbCrType yuv_type) /* index into the YUV2RGB[] parameter table */
+{
+	const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
+#if YUV_FORMAT == YUV_FORMAT_420
+	#define y_pixel_stride 1 /* bytes from one Y sample to the next */
+	#define uv_pixel_stride 1 /* bytes from one U (or V) sample to the next */
+	#define uv_x_sample_interval 2 /* pixel columns sharing one U/V sample */
+	#define uv_y_sample_interval 2 /* pixel rows sharing one U/V row */
+#elif YUV_FORMAT == YUV_FORMAT_422
+	#define y_pixel_stride 2
+	#define uv_pixel_stride 4
+	#define uv_x_sample_interval 2
+	#define uv_y_sample_interval 1
+#elif YUV_FORMAT == YUV_FORMAT_NV12
+	#define y_pixel_stride 1
+	#define uv_pixel_stride 2
+	#define uv_x_sample_interval 2
+	#define uv_y_sample_interval 2
+#endif
+	/* Bounds are tested as pos+k < size, never pos < size-k: the sizes are unsigned, so size-k would wrap for a zero width or height */
+	uint32_t x, y;
+	for(y=0; (y+(uv_y_sample_interval-1))<height; y+=uv_y_sample_interval)
+	{
+		const uint8_t *y_ptr1=Y+y*Y_stride,
+			*y_ptr2=Y+(y+1)*Y_stride,
+			*u_ptr=U+(y/uv_y_sample_interval)*UV_stride,
+			*v_ptr=V+(y/uv_y_sample_interval)*UV_stride;
+
+		uint8_t *rgb_ptr1=RGB+y*RGB_stride;
+
+		#if uv_y_sample_interval > 1
+		uint8_t *rgb_ptr2=RGB+(y+1)*RGB_stride;
+		#endif
+
+		for(x=0; (x+(uv_x_sample_interval-1))<width; x+=uv_x_sample_interval)
+		{
+			// Compute U and V contributions, common to every pixel sharing this chroma sample
+
+			int32_t u_tmp = ((*u_ptr)-128);
+			int32_t v_tmp = ((*v_ptr)-128);
+
+			int32_t r_tmp = (v_tmp*param->v_r_factor);
+			int32_t g_tmp = (u_tmp*param->u_g_factor + v_tmp*param->v_g_factor);
+			int32_t b_tmp = (u_tmp*param->u_b_factor);
+
+			// Compute the Y contribution for each pixel
+
+			int32_t y_tmp = ((y_ptr1[0]-param->y_shift)*param->y_factor);
+			PACK_PIXEL(rgb_ptr1);
+
+			y_tmp = ((y_ptr1[y_pixel_stride]-param->y_shift)*param->y_factor);
+			PACK_PIXEL(rgb_ptr1);
+
+			#if uv_y_sample_interval > 1
+			y_tmp = ((y_ptr2[0]-param->y_shift)*param->y_factor);
+			PACK_PIXEL(rgb_ptr2);
+
+			y_tmp = ((y_ptr2[y_pixel_stride]-param->y_shift)*param->y_factor);
+			PACK_PIXEL(rgb_ptr2);
+			#endif
+
+			y_ptr1+=2*y_pixel_stride;
+			y_ptr2+=2*y_pixel_stride;
+			u_ptr+=2*uv_pixel_stride/uv_x_sample_interval;
+			v_ptr+=2*uv_pixel_stride/uv_x_sample_interval;
+		}
+
+		/* Odd width: convert the last column, which the pairwise loop above left out */
+		if (uv_x_sample_interval == 2 && x == (width-1))
+		{
+			// Compute U and V contributions for this single column
+
+			int32_t u_tmp = ((*u_ptr)-128);
+			int32_t v_tmp = ((*v_ptr)-128);
+
+			int32_t r_tmp = (v_tmp*param->v_r_factor);
+			int32_t g_tmp = (u_tmp*param->u_g_factor + v_tmp*param->v_g_factor);
+			int32_t b_tmp = (u_tmp*param->u_b_factor);
+
+			// Compute the Y contribution for each pixel
+
+			int32_t y_tmp = ((y_ptr1[0]-param->y_shift)*param->y_factor);
+			PACK_PIXEL(rgb_ptr1);
+
+			#if uv_y_sample_interval > 1
+			y_tmp = ((y_ptr2[0]-param->y_shift)*param->y_factor);
+			PACK_PIXEL(rgb_ptr2);
+			#endif
+		}
+	}
+
+	/* Odd height with two rows per chroma row: convert the last row, which the pairwise loop above left out */
+	if (uv_y_sample_interval == 2 && y == (height-1))
+	{
+		const uint8_t *y_ptr1=Y+y*Y_stride,
+			*u_ptr=U+(y/uv_y_sample_interval)*UV_stride,
+			*v_ptr=V+(y/uv_y_sample_interval)*UV_stride;
+
+		uint8_t *rgb_ptr1=RGB+y*RGB_stride;
+
+		for(x=0; (x+(uv_x_sample_interval-1))<width; x+=uv_x_sample_interval)
+		{
+			// Compute U and V contributions, common to the two pixels of this row
+
+			int32_t u_tmp = ((*u_ptr)-128);
+			int32_t v_tmp = ((*v_ptr)-128);
+
+			int32_t r_tmp = (v_tmp*param->v_r_factor);
+			int32_t g_tmp = (u_tmp*param->u_g_factor + v_tmp*param->v_g_factor);
+			int32_t b_tmp = (u_tmp*param->u_b_factor);
+
+			// Compute the Y contribution for each pixel
+
+			int32_t y_tmp = ((y_ptr1[0]-param->y_shift)*param->y_factor);
+			PACK_PIXEL(rgb_ptr1);
+
+			y_tmp = ((y_ptr1[y_pixel_stride]-param->y_shift)*param->y_factor);
+			PACK_PIXEL(rgb_ptr1);
+
+			y_ptr1+=2*y_pixel_stride;
+			u_ptr+=2*uv_pixel_stride/uv_x_sample_interval;
+			v_ptr+=2*uv_pixel_stride/uv_x_sample_interval;
+		}
+
+		/* Odd width as well: convert the single pixel in the bottom-right corner */
+		if (uv_x_sample_interval == 2 && x == (width-1))
+		{
+			// Compute U and V contributions for this single pixel
+
+			int32_t u_tmp = ((*u_ptr)-128);
+			int32_t v_tmp = ((*v_ptr)-128);
+
+			int32_t r_tmp = (v_tmp*param->v_r_factor);
+			int32_t g_tmp = (u_tmp*param->u_g_factor + v_tmp*param->v_g_factor);
+			int32_t b_tmp = (u_tmp*param->u_b_factor);
+
+			// Compute the Y contribution for the pixel
+
+			int32_t y_tmp = ((y_ptr1[0]-param->y_shift)*param->y_factor);
+			PACK_PIXEL(rgb_ptr1);
+		}
+	}
+
+	#undef y_pixel_stride
+	#undef uv_pixel_stride
+	#undef uv_x_sample_interval
+	#undef uv_y_sample_interval
+}
+
+#undef STD_FUNCTION_NAME /* Release the three configuration macros named in the header comment, plus the PACK_PIXEL helper defined in this file */
+#undef YUV_FORMAT
+#undef RGB_FORMAT
+#undef PACK_PIXEL