Milestone 5: deliver embedded RDP sessions and lifecycle hardening

2026-03-03 18:59:26 -07:00
parent 230a401386
commit 36006bd4aa
2941 changed files with 724359 additions and 77 deletions
@@ -0,0 +1,104 @@
+# primitives
+
+# Portable C implementations and their headers. These are always part of the
+# freerdp-primitives object library.
+set(PRIMITIVES_SRCS
+    prim_add.c
+    prim_add.h
+    prim_andor.c
+    prim_andor.h
+    prim_alphaComp.c
+    prim_alphaComp.h
+    prim_colors.c
+    prim_colors.h
+    prim_copy.c
+    prim_copy.h
+    prim_set.c
+    prim_set.h
+    prim_shift.c
+    prim_shift.h
+    prim_sign.c
+    prim_sign.h
+    prim_YUV.c
+    prim_YUV.h
+    prim_YCoCg.c
+    prim_YCoCg.h
+    primitives.c
+    prim_internal.h
+)
+
+# Optimized sources, one list per instruction-set level. Each list is later
+# handed to set_simd_source_file_properties() together with the matching level
+# name. Note that the "SSE3" list also carries the *_sse2.c files and the shared
+# SSE/AVX helper headers.
+set(PRIMITIVES_SSE3_SRCS
+    sse/prim_avxsse.h
+    sse/prim_templates.h
+    sse/prim_colors_sse2.c
+    sse/prim_set_sse2.c
+    sse/prim_add_sse3.c
+    sse/prim_alphaComp_sse3.c
+    sse/prim_andor_sse3.c
+    sse/prim_shift_sse3.c
+)
+
+set(PRIMITIVES_SSSE3_SRCS sse/prim_sign_ssse3.c sse/prim_YCoCg_ssse3.c)
+
+set(PRIMITIVES_SSE4_1_SRCS sse/prim_copy_sse4_1.c sse/prim_YUV_sse4.1.c)
+
+# Intentionally empty: there are currently no SSE4.2 specific sources.
+set(PRIMITIVES_SSE4_2_SRCS)
+
+# Only added to the build when WITH_AVX2 is enabled.
+set(PRIMITIVES_AVX2_SRCS sse/prim_copy_avx2.c)
+
+set(PRIMITIVES_NEON_SRCS neon/prim_colors_neon.c neon/prim_YCoCg_neon.c neon/prim_YUV_neon.c)
+
+# The OpenCL C wrapper is always listed; the kernel file and the generated
+# header are appended when WITH_OPENCL is enabled.
+set(PRIMITIVES_OPENCL_SRCS opencl/prim_YUV_opencl.c)
+
+# Optional OpenCL backend: embed the kernel source into a generated header and
+# register the OpenCL include paths / library with the FreeRDP build helpers.
+if(WITH_OPENCL)
+  include(WarnUnmaintained)
+  warn_unmaintained("OpenCL support for primitives" "-DWITH_OPENCL=OFF")
+
+  # The .cl kernel is listed as a source but marked HEADER_FILE_ONLY so that
+  # it is never compiled.
+  # NOTE(review): FILENAME and FILEDATA are presumably referenced by
+  # opencl/primitives.h.in (configured with @ONLY below) - do not rename them
+  # without checking the template.
+  set(FILENAME "opencl/primitives.cl")
+  set_source_files_properties(${FILENAME} PROPERTIES HEADER_FILE_ONLY ON)
+  list(APPEND PRIMITIVES_OPENCL_SRCS ${FILENAME})
+
+  # Project helper; presumably stores the file content as a hex array in
+  # FILEDATA - confirm against ConvertFileToHexArray.cmake.
+  include(ConvertFileToHexArray)
+  file_to_hex_array(${FILENAME} FILEDATA)
+
+  # The generated header lives in the binary directory, never the source tree.
+  set(HDR_FILE "${CMAKE_CURRENT_BINARY_DIR}/opencl/primitives-opencl-program.h")
+  cleaning_configure_file("${CMAKE_CURRENT_SOURCE_DIR}/opencl/primitives.h.in" ${HDR_FILE} @ONLY)
+  list(APPEND PRIMITIVES_OPENCL_SRCS ${HDR_FILE})
+
+  # Make the generated header reachable for sources in this directory.
+  include_directories(${CMAKE_CURRENT_BINARY_DIR}/opencl)
+  freerdp_include_directory_add(${OpenCL_INCLUDE_DIRS})
+  freerdp_library_add(OpenCL::OpenCL)
+  freerdp_pc_add_requires_private("OpenCL")
+endif()
+
+# Collect every optimized source list (same order as before: NEON first, then
+# the SSE levels, then OpenCL). Empty lists contribute nothing.
+set(PRIMITIVES_OPT_SRCS)
+foreach(PRIMITIVES_ISA NEON SSE3 SSSE3 SSE4_1 SSE4_2 OPENCL)
+  list(APPEND PRIMITIVES_OPT_SRCS ${PRIMITIVES_${PRIMITIVES_ISA}_SRCS})
+endforeach()
+
+# AVX2 sources are opt-in.
+if(WITH_AVX2)
+  list(APPEND PRIMITIVES_OPT_SRCS ${PRIMITIVES_AVX2_SRCS})
+endif()
+
+# The library is built from the generic sources followed by the optimized ones.
+list(APPEND PRIMITIVES_SRCS ${PRIMITIVES_OPT_SRCS})
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+add_library(freerdp-primitives OBJECT ${PRIMITIVES_SRCS})
+
+# Apply per-file SIMD settings. set_simd_source_file_properties() is a project
+# helper (expected to come from the modules included here); each call passes
+# an instruction-set name followed by the files it applies to.
+include(CompilerDetect)
+include(DetectIntrinsicSupport)
+if(WITH_SIMD)
+  set_simd_source_file_properties("sse3" ${PRIMITIVES_SSE3_SRCS})
+  set_simd_source_file_properties("ssse3" ${PRIMITIVES_SSSE3_SRCS})
+  set_simd_source_file_properties("sse4.1" ${PRIMITIVES_SSE4_1_SRCS})
+  # PRIMITIVES_SSE4_2_SRCS is empty, so this call passes no files.
+  set_simd_source_file_properties("sse4.2" ${PRIMITIVES_SSE4_2_SRCS})
+  set_simd_source_file_properties("avx2" ${PRIMITIVES_AVX2_SRCS})
+  # NOTE(review): this passes the complete optimized list (SSE, OpenCL and,
+  # with WITH_AVX2, AVX2 files included), not just PRIMITIVES_NEON_SRCS.
+  # Confirm against the helper whether that is intended.
+  set_simd_source_file_properties("neon" ${PRIMITIVES_OPT_SRCS})
+endif()
+
+# Register the object library with the FreeRDP build helpers.
+freerdp_object_library_add(freerdp-primitives)
+
+# Optional benchmark tool.
+if(BUILD_BENCHMARK)
+  add_subdirectory(benchmark)
+endif()
+
+# Optional internal unit tests.
+if(BUILD_TESTING_INTERNAL)
+  add_subdirectory(test)
+endif()
@@ -0,0 +1,101 @@
+The Primitives Library
+
+Introduction
+------------
+The purpose of the primitives library is to give the freerdp code easy
+access to *run-time* optimization via SIMD operations.  When the library
+is initialized, dynamic checks of processor features are run (such as
+the support of SSE3 or Neon), and entrypoints are linked to through
+function pointers to provide the fastest possible operations.  All
+routines offer generic C alternatives as fallbacks.
+
+Run-time optimization has the advantage of allowing a single executable
+to run fast on multiple platforms with different SIMD capabilities.
+
+
+Use In Code
+-----------
+A singleton pointing to a structure containing the function pointers
+is accessed through primitives_get().   The function pointers can then
+be used from that structure, e.g.
+
+    primitives_t *prims = primitives_get();
+    prims->shiftC_16s(buffer, shifts, buffer, 256);
+
+Of course, there is some overhead in calling through the function pointer
+and setting up the SIMD operations, so it would be counterproductive to
+call the primitives library for very small operations, e.g. initializing an
+array of eight values to a constant.  The primitives library is intended
+for larger-scale operations, e.g. arrays of size 64 and larger.
+
+
+Initialization and Cleanup
+--------------------------
+Library initialization is done the first time primitives_init() is called
+or the first time primitives_get() is used.  Cleanup (if any) is done by
+primitives_deinit().
+
+
+Intel Integrated Performance Primitives (IPP)
+---------------------------------------------
+If freerdp is compiled with IPP support (-DWITH_IPP=ON), the IPP function
+calls will be used (where available) to fill the function pointers.
+Where possible, function names and parameter lists match IPP format so
+that the IPP functions can be plugged into the function pointers without
+a wrapper layer.  Use of IPP is completely optional, and in many cases
+the SSE operations in the primitives library itself are faster or similar
+in performance.
+
+
+Coverage
+--------
+The primitives library is not meant to be comprehensive, offering
+entrypoints for every operation and operand type.  Instead, the coverage
+is focused on operations known to be performance bottlenecks in the code.
+For instance, 16-bit signed operations are used widely in the RemoteFX
+software, so you'll find 16s versions of several operations, but there
+is no attempt to provide (unused) copies of the same code for 8u, 16u,
+32s, etc.
+
+
+New Optimizations
+-----------------
+As the need arises, new optimizations can be added to the library,
+including NEON, AVX, and perhaps OpenCL or other SIMD implementations.
+The CPU feature detection is done in winpr/sysinfo.
+
+
+Adding Entrypoints
+------------------
+As the need for new operations or operands arises, new entrypoints can
+be added.  
+  1) Function prototypes and pointers are added to 
+     include/freerdp/primitives.h
+  2) New module initialization and cleanup function prototypes are added
+     to prim_internal.h and called in primitives.c (primitives_init()
+     and primitives_deinit()).
+  3) Operation names and parameter lists should be compatible with the IPP.
+     IPP manuals are available online at software.intel.com.
+  4) A generic C entrypoint must be available as a fallback.
+  5) prim_templates.h contains macro-based templates for simple operations,
+     such as applying a single SSE operation to arrays of data.
+     The template functions can frequently be used to extend the
+     operations without writing a lot of new code.
+
+Cache Management
+----------------
+I haven't found a lot of speed improvement by attempting prefetch, and
+in fact it seems to have a negative impact in some cases.  Done correctly,
+perhaps the routines could be further accelerated by proper use of prefetch,
+fences, etc.
+
+
+Testing
+-------
+In the test subdirectory is an executable (prim_test) that tests both
+functionality and speed of primitives library operations.   Any new
+modules should be added to that test, following the conventions already
+established in that directory.  The program can be executed on various
+target hardware to compare generic C, optimized, and IPP performance
+with various array sizes.
+
@@ -0,0 +1,20 @@
+# FreeRDP: A Remote Desktop Protocol Implementation
+# FreeRDP cmake build script
+#
+# Copyright 2025 Armin Novak <anovak@thincast.com>
+# Copyright 2025 Thincast Technologies GmbH
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Stand-alone benchmark tool for the YUV <-> RGB primitives. It is only built
+# as an executable; no test is registered for it here.
+add_executable(primitives-benchmark benchmark.c)
+target_link_libraries(primitives-benchmark PRIVATE winpr freerdp)
@@ -0,0 +1,254 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * primitives benchmarking tool
+ *
+ * Copyright 2025 Armin Novak <anovak@thincast.com>
+ * Copyright 2025 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+
+#include <winpr/crypto.h>
+#include <winpr/sysinfo.h>
+#include <freerdp/primitives.h>
+
+/* Buffers and geometry shared by all YUV <-> RGB benchmark runs. */
+typedef struct
+{
+	/* source planes for the YUV -> RGB runs (filled with random data) */
+	BYTE* channels[3];
+	/* per-plane stride in bytes; also used for outputChannels */
+	UINT32 steps[3];
+	/* width/height of the processed image */
+	prim_size_t roi;
+	/* destination of the YUV -> RGB runs */
+	BYTE* outputBuffer;
+	/* destination planes of the RGB -> YUV runs */
+	BYTE* outputChannels[3];
+	/* source of the RGB -> YUV runs (filled with random data) */
+	BYTE* rgbBuffer;
+	/* stride in bytes of both outputBuffer and rgbBuffer */
+	UINT32 outputStride;
+	/* pixel format used for outputBuffer and rgbBuffer */
+	UINT32 testedFormat;
+} primitives_YUV_benchmark;
+
+/* Release every buffer owned by the benchmark context and reset it to the
+ * all-zero state. A NULL context is ignored. */
+static void primitives_YUV_benchmark_free(primitives_YUV_benchmark* bench)
+{
+	if (bench)
+	{
+		for (size_t plane = 0; plane < 3; plane++)
+		{
+			free(bench->channels[plane]);
+			free(bench->outputChannels[plane]);
+		}
+
+		free(bench->rgbBuffer);
+		free(bench->outputBuffer);
+
+		/* zero the context so no dangling pointers stay behind */
+		const primitives_YUV_benchmark cleared = WINPR_C_ARRAY_INIT;
+		*bench = cleared;
+	}
+}
+
+/* Allocate and randomize all benchmark buffers for a (3840*4)x(2160*4) BGRA32
+ * image. On any failure everything is released again and an all-zero context
+ * is returned. */
+static primitives_YUV_benchmark primitives_YUV_benchmark_init(void)
+{
+	primitives_YUV_benchmark bench = WINPR_C_ARRAY_INIT;
+
+	bench.testedFormat = PIXEL_FORMAT_BGRA32;
+	bench.roi.width = 3840 * 4;
+	bench.roi.height = 2160 * 4;
+	/* 4 bytes per pixel */
+	bench.outputStride = bench.roi.width * 4;
+
+	/* packed buffers: RGB output is zeroed, RGB input is random */
+	bench.outputBuffer = calloc(bench.outputStride, bench.roi.height);
+	if (!bench.outputBuffer)
+		goto fail;
+
+	bench.rgbBuffer = calloc(bench.outputStride, bench.roi.height);
+	if (!bench.rgbBuffer ||
+	    (winpr_RAND(bench.rgbBuffer, 1ULL * bench.outputStride * bench.roi.height) < 0))
+		goto fail;
+
+	/* planar buffers: every plane is allocated at full resolution */
+	for (size_t plane = 0; plane < 3; plane++)
+	{
+		bench.channels[plane] = calloc(bench.roi.width, bench.roi.height);
+		bench.outputChannels[plane] = calloc(bench.roi.width, bench.roi.height);
+		if (!bench.channels[plane] || !bench.outputChannels[plane])
+			goto fail;
+
+		if (winpr_RAND(bench.channels[plane], 1ull * bench.roi.width * bench.roi.height) < 0)
+			goto fail;
+
+		bench.steps[plane] = bench.roi.width;
+	}
+
+	return bench;
+
+fail:
+	primitives_YUV_benchmark_free(&bench);
+	return bench;
+}
+
+/* Format a nanosecond count as "seconds.milli.micro.nano" (three digits per
+ * group after the first) into the caller supplied buffer and return it. */
+static const char* print_time(UINT64 t, char* buffer, size_t size)
+{
+	const unsigned seconds = (unsigned)(t / 1000000000ull);
+	const unsigned millis = (unsigned)((t / 1000000ull) % 1000);
+	const unsigned micros = (unsigned)((t / 1000ull) % 1000);
+	const unsigned nanos = (unsigned)((t) % 1000);
+
+	(void)_snprintf(buffer, size, "%u.%03u.%03u.%03u", seconds, millis, micros, nanos);
+	return buffer;
+}
+
+/* Run YUV420ToRGB_8u_P3AC4R ten times on the benchmark buffers and print the
+ * duration of every run. Returns FALSE as soon as one conversion fails. */
+static BOOL primitives_YUV420_benchmark_run(primitives_YUV_benchmark* bench, primitives_t* prims)
+{
+	/* the primitive expects pointers to const planes */
+	const BYTE* planes[3] = { bench->channels[0], bench->channels[1], bench->channels[2] };
+
+	for (size_t run = 0; run < 10; run++)
+	{
+		const UINT64 begin = winpr_GetTickCount64NS();
+		const pstatus_t rc =
+		    prims->YUV420ToRGB_8u_P3AC4R(planes, bench->steps, bench->outputBuffer,
+		                                 bench->outputStride, bench->testedFormat, &bench->roi);
+		const UINT64 elapsed = winpr_GetTickCount64NS() - begin;
+
+		if (rc != PRIMITIVES_SUCCESS)
+		{
+			(void)fprintf(stderr, "Running YUV420ToRGB_8u_P3AC4R failed\n");
+			return FALSE;
+		}
+
+		char timestr[32] = WINPR_C_ARRAY_INIT;
+		printf("[%" PRIuz "] YUV420ToRGB_8u_P3AC4R %" PRIu32 "x%" PRIu32 " took %sns\n", run,
+		       bench->roi.width, bench->roi.height, print_time(elapsed, timestr, sizeof(timestr)));
+	}
+
+	return TRUE;
+}
+
+/* Run YUV444ToRGB_8u_P3AC4R ten times on the benchmark buffers and print the
+ * duration of every run. Returns FALSE as soon as one conversion fails. */
+static BOOL primitives_YUV444_benchmark_run(primitives_YUV_benchmark* bench, primitives_t* prims)
+{
+	/* the primitive expects pointers to const planes */
+	const BYTE* planes[3] = { bench->channels[0], bench->channels[1], bench->channels[2] };
+
+	for (size_t run = 0; run < 10; run++)
+	{
+		const UINT64 begin = winpr_GetTickCount64NS();
+		const pstatus_t rc =
+		    prims->YUV444ToRGB_8u_P3AC4R(planes, bench->steps, bench->outputBuffer,
+		                                 bench->outputStride, bench->testedFormat, &bench->roi);
+		const UINT64 elapsed = winpr_GetTickCount64NS() - begin;
+
+		if (rc != PRIMITIVES_SUCCESS)
+		{
+			(void)fprintf(stderr, "Running YUV444ToRGB_8u_P3AC4R failed\n");
+			return FALSE;
+		}
+
+		char timestr[32] = WINPR_C_ARRAY_INIT;
+		printf("[%" PRIuz "] YUV444ToRGB_8u_P3AC4R %" PRIu32 "x%" PRIu32 " took %sns\n", run,
+		       bench->roi.width, bench->roi.height, print_time(elapsed, timestr, sizeof(timestr)));
+	}
+
+	return TRUE;
+}
+
+/* Run RGBToYUV420_8u_P3AC4R ten times (rgbBuffer -> outputChannels) and print
+ * the duration of every run. Returns FALSE as soon as one conversion fails. */
+static BOOL primitives_RGB2420_benchmark_run(primitives_YUV_benchmark* bench, primitives_t* prims)
+{
+	for (size_t run = 0; run < 10; run++)
+	{
+		const UINT64 begin = winpr_GetTickCount64NS();
+		const pstatus_t rc =
+		    prims->RGBToYUV420_8u_P3AC4R(bench->rgbBuffer, bench->testedFormat, bench->outputStride,
+		                                 bench->outputChannels, bench->steps, &bench->roi);
+		const UINT64 elapsed = winpr_GetTickCount64NS() - begin;
+
+		if (rc != PRIMITIVES_SUCCESS)
+		{
+			(void)fprintf(stderr, "Running RGBToYUV420_8u_P3AC4R failed\n");
+			return FALSE;
+		}
+
+		char timestr[32] = WINPR_C_ARRAY_INIT;
+		printf("[%" PRIuz "] RGBToYUV420_8u_P3AC4R %" PRIu32 "x%" PRIu32 " took %sns\n", run,
+		       bench->roi.width, bench->roi.height, print_time(elapsed, timestr, sizeof(timestr)));
+	}
+
+	return TRUE;
+}
+
+/* Run RGBToYUV444_8u_P3AC4R ten times (rgbBuffer -> outputChannels) and print
+ * the duration of every run. Returns FALSE as soon as one conversion fails. */
+static BOOL primitives_RGB2444_benchmark_run(primitives_YUV_benchmark* bench, primitives_t* prims)
+{
+	for (size_t run = 0; run < 10; run++)
+	{
+		const UINT64 begin = winpr_GetTickCount64NS();
+		const pstatus_t rc =
+		    prims->RGBToYUV444_8u_P3AC4R(bench->rgbBuffer, bench->testedFormat, bench->outputStride,
+		                                 bench->outputChannels, bench->steps, &bench->roi);
+		const UINT64 elapsed = winpr_GetTickCount64NS() - begin;
+
+		if (rc != PRIMITIVES_SUCCESS)
+		{
+			(void)fprintf(stderr, "Running RGBToYUV444_8u_P3AC4R failed\n");
+			return FALSE;
+		}
+
+		char timestr[32] = WINPR_C_ARRAY_INIT;
+		printf("[%" PRIuz "] RGBToYUV444_8u_P3AC4R %" PRIu32 "x%" PRIu32 " took %sns\n", run,
+		       bench->roi.width, bench->roi.height, print_time(elapsed, timestr, sizeof(timestr)));
+	}
+
+	return TRUE;
+}
+
+/* Benchmark entry point: runs the four YUV <-> RGB conversions once per
+ * primitives implementation (every hint below PRIMITIVES_AUTODETECT).
+ *
+ * Returns 0 when every benchmark completed and 1 when the buffers could not
+ * be allocated, an implementation could not be obtained or a conversion
+ * failed. Command line arguments are ignored. */
+int main(int argc, char* argv[])
+{
+	int rc = 1;
+
+	WINPR_UNUSED(argc);
+	WINPR_UNUSED(argv);
+	primitives_YUV_benchmark bench = primitives_YUV_benchmark_init();
+
+	/* On failure the init routine hands back an all-zero context; running the
+	 * primitives on NULL buffers with an empty roi must be avoided. */
+	if (!bench.outputBuffer)
+	{
+		(void)fprintf(stderr, "failed to allocate benchmark buffers\n");
+		return rc;
+	}
+
+	for (primitive_hints hint = PRIMITIVES_PURE_SOFT; hint < PRIMITIVES_AUTODETECT; hint++)
+	{
+		const char* hintstr = primtives_hint_str(hint);
+		primitives_t* prim = primitives_get_by_type(hint);
+		if (!prim)
+		{
+			(void)fprintf(stderr, "failed to get primitives: %s\n", hintstr);
+			goto fail;
+		}
+
+		printf("Running YUV420 -> RGB benchmark on %s implementation:\n", hintstr);
+		if (!primitives_YUV420_benchmark_run(&bench, prim))
+		{
+			(void)fprintf(stderr, "YUV420 -> RGB benchmark failed\n");
+			goto fail;
+		}
+		printf("\n");
+
+		printf("Running RGB -> YUV420 benchmark on %s implementation:\n", hintstr);
+		if (!primitives_RGB2420_benchmark_run(&bench, prim))
+		{
+			(void)fprintf(stderr, "RGB -> YUV420 benchmark failed\n");
+			goto fail;
+		}
+		printf("\n");
+
+		printf("Running YUV444 -> RGB benchmark on %s implementation:\n", hintstr);
+		if (!primitives_YUV444_benchmark_run(&bench, prim))
+		{
+			(void)fprintf(stderr, "YUV444 -> RGB benchmark failed\n");
+			goto fail;
+		}
+		printf("\n");
+
+		printf("Running RGB -> YUV444 benchmark on %s implementation:\n", hintstr);
+		if (!primitives_RGB2444_benchmark_run(&bench, prim))
+		{
+			(void)fprintf(stderr, "RGB -> YUV444 benchmark failed\n");
+			goto fail;
+		}
+		printf("\n");
+	}
+
+	/* only reached when every implementation ran all four benchmarks */
+	rc = 0;
+fail:
+	primitives_YUV_benchmark_free(&bench);
+	return rc;
+}
@@ -0,0 +1,168 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized YCoCg<->RGB conversion operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#include "prim_internal.h"
+#include "prim_YCoCg.h"
+
+#if defined(NEON_INTRINSICS_ENABLED)
+#include <arm_neon.h>
+
+/* Generic primitives table, assigned in primitives_init_YCoCg_neon_int() and
+ * used as fallback for destination formats without a NEON code path. */
+static primitives_t* generic = nullptr;
+
+/* NEON YCoCg -> RGB conversion for 4 byte per pixel destination layouts.
+ *
+ * Source pixels are read as 4 interleaved bytes: Cg, Co, Y, alpha.
+ * bPos/gPos/rPos/aPos select the byte lane inside each destination pixel the
+ * respective channel is written to. When alpha is FALSE the alpha lane is
+ * set to 0xFF, otherwise the source alpha is copied.
+ *
+ * Eight pixels are converted per vector iteration; the remaining
+ * (width % 8) pixels of each row are handled by the scalar loop.
+ * Always returns PRIMITIVES_SUCCESS.
+ *
+ * NOTE(review): both loops always advance the destination by 4 bytes per
+ * pixel while dstPad is derived from FreeRDPGetBytesPerPixel(DstFormat) -
+ * callers visible here only pass 32 bit formats. */
+static pstatus_t neon_YCoCgToRGB_8u_X(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
+                                      BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
+                                      UINT32 width, UINT32 height, UINT8 shift, BYTE bPos,
+                                      BYTE gPos, BYTE rPos, BYTE aPos, BOOL alpha)
+{
+	BYTE* dptr = pDst;
+	const BYTE* sptr = pSrc;
+	const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
+	const int8_t cll = shift - 1; /* -1 builds in the /2's */
+	/* bytes to skip at the end of each source/destination row */
+	const UINT32 srcPad = srcStep - (width * 4);
+	const UINT32 dstPad = dstStep - (width * formatSize);
+	/* number of trailing pixels per row that do not fill a vector */
+	const UINT32 pad = width % 8;
+	const uint8x8_t aVal = vdup_n_u8(0xFF);
+	const int8x8_t cllv = vdup_n_s8(cll);
+
+	for (UINT32 y = 0; y < height; y++)
+	{
+		/* vector part: 8 pixels (32 source bytes) per iteration */
+		for (UINT32 x = 0; x < width - pad; x += 8)
+		{
+			/* Note: shifts must be done before sign-conversion. */
+			const uint8x8x4_t raw = vld4_u8(sptr);
+			const int8x8_t CgRaw = vreinterpret_s8_u8(vshl_u8(raw.val[0], cllv));
+			const int8x8_t CoRaw = vreinterpret_s8_u8(vshl_u8(raw.val[1], cllv));
+			const int16x8_t Cg = vmovl_s8(CgRaw);
+			const int16x8_t Co = vmovl_s8(CoRaw);
+			const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(raw.val[2])); /* UINT8 -> INT16 */
+			const int16x8_t T = vsubq_s16(Y, Cg);
+			const int16x8_t R = vaddq_s16(T, Co);
+			const int16x8_t G = vaddq_s16(Y, Cg);
+			const int16x8_t B = vsubq_s16(T, Co);
+			uint8x8x4_t bgrx;
+			/* saturating narrow to 0..255, placed in the requested lanes */
+			bgrx.val[bPos] = vqmovun_s16(B);
+			bgrx.val[gPos] = vqmovun_s16(G);
+			bgrx.val[rPos] = vqmovun_s16(R);
+
+			if (alpha)
+				bgrx.val[aPos] = raw.val[3];
+			else
+				bgrx.val[aPos] = aVal;
+
+			vst4_u8(dptr, bgrx);
+			sptr += sizeof(raw);
+			dptr += sizeof(bgrx);
+		}
+
+		/* scalar tail: same arithmetic for the remaining pixels of the row */
+		for (UINT32 x = 0; x < pad; x++)
+		{
+			/* Note: shifts must be done before sign-conversion. */
+			const INT16 Cg = (INT16)((INT8)((*sptr++) << cll));
+			const INT16 Co = (INT16)((INT8)((*sptr++) << cll));
+			const INT16 Y = (INT16)(*sptr++); /* UINT8->INT16 */
+			const INT16 T = Y - Cg;
+			const INT16 R = T + Co;
+			const INT16 G = Y + Cg;
+			const INT16 B = T - Co;
+			BYTE bgra[4];
+			bgra[bPos] = CLIP(B);
+			bgra[gPos] = CLIP(G);
+			bgra[rPos] = CLIP(R);
+			/* the alpha byte is always consumed from the source */
+			bgra[aPos] = *sptr++;
+
+			if (!alpha)
+				bgra[aPos] = 0xFF;
+
+			*dptr++ = bgra[0];
+			*dptr++ = bgra[1];
+			*dptr++ = bgra[2];
+			*dptr++ = bgra[3];
+		}
+
+		sptr += srcPad;
+		dptr += dstPad;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* Dispatch YCoCg -> RGB conversion by destination format.
+ *
+ * The eight 32 bit formats are mapped to the lane positions expected by
+ * neon_YCoCgToRGB_8u_X (alpha and "X" variants share one layout); every
+ * other format is forwarded to the generic implementation. */
+static pstatus_t neon_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
+                                         BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
+                                         UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha)
+{
+	switch (DstFormat)
+	{
+		case PIXEL_FORMAT_BGRA32:
+		case PIXEL_FORMAT_BGRX32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 2, 1, 0, 3, withAlpha);
+
+		case PIXEL_FORMAT_RGBA32:
+		case PIXEL_FORMAT_RGBX32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 0, 1, 2, 3, withAlpha);
+
+		case PIXEL_FORMAT_ARGB32:
+		case PIXEL_FORMAT_XRGB32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 1, 2, 3, 0, withAlpha);
+
+		case PIXEL_FORMAT_ABGR32:
+		case PIXEL_FORMAT_XBGR32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 3, 2, 1, 0, withAlpha);
+
+		default:
+			break;
+	}
+
+	/* no NEON path for this format */
+	return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+	                                   shift, withAlpha);
+}
+#endif
+
+/* ------------------------------------------------------------------------- */
+/* Install the NEON YCoCg -> RGB entry point into prims when NEON intrinsics
+ * are enabled at build time; otherwise only emit a verbose log message and
+ * leave prims untouched. */
+void primitives_init_YCoCg_neon_int(primitives_t* WINPR_RESTRICT prims)
+{
+#if !defined(NEON_INTRINSICS_ENABLED)
+	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or neon intrinsics not available");
+	WINPR_UNUSED(prims);
+#else
+	/* remember the generic table for the format fallback */
+	generic = primitives_get_generic();
+
+	WLog_VRB(PRIM_TAG, "NEON optimizations");
+	prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R;
+#endif
+}
@@ -0,0 +1,837 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Optimized YUV/RGB conversion operations
+ *
+ * Copyright 2014 Thomas Erbesdobler
+ * Copyright 2016-2017 Armin Novak <armin.novak@thincast.com>
+ * Copyright 2016-2017 Norbert Federa <norbert.federa@thincast.com>
+ * Copyright 2016-2017 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include <winpr/crt.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+#include "prim_YUV.h"
+
+#if defined(NEON_INTRINSICS_ENABLED)
+#include <arm_neon.h>
+
+/* Generic primitives table used as fallback for destination formats without
+ * a NEON code path.
+ * NOTE(review): the assignment is not visible in this part of the file -
+ * presumably done by the module init function. */
+static primitives_t* generic = nullptr;
+
+/* Compute the red channel for 8 pixels.
+ *
+ * C holds the luma values already multiplied by 256, E the (V - 128) values.
+ * The computation is done in 32 bit for the high and low half separately,
+ * then narrowed with saturation to 8 bit. D is not used by this channel. */
+static inline uint8x8_t neon_YUV2R_single(uint16x8_t C, int16x8_t D, int16x8_t E)
+{
+	/* R = (256 * Y + 403 * (V - 128)) >> 8 */
+	const int32x4_t Ch = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(C)));
+	const int32x4_t e403h = vmull_n_s16(vget_high_s16(E), 403);
+	const int32x4_t cehm = vaddq_s32(Ch, e403h);
+	const int32x4_t ceh = vshrq_n_s32(cehm, 8);
+
+	const int32x4_t Cl = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(C)));
+	const int32x4_t e403l = vmull_n_s16(vget_low_s16(E), 403);
+	const int32x4_t celm = vaddq_s32(Cl, e403l);
+	const int32x4_t cel = vshrq_n_s32(celm, 8);
+	/* recombine low/high halves and clamp to 0..255 */
+	const int16x8_t ce = vcombine_s16(vqmovn_s32(cel), vqmovn_s32(ceh));
+	return vqmovun_s16(ce);
+}
+
+/* Red channel for 16 pixels: applies neon_YUV2R_single to both 8 pixel halves. */
+static inline uint8x8x2_t neon_YUV2R(uint16x8x2_t C, int16x8x2_t D, int16x8x2_t E)
+{
+	uint8x8x2_t red;
+	red.val[0] = neon_YUV2R_single(C.val[0], D.val[0], E.val[0]);
+	red.val[1] = neon_YUV2R_single(C.val[1], D.val[1], E.val[1]);
+	return red;
+}
+
+/* Compute the green channel for 8 pixels.
+ *
+ * C holds the luma values already multiplied by 256, D the (U - 128) and E
+ * the (V - 128) values. The chroma products are formed in 16 bit, summed and
+ * subtracted from C in 32 bit, then narrowed with saturation to 8 bit. */
+static inline uint8x8_t neon_YUV2G_single(uint16x8_t C, int16x8_t D, int16x8_t E)
+{
+	/* G = (256L * Y -  48 * (U - 128) - 120 * (V - 128)) >> 8 */
+	const int16x8_t d48 = vmulq_n_s16(D, 48);
+	const int16x8_t e120 = vmulq_n_s16(E, 120);
+	const int32x4_t deh = vaddl_s16(vget_high_s16(d48), vget_high_s16(e120));
+	const int32x4_t Ch = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(C)));
+	const int32x4_t cdeh32m = vsubq_s32(Ch, deh);
+	const int32x4_t cdeh32 = vshrq_n_s32(cdeh32m, 8);
+	const int16x4_t cdeh = vqmovn_s32(cdeh32);
+
+	const int32x4_t del = vaddl_s16(vget_low_s16(d48), vget_low_s16(e120));
+	const int32x4_t Cl = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(C)));
+	const int32x4_t cdel32m = vsubq_s32(Cl, del);
+	const int32x4_t cdel32 = vshrq_n_s32(cdel32m, 8);
+	const int16x4_t cdel = vqmovn_s32(cdel32);
+	/* recombine low/high halves and clamp to 0..255 */
+	const int16x8_t cde = vcombine_s16(cdel, cdeh);
+	return vqmovun_s16(cde);
+}
+
+/* Green channel for 16 pixels: applies neon_YUV2G_single to both 8 pixel halves. */
+static inline uint8x8x2_t neon_YUV2G(uint16x8x2_t C, int16x8x2_t D, int16x8x2_t E)
+{
+	uint8x8x2_t green;
+	green.val[0] = neon_YUV2G_single(C.val[0], D.val[0], E.val[0]);
+	green.val[1] = neon_YUV2G_single(C.val[1], D.val[1], E.val[1]);
+	return green;
+}
+
+/* Compute the blue channel for 8 pixels.
+ *
+ * C holds the luma values already multiplied by 256, D the (U - 128) values.
+ * The computation is done in 32 bit for the high and low half separately,
+ * then narrowed with saturation to 8 bit. E is not used by this channel. */
+static inline uint8x8_t neon_YUV2B_single(uint16x8_t C, int16x8_t D, int16x8_t E)
+{
+	/* B = (256L * Y + 475 * (U - 128)) >> 8*/
+	const int32x4_t Ch = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(C)));
+	const int32x4_t d475h = vmull_n_s16(vget_high_s16(D), 475);
+	const int32x4_t cdhm = vaddq_s32(Ch, d475h);
+	const int32x4_t cdh = vshrq_n_s32(cdhm, 8);
+
+	const int32x4_t Cl = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(C)));
+	const int32x4_t d475l = vmull_n_s16(vget_low_s16(D), 475);
+	const int32x4_t cdlm = vaddq_s32(Cl, d475l);
+	const int32x4_t cdl = vshrq_n_s32(cdlm, 8);
+	/* recombine low/high halves and clamp to 0..255 */
+	const int16x8_t cd = vcombine_s16(vqmovn_s32(cdl), vqmovn_s32(cdh));
+	return vqmovun_s16(cd);
+}
+
+/* Blue channel for 16 pixels: applies neon_YUV2B_single to both 8 pixel halves. */
+static inline uint8x8x2_t neon_YUV2B(uint16x8x2_t C, int16x8x2_t D, int16x8x2_t E)
+{
+	uint8x8x2_t blue;
+	blue.val[0] = neon_YUV2B_single(C.val[0], D.val[0], E.val[0]);
+	blue.val[1] = neon_YUV2B_single(C.val[1], D.val[1], E.val[1]);
+	return blue;
+}
+
+/* Write the r/g/b values of 8 pixels into a 4 byte per pixel destination.
+ *
+ * The 32 destination bytes are loaded first and only the lanes rPos, gPos
+ * and bPos are replaced before storing back, so the remaining lane keeps the
+ * value already present in the destination. aPos itself is not used here. */
+static inline void neon_store_bgrx(BYTE* WINPR_RESTRICT pRGB, uint8x8_t r, uint8x8_t g, uint8x8_t b,
+                                   uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
+{
+	uint8x8x4_t bgrx = vld4_u8(pRGB);
+	bgrx.val[rPos] = r;
+	bgrx.val[gPos] = g;
+	bgrx.val[bPos] = b;
+	vst4_u8(pRGB, bgrx);
+}
+
+/* Convert 16 pixels from YUV to RGB and store them at pRGB.
+ *
+ * Y carries 16 luma bytes (two groups of 8), D and E the matching chroma
+ * values with 128 already subtracted (one value per pixel). The result is
+ * written as two groups of 8 pixels, the second one sizeof(uint8x8x4_t)
+ * bytes after the first. */
+static inline void neon_YuvToRgbPixel(BYTE* pRGB, uint8x8x2_t Y, int16x8x2_t D, int16x8x2_t E,
+                                      const uint8_t rPos, const uint8_t gPos, const uint8_t bPos,
+                                      const uint8_t aPos)
+{
+	/* Y * 256 == Y << 8  */
+	const uint16x8x2_t C = { { vshlq_n_u16(vmovl_u8(Y.val[0]), 8),
+		                       vshlq_n_u16(vmovl_u8(Y.val[1]), 8) } };
+
+	const uint8x8x2_t r = neon_YUV2R(C, D, E);
+	const uint8x8x2_t g = neon_YUV2G(C, D, E);
+	const uint8x8x2_t b = neon_YUV2B(C, D, E);
+
+	neon_store_bgrx(pRGB, r.val[0], g.val[0], b.val[0], rPos, gPos, bPos, aPos);
+	neon_store_bgrx(pRGB + sizeof(uint8x8x4_t), r.val[1], g.val[1], b.val[1], rPos, gPos, bPos,
+	                aPos);
+}
+
+/* Load 8 chroma samples starting at pV[x / 2], widen them to 16 bit and
+ * subtract 128. Each value is then duplicated into two adjacent lanes
+ * (zip with itself), yielding 16 values - one per luma pixel of a
+ * horizontally subsampled row. */
+static inline int16x8x2_t loadUV(const BYTE* WINPR_RESTRICT pV, size_t x)
+{
+	const uint8x8_t Vraw = vld1_u8(&pV[x / 2]);
+	const int16x8_t V = vreinterpretq_s16_u16(vmovl_u8(Vraw));
+	const int16x8_t c128 = vdupq_n_s16(128);
+	const int16x8_t E = vsubq_s16(V, c128);
+	return vzipq_s16(E, E);
+}
+
+/* Scalar fallback: convert a single YUV sample and write r/g/b to the given
+ * byte positions of the pixel at pRGB. The byte at aPos is not written. */
+static inline void neon_write_pixel(BYTE* pRGB, BYTE Y, BYTE U, BYTE V, const uint8_t rPos,
+                                    const uint8_t gPos, const uint8_t bPos, const uint8_t aPos)
+{
+	const BYTE red = YUV2R(Y, U, V);
+	const BYTE green = YUV2G(Y, U, V);
+	const BYTE blue = YUV2B(Y, U, V);
+
+	pRGB[bPos] = blue;
+	pRGB[gPos] = green;
+	pRGB[rPos] = red;
+}
+
+/* Convert two vertically adjacent luma rows that share one chroma row.
+ *
+ * pY[0]/pY[1] and pRGB[0]/pRGB[1] are the source and destination rows, pU and
+ * pV the shared chroma rows (one sample per two luma pixels). The row is
+ * processed in three stages: blocks of 16 pixels with NEON, then pixel pairs
+ * with the scalar helper, then a possible last single pixel.
+ *
+ * NOTE(review): the scalar stages compute 4 * x with a 32 bit x while the
+ * vector stage uses 4ULL * x. */
+static inline void neon_YUV420ToX_DOUBLE_ROW(const BYTE* WINPR_RESTRICT pY[2],
+                                             const BYTE* WINPR_RESTRICT pU,
+                                             const BYTE* WINPR_RESTRICT pV,
+                                             BYTE* WINPR_RESTRICT pRGB[2], size_t width,
+                                             const uint8_t rPos, const uint8_t gPos,
+                                             const uint8_t bPos, const uint8_t aPos)
+{
+	UINT32 x = 0;
+
+	/* vector stage: 16 pixels of both rows per iteration, chroma loaded once */
+	for (; x < width - width % 16; x += 16)
+	{
+		const uint8x16_t Y0raw = vld1q_u8(&pY[0][x]);
+		const uint8x8x2_t Y0 = { { vget_low_u8(Y0raw), vget_high_u8(Y0raw) } };
+		const int16x8x2_t D = loadUV(pU, x);
+		const int16x8x2_t E = loadUV(pV, x);
+		neon_YuvToRgbPixel(&pRGB[0][4ULL * x], Y0, D, E, rPos, gPos, bPos, aPos);
+
+		const uint8x16_t Y1raw = vld1q_u8(&pY[1][x]);
+		const uint8x8x2_t Y1 = { { vget_low_u8(Y1raw), vget_high_u8(Y1raw) } };
+		neon_YuvToRgbPixel(&pRGB[1][4ULL * x], Y1, D, E, rPos, gPos, bPos, aPos);
+	}
+
+	/* scalar stage: 2x2 pixel blocks sharing one U/V sample */
+	for (; x < width - width % 2; x += 2)
+	{
+		const BYTE U = pU[x / 2];
+		const BYTE V = pV[x / 2];
+
+		neon_write_pixel(&pRGB[0][4 * x], pY[0][x], U, V, rPos, gPos, bPos, aPos);
+		neon_write_pixel(&pRGB[0][4 * (1ULL + x)], pY[0][1ULL + x], U, V, rPos, gPos, bPos, aPos);
+		neon_write_pixel(&pRGB[1][4 * x], pY[1][x], U, V, rPos, gPos, bPos, aPos);
+		neon_write_pixel(&pRGB[1][4 * (1ULL + x)], pY[1][1ULL + x], U, V, rPos, gPos, bPos, aPos);
+	}
+
+	/* odd width: last column of both rows */
+	for (; x < width; x++)
+	{
+		const BYTE U = pU[x / 2];
+		const BYTE V = pV[x / 2];
+
+		neon_write_pixel(&pRGB[0][4 * x], pY[0][x], U, V, rPos, gPos, bPos, aPos);
+		neon_write_pixel(&pRGB[1][4 * x], pY[1][x], U, V, rPos, gPos, bPos, aPos);
+	}
+}
+
+/* Convert a single luma row (used for the last row of images with odd
+ * height). pU and pV are the chroma rows with one sample per two luma pixels.
+ * The row is processed in three stages: blocks of 16 pixels with NEON, then
+ * pixel pairs with the scalar helper, then a possible last single pixel. */
+static inline void neon_YUV420ToX_SINGLE_ROW(const BYTE* WINPR_RESTRICT pY,
+                                             const BYTE* WINPR_RESTRICT pU,
+                                             const BYTE* WINPR_RESTRICT pV,
+                                             BYTE* WINPR_RESTRICT pRGB, size_t width,
+                                             const uint8_t rPos, const uint8_t gPos,
+                                             const uint8_t bPos, const uint8_t aPos)
+{
+	UINT32 x = 0;
+
+	/* vector stage: 16 pixels per iteration */
+	for (; x < width - width % 16; x += 16)
+	{
+		const uint8x16_t Y0raw = vld1q_u8(&pY[x]);
+		const uint8x8x2_t Y0 = { { vget_low_u8(Y0raw), vget_high_u8(Y0raw) } };
+		const int16x8x2_t D = loadUV(pU, x);
+		const int16x8x2_t E = loadUV(pV, x);
+		neon_YuvToRgbPixel(&pRGB[4ULL * x], Y0, D, E, rPos, gPos, bPos, aPos);
+	}
+
+	/* scalar stage: pixel pairs sharing one U/V sample */
+	for (; x < width - width % 2; x += 2)
+	{
+		const BYTE U = pU[x / 2];
+		const BYTE V = pV[x / 2];
+
+		neon_write_pixel(&pRGB[4 * x], pY[x], U, V, rPos, gPos, bPos, aPos);
+		neon_write_pixel(&pRGB[4 * (1ULL + x)], pY[1ULL + x], U, V, rPos, gPos, bPos, aPos);
+	}
+	/* odd width: last pixel */
+	for (; x < width; x++)
+	{
+		const BYTE U = pU[x / 2];
+		const BYTE V = pV[x / 2];
+
+		neon_write_pixel(&pRGB[4 * x], pY[x], U, V, rPos, gPos, bPos, aPos);
+	}
+}
+
+/* Convert a planar YUV420 image to a 4 byte per pixel RGB layout.
+ *
+ * pSrc/srcStep describe the Y, U and V planes (one chroma row per two luma
+ * rows), pDst/dstStep the destination. rPos/gPos/bPos/aPos are the byte
+ * positions of the channels inside a destination pixel. Rows are processed
+ * in pairs; a trailing single row is handled separately for odd heights.
+ *
+ * The height must be greater than 0 (asserted). Always returns
+ * PRIMITIVES_SUCCESS.
+ *
+ * All row offsets are computed in 64 bit so that y * step cannot wrap
+ * around in 32 bit arithmetic for very large buffers. */
+static inline pstatus_t neon_YUV420ToX(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
+                                       BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
+                                       const prim_size_t* WINPR_RESTRICT roi, const uint8_t rPos,
+                                       const uint8_t gPos, const uint8_t bPos, const uint8_t aPos)
+{
+	const UINT32 nWidth = roi->width;
+	const UINT32 nHeight = roi->height;
+
+	WINPR_ASSERT(nHeight > 0);
+	UINT32 y = 0;
+	/* two luma rows share one chroma row */
+	for (; y < (nHeight - 1); y += 2)
+	{
+		const uint8_t* pY[2] = { pSrc[0] + 1ULL * y * srcStep[0],
+			                     pSrc[0] + (1ULL + y) * srcStep[0] };
+		const uint8_t* pU = pSrc[1] + 1ULL * (y / 2) * srcStep[1];
+		const uint8_t* pV = pSrc[2] + 1ULL * (y / 2) * srcStep[2];
+		uint8_t* pRGB[2] = { pDst + 1ULL * y * dstStep, pDst + (1ULL + y) * dstStep };
+
+		neon_YUV420ToX_DOUBLE_ROW(pY, pU, pV, pRGB, nWidth, rPos, gPos, bPos, aPos);
+	}
+	/* odd height: the last row has no partner */
+	for (; y < nHeight; y++)
+	{
+		const uint8_t* pY = pSrc[0] + 1ULL * y * srcStep[0];
+		const uint8_t* pU = pSrc[1] + 1ULL * (y / 2) * srcStep[1];
+		const uint8_t* pV = pSrc[2] + 1ULL * (y / 2) * srcStep[2];
+		uint8_t* pRGB = pDst + 1ULL * y * dstStep;
+
+		neon_YUV420ToX_SINGLE_ROW(pY, pU, pV, pRGB, nWidth, rPos, gPos, bPos, aPos);
+	}
+	return PRIMITIVES_SUCCESS;
+}
+
+/* YUV420 -> RGB entry point installed into the primitives table.
+ *
+ * Picks the rPos/gPos/bPos/aPos arguments matching DstFormat and runs the NEON
+ * converter; any format not listed here is handed to the generic
+ * implementation.
+ */
+static pstatus_t neon_YUV420ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[3],
+                                            const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
+                                            UINT32 dstStep, UINT32 DstFormat,
+                                            const prim_size_t* WINPR_RESTRICT roi)
+{
+	if ((DstFormat == PIXEL_FORMAT_BGRA32) || (DstFormat == PIXEL_FORMAT_BGRX32))
+		return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
+
+	if ((DstFormat == PIXEL_FORMAT_RGBA32) || (DstFormat == PIXEL_FORMAT_RGBX32))
+		return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
+
+	if ((DstFormat == PIXEL_FORMAT_ARGB32) || (DstFormat == PIXEL_FORMAT_XRGB32))
+		return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
+
+	if ((DstFormat == PIXEL_FORMAT_ABGR32) || (DstFormat == PIXEL_FORMAT_XBGR32))
+		return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
+
+	/* No NEON path for this format */
+	return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+}
+
+/* Widen 8 unsigned chroma samples to signed 16 bit lanes and subtract 128. */
+static inline int16x8_t loadUVreg(uint8x8_t Vraw)
+{
+	const int16x8_t widened = vreinterpretq_s16_u16(vmovl_u8(Vraw));
+	return vsubq_s16(widened, vdupq_n_s16(128));
+}
+
+/* Split 16 chroma samples into their low and high halves and convert each
+ * half with loadUVreg(); val[0] holds samples 0-7, val[1] samples 8-15. */
+static inline int16x8x2_t loadUV444(uint8x16_t Vld)
+{
+	int16x8x2_t out;
+
+	out.val[0] = loadUVreg(vget_low_u8(Vld));
+	out.val[1] = loadUVreg(vget_high_u8(Vld));
+	return out;
+}
+
+/* Rework the top left sample of a 2x2 chroma block in place.
+ *
+ * Computes 4 * U[0][0] - (U[0][1] + U[1][0] + U[1][1]) and stores the result of
+ * CONDITIONAL_CLIP(value, U[0][0]) in U[0][0]. The other three samples are not
+ * modified.
+ */
+static inline void avgUV(BYTE U[2][2])
+{
+	const BYTE anchor = U[0][0];
+	const INT16 neighbours = (INT16)U[0][1] + U[1][0] + U[1][1];
+	const INT16 scaled = (INT16)anchor << 2;
+	const INT16 weighted = scaled - neighbours;
+
+	U[0][0] = CONDITIONAL_CLIP(weighted, anchor);
+}
+
+/* Rework the top left sample of eight 2x2 chroma blocks at once.
+ *
+ * pU[0] and pU[1] hold 16 samples of two consecutive rows. For every 2x2 block
+ * the value 4 * U00 - (U01 + U10 + U11) is computed and saturated to 0..255; it
+ * replaces U00 unless it differs from U00 by less than 30, in which case U00 is
+ * kept. Only pU[0] is rewritten, pU[1] stays as it was.
+ * NOTE(review): this looks like the vector form of avgUV() - confirm that
+ * CONDITIONAL_CLIP uses the same threshold.
+ */
+static inline void neon_avgUV(uint8x16_t pU[2])
+{
+	/* De-interleave: even columns go to val[0], odd columns to val[1]. In both
+	 * registers the low half stems from row 0 and the high half from row 1. */
+	const uint8x16x2_t split = vuzpq_u8(pU[0], pU[1]);
+	const uint8x8_t u00 = vget_low_u8(split.val[0]);
+	const uint8x8_t u01 = vget_low_u8(split.val[1]);
+	const uint8x8_t u10 = vget_high_u8(split.val[0]);
+	const uint8x8_t u11 = vget_high_u8(split.val[1]);
+
+	/* U01 + U10 + U11, widened to 16 bit */
+	const uint16x8_t others = vaddq_u16(vaddl_u8(u01, u10), vmovl_u8(u11));
+
+	/* U00 * 4, widened to 16 bit */
+	const uint16x8_t scaled = vshll_n_u8(u00, 2);
+
+	/* 4 * U00 - (U01 + U10 + U11), narrowed with saturation to 0..255 */
+	const int16x8_t wavg =
+	    vsubq_s16(vreinterpretq_s16_u16(scaled), vreinterpretq_s16_u16(others));
+	const uint8x8_t avg = vqmovun_s16(wavg);
+
+	/* (abs(avg - u00) < 30) ? u00 : avg */
+	const uint8x8_t keepOriginal = vclt_u8(vabd_u8(avg, u00), vdup_n_u8(30));
+	const uint8x8_t out = vbsl_u8(keepOriginal, u00, avg);
+
+	/* Interleave the new even columns with the untouched odd columns of row 0 */
+	const uint8x8x2_t zipped = vzip_u8(out, u01);
+	pU[0] = vcombine_u8(zipped.val[0], zipped.val[1]);
+}
+
+/* Convert a single row of YUV444 data (one Y, U and V sample per pixel) into
+ * 4 byte per pixel output.
+ *
+ * Blocks of 16 pixels are converted with NEON, the remainder two pixels at a
+ * time. width must be even: this is asserted, and the pair loop would touch
+ * one pixel beyond the row otherwise.
+ *
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static inline pstatus_t neon_YUV444ToX_SINGLE_ROW(const BYTE* WINPR_RESTRICT pY,
+                                                  const BYTE* WINPR_RESTRICT pU,
+                                                  const BYTE* WINPR_RESTRICT pV,
+                                                  BYTE* WINPR_RESTRICT pRGB, size_t width,
+                                                  const uint8_t rPos, const uint8_t gPos,
+                                                  const uint8_t bPos, const uint8_t aPos)
+{
+	WINPR_ASSERT(width % 2 == 0);
+
+	const size_t vectorEnd = width - width % 16;
+	size_t col = 0;
+
+	while (col < vectorEnd)
+	{
+		uint8x16_t chromaU = vld1q_u8(&pU[col]);
+		uint8x16_t chromaV = vld1q_u8(&pV[col]);
+		const uint8x16_t luma = vld1q_u8(&pY[col]);
+		const uint8x8x2_t lumaHalves = { { vget_low_u8(luma), vget_high_u8(luma) } };
+		const int16x8x2_t D0 = loadUV444(chromaU);
+		const int16x8x2_t E0 = loadUV444(chromaV);
+		neon_YuvToRgbPixel(&pRGB[4ULL * col], lumaHalves, D0, E0, rPos, gPos, bPos, aPos);
+		col += 16;
+	}
+
+	/* Scalar remainder, one pixel pair per iteration */
+	for (; col < width; col += 2)
+	{
+		const size_t next = col + 1;
+
+		neon_write_pixel(&pRGB[col * 4], pY[col], pU[col], pV[col], rPos, gPos, bPos, aPos);
+		neon_write_pixel(&pRGB[next * 4], pY[next], pU[next], pV[next], rPos, gPos, bPos, aPos);
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* Convert two consecutive rows of YUV444 data into 4 byte per pixel output.
+ *
+ * Index 0 of pY/pU/pV/pRGB is the upper row, index 1 the lower row. Before the
+ * colour conversion the top left chroma sample of every 2x2 block is reworked
+ * (neon_avgUV() in the vector loop, avgUV() in the scalar remainder); the other
+ * three samples of the block are used as loaded.
+ *
+ * width must be even (asserted): the remainder loop always handles two columns.
+ *
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static inline pstatus_t neon_YUV444ToX_DOUBLE_ROW(const BYTE* WINPR_RESTRICT pY[2],
+                                                  const BYTE* WINPR_RESTRICT pU[2],
+                                                  const BYTE* WINPR_RESTRICT pV[2],
+                                                  BYTE* WINPR_RESTRICT pRGB[2], size_t width,
+                                                  const uint8_t rPos, const uint8_t gPos,
+                                                  const uint8_t bPos, const uint8_t aPos)
+{
+	WINPR_ASSERT(width % 2 == 0);
+
+	size_t x = 0;
+
+	/* Vector part: 16 columns of both rows per iteration */
+	for (; x < width - width % 16; x += 16)
+	{
+		/* neon_avgUV() rewrites U[0] / V[0] only, the lower row is passed through */
+		uint8x16_t U[2] = { vld1q_u8(&pU[0][x]), vld1q_u8(&pU[1][x]) };
+		neon_avgUV(U);
+
+		uint8x16_t V[2] = { vld1q_u8(&pV[0][x]), vld1q_u8(&pV[1][x]) };
+		neon_avgUV(V);
+
+		/* upper row */
+		const uint8x16_t Y0raw = vld1q_u8(&pY[0][x]);
+		const uint8x8x2_t Y0 = { { vget_low_u8(Y0raw), vget_high_u8(Y0raw) } };
+		const int16x8x2_t D0 = loadUV444(U[0]);
+		const int16x8x2_t E0 = loadUV444(V[0]);
+		neon_YuvToRgbPixel(&pRGB[0][4ULL * x], Y0, D0, E0, rPos, gPos, bPos, aPos);
+
+		/* lower row */
+		const uint8x16_t Y1raw = vld1q_u8(&pY[1][x]);
+		const uint8x8x2_t Y1 = { { vget_low_u8(Y1raw), vget_high_u8(Y1raw) } };
+		const int16x8x2_t D1 = loadUV444(U[1]);
+		const int16x8x2_t E1 = loadUV444(V[1]);
+		neon_YuvToRgbPixel(&pRGB[1][4ULL * x], Y1, D1, E1, rPos, gPos, bPos, aPos);
+	}
+
+	/* Scalar remainder: one 2x2 block per iteration */
+	for (; x < width; x += 2)
+	{
+		BYTE* rgb[2] = { &pRGB[0][x * 4], &pRGB[1][x * 4] };
+		/* local copy of the block, avgUV() only changes element [0][0] */
+		BYTE U[2][2] = { { pU[0][x], pU[0][x + 1] }, { pU[1][x], pU[1][x + 1] } };
+		avgUV(U);
+
+		BYTE V[2][2] = { { pV[0][x], pV[0][x + 1] }, { pV[1][x], pV[1][x + 1] } };
+		avgUV(V);
+
+		/* i selects the row, j the column inside the block */
+		for (size_t i = 0; i < 2; i++)
+		{
+			for (size_t j = 0; j < 2; j++)
+			{
+				const BYTE y = pY[i][x + j];
+				const BYTE u = U[i][j];
+				const BYTE v = V[i][j];
+
+				neon_write_pixel(&rgb[i][4 * (j)], y, u, v, rPos, gPos, bPos, aPos);
+			}
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* Convert a planar YUV444 image into a packed 4 byte per pixel image.
+ *
+ * All three source planes are addressed with the full row index. Rows are
+ * converted in pairs; if the height is odd the last row is converted on its
+ * own. The first status different from PRIMITIVES_SUCCESS reported by a row
+ * converter aborts the conversion and is returned to the caller.
+ */
+static inline pstatus_t neon_YUV444ToX(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
+                                       BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
+                                       const prim_size_t* WINPR_RESTRICT roi, const uint8_t rPos,
+                                       const uint8_t gPos, const uint8_t bPos, const uint8_t aPos)
+{
+	WINPR_ASSERT(roi);
+	const UINT32 nWidth = roi->width;
+	const UINT32 nHeight = roi->height;
+	const size_t pairedRows = nHeight - nHeight % 2;
+
+	size_t row = 0;
+
+	/* Row pairs */
+	while (row < pairedRows)
+	{
+		const size_t below = row + 1;
+		const uint8_t* WINPR_RESTRICT pY[2] = { pSrc[0] + row * srcStep[0],
+			                                    pSrc[0] + below * srcStep[0] };
+		const uint8_t* WINPR_RESTRICT pU[2] = { pSrc[1] + row * srcStep[1],
+			                                    pSrc[1] + below * srcStep[1] };
+		const uint8_t* WINPR_RESTRICT pV[2] = { pSrc[2] + row * srcStep[2],
+			                                    pSrc[2] + below * srcStep[2] };
+		uint8_t* WINPR_RESTRICT pRGB[2] = { &pDst[row * dstStep], &pDst[below * dstStep] };
+
+		const pstatus_t status =
+		    neon_YUV444ToX_DOUBLE_ROW(pY, pU, pV, pRGB, nWidth, rPos, gPos, bPos, aPos);
+		if (status != PRIMITIVES_SUCCESS)
+			return status;
+
+		row += 2;
+	}
+
+	/* Last row of an image with odd height */
+	while (row < nHeight)
+	{
+		const uint8_t* WINPR_RESTRICT pY = pSrc[0] + row * srcStep[0];
+		const uint8_t* WINPR_RESTRICT pU = pSrc[1] + row * srcStep[1];
+		const uint8_t* WINPR_RESTRICT pV = pSrc[2] + row * srcStep[2];
+		uint8_t* WINPR_RESTRICT pRGB = &pDst[row * dstStep];
+
+		const pstatus_t status =
+		    neon_YUV444ToX_SINGLE_ROW(pY, pU, pV, pRGB, nWidth, rPos, gPos, bPos, aPos);
+		if (status != PRIMITIVES_SUCCESS)
+			return status;
+
+		row++;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* YUV444 -> RGB entry point installed into the primitives table.
+ *
+ * Picks the rPos/gPos/bPos/aPos arguments matching DstFormat and runs the NEON
+ * converter; any format not listed here is handed to the generic
+ * implementation.
+ */
+static pstatus_t neon_YUV444ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[3],
+                                            const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
+                                            UINT32 dstStep, UINT32 DstFormat,
+                                            const prim_size_t* WINPR_RESTRICT roi)
+{
+	if ((DstFormat == PIXEL_FORMAT_BGRA32) || (DstFormat == PIXEL_FORMAT_BGRX32))
+		return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
+
+	if ((DstFormat == PIXEL_FORMAT_RGBA32) || (DstFormat == PIXEL_FORMAT_RGBX32))
+		return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
+
+	if ((DstFormat == PIXEL_FORMAT_ARGB32) || (DstFormat == PIXEL_FORMAT_XRGB32))
+		return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
+
+	if ((DstFormat == PIXEL_FORMAT_ABGR32) || (DstFormat == PIXEL_FORMAT_XBGR32))
+		return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
+
+	/* No NEON path for this format */
+	return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+}
+
+/* Copy the main (luma) YUV420 frame of an AVC444 stream into a YUV444 frame.
+ *
+ * The Y plane is copied row by row. Every U/V sample of the source is written
+ * to a 2x2 block of the destination U/V plane (two columns in each of two
+ * consecutive rows).
+ *
+ * roi selects the area in destination coordinates; source chroma positions are
+ * derived with top / 2 and left / 2.
+ * NOTE(review): for odd width/height the chroma loops write one column/row
+ * beyond the roi - presumably the destination planes are padded, confirm
+ * against the allocation.
+ *
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t neon_LumaToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[3], const UINT32 srcStep[3],
+                                   BYTE* WINPR_RESTRICT pDstRaw[3], const UINT32 dstStep[3],
+                                   const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+	const UINT32 nWidth = roi->right - roi->left;
+	const UINT32 nHeight = roi->bottom - roi->top;
+	/* chroma dimensions, rounded up */
+	const UINT32 halfWidth = (nWidth + 1) / 2;
+	const UINT32 halfHeight = (nHeight + 1) / 2;
+	const UINT32 evenY = 0;
+	/* plane pointers moved to the top left corner of the roi */
+	const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
+		                    pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
+		                    pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
+	BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
+		              pDstRaw[1] + roi->top * dstStep[1] + roi->left,
+		              pDstRaw[2] + roi->top * dstStep[2] + roi->left };
+
+	/* Y data is already here... */
+	/* B1 */
+	for (UINT32 y = 0; y < nHeight; y++)
+	{
+		const BYTE* Ym = pSrc[0] + srcStep[0] * y;
+		BYTE* pY = pDst[0] + dstStep[0] * y;
+		memcpy(pY, Ym, nWidth);
+	}
+
+	/* The first half of U, V are already here part of this frame. */
+	/* B2 and B3 */
+	for (UINT32 y = 0; y < halfHeight; y++)
+	{
+		const UINT32 val2y = (2 * y + evenY);
+		const BYTE* Um = pSrc[1] + srcStep[1] * y;
+		const BYTE* Vm = pSrc[2] + srcStep[2] * y;
+		/* pU/pV: even destination row, pU1/pV1: the row directly below */
+		BYTE* pU = pDst[1] + dstStep[1] * val2y;
+		BYTE* pV = pDst[2] + dstStep[2] * val2y;
+		BYTE* pU1 = pU + dstStep[1];
+		BYTE* pV1 = pV + dstStep[2];
+
+		UINT32 x = 0;
+		/* 16 source samples per iteration. The interleaving store of the same
+		 * register twice duplicates every sample horizontally. A final block of
+		 * exactly 16 samples is left to the scalar loop ('x + 16 < halfWidth'). */
+		for (; x + 16 < halfWidth; x += 16)
+		{
+			{
+				const uint8x16_t u = vld1q_u8(Um);
+				uint8x16x2_t u2x;
+				u2x.val[0] = u;
+				u2x.val[1] = u;
+				vst2q_u8(pU, u2x);
+				vst2q_u8(pU1, u2x);
+				Um += 16;
+				pU += 32;
+				pU1 += 32;
+			}
+			{
+				const uint8x16_t v = vld1q_u8(Vm);
+				uint8x16x2_t v2x;
+				v2x.val[0] = v;
+				v2x.val[1] = v;
+				vst2q_u8(pV, v2x);
+				vst2q_u8(pV1, v2x);
+				Vm += 16;
+				pV += 32;
+				pV1 += 32;
+			}
+		}
+
+		/* Scalar remainder: one source sample becomes a 2x2 block */
+		for (; x < halfWidth; x++)
+		{
+			const BYTE u = *Um++;
+			const BYTE v = *Vm++;
+			*pU++ = u;
+			*pU++ = u;
+			*pU1++ = u;
+			*pU1++ = u;
+			*pV++ = v;
+			*pV++ = v;
+			*pV1++ = v;
+			*pV1++ = v;
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* Merge the auxiliary (chroma, version 1) YUV420 frame of an AVC444 stream into
+ * a YUV444 frame.
+ *
+ * - B4/B5: rows of the auxiliary Y plane are copied to the odd rows of the
+ *   destination U and V planes. Within every group of 16 source rows the first
+ *   8 go to U, the remaining 8 to V.
+ * - B6/B7: the auxiliary U and V planes are written to the odd columns of the
+ *   even rows of the destination U and V planes.
+ *
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t neon_ChromaV1ToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[3],
+                                       const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
+                                       const UINT32 dstStep[3],
+                                       const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+	const UINT32 mod = 16;
+	/* number of source rows consumed so far for the U respectively V plane */
+	UINT32 uY = 0;
+	UINT32 vY = 0;
+	const UINT32 nWidth = roi->right - roi->left;
+	const UINT32 nHeight = roi->bottom - roi->top;
+	const UINT32 halfWidth = (nWidth) / 2;
+	const UINT32 halfHeight = (nHeight) / 2;
+	const UINT32 oddY = 1;
+	const UINT32 evenY = 0;
+	const UINT32 oddX = 1;
+	/* The auxiliary frame is aligned to multiples of 16x16.
+	 * We need the padded height for B4 and B5 conversion.
+	 * Note: a height that already is a multiple of 16 gets 16 more rows; the
+	 * surplus rows are skipped by the 'pos >= nHeight' checks below. */
+	const UINT32 padHeigth = nHeight + 16 - nHeight % 16;
+	/* number of chroma columns left over for the scalar loop */
+	const UINT32 halfPad = halfWidth % 16;
+	/* plane pointers moved to the top left corner of the roi */
+	const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
+		                    pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
+		                    pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
+	BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
+		              pDstRaw[1] + roi->top * dstStep[1] + roi->left,
+		              pDstRaw[2] + roi->top * dstStep[2] + roi->left };
+
+	/* The second half of U and V is a bit more tricky... */
+	/* B4 and B5 */
+	for (UINT32 y = 0; y < padHeigth; y++)
+	{
+		const BYTE* Ya = pSrc[0] + srcStep[0] * y;
+		BYTE* pX;
+
+		/* rows 0-7 of each 16 row group carry U data, rows 8-15 carry V data */
+		if ((y) % mod < (mod + 1) / 2)
+		{
+			const UINT32 pos = (2 * uY++ + oddY);
+
+			if (pos >= nHeight)
+				continue;
+
+			pX = pDst[1] + dstStep[1] * pos;
+		}
+		else
+		{
+			const UINT32 pos = (2 * vY++ + oddY);
+
+			if (pos >= nHeight)
+				continue;
+
+			pX = pDst[2] + dstStep[2] * pos;
+		}
+
+		memcpy(pX, Ya, nWidth);
+	}
+
+	/* B6 and B7 */
+	for (UINT32 y = 0; y < halfHeight; y++)
+	{
+		const UINT32 val2y = (y * 2 + evenY);
+		const BYTE* Ua = pSrc[1] + srcStep[1] * y;
+		const BYTE* Va = pSrc[2] + srcStep[2] * y;
+		BYTE* pU = pDst[1] + dstStep[1] * val2y;
+		BYTE* pV = pDst[2] + dstStep[2] * val2y;
+
+		UINT32 x = 0;
+		/* Load 32 destination bytes de-interleaved (val[0]: even columns,
+		 * val[1]: odd columns), replace the odd columns with 16 auxiliary
+		 * samples and store the result interleaved again. */
+		for (; x < halfWidth - halfPad; x += 16)
+		{
+			{
+				uint8x16x2_t u = vld2q_u8(&pU[2 * x]);
+				u.val[1] = vld1q_u8(&Ua[x]);
+				vst2q_u8(&pU[2 * x], u);
+			}
+			{
+				uint8x16x2_t v = vld2q_u8(&pV[2 * x]);
+				v.val[1] = vld1q_u8(&Va[x]);
+				vst2q_u8(&pV[2 * x], v);
+			}
+		}
+
+		/* Scalar remainder: write the odd columns one by one */
+		for (; x < halfWidth; x++)
+		{
+			const UINT32 val2x1 = (x * 2 + oddX);
+			pU[val2x1] = Ua[x];
+			pV[val2x1] = Va[x];
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* Merge the auxiliary (chroma, version 2) YUV420 frame of an AVC444 stream into
+ * a YUV444 frame.
+ *
+ * - B4/B5: each row of the auxiliary Y plane holds U data followed, at an
+ *   offset of nTotalWidth / 2, by V data. Both are written to the odd columns
+ *   of every destination row.
+ * - B6-B9: each row of the auxiliary U and V planes holds two parts, the second
+ *   one at an offset of nTotalWidth / 4. On the odd destination rows the
+ *   auxiliary U plane fills columns 4x + 0 and the auxiliary V plane columns
+ *   4x + 2; the first part goes to the destination U plane, the second part to
+ *   the destination V plane.
+ *
+ * Unlike the sibling functions the plane pointers are not pre-offset; the roi
+ * origin is applied per row. nTotalHeight is not used.
+ *
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t neon_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
+                                       UINT32 nTotalWidth, UINT32 nTotalHeight,
+                                       BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
+                                       const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+	const UINT32 nWidth = roi->right - roi->left;
+	const UINT32 nHeight = roi->bottom - roi->top;
+	const UINT32 halfWidth = (nWidth + 1) / 2;
+	/* columns left over for the scalar loop of B4/B5 */
+	const UINT32 halfPad = halfWidth % 16;
+	const UINT32 halfHeight = (nHeight + 1) / 2;
+	const UINT32 quaterWidth = (nWidth + 3) / 4;
+	/* columns left over for the scalar loop of B6-B9 */
+	const UINT32 quaterPad = quaterWidth % 16;
+
+	/* B4 and B5: odd UV values for width/2, height */
+	for (UINT32 y = 0; y < nHeight; y++)
+	{
+		const UINT32 yTop = y + roi->top;
+		const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
+		const BYTE* pYaV = pYaU + nTotalWidth / 2;
+		BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left;
+		BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left;
+
+		UINT32 x = 0;
+		/* De-interleaving load, replace the odd columns (val[1]), interleaving
+		 * store: 16 auxiliary samples per plane and iteration. */
+		for (; x < halfWidth - halfPad; x += 16)
+		{
+			{
+				uint8x16x2_t u = vld2q_u8(&pU[2 * x]);
+				u.val[1] = vld1q_u8(&pYaU[x]);
+				vst2q_u8(&pU[2 * x], u);
+			}
+			{
+				uint8x16x2_t v = vld2q_u8(&pV[2 * x]);
+				v.val[1] = vld1q_u8(&pYaV[x]);
+				vst2q_u8(&pV[2 * x], v);
+			}
+		}
+
+		/* Scalar remainder */
+		for (; x < halfWidth; x++)
+		{
+			const UINT32 odd = 2 * x + 1;
+			pU[odd] = pYaU[x];
+			pV[odd] = pYaV[x];
+		}
+	}
+
+	/* B6 - B9 */
+	for (UINT32 y = 0; y < halfHeight; y++)
+	{
+		const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
+		const BYTE* pUaV = pUaU + nTotalWidth / 4;
+		const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
+		const BYTE* pVaV = pVaU + nTotalWidth / 4;
+		/* odd destination rows only */
+		BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
+		BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;
+
+		UINT32 x = 0;
+		/* Four way de-interleaving load of 64 destination bytes; columns
+		 * 4x + 0 (val[0]) and 4x + 2 (val[2]) are replaced, val[1] and val[3]
+		 * are written back unchanged. */
+		for (; x < quaterWidth - quaterPad; x += 16)
+		{
+			{
+				uint8x16x4_t u = vld4q_u8(&pU[4 * x]);
+				u.val[0] = vld1q_u8(&pUaU[x]);
+				u.val[2] = vld1q_u8(&pVaU[x]);
+				vst4q_u8(&pU[4 * x], u);
+			}
+			{
+				uint8x16x4_t v = vld4q_u8(&pV[4 * x]);
+				v.val[0] = vld1q_u8(&pUaV[x]);
+				v.val[2] = vld1q_u8(&pVaV[x]);
+				vst4q_u8(&pV[4 * x], v);
+			}
+		}
+
+		/* Scalar remainder */
+		for (; x < quaterWidth; x++)
+		{
+			pU[4 * x + 0] = pUaU[x];
+			pV[4 * x + 0] = pUaV[x];
+			pU[4 * x + 2] = pVaU[x];
+			pV[4 * x + 2] = pVaV[x];
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* Combine one YUV420 frame of an AVC444 stream into the YUV444 frame pDst.
+ *
+ * type selects which of the three frame kinds pSrc contains. nWidth and
+ * nHeight are only forwarded for AVC444_CHROMAv2.
+ *
+ * Returns -1 if pSrc, pDst, one of their three plane pointers or roi is NULL,
+ * or if type is not one of the known frame kinds; otherwise the result of the
+ * selected worker.
+ */
+static pstatus_t neon_YUV420CombineToYUV444(avc444_frame_type type,
+                                            const BYTE* WINPR_RESTRICT pSrc[3],
+                                            const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight,
+                                            BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
+                                            const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+	/* Argument validation: the arrays themselves first, then their planes */
+	if (!pSrc || !pDst || !roi)
+		return -1;
+
+	for (size_t plane = 0; plane < 3; plane++)
+	{
+		if (!pSrc[plane] || !pDst[plane])
+			return -1;
+	}
+
+	if (type == AVC444_LUMA)
+		return neon_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);
+
+	if (type == AVC444_CHROMAv1)
+		return neon_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);
+
+	if (type == AVC444_CHROMAv2)
+		return neon_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);
+
+	/* Unknown frame type */
+	return -1;
+}
+#endif
+
+/* Install the NEON YUV primitives into prims.
+ *
+ * With NEON_INTRINSICS_ENABLED defined the generic primitives are fetched first
+ * (the NEON entry points fall back to them for unsupported formats) and three
+ * function pointers of prims are replaced. Without it prims is left untouched
+ * and only a verbose log message is emitted.
+ */
+void primitives_init_YUV_neon_int(primitives_t* WINPR_RESTRICT prims)
+{
+#if defined(NEON_INTRINSICS_ENABLED)
+	/* must be set before any of the functions installed below is called */
+	generic = primitives_get_generic();
+	WLog_VRB(PRIM_TAG, "NEON optimizations");
+	prims->YUV420ToRGB_8u_P3AC4R = neon_YUV420ToRGB_8u_P3AC4R;
+	prims->YUV444ToRGB_8u_P3AC4R = neon_YUV444ToRGB_8u_P3AC4R;
+	prims->YUV420CombineToYUV444 = neon_YUV420CombineToYUV444;
+#else
+	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or neon intrinsics not available");
+	WINPR_UNUSED(prims);
+#endif
+}
@@ -0,0 +1,274 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized Color conversion operations.
+ * vi:ts=4 sw=4:
+ *
+ * Copyright 2011 Stephen Erisman
+ * Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
+ * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#include "prim_internal.h"
+#include "prim_colors.h"
+
+/*---------------------------------------------------------------------------*/
+#if defined(NEON_INTRINSICS_ENABLED)
+#include <arm_neon.h>
+
+static primitives_t* generic = nullptr;
+
+/* Convert planar 16 bit YCbCr data into packed 4 byte per pixel RGB data.
+ *
+ * pSrc      the Y, Cb and Cr planes (INT16 samples), all sharing srcStep
+ * srcStep   bytes per source row
+ * pDst      destination buffer, 4 bytes per pixel
+ * dstStep   bytes per destination row
+ * roi       width and height of the area to convert
+ * rPos, gPos, bPos, aPos
+ *           byte offset (0-3) of the respective channel within a pixel
+ *
+ * Per pixel: (Y + 4096) is combined with the scaled Cb/Cr terms in 16.16 fixed
+ * point, shifted right by 21 bits in total and clamped to 0..255. Alpha is
+ * always 0xFF. Eight pixels per iteration are converted with NEON, the
+ * remaining width % 8 pixels of each row with scalar code.
+ * Note: the vector constants are rounded while the scalar remainder derives
+ * its constants by truncating a float product, so the two paths may differ by
+ * one in the fixed point factor.
+ *
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
+                                                BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
+                                                const prim_size_t* WINPR_RESTRICT roi, uint8_t rPos,
+                                                uint8_t gPos, uint8_t bPos, uint8_t aPos)
+{
+	BYTE* pRGB = pDst;
+	const INT16* pY = pSrc[0];
+	const INT16* pCb = pSrc[1];
+	const INT16* pCr = pSrc[2];
+	/* Row padding of the source in INT16 elements (the source pointers are INT16*) */
+	const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16);
+	/* Row padding of the destination in BYTES: pRGB is a BYTE pointer, so the
+	 * padding must not be divided by the pixel size. Dividing by 4 advanced the
+	 * pointer by only a quarter of the padding and misplaced every row after
+	 * the first one whenever dstStep != width * 4. */
+	const size_t dstPad = dstStep - (roi->width * 4);
+	const size_t pad = roi->width % 8;
+	const int16x4_t c4096 = vdup_n_s16(4096);
+
+	for (UINT32 y = 0; y < roi->height; y++)
+	{
+		/* Vector part: 8 pixels per iteration */
+		for (UINT32 x = 0; x < roi->width - pad; x += 8)
+		{
+			const int16x8_t Y = vld1q_s16(pY);
+			const int16x4_t Yh = vget_high_s16(Y);
+			const int16x4_t Yl = vget_low_s16(Y);
+			const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */
+			const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */
+			const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
+			const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
+			const int16x8_t Cr = vld1q_s16(pCr);
+			const int16x4_t Crh = vget_high_s16(Cr);
+			const int16x4_t Crl = vget_low_s16(Cr);
+			const int16x8_t Cb = vld1q_s16(pCb);
+			const int16x4_t Cbh = vget_high_s16(Cb);
+			const int16x4_t Cbl = vget_low_s16(Cb);
+			uint8x8x4_t bgrx;
+			{
+				/* R */
+				const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */
+				const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */
+				const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
+				const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
+				const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
+				const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
+				const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
+				bgrx.val[rPos] = vqmovun_s16(Rs);
+			}
+			{
+				/* G */
+				const int32x4_t CbGh = vmull_n_s16(Cbh, 22527);            /* 0.343730 * 2^16 */
+				const int32x4_t CbGl = vmull_n_s16(Cbl, 22527);            /* 0.343730 * 2^16 */
+				const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */
+				const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */
+				const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
+				const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
+				const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
+				const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
+				const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
+				const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
+				const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
+				const uint8x8_t G = vqmovun_s16(Gs);
+				bgrx.val[gPos] = G;
+			}
+			{
+				/* B */
+				const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */
+				const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */
+				const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
+				const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
+				const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
+				const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
+				const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
+				const uint8x8_t B = vqmovun_s16(Bs);
+				bgrx.val[bPos] = B;
+			}
+			/* A */
+			{
+				bgrx.val[aPos] = vdup_n_u8(0xFF);
+			}
+			/* interleaving store: 8 pixels, 32 bytes */
+			vst4_u8(pRGB, bgrx);
+			pY += 8;
+			pCb += 8;
+			pCr += 8;
+			pRGB += 32;
+		}
+
+		/* Scalar remainder: width % 8 pixels */
+		for (UINT32 x = 0; x < pad; x++)
+		{
+			const INT32 divisor = 16;
+			const INT32 Y = ((*pY++) + 4096) << divisor;
+			const INT32 Cb = (*pCb++);
+			const INT32 Cr = (*pCr++);
+			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
+			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
+			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
+			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
+			INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
+			INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
+			INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
+			BYTE bgrx[4];
+			bgrx[bPos] = CLIP(B);
+			bgrx[gPos] = CLIP(G);
+			bgrx[rPos] = CLIP(R);
+			bgrx[aPos] = 0xFF;
+			*pRGB++ = bgrx[0];
+			*pRGB++ = bgrx[1];
+			*pRGB++ = bgrx[2];
+			*pRGB++ = bgrx[3];
+		}
+
+		/* Skip the row padding: INT16 elements for the source planes, bytes for
+		 * the destination */
+		pY += srcPad;
+		pCb += srcPad;
+		pCr += srcPad;
+		pRGB += dstPad;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* YCbCr -> RGB entry point installed into the primitives table.
+ *
+ * Maps DstFormat to the byte offsets of R, G, B and A within a pixel and runs
+ * the NEON converter; any format not listed here is handed to the generic
+ * implementation.
+ */
+static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
+                                              BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
+                                              UINT32 DstFormat,
+                                              const prim_size_t* WINPR_RESTRICT roi)
+{
+	if ((DstFormat == PIXEL_FORMAT_BGRA32) || (DstFormat == PIXEL_FORMAT_BGRX32))
+		return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
+
+	if ((DstFormat == PIXEL_FORMAT_RGBA32) || (DstFormat == PIXEL_FORMAT_RGBX32))
+		return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
+
+	if ((DstFormat == PIXEL_FORMAT_ARGB32) || (DstFormat == PIXEL_FORMAT_XRGB32))
+		return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
+
+	if ((DstFormat == PIXEL_FORMAT_ABGR32) || (DstFormat == PIXEL_FORMAT_XBGR32))
+		return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
+
+	/* No NEON path for this format */
+	return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+}
+
+/* Pack planar 16 bit R, G and B data into 4 byte per pixel output.
+ *
+ * pSrc      the R, G and B planes (INT16 samples), all sharing srcStep
+ * srcStep   bytes between rows in the source planes
+ * pDst      destination buffer, 4 bytes per pixel
+ * dstStep   bytes between rows in the destination
+ * roi       width and height of the area to convert
+ * rPos, gPos, bPos, aPos
+ *           byte offset (0-3) of the respective channel within a pixel
+ *
+ * Alpha is always 0xFF. Note that the 8 pixel NEON path narrows with
+ * saturation (vqmovun_s16) while the scalar remainder converts INT16 to BYTE by
+ * plain assignment; the two only agree for samples within 0..255.
+ *
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t
+neon_RGBToRGB_16s8u_P3AC4R_X(const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
+                             UINT32 srcStep,            /* bytes between rows in source data */
+                             BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
+                             UINT32 dstStep,            /* bytes between rows in dest data */
+                             const prim_size_t* WINPR_RESTRICT roi, /* region of interest */
+                             uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
+{
+	const UINT32 tail = roi->width % 8;
+	const uint8x8_t opaque = vdup_n_u8(0xFF);
+
+	for (UINT32 row = 0; row < roi->height; row++)
+	{
+		const INT16* red = (const INT16*)(((const BYTE*)pSrc[0]) + row * srcStep);
+		const INT16* green = (const INT16*)(((const BYTE*)pSrc[1]) + row * srcStep);
+		const INT16* blue = (const INT16*)(((const BYTE*)pSrc[2]) + row * srcStep);
+		BYTE* out = pDst + row * dstStep;
+
+		/* Vector part: 8 pixels (32 output bytes) per iteration */
+		for (UINT32 col = 0; col < roi->width - tail; col += 8)
+		{
+			const int16x8_t r = vld1q_s16(red);
+			const int16x8_t g = vld1q_s16(green);
+			const int16x8_t b = vld1q_s16(blue);
+			uint8x8x4_t pixels;
+
+			pixels.val[aPos] = opaque;
+			pixels.val[rPos] = vqmovun_s16(r);
+			pixels.val[gPos] = vqmovun_s16(g);
+			pixels.val[bPos] = vqmovun_s16(b);
+			vst4_u8(out, pixels);
+
+			red += 8;
+			green += 8;
+			blue += 8;
+			out += 32;
+		}
+
+		/* Scalar remainder: width % 8 pixels */
+		for (UINT32 n = 0; n < tail; n++)
+		{
+			BYTE pixel[4];
+
+			pixel[bPos] = *blue++;
+			pixel[gPos] = *green++;
+			pixel[rPos] = *red++;
+			pixel[aPos] = 0xFF;
+
+			for (size_t i = 0; i < 4; i++)
+				*out++ = pixel[i];
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* Planar RGB -> packed RGB entry point installed into the primitives table.
+ *
+ * Maps DstFormat to the byte offsets of R, G, B and A within a pixel and runs
+ * the NEON converter; any format not listed here is handed to the generic
+ * implementation.
+ */
+static pstatus_t
+neon_RGBToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
+                           UINT32 srcStep,            /* bytes between rows in source data */
+                           BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
+                           UINT32 dstStep,            /* bytes between rows in dest data */
+                           UINT32 DstFormat,
+                           const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+	if ((DstFormat == PIXEL_FORMAT_BGRA32) || (DstFormat == PIXEL_FORMAT_BGRX32))
+		return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
+
+	if ((DstFormat == PIXEL_FORMAT_RGBA32) || (DstFormat == PIXEL_FORMAT_RGBX32))
+		return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
+
+	if ((DstFormat == PIXEL_FORMAT_ARGB32) || (DstFormat == PIXEL_FORMAT_XRGB32))
+		return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
+
+	if ((DstFormat == PIXEL_FORMAT_ABGR32) || (DstFormat == PIXEL_FORMAT_XBGR32))
+		return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
+
+	/* No NEON path for this format */
+	return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+}
+#endif /* NEON_INTRINSICS_ENABLED */
+
+/* ------------------------------------------------------------------------- */
+/* Install the NEON colour conversion primitives into prims.
+ *
+ * With NEON_INTRINSICS_ENABLED defined the generic primitives are fetched first
+ * (the NEON entry points fall back to them for unsupported formats) and two
+ * function pointers of prims are replaced. Without it prims is left untouched
+ * and only a verbose log message is emitted.
+ */
+void primitives_init_colors_neon_int(primitives_t* WINPR_RESTRICT prims)
+{
+#if defined(NEON_INTRINSICS_ENABLED)
+	/* must be set before any of the functions installed below is called */
+	generic = primitives_get_generic();
+
+	WLog_VRB(PRIM_TAG, "NEON optimizations");
+	prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
+	prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
+#else
+	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or neon intrinsics not available");
+	WINPR_UNUSED(prims);
+#endif
+}
@@ -0,0 +1,501 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Optimized YUV/RGB conversion operations using openCL
+ *
+ * Copyright 2019 David Fort <contact@hardening-consulting.com>
+ * Copyright 2019 Rangee Gmbh
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include "prim_internal.h"
+
+#if defined(WITH_OPENCL)
+#ifdef __APPLE__
+#include "OpenCL/opencl.h"
+#else
+#include <CL/cl.h>
+#endif
+#include "primitives-opencl-program.h"
+
+#include <freerdp/log.h>
+#define TAG FREERDP_TAG("primitives")
+/* File-level OpenCL state; the only instance visible here is the static openclContext. */
+typedef struct
+{
+	BOOL support;                  /* TRUE after a successful init, FALSE after cl_context_free() */
+	cl_platform_id platformId;     /* platform the selected device was found on */
+	cl_device_id deviceId;         /* selected GPU device */
+	cl_context context;            /* context created for deviceId */
+	cl_command_queue commandQueue; /* queue used for kernel launches and result read-back */
+	cl_program program;            /* program created from the embedded openclProgram source */
+} primitives_opencl_context;
+/* State of one conversion call: a kernel instance plus the buffers bound to its arguments. */
+typedef struct
+{
+	primitives_opencl_context* cl; /* context returned by primitives_get_opencl_context() */
+	cl_kernel kernel;              /* kernel created by name in cl_kernel_new() */
+	cl_mem srcObjs[3];             /* source plane buffers, bound to kernel args 0, 2 and 4 */
+	cl_mem dstObj;                 /* destination buffer, bound to kernel arg 6 */
+	prim_size_t roi;               /* copy of the caller's region of interest */
+	size_t dstStep;                /* destination row stride in bytes */
+} primitives_cl_kernel;
+
+static primitives_opencl_context* primitives_get_opencl_context(void);
+
+/* Release the buffers and the kernel held by a primitives_cl_kernel, then the struct itself.
+ * A null pointer is accepted and ignored. */
+static void cl_kernel_free(primitives_cl_kernel* kernel)
+{
+	if (kernel)
+	{
+		if (kernel->dstObj)
+			clReleaseMemObject(kernel->dstObj);
+
+		for (size_t plane = 0; plane < ARRAYSIZE(kernel->srcObjs); plane++)
+		{
+			/* clear the slot before releasing so it never holds a released handle */
+			cl_mem* slot = &kernel->srcObjs[plane];
+			cl_mem buffer = *slot;
+			*slot = nullptr;
+			if (buffer)
+				clReleaseMemObject(buffer);
+		}
+
+		if (kernel->kernel)
+			clReleaseKernel(kernel->kernel);
+
+		free(kernel);
+	}
+}
+
+/**
+ * Allocate a primitives_cl_kernel and create the named kernel from the shared program.
+ *
+ * @param kernelName name of the kernel function inside the OpenCL program
+ * @param roi        region of interest, copied into the new object
+ * @return the new object, or nullptr on allocation or kernel creation failure
+ */
+static primitives_cl_kernel* cl_kernel_new(const char* kernelName, const prim_size_t* roi)
+{
+	WINPR_ASSERT(kernelName);
+	WINPR_ASSERT(roi);
+
+	primitives_cl_kernel* self = calloc(1, sizeof(*self));
+	if (!self)
+		return nullptr;
+
+	self->roi = *roi;
+	self->cl = primitives_get_opencl_context();
+
+	if (self->cl)
+	{
+		cl_int status = CL_INVALID_VALUE;
+		self->kernel = clCreateKernel(self->cl->program, kernelName, &status);
+		if (status == CL_SUCCESS)
+			return self;
+
+		WLog_ERR(TAG, "openCL: unable to create kernel %s", kernelName);
+	}
+
+	/* failure: drop the partially initialised object */
+	cl_kernel_free(self);
+	return nullptr;
+}
+
+/**
+ * Wrap the three source planes in OpenCL buffers and bind them to the kernel.
+ *
+ * Plane i is bound as kernel argument 2*i (buffer) and 2*i+1 (stride). Every buffer is sized
+ * srcStep[i] * roi.height and uses the caller's memory directly (CL_MEM_USE_HOST_PTR).
+ *
+ * @return TRUE on success, FALSE as soon as one OpenCL call fails (already logged)
+ */
+static BOOL cl_kernel_set_sources(primitives_cl_kernel* ctx, const BYTE* WINPR_RESTRICT pSrc[3],
+                                  const UINT32 srcStep[3])
+{
+	static const char* const planeNames[] = { "Y", "U", "V" };
+
+	WINPR_ASSERT(ctx);
+	WINPR_ASSERT(pSrc);
+	WINPR_ASSERT(srcStep);
+
+	for (cl_uint plane = 0; plane < ARRAYSIZE(ctx->srcObjs); plane++)
+	{
+		const cl_uint bufferArg = plane * 2;
+		const cl_uint strideArg = bufferArg + 1;
+		const size_t planeBytes = 1ull * srcStep[plane] * ctx->roi.height;
+		const BYTE* hostPlane = pSrc[plane];
+		void* WINPR_RESTRICT hostPtr = WINPR_CAST_CONST_PTR_AWAY(hostPlane, void* WINPR_RESTRICT);
+		cl_int status = CL_INVALID_VALUE;
+
+		ctx->srcObjs[plane] =
+		    clCreateBuffer(ctx->cl->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, planeBytes,
+		                   hostPtr, &status);
+		if (status != CL_SUCCESS)
+		{
+			WLog_ERR(TAG, "unable to create %sobj", planeNames[plane]);
+			return FALSE;
+		}
+
+		status = clSetKernelArg(ctx->kernel, bufferArg, sizeof(cl_mem),
+		                        (const void*)&ctx->srcObjs[plane]);
+		if (status != CL_SUCCESS)
+		{
+			WLog_ERR(TAG, "unable to set arg for %sobj", planeNames[plane]);
+			return FALSE;
+		}
+
+		status = clSetKernelArg(ctx->kernel, strideArg, sizeof(cl_uint), &srcStep[plane]);
+		if (status != CL_SUCCESS)
+		{
+			WLog_ERR(TAG, "unable to set arg stride for %sobj", planeNames[plane]);
+			return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+/**
+ * Create the device-side destination buffer (dstStep * roi.height bytes) and bind it together
+ * with its stride as kernel arguments 6 and 7.
+ *
+ * @return TRUE on success, FALSE as soon as one OpenCL call fails (already logged)
+ */
+static BOOL cl_kernel_set_destination(primitives_cl_kernel* ctx, UINT32 dstStep)
+{
+	/* argument slots that follow the three (buffer, stride) source pairs */
+	const cl_uint dstBufferArg = 6;
+	const cl_uint dstStrideArg = 7;
+
+	WINPR_ASSERT(ctx);
+
+	ctx->dstStep = dstStep;
+
+	const size_t dstBytes = 1ull * dstStep * ctx->roi.height;
+	cl_int status = CL_INVALID_VALUE;
+	ctx->dstObj = clCreateBuffer(ctx->cl->context, CL_MEM_WRITE_ONLY, dstBytes, nullptr, &status);
+	if (status != CL_SUCCESS)
+	{
+		WLog_ERR(TAG, "unable to create dest obj");
+		return FALSE;
+	}
+
+	status =
+	    clSetKernelArg(ctx->kernel, dstBufferArg, sizeof(cl_mem), (const void*)&ctx->dstObj);
+	if (status != CL_SUCCESS)
+	{
+		WLog_ERR(TAG, "unable to set arg destObj");
+		return FALSE;
+	}
+
+	status = clSetKernelArg(ctx->kernel, dstStrideArg, sizeof(cl_uint), &dstStep);
+	if (status != CL_SUCCESS)
+	{
+		WLog_ERR(TAG, "unable to set arg dstStep");
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+/**
+ * Launch the kernel over a roi.width x roi.height 2D range and copy the destination buffer
+ * back into pDst with a blocking read.
+ *
+ * @return TRUE on success, FALSE if the launch or the read-back fails (already logged)
+ */
+static BOOL cl_kernel_process(primitives_cl_kernel* ctx, BYTE* pDst)
+{
+	WINPR_ASSERT(ctx);
+	WINPR_ASSERT(pDst);
+
+	const size_t globalSize[2] = { ctx->roi.width, ctx->roi.height };
+	const size_t dstBytes = ctx->roi.height * ctx->dstStep;
+
+	cl_int status = clEnqueueNDRangeKernel(ctx->cl->commandQueue, ctx->kernel, 2, nullptr,
+	                                       globalSize, nullptr, 0, nullptr, nullptr);
+	if (status != CL_SUCCESS)
+	{
+		WLog_ERR(TAG, "unable to enqueue call kernel");
+		return FALSE;
+	}
+
+	/* CL_TRUE: the call returns only after the data has arrived in pDst */
+	status = clEnqueueReadBuffer(ctx->cl->commandQueue, ctx->dstObj, CL_TRUE, 0, dstBytes, pDst, 0,
+	                             nullptr, nullptr);
+	if (status != CL_SUCCESS)
+	{
+		WLog_ERR(TAG, "unable to read back buffer");
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+/**
+ * Run one YUV to RGB conversion with the named OpenCL kernel.
+ *
+ * Creates the kernel object, binds the source planes and the destination, executes the kernel
+ * and reads the result back into pDst. The kernel object is released before returning.
+ *
+ * @return PRIMITIVES_SUCCESS if every step succeeded, -1 otherwise
+ */
+static pstatus_t opencl_YUVToRGB(const char* kernelName, const BYTE* WINPR_RESTRICT pSrc[3],
+                                 const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
+                                 const prim_size_t* WINPR_RESTRICT roi)
+{
+	pstatus_t status = -1;
+	primitives_cl_kernel* job = cl_kernel_new(kernelName, roi);
+
+	/* short-circuit evaluation: a step only runs if all previous steps succeeded */
+	if (job && cl_kernel_set_sources(job, pSrc, srcStep) &&
+	    cl_kernel_set_destination(job, dstStep) && cl_kernel_process(job, pDst))
+	{
+		status = PRIMITIVES_SUCCESS;
+	}
+
+	cl_kernel_free(job);
+	return status;
+}
+
+static primitives_opencl_context openclContext = WINPR_C_ARRAY_INIT;
+
+/* Accessor for the file-level OpenCL context; always returns the address of openclContext. */
+static primitives_opencl_context* primitives_get_opencl_context(void)
+{
+	primitives_opencl_context* shared = &openclContext;
+	return shared;
+}
+
+/**
+ * Release every OpenCL object stored in ctx and mark the context as unsupported.
+ *
+ * The function is also used as the error path of primitives_init_opencl_context(), where some
+ * handles (for example the program) may still be null. Each handle is therefore released only
+ * if it is set, and cleared afterwards so that a repeated call cannot release it twice.
+ *
+ * @param ctx context to tear down; a null pointer is ignored
+ */
+static void cl_context_free(primitives_opencl_context* ctx)
+{
+	if (!ctx)
+		return;
+
+	if (ctx->program)
+		clReleaseProgram(ctx->program);
+	ctx->program = nullptr;
+
+	if (ctx->commandQueue)
+		clReleaseCommandQueue(ctx->commandQueue);
+	ctx->commandQueue = nullptr;
+
+	if (ctx->context)
+		clReleaseContext(ctx->context);
+	ctx->context = nullptr;
+
+	if (ctx->deviceId)
+		clReleaseDevice(ctx->deviceId);
+	ctx->deviceId = nullptr;
+
+	ctx->support = FALSE;
+}
+
+/* Tear down the file-level OpenCL context if it was initialised; always reports success. */
+static pstatus_t primitives_uninit_opencl(void)
+{
+	if (openclContext.support)
+		cl_context_free(&openclContext);
+
+	return PRIMITIVES_SUCCESS;
+}
+/**
+ * Find a usable OpenCL GPU and build the embedded kernel program for it.
+ *
+ * Platforms are probed in enumeration order; the first one that yields a GPU device, a context
+ * and a command queue is kept. The source in openclProgram is then built for that device and a
+ * yuv420_to_bgra_1b kernel is created once as a check of the build result.
+ *
+ * @param prims context to fill (platformId, deviceId, context, commandQueue, program, support)
+ * @return TRUE when prims is completely set up, FALSE otherwise
+ */
+static BOOL primitives_init_opencl_context(primitives_opencl_context* WINPR_RESTRICT prims)
+{
+	cl_uint ndevices = 0;
+	cl_uint nplatforms = 0;
+	cl_kernel kernel = nullptr;
+
+	BOOL gotGPU = FALSE;
+	size_t programLen = 0;
+	/* first call: only ask how many platforms are available */
+	cl_int ret = clGetPlatformIDs(0, nullptr, &nplatforms);
+	if (ret != CL_SUCCESS || nplatforms < 1)
+		return FALSE;
+
+	cl_platform_id* platform_ids = (cl_platform_id*)calloc(nplatforms, sizeof(cl_platform_id));
+	if (!platform_ids)
+		return FALSE;
+	/* second call: fetch the platform ids into the array allocated above */
+	ret = clGetPlatformIDs(nplatforms, platform_ids, &nplatforms);
+	if (ret != CL_SUCCESS)
+	{
+		free((void*)platform_ids);
+		return FALSE;
+	}
+	/* the loop ends at the first platform for which device, context and queue all succeed */
+	for (cl_uint i = 0; (i < nplatforms) && !gotGPU; i++)
+	{
+		cl_device_id device_id = nullptr;
+		cl_context context = nullptr;
+		char platformName[1000] = WINPR_C_ARRAY_INIT;
+		char deviceName[1000] = WINPR_C_ARRAY_INIT;
+
+		ret = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, sizeof(platformName),
+		                        platformName, nullptr);
+		if (ret != CL_SUCCESS)
+			continue;
+		/* at most one GPU device is requested per platform */
+		ret = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, 1, &device_id, &ndevices);
+		if (ret != CL_SUCCESS)
+			continue;
+
+		ret = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(deviceName), deviceName, nullptr);
+		if (ret != CL_SUCCESS)
+		{
+			WLog_ERR(TAG, "openCL: unable get device name for platform %s", platformName);
+			clReleaseDevice(device_id);
+			continue;
+		}
+
+		context = clCreateContext(nullptr, 1, &device_id, nullptr, nullptr, &ret);
+		if (ret != CL_SUCCESS)
+		{
+			WLog_ERR(TAG, "openCL: unable to create context for platform %s, device %s",
+			         platformName, deviceName);
+			clReleaseDevice(device_id);
+			continue;
+		}
+		/* the queue constructor is chosen by whether the headers define CL_VERSION_2_0 */
+#if defined(CL_VERSION_2_0)
+		prims->commandQueue = clCreateCommandQueueWithProperties(context, device_id, nullptr, &ret);
+#else
+		prims->commandQueue = clCreateCommandQueue(context, device_id, 0, &ret);
+#endif
+		if (ret != CL_SUCCESS)
+		{
+			WLog_ERR(TAG, "openCL: unable to create command queue");
+			clReleaseContext(context);
+			clReleaseDevice(device_id);
+			continue;
+		}
+
+		WLog_INFO(TAG, "openCL: using platform=%s device=%s", platformName, deviceName);
+		/* everything worked: remember the selection and leave the loop via gotGPU */
+		prims->platformId = platform_ids[i];
+		prims->deviceId = device_id;
+		prims->context = context;
+		gotGPU = TRUE;
+	}
+
+	free((void*)platform_ids);
+
+	if (!gotGPU)
+	{
+		WLog_ERR(TAG, "openCL: no GPU found");
+		return FALSE;
+	}
+	/* the source length is capped at the size of the embedded array */
+	programLen = strnlen(openclProgram, sizeof(openclProgram));
+	const char* ptr = openclProgram;
+	prims->program = clCreateProgramWithSource(prims->context, 1, &ptr, &programLen, &ret);
+	if (ret != CL_SUCCESS)
+	{
+		WLog_ERR(TAG, "openCL: unable to create program");
+		goto fail;
+	}
+
+	ret = clBuildProgram(prims->program, 1, &prims->deviceId, nullptr, nullptr, nullptr);
+	if (ret != CL_SUCCESS)
+	{
+		size_t length = 0;
+		char buffer[2048];
+		ret = clGetProgramBuildInfo(prims->program, prims->deviceId, CL_PROGRAM_BUILD_LOG,
+		                            sizeof(buffer), buffer, &length);
+		if (ret != CL_SUCCESS)
+		{
+			WLog_ERR(TAG,
+			         "openCL: building program failed but unable to retrieve buildLog, error=%d",
+			         ret);
+		}
+		else
+		{
+			WLog_ERR(TAG, "openCL: unable to build program, errorLog=%s", buffer);
+		}
+		goto fail;
+	}
+	/* create one known kernel as a check of the built program; it is released right away */
+	kernel = clCreateKernel(prims->program, "yuv420_to_bgra_1b", &ret);
+	if (ret != CL_SUCCESS)
+	{
+		WLog_ERR(TAG, "openCL: unable to create yuv420_to_bgra_1b kernel");
+		goto fail;
+	}
+	clReleaseKernel(kernel);
+
+	prims->support = TRUE;
+	return TRUE;
+	/* error path after a GPU was selected: cleanup is delegated to cl_context_free() */
+fail:
+	cl_context_free(prims);
+	return FALSE;
+}
+
+/**
+ * YUV420 to 32-bit RGB conversion entry point of the OpenCL primitives.
+ *
+ * Looks up the kernel matching DstFormat and runs it through opencl_YUVToRGB(). Formats without
+ * a kernel are delegated to the PRIMITIVES_ONLY_CPU implementation.
+ *
+ * NOTE(review): PIXEL_FORMAT_RGBX32 selects the "rgba" kernel and PIXEL_FORMAT_RGBA32 the
+ * "rgbx" kernel, unlike the other format pairs. The mapping is kept as found - confirm whether
+ * it is intended.
+ *
+ * @return result of the selected implementation, or -1 if no CPU fallback is available
+ */
+static pstatus_t opencl_YUV420ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[3],
+                                              const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
+                                              UINT32 dstStep, UINT32 DstFormat,
+                                              const prim_size_t* WINPR_RESTRICT roi)
+{
+	static const struct
+	{
+		UINT32 format;
+		const char* kernel;
+	} kernels[] = { { PIXEL_FORMAT_ABGR32, "yuv420_to_abgr_1b" },
+		            { PIXEL_FORMAT_XBGR32, "yuv420_to_xbgr_1b" },
+		            { PIXEL_FORMAT_RGBX32, "yuv420_to_rgba_1b" },
+		            { PIXEL_FORMAT_RGBA32, "yuv420_to_rgbx_1b" },
+		            { PIXEL_FORMAT_BGRA32, "yuv420_to_bgra_1b" },
+		            { PIXEL_FORMAT_BGRX32, "yuv420_to_bgrx_1b" },
+		            { PIXEL_FORMAT_XRGB32, "yuv420_to_xrgb_1b" },
+		            { PIXEL_FORMAT_ARGB32, "yuv420_to_argb_1b" } };
+
+	for (size_t i = 0; i < ARRAYSIZE(kernels); i++)
+	{
+		if (kernels[i].format == DstFormat)
+			return opencl_YUVToRGB(kernels[i].kernel, pSrc, srcStep, pDst, dstStep, roi);
+	}
+
+	/* no OpenCL kernel for this format */
+	primitives_t* cpu = primitives_get_by_type(PRIMITIVES_ONLY_CPU);
+	if (!cpu)
+		return -1;
+	return cpu->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+}
+
+/**
+ * YUV444 to 32-bit RGB conversion entry point of the OpenCL primitives.
+ *
+ * Looks up the kernel matching DstFormat and runs it through opencl_YUVToRGB(). Formats without
+ * a kernel are delegated to the PRIMITIVES_ONLY_CPU implementation.
+ *
+ * NOTE(review): PIXEL_FORMAT_RGBX32 selects the "rgba" kernel and PIXEL_FORMAT_RGBA32 the
+ * "rgbx" kernel, unlike the other format pairs. The mapping is kept as found - confirm whether
+ * it is intended.
+ *
+ * @return result of the selected implementation, or -1 if no CPU fallback is available
+ */
+static pstatus_t opencl_YUV444ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[3],
+                                              const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
+                                              UINT32 dstStep, UINT32 DstFormat,
+                                              const prim_size_t* WINPR_RESTRICT roi)
+{
+	static const struct
+	{
+		UINT32 format;
+		const char* kernel;
+	} kernels[] = { { PIXEL_FORMAT_ABGR32, "yuv444_to_abgr_1b" },
+		            { PIXEL_FORMAT_XBGR32, "yuv444_to_xbgr_1b" },
+		            { PIXEL_FORMAT_RGBX32, "yuv444_to_rgba_1b" },
+		            { PIXEL_FORMAT_RGBA32, "yuv444_to_rgbx_1b" },
+		            { PIXEL_FORMAT_BGRA32, "yuv444_to_bgra_1b" },
+		            { PIXEL_FORMAT_BGRX32, "yuv444_to_bgrx_1b" },
+		            { PIXEL_FORMAT_XRGB32, "yuv444_to_xrgb_1b" },
+		            { PIXEL_FORMAT_ARGB32, "yuv444_to_argb_1b" } };
+
+	for (size_t i = 0; i < ARRAYSIZE(kernels); i++)
+	{
+		if (kernels[i].format == DstFormat)
+			return opencl_YUVToRGB(kernels[i].kernel, pSrc, srcStep, pDst, dstStep, roi);
+	}
+
+	/* no OpenCL kernel for this format */
+	primitives_t* cpu = primitives_get_by_type(PRIMITIVES_ONLY_CPU);
+	if (!cpu)
+		return -1;
+	return cpu->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+}
+
+/**
+ * Fill prims with the CPU primitives and, if an OpenCL GPU can be set up, replace the two YUV
+ * to RGB conversions with their OpenCL versions.
+ *
+ * @param prims table to initialise
+ * @return FALSE if prims is null or the CPU primitives are unavailable; TRUE otherwise, also
+ *         when OpenCL could not be initialised and only the CPU routines were installed
+ */
+BOOL primitives_init_opencl(primitives_t* prims)
+{
+	const primitives_t* cpu = primitives_get_by_type(PRIMITIVES_ONLY_CPU);
+	if (!cpu || !prims)
+		return FALSE;
+
+	/* start from the CPU table so every entry has a working implementation */
+	*prims = *cpu;
+
+	if (primitives_init_opencl_context(&openclContext))
+	{
+		prims->YUV420ToRGB_8u_P3AC4R = opencl_YUV420ToRGB_8u_P3AC4R;
+		prims->YUV444ToRGB_8u_P3AC4R = opencl_YUV444ToRGB_8u_P3AC4R;
+		prims->flags |= PRIM_FLAGS_HAVE_EXTGPU;
+		prims->uninit = primitives_uninit_opencl;
+	}
+
+	return TRUE;
+}
+#endif
@@ -0,0 +1,474 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Optimized operations using openCL
+ * vi:ts=4 sw=4
+ *
+ * Copyright 2019 David Fort <contact@hardening-consulting.com>
+ * Copyright 2019 Rangee Gmbh
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+/* Limit v to the range given by l and h (upper bound applied first) and narrow it to uchar. */
+uchar clamp_uc(int v, short l, short h)
+{
+	const int capped = (v > h) ? h : v;
+	const int bounded = (capped < l) ? l : capped;
+	return (uchar)bounded;
+}
+
+/**
+ * Fetch the chroma sample of pixel (x, y) from a full-resolution plane.
+ *
+ * Fix: the sample index now includes the column (y * stride + x). Previously x was ignored, so
+ * every pixel of a row received the value stored in column 0 of that row.
+ *
+ * For the pixel at (0, 0) the value 4 * U00 - U01 - U10 - U11 is computed from the sample and
+ * its right, lower and lower-right neighbours, clamped to 0..255, and returned instead of the
+ * stored sample when both differ by 30 or more.
+ *
+ * NOTE(review): the special case is limited to (0, 0) exactly as before; presumably it was
+ * meant for the top-left pixel of every 2x2 block - confirm against the CPU implementation.
+ * The neighbour reads assume the plane holds at least 2 columns and 2 rows.
+ *
+ * @param buf    chroma plane
+ * @param stride bytes between two rows of buf
+ * @param x      pixel column
+ * @param y      pixel row
+ * @return chroma value in the range 0..255
+ */
+short avgUV(__global const uchar* buf, unsigned stride, unsigned x, unsigned y)
+{
+	const unsigned pos = y * stride + x;
+	const short U00 = buf[pos];
+	if ((x != 0) || (y != 0))
+		return U00;
+	const short U01 = buf[pos + 1];
+	const short U10 = buf[pos + stride];
+	const short U11 = buf[pos + stride + 1];
+	const short avg = U00 * 4 - U01 - U10 - U11;
+	const short avgU = clamp_uc(avg, 0, 255);
+	const short diff = abs(U00 - avgU);
+	/* keep the stored sample unless the reconstructed value differs noticeably */
+	if (diff < 30)
+		return U00;
+	return avgU;
+}
+
+/* YUV 4:2:0 to RGBA. One work-item converts the pixel at (get_global_id(0), get_global_id(1)).
+ * Bytes 0..2 receive R, G, B; byte 3 (alpha) is not written. */
+__kernel void yuv420_to_rgba_1b(__global const uchar* bufY, unsigned strideY,
+                                __global const uchar* bufU, unsigned strideU,
+                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
+                                unsigned strideDest)
+{
+	const unsigned int col = get_global_id(0);
+	const unsigned int row = get_global_id(1);
+
+	/* the chroma planes are indexed at half the luma coordinates */
+	const unsigned int chromaCol = col / 2;
+	const unsigned int chromaRow = row / 2;
+
+	const short luma = bufY[row * strideY + col];
+	const short uOff = bufU[chromaRow * strideU + chromaCol] - 128;
+	const short vOff = bufV[chromaRow * strideV + chromaCol] - 128;
+
+	/* coefficients are scaled by 256: R = Y + 403*V', G = Y - 48*U' - 120*V', B = Y + 475*U' */
+	const int luma256 = 256 * luma;
+	const uchar red = clamp_uc((luma256 + (403 * vOff)) >> 8, 0, 255);
+	const uchar green = clamp_uc((luma256 - (48 * uOff) - (120 * vOff)) >> 8, 0, 255);
+	const uchar blue = clamp_uc((luma256 + (475 * uOff)) >> 8, 0, 255);
+
+	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
+	pixel[0] = red;
+	pixel[1] = green;
+	pixel[2] = blue;
+}
+
+/* YUV 4:2:0 to ABGR. One work-item converts the pixel at (get_global_id(0), get_global_id(1)).
+ * Bytes 1..3 receive B, G, R; byte 0 (alpha) is not written. */
+__kernel void yuv420_to_abgr_1b(__global const uchar* bufY, unsigned strideY,
+                                __global const uchar* bufU, unsigned strideU,
+                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
+                                unsigned strideDest)
+{
+	const unsigned int col = get_global_id(0);
+	const unsigned int row = get_global_id(1);
+
+	/* the chroma planes are indexed at half the luma coordinates */
+	const unsigned int chromaCol = col / 2;
+	const unsigned int chromaRow = row / 2;
+
+	const short luma = bufY[row * strideY + col];
+	const short uOff = bufU[chromaRow * strideU + chromaCol] - 128;
+	const short vOff = bufV[chromaRow * strideV + chromaCol] - 128;
+
+	/* coefficients are scaled by 256: R = Y + 403*V', G = Y - 48*U' - 120*V', B = Y + 475*U' */
+	const int luma256 = 256 * luma;
+	const uchar red = clamp_uc((luma256 + (403 * vOff)) >> 8, 0, 255);
+	const uchar green = clamp_uc((luma256 - (48 * uOff) - (120 * vOff)) >> 8, 0, 255);
+	const uchar blue = clamp_uc((luma256 + (475 * uOff)) >> 8, 0, 255);
+
+	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
+	pixel[1] = blue;
+	pixel[2] = green;
+	pixel[3] = red;
+}
+
+/* YUV 4:4:4 to ABGR. One work-item converts the pixel at (get_global_id(0), get_global_id(1)).
+ * Bytes 1..3 receive B, G, R; byte 0 (alpha) is not written. */
+__kernel void yuv444_to_abgr_1b(__global const uchar* bufY, unsigned strideY,
+                                __global const uchar* bufU, unsigned strideU,
+                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
+                                unsigned strideDest)
+{
+	const unsigned int col = get_global_id(0);
+	const unsigned int row = get_global_id(1);
+
+	const short luma = bufY[row * strideY + col];
+	/* chroma comes from avgUV() and is re-centred around zero */
+	const short uOff = avgUV(bufU, strideU, col, row) - 128;
+	const short vOff = avgUV(bufV, strideV, col, row) - 128;
+
+	/* coefficients are scaled by 256: R = Y + 403*V', G = Y - 48*U' - 120*V', B = Y + 475*U' */
+	const int luma256 = 256 * luma;
+	const uchar red = clamp_uc((luma256 + (403 * vOff)) >> 8, 0, 255);
+	const uchar green = clamp_uc((luma256 - (48 * uOff) - (120 * vOff)) >> 8, 0, 255);
+	const uchar blue = clamp_uc((luma256 + (475 * uOff)) >> 8, 0, 255);
+
+	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
+	pixel[1] = blue;
+	pixel[2] = green;
+	pixel[3] = red;
+}
+
+/* YUV 4:4:4 to RGBA. One work-item converts the pixel at (get_global_id(0), get_global_id(1)).
+ * Bytes 0..2 receive R, G, B; byte 3 (alpha) is not written. */
+__kernel void yuv444_to_rgba_1b(__global const uchar* bufY, unsigned strideY,
+                                __global const uchar* bufU, unsigned strideU,
+                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
+                                unsigned strideDest)
+{
+	const unsigned int col = get_global_id(0);
+	const unsigned int row = get_global_id(1);
+
+	const short luma = bufY[row * strideY + col];
+	/* chroma comes from avgUV() and is re-centred around zero */
+	const short uOff = avgUV(bufU, strideU, col, row) - 128;
+	const short vOff = avgUV(bufV, strideV, col, row) - 128;
+
+	/* coefficients are scaled by 256: R = Y + 403*V', G = Y - 48*U' - 120*V', B = Y + 475*U' */
+	const int luma256 = 256 * luma;
+	const uchar red = clamp_uc((luma256 + (403 * vOff)) >> 8, 0, 255);
+	const uchar green = clamp_uc((luma256 - (48 * uOff) - (120 * vOff)) >> 8, 0, 255);
+	const uchar blue = clamp_uc((luma256 + (475 * uOff)) >> 8, 0, 255);
+
+	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
+	pixel[0] = red;
+	pixel[1] = green;
+	pixel[2] = blue;
+}
+
+/* YUV 4:2:0 to RGBX. One work-item converts the pixel at (get_global_id(0), get_global_id(1)).
+ * Bytes 0..2 receive R, G, B; byte 3 is set to 0xff. */
+__kernel void yuv420_to_rgbx_1b(__global const uchar* bufY, unsigned strideY,
+                                __global const uchar* bufU, unsigned strideU,
+                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
+                                unsigned strideDest)
+{
+	const unsigned int col = get_global_id(0);
+	const unsigned int row = get_global_id(1);
+
+	/* the chroma planes are indexed at half the luma coordinates */
+	const unsigned int chromaCol = col / 2;
+	const unsigned int chromaRow = row / 2;
+
+	const short luma = bufY[row * strideY + col];
+	const short uOff = bufU[chromaRow * strideU + chromaCol] - 128;
+	const short vOff = bufV[chromaRow * strideV + chromaCol] - 128;
+
+	/* coefficients are scaled by 256: R = Y + 403*V', G = Y - 48*U' - 120*V', B = Y + 475*U' */
+	const int luma256 = 256 * luma;
+	const uchar red = clamp_uc((luma256 + (403 * vOff)) >> 8, 0, 255);
+	const uchar green = clamp_uc((luma256 - (48 * uOff) - (120 * vOff)) >> 8, 0, 255);
+	const uchar blue = clamp_uc((luma256 + (475 * uOff)) >> 8, 0, 255);
+
+	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
+	pixel[0] = red;
+	pixel[1] = green;
+	pixel[2] = blue;
+	pixel[3] = 0xff;
+}
+
+/* YUV 4:2:0 to XBGR. One work-item converts the pixel at (get_global_id(0), get_global_id(1)).
+ * Byte 0 is set to 0xff; bytes 1..3 receive B, G, R. */
+__kernel void yuv420_to_xbgr_1b(__global const uchar* bufY, unsigned strideY,
+                                __global const uchar* bufU, unsigned strideU,
+                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
+                                unsigned strideDest)
+{
+	const unsigned int col = get_global_id(0);
+	const unsigned int row = get_global_id(1);
+
+	/* the chroma planes are indexed at half the luma coordinates */
+	const unsigned int chromaCol = col / 2;
+	const unsigned int chromaRow = row / 2;
+
+	const short luma = bufY[row * strideY + col];
+	const short uOff = bufU[chromaRow * strideU + chromaCol] - 128;
+	const short vOff = bufV[chromaRow * strideV + chromaCol] - 128;
+
+	/* coefficients are scaled by 256: R = Y + 403*V', G = Y - 48*U' - 120*V', B = Y + 475*U' */
+	const int luma256 = 256 * luma;
+	const uchar red = clamp_uc((luma256 + (403 * vOff)) >> 8, 0, 255);
+	const uchar green = clamp_uc((luma256 - (48 * uOff) - (120 * vOff)) >> 8, 0, 255);
+	const uchar blue = clamp_uc((luma256 + (475 * uOff)) >> 8, 0, 255);
+
+	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
+	pixel[0] = 0xff;
+	pixel[1] = blue;
+	pixel[2] = green;
+	pixel[3] = red;
+}
+
+/**
+ * YUV 4:4:4 to XBGR. One work-item converts the pixel at (get_global_id(0), get_global_id(1)).
+ * Byte 0 is set to 0xff; bytes 1..3 receive B, G, R.
+ *
+ * Fix: blue is now derived from D (U - 128) like in every other kernel of this file. The
+ * previous code used the raw U value, which shifted the blue channel.
+ */
+__kernel void yuv444_to_xbgr_1b(__global const uchar* bufY, unsigned strideY,
+                                __global const uchar* bufU, unsigned strideU,
+                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
+                                unsigned strideDest)
+{
+	unsigned int x = get_global_id(0);
+	unsigned int y = get_global_id(1);
+
+	short Y = bufY[y * strideY + x];
+	short U = avgUV(bufU, strideU, x, y);
+	short V = avgUV(bufV, strideV, x, y);
+	/* chroma re-centred around zero; only D and E may enter the colour equations */
+	short D = U - 128;
+	short E = V - 128;
+
+	__global uchar* destPtr = dest + (strideDest * y) + (x * 4);
+
+	/**
+	 * | R |   ( | 256     0    403 | |    Y    | )
+	 * | G | = ( | 256   -48   -120 | | U - 128 | ) >> 8
+	 * | B |   ( | 256   475      0 | | V - 128 | )
+	 */
+	int y256 = 256 * Y;
+	destPtr[0] = 0xff;                                                 /* A */
+	destPtr[1] = clamp_uc((y256 + (475 * D)) >> 8, 0, 255);            /* B */
+	destPtr[2] = clamp_uc((y256 - (48 * D) - (120 * E)) >> 8, 0, 255); /* G */
+	destPtr[3] = clamp_uc((y256 + (403 * E)) >> 8, 0, 255);            /* R */
+}
+
+/* YUV 4:4:4 to RGBX. One work-item converts the pixel at (get_global_id(0), get_global_id(1)).
+ * Bytes 0..2 receive R, G, B; byte 3 is set to 0xff. */
+__kernel void yuv444_to_rgbx_1b(__global const uchar* bufY, unsigned strideY,
+                                __global const uchar* bufU, unsigned strideU,
+                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
+                                unsigned strideDest)
+{
+	const unsigned int col = get_global_id(0);
+	const unsigned int row = get_global_id(1);
+
+	const short luma = bufY[row * strideY + col];
+	/* chroma comes from avgUV() and is re-centred around zero */
+	const short uOff = avgUV(bufU, strideU, col, row) - 128;
+	const short vOff = avgUV(bufV, strideV, col, row) - 128;
+
+	/* coefficients are scaled by 256: R = Y + 403*V', G = Y - 48*U' - 120*V', B = Y + 475*U' */
+	const int luma256 = 256 * luma;
+	const uchar red = clamp_uc((luma256 + (403 * vOff)) >> 8, 0, 255);
+	const uchar green = clamp_uc((luma256 - (48 * uOff) - (120 * vOff)) >> 8, 0, 255);
+	const uchar blue = clamp_uc((luma256 + (475 * uOff)) >> 8, 0, 255);
+
+	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
+	pixel[0] = red;
+	pixel[1] = green;
+	pixel[2] = blue;
+	pixel[3] = 0xff;
+}
+
+/* YUV 4:2:0 to ARGB. One work-item converts the pixel at (get_global_id(0), get_global_id(1)).
+ * Bytes 1..3 receive R, G, B; byte 0 (alpha) is not written. */
+__kernel void yuv420_to_argb_1b(__global const uchar* bufY, unsigned strideY,
+                                __global const uchar* bufU, unsigned strideU,
+                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
+                                unsigned strideDest)
+{
+	const unsigned int col = get_global_id(0);
+	const unsigned int row = get_global_id(1);
+
+	/* the chroma planes are indexed at half the luma coordinates */
+	const unsigned int chromaCol = col / 2;
+	const unsigned int chromaRow = row / 2;
+
+	const short luma = bufY[row * strideY + col];
+	const short uOff = bufU[chromaRow * strideU + chromaCol] - 128;
+	const short vOff = bufV[chromaRow * strideV + chromaCol] - 128;
+
+	/* coefficients are scaled by 256: R = Y + 403*V', G = Y - 48*U' - 120*V', B = Y + 475*U' */
+	const int luma256 = 256 * luma;
+	const uchar red = clamp_uc((luma256 + (403 * vOff)) >> 8, 0, 255);
+	const uchar green = clamp_uc((luma256 - (48 * uOff) - (120 * vOff)) >> 8, 0, 255);
+	const uchar blue = clamp_uc((luma256 + (475 * uOff)) >> 8, 0, 255);
+
+	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
+	pixel[1] = red;
+	pixel[2] = green;
+	pixel[3] = blue;
+}
+
+/* YUV 4:2:0 to BGRA. One work-item converts the pixel at (get_global_id(0), get_global_id(1)).
+ * Bytes 0..2 receive B, G, R; byte 3 (alpha) is not written. */
+__kernel void yuv420_to_bgra_1b(__global const uchar* bufY, unsigned strideY,
+                                __global const uchar* bufU, unsigned strideU,
+                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
+                                unsigned strideDest)
+{
+	const unsigned int col = get_global_id(0);
+	const unsigned int row = get_global_id(1);
+
+	/* the chroma planes are indexed at half the luma coordinates */
+	const unsigned int chromaCol = col / 2;
+	const unsigned int chromaRow = row / 2;
+
+	const short luma = bufY[row * strideY + col];
+	const short uOff = bufU[chromaRow * strideU + chromaCol] - 128;
+	const short vOff = bufV[chromaRow * strideV + chromaCol] - 128;
+
+	/* coefficients are scaled by 256: R = Y + 403*V', G = Y - 48*U' - 120*V', B = Y + 475*U' */
+	const int luma256 = 256 * luma;
+	const uchar red = clamp_uc((luma256 + (403 * vOff)) >> 8, 0, 255);
+	const uchar green = clamp_uc((luma256 - (48 * uOff) - (120 * vOff)) >> 8, 0, 255);
+	const uchar blue = clamp_uc((luma256 + (475 * uOff)) >> 8, 0, 255);
+
+	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
+	pixel[0] = blue;
+	pixel[1] = green;
+	pixel[2] = red;
+}
+
+/* YUV 4:4:4 to BGRA. One work-item converts the pixel at (get_global_id(0), get_global_id(1)).
+ * Bytes 0..2 receive B, G, R; byte 3 (alpha) is not written. */
+__kernel void yuv444_to_bgra_1b(__global const uchar* bufY, unsigned strideY,
+                                __global const uchar* bufU, unsigned strideU,
+                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
+                                unsigned strideDest)
+{
+	const unsigned int col = get_global_id(0);
+	const unsigned int row = get_global_id(1);
+
+	const short luma = bufY[row * strideY + col];
+	/* chroma comes from avgUV() and is re-centred around zero */
+	const short uOff = avgUV(bufU, strideU, col, row) - 128;
+	const short vOff = avgUV(bufV, strideV, col, row) - 128;
+
+	/* coefficients are scaled by 256: R = Y + 403*V', G = Y - 48*U' - 120*V', B = Y + 475*U' */
+	const int luma256 = 256 * luma;
+	const uchar red = clamp_uc((luma256 + (403 * vOff)) >> 8, 0, 255);
+	const uchar green = clamp_uc((luma256 - (48 * uOff) - (120 * vOff)) >> 8, 0, 255);
+	const uchar blue = clamp_uc((luma256 + (475 * uOff)) >> 8, 0, 255);
+
+	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
+	pixel[0] = blue;
+	pixel[1] = green;
+	pixel[2] = red;
+}
+
+/* YUV 4:4:4 to ARGB. One work-item converts the pixel at (get_global_id(0), get_global_id(1)).
+ * Bytes 1..3 receive R, G, B; byte 0 (alpha) is not written. */
+__kernel void yuv444_to_argb_1b(__global const uchar* bufY, unsigned strideY,
+                                __global const uchar* bufU, unsigned strideU,
+                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
+                                unsigned strideDest)
+{
+	const unsigned int col = get_global_id(0);
+	const unsigned int row = get_global_id(1);
+
+	const short luma = bufY[row * strideY + col];
+	/* chroma comes from avgUV() and is re-centred around zero */
+	const short uOff = avgUV(bufU, strideU, col, row) - 128;
+	const short vOff = avgUV(bufV, strideV, col, row) - 128;
+
+	/* coefficients are scaled by 256: R = Y + 403*V', G = Y - 48*U' - 120*V', B = Y + 475*U' */
+	const int luma256 = 256 * luma;
+	const uchar red = clamp_uc((luma256 + (403 * vOff)) >> 8, 0, 255);
+	const uchar green = clamp_uc((luma256 - (48 * uOff) - (120 * vOff)) >> 8, 0, 255);
+	const uchar blue = clamp_uc((luma256 + (475 * uOff)) >> 8, 0, 255);
+
+	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
+	pixel[1] = red;
+	pixel[2] = green;
+	pixel[3] = blue;
+}
+
+/* YUV 4:2:0 to XRGB. One work-item converts the pixel at (get_global_id(0), get_global_id(1)).
+ * Byte 0 is set to 0xff; bytes 1..3 receive R, G, B. */
+__kernel void yuv420_to_xrgb_1b(__global const uchar* bufY, unsigned strideY,
+                                __global const uchar* bufU, unsigned strideU,
+                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
+                                unsigned strideDest)
+{
+	const unsigned int col = get_global_id(0);
+	const unsigned int row = get_global_id(1);
+
+	/* the chroma planes are indexed at half the luma coordinates */
+	const unsigned int chromaCol = col / 2;
+	const unsigned int chromaRow = row / 2;
+
+	const short luma = bufY[row * strideY + col];
+	const short uOff = bufU[chromaRow * strideU + chromaCol] - 128;
+	const short vOff = bufV[chromaRow * strideV + chromaCol] - 128;
+
+	/* coefficients are scaled by 256: R = Y + 403*V', G = Y - 48*U' - 120*V', B = Y + 475*U' */
+	const int luma256 = 256 * luma;
+	const uchar red = clamp_uc((luma256 + (403 * vOff)) >> 8, 0, 255);
+	const uchar green = clamp_uc((luma256 - (48 * uOff) - (120 * vOff)) >> 8, 0, 255);
+	const uchar blue = clamp_uc((luma256 + (475 * uOff)) >> 8, 0, 255);
+
+	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
+	pixel[0] = 0xff;
+	pixel[1] = red;
+	pixel[2] = green;
+	pixel[3] = blue;
+}
+
+/* YUV 4:2:0 to BGRX. One work-item converts the pixel at (get_global_id(0), get_global_id(1)).
+ * Bytes 0..2 receive B, G, R; byte 3 is set to 0xff. */
+__kernel void yuv420_to_bgrx_1b(__global const uchar* bufY, unsigned strideY,
+                                __global const uchar* bufU, unsigned strideU,
+                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
+                                unsigned strideDest)
+{
+	const unsigned int col = get_global_id(0);
+	const unsigned int row = get_global_id(1);
+
+	/* the chroma planes are indexed at half the luma coordinates */
+	const unsigned int chromaCol = col / 2;
+	const unsigned int chromaRow = row / 2;
+
+	const short luma = bufY[row * strideY + col];
+	const short uOff = bufU[chromaRow * strideU + chromaCol] - 128;
+	const short vOff = bufV[chromaRow * strideV + chromaCol] - 128;
+
+	/* coefficients are scaled by 256: R = Y + 403*V', G = Y - 48*U' - 120*V', B = Y + 475*U' */
+	const int luma256 = 256 * luma;
+	const uchar red = clamp_uc((luma256 + (403 * vOff)) >> 8, 0, 255);
+	const uchar green = clamp_uc((luma256 - (48 * uOff) - (120 * vOff)) >> 8, 0, 255);
+	const uchar blue = clamp_uc((luma256 + (475 * uOff)) >> 8, 0, 255);
+
+	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
+	pixel[0] = blue;
+	pixel[1] = green;
+	pixel[2] = red;
+	pixel[3] = 0xff;
+}
+
+/* YUV 4:4:4 to BGRX. One work-item converts the pixel at (get_global_id(0), get_global_id(1)).
+ * Bytes 0..2 receive B, G, R; byte 3 is set to 0xff. */
+__kernel void yuv444_to_bgrx_1b(__global const uchar* bufY, unsigned strideY,
+                                __global const uchar* bufU, unsigned strideU,
+                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
+                                unsigned strideDest)
+{
+	const unsigned int col = get_global_id(0);
+	const unsigned int row = get_global_id(1);
+
+	const short luma = bufY[row * strideY + col];
+	/* chroma comes from avgUV() and is re-centred around zero */
+	const short uOff = avgUV(bufU, strideU, col, row) - 128;
+	const short vOff = avgUV(bufV, strideV, col, row) - 128;
+
+	/* coefficients are scaled by 256: R = Y + 403*V', G = Y - 48*U' - 120*V', B = Y + 475*U' */
+	const int luma256 = 256 * luma;
+	const uchar red = clamp_uc((luma256 + (403 * vOff)) >> 8, 0, 255);
+	const uchar green = clamp_uc((luma256 - (48 * uOff) - (120 * vOff)) >> 8, 0, 255);
+	const uchar blue = clamp_uc((luma256 + (475 * uOff)) >> 8, 0, 255);
+
+	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
+	pixel[0] = blue;
+	pixel[1] = green;
+	pixel[2] = red;
+	pixel[3] = 0xff;
+}
+
+/* YUV 4:4:4 to XRGB. One work-item converts the pixel at (get_global_id(0), get_global_id(1)).
+ * Byte 0 is set to 0xff; bytes 1..3 receive R, G, B. */
+__kernel void yuv444_to_xrgb_1b(__global const uchar* bufY, unsigned strideY,
+                                __global const uchar* bufU, unsigned strideU,
+                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
+                                unsigned strideDest)
+{
+	const unsigned int col = get_global_id(0);
+	const unsigned int row = get_global_id(1);
+
+	const short luma = bufY[row * strideY + col];
+	/* chroma comes from avgUV() and is re-centred around zero */
+	const short uOff = avgUV(bufU, strideU, col, row) - 128;
+	const short vOff = avgUV(bufV, strideV, col, row) - 128;
+
+	/* coefficients are scaled by 256: R = Y + 403*V', G = Y - 48*U' - 120*V', B = Y + 475*U' */
+	const int luma256 = 256 * luma;
+	const uchar red = clamp_uc((luma256 + (403 * vOff)) >> 8, 0, 255);
+	const uchar green = clamp_uc((luma256 - (48 * uOff) - (120 * vOff)) >> 8, 0, 255);
+	const uchar blue = clamp_uc((luma256 + (475 * uOff)) >> 8, 0, 255);
+
+	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
+	pixel[0] = 0xff;
+	pixel[1] = red;
+	pixel[2] = green;
+	pixel[3] = blue;
+}
@@ -0,0 +1,11 @@
+/* AUTOGENERATED file, do not edit
+ *
+ * part of @PROJECT_NAME@
+ * generated from libfreerdp/primitives/opencl/primitives.h.in
+ *
+ * with file contents of @FILENAME@
+ */
+#pragma once
+
+/* Contents of the file named in the header comment above, emitted as an
+ * initializer list; the FILEDATA placeholder below is substituted when this
+ * template is configured. */
+static const char openclProgram[] = { @FILEDATA@ };
+
@@ -0,0 +1,82 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * YCoCg<->RGB Color conversion operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+#include "prim_YCoCg.h"
+
+/* Helper: shift a raw 8 bit sample left by (shift - 1) bits, reinterpret the
+ * low 8 bits of the result as a signed value and widen it to 16 bits.
+ *
+ * NOTE(review): shift == 0 yields a negative shift count; callers are
+ * presumably expected to pass shift >= 1 - confirm.
+ */
+static INT16 convert(UINT8 raw, int shift)
+{
+	/* one bit less than requested: the -1 builds in the /2's */
+	const int bits = shift - 1;
+	const INT8 narrowed = (INT8)(raw << bits);
+	return (INT16)narrowed;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Generic C conversion of YCoCg pixels to the pixel format DstFormat.
+ *
+ * Every source pixel is 4 bytes in the order Cg, Co, Y, A. Cg and Co are
+ * passed through convert() with the given shift, Y is used as is.
+ * When withAlpha is FALSE the source alpha byte is replaced by 0xFF.
+ * Rows start at multiples of srcStep / dstStep bytes.
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t general_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
+                                            BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
+                                            INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
+                                            BOOL withAlpha)
+{
+	const DWORD bytesPerPixel = FreeRDPGetBytesPerPixel(DstFormat);
+	const fkt_writePixel putPixel = getPixelWriteFunction(DstFormat, TRUE);
+
+	for (size_t row = 0; row < height; row++)
+	{
+		const BYTE* in = pSrc + row * WINPR_ASSERTING_INT_CAST(uint32_t, srcStep);
+		BYTE* out = pDst + row * WINPR_ASSERTING_INT_CAST(uint32_t, dstStep);
+
+		for (size_t col = 0; col < width; col++, in += 4)
+		{
+			/* Note: shifts must be done before sign-conversion. */
+			const INT16 Cg = convert(in[0], shift);
+			const INT16 Co = convert(in[1], shift);
+			const INT16 Y = in[2]; /* UINT8->INT16 */
+			const BYTE srcAlpha = in[3];
+
+			const INT16 T = (INT16)(Y - Cg);
+			const INT16 R = (INT16)(T - Co);
+			const INT16 G = (INT16)(Y + Cg);
+			const INT16 B = (INT16)(T + Co);
+			const BYTE A = withAlpha ? srcAlpha : 0xFFU;
+
+			out = putPixel(out, bytesPerPixel, DstFormat, CLIP(R), CLIP(G), CLIP(B), A);
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Install the generic C YCoCg to RGB routine into the primitives table. */
+void primitives_init_YCoCg(primitives_t* WINPR_RESTRICT prims)
+{
+	prims->YCoCgToRGB_8u_AC4R = general_YCoCgToRGB_8u_AC4R;
+}
+
+/* Install the generic routine first, then give the SSSE3 and NEON
+ * initializers a chance to run. */
+void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims)
+{
+	primitives_init_YCoCg(prims);
+	primitives_init_YCoCg_ssse3(prims);
+	primitives_init_YCoCg_neon(prims);
+}
@@ -0,0 +1,53 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Primitives YCoCg conversion
+ *
+ * Copyright 2024 Armin Novak <anovak@thincast.com>
+ * Copyright 2024 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FREERDP_LIB_PRIM_YCoCg_H
+#define FREERDP_LIB_PRIM_YCoCg_H
+
+#include <winpr/wtypes.h>
+#include <winpr/sysinfo.h>
+
+#include <freerdp/config.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+FREERDP_LOCAL void primitives_init_YCoCg_ssse3_int(primitives_t* WINPR_RESTRICT prims);
+
+/* Calls primitives_init_YCoCg_ssse3_int() only when the CPU reports SSSE3,
+ * SSE2 and SSE3 support; otherwise prims is left untouched. */
+static inline void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prims)
+{
+	if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
+	    IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
+	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+	{
+		primitives_init_YCoCg_ssse3_int(prims);
+	}
+}
+
+FREERDP_LOCAL void primitives_init_YCoCg_neon_int(primitives_t* WINPR_RESTRICT prims);
+
+/* Calls primitives_init_YCoCg_neon_int() only when the CPU reports NEON
+ * support; otherwise prims is left untouched. */
+static inline void primitives_init_YCoCg_neon(primitives_t* WINPR_RESTRICT prims)
+{
+	if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
+		primitives_init_YCoCg_neon_int(prims);
+}
+
+#endif
@@ -0,0 +1,51 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Primitives YUV conversion
+ *
+ * Copyright 2024 Armin Novak <anovak@thincast.com>
+ * Copyright 2024 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FREERDP_LIB_PRIM_YUV_H
+#define FREERDP_LIB_PRIM_YUV_H
+
+#include <winpr/wtypes.h>
+#include <winpr/sysinfo.h>
+
+#include <freerdp/config.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+FREERDP_LOCAL void primitives_init_YUV_sse41_int(primitives_t* WINPR_RESTRICT prims);
+
+/* Calls primitives_init_YUV_sse41_int() only when both SSE4.1 feature queries
+ * succeed; otherwise prims is left untouched. */
+static inline void primitives_init_YUV_sse41(primitives_t* WINPR_RESTRICT prims)
+{
+	if (IsProcessorFeaturePresentEx(PF_EX_SSE41) &&
+	    IsProcessorFeaturePresent(PF_SSE4_1_INSTRUCTIONS_AVAILABLE))
+	{
+		primitives_init_YUV_sse41_int(prims);
+	}
+}
+
+FREERDP_LOCAL void primitives_init_YUV_neon_int(primitives_t* WINPR_RESTRICT prims);
+
+/* Calls primitives_init_YUV_neon_int() only when the CPU reports NEON
+ * support; otherwise prims is left untouched. */
+static inline void primitives_init_YUV_neon(primitives_t* WINPR_RESTRICT prims)
+{
+	if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
+		primitives_init_YUV_neon_int(prims);
+}
+
+#endif
@@ -0,0 +1,83 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Add operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+#include <freerdp/config.h>
+
+#include <stdint.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+#include "prim_add.h"
+
+/* ----------------------------------------------------------------------------
+ * Saturating signed 16-bit addition: the sum is computed in 32 bits and then
+ * limited to [INT16_MIN, INT16_MAX].
+ */
+static inline INT16 add(INT16 a, INT16 b)
+{
+	const INT32 sum = (INT32)a + (INT32)b;
+
+	if (sum < INT16_MIN)
+		return INT16_MIN;
+
+	return (sum > INT16_MAX) ? INT16_MAX : (INT16)sum;
+}
+
+/* Element-wise saturating sum of two INT16 arrays.
+ *
+ * pDst[i] = add(pSrc1[i], pSrc2[i]) for every i in [0, len).
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t general_add_16s(const INT16* WINPR_RESTRICT pSrc1,
+                                 const INT16* WINPR_RESTRICT pSrc2, INT16* WINPR_RESTRICT pDst,
+                                 UINT32 len)
+{
+	for (UINT32 i = 0; i < len; i++)
+		pDst[i] = add(pSrc1[i], pSrc2[i]);
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* In-place saturating sum: after the call both arrays hold
+ * add(pSrcDst1[i], pSrcDst2[i]) at every index i in [0, len).
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t general_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
+                                         INT16* WINPR_RESTRICT pSrcDst2, UINT32 len)
+{
+	INT16* first = pSrcDst1;
+	INT16* second = pSrcDst2;
+
+	for (UINT32 remaining = len; remaining > 0; remaining--)
+	{
+		const INT16 sum = add(*first, *second);
+		*first++ = sum;
+		*second++ = sum;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Install the generic C add routines into the primitives table. */
+void primitives_init_add(primitives_t* WINPR_RESTRICT prims)
+{
+	prims->add_16s = general_add_16s;
+	prims->add_16s_inplace = general_add_16s_inplace;
+}
+
+/* Install the generic routines first, then give the SSE3 initializer a chance
+ * to run. */
+void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims)
+{
+	primitives_init_add(prims);
+	primitives_init_add_sse3(prims);
+}
@@ -0,0 +1,42 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Primitives add
+ *
+ * Copyright 2024 Armin Novak <anovak@thincast.com>
+ * Copyright 2024 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FREERDP_LIB_PRIM_ADD_H
+#define FREERDP_LIB_PRIM_ADD_H
+
+#include <winpr/wtypes.h>
+#include <winpr/sysinfo.h>
+
+#include <freerdp/config.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+FREERDP_LOCAL void primitives_init_add_sse3_int(primitives_t* WINPR_RESTRICT prims);
+
+/* Calls primitives_init_add_sse3_int() only when the CPU reports SSE2 and
+ * SSE3 (needed for LDDQU) support; otherwise prims is left untouched. */
+static inline void primitives_init_add_sse3(primitives_t* WINPR_RESTRICT prims)
+{
+	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
+	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
+	{
+		primitives_init_add_sse3_int(prims);
+	}
+}
+
+#endif
@@ -0,0 +1,98 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Alpha blending routines.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ * Note: this code assumes the second operand is fully opaque,
+ * e.g.
+ *   newval = alpha1*val1 + (1-alpha1)*val2
+ * rather than
+ *   newval = alpha1*val1 + (1-alpha1)*alpha2*val2
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+#include "prim_alphaComp.h"
+
+/* Extract the most significant byte of a 32 bit pixel value. */
+#define ALPHA(_k_) (((_k_)&0xFF000000U) >> 24)
+
+/* ------------------------------------------------------------------------- */
+/* Generic C alpha composition of two pixel buffers.
+ *
+ * Pixels are read and written as 32 bit words. For each pixel the alpha of
+ * src1 (its top byte) decides the result:
+ *  - alpha 255: the src1 pixel is copied
+ *  - alpha 0:   the src2 pixel is copied
+ *  - otherwise: src2 + (((src1 - src2) * (alpha + 1)) >> 8), evaluated on two
+ *               8 bit channels at a time packed in one 32 bit word
+ *
+ * src1Step, src2Step and dstStep are the byte distances between row starts.
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t general_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
+                                        const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
+                                        BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width,
+                                        UINT32 height)
+{
+	for (size_t y = 0; y < height; y++)
+	{
+		const UINT32* sptr1 = (const UINT32*)(pSrc1 + y * src1Step);
+		const UINT32* sptr2 = (const UINT32*)(pSrc2 + y * src2Step);
+		UINT32* dptr = (UINT32*)(pDst + y * dstStep);
+
+		for (size_t x = 0; x < width; x++)
+		{
+			const UINT32 src1 = *sptr1++;
+			const UINT32 src2 = *sptr2++;
+			/* alpha is in [1, 256] so the blend can divide by 256 with a shift */
+			UINT32 alpha = ALPHA(src1) + 1;
+
+			if (alpha == 256)
+			{
+				/* If alpha is 255+1, just copy src1. */
+				*dptr++ = src1;
+			}
+			else if (alpha <= 1)
+			{
+				/* If alpha is 0+1, just copy src2. */
+				*dptr++ = src2;
+			}
+			else
+			{
+				/* A perfectly accurate blend would do (a*src + (255-a)*dst)/255
+				 * rather than adding one to alpha and dividing by 256, but this
+				 * is much faster and only differs by one 16% of the time.
+				 * I'm not sure who first designed the double-ops trick
+				 * (Red Blue and Alpha Green).
+				 */
+				UINT32 rb = 0;
+				UINT32 ag = 0;
+				/* bytes 0 and 2 of each pixel in the low byte of each 16 bit lane */
+				UINT32 s2rb = src2 & 0x00FF00FFU;
+				/* bytes 1 and 3 of each pixel, moved down into the same lanes */
+				UINT32 s2ag = (src2 >> 8) & 0x00FF00FFU;
+				UINT32 s1rb = src1 & 0x00FF00FFU;
+				UINT32 s1ag = (src1 >> 8) & 0x00FF00FFU;
+				/* unsigned differences; wrap-around is intended and masked off below */
+				UINT32 drb = s1rb - s2rb;
+				UINT32 dag = s1ag - s2ag;
+				drb *= alpha;
+				dag *= alpha;
+				rb = ((drb >> 8) + s2rb) & 0x00FF00FFU;
+				ag = (((dag >> 8) + s2ag) << 8) & 0xFF00FF00U;
+				*dptr++ = rb | ag;
+			}
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Install the generic C alpha composition routine into the primitives table. */
+void primitives_init_alphaComp(primitives_t* WINPR_RESTRICT prims)
+{
+	prims->alphaComp_argb = general_alphaComp_argb;
+}
+
+/* Install the generic routine first, then give the SSE3 initializer a chance
+ * to run. */
+void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims)
+{
+	primitives_init_alphaComp(prims);
+	primitives_init_alphaComp_sse3(prims);
+}
@@ -0,0 +1,42 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Primitives alpha composition
+ *
+ * Copyright 2024 Armin Novak <anovak@thincast.com>
+ * Copyright 2024 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FREERDP_LIB_PRIM_ALPHA_COMP_H
+#define FREERDP_LIB_PRIM_ALPHA_COMP_H
+
+#include <winpr/wtypes.h>
+#include <winpr/sysinfo.h>
+
+#include <freerdp/config.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+FREERDP_LOCAL void primitives_init_alphaComp_sse3_int(primitives_t* WINPR_RESTRICT prims);
+
+/* Calls primitives_init_alphaComp_sse3_int() only when the CPU reports SSE2
+ * and SSE3 (needed for LDDQU) support; otherwise prims is left untouched. */
+static inline void primitives_init_alphaComp_sse3(primitives_t* WINPR_RESTRICT prims)
+{
+	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
+	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
+	{
+		primitives_init_alphaComp_sse3_int(prims);
+	}
+}
+
+#endif
@@ -0,0 +1,66 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Logical operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+#include "prim_andor.h"
+
+/* ----------------------------------------------------------------------------
+ * 32-bit AND with a constant.
+ *
+ * Writes pSrc[i] & val to pDst[i] for every i in [0, len).
+ *
+ * val == 0 is deliberately not short-cut: the result of the operation is then
+ * all zeros and must still be stored, otherwise pDst keeps stale contents.
+ * A len <= 0 writes nothing.
+ *
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t general_andC_32u(const UINT32* WINPR_RESTRICT pSrc, UINT32 val,
+                                  UINT32* WINPR_RESTRICT pDst, INT32 len)
+{
+	/* counting up keeps a negative len from running away */
+	for (INT32 i = 0; i < len; i++)
+		pDst[i] = pSrc[i] & val;
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ----------------------------------------------------------------------------
+ * 32-bit OR with a constant.
+ *
+ * Writes pSrc[i] | val to pDst[i] for every i in [0, len).
+ *
+ * val == 0 is deliberately not short-cut: the result is then a copy of the
+ * source and must still be stored, otherwise pDst keeps stale contents.
+ * A len <= 0 writes nothing.
+ *
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t general_orC_32u(const UINT32* WINPR_RESTRICT pSrc, UINT32 val,
+                                 UINT32* WINPR_RESTRICT pDst, INT32 len)
+{
+	/* counting up keeps a negative len from running away */
+	for (INT32 i = 0; i < len; i++)
+		pDst[i] = pSrc[i] | val;
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Install the generic C AND/OR-with-constant routines into the primitives table. */
+void primitives_init_andor(primitives_t* WINPR_RESTRICT prims)
+{
+	/* Start with the default. */
+	prims->andC_32u = general_andC_32u;
+	prims->orC_32u = general_orC_32u;
+}
+
+/* Install the generic routines first, then give the SSE3 initializer a chance
+ * to run. */
+void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims)
+{
+	primitives_init_andor(prims);
+	primitives_init_andor_sse3(prims);
+}
@@ -0,0 +1,42 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Primitives and/or
+ *
+ * Copyright 2024 Armin Novak <anovak@thincast.com>
+ * Copyright 2024 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FREERDP_LIB_PRIM_ANDOR_H
+#define FREERDP_LIB_PRIM_ANDOR_H
+
+#include <winpr/wtypes.h>
+#include <winpr/sysinfo.h>
+
+#include <freerdp/config.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+FREERDP_LOCAL void primitives_init_andor_sse3_int(primitives_t* WINPR_RESTRICT prims);
+
+/* Calls primitives_init_andor_sse3_int() only when the CPU reports SSE2 and
+ * SSE3 support; otherwise prims is left untouched. */
+static inline void primitives_init_andor_sse3(primitives_t* WINPR_RESTRICT prims)
+{
+	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
+	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+	{
+		primitives_init_andor_sse3_int(prims);
+	}
+}
+
+#endif
@@ -0,0 +1,576 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Color conversion operations.
+ * vi:ts=4 sw=4:
+ *
+ * Copyright 2011 Stephen Erisman
+ * Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
+ * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <math.h>
+
+#include <freerdp/config.h>
+#include <winpr/assert.h>
+#include <winpr/cast.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <freerdp/codec/color.h>
+
+#include "prim_internal.h"
+#include "prim_colors.h"
+
+/* Limit _v_ to the inclusive range [_l_, _h_].
+ * Arguments may be evaluated more than once, so pass side-effect free expressions. */
+#ifndef MINMAX
+#define MINMAX(_v_, _l_, _h_) ((_v_) < (_l_) ? (_l_) : ((_v_) > (_h_) ? (_h_) : (_v_)))
+#endif /* !MINMAX */
+/* ------------------------------------------------------------------------- */
+
+/* pregenerated table for ycbcr constants: [0,30]
+ *
+ * The row index is the divisor (number of fractional bits).
+ * Each row holds rounded integer values derived from the following formula:
+ *
+ * { (1.402525f * 2^divisor), (0.714401f * 2^divisor), (0.343730f * 2^divisor), (1.769905f *
+ * 2^divisor) }
+ *
+ * Column usage in this file: [0] Cr -> R, [1] Cr -> G, [2] Cb -> G, [3] Cb -> B
+ */
+
+static const INT32 ycbcr_constants[][4] = { { 1, 1, 0, 2 },
+	                                        { 3, 1, 1, 4 },
+	                                        { 6, 3, 1, 7 },
+	                                        { 11, 6, 3, 14 },
+	                                        { 22, 11, 5, 28 },
+	                                        { 45, 23, 11, 57 },
+	                                        { 90, 46, 22, 113 },
+	                                        { 180, 91, 44, 227 },
+	                                        { 359, 183, 88, 453 },
+	                                        { 718, 366, 176, 906 },
+	                                        { 1436, 732, 352, 1812 },
+	                                        { 2872, 1463, 704, 3625 },
+	                                        { 5745, 2926, 1408, 7250 },
+	                                        { 11489, 5852, 2816, 14499 },
+	                                        { 22979, 11705, 5632, 28998 },
+	                                        { 45958, 23409, 11263, 57996 },
+	                                        { 91916, 46819, 22527, 115992 },
+	                                        { 183832, 93638, 45053, 231985 },
+	                                        { 367664, 187276, 90107, 463970 },
+	                                        { 735327, 374552, 180214, 927940 },
+	                                        { 1470654, 749104, 360427, 1855880 },
+	                                        { 2941308, 1498207, 720854, 3711760 },
+	                                        { 5882616, 2996415, 1441708, 7423520 },
+	                                        { 11765232, 5992830, 2883416, 14847039 },
+	                                        { 23530465, 11985660, 5766832, 29694078 },
+	                                        { 47060930, 23971320, 11533665, 59388157 },
+	                                        { 94121859, 47942640, 23067330, 118776314 },
+	                                        { 188243719, 95885279, 46134660, 237552628 },
+	                                        { 376487438, 191770558, 92269319, 475105256 },
+	                                        { 752974876, 383541116, 184538639, 950210512 },
+	                                        { 1505949752, 767082233, 369077277, 1900421023 } };
+
+/* Convert three planes of 16 bit YCbCr samples (pSrc[0] = Y, pSrc[1] = Cb,
+ * pSrc[2] = Cr) into packed pixels written with writePixelBGRX().
+ *
+ * The arithmetic uses the 2^16 scaled factors from ycbcr_constants and drops
+ * another 5 bits at the end; the results are passed through CLIP().
+ * The alpha argument handed to writePixelBGRX() is 0.
+ * srcStep and dstStep are row strides in bytes.
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t general_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3],
+                                                      UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
+                                                      UINT32 dstStep, UINT32 DstFormat,
+                                                      const prim_size_t* WINPR_RESTRICT roi)
+{
+	const INT32 divisor = 16;
+	const INT32* factors = ycbcr_constants[divisor];
+	const INT16* lumaPtr = pSrc[0];
+	const INT16* cbPtr = pSrc[1];
+	const INT16* crPtr = pSrc[2];
+	BYTE* out = pDst;
+	/* padding at the end of each row: in INT16 elements for the source planes,
+	 * in bytes for the destination */
+	const size_t srcPad = (srcStep - (roi->width * 2)) / 2;
+	const size_t dstPad = (dstStep - (roi->width * 4));
+	const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
+
+	for (UINT32 row = 0; row < roi->height; row++)
+	{
+		for (UINT32 col = 0; col < roi->width; col++)
+		{
+			const INT32 luma = *lumaPtr++;
+			const INT32 Cb = *cbPtr++;
+			const INT32 Cr = *crPtr++;
+			const INT32 Y = (INT32)((UINT32)(luma + 4096) << divisor);
+
+			const INT32 CrR =
+			    WINPR_ASSERTING_INT_CAST(int32_t, Cr * factors[0]); /* 1.402525f * 2^divisor */
+			const INT32 CrG =
+			    WINPR_ASSERTING_INT_CAST(int32_t, Cr * factors[1]); /* 0.714401f * 2^divisor */
+			const INT32 CbG =
+			    WINPR_ASSERTING_INT_CAST(int32_t, Cb * factors[2]); /* 0.343730f * 2^divisor */
+			const INT32 CbB =
+			    WINPR_ASSERTING_INT_CAST(int32_t, Cb * factors[3]); /* 1.769905f * 2^divisor */
+
+			const INT16 R = WINPR_ASSERTING_INT_CAST(int16_t, ((CrR + Y) >> divisor) >> 5);
+			const INT16 G = WINPR_ASSERTING_INT_CAST(int16_t, ((Y - CbG - CrG) >> divisor) >> 5);
+			const INT16 B = WINPR_ASSERTING_INT_CAST(int16_t, ((CbB + Y) >> divisor) >> 5);
+			out = writePixelBGRX(out, formatSize, DstFormat, CLIP(R), CLIP(G), CLIP(B), 0);
+		}
+
+		lumaPtr += srcPad;
+		cbPtr += srcPad;
+		crPtr += srcPad;
+		out += dstPad;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* Convert three planes of 16 bit YCbCr samples (pSrc[0] = Y, pSrc[1] = Cb,
+ * pSrc[2] = Cr) into packed pixels, using the pixel writer returned by
+ * getPixelWriteFunction() for DstFormat.
+ *
+ * The arithmetic uses the 2^16 scaled factors from ycbcr_constants and drops
+ * another 5 bits at the end; the results are passed through CLIP().
+ * The alpha argument handed to the pixel writer is 0.
+ * Always returns PRIMITIVES_SUCCESS.
+ *
+ * NOTE(review): dstPad assumes 4 bytes per destination pixel even though the
+ * writer is given formatSize - confirm this is only used with 32 bpp formats.
+ */
+static pstatus_t general_yCbCrToRGB_16s8u_P3AC4R_general(const INT16* WINPR_RESTRICT pSrc[3],
+                                                         UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
+                                                         UINT32 dstStep, UINT32 DstFormat,
+                                                         const prim_size_t* WINPR_RESTRICT roi)
+{
+	const INT32 divisor = 16;
+	const INT32 totalShift = divisor + 5;
+	const INT32* factors = ycbcr_constants[divisor];
+	const INT16* lumaPtr = pSrc[0];
+	const INT16* cbPtr = pSrc[1];
+	const INT16* crPtr = pSrc[2];
+	BYTE* out = pDst;
+	const size_t srcPad = (srcStep - (roi->width * 2)) / 2;
+	const size_t dstPad = (dstStep - (roi->width * 4));
+	const fkt_writePixel putPixel = getPixelWriteFunction(DstFormat, FALSE);
+	const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
+
+	for (UINT32 row = 0; row < roi->height; row++)
+	{
+		for (UINT32 col = 0; col < roi->width; col++)
+		{
+			const INT32 luma = *lumaPtr++;
+			const INT32 Cb = *cbPtr++;
+			const INT32 Cr = *crPtr++;
+			const INT32 Y = (INT32)((UINT32)(luma + 4096) << divisor);
+
+			const INT32 CrR = Cr * factors[0];
+			const INT32 CrG = Cr * factors[1];
+			const INT32 CbG = Cb * factors[2];
+			const INT32 CbB = Cb * factors[3];
+
+			const INT32 R = (CrR + Y) >> totalShift;
+			const INT32 G = (Y - CbG - CrG) >> totalShift;
+			const INT32 B = (CbB + Y) >> totalShift;
+			out = putPixel(out, formatSize, DstFormat, CLIP(R), CLIP(G), CLIP(B), 0);
+		}
+
+		lumaPtr += srcPad;
+		cbPtr += srcPad;
+		crPtr += srcPad;
+		out += dstPad;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* Dispatch on the destination format: PIXEL_FORMAT_BGRA32 and
+ * PIXEL_FORMAT_BGRX32 use the dedicated BGRX routine, every other format the
+ * generic one. The status of the selected routine is returned.
+ */
+static pstatus_t general_yCbCrToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3],
+                                                 UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
+                                                 UINT32 dstStep, UINT32 DstFormat,
+                                                 const prim_size_t* WINPR_RESTRICT roi)
+{
+	if ((DstFormat == PIXEL_FORMAT_BGRA32) || (DstFormat == PIXEL_FORMAT_BGRX32))
+		return general_yCbCrToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+
+	return general_yCbCrToRGB_16s8u_P3AC4R_general(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+}
+
+/* ------------------------------------------------------------------------- */
+
+/* Convert three planes of 16 bit YCbCr samples (pSrc[0] = Y, pSrc[1] = Cb,
+ * pSrc[2] = Cr) into three planes of 16 bit R, G and B values (pDst[0..2]).
+ *
+ * srcStep and dstStep are row strides in bytes, roi gives width and height in
+ * samples. Every output value is passed through CLIP().
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t
+general_yCbCrToRGB_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], INT32 srcStep,
+                               INT16* WINPR_RESTRICT pDst[3], INT32 dstStep,
+                               const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+	/**
+	 * The decoded YCbCr coefficients are represented as 11.5 fixed-point
+	 * numbers:
+	 *
+	 * 1 sign bit + 10 integer bits + 5 fractional bits
+	 *
+	 * However only 7 integer bits will be actually used since the value range
+	 * is [-128.0, 127.0].  In other words, the decoded coefficients are scaled
+	 * by << 5 when interpreted as INT16.
+	 * It was scaled in the quantization phase, so we must scale it back here.
+	 */
+	const INT16* yptr = pSrc[0];
+	const INT16* cbptr = pSrc[1];
+	const INT16* crptr = pSrc[2];
+	INT16* rptr = pDst[0];
+	INT16* gptr = pDst[1];
+	INT16* bptr = pDst[2];
+	/* per-row padding, expressed in 16 bit elements */
+	UINT32 srcbump = (WINPR_ASSERTING_INT_CAST(uint32_t, srcStep) - (roi->width * sizeof(UINT16))) /
+	                 sizeof(UINT16);
+	UINT32 dstbump = (WINPR_ASSERTING_INT_CAST(uint32_t, dstStep) - (roi->width * sizeof(UINT16))) /
+	                 sizeof(UINT16);
+
+	for (UINT32 y = 0; y < roi->height; y++)
+	{
+		for (UINT32 x = 0; x < roi->width; ++x)
+		{
+			/* INT32 is used intentionally because we calculate
+			 * with shifted factors!
+			 */
+			INT32 cy = (INT32)(*yptr++);
+			INT32 cb = (INT32)(*cbptr++);
+			INT32 cr = (INT32)(*crptr++);
+			INT64 r = 0;
+			INT64 g = 0;
+			INT64 b = 0;
+			/*
+			 * This is the slow floating point version kept here for reference.
+			 * y = y + 4096; // 128<<5=4096 so that we can scale the sum by>>5
+			 * r = y + cr*1.403f;
+			 * g = y - cb*0.344f - cr*0.714f;
+			 * b = y + cb*1.770f;
+			 * y_r_buf[i]  = CLIP(r>>5);
+			 * cb_g_buf[i] = CLIP(g>>5);
+			 * cr_b_buf[i] = CLIP(b>>5);
+			 */
+			/*
+			 * We scale the factors by << 16 into 32-bit integers in order to
+			 * avoid slower floating point multiplications.  Since the final
+			 * result needs to be scaled by >> 5 we will extract only the
+			 * upper 11 bits (>> 21) from the final sum.
+			 * Hence we also have to scale the other terms of the sum by << 16.
+			 *
+			 * The factors come from row 16 of ycbcr_constants:
+			 * [0] Cr -> R (1.402525), [1] Cr -> G (0.714401),
+			 * [2] Cb -> G (0.343730), [3] Cb -> B (1.769905)
+			 */
+			cy = (INT32)((UINT32)(cy + 4096) << 16);
+
+			r = 1LL * cy + 1LL * cr * ycbcr_constants[16][0];
+			/* Cb pairs with column 2 and Cr with column 1, see the table layout above */
+			g = 1LL * cy - 1LL * cb * ycbcr_constants[16][2] - 1LL * cr * ycbcr_constants[16][1];
+			b = 1LL * cy + 1LL * cb * ycbcr_constants[16][3];
+			*rptr++ = CLIP(r >> 21);
+			*gptr++ = CLIP(g >> 21);
+			*bptr++ = CLIP(b >> 21);
+		}
+
+		yptr += srcbump;
+		cbptr += srcbump;
+		crptr += srcbump;
+		rptr += dstbump;
+		gptr += dstbump;
+		bptr += dstbump;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Convert three planes of 16 bit R, G and B values (pSrc[0..2]) into three
+ * planes of 16 bit Y, Cb and Cr samples (pDst[0..2]).
+ *
+ * srcStep and dstStep are row strides in bytes, roi gives width and height in
+ * samples. Outputs are limited to [-4096, 4095].
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t
+general_RGBToYCbCr_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], INT32 srcStep,
+                               INT16* WINPR_RESTRICT pDst[3], INT32 dstStep,
+                               const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+	/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
+	 * numbers:
+	 *
+	 * 1 sign bit + 10 integer bits + 5 fractional bits
+	 *
+	 * However only 7 integer bits will be actually used since the value
+	 * range is [-128.0, 127.0].  In other words, the encoded coefficients
+	 * are scaled by << 5 when interpreted as INT16.
+	 * They will be scaled down to original during the quantization phase.
+	 */
+	const INT16* rptr = pSrc[0];
+	const INT16* gptr = pSrc[1];
+	const INT16* bptr = pSrc[2];
+	INT16* yptr = pDst[0];
+	INT16* cbptr = pDst[1];
+	INT16* crptr = pDst[2];
+	/* per-row padding, expressed in 16 bit elements */
+	UINT32 srcbump = (WINPR_ASSERTING_INT_CAST(uint32_t, srcStep) - (roi->width * sizeof(UINT16))) /
+	                 sizeof(UINT16);
+	UINT32 dstbump = (WINPR_ASSERTING_INT_CAST(uint32_t, dstStep) - (roi->width * sizeof(UINT16))) /
+	                 sizeof(UINT16);
+
+	for (UINT32 y = 0; y < roi->height; y++)
+	{
+		for (UINT32 x = 0; x < roi->width; ++x)
+		{
+			/* INT32 is used intentionally because we calculate with
+			 * shifted factors!
+			 */
+			INT32 r = (INT32)(*rptr++);
+			INT32 g = (INT32)(*gptr++);
+			INT32 b = (INT32)(*bptr++);
+			/* We scale the factors by << 15 into 32-bit integers in order
+			 * to avoid slower floating point multiplications.  Since the
+			 * terms need to be scaled by << 5 we simply scale the final
+			 * sum by >> 10
+			 *
+			 * Y:  0.299000 << 15 = 9798,  0.587000 << 15 = 19235,
+			 *     0.114000 << 15 = 3735
+			 * Cb: 0.168935 << 15 = 5535,  0.331665 << 15 = 10868,
+			 *     0.500590 << 15 = 16403
+			 * Cr: 0.499813 << 15 = 16377, 0.418531 << 15 = 13714,
+			 *     0.081282 << 15 = 2663
+			 */
+			INT32 cy = (r * 9798 + g * 19235 + b * 3735) >> 10;
+			INT32 cb = (r * -5535 + g * -10868 + b * 16403) >> 10;
+			INT32 cr = (r * 16377 + g * -13714 + b * -2663) >> 10;
+			*yptr++ = (INT16)MINMAX(cy - 4096, -4096, 4095);
+			*cbptr++ = (INT16)MINMAX(cb, -4096, 4095);
+			*crptr++ = (INT16)MINMAX(cr, -4096, 4095);
+		}
+
+		/* R/G/B are the source planes and follow srcStep,
+		 * Y/Cb/Cr are the destination planes and follow dstStep */
+		rptr += srcbump;
+		gptr += srcbump;
+		bptr += srcbump;
+		yptr += dstbump;
+		cbptr += dstbump;
+		crptr += dstbump;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* Write width pixels to dst using the pixel writer returned by
+ * getPixelWriteFunction() for DstFormat. The channel values are taken from
+ * the r, g and b arrays and converted with WINPR_ASSERTING_INT_CAST; the alpha
+ * argument is 0.
+ *
+ * NOTE(review): unlike the other writeScanline* helpers this one does not
+ * CLIP() its inputs - presumably callers guarantee values in [0, 255]; confirm.
+ */
+static inline void writeScanlineGeneric(BYTE* dst, DWORD formatSize, UINT32 DstFormat,
+                                        const INT16* r, const INT16* g, const INT16* b, DWORD width)
+{
+	const fkt_writePixel putPixel = getPixelWriteFunction(DstFormat, FALSE);
+	BYTE* out = dst;
+
+	for (UINT32 i = 0; i < width; i++)
+	{
+		const UINT8 red = WINPR_ASSERTING_INT_CAST(UINT8, r[i]);
+		const UINT8 green = WINPR_ASSERTING_INT_CAST(UINT8, g[i]);
+		const UINT8 blue = WINPR_ASSERTING_INT_CAST(UINT8, b[i]);
+
+		out = putPixel(out, formatSize, DstFormat, red, green, blue, 0);
+	}
+}
+
+/* Write `width` pixels to `dst` as packed 3 byte R, G, B triples.
+ * Every 16-bit sample of the `r`, `g` and `b` planes is reduced to a BYTE
+ * with CLIP(). `formatSize` and `DstFormat` are ignored; they are only
+ * present so the function fits the fkt_writeScanline signature.
+ */
+static inline void writeScanlineRGB(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
+                                    const INT16* g, const INT16* b, DWORD width)
+{
+	WINPR_UNUSED(formatSize);
+	WINPR_UNUSED(DstFormat);
+
+	BYTE* out = dst;
+	for (UINT32 i = 0; i < width; i++, out += 3)
+	{
+		out[0] = CLIP(r[i]);
+		out[1] = CLIP(g[i]);
+		out[2] = CLIP(b[i]);
+	}
+}
+
+/* Write `width` pixels to `dst` as packed 3 byte B, G, R triples.
+ * Every 16-bit sample of the `r`, `g` and `b` planes is reduced to a BYTE
+ * with CLIP(). `formatSize` and `DstFormat` are ignored; they are only
+ * present so the function fits the fkt_writeScanline signature.
+ */
+static inline void writeScanlineBGR(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
+                                    const INT16* g, const INT16* b, DWORD width)
+{
+	WINPR_UNUSED(formatSize);
+	WINPR_UNUSED(DstFormat);
+
+	BYTE* out = dst;
+	for (UINT32 i = 0; i < width; i++, out += 3)
+	{
+		out[0] = CLIP(b[i]);
+		out[1] = CLIP(g[i]);
+		out[2] = CLIP(r[i]);
+	}
+}
+
+/* Write `width` pixels to `dst` as 4 byte B, G, R, 0xFF quadruples.
+ * Every 16-bit sample of the `r`, `g` and `b` planes is reduced to a BYTE
+ * with CLIP(); the fourth byte is always set to 0xFF. `formatSize` and
+ * `DstFormat` are ignored; they are only present so the function fits the
+ * fkt_writeScanline signature.
+ */
+static inline void writeScanlineBGRX(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
+                                     const INT16* g, const INT16* b, DWORD width)
+{
+	WINPR_UNUSED(formatSize);
+	WINPR_UNUSED(DstFormat);
+
+	BYTE* out = dst;
+	for (UINT32 i = 0; i < width; i++, out += 4)
+	{
+		out[0] = CLIP(b[i]);
+		out[1] = CLIP(g[i]);
+		out[2] = CLIP(r[i]);
+		out[3] = 0xFF;
+	}
+}
+
+/* Write `width` pixels to `dst` as 4 byte R, G, B, 0xFF quadruples.
+ * Every 16-bit sample of the `r`, `g` and `b` planes is reduced to a BYTE
+ * with CLIP(); the fourth byte is always set to 0xFF. `formatSize` and
+ * `DstFormat` are ignored; they are only present so the function fits the
+ * fkt_writeScanline signature.
+ */
+static inline void writeScanlineRGBX(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
+                                     const INT16* g, const INT16* b, DWORD width)
+{
+	WINPR_UNUSED(formatSize);
+	WINPR_UNUSED(DstFormat);
+
+	BYTE* out = dst;
+	for (UINT32 i = 0; i < width; i++, out += 4)
+	{
+		out[0] = CLIP(r[i]);
+		out[1] = CLIP(g[i]);
+		out[2] = CLIP(b[i]);
+		out[3] = 0xFF;
+	}
+}
+
+/* Write `width` pixels to `dst` as 4 byte 0xFF, B, G, R quadruples.
+ * The first byte is always set to 0xFF; every 16-bit sample of the `r`,
+ * `g` and `b` planes is reduced to a BYTE with CLIP(). `formatSize` and
+ * `DstFormat` are ignored; they are only present so the function fits the
+ * fkt_writeScanline signature.
+ */
+static inline void writeScanlineXBGR(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
+                                     const INT16* g, const INT16* b, DWORD width)
+{
+	WINPR_UNUSED(formatSize);
+	WINPR_UNUSED(DstFormat);
+
+	BYTE* out = dst;
+	for (UINT32 i = 0; i < width; i++, out += 4)
+	{
+		out[0] = 0xFF;
+		out[1] = CLIP(b[i]);
+		out[2] = CLIP(g[i]);
+		out[3] = CLIP(r[i]);
+	}
+}
+
+/* Write `width` pixels to `dst` as 4 byte 0xFF, R, G, B quadruples.
+ * The first byte is always set to 0xFF; every 16-bit sample of the `r`,
+ * `g` and `b` planes is reduced to a BYTE with CLIP(). `formatSize` and
+ * `DstFormat` are ignored; they are only present so the function fits the
+ * fkt_writeScanline signature.
+ */
+static inline void writeScanlineXRGB(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
+                                     const INT16* g, const INT16* b, DWORD width)
+{
+	WINPR_UNUSED(formatSize);
+	WINPR_UNUSED(DstFormat);
+
+	BYTE* out = dst;
+	for (UINT32 i = 0; i < width; i++, out += 4)
+	{
+		out[0] = 0xFF;
+		out[1] = CLIP(r[i]);
+		out[2] = CLIP(g[i]);
+		out[3] = CLIP(b[i]);
+	}
+}
+
+/* Signature shared by all scanline writers:
+ * (dst, formatSize, DstFormat, r, g, b, width).
+ */
+typedef void (*fkt_writeScanline)(BYTE*, DWORD, UINT32, const INT16*, const INT16*, const INT16*,
+                                  DWORD);
+
+/* Select the scanline writer for `format`.
+ * The 24 and 32 bit RGB/BGR layouts listed below get a dedicated writer,
+ * every other format is served by writeScanlineGeneric.
+ */
+static inline fkt_writeScanline getScanlineWriteFunction(DWORD format)
+{
+	fkt_writeScanline writer = writeScanlineGeneric;
+
+	switch (format)
+	{
+		case PIXEL_FORMAT_RGB24:
+			writer = writeScanlineRGB;
+			break;
+
+		case PIXEL_FORMAT_BGR24:
+			writer = writeScanlineBGR;
+			break;
+
+		case PIXEL_FORMAT_BGRA32:
+		case PIXEL_FORMAT_BGRX32:
+			writer = writeScanlineBGRX;
+			break;
+
+		case PIXEL_FORMAT_RGBA32:
+		case PIXEL_FORMAT_RGBX32:
+			writer = writeScanlineRGBX;
+			break;
+
+		case PIXEL_FORMAT_ABGR32:
+		case PIXEL_FORMAT_XBGR32:
+			writer = writeScanlineXBGR;
+			break;
+
+		case PIXEL_FORMAT_ARGB32:
+		case PIXEL_FORMAT_XRGB32:
+			writer = writeScanlineXRGB;
+			break;
+
+		default:
+			break;
+	}
+
+	return writer;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Convert planar 16-bit R, G and B rows into an interleaved 8-bit image,
+ * choosing the scanline writer from `DstFormat` at run time.
+ *
+ * pSrc      - the three source planes in R, G, B order
+ * srcStep   - bytes between rows of each source plane
+ * pDst      - destination image
+ * dstStep   - bytes between rows of the destination
+ * DstFormat - destination pixel format
+ * roi       - width and height of the region to convert
+ *
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t general_RGBToRGB_16s8u_P3AC4R_general(
+    const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
+    UINT32 dstStep, UINT32 DstFormat, const prim_size_t* WINPR_RESTRICT roi)
+{
+	const fkt_writeScanline emitRow = getScanlineWriteFunction(DstFormat);
+	const DWORD pixelBytes = FreeRDPGetBytesPerPixel(DstFormat);
+	/* Row pitch of the source planes counted in INT16 samples. */
+	const size_t samplesPerRow = srcStep / sizeof(INT16);
+
+	for (UINT32 row = 0; row < roi->height; row++)
+	{
+		const size_t srcOffset = samplesPerRow * row;
+		BYTE* out = pDst + (size_t)dstStep * row;
+
+		emitRow(out, pixelBytes, DstFormat, pSrc[0] + srcOffset, pSrc[1] + srcOffset,
+		        pSrc[2] + srcOffset, roi->width);
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* Convert planar 16-bit R, G and B rows into an interleaved 8-bit image,
+ * always writing rows with writeScanlineBGRX.
+ *
+ * pSrc      - the three source planes in R, G, B order
+ * srcStep   - bytes between rows of each source plane
+ * pDst      - destination image
+ * dstStep   - bytes between rows of the destination
+ * DstFormat - destination pixel format, forwarded to the scanline writer
+ * roi       - width and height of the region to convert
+ *
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t general_RGBToRGB_16s8u_P3AC4R_BGRX(
+    const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
+    UINT32 dstStep, UINT32 DstFormat, const prim_size_t* WINPR_RESTRICT roi)
+{
+	const DWORD pixelBytes = FreeRDPGetBytesPerPixel(DstFormat);
+	/* Row pitch of the source planes counted in INT16 samples. */
+	const size_t samplesPerRow = srcStep / sizeof(INT16);
+
+	for (UINT32 row = 0; row < roi->height; row++)
+	{
+		const size_t srcOffset = samplesPerRow * row;
+		BYTE* out = pDst + (size_t)dstStep * row;
+
+		writeScanlineBGRX(out, pixelBytes, DstFormat, pSrc[0] + srcOffset, pSrc[1] + srcOffset,
+		                  pSrc[2] + srcOffset, roi->width);
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* Entry point for the planar 16-bit RGB to interleaved 8-bit conversion.
+ * BGRA32 and BGRX32 destinations use the dedicated BGRX routine, every
+ * other destination format goes through the run time dispatched variant.
+ *
+ * pSrc      - the three source planes in R, G, B order
+ * srcStep   - bytes between rows of each source plane
+ * pDst      - destination image
+ * dstStep   - bytes between rows of the destination
+ * DstFormat - destination pixel format
+ * roi       - width and height of the region to convert
+ *
+ * Returns the status of the routine that was selected.
+ */
+static pstatus_t
+general_RGBToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
+                              BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 DstFormat,
+                              const prim_size_t* WINPR_RESTRICT roi)
+{
+	const BOOL isBgrx =
+	    (DstFormat == PIXEL_FORMAT_BGRA32) || (DstFormat == PIXEL_FORMAT_BGRX32);
+
+	if (isBgrx)
+		return general_RGBToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+
+	return general_RGBToRGB_16s8u_P3AC4R_general(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+}
+/* ------------------------------------------------------------------------- */
+/* Install the portable C implementations of the color conversion
+ * primitives into `prims`. The four assignments are independent of each
+ * other.
+ */
+void primitives_init_colors(primitives_t* WINPR_RESTRICT prims)
+{
+	/* RGB sources */
+	prims->RGBToRGB_16s8u_P3AC4R = general_RGBToRGB_16s8u_P3AC4R;
+	prims->RGBToYCbCr_16s16s_P3P3 = general_RGBToYCbCr_16s16s_P3P3;
+
+	/* YCbCr sources */
+	prims->yCbCrToRGB_16s16s_P3P3 = general_yCbCrToRGB_16s16s_P3P3;
+	prims->yCbCrToRGB_16s8u_P3AC4R = general_yCbCrToRGB_16s8u_P3AC4R;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Initialize the color primitives in `prims`, allowing CPU specific
+ * initializers to run after the generic ones.
+ */
+void primitives_init_colors_opt(primitives_t* WINPR_RESTRICT prims)
+{
+	/* Generic implementations first, so every entry has a valid function. */
+	primitives_init_colors(prims);
+	/* Each of the following wrappers returns without calling its
+	 * implementation when the matching CPU feature is not reported. */
+	primitives_init_colors_sse2(prims);
+	primitives_init_colors_neon(prims);
+}
@@ -0,0 +1,51 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Primitives colors
+ *
+ * Copyright 2024 Armin Novak <anovak@thincast.com>
+ * Copyright 2024 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FREERDP_LIB_PRIM_COLORS_H
+#define FREERDP_LIB_PRIM_COLORS_H
+
+#include <winpr/wtypes.h>
+#include <winpr/sysinfo.h>
+
+#include <freerdp/config.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+FREERDP_LOCAL void primitives_init_colors_sse2_int(primitives_t* WINPR_RESTRICT prims);
+
+/* Call primitives_init_colors_sse2_int() only when the CPU reports both
+ * SSE2 and SSE3 support; otherwise `prims` is left as it is.
+ */
+static inline void primitives_init_colors_sse2(primitives_t* WINPR_RESTRICT prims)
+{
+	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
+	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+	{
+		primitives_init_colors_sse2_int(prims);
+	}
+}
+
+FREERDP_LOCAL void primitives_init_colors_neon_int(primitives_t* WINPR_RESTRICT prims);
+
+/* Call primitives_init_colors_neon_int() only when the CPU reports ARM NEON
+ * support; otherwise `prims` is left as it is.
+ */
+static inline void primitives_init_colors_neon(primitives_t* WINPR_RESTRICT prims)
+{
+	if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
+	{
+		primitives_init_colors_neon_int(prims);
+	}
+}
+
+#endif
@@ -0,0 +1,439 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Copy operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <string.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <freerdp/log.h>
+
+#include "prim_internal.h"
+#include "prim_copy.h"
+#include "../codec/color.h"
+
+#include <freerdp/codec/color.h>
+
+static primitives_t* generic = nullptr;
+
+/* ------------------------------------------------------------------------- */
+/*static inline BOOL memory_regions_overlap_1d(*/
+/* Tell whether two byte ranges of equal length share at least one byte.
+ *
+ * p1, p2 - start addresses of the two ranges
+ * bytes  - length of each range
+ *
+ * Returns TRUE when the range that starts at the lower address reaches past
+ * the start of the other one, FALSE otherwise.
+ */
+static BOOL memory_regions_overlap_1d(const BYTE* p1, const BYTE* p2, size_t bytes)
+{
+	const ULONG_PTR a = (const ULONG_PTR)p1;
+	const ULONG_PTR b = (const ULONG_PTR)p2;
+	const ULONG_PTR lower = (a <= b) ? a : b;
+	const ULONG_PTR upper = (a <= b) ? b : a;
+
+	return (lower + bytes > upper) ? TRUE : FALSE;
+}
+
+/* ------------------------------------------------------------------------- */
+/*static inline BOOL memory_regions_overlap_2d( */
+/* Conservative overlap test for two rectangular pixel regions.
+ *
+ * p1, p2         - address of the first pixel of each region
+ * p1Step, p2Step - bytes between rows of each region
+ * p1Size, p2Size - bytes per pixel of each region
+ * width, height  - size of both regions in pixels; height must be >= 1
+ *
+ * The region that starts at the lower address is treated as one contiguous
+ * span ending after its last row; TRUE is returned when that span reaches
+ * past the start of the other region, FALSE otherwise.
+ *
+ * Every factor is converted to an unsigned pointer sized integer before it
+ * is multiplied, so `width * size` is never evaluated in (overflowing)
+ * signed int arithmetic, and both orderings use the same computation.
+ */
+static BOOL memory_regions_overlap_2d(const BYTE* p1, int p1Step, int p1Size, const BYTE* p2,
+                                      int p2Step, int p2Size, int width, int height)
+{
+	const ULONG_PTR p1m = (ULONG_PTR)p1;
+	const ULONG_PTR p2m = (ULONG_PTR)p2;
+	const BOOL p1First = (p1m <= p2m) ? TRUE : FALSE;
+
+	/* Geometry of whichever region starts first in memory. */
+	const ULONG_PTR lower = p1First ? p1m : p2m;
+	const ULONG_PTR upper = p1First ? p2m : p1m;
+	const int step = p1First ? p1Step : p2Step;
+	const int size = p1First ? p1Size : p2Size;
+
+	const ULONG_PTR fullRows = WINPR_ASSERTING_INT_CAST(uintptr_t, height - 1);
+	const ULONG_PTR stride = WINPR_ASSERTING_INT_CAST(uintptr_t, step);
+	const ULONG_PTR pixels = WINPR_ASSERTING_INT_CAST(uintptr_t, width);
+	const ULONG_PTR pixelBytes = WINPR_ASSERTING_INT_CAST(uintptr_t, size);
+
+	/* One past the last byte of the last row of the lower region. */
+	const ULONG_PTR lowerEnd = lower + fullRows * stride + pixels * pixelBytes;
+
+	return (lowerEnd > upper) ? TRUE : FALSE;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Copy `len` bytes from `pSrc` to `pDst`.
+ * memmove() is used when the two ranges overlap, memcpy() otherwise.
+ * `len` is converted to size_t without a range check.
+ *
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t general_copy_8u(const BYTE* WINPR_RESTRICT pSrc, BYTE* WINPR_RESTRICT pDst,
+                                 INT32 len)
+{
+	const size_t count = (size_t)len;
+
+	if (!memory_regions_overlap_1d(pSrc, pDst, count))
+		memcpy((void*)pDst, (const void*)pSrc, count);
+	else
+		memmove((void*)pDst, (const void*)pSrc, count);
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Copy a `width` x `height` block of 4 byte pixels from one buffer to another.
+ * The addresses are assumed to have been already offset to the upper-left
+ * corners of the source and destination region of interest.
+ *
+ * pSrc, srcStep - first source pixel and bytes between source rows
+ * pDst, dstStep - first destination pixel and bytes between destination rows
+ * width, height - size of the block in pixels
+ *
+ * Returns PRIMITIVES_SUCCESS, or the status of the row copy routine if that
+ * reports a failure. An empty block (width or height 0) is a successful no-op.
+ */
+static pstatus_t general_copy_8u_AC4r(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
+                                      BYTE* WINPR_RESTRICT pDst, INT32 dstStep, INT32 width,
+                                      INT32 height)
+{
+	const BYTE* src = pSrc;
+	BYTE* dst = pDst;
+	const size_t rowbytes = WINPR_ASSERTING_INT_CAST(size_t, width) * sizeof(UINT32);
+
+	if ((width == 0) || (height == 0))
+		return PRIMITIVES_SUCCESS;
+
+	if (memory_regions_overlap_2d(pSrc, srcStep, sizeof(UINT32), pDst, dstStep, sizeof(UINT32),
+	                              width, height))
+	{
+		const int32_t len = WINPR_ASSERTING_INT_CAST(int32_t, rowbytes);
+		/* `generic` is only ever initialized to nullptr in this file, so
+		 * calling through it unconditionally would dereference a null
+		 * pointer. Use it when it has been set up, otherwise fall back to
+		 * general_copy_8u(), which handles overlapping rows with memmove().
+		 */
+		const BOOL haveGeneric = (generic && generic->copy) ? TRUE : FALSE;
+
+		do
+		{
+			const pstatus_t rc = haveGeneric ? generic->copy(src, dst, len)
+			                                 : general_copy_8u(src, dst, len);
+			if (rc != PRIMITIVES_SUCCESS)
+				return rc;
+
+			src += srcStep;
+			dst += dstStep;
+		} while (--height);
+	}
+	else
+	{
+		/* TODO: do it in one operation when the rowdata is adjacent. */
+		do
+		{
+			/* If we find a replacement for memcpy that is consistently
+			 * faster, this could be replaced with that.
+			 */
+			memcpy(dst, src, rowbytes);
+			src += srcStep;
+			dst += dstStep;
+		} while (--height);
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* Copy the three color bytes of every pixel from a 3 byte per pixel source
+ * into a 4 byte per pixel destination. The fourth byte of each destination
+ * pixel is not written.
+ *
+ * Source rows start at byte offset
+ *   srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset
+ * from pSrcData, destination rows at
+ *   dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset
+ * from pDstData; nXSrc / nXDst select the first pixel inside each row.
+ *
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static inline pstatus_t generic_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData,
+                                                        UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
+                                                        UINT32 nWidth, UINT32 nHeight,
+                                                        const BYTE* WINPR_RESTRICT pSrcData,
+                                                        UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
+                                                        int64_t srcVMultiplier, int64_t srcVOffset,
+                                                        int64_t dstVMultiplier, int64_t dstVOffset)
+{
+	const int64_t srcBpp = 3;
+	const int64_t dstBpp = 4;
+
+	/* Largest multiple of 8 pixels; handled by the loop carrying the unroll hint. */
+	const UINT32 bulk = nWidth - nWidth % 8;
+
+	for (int64_t row = 0; row < nHeight; row++)
+	{
+		const BYTE* WINPR_RESTRICT srcLine =
+		    &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
+		BYTE* WINPR_RESTRICT dstLine =
+		    &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];
+		const BYTE* in = &srcLine[nXSrc * srcBpp];
+		BYTE* out = &dstLine[nXDst * dstBpp];
+
+		int64_t col = 0;
+		WINPR_PRAGMA_UNROLL_LOOP
+		for (; col < bulk; col++, in += srcBpp, out += dstBpp)
+		{
+			out[0] = in[0];
+			out[1] = in[1];
+			out[2] = in[2];
+		}
+
+		/* Remaining pixels of the row. */
+		for (; col < nWidth; col++, in += srcBpp, out += dstBpp)
+		{
+			out[0] = in[0];
+			out[1] = in[1];
+			out[2] = in[2];
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* Copy the first three bytes of every pixel between two 4 byte per pixel
+ * images. The fourth byte of each destination pixel is not written.
+ *
+ * Source rows start at byte offset
+ *   srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset
+ * from pSrcData, destination rows at
+ *   dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset
+ * from pDstData; nXSrc / nXDst select the first pixel inside each row.
+ *
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static inline pstatus_t
+generic_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep, UINT32 nXDst,
+                                 UINT32 nYDst, UINT32 nWidth, UINT32 nHeight,
+                                 const BYTE* WINPR_RESTRICT pSrcData, UINT32 nSrcStep, UINT32 nXSrc,
+                                 UINT32 nYSrc, int64_t srcVMultiplier, int64_t srcVOffset,
+                                 int64_t dstVMultiplier, int64_t dstVOffset)
+{
+	const int64_t srcBpp = 4;
+	const int64_t dstBpp = 4;
+
+	/* Largest multiple of 8 pixels; handled by the loop carrying the unroll hint. */
+	const UINT32 bulk = nWidth - nWidth % 8;
+
+	for (int64_t row = 0; row < nHeight; row++)
+	{
+		const BYTE* WINPR_RESTRICT srcLine =
+		    &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
+		BYTE* WINPR_RESTRICT dstLine =
+		    &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];
+		const BYTE* in = &srcLine[nXSrc * srcBpp];
+		BYTE* out = &dstLine[nXDst * dstBpp];
+
+		int64_t col = 0;
+		WINPR_PRAGMA_UNROLL_LOOP
+		for (; col < bulk; col++, in += srcBpp, out += dstBpp)
+		{
+			out[0] = in[0];
+			out[1] = in[1];
+			out[2] = in[2];
+		}
+
+		/* Remaining pixels of the row. */
+		for (; col < nWidth; col++, in += srcBpp, out += dstBpp)
+		{
+			out[0] = in[0];
+			out[1] = in[1];
+			out[2] = in[2];
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* Copy an image region pixel by pixel, converting every pixel from
+ * `SrcFormat` to `DstFormat` (using `palette` for the conversion).
+ *
+ * Source rows start at byte offset
+ *   srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset
+ * from pSrcData, destination rows at
+ *   dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset
+ * from pDstData; nXSrc / nXDst select the first pixel inside each row.
+ *
+ * Returns PRIMITIVES_SUCCESS, or -1 as soon as writing a destination pixel
+ * fails.
+ */
+pstatus_t generic_image_copy_no_overlap_convert(
+    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
+    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
+    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
+    int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier, int64_t dstVOffset)
+{
+	const int64_t srcBpp = FreeRDPGetBytesPerPixel(SrcFormat);
+	const int64_t dstBpp = FreeRDPGetBytesPerPixel(DstFormat);
+
+	for (int64_t row = 0; row < nHeight; row++)
+	{
+		const BYTE* WINPR_RESTRICT srcLine =
+		    &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
+		BYTE* WINPR_RESTRICT dstLine =
+		    &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];
+
+		for (int64_t col = 0; col < nWidth; col++)
+		{
+			const BYTE* in = &srcLine[(col + nXSrc) * srcBpp];
+			BYTE* out = &dstLine[(col + nXDst) * dstBpp];
+			const UINT32 srcColor = FreeRDPReadColor_int(in, SrcFormat);
+			const UINT32 converted = FreeRDPConvertColor(srcColor, SrcFormat, DstFormat, palette);
+
+			if (!FreeRDPWriteColor_int(out, DstFormat, converted))
+				return -1;
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* Copy an image region row by row with memcpy(), without converting pixels.
+ * Each row copies nWidth * bytes-per-pixel(DstFormat) bytes, starting at
+ * pixel nXSrc of the source row and pixel nXDst of the destination row.
+ *
+ * Source rows start at byte offset
+ *   srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset
+ * from pSrcData, destination rows at
+ *   dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset
+ * from pDstData. `palette` and `flags` are unused.
+ *
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+pstatus_t generic_image_copy_no_overlap_memcpy(
+    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
+    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
+    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
+    WINPR_ATTR_UNUSED const gdiPalette* WINPR_RESTRICT palette, int64_t srcVMultiplier,
+    int64_t srcVOffset, int64_t dstVMultiplier, int64_t dstVOffset, WINPR_ATTR_UNUSED UINT32 flags)
+{
+	const int64_t dstBpp = FreeRDPGetBytesPerPixel(DstFormat);
+	const int64_t srcBpp = FreeRDPGetBytesPerPixel(SrcFormat);
+	const int64_t rowBytes = nWidth * dstBpp;
+	const int64_t srcColumn = nXSrc * srcBpp;
+	const int64_t dstColumn = nXDst * dstBpp;
+
+	int64_t row = 0;
+	while (row < nHeight)
+	{
+		const int64_t srcRowStart = srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset;
+		const int64_t dstRowStart = dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset;
+		const BYTE* WINPR_RESTRICT from = &pSrcData[srcRowStart];
+		BYTE* WINPR_RESTRICT to = &pDstData[dstRowStart];
+
+		memcpy(&to[dstColumn], &from[srcColumn], WINPR_ASSERTING_INT_CAST(size_t, rowBytes));
+		row++;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* Image copy used when the destination alpha channel is to be kept.
+ *
+ * Format pairs with a dedicated routine copy only the three color bytes of
+ * each pixel and leave the fourth destination byte alone:
+ *   BGR24           -> BGRX32 / BGRA32
+ *   BGRX32 / BGRA32 -> BGRX32 / BGRA32
+ *   RGBX32 / RGBA32 -> RGBX32 / RGBA32
+ * Every other pair falls back to generic_image_copy_no_overlap_convert().
+ *
+ * A 32 bit source is never handed to the 24 bit source routine: that
+ * routine reads 3 bytes and writes 4 bytes per pixel, which does not match
+ * a 32 bit source with a 24 bit destination.
+ *
+ * pDstData and pSrcData must not be NULL (asserted).
+ * Returns the status of the routine that performed the copy.
+ */
+static inline pstatus_t generic_image_copy_no_overlap_dst_alpha(
+    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
+    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
+    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
+    int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier, int64_t dstVOffset)
+{
+	WINPR_ASSERT(pDstData);
+	WINPR_ASSERT(pSrcData);
+
+	switch (SrcFormat)
+	{
+		case PIXEL_FORMAT_BGR24:
+			switch (DstFormat)
+			{
+				case PIXEL_FORMAT_BGRX32:
+				case PIXEL_FORMAT_BGRA32:
+					return generic_image_copy_bgr24_bgrx32(
+					    pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
+					    nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
+				default:
+					break;
+			}
+			break;
+		case PIXEL_FORMAT_BGRX32:
+		case PIXEL_FORMAT_BGRA32:
+			switch (DstFormat)
+			{
+				case PIXEL_FORMAT_BGRX32:
+				case PIXEL_FORMAT_BGRA32:
+					return generic_image_copy_bgrx32_bgrx32(
+					    pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
+					    nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
+				default:
+					break;
+			}
+			break;
+		case PIXEL_FORMAT_RGBX32:
+		case PIXEL_FORMAT_RGBA32:
+			switch (DstFormat)
+			{
+				case PIXEL_FORMAT_RGBX32:
+				case PIXEL_FORMAT_RGBA32:
+					/* Same byte-wise copy as for BGRX: the channel order is
+					 * identical on both sides. */
+					return generic_image_copy_bgrx32_bgrx32(
+					    pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
+					    nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
+				default:
+					/* Includes RGB24: handled by the generic conversion below. */
+					break;
+			}
+			break;
+		default:
+			break;
+	}
+
+	return generic_image_copy_no_overlap_convert(
+	    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
+	    nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
+}
+
+/* Image copy used when the destination alpha does not need to be kept.
+ * When FreeRDPAreColorFormatsEqualNoAlpha() reports the two formats as
+ * equal the rows are copied with memcpy(), otherwise every pixel is
+ * converted individually.
+ *
+ * Returns the status of the routine that performed the copy.
+ */
+static inline pstatus_t generic_image_copy_no_overlap_no_alpha(
+    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
+    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
+    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
+    int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier, int64_t dstVOffset,
+    UINT32 flags)
+{
+	if (!FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
+	{
+		/* Different layouts: per pixel conversion. */
+		return generic_image_copy_no_overlap_convert(
+		    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat,
+		    nSrcStep, nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier,
+		    dstVOffset);
+	}
+
+	/* Matching layouts: plain row copy. */
+	return generic_image_copy_no_overlap_memcpy(
+	    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
+	    nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset, flags);
+}
+
+/* Copy (and if necessary convert) an image region between two buffers.
+ *
+ * pDstData, DstFormat, nDstStep - destination buffer, pixel format, row pitch
+ * nXDst, nYDst                  - destination position in pixels
+ * nWidth, nHeight               - size of the region in pixels
+ * pSrcData, SrcFormat, nSrcStep - source buffer, pixel format, row pitch
+ * nXSrc, nYSrc                  - source position in pixels
+ * palette                       - forwarded to the pixel conversion
+ * flags                         - FREERDP_FLIP_VERTICAL and
+ *                                 FREERDP_KEEP_DST_ALPHA are evaluated here
+ *
+ * A row pitch of 0 is replaced by nWidth * bytes-per-pixel of the format.
+ *
+ * Returns PRIMITIVES_SUCCESS for an empty region, -1 when a dimension
+ * exceeds INT32_MAX or a buffer is NULL, otherwise the status of the copy
+ * routine that was selected.
+ */
+static pstatus_t generic_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
+                                               UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
+                                               UINT32 nWidth, UINT32 nHeight,
+                                               const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
+                                               UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
+                                               const gdiPalette* WINPR_RESTRICT palette,
+                                               UINT32 flags)
+{
+	/* The destination is always walked top-down from its origin. */
+	const int64_t dstVOffset = 0;
+	const int64_t dstVMultiplier = 1;
+	int64_t srcVOffset = 0;
+	int64_t srcVMultiplier = 1;
+
+	if ((nWidth == 0) || (nHeight == 0))
+		return PRIMITIVES_SUCCESS;
+
+	if ((nWidth > INT32_MAX) || (nHeight > INT32_MAX))
+		return -1;
+
+	if (!pSrcData || !pDstData)
+		return -1;
+
+	if (nDstStep == 0)
+		nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
+
+	if (nSrcStep == 0)
+		nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
+
+	if ((flags & FREERDP_FLIP_VERTICAL) != 0)
+	{
+		/* Vertical flip: negate the source row term and offset it by
+		 * (nHeight - 1) rows. */
+		srcVMultiplier = -1;
+		srcVOffset = (nHeight - 1ll) * nSrcStep;
+	}
+
+	const BOOL keepDstAlpha =
+	    ((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat);
+
+	if (!keepDstAlpha)
+	{
+		return generic_image_copy_no_overlap_no_alpha(
+		    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat,
+		    nSrcStep, nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset,
+		    flags);
+	}
+
+	return generic_image_copy_no_overlap_dst_alpha(
+	    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
+	    nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
+}
+
+/* ------------------------------------------------------------------------- */
+/* Install the portable C implementations of the copy primitives into
+ * `prims`.
+ */
+void primitives_init_copy(primitives_t* WINPR_RESTRICT prims)
+{
+	/* Image level copies. */
+	prims->copy_no_overlap = generic_image_copy_no_overlap;
+	prims->copy_8u_AC4r = general_copy_8u_AC4r;
+
+	/* Byte copy; `copy` is the same function seen through the generic
+	 * fn_copy_t signature, so copy_8u has to be assigned first. */
+	prims->copy_8u = general_copy_8u;
+	prims->copy = WINPR_FUNC_PTR_CAST(prims->copy_8u, fn_copy_t);
+}
+
+/* Initialize the copy primitives in `prims`, allowing CPU specific
+ * initializers to run after the generic ones.
+ */
+void primitives_init_copy_opt(primitives_t* WINPR_RESTRICT prims)
+{
+	/* Generic implementations first, so every entry has a valid function. */
+	primitives_init_copy(prims);
+	/* The wrappers below check the CPU feature themselves before calling
+	 * their implementation. */
+	primitives_init_copy_sse41(prims);
+#if defined(WITH_AVX2)
+	/* Only compiled in when AVX2 support was enabled at build time. */
+	primitives_init_copy_avx2(prims);
+#endif
+}
@@ -0,0 +1,63 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Primitives copy
+ *
+ * Copyright 2024 Armin Novak <anovak@thincast.com>
+ * Copyright 2024 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FREERDP_LIB_PRIM_COPY_H
+#define FREERDP_LIB_PRIM_COPY_H
+
+#include <winpr/wtypes.h>
+#include <winpr/sysinfo.h>
+
+#include <freerdp/config.h>
+#include <freerdp/primitives.h>
+
+/* Copy an image region pixel by pixel, converting each pixel from SrcFormat
+ * to DstFormat (palette is passed to the conversion).
+ * Source rows are addressed at byte offset
+ *   srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset
+ * and destination rows at
+ *   dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset.
+ * Returns PRIMITIVES_SUCCESS, or -1 when a destination pixel can not be
+ * written.
+ */
+WINPR_ATTR_NODISCARD FREERDP_LOCAL pstatus_t generic_image_copy_no_overlap_convert(
+    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
+    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
+    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
+    int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier, int64_t dstVOffset);
+
+/* Copy an image region row by row with memcpy(), without converting pixels.
+ * Rows are addressed the same way as in
+ * generic_image_copy_no_overlap_convert(); each row copies
+ * nWidth * bytes-per-pixel(DstFormat) bytes. palette and flags are not used.
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+WINPR_ATTR_NODISCARD FREERDP_LOCAL pstatus_t generic_image_copy_no_overlap_memcpy(
+    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
+    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
+    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
+    int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier, int64_t dstVOffset,
+    UINT32 flags);
+
+FREERDP_LOCAL void primitives_init_copy_sse41_int(primitives_t* WINPR_RESTRICT prims);
+
+/* Call primitives_init_copy_sse41_int() only when the CPU reports SSE4.1
+ * support; otherwise `prims` is left as it is.
+ */
+static inline void primitives_init_copy_sse41(primitives_t* WINPR_RESTRICT prims)
+{
+	if (IsProcessorFeaturePresent(PF_SSE4_1_INSTRUCTIONS_AVAILABLE))
+	{
+		primitives_init_copy_sse41_int(prims);
+	}
+}
+
+#if defined(WITH_AVX2)
+FREERDP_LOCAL void primitives_init_copy_avx2_int(primitives_t* WINPR_RESTRICT prims);
+
+/* Call primitives_init_copy_avx2_int() only when the CPU reports AVX2
+ * support; otherwise `prims` is left as it is.
+ */
+static inline void primitives_init_copy_avx2(primitives_t* WINPR_RESTRICT prims)
+{
+	if (IsProcessorFeaturePresent(PF_AVX2_INSTRUCTIONS_AVAILABLE))
+	{
+		primitives_init_copy_avx2_int(prims);
+	}
+}
+#endif
+
+#endif
@@ -0,0 +1,352 @@
+/* prim_internal.h
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.  Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ *
+ */
+
+#ifndef FREERDP_LIB_PRIM_INTERNAL_H
+#define FREERDP_LIB_PRIM_INTERNAL_H
+
+#include <winpr/platform.h>
+#include <freerdp/config.h>
+
+#include <freerdp/primitives.h>
+#include <freerdp/api.h>
+
+#include "../core/simd.h"
+
+#define PRIM_ALIGN_128 DECLSPEC_ALIGN(16)
+
+#if defined(SSE_AVX_INTRINSICS_ENABLED) || defined(NEON_INTRINSICS_ENABLED) || defined(WITH_OPENCL)
+#define HAVE_OPTIMIZED_PRIMITIVES 1
+#endif
+
+#if defined(SSE_AVX_INTRINSICS_ENABLED) || defined(NEON_INTRINSICS_ENABLED)
+#define HAVE_CPU_OPTIMIZED_PRIMITIVES 1
+#endif
+
+/* Store one pixel as the byte sequence B, G, R, A at dst and return the address just past
+ * the four written bytes. formatSize and format are ignored. */
+WINPR_ATTR_NODISCARD
+static inline BYTE* writePixelBGRA(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
+                                   BYTE B, BYTE A)
+{
+	WINPR_UNUSED(formatSize);
+	WINPR_UNUSED(format);
+
+	dst[0] = B;
+	dst[1] = G;
+	dst[2] = R;
+	dst[3] = A;
+	return dst + 4;
+}
+
+/* Store one pixel as the byte sequence B, G, R at dst, leave the fourth byte as it is and
+ * return the address just past the four byte pixel. formatSize, format and A are ignored. */
+WINPR_ATTR_NODISCARD
+static inline BYTE* writePixelBGRX(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
+                                   BYTE B, BYTE A)
+{
+	WINPR_UNUSED(formatSize);
+	WINPR_UNUSED(format);
+	WINPR_UNUSED(A);
+
+	dst[0] = B;
+	dst[1] = G;
+	dst[2] = R;
+	/* dst[3] is deliberately not written: the existing alpha byte is preserved */
+
+	return dst + 4;
+}
+
+/* Store one pixel as the byte sequence R, G, B, A at dst and return the address just past
+ * the four written bytes. formatSize and format are ignored. */
+WINPR_ATTR_NODISCARD
+static inline BYTE* writePixelRGBA(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
+                                   BYTE B, BYTE A)
+{
+	WINPR_UNUSED(formatSize);
+	WINPR_UNUSED(format);
+
+	dst[0] = R;
+	dst[1] = G;
+	dst[2] = B;
+	dst[3] = A;
+	return dst + 4;
+}
+
+/* Store one pixel as the byte sequence R, G, B at dst, leave the fourth byte as it is and
+ * return the address just past the four byte pixel. formatSize, format and A are ignored. */
+WINPR_ATTR_NODISCARD
+static inline BYTE* writePixelRGBX(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
+                                   BYTE B, BYTE A)
+{
+	WINPR_UNUSED(formatSize);
+	WINPR_UNUSED(format);
+	WINPR_UNUSED(A);
+
+	dst[0] = R;
+	dst[1] = G;
+	dst[2] = B;
+	/* dst[3] is deliberately not written: the existing alpha byte is preserved */
+
+	return dst + 4;
+}
+
+/* Store one pixel as the byte sequence A, B, G, R at dst and return the address just past
+ * the four written bytes. formatSize and format are ignored. */
+WINPR_ATTR_NODISCARD
+static inline BYTE* writePixelABGR(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
+                                   BYTE B, BYTE A)
+{
+	WINPR_UNUSED(formatSize);
+	WINPR_UNUSED(format);
+
+	dst[0] = A;
+	dst[1] = B;
+	dst[2] = G;
+	dst[3] = R;
+	return dst + 4;
+}
+
+/* Skip the first byte of the pixel (left as it is), store B, G, R in the following three
+ * bytes and return the address just past the four byte pixel. formatSize, format and A are
+ * ignored. */
+WINPR_ATTR_NODISCARD
+static inline BYTE* writePixelXBGR(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
+                                   BYTE B, BYTE A)
+{
+	WINPR_UNUSED(formatSize);
+	WINPR_UNUSED(format);
+	WINPR_UNUSED(A);
+
+	/* dst[0] is deliberately not written: the existing alpha byte is preserved */
+	dst[1] = B;
+	dst[2] = G;
+	dst[3] = R;
+	return dst + 4;
+}
+
+/* Store one pixel as the byte sequence A, R, G, B at dst and return the address just past
+ * the four written bytes. formatSize and format are ignored. */
+WINPR_ATTR_NODISCARD
+static inline BYTE* writePixelARGB(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
+                                   BYTE B, BYTE A)
+{
+	WINPR_UNUSED(formatSize);
+	WINPR_UNUSED(format);
+
+	dst[0] = A;
+	dst[1] = R;
+	dst[2] = G;
+	dst[3] = B;
+	return dst + 4;
+}
+
+/* Skip the first byte of the pixel (left as it is), store R, G, B in the following three
+ * bytes and return the address just past the four byte pixel. formatSize, format and A are
+ * ignored. */
+WINPR_ATTR_NODISCARD
+static inline BYTE* writePixelXRGB(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
+                                   BYTE B, BYTE A)
+{
+	WINPR_UNUSED(formatSize);
+	WINPR_UNUSED(format);
+	WINPR_UNUSED(A);
+
+	/* dst[0] is deliberately not written: the existing alpha byte is preserved */
+	dst[1] = R;
+	dst[2] = G;
+	dst[3] = B;
+	return dst + 4;
+}
+
+/* Format-agnostic pixel writer: builds the color value with FreeRDPGetColor(), stores it
+ * with FreeRDPWriteColor() and returns dst advanced by formatSize bytes. */
+WINPR_ATTR_NODISCARD
+static inline BYTE* writePixelGenericAlpha(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R,
+                                           BYTE G, BYTE B, BYTE A)
+{
+	BYTE* const next = dst + formatSize;
+	const UINT32 packed = FreeRDPGetColor(format, R, G, B, A);
+
+	FreeRDPWriteColor(dst, format, packed);
+	return next;
+}
+
+/* Format-agnostic pixel writer: builds the color value with FreeRDPGetColor(), stores it
+ * with FreeRDPWriteColorIgnoreAlpha() and returns dst advanced by formatSize bytes. */
+WINPR_ATTR_NODISCARD
+static inline BYTE* writePixelGeneric(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
+                                      BYTE B, BYTE A)
+{
+	BYTE* const next = dst + formatSize;
+	const UINT32 packed = FreeRDPGetColor(format, R, G, B, A);
+
+	FreeRDPWriteColorIgnoreAlpha(dst, format, packed);
+	return next;
+}
+
+/* Signature shared by all writePixel* helpers in this header:
+ * (dst, formatSize, format, R, G, B, A) -> pointer just past the written pixel. */
+typedef BYTE* (*fkt_writePixel)(BYTE*, DWORD, UINT32, BYTE, BYTE, BYTE, BYTE);
+
+/* Select the pixel writer for format. The eight 32 bit layouts get their dedicated
+ * writers, every other format falls back to the generic writers. useAlpha chooses between
+ * the variant that stores A and the variant that leaves the alpha byte alone. */
+WINPR_ATTR_NODISCARD
+static inline fkt_writePixel getPixelWriteFunction(DWORD format, BOOL useAlpha)
+{
+	fkt_writePixel withAlpha = writePixelGenericAlpha;
+	fkt_writePixel withoutAlpha = writePixelGeneric;
+
+	switch (format)
+	{
+		case PIXEL_FORMAT_ARGB32:
+		case PIXEL_FORMAT_XRGB32:
+			withAlpha = writePixelARGB;
+			withoutAlpha = writePixelXRGB;
+			break;
+
+		case PIXEL_FORMAT_ABGR32:
+		case PIXEL_FORMAT_XBGR32:
+			withAlpha = writePixelABGR;
+			withoutAlpha = writePixelXBGR;
+			break;
+
+		case PIXEL_FORMAT_RGBA32:
+		case PIXEL_FORMAT_RGBX32:
+			withAlpha = writePixelRGBA;
+			withoutAlpha = writePixelRGBX;
+			break;
+
+		case PIXEL_FORMAT_BGRA32:
+		case PIXEL_FORMAT_BGRX32:
+			withAlpha = writePixelBGRA;
+			withoutAlpha = writePixelBGRX;
+			break;
+
+		default:
+			/* keep the generic writers selected above */
+			break;
+	}
+
+	return useAlpha ? withAlpha : withoutAlpha;
+}
+
+/* Saturate X to the range of a BYTE: values below 0 become 0, values above 255 become 255,
+ * everything else is returned unchanged. */
+WINPR_ATTR_NODISCARD
+static inline BYTE CLIP(INT64 X)
+{
+	if (X < 0L)
+		return 0L;
+
+	return (X > 255L) ? 255L : (BYTE)X;
+}
+
+/* Clip in to 0..255 and compare it with original: when the absolute difference is below 30
+ * the original value is kept, otherwise the clipped value is returned. */
+WINPR_ATTR_NODISCARD
+static inline BYTE CONDITIONAL_CLIP(INT32 in, BYTE original)
+{
+	const BYTE clipped = CLIP(in);
+	const BYTE delta =
+	    (clipped > original) ? (BYTE)(clipped - original) : (BYTE)(original - clipped);
+
+	return (delta < 30) ? original : clipped;
+}
+
+/**
+ * | R |   ( | 256     0    403 | |    Y    | )
+ * | G | = ( | 256   -48   -120 | | U - 128 | ) >> 8
+ * | B |   ( | 256   475      0 | | V - 128 | )
+ */
+/* Y term used by YUV2R/YUV2G/YUV2B: Y with an offset of 0 subtracted. */
+static inline INT32 C(INT32 Y)
+{
+	const INT32 offset = 0;
+	return Y - offset;
+}
+
+/* U term used by YUV2R/YUV2G/YUV2B: U with an offset of 128 subtracted. */
+static inline INT32 D(INT32 U)
+{
+	const INT32 offset = 128;
+	return U - offset;
+}
+
+/* V term used by YUV2R/YUV2G/YUV2B: V with an offset of 128 subtracted. */
+static inline INT32 E(INT32 V)
+{
+	const INT32 offset = 128;
+	return V - offset;
+}
+
+/* Red channel: (256 * C(Y) + 0 * D(U) + 403 * E(V)) >> 8, saturated to 0..255 by CLIP. */
+WINPR_ATTR_NODISCARD
+static inline BYTE YUV2R(INT32 Y, INT32 U, INT32 V)
+{
+	const INT32 yTerm = 256 * C(Y);
+	const INT32 uTerm = 0 * D(U);
+	const INT32 vTerm = 403 * E(V);
+
+	return CLIP((yTerm + uTerm + vTerm) >> 8);
+}
+
+/* Green channel: (256 * C(Y) - 48 * D(U) - 120 * E(V)) >> 8, saturated to 0..255 by CLIP. */
+WINPR_ATTR_NODISCARD
+static inline BYTE YUV2G(INT32 Y, INT32 U, INT32 V)
+{
+	const INT32 yTerm = 256 * C(Y);
+	const INT32 uTerm = 48 * D(U);
+	const INT32 vTerm = 120 * E(V);
+
+	return CLIP((yTerm - uTerm - vTerm) >> 8);
+}
+
+/* Blue channel: (256 * C(Y) + 475 * D(U) + 0 * E(V)) >> 8, saturated to 0..255 by CLIP. */
+WINPR_ATTR_NODISCARD
+static inline BYTE YUV2B(INT32 Y, INT32 U, INT32 V)
+{
+	const INT32 yTerm = 256 * C(Y);
+	const INT32 uTerm = 475 * D(U);
+	const INT32 vTerm = 0 * E(V);
+
+	return CLIP((yTerm + uTerm + vTerm) >> 8);
+}
+
+/**
+ * | Y |    ( |  54   183     18 | | R | )        |  0  |
+ * | U | =  ( | -29   -99    128 | | G | ) >> 8 + | 128 |
+ * | V |    ( | 128  -116    -12 | | B | )        | 128 |
+ */
+/* Y = (54 * R + 183 * G + 18 * B) >> 8; the result is range checked by the asserting cast. */
+WINPR_ATTR_NODISCARD
+static inline BYTE RGB2Y(INT32 R, INT32 G, INT32 B)
+{
+	const INT32 weighted = 54 * R + 183 * G + 18 * B;
+	const INT32 val = weighted >> 8;
+
+	return WINPR_ASSERTING_INT_CAST(BYTE, val);
+}
+
+/* U = ((-29 * R - 99 * G + 128 * B) >> 8) + 128; the result is range checked by the
+ * asserting cast. */
+WINPR_ATTR_NODISCARD
+static inline BYTE RGB2U(INT32 R, INT32 G, INT32 B)
+{
+	const INT32 weighted = -29 * R - 99 * G + 128 * B;
+	const INT32 val = (weighted >> 8) + 128;
+
+	return WINPR_ASSERTING_INT_CAST(BYTE, val);
+}
+
+/* V = ((128 * R - 116 * G - 12 * B) >> 8) + 128; the result is range checked by the
+ * asserting cast. */
+WINPR_ATTR_NODISCARD
+static inline BYTE RGB2V(INT32 R, INT32 G, INT32 B)
+{
+	const INT32 weighted = 128 * R - 116 * G - 12 * B;
+	const INT32 val = (weighted >> 8) + 128;
+
+	return WINPR_ASSERTING_INT_CAST(BYTE, val);
+}
+
+/* Convert one (y, u, v) sample to R, G, B with YUV2R/YUV2G/YUV2B and hand it to the pixel
+ * writer fkt together with the byte size of DstFormat. The alpha argument passed to fkt is
+ * always 0. Returns whatever fkt returns (the position after the written pixel). */
+static inline BYTE* writeYUVPixel(BYTE* dst, UINT32 DstFormat, INT32 y, INT32 u, INT32 v,
+                                  fkt_writePixel fkt)
+{
+	WINPR_ASSERT(fkt);
+
+	const DWORD bytesPerPixel = FreeRDPGetBytesPerPixel(DstFormat);
+	const BYTE red = YUV2R(y, u, v);
+	const BYTE green = YUV2G(y, u, v);
+	const BYTE blue = YUV2B(y, u, v);
+
+	return fkt(dst, bytesPerPixel, DstFormat, red, green, blue, 0);
+}
+
+/* Declarations only; the implementations are not part of this header.
+ * NOTE(review): judging by the names these are the generic (non SIMD) encoders that turn
+ * two BGRX source rows (even and odd) into the AVC444 / AVC444v2 output planes, starting
+ * at offset and covering width pixels - confirm against the implementation. */
+FREERDP_LOCAL void general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
+    size_t offset, const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
+    BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
+    BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
+    BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width);
+
+FREERDP_LOCAL void general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
+    size_t offset, const BYTE* WINPR_RESTRICT pSrcEven, const BYTE* WINPR_RESTRICT pSrcOdd,
+    BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
+    BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
+    BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
+    BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2,
+    BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
+    BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width);
+
+/* Function prototypes for the per-category init routines.
+ * Each primitives_init_<category>() fills the matching function pointers of prims; the
+ * definitions live in the corresponding prim_<category>.c files. */
+FREERDP_LOCAL void primitives_init_copy(primitives_t* WINPR_RESTRICT prims);
+FREERDP_LOCAL void primitives_init_set(primitives_t* WINPR_RESTRICT prims);
+FREERDP_LOCAL void primitives_init_add(primitives_t* WINPR_RESTRICT prims);
+FREERDP_LOCAL void primitives_init_andor(primitives_t* WINPR_RESTRICT prims);
+FREERDP_LOCAL void primitives_init_shift(primitives_t* WINPR_RESTRICT prims);
+FREERDP_LOCAL void primitives_init_sign(primitives_t* WINPR_RESTRICT prims);
+FREERDP_LOCAL void primitives_init_alphaComp(primitives_t* WINPR_RESTRICT prims);
+FREERDP_LOCAL void primitives_init_colors(primitives_t* WINPR_RESTRICT prims);
+FREERDP_LOCAL void primitives_init_YCoCg(primitives_t* WINPR_RESTRICT prims);
+FREERDP_LOCAL void primitives_init_YUV(primitives_t* WINPR_RESTRICT prims);
+
+/* The *_opt variants are the entry points used when CPU optimized primitives are built.
+ * NOTE(review): presumably each one installs the generic routines first and then lets
+ * the SIMD backend override them - confirm per category. */
+FREERDP_LOCAL void primitives_init_copy_opt(primitives_t* WINPR_RESTRICT prims);
+FREERDP_LOCAL void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims);
+FREERDP_LOCAL void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims);
+FREERDP_LOCAL void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims);
+FREERDP_LOCAL void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims);
+FREERDP_LOCAL void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims);
+FREERDP_LOCAL void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims);
+FREERDP_LOCAL void primitives_init_colors_opt(primitives_t* WINPR_RESTRICT prims);
+FREERDP_LOCAL void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims);
+FREERDP_LOCAL void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims);
+
+#if defined(WITH_OPENCL)
+/* OpenCL backend initializer, only declared when WITH_OPENCL is defined. Returns a BOOL
+ * that callers must check (WINPR_ATTR_NODISCARD). */
+WINPR_ATTR_NODISCARD
+FREERDP_LOCAL BOOL primitives_init_opencl(primitives_t* WINPR_RESTRICT prims);
+#endif
+
+#endif /* FREERDP_LIB_PRIM_INTERNAL_H */
@@ -0,0 +1,137 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Routines to set a chunk of memory to a constant.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+#include <freerdp/config.h>
+
+#include <string.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+#include "prim_set.h"
+
+/* ========================================================================= */
+/* Fill len bytes starting at pDst with val (plain memset). Always returns
+ * PRIMITIVES_SUCCESS. */
+static pstatus_t general_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 len)
+{
+	const int fill = (int)val;
+	const size_t count = (size_t)len;
+
+	memset((void*)pDst, fill, count);
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Clear len bytes starting at pDst (plain memset with 0). Always returns
+ * PRIMITIVES_SUCCESS. */
+static pstatus_t general_zero(void* WINPR_RESTRICT pDst, size_t len)
+{
+	const int fill = 0;
+
+	memset(pDst, fill, len);
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ========================================================================= */
+/* Fill len INT32 elements starting at pDst with val.
+ *
+ * Short buffers (fewer than 256 elements) are written with a simple loop. Longer buffers
+ * use a doubling copy: the first element is stored, then the already filled prefix is
+ * copied behind itself with prims->copy_8u() so the filled area doubles on every pass.
+ *
+ * copy_8u() takes its length as INT32 bytes, so every pass is limited to INT32_MAX bytes.
+ * (Previously only the element count was asserted while the byte count was cast, which
+ * could truncate for very large len.)
+ *
+ * Returns PRIMITIVES_SUCCESS, or the first error reported by copy_8u().
+ */
+static pstatus_t general_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len)
+{
+	INT32* dptr = pDst;
+
+	if (len < 256)
+	{
+		while (len--)
+			*dptr++ = val;
+
+		return PRIMITIVES_SUCCESS;
+	}
+
+	/* else doubling copy: seed one element, then replicate the filled prefix */
+	const size_t total = len;
+	const size_t maxChunk = (size_t)INT32_MAX / sizeof(INT32);
+	size_t filled = 1;
+	*dptr = val;
+	primitives_t* prims = primitives_get();
+
+	while (filled < total)
+	{
+		/* never copy more than is already filled, than is still missing,
+		 * or than fits into the INT32 byte length of copy_8u */
+		size_t chunk = filled;
+
+		if (chunk > total - filled)
+			chunk = total - filled;
+		if (chunk > maxChunk)
+			chunk = maxChunk;
+
+		const size_t bytes = chunk * sizeof(INT32);
+		const pstatus_t rc = prims->copy_8u((BYTE*)dptr, (BYTE*)(dptr + filled), (INT32)bytes);
+		if (rc != PRIMITIVES_SUCCESS)
+			return rc;
+
+		filled += chunk;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Fill len UINT32 elements starting at pDst with val.
+ *
+ * Short buffers (fewer than 256 elements) are written with a simple loop. Longer buffers
+ * use a doubling copy: the first element is stored, then the already filled prefix is
+ * copied behind itself with prims->copy_8u() so the filled area doubles on every pass.
+ *
+ * copy_8u() takes its length as INT32 bytes, so every pass is limited to INT32_MAX bytes.
+ * (Previously only the element count was asserted while the byte count was cast, which
+ * could truncate for very large len.)
+ *
+ * Returns PRIMITIVES_SUCCESS, or the first error reported by copy_8u().
+ */
+static pstatus_t general_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 len)
+{
+	UINT32* dptr = pDst;
+
+	if (len < 256)
+	{
+		while (len--)
+			*dptr++ = val;
+
+		return PRIMITIVES_SUCCESS;
+	}
+
+	/* else doubling copy: seed one element, then replicate the filled prefix */
+	const size_t total = len;
+	const size_t maxChunk = (size_t)INT32_MAX / sizeof(UINT32);
+	size_t filled = 1;
+	*dptr = val;
+	primitives_t* prims = primitives_get();
+
+	while (filled < total)
+	{
+		/* never copy more than is already filled, than is still missing,
+		 * or than fits into the INT32 byte length of copy_8u */
+		size_t chunk = filled;
+
+		if (chunk > total - filled)
+			chunk = total - filled;
+		if (chunk > maxChunk)
+			chunk = maxChunk;
+
+		const size_t bytes = chunk * sizeof(UINT32);
+		const pstatus_t rc = prims->copy_8u((BYTE*)dptr, (BYTE*)(dptr + filled), (INT32)bytes);
+		if (rc != PRIMITIVES_SUCCESS)
+			return rc;
+
+		filled += chunk;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Install the portable C implementations of the set/zero primitives into prims. */
+void primitives_init_set(primitives_t* WINPR_RESTRICT prims)
+{
+	/* byte oriented routines */
+	prims->zero = general_zero;
+	prims->set_8u = general_set_8u;
+
+	/* 32 bit element routines */
+	prims->set_32u = general_set_32u;
+	prims->set_32s = general_set_32s;
+}
+
+/* Install the set primitives for the optimized backend: the generic routines are set up
+ * first, then primitives_init_set_sse2() is given the chance to act on prims (it does
+ * nothing unless the CPU reports SSE2 and SSE3). The order of the two calls matters. */
+void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims)
+{
+	primitives_init_set(prims);
+	primitives_init_set_sse2(prims);
+}
@@ -0,0 +1,42 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Primitives set
+ *
+ * Copyright 2024 Armin Novak <anovak@thincast.com>
+ * Copyright 2024 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FREERDP_LIB_PRIM_SET_H
+#define FREERDP_LIB_PRIM_SET_H
+
+#include <winpr/wtypes.h>
+#include <winpr/sysinfo.h>
+
+#include <freerdp/config.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+FREERDP_LOCAL void primitives_init_set_sse2_int(primitives_t* WINPR_RESTRICT prims);
+
+/* Runtime-guarded entry point: forwards prims to primitives_init_set_sse2_int() only when
+ * the processor reports both SSE2 and SSE3; otherwise prims is not touched. */
+static inline void primitives_init_set_sse2(primitives_t* WINPR_RESTRICT prims)
+{
+	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
+	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+	{
+		primitives_init_set_sse2_int(prims);
+	}
+}
+
+#endif
@@ -0,0 +1,150 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Shift operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+#include <winpr/assert.h>
+#include <winpr/cast.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+#include "prim_shift.h"
+
+/* ------------------------------------------------------------------------- */
+/* Left shift a 16 bit signed value by sh bits. The shift is done on the value converted
+ * to UINT32 and the result is masked to the low 16 bits before converting back. */
+static inline INT16 shift(INT16 val, UINT32 sh)
+{
+	const UINT32 widened = (UINT32)val;
+	const UINT32 masked = (widened << sh) & 0xFFFF;
+	const INT16 rc = (int16_t)masked;
+
+	return WINPR_ASSERTING_INT_CAST(INT16, rc);
+}
+
+/* Left shift every one of the len INT16 values in pSrcDst by val bits, in place.
+ * val == 0 is a no-op that reports success; val >= 16 is rejected with -1. */
+static inline pstatus_t general_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val,
+                                                    UINT32 len)
+{
+	if (val >= 16)
+		return -1;
+
+	if (val != 0)
+	{
+		UINT32 idx = 0;
+
+		while (idx < len)
+		{
+			pSrcDst[idx] = shift(pSrcDst[idx], val);
+			idx++;
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* Left shift len INT16 values from pSrc by val bits and store the results in pDst.
+ *
+ * val >= 16 is rejected with -1. A shift of 0 is a valid request and copies pSrc to pDst:
+ * the previous early return for val == 0 reported success but left pDst unwritten, which
+ * is only correct for the in-place variant.
+ *
+ * Returns PRIMITIVES_SUCCESS on success.
+ */
+static inline pstatus_t general_lShiftC_16s(const INT16* WINPR_RESTRICT pSrc, UINT32 val,
+                                            INT16* WINPR_RESTRICT pDst, UINT32 len)
+{
+	if (val >= 16)
+		return -1;
+
+	/* for val == 0 shift() returns its input, so the loop degenerates to a copy */
+	for (UINT32 x = 0; x < len; x++)
+		pDst[x] = shift(pSrc[x], val);
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Right shift len INT16 values from pSrc by val bits and store the results in pDst.
+ *
+ * val >= 16 is rejected with -1. A shift of 0 is a valid request and copies pSrc to pDst:
+ * the previous early return for val == 0 reported success but left pDst unwritten.
+ *
+ * Returns PRIMITIVES_SUCCESS on success.
+ */
+static inline pstatus_t general_rShiftC_16s(const INT16* WINPR_RESTRICT pSrc, UINT32 val,
+                                            INT16* WINPR_RESTRICT pDst, UINT32 len)
+{
+	if (val >= 16)
+		return -1;
+
+	/* for val == 0 the shift is the identity, so the loop degenerates to a copy */
+	for (UINT32 x = 0; x < len; x++)
+		pDst[x] = WINPR_ASSERTING_INT_CAST(int16_t, pSrc[x] >> val);
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Left shift len UINT16 values from pSrc by val bits (result masked to 16 bits) and store
+ * the results in pDst.
+ *
+ * val >= 16 is rejected with -1. A shift of 0 is a valid request and copies pSrc to pDst:
+ * the previous early return for val == 0 reported success but left pDst unwritten.
+ *
+ * Returns PRIMITIVES_SUCCESS on success.
+ */
+static inline pstatus_t general_lShiftC_16u(const UINT16* WINPR_RESTRICT pSrc, UINT32 val,
+                                            UINT16* WINPR_RESTRICT pDst, UINT32 len)
+{
+	if (val >= 16)
+		return -1;
+
+	/* for val == 0 the shift is the identity, so the loop degenerates to a copy */
+	for (UINT32 x = 0; x < len; x++)
+		pDst[x] = WINPR_ASSERTING_INT_CAST(UINT16, ((pSrc[x] << val) & 0xFFFF));
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Right shift len UINT16 values from pSrc by val bits and store the results in pDst.
+ *
+ * val >= 16 is rejected with -1. A shift of 0 is a valid request and copies pSrc to pDst:
+ * the previous early return for val == 0 reported success but left pDst unwritten.
+ *
+ * Returns PRIMITIVES_SUCCESS on success.
+ */
+static inline pstatus_t general_rShiftC_16u(const UINT16* WINPR_RESTRICT pSrc, UINT32 val,
+                                            UINT16* WINPR_RESTRICT pDst, UINT32 len)
+{
+	if (val >= 16)
+		return -1;
+
+	/* for val == 0 the shift is the identity, so the loop degenerates to a copy */
+	for (UINT32 x = 0; x < len; x++)
+		pDst[x] = pSrc[x] >> val;
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Signed-direction wrapper for INT16 data: a negative val shifts right by -val bits, a
+ * positive val shifts left by val bits, results go to pDst.
+ *
+ * Two corrections over the previous version:
+ *  - val == 0 now copies pSrc to pDst instead of returning with pDst unwritten.
+ *  - the magnitude of a negative val is computed in unsigned arithmetic, so INT32_MIN no
+ *    longer triggers signed overflow; it simply yields a shift >= 16 which the right shift
+ *    routine rejects with -1.
+ *
+ * Returns PRIMITIVES_SUCCESS on success, -1 when the shift magnitude is 16 or more.
+ */
+static inline pstatus_t general_shiftC_16s(const INT16* WINPR_RESTRICT pSrc, INT32 val,
+                                           INT16* WINPR_RESTRICT pDst, UINT32 len)
+{
+	if (val == 0)
+	{
+		/* identity shift: plain element copy */
+		for (UINT32 x = 0; x < len; x++)
+			pDst[x] = pSrc[x];
+
+		return PRIMITIVES_SUCCESS;
+	}
+
+	if (val < 0)
+	{
+		/* unsigned negation is well defined for every INT32, including INT32_MIN */
+		const UINT32 magnitude = (UINT32)0 - (UINT32)val;
+		return general_rShiftC_16s(pSrc, magnitude, pDst, len);
+	}
+
+	return general_lShiftC_16s(pSrc, WINPR_ASSERTING_INT_CAST(UINT32, val), pDst, len);
+}
+
+/* ------------------------------------------------------------------------- */
+/* Signed-direction wrapper for UINT16 data: a negative val shifts right by -val bits, a
+ * positive val shifts left by val bits, results go to pDst.
+ *
+ * Two corrections over the previous version:
+ *  - val == 0 now copies pSrc to pDst instead of returning with pDst unwritten.
+ *  - the magnitude of a negative val is computed in unsigned arithmetic, so INT32_MIN no
+ *    longer triggers signed overflow; it simply yields a shift >= 16 which the right shift
+ *    routine rejects with -1.
+ *
+ * Returns PRIMITIVES_SUCCESS on success, -1 when the shift magnitude is 16 or more.
+ */
+static inline pstatus_t general_shiftC_16u(const UINT16* WINPR_RESTRICT pSrc, INT32 val,
+                                           UINT16* WINPR_RESTRICT pDst, UINT32 len)
+{
+	if (val == 0)
+	{
+		/* identity shift: plain element copy */
+		for (UINT32 x = 0; x < len; x++)
+			pDst[x] = pSrc[x];
+
+		return PRIMITIVES_SUCCESS;
+	}
+
+	if (val < 0)
+	{
+		/* unsigned negation is well defined for every INT32, including INT32_MIN */
+		const UINT32 magnitude = (UINT32)0 - (UINT32)val;
+		return general_rShiftC_16u(pSrc, magnitude, pDst, len);
+	}
+
+	return general_lShiftC_16u(pSrc, WINPR_ASSERTING_INT_CAST(UINT32, val), pDst, len);
+}
+
+/* ------------------------------------------------------------------------- */
+/* Install the portable C implementations of the shift primitives into prims. */
+void primitives_init_shift(primitives_t* WINPR_RESTRICT prims)
+{
+	/* direction-agnostic wrappers */
+	prims->shiftC_16u = general_shiftC_16u;
+	prims->shiftC_16s = general_shiftC_16s;
+
+	/* unsigned 16 bit shifts */
+	prims->rShiftC_16u = general_rShiftC_16u;
+	prims->lShiftC_16u = general_lShiftC_16u;
+
+	/* signed 16 bit shifts */
+	prims->rShiftC_16s = general_rShiftC_16s;
+	prims->lShiftC_16s = general_lShiftC_16s;
+	prims->lShiftC_16s_inplace = general_lShiftC_16s_inplace;
+}
+
+/* Install the shift primitives for the optimized backend: the generic routines are set up
+ * first, then primitives_init_shift_sse3() is given the chance to act on prims (it does
+ * nothing unless the CPU reports SSE2 and SSE3). The order of the two calls matters. */
+void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims)
+{
+	primitives_init_shift(prims);
+	primitives_init_shift_sse3(prims);
+}
@@ -0,0 +1,41 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Primitives shift
+ *
+ * Copyright 2024 Armin Novak <anovak@thincast.com>
+ * Copyright 2024 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FREERDP_LIB_PRIM_SHIFT_H
+#define FREERDP_LIB_PRIM_SHIFT_H
+
+#include <winpr/wtypes.h>
+#include <winpr/sysinfo.h>
+
+#include <freerdp/config.h>
+#include <freerdp/primitives.h>
+#include "prim_internal.h"
+
+FREERDP_LOCAL void primitives_init_shift_sse3_int(primitives_t* WINPR_RESTRICT prims);
+
+/* Runtime-guarded entry point: forwards prims to primitives_init_shift_sse3_int() only
+ * when the processor reports both SSE2 and SSE3; otherwise prims is not touched. */
+static inline void primitives_init_shift_sse3(primitives_t* WINPR_RESTRICT prims)
+{
+	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
+	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+	{
+		primitives_init_shift_sse3_int(prims);
+	}
+}
+
+#endif
@@ -0,0 +1,50 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Sign operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+#include "prim_sign.h"
+
+/* ----------------------------------------------------------------------------
+ * For each of the len 16-bit values in pSrc store its sign in pDst:
+ * -1 for negative values, 1 for positive values and 0 for zero.
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t general_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
+                                  UINT32 len)
+{
+	for (UINT32 idx = 0; idx < len; idx++)
+	{
+		const INT16 value = pSrc[idx];
+		INT16 sign = 0;
+
+		if (value < 0)
+			sign = -1;
+		else if (value > 0)
+			sign = 1;
+
+		pDst[idx] = sign;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Install the portable C implementation of the sign primitive into prims. */
+void primitives_init_sign(primitives_t* WINPR_RESTRICT prims)
+{
+	/* generic fallback; an optimized backend may replace it afterwards */
+	prims->sign_16s = general_sign_16s;
+}
+
+/* Install the sign primitive for the optimized backend: the generic routine is set up
+ * first, then primitives_init_sign_ssse3() is given the chance to act on prims (it does
+ * nothing unless the CPU reports SSSE3 and SSE3). The order of the two calls matters. */
+void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims)
+{
+	primitives_init_sign(prims);
+	primitives_init_sign_ssse3(prims);
+}
@@ -0,0 +1,42 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Primitives sign
+ *
+ * Copyright 2024 Armin Novak <anovak@thincast.com>
+ * Copyright 2024 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FREERDP_LIB_PRIM_SIGN_H
+#define FREERDP_LIB_PRIM_SIGN_H
+
+#include <winpr/wtypes.h>
+#include <winpr/sysinfo.h>
+
+#include <freerdp/config.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+FREERDP_LOCAL void primitives_init_sign_ssse3_int(primitives_t* WINPR_RESTRICT prims);
+
+/* Runtime-guarded entry point: forwards prims to primitives_init_sign_ssse3_int() only
+ * when the processor reports both SSSE3 (extended feature query) and SSE3; otherwise
+ * prims is not touched. */
+static inline void primitives_init_sign_ssse3(primitives_t* WINPR_RESTRICT prims)
+{
+	if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
+	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+	{
+		primitives_init_sign_ssse3_int(prims);
+	}
+}
+
+#endif
@@ -0,0 +1,455 @@
+/* primitives.c
+ * This code queries processor features and calls the init/deinit routines.
+ * vi:ts=4 sw=4
+ *
+ * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Copyright 2019 David Fort <contact@hardening-consulting.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <string.h>
+#include <stdlib.h>
+
+#include <winpr/synch.h>
+#include <winpr/sysinfo.h>
+#include <winpr/crypto.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+#include <freerdp/log.h>
+#define TAG FREERDP_TAG("primitives")
+
+/* hints to know which kind of primitives to use */
+static primitive_hints primitivesHints = PRIMITIVES_AUTODETECT;
+static BOOL primitives_init_optimized(primitives_t* prims);
+
+/* Remember hints in the file-level primitivesHints variable; the stored value is what
+ * primitives_get_hints() reports and what the auto-init callback passes on. */
+void primitives_set_hints(primitive_hints hints)
+{
+	const primitive_hints requested = hints;
+
+	primitivesHints = requested;
+}
+
+/* Report the hints currently stored in the file-level primitivesHints variable
+ * (PRIMITIVES_AUTODETECT unless primitives_set_hints() changed it). */
+primitive_hints primitives_get_hints(void)
+{
+	const primitive_hints current = primitivesHints;
+
+	return current;
+}
+
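+/* Static singleton primitives tables (generic, CPU optimized, GPU and the auto-selected
+ * one), each declared together with an INIT_ONCE guard for one-time initialization. */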
+static primitives_t pPrimitivesGeneric = WINPR_C_ARRAY_INIT;
+static INIT_ONCE generic_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
+
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
+static primitives_t pPrimitivesCpu = WINPR_C_ARRAY_INIT;
+static INIT_ONCE cpu_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
+
+#endif
+#if defined(WITH_OPENCL)
+static primitives_t pPrimitivesGpu = WINPR_C_ARRAY_INIT;
+static INIT_ONCE gpu_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
+
+#endif
+
+static INIT_ONCE auto_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
+
+static primitives_t pPrimitives = WINPR_C_ARRAY_INIT;
+
+/* ------------------------------------------------------------------------- */
+/* Fill prims with the generic implementations of every primitive category by calling the
+ * per-category primitives_init_*() routines, and clear the uninit hook.
+ * Always returns TRUE. */
+static BOOL primitives_init_generic(primitives_t* prims)
+{
+	primitives_init_add(prims);
+	primitives_init_andor(prims);
+	primitives_init_alphaComp(prims);
+	primitives_init_copy(prims);
+	primitives_init_set(prims);
+	primitives_init_shift(prims);
+	primitives_init_sign(prims);
+	primitives_init_colors(prims);
+	primitives_init_YCoCg(prims);
+	primitives_init_YUV(prims);
+	/* the generic backend has no resources to release */
+	prims->uninit = nullptr;
+	return TRUE;
+}
+
+/* INIT_ONCE style callback: initializes the static pPrimitivesGeneric table with the
+ * generic primitives. None of the callback arguments are used. */
+static BOOL CALLBACK primitives_init_generic_cb(PINIT_ONCE once, PVOID param, PVOID* context)
+{
+	WINPR_UNUSED(context);
+	WINPR_UNUSED(param);
+	WINPR_UNUSED(once);
+
+	const BOOL initialized = primitives_init_generic(&pPrimitivesGeneric);
+	return initialized;
+}
+
+/* Fill prims for the CPU optimized backend. The generic routines are always installed
+ * first; when HAVE_CPU_OPTIMIZED_PRIMITIVES is defined the per-category *_opt initializers
+ * run afterwards and PRIM_FLAGS_HAVE_EXTCPU is set in prims->flags.
+ * The return value of primitives_init_generic() is not inspected; this function always
+ * returns TRUE. */
+static BOOL primitives_init_optimized(primitives_t* prims)
+{
+	primitives_init_generic(prims);
+
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
+	primitives_init_add_opt(prims);
+	primitives_init_andor_opt(prims);
+	primitives_init_alphaComp_opt(prims);
+	primitives_init_copy_opt(prims);
+	primitives_init_set_opt(prims);
+	primitives_init_shift_opt(prims);
+	primitives_init_sign_opt(prims);
+	primitives_init_colors_opt(prims);
+	primitives_init_YCoCg_opt(prims);
+	primitives_init_YUV_opt(prims);
+	prims->flags |= PRIM_FLAGS_HAVE_EXTCPU;
+#endif
+	return TRUE;
+}
+
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) && defined(WITH_OPENCL)
+/* Working set of the YUV420 -> RGB micro benchmark used to pick the fastest backend. */
+typedef struct
+{
+	BYTE* channels[3];   /* three heap allocated input planes, filled with random bytes */
+	UINT32 steps[3];     /* row stride of each input plane (set to the ROI width) */
+	prim_size_t roi;     /* benchmark image size (1024 x 768) */
+	BYTE* outputBuffer;  /* heap allocated destination image */
+	UINT32 outputStride; /* destination row stride (ROI width * 4) */
+	UINT32 testedFormat; /* destination pixel format (PIXEL_FORMAT_BGRA32) */
+} primitives_YUV_benchmark;
+
+/* Release the buffers owned by bench (three input planes and the output image) and zero
+ * the whole structure. The structure itself is not freed. A null bench is ignored. */
+static void primitives_YUV_benchmark_free(primitives_YUV_benchmark* bench)
+{
+	if (bench == nullptr)
+		return;
+
+	for (size_t plane = 0; plane < 3; plane++)
+		free(bench->channels[plane]);
+
+	free(bench->outputBuffer);
+	memset(bench, 0, sizeof(*bench));
+}
+
+/* Prepare ret for the YUV benchmark: a 1024 x 768 region, a zeroed BGRA32 output buffer
+ * and three input planes of width * height bytes filled with random data.
+ *
+ * Returns ret on success. Returns nullptr when ret is null or when an allocation or the
+ * random fill fails; in the failure case everything allocated so far is released and ret
+ * is left zeroed. (The failure path used to return ret itself, so the caller's null check
+ * could never detect the error and the benchmark would run on null buffers.)
+ */
+static primitives_YUV_benchmark* primitives_YUV_benchmark_init(primitives_YUV_benchmark* ret)
+{
+	if (!ret)
+		return nullptr;
+
+	memset(ret, 0, sizeof(primitives_YUV_benchmark));
+
+	prim_size_t* roi = &ret->roi;
+	roi->width = 1024;
+	roi->height = 768;
+	ret->outputStride = roi->width * 4;
+	ret->testedFormat = PIXEL_FORMAT_BGRA32;
+
+	ret->outputBuffer = calloc(ret->outputStride, roi->height);
+	if (!ret->outputBuffer)
+		goto fail;
+
+	for (int i = 0; i < 3; i++)
+	{
+		/* store the plane right away so the fail path can release it */
+		BYTE* buf = ret->channels[i] = calloc(roi->width, roi->height);
+		if (!buf)
+			goto fail;
+
+		if (winpr_RAND(buf, 1ull * roi->width * roi->height) < 0)
+			goto fail;
+		ret->steps[i] = roi->width;
+	}
+
+	return ret;
+
+fail:
+	primitives_YUV_benchmark_free(ret);
+	/* report the failure; ret has been zeroed by the free routine */
+	return nullptr;
+}
+
+/* Run the YUV420 -> RGB conversion of prims repeatedly on the buffers in bench for
+ * runTime GetTickCount64() ticks and count the completed conversions in *computations.
+ * One untimed warm-up conversion is done first. Returns FALSE as soon as a conversion
+ * does not report PRIMITIVES_SUCCESS, TRUE otherwise. */
+static BOOL primitives_YUV_benchmark_run(primitives_YUV_benchmark* bench, primitives_t* prims,
+                                         UINT64 runTime, UINT32* computations)
+{
+	*computations = 0;
+
+	const BYTE* planes[3] = { bench->channels[0], bench->channels[1], bench->channels[2] };
+
+	/* warm-up pass so caches and lazy setup do not distort the measurement */
+	const pstatus_t warmup =
+	    prims->YUV420ToRGB_8u_P3AC4R(planes, bench->steps, bench->outputBuffer,
+	                                 bench->outputStride, bench->testedFormat, &bench->roi);
+	if (warmup != PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	/* timed section: convert until the deadline has passed */
+	const ULONGLONG deadline = GetTickCount64() + runTime;
+	while (GetTickCount64() < deadline)
+	{
+		const pstatus_t rc =
+		    prims->YUV420ToRGB_8u_P3AC4R(planes, bench->steps, bench->outputBuffer,
+		                                 bench->outputStride, bench->testedFormat, &bench->roi);
+		if (rc != PRIMITIVES_SUCCESS)
+			return FALSE;
+
+		*computations += 1;
+	}
+
+	return TRUE;
+}
+#endif
+
+/* Pick the primitives backend to use and copy its table into *prims.
+ *
+ * The candidate list always contains the generic backend, plus the CPU optimized and/or
+ * OpenCL backend when they are compiled in. When both optional backends are compiled in,
+ * every candidate runs a short YUV benchmark and the one with the highest iteration count
+ * wins. Otherwise no benchmark is run and a fixed candidate is taken: the one optional
+ * backend that exists, or the generic one when neither does.
+ *
+ * Returns TRUE when a backend was selected. On the `out` failure path *prims receives a
+ * copy of pPrimitivesGeneric and FALSE is returned.
+ * NOTE(review): the early `return FALSE` statements below do not pass through `out`, so
+ * *prims is left untouched on those paths - confirm callers handle that.
+ */
+static BOOL primitives_autodetect_best(primitives_t* prims)
+{
+	BOOL ret = FALSE;
+	struct prim_benchmark
+	{
+		const char* name;
+		primitives_t* prims;
+		primitive_hints flags;
+		UINT32 count;
+	};
+
+	/* candidates in ascending order of specialization; prims/count are filled in below */
+	struct prim_benchmark testcases[] = {
+		{ "generic", nullptr, PRIMITIVES_PURE_SOFT, 0 },
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
+		{ "optimized", nullptr, PRIMITIVES_ONLY_CPU, 0 },
+#endif
+#if defined(WITH_OPENCL)
+		{ "opencl", nullptr, PRIMITIVES_ONLY_GPU, 0 },
+#endif
+	};
+	const struct prim_benchmark* best = nullptr;
+
+#if !defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) || !defined(WITH_OPENCL)
+	{
+		/* at most one optional backend is compiled in: take it without benchmarking */
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) || defined(WITH_OPENCL)
+		struct prim_benchmark* cur = &testcases[1];
+#else
+		struct prim_benchmark* cur = &testcases[0];
+#endif
+		cur->prims = primitives_get_by_type(cur->flags);
+		if (!cur->prims)
+		{
+			WLog_WARN(TAG, "Failed to initialize %s primitives", cur->name);
+			return FALSE;
+		}
+		WLog_DBG(TAG, "primitives benchmark: only one backend, skipping...");
+		best = cur;
+	}
+#else
+	{
+		/* both optional backends exist: benchmark all candidates */
+		UINT64 benchDuration = 150; /* 150 ms */
+		primitives_YUV_benchmark bench = WINPR_C_ARRAY_INIT;
+		primitives_YUV_benchmark* yuvBench = primitives_YUV_benchmark_init(&bench);
+		if (!yuvBench)
+			return FALSE;
+
+		WLog_DBG(TAG, "primitives benchmark result:");
+		for (size_t x = 0; x < ARRAYSIZE(testcases); x++)
+		{
+			struct prim_benchmark* cur = &testcases[x];
+			cur->prims = primitives_get_by_type(cur->flags);
+			if (!cur->prims)
+			{
+				/* a backend that cannot be initialized is skipped, not fatal */
+				WLog_WARN(TAG, "Failed to initialize %s primitives", cur->name);
+				continue;
+			}
+			if (!primitives_YUV_benchmark_run(yuvBench, cur->prims, benchDuration, &cur->count))
+			{
+				WLog_WARN(TAG, "error running %s YUV bench", cur->name);
+				continue;
+			}
+
+			WLog_DBG(TAG, " * %s= %" PRIu32, cur->name, cur->count);
+			/* strictly greater: on a tie the earlier candidate is kept */
+			if (!best || (best->count < cur->count))
+				best = cur;
+		}
+		primitives_YUV_benchmark_free(yuvBench);
+	}
+#endif
+
+	if (!best)
+	{
+		WLog_ERR(TAG, "No primitives to test, aborting.");
+		goto out;
+	}
+	/* finally compute the results */
+	*prims = *best->prims;
+
+	WLog_DBG(TAG, "primitives autodetect, using %s", best->name);
+	ret = TRUE;
+out:
+	/* fall back to a copy of the generic table when nothing could be selected */
+	if (!ret)
+		*prims = pPrimitivesGeneric;
+
+	return ret;
+}
+
+#if defined(WITH_OPENCL)
+/* One-time initialization callback: fills pPrimitivesGpu via primitives_init_opencl().
+ * None of the callback arguments are used. */
+static BOOL CALLBACK primitives_init_gpu_cb(PINIT_ONCE once, PVOID param, PVOID* context)
+{
+	const BOOL initialized = primitives_init_opencl(&pPrimitivesGpu);
+
+	WINPR_UNUSED(context);
+	WINPR_UNUSED(param);
+	WINPR_UNUSED(once);
+
+	return initialized;
+}
+#endif
+
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
+/* One-time initialization callback: fills pPrimitivesCpu via primitives_init_optimized().
+ * None of the callback arguments are used. */
+static BOOL CALLBACK primitives_init_cpu_cb(PINIT_ONCE once, PVOID param, PVOID* context)
+{
+	const BOOL initialized = primitives_init_optimized(&pPrimitivesCpu);
+
+	WINPR_UNUSED(context);
+	WINPR_UNUSED(param);
+	WINPR_UNUSED(once);
+
+	return initialized;
+}
+#endif
+
+/* One-time initialization callback: fills the active table pPrimitives according to
+ * the file-level primitivesHints setting. None of the callback arguments are used. */
+static BOOL CALLBACK primitives_auto_init_cb(PINIT_ONCE once, PVOID param, PVOID* context)
+{
+	const BOOL initialized = primitives_init(&pPrimitives, primitivesHints);
+
+	WINPR_UNUSED(context);
+	WINPR_UNUSED(param);
+	WINPR_UNUSED(once);
+
+	return initialized;
+}
+
+/**
+ * Fill @p p with the function table selected by @p hints.
+ *
+ * @param p     destination table
+ * @param hints requested backend, or PRIMITIVES_AUTODETECT to choose automatically
+ * @return TRUE when @p p was filled, FALSE when the request could not be served
+ */
+BOOL primitives_init(primitives_t* p, primitive_hints hints)
+{
+	if (hints == PRIMITIVES_AUTODETECT)
+		return primitives_autodetect_best(p);
+
+	if (hints == PRIMITIVES_PURE_SOFT)
+	{
+		*p = pPrimitivesGeneric;
+		return TRUE;
+	}
+
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
+	if (hints == PRIMITIVES_ONLY_CPU)
+	{
+		*p = pPrimitivesCpu;
+		return TRUE;
+	}
+#endif
+
+#if defined(WITH_OPENCL)
+	/* A PRIMITIVES_ONLY_CPU request only gets here when the CPU optimized
+	 * backend is not compiled in; it is then served by the GPU table.
+	 * NOTE(review): confirm that handing out GPU primitives for an ONLY_CPU
+	 * request is intended. */
+	if ((hints == PRIMITIVES_ONLY_CPU) || (hints == PRIMITIVES_ONLY_GPU))
+	{
+		*p = pPrimitivesGpu;
+		return TRUE;
+	}
+#endif
+
+	/* Reached for unknown values and for backends that are not compiled in. */
+	WLog_ERR(TAG, "unknown hint %u", hints);
+	return FALSE;
+}
+
+/* Invoke the optional uninit hook of every compiled-in backend table,
+ * in the order GPU, CPU optimized, generic. */
+void primitives_uninit(void)
+{
+	primitives_t* const backends[] = {
+#if defined(WITH_OPENCL)
+		&pPrimitivesGpu,
+#endif
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
+		&pPrimitivesCpu,
+#endif
+		&pPrimitivesGeneric,
+	};
+
+	for (size_t i = 0; i < ARRAYSIZE(backends); i++)
+	{
+		primitives_t* backend = backends[i];
+		if (backend->uninit)
+			backend->uninit();
+	}
+}
+
+/* ------------------------------------------------------------------------- */
+/* Run the one-time initializers in dependency order (generic, CPU optimized,
+ * OpenCL, then the active table) and stop at the first one that fails. */
+static void setup(void)
+{
+	BOOL ok = InitOnceExecuteOnce(&generic_primitives_InitOnce, primitives_init_generic_cb,
+	                              nullptr, nullptr);
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
+	ok = ok &&
+	     InitOnceExecuteOnce(&cpu_primitives_InitOnce, primitives_init_cpu_cb, nullptr, nullptr);
+#endif
+#if defined(WITH_OPENCL)
+	ok = ok &&
+	     InitOnceExecuteOnce(&gpu_primitives_InitOnce, primitives_init_gpu_cb, nullptr, nullptr);
+#endif
+	if (!ok)
+		return;
+
+	/* The result of the last step is not reported to the caller. */
+	(void)InitOnceExecuteOnce(&auto_primitives_InitOnce, primitives_auto_init_cb, nullptr,
+	                          nullptr);
+}
+
+/* Return the active primitives table after making sure the one-time setup ran.
+ * The pointer is returned even if setup() stopped early. */
+primitives_t* primitives_get(void)
+{
+	primitives_t* active = &pPrimitives;
+
+	setup();
+	return active;
+}
+
+/* Return the generic primitives table, initializing it on first use.
+ * Returns nullptr when that one-time initialization fails. */
+primitives_t* primitives_get_generic(void)
+{
+	const BOOL ready = InitOnceExecuteOnce(&generic_primitives_InitOnce,
+	                                       primitives_init_generic_cb, nullptr, nullptr);
+
+	return ready ? &pPrimitivesGeneric : nullptr;
+}
+
+/**
+ * Return the primitives table for the requested backend, initializing it on
+ * first use.
+ *
+ * A backend that is not compiled in degrades to the next one in the order
+ * GPU -> CPU optimized -> generic. Any other @p type yields the generic table.
+ *
+ * @param type requested backend
+ * @return the table, or nullptr when a required one-time initialization failed
+ */
+primitives_t* primitives_get_by_type(primitive_hints type)
+{
+	const BOOL genericReady = InitOnceExecuteOnce(&generic_primitives_InitOnce,
+	                                              primitives_init_generic_cb, nullptr, nullptr);
+	if (!genericReady)
+		return nullptr;
+
+#if defined(WITH_OPENCL)
+	if (type == PRIMITIVES_ONLY_GPU)
+	{
+		const BOOL gpuReady = InitOnceExecuteOnce(&gpu_primitives_InitOnce,
+		                                          primitives_init_gpu_cb, nullptr, nullptr);
+		return gpuReady ? &pPrimitivesGpu : nullptr;
+	}
+#endif
+
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
+	/* PRIMITIVES_ONLY_GPU only gets here when OpenCL is not compiled in. */
+	if ((type == PRIMITIVES_ONLY_GPU) || (type == PRIMITIVES_ONLY_CPU))
+	{
+		const BOOL cpuReady = InitOnceExecuteOnce(&cpu_primitives_InitOnce,
+		                                          primitives_init_cpu_cb, nullptr, nullptr);
+		return cpuReady ? &pPrimitivesCpu : nullptr;
+	}
+#endif
+
+	return &pPrimitivesGeneric;
+}
+
+/* Return the flags field of the given primitives table; p must not be null. */
+DWORD primitives_flags(primitives_t* p)
+{
+	const DWORD flags = p->flags;
+
+	return flags;
+}
+
+/* Map an avc444_frame_type value to its enumerator name;
+ * unknown values yield "INVALID_FRAME_TYPE". */
+const char* primitives_avc444_frame_type_str(avc444_frame_type type)
+{
+	if (type == AVC444_LUMA)
+		return "AVC444_LUMA";
+	if (type == AVC444_CHROMAv1)
+		return "AVC444_CHROMAv1";
+	if (type == AVC444_CHROMAv2)
+		return "AVC444_CHROMAv2";
+
+	return "INVALID_FRAME_TYPE";
+}
+
+/* Map a primitive_hints value to its enumerator name;
+ * unknown values yield "PRIMITIVES_UNKNOWN". */
+const char* primtives_hint_str(primitive_hints hint)
+{
+	const char* name = "PRIMITIVES_UNKNOWN";
+
+	switch (hint)
+	{
+		case PRIMITIVES_PURE_SOFT:
+			name = "PRIMITIVES_PURE_SOFT";
+			break;
+		case PRIMITIVES_ONLY_CPU:
+			name = "PRIMITIVES_ONLY_CPU";
+			break;
+		case PRIMITIVES_ONLY_GPU:
+			name = "PRIMITIVES_ONLY_GPU";
+			break;
+		case PRIMITIVES_AUTODETECT:
+			name = "PRIMITIVES_AUTODETECT";
+			break;
+		default:
+			break;
+	}
+
+	return name;
+}
@@ -0,0 +1,383 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized YCoCg<->RGB conversion operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#include "prim_YCoCg.h"
+
+#include "prim_internal.h"
+#include "prim_templates.h"
+
+#if defined(SSE_AVX_INTRINSICS_ENABLED)
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+static primitives_t* generic = nullptr;
+
+/* ------------------------------------------------------------------------- */
+/*
+ * SSSE3 YCoCg -> RGB conversion, "inverted" channel order variant.
+ *
+ * Processes 8 pixels (two 128-bit registers) per loop iteration; the
+ * remaining pixels of each row are handed to the generic implementation.
+ * The whole call is delegated to the generic implementation when the image
+ * is narrower than 8 pixels or pDst is not 4-byte aligned.
+ *
+ * srcStep/dstStep are row strides in bytes and must cover at least
+ * width 32-bit pixels (asserted). shift must be >= 1 because the chroma
+ * values are shifted left by (shift - 1). When withAlpha is FALSE the output
+ * alpha bytes are set to 0xFF, otherwise the source alpha is copied.
+ *
+ * Returns PRIMITIVES_SUCCESS or the error reported by the generic fallback.
+ */
+static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
+                                                  BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
+                                                  UINT32 dstStep, UINT32 width, UINT32 height,
+                                                  UINT8 shift, BOOL withAlpha)
+{
+	const BYTE* sptr = pSrc;
+	BYTE* dptr = pDst;
+
+	WINPR_ASSERT(srcStep / sizeof(UINT32) >= width);
+	WINPR_ASSERT(dstStep / sizeof(UINT32) >= width);
+	/* Bytes between the end of one row's pixels and the start of the next row. */
+	const size_t sRowBump = srcStep - width * sizeof(UINT32);
+	const size_t dRowBump = dstStep - width * sizeof(UINT32);
+	/* Shift left by "shift" and divide by two is the same as shift
+	 * left by "shift-1".
+	 */
+	int dataShift = shift - 1;
+	BYTE mask = (BYTE)(0xFFU << dataShift);
+
+	/* Let's say the data is of the form:
+	 * a0y0o0g0 a1y1o1g1 a2y2o2g2...
+	 * Apply:
+	 * |R|   | 1  1/2 -1/2 |   |y|
+	 * |G| = | 1  0    1/2 | * |o|
+	 * |B|   | 1 -1/2 -1/2 |   |g|
+	 * where Y is 8-bit unsigned and o & g are 8-bit signed.
+	 */
+
+	if ((width < 8) || (ULONG_PTR)dptr & 0x03)
+	{
+		/* Too small, or we'll never hit a 16-byte boundary.  Punt. */
+		return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
+		                                   DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
+		                                   width, height, shift, withAlpha);
+	}
+
+	for (UINT32 h = 0; h < height; h++)
+	{
+		UINT32 w = width;
+
+		while (w >= 8)
+		{
+			__m128i R0;
+			__m128i R1;
+			__m128i R2;
+			__m128i R3;
+			__m128i R4;
+			__m128i R5;
+			__m128i R6;
+			__m128i R7;
+
+			R0 = LOAD_SI128(sptr);
+			sptr += (128 / 8);
+			R1 = LOAD_SI128(sptr);
+			sptr += (128 / 8);
+
+			/* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
+			/* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
+			/* Shuffle to pack all the like types together. */
+			R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
+			R3 = _mm_shuffle_epi8(R0, R2);
+			R4 = _mm_shuffle_epi8(R1, R2);
+			/* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
+			/* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
+			R5 = _mm_unpackhi_epi32(R3, R4);
+			R6 = _mm_unpacklo_epi32(R3, R4);
+
+			/* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
+			/* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+			/* Save alphas aside */
+			if (withAlpha)
+				R7 = _mm_unpackhi_epi64(R5, R5);
+			else
+				R7 = mm_set1_epu32(0xFFFFFFFFU);
+
+			/* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
+			/* Expand Y's from 8-bit unsigned to 16-bit signed. */
+			R1 = mm_set1_epu32(0);
+			R0 = _mm_unpacklo_epi8(R5, R1);
+			/* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
+			/* Shift Co's and Cg's by (shift-1).  -1 covers division by two.
+			 * Note: this must be done before sign-conversion.
+			 * Note also there is no slli_epi8, so we have to use a 16-bit
+			 * version and then mask.
+			 */
+			R6 = _mm_slli_epi16(R6, dataShift);
+			R1 = mm_set1_epu8(mask);
+			R6 = _mm_and_si128(R6, R1);
+			/* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+			/* Expand Co's from 8-bit signed to 16-bit signed */
+			R1 = _mm_unpackhi_epi8(R6, R6);
+			R1 = _mm_srai_epi16(R1, 8);
+			/* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
+			/* Expand Cg's from 8-bit signed to 16-bit signed */
+			R2 = _mm_unpacklo_epi8(R6, R6);
+			R2 = _mm_srai_epi16(R2, 8);
+			/* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
+			/* Get Y - halfCg and save */
+			R6 = _mm_subs_epi16(R0, R2);
+			/* R = (Y-halfCg) + halfCo */
+			R3 = _mm_adds_epi16(R6, R1);
+			/* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
+			/* G = Y + Cg(/2) */
+			R4 = _mm_adds_epi16(R0, R2);
+			/* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
+			/* B = (Y-halfCg) - Co(/2) */
+			R5 = _mm_subs_epi16(R6, R1);
+			/* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
+			/* Repack R's & B's.  */
+			R0 = _mm_packus_epi16(R3, R5);
+			/* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */
+			/* Repack G's. */
+			R1 = _mm_packus_epi16(R4, R4);
+			/* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
+			/* And add the A's. */
+			R1 = _mm_unpackhi_epi64(R1, R7);
+			/* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */
+			/* Now do interleaving again. */
+			R2 = _mm_unpacklo_epi8(R0, R1);
+			/* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
+			R3 = _mm_unpackhi_epi8(R0, R1);
+			/* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
+			R4 = _mm_unpacklo_epi16(R2, R3);
+			/* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
+			R5 = _mm_unpackhi_epi16(R2, R3);
+			/* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
+			STORE_SI128(dptr, R4);
+			dptr += (128 / 8);
+			STORE_SI128(dptr, R5);
+			dptr += (128 / 8);
+			w -= 8;
+		}
+
+		/* Handle any remainder pixels. */
+		if (w > 0)
+		{
+			pstatus_t status = 0;
+			/* One row of w pixels, starting at the current positions. */
+			status = generic->YCoCgToRGB_8u_AC4R(
+			    sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
+			    WINPR_ASSERTING_INT_CAST(INT32, dstStep), w, 1, shift, withAlpha);
+
+			if (status != PRIMITIVES_SUCCESS)
+				return status;
+
+			sptr += w * sizeof(UINT32);
+			dptr += w * sizeof(UINT32);
+		}
+
+		/* Skip the row padding to reach the next row. */
+		sptr += sRowBump;
+		dptr += dRowBump;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/*
+ * SSSE3 YCoCg -> RGB conversion, "non-inverted" channel order variant.
+ *
+ * Identical to ssse3_YCoCgRToRGB_8u_AC4R_invert except for the operand order
+ * of the pack instruction that combines the two outer colour channels.
+ *
+ * Processes 8 pixels (two 128-bit registers) per loop iteration; the
+ * remaining pixels of each row are handed to the generic implementation.
+ * The whole call is delegated to the generic implementation when the image
+ * is narrower than 8 pixels or pDst is not 4-byte aligned.
+ *
+ * srcStep/dstStep are row strides in bytes and must cover at least
+ * width 32-bit pixels (asserted, as in the inverted variant). shift must be
+ * >= 1 because the chroma values are shifted left by (shift - 1). When
+ * withAlpha is FALSE the output alpha bytes are set to 0xFF, otherwise the
+ * source alpha is copied.
+ *
+ * Returns PRIMITIVES_SUCCESS or the error reported by the generic fallback.
+ */
+static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc,
+                                                     UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
+                                                     UINT32 DstFormat, UINT32 dstStep, UINT32 width,
+                                                     UINT32 height, UINT8 shift, BOOL withAlpha)
+{
+	const BYTE* sptr = pSrc;
+	BYTE* dptr = pDst;
+
+	/* Without these checks a too small stride would make the unsigned
+	 * row bump below wrap around to a huge value. */
+	WINPR_ASSERT(srcStep / sizeof(UINT32) >= width);
+	WINPR_ASSERT(dstStep / sizeof(UINT32) >= width);
+	/* Bytes between the end of one row's pixels and the start of the next row. */
+	const size_t sRowBump = srcStep - width * sizeof(UINT32);
+	const size_t dRowBump = dstStep - width * sizeof(UINT32);
+	/* Shift left by "shift" and divide by two is the same as shift
+	 * left by "shift-1".
+	 */
+	int dataShift = shift - 1;
+	BYTE mask = (BYTE)(0xFFU << dataShift);
+
+	/* Let's say the data is of the form:
+	 * a0y0o0g0 a1y1o1g1 a2y2o2g2...
+	 * Apply:
+	 * |R|   | 1  1/2 -1/2 |   |y|
+	 * |G| = | 1  0    1/2 | * |o|
+	 * |B|   | 1 -1/2 -1/2 |   |g|
+	 * where Y is 8-bit unsigned and o & g are 8-bit signed.
+	 */
+
+	if ((width < 8) || (ULONG_PTR)dptr & 0x03)
+	{
+		/* Too small, or we'll never hit a 16-byte boundary.  Punt. */
+		return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
+		                                   DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
+		                                   width, height, shift, withAlpha);
+	}
+
+	for (UINT32 h = 0; h < height; h++)
+	{
+		UINT32 w = width;
+
+		while (w >= 8)
+		{
+			__m128i R7;
+
+			/* Load 8 source pixels into two 128-bit registers. */
+			__m128i R0 = LOAD_SI128(sptr);
+			sptr += (128 / 8);
+			__m128i R1 = LOAD_SI128(sptr);
+			sptr += (128 / 8);
+
+			/* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
+			/* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
+			/* Shuffle to pack all the like types together. */
+			__m128i R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
+			__m128i R3 = _mm_shuffle_epi8(R0, R2);
+			__m128i R4 = _mm_shuffle_epi8(R1, R2);
+			/* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
+			/* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
+			__m128i R5 = _mm_unpackhi_epi32(R3, R4);
+			__m128i R6 = _mm_unpacklo_epi32(R3, R4);
+
+			/* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
+			/* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+			/* Save alphas aside */
+			if (withAlpha)
+				R7 = _mm_unpackhi_epi64(R5, R5);
+			else
+				R7 = mm_set1_epu32(0xFFFFFFFFU);
+
+			/* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
+			/* Expand Y's from 8-bit unsigned to 16-bit signed. */
+			R1 = mm_set1_epu32(0);
+			R0 = _mm_unpacklo_epi8(R5, R1);
+			/* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
+			/* Shift Co's and Cg's by (shift-1).  -1 covers division by two.
+			 * Note: this must be done before sign-conversion.
+			 * Note also there is no slli_epi8, so we have to use a 16-bit
+			 * version and then mask.
+			 */
+			R6 = _mm_slli_epi16(R6, dataShift);
+			R1 = mm_set1_epu8(mask);
+			R6 = _mm_and_si128(R6, R1);
+			/* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+			/* Expand Co's from 8-bit signed to 16-bit signed */
+			R1 = _mm_unpackhi_epi8(R6, R6);
+			R1 = _mm_srai_epi16(R1, 8);
+			/* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
+			/* Expand Cg's from 8-bit signed to 16-bit signed */
+			R2 = _mm_unpacklo_epi8(R6, R6);
+			R2 = _mm_srai_epi16(R2, 8);
+			/* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
+			/* Get Y - halfCg and save */
+			R6 = _mm_subs_epi16(R0, R2);
+			/* R = (Y-halfCg) + halfCo */
+			R3 = _mm_adds_epi16(R6, R1);
+			/* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
+			/* G = Y + Cg(/2) */
+			R4 = _mm_adds_epi16(R0, R2);
+			/* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
+			/* B = (Y-halfCg) - Co(/2) */
+			R5 = _mm_subs_epi16(R6, R1);
+			/* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
+			/* Repack R's & B's.  */
+			/* This line is the only diff between inverted and non-inverted.
+			 * Unfortunately, it would be expensive to check "inverted"
+			 * every time through this loop.
+			 */
+			R0 = _mm_packus_epi16(R5, R3);
+			/* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */
+			/* Repack G's. */
+			R1 = _mm_packus_epi16(R4, R4);
+			/* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
+			/* And add the A's. */
+			R1 = _mm_unpackhi_epi64(R1, R7);
+			/* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */
+			/* Now do interleaving again. */
+			R2 = _mm_unpacklo_epi8(R0, R1);
+			/* R2 = G7R7G6R6 G5R5G4R4 G3R3G2R2 G1R1G0R0 */
+			R3 = _mm_unpackhi_epi8(R0, R1);
+			/* R3 = A7B7A6B6 A5B5A4B4 A3B3A2B2 A1B1A0B0 */
+			R4 = _mm_unpacklo_epi16(R2, R3);
+			/* R4 = A3B3G3R3 A2B2G2R2 A1B1G1R1 A0B0G0R0 */
+			R5 = _mm_unpackhi_epi16(R2, R3);
+			/* R5 = A7B7G7R7 A6B6G6R6 A5B5G5R5 A4B4G4R4 */
+			STORE_SI128(dptr, R4);
+			dptr += (128 / 8);
+			STORE_SI128(dptr, R5);
+			dptr += (128 / 8);
+			w -= 8;
+		}
+
+		/* Handle any remainder pixels. */
+		if (w > 0)
+		{
+			/* One row of w pixels, starting at the current positions. */
+			const pstatus_t status = generic->YCoCgToRGB_8u_AC4R(
+			    sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
+			    WINPR_ASSERTING_INT_CAST(INT32, dstStep), w, 1, shift, withAlpha);
+
+			if (status != PRIMITIVES_SUCCESS)
+				return status;
+
+			sptr += w * sizeof(UINT32);
+			dptr += w * sizeof(UINT32);
+		}
+
+		/* Skip the row padding to reach the next row. */
+		sptr += sRowBump;
+		dptr += dRowBump;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/*
+ * Entry point installed into the primitives table: selects the SSSE3 variant
+ * matching DstFormat (BGRX32/BGRA32 -> inverted, RGBX32/RGBA32 -> non-inverted)
+ * and forwards every other format to the generic implementation unchanged.
+ * The strides are only converted to unsigned (with assertion) on the SSSE3 paths.
+ */
+static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
+                                           BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
+                                           INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
+                                           BOOL withAlpha)
+{
+	const BOOL inverted =
+	    (DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32);
+	const BOOL straight =
+	    (DstFormat == PIXEL_FORMAT_RGBX32) || (DstFormat == PIXEL_FORMAT_RGBA32);
+
+	if (!inverted && !straight)
+	{
+		return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
+		                                   height, shift, withAlpha);
+	}
+
+	const UINT32 uSrcStep = WINPR_ASSERTING_INT_CAST(UINT32, srcStep);
+	const UINT32 uDstStep = WINPR_ASSERTING_INT_CAST(UINT32, dstStep);
+
+	if (inverted)
+	{
+		return ssse3_YCoCgRToRGB_8u_AC4R_invert(pSrc, uSrcStep, pDst, DstFormat, uDstStep, width,
+		                                        height, shift, withAlpha);
+	}
+
+	return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(pSrc, uSrcStep, pDst, DstFormat, uDstStep, width,
+	                                           height, shift, withAlpha);
+}
+
+#endif
+
+/* ------------------------------------------------------------------------- */
+/* Install the SSSE3 YCoCg conversion into prims when SIMD intrinsics are
+ * enabled at build time; otherwise only log and leave prims untouched. */
+void primitives_init_YCoCg_ssse3_int(primitives_t* WINPR_RESTRICT prims)
+{
+#if !defined(SSE_AVX_INTRINSICS_ENABLED)
+	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available");
+	WINPR_UNUSED(prims);
+#else
+	/* The generic table is kept for the fallback paths of the SSSE3 routines. */
+	generic = primitives_get_generic();
+
+	WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
+	prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
+#endif
+}
@@ -0,0 +1,187 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized add operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#include "prim_add.h"
+
+#include "prim_internal.h"
+#include "prim_templates.h"
+
+#if defined(SSE_AVX_INTRINSICS_ENABLED)
+#include <emmintrin.h>
+#include <pmmintrin.h>
+
+static primitives_t* generic = nullptr;
+
+/* ------------------------------------------------------------------------- */
+/* Defines sse3_add_16s by instantiating the SSE3_SSD_ROUTINE template
+ * (presumably from prim_templates.h - confirm). Arguments, in order: the name
+ * of the generated function, the element type, the generic routine used as
+ * fallback, the vector intrinsic (saturating 16-bit add) and the expression
+ * used to process a single element. */
+SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
+                 generic->add_16s(sptr1++, sptr2++, dptr++, 1))
+
+/*
+ * Saturating in-place addition of two INT16 buffers: every element of both
+ * buffers is replaced by the saturated sum of the two corresponding inputs.
+ *
+ * Buffers shorter than 16 elements, or a first buffer that is not 2-byte
+ * aligned, are handled entirely by the generic implementation. Otherwise the
+ * generic routine processes the few leading elements needed to bring
+ * pSrcDst1 to a 16-byte boundary, the bulk is processed 32 and then 8
+ * elements at a time with SSE, and the generic routine finishes the tail.
+ *
+ * pSrcDst1 / pSrcDst2: the two buffers, each holding ulen elements
+ * ulen:                number of INT16 elements
+ * Returns PRIMITIVES_SUCCESS or the error reported by the generic routine.
+ */
+static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
+                                      INT16* WINPR_RESTRICT pSrcDst2, UINT32 ulen)
+{
+	const int shifts = 2;
+	INT16* dptr1 = pSrcDst1;
+	INT16* dptr2 = pSrcDst2;
+
+	if (ulen < 16) /* pointless if too small */
+		return generic->add_16s_inplace(pSrcDst1, pSrcDst2, ulen);
+
+	UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
+	if ((ULONG_PTR)pSrcDst1 & offBeatMask)
+	{
+		/* Incrementing the pointer skips over 16-byte boundary. */
+		return generic->add_16s_inplace(pSrcDst1, pSrcDst2, ulen);
+	}
+
+	/* Elements still to be processed; reduced by every stage below. */
+	size_t len = ulen;
+
+	/* Get to the 16-byte boundary now. */
+	const size_t misalignment = (UINT_PTR)dptr1 & 0xf;
+	if (misalignment != 0)
+	{
+		/* misalignment is even (checked above), so this is 1..7 elements,
+		 * which is always less than ulen (>= 16). */
+		const UINT32 add = (UINT32)((16 - misalignment) / sizeof(INT16));
+		pstatus_t status = generic->add_16s_inplace(dptr1, dptr2, add);
+		if (status != PRIMITIVES_SUCCESS)
+			return status;
+		dptr1 += add;
+		dptr2 += add;
+		/* The lead-in elements are done; not accounting for them would make
+		 * the loops below run past the end of both buffers. */
+		len -= add;
+	}
+
+	/* Use 4 128-bit SSE registers (32 elements per iteration).
+	 * dptr2 may still be unaligned here; the same LOAD_SI128/STORE_SI128
+	 * helpers are used for both pointers, so a single loop serves both cases
+	 * (assumes these helpers tolerate unaligned addresses - TODO confirm). */
+	size_t count = len >> (7 - shifts);
+	len -= count << (7 - shifts);
+	while (count--)
+	{
+		const __m128i* vsptr1 = (const __m128i*)dptr1;
+		const __m128i* vsptr2 = (const __m128i*)dptr2;
+		__m128i* vdptr1 = (__m128i*)dptr1;
+		__m128i* vdptr2 = (__m128i*)dptr2;
+
+		__m128i xmm0 = LOAD_SI128(vsptr1++);
+		__m128i xmm1 = LOAD_SI128(vsptr1++);
+		__m128i xmm2 = LOAD_SI128(vsptr1++);
+		__m128i xmm3 = LOAD_SI128(vsptr1++);
+		__m128i xmm4 = LOAD_SI128(vsptr2++);
+		__m128i xmm5 = LOAD_SI128(vsptr2++);
+		__m128i xmm6 = LOAD_SI128(vsptr2++);
+		__m128i xmm7 = LOAD_SI128(vsptr2++);
+
+		xmm0 = _mm_adds_epi16(xmm0, xmm4);
+		xmm1 = _mm_adds_epi16(xmm1, xmm5);
+		xmm2 = _mm_adds_epi16(xmm2, xmm6);
+		xmm3 = _mm_adds_epi16(xmm3, xmm7);
+
+		/* The sums are written back to both buffers. */
+		STORE_SI128(vdptr1++, xmm0);
+		STORE_SI128(vdptr1++, xmm1);
+		STORE_SI128(vdptr1++, xmm2);
+		STORE_SI128(vdptr1++, xmm3);
+
+		STORE_SI128(vdptr2++, xmm0);
+		STORE_SI128(vdptr2++, xmm1);
+		STORE_SI128(vdptr2++, xmm2);
+		STORE_SI128(vdptr2++, xmm3);
+
+		dptr1 = (INT16*)vdptr1;
+		dptr2 = (INT16*)vdptr2;
+	}
+
+	/* Use a single 128-bit SSE register (8 elements per iteration). */
+	count = len >> (5 - shifts);
+	len -= count << (5 - shifts);
+	while (count--)
+	{
+		const __m128i* vsptr1 = (const __m128i*)dptr1;
+		const __m128i* vsptr2 = (const __m128i*)dptr2;
+		__m128i* vdptr1 = (__m128i*)dptr1;
+		__m128i* vdptr2 = (__m128i*)dptr2;
+
+		__m128i xmm0 = LOAD_SI128(vsptr1);
+		__m128i xmm1 = LOAD_SI128(vsptr2);
+
+		xmm0 = _mm_adds_epi16(xmm0, xmm1);
+
+		STORE_SI128(vdptr1++, xmm0);
+		STORE_SI128(vdptr2++, xmm0);
+
+		dptr1 = (INT16*)vdptr1;
+		dptr2 = (INT16*)vdptr2;
+	}
+
+	/* Finish off the remainder. */
+	if (len > 0)
+		return generic->add_16s_inplace(dptr1, dptr2, WINPR_ASSERTING_INT_CAST(uint32_t, len));
+
+	return PRIMITIVES_SUCCESS;
+}
+#endif
+
+/* ------------------------------------------------------------------------- */
+/* Install the SSE add routines into prims when SIMD intrinsics are enabled at
+ * build time; otherwise only log and leave prims untouched. */
+void primitives_init_add_sse3_int(primitives_t* WINPR_RESTRICT prims)
+{
+#if !defined(SSE_AVX_INTRINSICS_ENABLED)
+	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
+	WINPR_UNUSED(prims);
+#else
+	/* The generic table is kept for the fallback paths of the SSE routines. */
+	generic = primitives_get_generic();
+
+	WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
+	prims->add_16s = sse3_add_16s;
+	prims->add_16s_inplace = sse3_add_16s_inplace;
+#endif
+}
@@ -0,0 +1,215 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized alpha blending routines.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ * Note: this code assumes the second operand is fully opaque,
+ * e.g.
+ *   newval = alpha1*val1 + (1-alpha1)*val2
+ * rather than
+ *   newval = alpha1*val1 + (1-alpha1)*alpha2*val2
+ * The IPP gives other options.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#include "prim_alphaComp.h"
+
+#include "prim_internal.h"
+#include "prim_avxsse.h"
+
+/* ------------------------------------------------------------------------- */
+#if defined(SSE_AVX_INTRINSICS_ENABLED)
+#include <emmintrin.h>
+#include <pmmintrin.h>
+
+static primitives_t* generic = nullptr;
+
+/*
+ * SSE alpha composition of two 32-bit-per-pixel images into pDst.
+ *
+ * Per channel the result is ((alpha1 + 1) * (src1 - src2) >> 8) + src2,
+ * where alpha1 is taken from the highest byte of the src1 pixel.
+ *
+ * Each row is processed in three stages: the generic routine handles the
+ * leading pixels until dptr reaches a 16-byte boundary (or the whole row
+ * when dptr is not 4-byte aligned), SSE handles 4 pixels per iteration, and
+ * the generic routine finishes the remaining pixels. Images narrower than
+ * 4 pixels are delegated to the generic routine as a whole.
+ *
+ * src1Step/src2Step/dstStep are row strides in bytes.
+ * Returns PRIMITIVES_SUCCESS (also for empty images) or the error reported
+ * by the generic routine.
+ */
+static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
+                                     const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
+                                     BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width,
+                                     UINT32 height)
+{
+	const UINT32* sptr1 = (const UINT32*)pSrc1;
+	const UINT32* sptr2 = (const UINT32*)pSrc2;
+
+	if ((width <= 0) || (height <= 0))
+		return PRIMITIVES_SUCCESS;
+
+	if (width < 4) /* pointless if too small */
+	{
+		return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width,
+		                               height);
+	}
+
+	UINT32* dptr = (UINT32*)pDst;
+	const size_t linebytes = width * sizeof(UINT32);
+	/* Row padding expressed in pixels (UINT32 units). */
+	const size_t src1Jump = (src1Step - linebytes) / sizeof(UINT32);
+	const size_t src2Jump = (src2Step - linebytes) / sizeof(UINT32);
+	const size_t dstJump = (dstStep - linebytes) / sizeof(UINT32);
+	__m128i xmm0 = mm_set1_epu32(0);
+	__m128i xmm1 = _mm_set1_epi16(1);
+
+	for (UINT32 y = 0; y < height; ++y)
+	{
+		uint32_t pixels = width;
+		uint32_t count = 0;
+		/* Get to the 16-byte boundary now. */
+		uint32_t leadIn = 0;
+
+		switch ((ULONG_PTR)dptr & 0x0f)
+		{
+			case 0:
+				leadIn = 0;
+				break;
+
+			case 4:
+				leadIn = 3;
+				break;
+
+			case 8:
+				leadIn = 2;
+				break;
+
+			case 12:
+				leadIn = 1;
+				break;
+
+			default:
+				/* We'll never hit a 16-byte boundary, so do the whole
+				 * thing the slow way.
+				 */
+				leadIn = width;
+				break;
+		}
+
+		if (leadIn)
+		{
+			pstatus_t status = 0;
+			status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
+			                                 src2Step, (BYTE*)dptr, dstStep, leadIn, 1);
+			if (status != PRIMITIVES_SUCCESS)
+				return status;
+
+			sptr1 += leadIn;
+			sptr2 += leadIn;
+			dptr += leadIn;
+			pixels -= leadIn;
+		}
+
+		/* Use SSE registers to do 4 pixels at a time. */
+		count = pixels >> 2;
+		pixels -= count << 2;
+
+		while (count--)
+		{
+			__m128i xmm2;
+			__m128i xmm3;
+			__m128i xmm4;
+			__m128i xmm5;
+			__m128i xmm6;
+			__m128i xmm7;
+			/* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
+			xmm2 = LOAD_SI128(sptr1);
+			sptr1 += 4;
+			/* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
+			xmm3 = LOAD_SI128(sptr2);
+			sptr2 += 4;
+			/* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
+			xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
+			/* 00Bf00Gf00Rf00Af00Be00Ge00Re00Ae */
+			xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
+			/* subtract */
+			xmm6 = _mm_subs_epi16(xmm4, xmm5);
+			/* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
+			xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
+			/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
+			xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
+			/* Add one to alphas */
+			xmm4 = _mm_adds_epi16(xmm4, xmm1);
+			/* Multiply and take low word */
+			xmm4 = _mm_mullo_epi16(xmm4, xmm6);
+			/* Shift 8 right */
+			xmm4 = _mm_srai_epi16(xmm4, 8);
+			/* Add xmm5 */
+			xmm4 = _mm_adds_epi16(xmm4, xmm5);
+			/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
+			/* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
+			xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
+			/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
+			xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
+			/* subtract */
+			xmm7 = _mm_subs_epi16(xmm5, xmm6);
+			/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
+			xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
+			/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
+			xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
+			/* Add one to alphas */
+			xmm5 = _mm_adds_epi16(xmm5, xmm1);
+			/* Multiply and take low word */
+			xmm5 = _mm_mullo_epi16(xmm5, xmm7);
+			/* Shift 8 right */
+			xmm5 = _mm_srai_epi16(xmm5, 8);
+			/* Add xmm6 */
+			xmm5 = _mm_adds_epi16(xmm5, xmm6);
+			/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk00Ak */
+			/* Must mask off remainders or pack gets confused */
+			xmm3 = _mm_set1_epi16(0x00ffU);
+			xmm4 = _mm_and_si128(xmm4, xmm3);
+			xmm5 = _mm_and_si128(xmm5, xmm3);
+			/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
+			xmm5 = _mm_packus_epi16(xmm5, xmm4);
+			STORE_SI128(dptr, xmm5);
+			dptr += 4;
+		}
+
+		/* Finish off the remainder. */
+		if (pixels)
+		{
+			pstatus_t status = 0;
+			status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
+			                                 src2Step, (BYTE*)dptr, dstStep, pixels, 1);
+			if (status != PRIMITIVES_SUCCESS)
+				return status;
+
+			sptr1 += pixels;
+			sptr2 += pixels;
+			dptr += pixels;
+		}
+
+		/* Jump to next row. */
+		sptr1 += src1Jump;
+		sptr2 += src2Jump;
+		dptr += dstJump;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+#endif
+
+/* ------------------------------------------------------------------------- */
+/* Install the SSE alpha composition routine into prims when SIMD intrinsics
+ * are enabled at build time; otherwise only log and leave prims untouched. */
+void primitives_init_alphaComp_sse3_int(primitives_t* WINPR_RESTRICT prims)
+{
+#if !defined(SSE_AVX_INTRINSICS_ENABLED)
+	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
+	WINPR_UNUSED(prims);
+#else
+	/* The generic table is kept for the fallback paths of the SSE routine. */
+	generic = primitives_get_generic();
+
+	WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
+	prims->alphaComp_argb = sse2_alphaComp_argb;
+#endif
+}
@@ -0,0 +1,54 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized Logical operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#include "prim_andor.h"
+
+#include "prim_internal.h"
+#include "prim_templates.h"
+
+#if defined(SSE_AVX_INTRINSICS_ENABLED)
+#include <emmintrin.h>
+#include <pmmintrin.h>
+
+static primitives_t* generic = nullptr;
+
+/* ------------------------------------------------------------------------- */
+/* Define sse3_andC_32u and sse3_orC_32u by instantiating the
+ * SSE3_SCD_PRE_ROUTINE template (presumably from prim_templates.h - confirm).
+ * Arguments, in order: the name of the generated function, the element type,
+ * the generic routine used as fallback, the 128-bit vector intrinsic and the
+ * statement used to process a single element with the constant 'val'. */
+SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, generic->andC_32u, _mm_and_si128,
+                     *dptr++ = *sptr++ & val)
+SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, generic->orC_32u, _mm_or_si128, *dptr++ = *sptr++ | val)
+
+#endif
+
+/* ------------------------------------------------------------------------- */
+/* Install the SSE and/or-with-constant routines into prims when SIMD
+ * intrinsics are enabled at build time; otherwise only log and leave prims
+ * untouched. */
+void primitives_init_andor_sse3_int(primitives_t* WINPR_RESTRICT prims)
+{
+#if !defined(SSE_AVX_INTRINSICS_ENABLED)
+	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
+	WINPR_UNUSED(prims);
+#else
+	/* The generic table is kept for the fallback paths of the SSE routines. */
+	generic = primitives_get_generic();
+
+	WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
+	prims->andC_32u = sse3_andC_32u;
+	prims->orC_32u = sse3_orC_32u;
+#endif
+}
@@ -0,0 +1,79 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * FreeRDP primitives SSE implementation
+ *
+ * Copyright 2025 Armin Novak <armin.novak@thincast.com>
+ * Copyright 2025 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <winpr/cast.h>
+
+#include "../../core/simd.h"
+
+#if defined(SSE_AVX_INTRINSICS_ENABLED)
+#include <emmintrin.h>
+#include <pmmintrin.h>
+
+/* Unsigned flavour of _mm_set_epi32: every argument is cast to int32_t and
+ * forwarded in the same order. */
+WINPR_ATTR_NODISCARD
+static inline __m128i mm_set_epu32(uint32_t val1, uint32_t val2, uint32_t val3, uint32_t val4)
+{
+	const int32_t a = WINPR_CXX_COMPAT_CAST(int32_t, val1);
+	const int32_t b = WINPR_CXX_COMPAT_CAST(int32_t, val2);
+	const int32_t c = WINPR_CXX_COMPAT_CAST(int32_t, val3);
+	const int32_t d = WINPR_CXX_COMPAT_CAST(int32_t, val4);
+
+	return _mm_set_epi32(a, b, c, d);
+}
+
+/* Unsigned flavour of _mm_set_epi8: every argument is cast to int8_t and
+ * forwarded in the same order. */
+WINPR_ATTR_NODISCARD
+static inline __m128i mm_set_epu8(uint8_t val1, uint8_t val2, uint8_t val3, uint8_t val4,
+                                  uint8_t val5, uint8_t val6, uint8_t val7, uint8_t val8,
+                                  uint8_t val9, uint8_t val10, uint8_t val11, uint8_t val12,
+                                  uint8_t val13, uint8_t val14, uint8_t val15, uint8_t val16)
+{
+	const int8_t v[16] = {
+		WINPR_CXX_COMPAT_CAST(int8_t, val1),  WINPR_CXX_COMPAT_CAST(int8_t, val2),
+		WINPR_CXX_COMPAT_CAST(int8_t, val3),  WINPR_CXX_COMPAT_CAST(int8_t, val4),
+		WINPR_CXX_COMPAT_CAST(int8_t, val5),  WINPR_CXX_COMPAT_CAST(int8_t, val6),
+		WINPR_CXX_COMPAT_CAST(int8_t, val7),  WINPR_CXX_COMPAT_CAST(int8_t, val8),
+		WINPR_CXX_COMPAT_CAST(int8_t, val9),  WINPR_CXX_COMPAT_CAST(int8_t, val10),
+		WINPR_CXX_COMPAT_CAST(int8_t, val11), WINPR_CXX_COMPAT_CAST(int8_t, val12),
+		WINPR_CXX_COMPAT_CAST(int8_t, val13), WINPR_CXX_COMPAT_CAST(int8_t, val14),
+		WINPR_CXX_COMPAT_CAST(int8_t, val15), WINPR_CXX_COMPAT_CAST(int8_t, val16)
+	};
+
+	return _mm_set_epi8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7], v[8], v[9], v[10], v[11],
+	                    v[12], v[13], v[14], v[15]);
+}
+
+/* Unsigned flavour of _mm_set1_epi32: val is cast to int32_t and replicated
+ * into all four 32 bit elements. */
+WINPR_ATTR_NODISCARD
+static inline __m128i mm_set1_epu32(uint32_t val)
+{
+	const int32_t sval = WINPR_CXX_COMPAT_CAST(int32_t, val);
+	return _mm_set1_epi32(sval);
+}
+
+/* Unsigned flavour of _mm_set1_epi8: val is cast to int8_t and replicated
+ * into all sixteen bytes. */
+WINPR_ATTR_NODISCARD
+static inline __m128i mm_set1_epu8(uint8_t val)
+{
+	const int8_t sval = WINPR_CXX_COMPAT_CAST(int8_t, val);
+	return _mm_set1_epi8(sval);
+}
+
+/* Load 128 bits from ptr using _mm_lddqu_si128 (no alignment requirement). */
+WINPR_ATTR_NODISCARD
+static inline __m128i LOAD_SI128(const void* ptr)
+{
+	return _mm_lddqu_si128(WINPR_CXX_COMPAT_CAST(const __m128i*, ptr));
+}
+
+/* Store the 128 bit value val to ptr using _mm_storeu_si128 (no alignment
+ * requirement). */
+static inline void STORE_SI128(void* ptr, __m128i val)
+{
+	_mm_storeu_si128(WINPR_CXX_COMPAT_CAST(__m128i*, ptr), val);
+}
+
+#endif
@@ -0,0 +1,278 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Copy operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <winpr/sysinfo.h>
+
+#include <freerdp/config.h>
+
+#include <string.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <freerdp/log.h>
+
+#include "prim_internal.h"
+#include "prim_copy.h"
+#include "../codec/color.h"
+
+#include <freerdp/codec/color.h>
+
+#if defined(SSE_AVX_INTRINSICS_ENABLED)
+#include <emmintrin.h>
+#include <immintrin.h>
+
+/* Unsigned flavour of _mm256_set_epi32: every argument is cast to int32_t and
+ * forwarded in the same order. */
+static inline __m256i mm256_set_epu32(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3,
+                                      uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7)
+{
+	const int32_t a = (int32_t)i0;
+	const int32_t b = (int32_t)i1;
+	const int32_t c = (int32_t)i2;
+	const int32_t d = (int32_t)i3;
+	const int32_t e = (int32_t)i4;
+	const int32_t f = (int32_t)i5;
+	const int32_t g = (int32_t)i6;
+	const int32_t h = (int32_t)i7;
+
+	return _mm256_set_epi32(a, b, c, d, e, f, g, h);
+}
+
+/**
+ * Copy a BGR24 rectangle into a 32 bit (BGRX/BGRA) destination. The 4th byte
+ * of every destination pixel is left as it was.
+ *
+ * @param pDstData        destination image
+ * @param nDstStep        destination stride in bytes
+ * @param nXDst           destination x offset in pixels
+ * @param nYDst           destination y offset in lines
+ * @param nWidth          width of the rectangle in pixels
+ * @param nHeight         height of the rectangle in lines
+ * @param pSrcData        source image
+ * @param nSrcStep        source stride in bytes
+ * @param nXSrc           source x offset in pixels
+ * @param nYSrc           source y offset in lines
+ * @param srcVMultiplier  factor applied to the source line index
+ * @param srcVOffset      byte offset added to every source line address
+ * @param dstVMultiplier  factor applied to the destination line index
+ * @param dstVOffset      byte offset added to every destination line address
+ *
+ * @return PRIMITIVES_SUCCESS
+ */
+static inline pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
+                                                     UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
+                                                     UINT32 nHeight,
+                                                     const BYTE* WINPR_RESTRICT pSrcData,
+                                                     UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
+                                                     int64_t srcVMultiplier, int64_t srcVOffset,
+                                                     int64_t dstVMultiplier, int64_t dstVOffset)
+{
+	const int64_t srcByte = 3;
+	const int64_t dstByte = 4;
+
+	/* One vector step converts 8 pixels (24 source bytes) but loads 32 source
+	 * bytes, so it touches ceil(32 / 3) = 11 source pixels. */
+	const int64_t step = 8;
+	const int64_t loadPixels = 11;
+
+	/* Selects the 4th byte of every pixel from the current destination. */
+	const __m256i mask = mm256_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
+	                                     0xFF000000, 0xFF000000, 0xFF000000);
+	/* Spreads packed 3 byte pixels to 4 byte pixels inside each 128 bit lane. */
+	const __m256i smask = mm256_set_epu32(0xff171615, 0xff141312, 0xff1110ff, 0xffffffff,
+	                                      0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
+	/* Picks source bytes 12..15 (they live in the lower lane) for the upper lane. */
+	const __m256i shelpmask = mm256_set_epu32(0xffffffff, 0xffffffff, 0xffffff1f, 0xff1e1d1c,
+	                                          0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
+	/* Marks the bytes of the upper lane that are taken from the shelpmask shuffle. */
+	const __m256i bmask = _mm256_set_epi32(0x00000000, 0x00000000, 0x000000FF, 0x00FFFFFF,
+	                                       0x00000000, 0x00000000, 0x00000000, 0x00000000);
+
+	for (int64_t y = 0; y < nHeight; y++)
+	{
+		const BYTE* WINPR_RESTRICT srcLine =
+		    &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
+		BYTE* WINPR_RESTRICT dstLine =
+		    &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
+
+		int64_t x = 0;
+
+		/* Only vectorize while the complete 32 byte source load stays inside
+		 * the nWidth pixels of this row. Rounding nWidth down to a multiple of
+		 * 8 is not sufficient: the last load of a row would read up to 8 bytes
+		 * behind it, which is out of bounds for the last row in memory of a
+		 * tightly sized buffer. The skipped pixels are handled by the byte-wise
+		 * loop below, which produces the same result. */
+		for (; x + loadPixels <= nWidth; x += step)
+		{
+			const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
+			__m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
+			const __m256i s0 = _mm256_loadu_si256(src);
+			const __m256i s1 = _mm256_shuffle_epi8(s0, smask);
+
+			/* _mm256_shuffle_epi8 can not cross 128bit lanes.
+			 * manually copy these bytes with extract/insert */
+			const __m256i sx = _mm256_broadcastsi128_si256(_mm256_extractf128_si256(s0, 0));
+			const __m256i sxx = _mm256_shuffle_epi8(sx, shelpmask);
+			const __m256i merged = _mm256_blendv_epi8(s1, sxx, bmask);
+
+			/* Merge with the destination so its 4th byte per pixel survives. */
+			const __m256i s2 = _mm256_loadu_si256(dst);
+			const __m256i d0 = _mm256_blendv_epi8(merged, s2, mask);
+			_mm256_storeu_si256(dst, d0);
+		}
+
+		/* Remaining pixels: copy the three color bytes only. */
+		for (; x < nWidth; x++)
+		{
+			const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
+			BYTE* dst = &dstLine[(x + nXDst) * dstByte];
+			*dst++ = *src++;
+			*dst++ = *src++;
+			*dst++ = *src++;
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/**
+ * Copy a rectangle between two 32 bit images with identical channel layout.
+ * Only the first three bytes of every pixel are copied, the 4th byte of every
+ * destination pixel is left as it was.
+ *
+ * Offsets (nXDst, nYDst, nXSrc, nYSrc) are given in pixels / lines, the steps
+ * in bytes. The line address is computed as
+ * base + multiplier * (y + nY) * step + offset.
+ *
+ * @return PRIMITIVES_SUCCESS
+ */
+static inline pstatus_t avx2_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData,
+                                                      UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
+                                                      UINT32 nWidth, UINT32 nHeight,
+                                                      const BYTE* WINPR_RESTRICT pSrcData,
+                                                      UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
+                                                      int64_t srcVMultiplier, int64_t srcVOffset,
+                                                      int64_t dstVMultiplier, int64_t dstVOffset)
+{
+	const int64_t bpp = 4;
+
+	/* Per pixel 0xFF 0xFF 0xFF 0x00: bytes 0..2 come from the source, byte 3
+	 * from the destination. */
+	const __m256i colorMask = _mm256_set1_epi32(0x00FFFFFF);
+
+	/* Number of pixels that can be handled in blocks of 8. */
+	const int64_t vecEnd = nWidth - (nWidth % 8);
+
+	for (int64_t row = 0; row < nHeight; row++)
+	{
+		const BYTE* WINPR_RESTRICT srcLine =
+		    &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
+		BYTE* WINPR_RESTRICT dstLine =
+		    &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];
+		int64_t col = 0;
+
+		while (col < vecEnd)
+		{
+			const __m256i* srcVec = (const __m256i*)&srcLine[(col + nXSrc) * bpp];
+			__m256i* dstVec = (__m256i*)&dstLine[(col + nXDst) * bpp];
+			const __m256i srcPixels = _mm256_loadu_si256(srcVec);
+			const __m256i dstPixels = _mm256_loadu_si256(dstVec);
+
+			_mm256_storeu_si256(dstVec, _mm256_blendv_epi8(dstPixels, srcPixels, colorMask));
+			col += 8;
+		}
+
+		/* Remaining pixels: copy the three color bytes only. */
+		while (col < nWidth)
+		{
+			const BYTE* srcPx = &srcLine[(col + nXSrc) * bpp];
+			BYTE* dstPx = &dstLine[(col + nXDst) * bpp];
+
+			dstPx[0] = srcPx[0];
+			dstPx[1] = srcPx[1];
+			dstPx[2] = srcPx[2];
+			col++;
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/**
+ * Dispatch a copy that must keep the destination alpha to the matching AVX2
+ * routine.
+ *
+ * Handled combinations:
+ *  - BGR24 -> BGRX32 / BGRA32
+ *  - BGRX32 / BGRA32 -> BGRX32 / BGRA32
+ *  - RGBX32 / RGBA32 -> RGBX32 / RGBA32
+ * Everything else is forwarded to the generic copy_no_overlap.
+ *
+ * @return the status of the routine the copy was dispatched to
+ */
+static pstatus_t avx2_image_copy_no_overlap_dst_alpha(
+    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
+    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
+    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
+    UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier,
+    int64_t dstVOffset)
+{
+	WINPR_ASSERT(pDstData);
+	WINPR_ASSERT(pSrcData);
+
+	const BOOL srcBGR24 = (SrcFormat == PIXEL_FORMAT_BGR24);
+	const BOOL srcBGRx =
+	    (SrcFormat == PIXEL_FORMAT_BGRX32) || (SrcFormat == PIXEL_FORMAT_BGRA32);
+	const BOOL srcRGBx =
+	    (SrcFormat == PIXEL_FORMAT_RGBX32) || (SrcFormat == PIXEL_FORMAT_RGBA32);
+	const BOOL dstBGRx =
+	    (DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32);
+	const BOOL dstRGBx =
+	    (DstFormat == PIXEL_FORMAT_RGBX32) || (DstFormat == PIXEL_FORMAT_RGBA32);
+
+	if (srcBGR24 && dstBGRx)
+		return avx2_image_copy_bgr24_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
+		                                    pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
+		                                    srcVOffset, dstVMultiplier, dstVOffset);
+
+	/* Same channel order on both sides, so one routine serves BGR and RGB. */
+	if ((srcBGRx && dstBGRx) || (srcRGBx && dstRGBx))
+		return avx2_image_copy_bgrx32_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
+		                                     pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
+		                                     srcVOffset, dstVMultiplier, dstVOffset);
+
+	/* No optimized routine for this format pair. */
+	return primitives_get_generic()->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst,
+	                                                 nWidth, nHeight, pSrcData, SrcFormat,
+	                                                 nSrcStep, nXSrc, nYSrc, palette, flags);
+}
+
+/**
+ * AVX2 entry point for copy_no_overlap.
+ *
+ * An empty rectangle succeeds immediately. Dimensions above INT32_MAX or
+ * nullptr buffers yield -1. A step of 0 is replaced by nWidth times the bytes
+ * per pixel of the respective format.
+ *
+ * The copy is then dispatched:
+ *  - FREERDP_KEEP_DST_ALPHA set and DstFormat has alpha: the dst_alpha dispatcher
+ *  - formats equal apart from alpha: generic_image_copy_no_overlap_memcpy
+ *  - otherwise: the generic copy_no_overlap
+ *
+ * @return the status of the routine the copy was dispatched to
+ */
+static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
+                                            UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
+                                            UINT32 nWidth, UINT32 nHeight,
+                                            const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
+                                            UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
+                                            const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
+{
+	if ((nWidth == 0) || (nHeight == 0))
+		return PRIMITIVES_SUCCESS;
+
+	if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX) || !pDstData || !pSrcData)
+		return -1;
+
+	if (nDstStep == 0)
+		nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
+
+	if (nSrcStep == 0)
+		nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
+
+	/* For a vertical flip the source is walked from its last line backwards. */
+	const BOOL flipped = (flags & FREERDP_FLIP_VERTICAL) != 0;
+	const int64_t srcVMultiplier = flipped ? -1 : 1;
+	const int64_t srcVOffset = flipped ? (nHeight - 1ll) * nSrcStep : 0;
+	const int64_t dstVMultiplier = 1;
+	const int64_t dstVOffset = 0;
+
+	const BOOL keepDstAlpha =
+	    ((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat);
+
+	if (keepDstAlpha)
+		return avx2_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
+		                                            nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
+		                                            nXSrc, nYSrc, palette, flags, srcVMultiplier,
+		                                            srcVOffset, dstVMultiplier, dstVOffset);
+
+	if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
+		return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
+		                                            nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
+		                                            nXSrc, nYSrc, palette, srcVMultiplier,
+		                                            srcVOffset, dstVMultiplier, dstVOffset, flags);
+
+	return primitives_get_generic()->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst,
+	                                                 nWidth, nHeight, pSrcData, SrcFormat,
+	                                                 nSrcStep, nXSrc, nYSrc, palette, flags);
+}
+#endif
+
+/* ------------------------------------------------------------------------- */
+/**
+ * Install the AVX2 copy_no_overlap routine into a primitives table.
+ *
+ * Without SSE_AVX_INTRINSICS_ENABLED the table is left untouched and only a
+ * verbose log message is emitted.
+ *
+ * @param prims primitives table to update
+ */
+void primitives_init_copy_avx2_int(primitives_t* WINPR_RESTRICT prims)
+{
+#if !defined(SSE_AVX_INTRINSICS_ENABLED)
+	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or WITH_AVX2 or AVX2 intrinsics not available");
+	WINPR_UNUSED(prims);
+#else
+	WLog_VRB(PRIM_TAG, "AVX2 optimizations");
+	prims->copy_no_overlap = avx2_image_copy_no_overlap;
+#endif
+}
@@ -0,0 +1,257 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Copy operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <winpr/sysinfo.h>
+
+#include <freerdp/config.h>
+
+#include <string.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <freerdp/log.h>
+
+#include "prim_internal.h"
+#include "prim_avxsse.h"
+#include "prim_copy.h"
+#include "../codec/color.h"
+
+#include <freerdp/codec/color.h>
+
+#if defined(SSE_AVX_INTRINSICS_ENABLED)
+#include <emmintrin.h>
+#include <immintrin.h>
+
+/**
+ * Copy a BGR24 rectangle into a 32 bit (BGRX/BGRA) destination. The 4th byte
+ * of every destination pixel is left as it was.
+ *
+ * @param pDstData        destination image
+ * @param nDstStep        destination stride in bytes
+ * @param nXDst           destination x offset in pixels
+ * @param nYDst           destination y offset in lines
+ * @param nWidth          width of the rectangle in pixels
+ * @param nHeight         height of the rectangle in lines
+ * @param pSrcData        source image
+ * @param nSrcStep        source stride in bytes
+ * @param nXSrc           source x offset in pixels
+ * @param nYSrc           source y offset in lines
+ * @param srcVMultiplier  factor applied to the source line index
+ * @param srcVOffset      byte offset added to every source line address
+ * @param dstVMultiplier  factor applied to the destination line index
+ * @param dstVOffset      byte offset added to every destination line address
+ *
+ * @return PRIMITIVES_SUCCESS
+ */
+static inline pstatus_t sse_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
+                                                    UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
+                                                    UINT32 nHeight,
+                                                    const BYTE* WINPR_RESTRICT pSrcData,
+                                                    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
+                                                    int64_t srcVMultiplier, int64_t srcVOffset,
+                                                    int64_t dstVMultiplier, int64_t dstVOffset)
+{
+	const int64_t srcByte = 3;
+	const int64_t dstByte = 4;
+
+	/* One vector step converts 4 pixels (12 source bytes) but loads 16 source
+	 * bytes, so it touches ceil(16 / 3) = 6 source pixels. */
+	const int64_t step = 4;
+	const int64_t loadPixels = 6;
+
+	/* Selects the 4th byte of every pixel from the current destination. */
+	const __m128i mask = mm_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000);
+	/* Spreads four packed 3 byte pixels to four 4 byte pixels. */
+	const __m128i smask = mm_set_epu32(0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
+
+	for (int64_t y = 0; y < nHeight; y++)
+	{
+		const BYTE* WINPR_RESTRICT srcLine =
+		    &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
+		BYTE* WINPR_RESTRICT dstLine =
+		    &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
+
+		int64_t x = 0;
+
+		/* Only vectorize while the complete 16 byte source load stays inside
+		 * the nWidth pixels of this row. Rounding nWidth down to a multiple of
+		 * 4 is not sufficient: the last load of a row would read up to 4 bytes
+		 * behind it, which is out of bounds for the last row in memory of a
+		 * tightly sized buffer. The skipped pixels are handled by the byte-wise
+		 * loop below, which produces the same result. */
+		for (; x + loadPixels <= nWidth; x += step)
+		{
+			const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
+			__m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
+			const __m128i s0 = LOAD_SI128(src);
+			const __m128i s1 = _mm_shuffle_epi8(s0, smask);
+
+			/* Merge with the destination so its 4th byte per pixel survives. */
+			const __m128i s2 = LOAD_SI128(dst);
+			const __m128i d0 = _mm_blendv_epi8(s1, s2, mask);
+			STORE_SI128(dst, d0);
+		}
+
+		/* Remaining pixels: copy the three color bytes only. */
+		for (; x < nWidth; x++)
+		{
+			const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
+			BYTE* dst = &dstLine[(x + nXDst) * dstByte];
+			*dst++ = *src++;
+			*dst++ = *src++;
+			*dst++ = *src++;
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/**
+ * Copy a rectangle between two 32 bit images with identical channel layout.
+ * Only the first three bytes of every pixel are copied, the 4th byte of every
+ * destination pixel is left as it was.
+ *
+ * Offsets (nXDst, nYDst, nXSrc, nYSrc) are given in pixels / lines, the steps
+ * in bytes. The line address is computed as
+ * base + multiplier * (y + nY) * step + offset.
+ *
+ * @return PRIMITIVES_SUCCESS
+ */
+static inline pstatus_t sse_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
+                                                     UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
+                                                     UINT32 nHeight,
+                                                     const BYTE* WINPR_RESTRICT pSrcData,
+                                                     UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
+                                                     int64_t srcVMultiplier, int64_t srcVOffset,
+                                                     int64_t dstVMultiplier, int64_t dstVOffset)
+{
+	const int64_t bpp = 4;
+
+	/* Per pixel 0xFF 0xFF 0xFF 0x00: bytes 0..2 come from the source, byte 3
+	 * from the destination. */
+	const __m128i colorMask = _mm_set1_epi32(0x00FFFFFF);
+
+	/* Number of pixels that can be handled in blocks of 4. */
+	const int64_t vecEnd = nWidth - (nWidth % 4);
+
+	for (int64_t row = 0; row < nHeight; row++)
+	{
+		const BYTE* WINPR_RESTRICT srcLine =
+		    &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
+		BYTE* WINPR_RESTRICT dstLine =
+		    &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];
+		int64_t col = 0;
+
+		while (col < vecEnd)
+		{
+			const __m128i* srcVec = (const __m128i*)&srcLine[(col + nXSrc) * bpp];
+			__m128i* dstVec = (__m128i*)&dstLine[(col + nXDst) * bpp];
+			const __m128i srcPixels = LOAD_SI128(srcVec);
+			const __m128i dstPixels = LOAD_SI128(dstVec);
+
+			STORE_SI128(dstVec, _mm_blendv_epi8(dstPixels, srcPixels, colorMask));
+			col += 4;
+		}
+
+		/* Remaining pixels: copy the three color bytes only. */
+		while (col < nWidth)
+		{
+			const BYTE* srcPx = &srcLine[(col + nXSrc) * bpp];
+			BYTE* dstPx = &dstLine[(col + nXDst) * bpp];
+
+			dstPx[0] = srcPx[0];
+			dstPx[1] = srcPx[1];
+			dstPx[2] = srcPx[2];
+			col++;
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/**
+ * Dispatch a copy that must keep the destination alpha to the matching SSE
+ * routine.
+ *
+ * Handled combinations:
+ *  - BGR24 -> BGRX32 / BGRA32
+ *  - BGRX32 / BGRA32 -> BGRX32 / BGRA32
+ *  - RGBX32 / RGBA32 -> RGBX32 / RGBA32
+ * Everything else is forwarded to the generic copy_no_overlap.
+ *
+ * @return the status of the routine the copy was dispatched to
+ */
+static pstatus_t sse_image_copy_no_overlap_dst_alpha(
+    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
+    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
+    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
+    UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier,
+    int64_t dstVOffset)
+{
+	WINPR_ASSERT(pDstData);
+	WINPR_ASSERT(pSrcData);
+
+	const BOOL srcBGR24 = (SrcFormat == PIXEL_FORMAT_BGR24);
+	const BOOL srcBGRx =
+	    (SrcFormat == PIXEL_FORMAT_BGRX32) || (SrcFormat == PIXEL_FORMAT_BGRA32);
+	const BOOL srcRGBx =
+	    (SrcFormat == PIXEL_FORMAT_RGBX32) || (SrcFormat == PIXEL_FORMAT_RGBA32);
+	const BOOL dstBGRx =
+	    (DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32);
+	const BOOL dstRGBx =
+	    (DstFormat == PIXEL_FORMAT_RGBX32) || (DstFormat == PIXEL_FORMAT_RGBA32);
+
+	if (srcBGR24 && dstBGRx)
+		return sse_image_copy_bgr24_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
+		                                   pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
+		                                   srcVOffset, dstVMultiplier, dstVOffset);
+
+	/* Same channel order on both sides, so one routine serves BGR and RGB. */
+	if ((srcBGRx && dstBGRx) || (srcRGBx && dstRGBx))
+		return sse_image_copy_bgrx32_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
+		                                    pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
+		                                    srcVOffset, dstVMultiplier, dstVOffset);
+
+	/* No optimized routine for this format pair. */
+	return primitives_get_generic()->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst,
+	                                                 nWidth, nHeight, pSrcData, SrcFormat,
+	                                                 nSrcStep, nXSrc, nYSrc, palette, flags);
+}
+
+/**
+ * SSE4.1 entry point for copy_no_overlap.
+ *
+ * An empty rectangle succeeds immediately. Dimensions above INT32_MAX or
+ * nullptr buffers yield -1. A step of 0 is replaced by nWidth times the bytes
+ * per pixel of the respective format.
+ *
+ * The copy is then dispatched:
+ *  - FREERDP_KEEP_DST_ALPHA set and DstFormat has alpha: the dst_alpha dispatcher
+ *  - formats equal apart from alpha: generic_image_copy_no_overlap_memcpy
+ *  - otherwise: the generic copy_no_overlap
+ *
+ * @return the status of the routine the copy was dispatched to
+ */
+static pstatus_t sse_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
+                                           UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
+                                           UINT32 nWidth, UINT32 nHeight,
+                                           const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
+                                           UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
+                                           const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
+{
+	if ((nWidth == 0) || (nHeight == 0))
+		return PRIMITIVES_SUCCESS;
+
+	if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX) || !pDstData || !pSrcData)
+		return -1;
+
+	if (nDstStep == 0)
+		nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
+
+	if (nSrcStep == 0)
+		nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
+
+	/* For a vertical flip the source is walked from its last line backwards. */
+	const BOOL flipped = (flags & FREERDP_FLIP_VERTICAL) != 0;
+	const int64_t srcVMultiplier = flipped ? -1 : 1;
+	const int64_t srcVOffset = flipped ? (nHeight - 1ll) * nSrcStep : 0;
+	const int64_t dstVMultiplier = 1;
+	const int64_t dstVOffset = 0;
+
+	const BOOL keepDstAlpha =
+	    ((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat);
+
+	if (keepDstAlpha)
+		return sse_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
+		                                           nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
+		                                           nXSrc, nYSrc, palette, flags, srcVMultiplier,
+		                                           srcVOffset, dstVMultiplier, dstVOffset);
+
+	if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
+		return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
+		                                            nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
+		                                            nXSrc, nYSrc, palette, srcVMultiplier,
+		                                            srcVOffset, dstVMultiplier, dstVOffset, flags);
+
+	return primitives_get_generic()->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst,
+	                                                 nWidth, nHeight, pSrcData, SrcFormat,
+	                                                 nSrcStep, nXSrc, nYSrc, palette, flags);
+}
+#endif
+
+/* ------------------------------------------------------------------------- */
+/**
+ * Install the SSE4.1 copy_no_overlap routine into a primitives table.
+ *
+ * Without SSE_AVX_INTRINSICS_ENABLED the table is left untouched and only a
+ * verbose log message is emitted.
+ *
+ * @param prims primitives table to update
+ */
+void primitives_init_copy_sse41_int(primitives_t* WINPR_RESTRICT prims)
+{
+#if !defined(SSE_AVX_INTRINSICS_ENABLED)
+	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE4.1 intrinsics not available");
+	WINPR_UNUSED(prims);
+#else
+	WLog_VRB(PRIM_TAG, "SSE4.1 optimizations");
+	prims->copy_no_overlap = sse_image_copy_no_overlap;
+#endif
+}
@@ -0,0 +1,235 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized routines to set a chunk of memory to a constant.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+#include <freerdp/config.h>
+
+#include <string.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#include "prim_internal.h"
+#include "prim_avxsse.h"
+#include "prim_set.h"
+
+/* ========================================================================= */
+#if defined(SSE_AVX_INTRINSICS_ENABLED)
+#include <emmintrin.h>
+
+/* Generic primitives table, assigned by primitives_init_set_sse2_int().
+ * sse2_set_8u() forwards buffers shorter than 16 bytes to it. */
+static primitives_t* generic = nullptr;
+
+/**
+ * Fill a byte buffer with a constant.
+ *
+ * Buffers shorter than 16 bytes are forwarded to the generic routine. Longer
+ * buffers are written byte-wise up to the next 16 byte boundary, then in 256
+ * byte blocks, then in 16 byte blocks and finally byte-wise again.
+ *
+ * @param val   byte value to store
+ * @param pDst  destination buffer
+ * @param ulen  number of bytes to write
+ *
+ * @return PRIMITIVES_SUCCESS, or the result of the generic routine
+ */
+static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 ulen)
+{
+	if (ulen < 16)
+		return generic->set_8u(val, pDst, ulen);
+
+	size_t remaining = ulen;
+	BYTE* out = pDst;
+
+	/* Seek 16-byte alignment. */
+	for (; ((ULONG_PTR)out & 0x0f) != 0; out++)
+	{
+		*out = val;
+
+		if (--remaining == 0)
+			return PRIMITIVES_SUCCESS;
+	}
+
+	const __m128i fill = mm_set1_epu8(val);
+
+	/* 256-byte blocks: sixteen 16-byte stores per iteration. */
+	const size_t blocks = remaining / 256;
+	remaining %= 256;
+
+	for (size_t i = 0; i < blocks; i++, out += 256)
+	{
+		STORE_SI128(&out[0], fill);
+		STORE_SI128(&out[16], fill);
+		STORE_SI128(&out[32], fill);
+		STORE_SI128(&out[48], fill);
+		STORE_SI128(&out[64], fill);
+		STORE_SI128(&out[80], fill);
+		STORE_SI128(&out[96], fill);
+		STORE_SI128(&out[112], fill);
+		STORE_SI128(&out[128], fill);
+		STORE_SI128(&out[144], fill);
+		STORE_SI128(&out[160], fill);
+		STORE_SI128(&out[176], fill);
+		STORE_SI128(&out[192], fill);
+		STORE_SI128(&out[208], fill);
+		STORE_SI128(&out[224], fill);
+		STORE_SI128(&out[240], fill);
+	}
+
+	/* Remaining 16-byte blocks. */
+	const size_t vectors = remaining / 16;
+	remaining %= 16;
+
+	for (size_t i = 0; i < vectors; i++, out += 16)
+		STORE_SI128(out, fill);
+
+	/* Leftover bytes. */
+	for (size_t i = 0; i < remaining; i++)
+		out[i] = val;
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/**
+ * Fill a buffer of 32 bit values with a constant.
+ *
+ * Buffers shorter than 32 elements are filled with a plain loop. A destination
+ * that is not 4 byte aligned is forwarded to the generic routine. Otherwise
+ * single elements are written up to the next 16 byte boundary, followed by
+ * blocks of 64 elements (256 bytes), blocks of 4 elements (16 bytes) and
+ * finally the leftover elements.
+ *
+ * @param val   value to store
+ * @param pDst  destination buffer
+ * @param ulen  number of 32 bit elements to write
+ *
+ * @return PRIMITIVES_SUCCESS, or the result of the generic routine
+ */
+static pstatus_t sse2_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 ulen)
+{
+	size_t len = ulen;
+	UINT32* dptr = pDst;
+
+	/* If really short, just do it here. */
+	if (len < 32)
+	{
+		while (len--)
+			*dptr++ = val;
+
+		return PRIMITIVES_SUCCESS;
+	}
+
+	/* A destination that is not 4 byte aligned can never reach 16-byte
+	 * alignment in whole elements. Use the generic table cached by
+	 * primitives_init_set_sse2_int() (as sse2_set_8u does) instead of looking
+	 * it up on every call. */
+	if (((ULONG_PTR)dptr & 0x03) != 0)
+		return generic->set_32u(val, pDst, ulen);
+
+	/* Seek 16-byte alignment. */
+	while ((ULONG_PTR)dptr & 0x0f)
+	{
+		*dptr++ = val;
+
+		if (--len == 0)
+			return PRIMITIVES_SUCCESS;
+	}
+
+	const __m128i xmm0 = mm_set1_epu32(val);
+
+	/* 256-byte blocks (64 elements): sixteen 16-byte stores per iteration. */
+	const size_t blocks = len >> 6;
+	len -= blocks << 6;
+
+	for (size_t i = 0; i < blocks; i++, dptr += 64)
+	{
+		STORE_SI128(&dptr[0], xmm0);
+		STORE_SI128(&dptr[4], xmm0);
+		STORE_SI128(&dptr[8], xmm0);
+		STORE_SI128(&dptr[12], xmm0);
+		STORE_SI128(&dptr[16], xmm0);
+		STORE_SI128(&dptr[20], xmm0);
+		STORE_SI128(&dptr[24], xmm0);
+		STORE_SI128(&dptr[28], xmm0);
+		STORE_SI128(&dptr[32], xmm0);
+		STORE_SI128(&dptr[36], xmm0);
+		STORE_SI128(&dptr[40], xmm0);
+		STORE_SI128(&dptr[44], xmm0);
+		STORE_SI128(&dptr[48], xmm0);
+		STORE_SI128(&dptr[52], xmm0);
+		STORE_SI128(&dptr[56], xmm0);
+		STORE_SI128(&dptr[60], xmm0);
+	}
+
+	/* Remaining 16-byte blocks (4 elements each). */
+	const size_t vectors = len >> 2;
+	len -= vectors << 2;
+
+	for (size_t i = 0; i < vectors; i++, dptr += 4)
+		STORE_SI128(dptr, xmm0);
+
+	/* Do leftover elements. */
+	while (len--)
+		*dptr++ = val;
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/**
+ * Fill a buffer of signed 32 bit values with a constant by forwarding to
+ * sse2_set_32u with the same bit pattern.
+ *
+ * @param val   value to store
+ * @param pDst  destination buffer
+ * @param len   number of 32 bit elements to write
+ *
+ * @return the result of sse2_set_32u
+ */
+static pstatus_t sse2_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len)
+{
+	/* Signed to unsigned conversion of a 32 bit value keeps the bit pattern. */
+	return sse2_set_32u((UINT32)val, (UINT32*)pDst, len);
+}
+#endif
+
+/* ------------------------------------------------------------------------- */
+/**
+ * Install the SSE2 set_8u / set_32s / set_32u routines into a primitives table.
+ *
+ * Without SSE_AVX_INTRINSICS_ENABLED the table is left untouched and only a
+ * verbose log message is emitted.
+ *
+ * @param prims primitives table to update
+ */
+void primitives_init_set_sse2_int(primitives_t* WINPR_RESTRICT prims)
+{
+#if !defined(SSE_AVX_INTRINSICS_ENABLED)
+	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available");
+	WINPR_UNUSED(prims);
+#else
+	/* The SSE routines fall back to the generic table, so set it up first. */
+	generic = primitives_get_generic();
+
+	WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
+	prims->set_32u = sse2_set_32u;
+	prims->set_32s = sse2_set_32s;
+	prims->set_8u = sse2_set_8u;
+#endif
+}
@@ -0,0 +1,160 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Shift operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#include "prim_shift.h"
+
+#include "prim_internal.h"
+#include "prim_templates.h"
+
+#if defined(SSE_AVX_INTRINSICS_ENABLED)
+#include <emmintrin.h>
+#include <pmmintrin.h>
+
+static primitives_t* generic = nullptr;
+
+/* ------------------------------------------------------------------------- */
+/* Each SSE3_SCD_ROUTINE line below expands to a static function
+ *   pstatus_t name(const T* pSrc, UINT32 val, T* pDst, UINT32 ulen)
+ * The arguments are: function name, element type, a fallback routine, the
+ * per-register intrinsic, the type val is cast to for that intrinsic, and the
+ * scalar statement used for the elements left over after the SIMD loops.
+ */
+
+/* Left shift of INT16 data.  The scalar tail shifts the value as UINT16 and
+ * masks it to 16 bits before converting back to INT16. */
+SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16, int16_t,
+                 *dptr++ = (INT16)(((UINT16)*sptr++ << val) & 0xFFFF))
+/* ------------------------------------------------------------------------- */
+/* Right shift of INT16 data; the SIMD path uses the arithmetic shift
+ * intrinsic (_mm_srai_epi16). */
+SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s, _mm_srai_epi16, int16_t,
+                 *dptr++ = *sptr++ >> val)
+/* ------------------------------------------------------------------------- */
+/* Left shift of UINT16 data; the scalar tail masks the result to 16 bits. */
+SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16, int16_t,
+                 *dptr++ = (((UINT16)*sptr++ << val) & 0xFFFF))
+/* ------------------------------------------------------------------------- */
+/* Right shift of UINT16 data; the SIMD path uses the logical shift
+ * intrinsic (_mm_srli_epi16). */
+SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16, int16_t,
+                 *dptr++ = *sptr++ >> val)
+
+/* In-place left shift of INT16 samples by a constant.
+ *
+ * pSrcDst  buffer that is read and overwritten
+ * val      shift amount; 0 is a no-op, 16 or more is rejected with -1
+ * ulen     number of INT16 elements
+ *
+ * Short buffers (fewer than 16 elements) and buffers that are not 2-byte
+ * aligned are handed to the generic routine.  Otherwise a scalar prologue
+ * brings the pointer to a 16-byte boundary, the bulk is processed eight
+ * registers and then one register at a time, and any leftover elements go
+ * through the generic routine again.
+ *
+ * Returns PRIMITIVES_SUCCESS, -1 for an out-of-range shift, or whatever status
+ * the generic routine reports.
+ */
+static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 ulen)
+{
+	size_t len = ulen;
+	const INT32 shifts = 2;
+	if (val == 0)
+		return PRIMITIVES_SUCCESS;
+	if (val >= 16)
+		return -1;
+	if (len < 16) /* pointless if too small */
+		return generic->lShiftC_16s_inplace(pSrcDst, val, ulen);
+
+	UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
+	if ((ULONG_PTR)pSrcDst & offBeatMask)
+	{
+		/* Incrementing the pointer skips over 16-byte boundary. */
+		return generic->lShiftC_16s_inplace(pSrcDst, val, ulen);
+	}
+
+	/* Get to the 16-byte boundary now.  rem is the number of INT16 elements the
+	 * pointer sits past the previous boundary (1..7 when non-zero). */
+	const UINT32 rem = ((UINT_PTR)pSrcDst & 0x0f) / sizeof(INT16);
+	if (rem > 0)
+	{
+		/* A 16-byte register holds 8 INT16 values, so 8 - rem elements reach the
+		 * next boundary.  len is at least 16 here, so this never exhausts it. */
+		const UINT32 elementsPerRegister = 8;
+		const UINT32 add = elementsPerRegister - rem;
+		pstatus_t status = generic->lShiftC_16s_inplace(pSrcDst, val, add);
+		if (status != PRIMITIVES_SUCCESS)
+			return status;
+		pSrcDst += add;
+		len -= add;
+	}
+
+	/* Use 8 128-bit SSE registers: 64 elements per iteration. */
+	size_t count = len >> (8 - shifts);
+	len -= count << (8 - shifts);
+
+	while (count--)
+	{
+		const __m128i* src = (const __m128i*)pSrcDst;
+
+		__m128i xmm0 = LOAD_SI128(src++);
+		__m128i xmm1 = LOAD_SI128(src++);
+		__m128i xmm2 = LOAD_SI128(src++);
+		__m128i xmm3 = LOAD_SI128(src++);
+		__m128i xmm4 = LOAD_SI128(src++);
+		__m128i xmm5 = LOAD_SI128(src++);
+		__m128i xmm6 = LOAD_SI128(src++);
+		__m128i xmm7 = LOAD_SI128(src);
+
+		xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);
+		xmm1 = _mm_slli_epi16(xmm1, (int16_t)val);
+		xmm2 = _mm_slli_epi16(xmm2, (int16_t)val);
+		xmm3 = _mm_slli_epi16(xmm3, (int16_t)val);
+		xmm4 = _mm_slli_epi16(xmm4, (int16_t)val);
+		xmm5 = _mm_slli_epi16(xmm5, (int16_t)val);
+		xmm6 = _mm_slli_epi16(xmm6, (int16_t)val);
+		xmm7 = _mm_slli_epi16(xmm7, (int16_t)val);
+
+		/* All loads are done before the first store overwrites the data. */
+		__m128i* dst = (__m128i*)pSrcDst;
+
+		STORE_SI128(dst++, xmm0);
+		STORE_SI128(dst++, xmm1);
+		STORE_SI128(dst++, xmm2);
+		STORE_SI128(dst++, xmm3);
+		STORE_SI128(dst++, xmm4);
+		STORE_SI128(dst++, xmm5);
+		STORE_SI128(dst++, xmm6);
+		STORE_SI128(dst++, xmm7);
+
+		pSrcDst = (INT16*)dst;
+	}
+
+	/* Use a single 128-bit SSE register: 8 elements per iteration. */
+	count = len >> (5 - shifts);
+	len -= count << (5 - shifts);
+	while (count--)
+	{
+		const __m128i* src = (const __m128i*)pSrcDst;
+		__m128i xmm0 = LOAD_SI128(src);
+
+		xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);
+
+		__m128i* dst = (__m128i*)pSrcDst;
+		STORE_SI128(dst++, xmm0);
+		pSrcDst = (INT16*)dst;
+	}
+
+	/* Finish off the remainder (fewer than 8 elements). */
+	if (len > 0)
+		return generic->lShiftC_16s_inplace(pSrcDst, val, WINPR_ASSERTING_INT_CAST(uint32_t, len));
+
+	return PRIMITIVES_SUCCESS;
+}
+#endif
+
+/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
+ * depending on the sign of val.  To avoid using the deprecated inplace
+ * routines, a wrapper can use the src for the dest.
+ */
+
+/* ------------------------------------------------------------------------- */
+/* Installs the SSE shift routines into prims when this file was built with
+ * SSE/AVX intrinsics enabled; otherwise only logs and leaves prims untouched. */
+void primitives_init_shift_sse3_int(primitives_t* WINPR_RESTRICT prims)
+{
+#if !defined(SSE_AVX_INTRINSICS_ENABLED)
+	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
+	WINPR_UNUSED(prims);
+#else
+	/* The tuned routines call back into the generic table for small or
+	 * misaligned inputs, so it has to be resolved first. */
+	generic = primitives_get_generic();
+
+	WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
+
+	/* Source/destination variants. */
+	prims->lShiftC_16s = sse2_lShiftC_16s;
+	prims->lShiftC_16u = sse2_lShiftC_16u;
+	prims->rShiftC_16s = sse2_rShiftC_16s;
+	prims->rShiftC_16u = sse2_rShiftC_16u;
+
+	/* In-place variant. */
+	prims->lShiftC_16s_inplace = sse2_lShiftC_16s_inplace;
+#endif
+}
@@ -0,0 +1,188 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized sign operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#include "prim_sign.h"
+
+#include "prim_internal.h"
+#include "prim_avxsse.h"
+
+#if defined(SSE_AVX_INTRINSICS_ENABLED)
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+static primitives_t* generic = nullptr;
+
+/* ------------------------------------------------------------------------- */
+/* Writes the sign of every INT16 in pSrc to pDst: -1 for negative values,
+ * 1 for positive values and 0 for zero.
+ *
+ * pSrc  source samples
+ * pDst  destination samples
+ * ulen  number of INT16 elements
+ *
+ * Inputs shorter than 16 elements, or with a destination that is not 2-byte
+ * aligned, are delegated to the generic routine.  Otherwise a scalar prologue
+ * advances until the destination is 16-byte aligned, followed by 32-element
+ * blocks, 8-element blocks and a scalar tail.
+ *
+ * Returns PRIMITIVES_SUCCESS, or the status of the generic routine when it is
+ * used instead.
+ */
+static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
+                                UINT32 ulen)
+{
+	size_t len = ulen;
+	const INT16* sptr = pSrc;
+	INT16* dptr = pDst;
+
+	if (len < 16)
+		return generic->sign_16s(pSrc, pDst, ulen);
+
+	/* An odd destination address can never be stepped onto a 16-byte boundary
+	 * in INT16 increments. */
+	if ((ULONG_PTR)pDst & 0x01)
+		return generic->sign_16s(pSrc, pDst, ulen);
+
+	/* Seek 16-byte alignment of the destination, one element at a time. */
+	while ((ULONG_PTR)dptr & 0x0f)
+	{
+		const INT16 src = *sptr++;
+		*dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? (-1) : ((src > 0) ? 1 : 0));
+
+		if (--len == 0)
+			return PRIMITIVES_SUCCESS;
+	}
+
+	/* _mm_sign_epi16(ones, x) yields 1, 0 or -1 per lane depending on the sign of
+	 * x; the constant does not change between iterations. */
+	const __m128i ones = _mm_set1_epi16(0x0001U);
+
+	/* Do 32-short chunks, four registers of source data per iteration.
+	 * The source alignment is not inspected: every load goes through LOAD_SI128.
+	 * NOTE(review): this relies on LOAD_SI128 accepting unaligned addresses -
+	 * confirm in prim_avxsse.h. */
+	size_t count = len >> 5; /* / 32  */
+	len -= count << 5;       /* * 32 */
+
+	while (count--)
+	{
+		const __m128i in0 = LOAD_SI128(sptr);
+		sptr += 8;
+		const __m128i in1 = LOAD_SI128(sptr);
+		sptr += 8;
+		const __m128i in2 = LOAD_SI128(sptr);
+		sptr += 8;
+		const __m128i in3 = LOAD_SI128(sptr);
+		sptr += 8;
+
+		const __m128i out0 = _mm_sign_epi16(ones, in0);
+		const __m128i out1 = _mm_sign_epi16(ones, in1);
+		const __m128i out2 = _mm_sign_epi16(ones, in2);
+		const __m128i out3 = _mm_sign_epi16(ones, in3);
+
+		STORE_SI128(dptr, out0);
+		dptr += 8;
+		STORE_SI128(dptr, out1);
+		dptr += 8;
+		STORE_SI128(dptr, out2);
+		dptr += 8;
+		STORE_SI128(dptr, out3);
+		dptr += 8;
+	}
+
+	/* Do 8-short chunks, one register of source data per iteration. */
+	count = len >> 3;
+	len -= count << 3;
+
+	while (count--)
+	{
+		const __m128i in = LOAD_SI128(sptr);
+		sptr += 8;
+		STORE_SI128(dptr, _mm_sign_epi16(ones, in));
+		dptr += 8;
+	}
+
+	/* Do leftovers. */
+	while (len--)
+	{
+		const INT16 src = *sptr++;
+		*dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? -1 : ((src > 0) ? 1 : 0));
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+#endif /* SSE_AVX_INTRINSICS_ENABLED */
+
+/* ------------------------------------------------------------------------- */
+/* Installs the SSSE3 sign routine into prims when this file was built with
+ * SSE/AVX intrinsics enabled; otherwise only logs and leaves prims untouched. */
+void primitives_init_sign_ssse3_int(primitives_t* WINPR_RESTRICT prims)
+{
+#if !defined(SSE_AVX_INTRINSICS_ENABLED)
+	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSSE3/SSE3 intrinsics not available");
+	WINPR_UNUSED(prims);
+#else
+	/* ssse3_sign_16s delegates small or misaligned inputs to the generic table. */
+	generic = primitives_get_generic();
+
+	WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
+
+	/* Only sign_16s has a tuned implementation here. */
+	prims->sign_16s = ssse3_sign_16s;
+#endif
+}
@@ -0,0 +1,278 @@
+/* prim_templates.h
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.  Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ */
+
+#pragma once
+
+#include "prim_avxsse.h"
+
+/* These are prototypes for SSE (potentially NEON) routines that do a
+ * simple SSE operation over an array of data.  Since so much of this
+ * code is shared except for the operation itself, these prototypes are
+ * used rather than duplicating code.  The naming convention depends on
+ * the parameters:  S=Source param; C=Constant; D=Destination.
+ * All the macros have parameters for a fallback procedure if the data
+ * is too small and an operation "the slow way" for use at 16-byte edges.
+ */
+
+/* SSE3 note:  If someone needs to support an SSE2 version of these without
+ * SSE3 support, an alternative version could be added that merely checks
+ * that 16-byte alignment on both destination and source(s) can be
+ * achieved, rather than use LDDQU for unaligned reads.
+ */
+
+/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
+ * It cannot easily do that if the value is stored in a variable,
+ * so don't save it as an intermediate value.
+ */
+
+/* ----------------------------------------------------------------------------
+ * SCD = Source, Constant, Destination
+ *
+ * Expands to a static function
+ *   pstatus_t _name_(const _type_* pSrc, UINT32 val, _type_* pDst, UINT32 ulen)
+ * that applies _op_(register, (_op_type_)val) to ulen elements read from pSrc
+ * and stores the results to pDst.
+ *
+ * Macro parameters:
+ *   _name_      name of the generated function
+ *   _type_      element type; its size (1, 2, 4 or 8 bytes) selects the shift
+ *               counts that turn an element count into a chunk count
+ *   _fallback_  accepted, but not referenced anywhere in the body below
+ *   _op_        intrinsic called as _op_(__m128i, _op_type_)
+ *   _op_type_   type val is cast to before it is handed to _op_
+ *   _slowWay_   statement executed once per leftover element; the loop around
+ *               it does not move sptr/dptr, so the statement has to do that
+ *
+ * The generated function returns PRIMITIVES_SUCCESS without writing to pDst
+ * when val is 0 and returns -1 when val is 16 or larger.  Otherwise the data
+ * is handled in three stages: blocks of eight 16-byte registers, single
+ * 16-byte registers, and finally element by element through _slowWay_.
+ */
+#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _op_type_, _slowWay_) \
+	WINPR_ATTR_NODISCARD                                                         \
+	static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, UINT32 val,       \
+	                        _type_* WINPR_RESTRICT pDst, UINT32 ulen)            \
+	{                                                                            \
+		size_t len = ulen;                                                       \
+		INT32 shifts = 0;                                                        \
+		const _type_* sptr = pSrc;                                               \
+		_type_* dptr = pDst;                                                     \
+		if (val == 0)                                                            \
+			return PRIMITIVES_SUCCESS;                                           \
+		if (val >= 16)                                                           \
+			return -1;                                                           \
+		if (sizeof(_type_) == 1)                                                 \
+			shifts = 1;                                                          \
+		else if (sizeof(_type_) == 2)                                            \
+			shifts = 2;                                                          \
+		else if (sizeof(_type_) == 4)                                            \
+			shifts = 3;                                                          \
+		else if (sizeof(_type_) == 8)                                            \
+			shifts = 4;                                                          \
+		/* Use 8 128-bit SSE registers. */                                       \
+		size_t count = len >> (8 - shifts);                                      \
+		len -= count << (8 - shifts);                                            \
+                                                                                 \
+		while (count--)                                                          \
+		{                                                                        \
+			__m128i xmm0 = LOAD_SI128(sptr);                                     \
+			sptr += (16 / sizeof(_type_));                                       \
+			__m128i xmm1 = LOAD_SI128(sptr);                                     \
+			sptr += (16 / sizeof(_type_));                                       \
+			__m128i xmm2 = LOAD_SI128(sptr);                                     \
+			sptr += (16 / sizeof(_type_));                                       \
+			__m128i xmm3 = LOAD_SI128(sptr);                                     \
+			sptr += (16 / sizeof(_type_));                                       \
+			__m128i xmm4 = LOAD_SI128(sptr);                                     \
+			sptr += (16 / sizeof(_type_));                                       \
+			__m128i xmm5 = LOAD_SI128(sptr);                                     \
+			sptr += (16 / sizeof(_type_));                                       \
+			__m128i xmm6 = LOAD_SI128(sptr);                                     \
+			sptr += (16 / sizeof(_type_));                                       \
+			__m128i xmm7 = LOAD_SI128(sptr);                                     \
+			sptr += (16 / sizeof(_type_));                                       \
+			xmm0 = _op_(xmm0, (_op_type_)val);                                   \
+			xmm1 = _op_(xmm1, (_op_type_)val);                                   \
+			xmm2 = _op_(xmm2, (_op_type_)val);                                   \
+			xmm3 = _op_(xmm3, (_op_type_)val);                                   \
+			xmm4 = _op_(xmm4, (_op_type_)val);                                   \
+			xmm5 = _op_(xmm5, (_op_type_)val);                                   \
+			xmm6 = _op_(xmm6, (_op_type_)val);                                   \
+			xmm7 = _op_(xmm7, (_op_type_)val);                                   \
+			STORE_SI128(dptr, xmm0);                                             \
+			dptr += (16 / sizeof(_type_));                                       \
+			STORE_SI128(dptr, xmm1);                                             \
+			dptr += (16 / sizeof(_type_));                                       \
+			STORE_SI128(dptr, xmm2);                                             \
+			dptr += (16 / sizeof(_type_));                                       \
+			STORE_SI128(dptr, xmm3);                                             \
+			dptr += (16 / sizeof(_type_));                                       \
+			STORE_SI128(dptr, xmm4);                                             \
+			dptr += (16 / sizeof(_type_));                                       \
+			STORE_SI128(dptr, xmm5);                                             \
+			dptr += (16 / sizeof(_type_));                                       \
+			STORE_SI128(dptr, xmm6);                                             \
+			dptr += (16 / sizeof(_type_));                                       \
+			STORE_SI128(dptr, xmm7);                                             \
+			dptr += (16 / sizeof(_type_));                                       \
+		}                                                                        \
+                                                                                 \
+		/* Use a single 128-bit SSE register. */                                 \
+		count = len >> (5 - shifts);                                             \
+		len -= count << (5 - shifts);                                            \
+		while (count--)                                                          \
+		{                                                                        \
+			__m128i xmm0 = LOAD_SI128(sptr);                                     \
+			sptr += (16 / sizeof(_type_));                                       \
+			xmm0 = _op_(xmm0, (_op_type_)val);                                   \
+			STORE_SI128(dptr, xmm0);                                             \
+			dptr += (16 / sizeof(_type_));                                       \
+		}                                                                        \
+		/* Finish off the remainder. */                                          \
+		while (len--)                                                            \
+		{                                                                        \
+			_slowWay_;                                                           \
+		}                                                                        \
+		return PRIMITIVES_SUCCESS;                                               \
+	}
+
+/* ----------------------------------------------------------------------------
+ * SCD = Source, Constant, Destination
+ * PRE = preload xmm0 with the constant.
+ *
+ * Expands to a static function
+ *   pstatus_t _name_(const _type_* pSrc, _type_ val, _type_* pDst, INT32 ilen)
+ * that combines every 16-byte register of source data with a register built
+ * once from val, as _op_(data, xmm0), and stores the result to pDst.
+ *
+ * Macro parameters:
+ *   _name_      name of the generated function
+ *   _type_      element type; its size (1, 2, 4 or 8 bytes) selects the shift
+ *               counts that turn an element count into a chunk count
+ *   _fallback_  accepted, but not referenced anywhere in the body below
+ *   _op_        intrinsic called as _op_(__m128i, __m128i)
+ *   _slowWay_   statement executed once per leftover element; the loop around
+ *               it does not move sptr/dptr, so the statement has to do that
+ *
+ * ilen is converted to size_t through WINPR_ASSERTING_INT_CAST.  The data is
+ * handled in three stages: blocks of four 16-byte registers, single 16-byte
+ * registers, and finally element by element through _slowWay_.  The function
+ * always returns PRIMITIVES_SUCCESS.
+ *
+ * NOTE(review): the constant register is built with mm_set1_epu32(val), whose
+ * name suggests a 32-bit broadcast - confirm it is appropriate before
+ * instantiating this macro with an element type that is not 32 bits wide.
+ */
+#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_)  \
+	WINPR_ATTR_NODISCARD                                                   \
+	static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, _type_ val, \
+	                        _type_* WINPR_RESTRICT pDst, INT32 ilen)       \
+	{                                                                      \
+		size_t len = WINPR_ASSERTING_INT_CAST(size_t, ilen);               \
+		int shifts = 0;                                                    \
+		const _type_* sptr = pSrc;                                         \
+		_type_* dptr = pDst;                                               \
+		__m128i xmm0;                                                      \
+		if (sizeof(_type_) == 1)                                           \
+			shifts = 1;                                                    \
+		else if (sizeof(_type_) == 2)                                      \
+			shifts = 2;                                                    \
+		else if (sizeof(_type_) == 4)                                      \
+			shifts = 3;                                                    \
+		else if (sizeof(_type_) == 8)                                      \
+			shifts = 4;                                                    \
+		/* Use 4 128-bit SSE registers. */                                 \
+		size_t count = len >> (7 - shifts);                                \
+		len -= count << (7 - shifts);                                      \
+		xmm0 = mm_set1_epu32(val);                                         \
+		for (size_t x = 0; x < count; x++)                                 \
+		{                                                                  \
+			__m128i xmm1 = LOAD_SI128(sptr);                               \
+			sptr += (16 / sizeof(_type_));                                 \
+			__m128i xmm2 = LOAD_SI128(sptr);                               \
+			sptr += (16 / sizeof(_type_));                                 \
+			__m128i xmm3 = LOAD_SI128(sptr);                               \
+			sptr += (16 / sizeof(_type_));                                 \
+			__m128i xmm4 = LOAD_SI128(sptr);                               \
+			sptr += (16 / sizeof(_type_));                                 \
+			xmm1 = _op_(xmm1, xmm0);                                       \
+			xmm2 = _op_(xmm2, xmm0);                                       \
+			xmm3 = _op_(xmm3, xmm0);                                       \
+			xmm4 = _op_(xmm4, xmm0);                                       \
+			STORE_SI128(dptr, xmm1);                                       \
+			dptr += (16 / sizeof(_type_));                                 \
+			STORE_SI128(dptr, xmm2);                                       \
+			dptr += (16 / sizeof(_type_));                                 \
+			STORE_SI128(dptr, xmm3);                                       \
+			dptr += (16 / sizeof(_type_));                                 \
+			STORE_SI128(dptr, xmm4);                                       \
+			dptr += (16 / sizeof(_type_));                                 \
+		}                                                                  \
+		/* Use a single 128-bit SSE register. */                           \
+		count = len >> (5 - shifts);                                       \
+		len -= count << (5 - shifts);                                      \
+		for (size_t x = 0; x < count; x++)                                 \
+		{                                                                  \
+			__m128i xmm1 = LOAD_SI128(sptr);                               \
+			sptr += (16 / sizeof(_type_));                                 \
+			xmm1 = _op_(xmm1, xmm0);                                       \
+			STORE_SI128(dptr, xmm1);                                       \
+			dptr += (16 / sizeof(_type_));                                 \
+		}                                                                  \
+		/* Finish off the remainder. */                                    \
+		for (size_t x = 0; x < len; x++)                                   \
+		{                                                                  \
+			_slowWay_;                                                     \
+		}                                                                  \
+		return PRIMITIVES_SUCCESS;                                         \
+	}
+
+/* ----------------------------------------------------------------------------
+ * SSD = Source1, Source2, Destination
+ *
+ * Expands to a static function
+ *   pstatus_t _name_(const _type_* pSrc1, const _type_* pSrc2, _type_* pDst,
+ *                    UINT32 ulen)
+ * that combines matching 16-byte registers of both sources as
+ * _op_(src1, src2) and stores the result to pDst.
+ *
+ * Macro parameters:
+ *   _name_      name of the generated function
+ *   _type_      element type; its size (1, 2, 4 or 8 bytes) selects the shift
+ *               counts that turn an element count into a chunk count
+ *   _fallback_  accepted, but not referenced anywhere in the body below
+ *   _op_        intrinsic called as _op_(__m128i, __m128i)
+ *   _slowWay_   expression evaluated once per leftover element; its value is
+ *               stored in a pstatus_t and anything other than
+ *               PRIMITIVES_SUCCESS is returned to the caller immediately.
+ *               The loop around it does not move sptr1/sptr2/dptr, so the
+ *               expression has to do that.
+ *
+ * The data is handled in three stages: blocks of four 16-byte registers per
+ * source, single 16-byte registers, and finally element by element through
+ * _slowWay_.
+ */
+#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_)                        \
+	WINPR_ATTR_NODISCARD                                                                     \
+	static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc1,                              \
+	                        const _type_* WINPR_RESTRICT pSrc2, _type_* WINPR_RESTRICT pDst, \
+	                        UINT32 ulen)                                                     \
+	{                                                                                        \
+		size_t len = ulen;                                                                   \
+		int shifts = 0;                                                                      \
+		const _type_* sptr1 = pSrc1;                                                         \
+		const _type_* sptr2 = pSrc2;                                                         \
+		_type_* dptr = pDst;                                                                 \
+		size_t count;                                                                        \
+		if (sizeof(_type_) == 1)                                                             \
+			shifts = 1;                                                                      \
+		else if (sizeof(_type_) == 2)                                                        \
+			shifts = 2;                                                                      \
+		else if (sizeof(_type_) == 4)                                                        \
+			shifts = 3;                                                                      \
+		else if (sizeof(_type_) == 8)                                                        \
+			shifts = 4;                                                                      \
+		/* Use 4 128-bit SSE registers. */                                                   \
+		count = len >> (7 - shifts);                                                         \
+		len -= count << (7 - shifts);                                                        \
+		/* Aligned loads */                                                                  \
+		while (count--)                                                                      \
+		{                                                                                    \
+			__m128i xmm0 = LOAD_SI128(sptr1);                                                \
+			sptr1 += (16 / sizeof(_type_));                                                  \
+			__m128i xmm1 = LOAD_SI128(sptr1);                                                \
+			sptr1 += (16 / sizeof(_type_));                                                  \
+			__m128i xmm2 = LOAD_SI128(sptr1);                                                \
+			sptr1 += (16 / sizeof(_type_));                                                  \
+			__m128i xmm3 = LOAD_SI128(sptr1);                                                \
+			sptr1 += (16 / sizeof(_type_));                                                  \
+			__m128i xmm4 = LOAD_SI128(sptr2);                                                \
+			sptr2 += (16 / sizeof(_type_));                                                  \
+			__m128i xmm5 = LOAD_SI128(sptr2);                                                \
+			sptr2 += (16 / sizeof(_type_));                                                  \
+			__m128i xmm6 = LOAD_SI128(sptr2);                                                \
+			sptr2 += (16 / sizeof(_type_));                                                  \
+			__m128i xmm7 = LOAD_SI128(sptr2);                                                \
+			sptr2 += (16 / sizeof(_type_));                                                  \
+			xmm0 = _op_(xmm0, xmm4);                                                         \
+			xmm1 = _op_(xmm1, xmm5);                                                         \
+			xmm2 = _op_(xmm2, xmm6);                                                         \
+			xmm3 = _op_(xmm3, xmm7);                                                         \
+			STORE_SI128(dptr, xmm0);                                                         \
+			dptr += (16 / sizeof(_type_));                                                   \
+			STORE_SI128(dptr, xmm1);                                                         \
+			dptr += (16 / sizeof(_type_));                                                   \
+			STORE_SI128(dptr, xmm2);                                                         \
+			dptr += (16 / sizeof(_type_));                                                   \
+			STORE_SI128(dptr, xmm3);                                                         \
+			dptr += (16 / sizeof(_type_));                                                   \
+		}                                                                                    \
+		/* Use a single 128-bit SSE register. */                                             \
+		count = len >> (5 - shifts);                                                         \
+		len -= count << (5 - shifts);                                                        \
+		while (count--)                                                                      \
+		{                                                                                    \
+			__m128i xmm0 = LOAD_SI128(sptr1);                                                \
+			sptr1 += (16 / sizeof(_type_));                                                  \
+			__m128i xmm1 = LOAD_SI128(sptr2);                                                \
+			sptr2 += (16 / sizeof(_type_));                                                  \
+			xmm0 = _op_(xmm0, xmm1);                                                         \
+			STORE_SI128(dptr, xmm0);                                                         \
+			dptr += (16 / sizeof(_type_));                                                   \
+		}                                                                                    \
+		/* Finish off the remainder. */                                                      \
+		while (len--)                                                                        \
+		{                                                                                    \
+			const pstatus_t rc = _slowWay_;                                                  \
+			if (rc != PRIMITIVES_SUCCESS)                                                    \
+				return rc;                                                                   \
+		}                                                                                    \
+		return PRIMITIVES_SUCCESS;                                                           \
+	}
@@ -0,0 +1,39 @@
+# Unit tests for the primitives library: one driver executable, one CTest entry
+# per test source file.
+set(MODULE_NAME "TestPrimitives")
+set(MODULE_PREFIX "TEST_FREERDP_PRIMITIVES")
+
+disable_warnings_for_directory(${CMAKE_CURRENT_BINARY_DIR})
+
+set(${MODULE_PREFIX}_DRIVER ${MODULE_NAME}.c)
+
+# Each file listed here becomes a test named after the file (without extension).
+set(${MODULE_PREFIX}_TESTS
+    TestPrimitivesAdd.c
+    TestPrimitivesAlphaComp.c
+    TestPrimitivesAndOr.c
+    TestPrimitivesColors.c
+    TestPrimitivesCopy.c
+    TestPrimitivesSet.c
+    TestPrimitivesShift.c
+    TestPrimitivesSign.c
+    TestPrimitivesYUV.c
+    TestPrimitivesYCbCr.c
+    TestPrimitivesYCoCg.c
+)
+
+# Generate the driver source and collect it together with the test sources.
+create_test_sourcelist(${MODULE_PREFIX}_SRCS ${${MODULE_PREFIX}_DRIVER} ${${MODULE_PREFIX}_TESTS})
+
+# Helpers shared by all tests; they are not tests themselves.
+set(${MODULE_PREFIX}_EXTRA_SRCS prim_test.c prim_test.h measure.h)
+
+add_executable(${MODULE_NAME} ${${MODULE_PREFIX}_SRCS} ${${MODULE_PREFIX}_EXTRA_SRCS})
+
+list(APPEND ${MODULE_PREFIX}_LIBS winpr freerdp)
+target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS})
+
+set_target_properties(
+  ${MODULE_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${TESTING_OUTPUT_DIRECTORY}" FOLDER "FreeRDP/Test"
+)
+
+# Register every test; the driver receives the test name as its only argument.
+foreach(test_source IN LISTS ${MODULE_PREFIX}_TESTS)
+  get_filename_component(test_name ${test_source} NAME_WE)
+  add_test(${test_name} ${TESTING_OUTPUT_DIRECTORY}/${MODULE_NAME} ${test_name})
+endforeach()
@@ -0,0 +1,80 @@
+/* test_add.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include "prim_test.h"
+
+#define FUNC_TEST_SIZE 65536
+/* ========================================================================= */
+/* Functional test for add_16s: runs the generic and the optimized
+ * implementation on the same random input and verifies that both succeed and
+ * produce identical output.
+ *
+ * Returns TRUE on success, FALSE on any failure or mismatch. */
+static BOOL test_add16s_func(void)
+{
+	pstatus_t status = 0;
+
+	INT16 src1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	INT16 src2[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	INT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	INT16 d2[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+
+	if (winpr_RAND(src1, sizeof(src1)) < 0)
+		return FALSE;
+	if (winpr_RAND(src2, sizeof(src2)) < 0)
+		return FALSE;
+	status = generic->add_16s(src1 + 1, src2 + 1, d1 + 1, FUNC_TEST_SIZE);
+	if (status != PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	/* Unaligned: destination is shifted one element further than the generic run */
+	status = optimized->add_16s(src1 + 1, src2 + 1, d2 + 2, FUNC_TEST_SIZE);
+	if (status != PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	/* Both implementations saw the same input, so the outputs must match
+	 * element by element (accounting for the different destination offsets). */
+	for (size_t i = 0; i < FUNC_TEST_SIZE; ++i)
+	{
+		if (d1[i + 1] != d2[i + 2])
+		{
+			printf("add16s FAIL[%" PRIuz "]: generic=%d optimized=%d\n", i, (int)d1[i + 1],
+			       (int)d2[i + 2]);
+			return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Benchmark for add_16s (generic vs. optimized).
+ *
+ * The buffers are INT16 arrays sized from FUNC_TEST_SIZE because that is the
+ * element count handed to the primitive; sizing them in bytes from an
+ * unrelated constant would let the primitive run past the end of the arrays.
+ *
+ * Returns TRUE when performance testing is disabled or the benchmark
+ * succeeded, FALSE otherwise. */
+static BOOL test_add16s_speed(void)
+{
+	INT16 src1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	INT16 src2[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	INT16 dst[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+
+	if (!g_TestPrimitivesPerformance)
+		return TRUE;
+
+	if (winpr_RAND(src1, sizeof(src1)) < 0)
+		return FALSE;
+	if (winpr_RAND(src2, sizeof(src2)) < 0)
+		return FALSE;
+
+	return (speed_test("add16s", "aligned", g_Iterations, (speed_test_fkt)generic->add_16s,
+	                   (speed_test_fkt)optimized->add_16s, src1, src2, dst, FUNC_TEST_SIZE));
+}
+
+/* CTest entry point for the add primitives.
+ * Returns 0 on success and -1 if the functional test or (when enabled) the
+ * benchmark fails. The command line arguments are not used. */
+int TestPrimitivesAdd(int argc, char* argv[])
+{
+	WINPR_UNUSED(argc);
+	WINPR_UNUSED(argv);
+
+	prim_test_setup(FALSE);
+
+	const BOOL functionalOk = test_add16s_func();
+	if (!functionalOk)
+		return -1;
+
+	/* The benchmark only runs when performance testing was requested. */
+	if (g_TestPrimitivesPerformance && !test_add16s_speed())
+		return -1;
+
+	return 0;
+}
@@ -0,0 +1,203 @@
+/* test_alphaComp.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+
+#include "prim_test.h"
+
+#define MAX_BLOCK_SIZE 256
+#define SIZE_SQUARED (MAX_BLOCK_SIZE * MAX_BLOCK_SIZE)
+
+/* ========================================================================= */
+#define ALF(_c_) (((_c_)&0xFF000000U) >> 24)
+#define RED(_c_) (((_c_)&0x00FF0000U) >> 16)
+#define GRN(_c_) (((_c_)&0x0000FF00U) >> 8)
+#define BLU(_c_) ((_c_)&0x000000FFU)
+#define TOLERANCE 1
+/* Returns a pointer to the 32 bit pixel at column _x_, row _y_ of an image
+ * starting at _addr_ whose rows are _bytes_ bytes apart. Offsets are computed
+ * in 64 bit arithmetic. */
+static inline const UINT32* PIXEL(const BYTE* _addr_, UINT32 _bytes_, UINT32 _x_, UINT32 _y_)
+{
+	const unsigned long long columnOffset = 1ULL * _x_ * sizeof(UINT32);
+	const unsigned long long rowOffset = 1ULL * _y_ * _bytes_;
+	return (const UINT32*)(_addr_ + columnOffset + rowOffset);
+}
+
+#define SRC1_WIDTH 6
+#define SRC1_HEIGHT 6
+#define SRC2_WIDTH 7
+#define SRC2_HEIGHT 7
+#define DST_WIDTH 9
+#define DST_HEIGHT 9
+#define TEST_WIDTH 4
+#define TEST_HEIGHT 5
+
+/* ------------------------------------------------------------------------- */
+/* Reference blend used to validate alphaComp_argb: every channel of the
+ * result is (a1 * channel1 + (255 - a1) * channel2) / 255, where a1 is the
+ * alpha of c1. Note that the alpha channel itself is blended as a1 * a1. */
+static UINT32 alpha_add(UINT32 c1, UINT32 c2)
+{
+	const UINT32 alpha = ALF(c1);
+	const UINT32 inverse = 255 - alpha;
+
+	const UINT32 outA = ((alpha * alpha + inverse * ALF(c2)) / 255) & 0xff;
+	const UINT32 outR = ((alpha * RED(c1) + inverse * RED(c2)) / 255) & 0xff;
+	const UINT32 outG = ((alpha * GRN(c1) + inverse * GRN(c2)) / 255) & 0xff;
+	const UINT32 outB = ((alpha * BLU(c1) + inverse * BLU(c2)) / 255) & 0xff;
+
+	return (outA << 24) | (outR << 16) | (outG << 8) | outB;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Returns the largest absolute per-channel difference (alpha, red, green,
+ * blue) between the two packed colors. */
+static UINT32 colordist(UINT32 c1, UINT32 c2)
+{
+	const int deltas[4] = { ABS((INT32)(ALF(c1) - ALF(c2))), ABS((INT32)(RED(c1) - RED(c2))),
+		                    ABS((INT32)(GRN(c1) - GRN(c2))), ABS((INT32)(BLU(c1) - BLU(c2))) };
+	int largest = 0;
+
+	for (size_t i = 0; i < sizeof(deltas) / sizeof(deltas[0]); ++i)
+	{
+		if (deltas[i] > largest)
+			largest = deltas[i];
+	}
+
+	return (UINT32)largest;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Verifies a width x height region of pDst against the reference blend of
+ * pSrc1 and pSrc2. Each *Step is the distance between rows in bytes.
+ * Prints the first pixel whose channel distance exceeds TOLERANCE and returns
+ * FALSE; returns TRUE when every pixel is within tolerance. */
+static BOOL check(const BYTE* pSrc1, UINT32 src1Step, const BYTE* pSrc2, UINT32 src2Step,
+                  BYTE* pDst, UINT32 dstStep, UINT32 width, UINT32 height)
+{
+	for (UINT32 row = 0; row < height; ++row)
+	{
+		for (UINT32 col = 0; col < width; ++col)
+		{
+			const UINT32 first = *PIXEL(pSrc1, src1Step, col, row);
+			const UINT32 second = *PIXEL(pSrc2, src2Step, col, row);
+			const UINT32 expected = alpha_add(first, second);
+			const UINT32 actual = *PIXEL(pDst, dstStep, col, row);
+
+			if (colordist(expected, actual) <= TOLERANCE)
+				continue;
+
+			printf("alphaComp-general: [%" PRIu32 ",%" PRIu32 "] 0x%08" PRIx32 "+0x%08" PRIx32
+			       "=0x%08" PRIx32 ", got 0x%08" PRIx32 "\n",
+			       col, row, first, second, expected, actual);
+			return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+/* Functional test for alphaComp_argb: blends random source images with the
+ * generic and the optimized implementation and validates each result against
+ * the reference blend in check().
+ *
+ * The images are stored as UINT32 pixels so that the alpha channel can be
+ * edited per pixel (masking a BYTE element with a 32 bit constant has no
+ * effect) and so that the buffers are suitably aligned for 32 bit access.
+ * Each implementation writes to its own zero-initialized destination, so a
+ * stale result from the other run can not hide a failure.
+ *
+ * Returns TRUE on success, FALSE otherwise. */
+static BOOL test_alphaComp_func(void)
+{
+	pstatus_t status = 0;
+	UINT32 src1[SRC1_WIDTH * SRC1_HEIGHT] = WINPR_C_ARRAY_INIT;
+	UINT32 src2[SRC2_WIDTH * SRC2_HEIGHT] = WINPR_C_ARRAY_INIT;
+	UINT32 dstGeneric[DST_WIDTH * DST_HEIGHT] = WINPR_C_ARRAY_INIT;
+	UINT32 dstOptimized[DST_WIDTH * DST_HEIGHT] = WINPR_C_ARRAY_INIT;
+
+	if (winpr_RAND(src1, sizeof(src1)) < 0)
+		return FALSE;
+	/* Special-case the first two pixels: fully transparent and fully opaque */
+	src1[0] &= 0x00FFFFFFU;
+	src1[1] |= 0xFF000000U;
+	if (winpr_RAND(src2, sizeof(src2)) < 0)
+		return FALSE;
+
+	/* Set the second operand to fully-opaque. */
+	for (size_t i = 0; i < sizeof(src2) / sizeof(src2[0]); ++i)
+		src2[i] |= 0xFF000000U;
+
+	status = generic->alphaComp_argb((const BYTE*)src1, 4 * SRC1_WIDTH, (const BYTE*)src2,
+	                                 4 * SRC2_WIDTH, (BYTE*)dstGeneric, 4 * DST_WIDTH,
+	                                 TEST_WIDTH, TEST_HEIGHT);
+
+	if (status != PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	if (!check((const BYTE*)src1, 4 * SRC1_WIDTH, (const BYTE*)src2, 4 * SRC2_WIDTH,
+	           (BYTE*)dstGeneric, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT))
+		return FALSE;
+
+	status = optimized->alphaComp_argb((const BYTE*)src1, 4 * SRC1_WIDTH, (const BYTE*)src2,
+	                                   4 * SRC2_WIDTH, (BYTE*)dstOptimized, 4 * DST_WIDTH,
+	                                   TEST_WIDTH, TEST_HEIGHT);
+
+	if (status != PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	if (!check((const BYTE*)src1, 4 * SRC1_WIDTH, (const BYTE*)src2, 4 * SRC2_WIDTH,
+	           (BYTE*)dstOptimized, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT))
+		return FALSE;
+
+	return TRUE;
+}
+
+/* Benchmark for alphaComp_argb (generic vs. optimized).
+ *
+ * The images hold one UINT32 per pixel, which matches the 4 * WIDTH strides
+ * handed to the primitive; byte arrays of WIDTH * HEIGHT elements would be
+ * four times too small for those strides.
+ *
+ * Returns non-zero on success and 0 (FALSE) on failure, which is what the
+ * caller's `!test_alphaComp_speed()` check expects. */
+static int test_alphaComp_speed(void)
+{
+	UINT32 src1[SRC1_WIDTH * SRC1_HEIGHT] = WINPR_C_ARRAY_INIT;
+	UINT32 src2[SRC2_WIDTH * SRC2_HEIGHT] = WINPR_C_ARRAY_INIT;
+	UINT32 dst1[DST_WIDTH * DST_HEIGHT] = WINPR_C_ARRAY_INIT;
+
+	if (winpr_RAND(src1, sizeof(src1)) < 0)
+		return FALSE;
+	/* Special-case the first two pixels: fully transparent and fully opaque */
+	src1[0] &= 0x00FFFFFFU;
+	src1[1] |= 0xFF000000U;
+	if (winpr_RAND(src2, sizeof(src2)) < 0)
+		return FALSE;
+
+	/* Set the second operand to fully-opaque. */
+	for (size_t i = 0; i < sizeof(src2) / sizeof(src2[0]); ++i)
+		src2[i] |= 0xFF000000U;
+
+	return (speed_test("alphaComp_argb", "aligned", g_Iterations,
+	                   (speed_test_fkt)generic->alphaComp_argb,
+	                   (speed_test_fkt)optimized->alphaComp_argb, (const BYTE*)src1,
+	                   4 * SRC1_WIDTH, (const BYTE*)src2, 4 * SRC2_WIDTH, (BYTE*)dst1,
+	                   4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT));
+}
+
+/* CTest entry point for the alpha composition primitive.
+ * Returns 0 on success, -1 when the functional test or the optional benchmark
+ * reports a failure. The command line arguments are not used. */
+int TestPrimitivesAlphaComp(int argc, char* argv[])
+{
+	WINPR_UNUSED(argc);
+	WINPR_UNUSED(argv);
+
+	prim_test_setup(FALSE);
+
+	const BOOL functionalOk = test_alphaComp_func();
+	if (!functionalOk)
+		return -1;
+
+	/* Benchmarks are opt-in via g_TestPrimitivesPerformance. */
+	if (g_TestPrimitivesPerformance && !test_alphaComp_speed())
+		return -1;
+
+	return 0;
+}
@@ -0,0 +1,171 @@
+/* test_andor.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+
+#include "prim_test.h"
+
+#define FUNC_TEST_SIZE 65536
+
+#define VALUE (0xA5A5A5A5U)
+
+/* ========================================================================= */
+/* Runs one andC_32u implementation (fkt) over size elements and verifies
+ * dst[i] == (src[i] & val) for each of them.
+ * name is only used to label the diagnostic of the first mismatch.
+ * Returns TRUE when the primitive succeeded and every element matches. */
+static BOOL test_and_32u_impl(const char* name, fn_andC_32u_t fkt, const UINT32* src,
+                              const UINT32 val, UINT32* dst, size_t size)
+{
+	if (fkt(src, val, dst, WINPR_ASSERTING_INT_CAST(int32_t, size)) != PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	size_t idx = 0;
+	while (idx < size)
+	{
+		const UINT32 expected = src[idx] & val;
+
+		if (dst[idx] != expected)
+		{
+			printf("AND %s FAIL[%" PRIuz "] 0x%08" PRIx32 "&0x%08" PRIx32 "=0x%08" PRIx32
+			       ", got 0x%08" PRIx32 "\n",
+			       name, idx, src[idx], val, expected, dst[idx]);
+			return FALSE;
+		}
+
+		++idx;
+	}
+
+	return TRUE;
+}
+
+/* Functional test for andC_32u: exercises the generic and the optimized
+ * implementation, each with two different destination offsets, on the same
+ * random source data. Returns TRUE only if all four runs pass. */
+static BOOL test_and_32u_func(void)
+{
+	UINT32 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	UINT32 dst[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+
+	if (winpr_RAND(src, sizeof(src)) < 0)
+		return FALSE;
+
+	/* Executed in table order; the first failing case aborts the test. */
+	const struct
+	{
+		const char* label;
+		fn_andC_32u_t impl;
+		size_t dstOffset;
+	} cases[] = { { "generic->andC_32u aligned", generic->andC_32u, 1 },
+		          { "generic->andC_32u unaligned", generic->andC_32u, 2 },
+		          { "optimized->andC_32u aligned", optimized->andC_32u, 1 },
+		          { "optimized->andC_32u unaligned", optimized->andC_32u, 2 } };
+
+	for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); ++i)
+	{
+		if (!test_and_32u_impl(cases[i].label, cases[i].impl, src + 1, VALUE,
+		                       dst + cases[i].dstOffset, FUNC_TEST_SIZE))
+			return FALSE;
+	}
+
+	return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Benchmark for andC_32u (generic vs. optimized), once per destination
+ * offset. Returns FALSE as soon as one benchmark run fails. */
+static BOOL test_and_32u_speed(void)
+{
+	UINT32 src[MAX_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	UINT32 dst[MAX_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	const char* const labels[] = { "aligned", "unaligned" };
+
+	if (winpr_RAND(src, sizeof(src)) < 0)
+		return FALSE;
+
+	/* Run 0 writes to dst + 1, run 1 to dst + 2. */
+	for (size_t run = 0; run < sizeof(labels) / sizeof(labels[0]); ++run)
+	{
+		if (!speed_test("andC_32u", labels[run], g_Iterations,
+		                (speed_test_fkt)generic->andC_32u, (speed_test_fkt)optimized->andC_32u,
+		                src + 1, VALUE, dst + 1 + run, MAX_TEST_SIZE))
+			return FALSE;
+	}
+
+	return TRUE;
+}
+
+/* ========================================================================= */
+/* Verifies dst[i] == (src[i] | value) for size elements.
+ * Prints the first mismatch (using the `|` operator the check actually
+ * applies) and returns FALSE; returns TRUE when all elements match. */
+static BOOL check(const UINT32* src, const UINT32* dst, UINT32 size, UINT32 value)
+{
+	for (UINT32 i = 0; i < size; ++i)
+	{
+		if (dst[i] != (src[i] | value))
+		{
+			printf("OR FAIL[%" PRIu32 "] 0x%08" PRIx32 "|0x%08" PRIx32 "=0x%08" PRIx32
+			       ", got 0x%08" PRIx32 "\n",
+			       i, src[i], value, src[i] | value, dst[i]);
+			return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+/* Functional test for orC_32u: runs the generic and the optimized
+ * implementation on the same random input and validates both results.
+ *
+ * The destination is cleared between the two runs; otherwise the result
+ * left behind by the generic run would make the second check pass even if
+ * the optimized implementation wrote nothing.
+ *
+ * Returns TRUE on success, FALSE otherwise. */
+static BOOL test_or_32u_func(void)
+{
+	pstatus_t status = 0;
+	UINT32 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	UINT32 dst[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+
+	if (winpr_RAND(src, sizeof(src)) < 0)
+		return FALSE;
+
+	status = generic->orC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE);
+	if (status != PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	if (!check(src + 1, dst + 1, FUNC_TEST_SIZE, VALUE))
+		return FALSE;
+
+	/* VALUE is non-zero, so a zeroed element can never equal (src | VALUE). */
+	for (size_t i = 0; i < sizeof(dst) / sizeof(dst[0]); ++i)
+		dst[i] = 0;
+
+	status = optimized->orC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE);
+	if (status != PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	if (!check(src + 1, dst + 1, FUNC_TEST_SIZE, VALUE))
+		return FALSE;
+
+	return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Benchmark for orC_32u (generic vs. optimized); the run is labelled with
+ * the primitive's own name so the report is attributed correctly.
+ * Returns the result of speed_test, or FALSE if no random input could be
+ * generated. */
+static BOOL test_or_32u_speed(void)
+{
+	UINT32 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	UINT32 dst[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+
+	if (winpr_RAND(src, sizeof(src)) < 0)
+		return FALSE;
+
+	return (speed_test("orC_32u", "aligned", g_Iterations, (speed_test_fkt)generic->orC_32u,
+	                   (speed_test_fkt)optimized->orC_32u, src + 1, VALUE, dst + 1,
+	                   FUNC_TEST_SIZE));
+}
+
+/* CTest entry point for the AND/OR-with-constant primitives.
+ * Runs both functional tests and, when performance testing is enabled, both
+ * benchmarks. Returns 0 on success and -1 on the first failure. */
+int TestPrimitivesAndOr(int argc, char* argv[])
+{
+	WINPR_UNUSED(argc);
+	WINPR_UNUSED(argv);
+
+	prim_test_setup(FALSE);
+
+	if (!test_and_32u_func() || !test_or_32u_func())
+		return -1;
+
+	if (!g_TestPrimitivesPerformance)
+		return 0;
+
+	/* Short-circuit keeps the original order: AND benchmark first, then OR. */
+	if (!test_and_32u_speed() || !test_or_32u_speed())
+		return -1;
+
+	return 0;
+}
@@ -0,0 +1,291 @@
+/* test_colors.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include <freerdp/utils/profiler.h>
+
+#include "prim_test.h"
+
+/* ------------------------------------------------------------------------- */
+/* Functional test for RGBToRGB_16s8u_P3AC4R: converts random planar 16 bit
+ * R/G/B data to the packed 32 bit format DstFormat with the generic and the
+ * optimized implementation and compares the two outputs pixel by pixel.
+ *
+ * roi       - width and height of the test image in pixels
+ * DstFormat - packed destination pixel format
+ *
+ * Returns TRUE only if all allocations, the random fill and both conversions
+ * succeeded and no pixel differs; every early exit reports FALSE. */
+static BOOL test_RGBToRGB_16s8u_P3AC4R_func(prim_size_t roi, DWORD DstFormat)
+{
+	INT16* r = nullptr;
+	INT16* g = nullptr;
+	INT16* b = nullptr;
+	BYTE* out1 = nullptr;
+	BYTE* out2 = nullptr;
+	BOOL rc = FALSE;
+	BOOL mismatch = FALSE;
+	const INT16* ptrs[3];
+	const UINT32 rgbStride = roi.width * 2;
+	const UINT32 dstStride = roi.width * 4;
+	PROFILER_DEFINE(genericProf)
+	PROFILER_DEFINE(optProf)
+	PROFILER_CREATE(genericProf, "RGBToRGB_16s8u_P3AC4R-GENERIC")
+	PROFILER_CREATE(optProf, "RGBToRGB_16s8u_P3AC4R-OPTIMIZED")
+	r = winpr_aligned_calloc(1, 1ULL * rgbStride * roi.height, 16);
+	g = winpr_aligned_calloc(1, 1ULL * rgbStride * roi.height, 16);
+	b = winpr_aligned_calloc(1, 1ULL * rgbStride * roi.height, 16);
+	out1 = winpr_aligned_calloc(1, 1ULL * dstStride * roi.height, 16);
+	out2 = winpr_aligned_calloc(1, 1ULL * dstStride * roi.height, 16);
+
+	if (!r || !g || !b || !out1 || !out2)
+		goto fail;
+
+	if (winpr_RAND(r, 1ULL * rgbStride * roi.height) < 0)
+		goto fail;
+	if (winpr_RAND(g, 1ULL * rgbStride * roi.height) < 0)
+		goto fail;
+	if (winpr_RAND(b, 1ULL * rgbStride * roi.height) < 0)
+		goto fail;
+	ptrs[0] = r;
+	ptrs[1] = g;
+	ptrs[2] = b;
+	PROFILER_ENTER(genericProf)
+
+	if (generic->RGBToRGB_16s8u_P3AC4R(ptrs, rgbStride, out1, dstStride, DstFormat, &roi) !=
+	    PRIMITIVES_SUCCESS)
+		goto fail;
+
+	PROFILER_EXIT(genericProf)
+	PROFILER_ENTER(optProf)
+
+	if (optimized->RGBToRGB_16s8u_P3AC4R(ptrs, rgbStride, out2, dstStride, DstFormat, &roi) !=
+	    PRIMITIVES_SUCCESS)
+		goto fail;
+
+	PROFILER_EXIT(optProf)
+
+	/* Cheap whole-buffer comparison first; only on a difference walk the
+	 * pixels and report each one whose decoded color value differs. */
+	if (memcmp(out1, out2, 1ULL * dstStride * roi.height) != 0)
+	{
+		for (UINT64 i = 0; i < 1ull * roi.width * roi.height; ++i)
+		{
+			const UINT32 o1 = FreeRDPReadColor(out1 + 4 * i, DstFormat);
+			const UINT32 o2 = FreeRDPReadColor(out2 + 4 * i, DstFormat);
+
+			if (o1 != o2)
+			{
+				/* Report the pixel values that were actually compared. */
+				printf("RGBToRGB_16s8u_P3AC4R FAIL: out1[%" PRIu64 "]=0x%08" PRIx32
+				       " out2[%" PRIu64 "]=0x%08" PRIx32 "\n",
+				       i, o1, i, o2);
+				mismatch = TRUE;
+			}
+		}
+	}
+
+	printf("Results for %" PRIu32 "x%" PRIu32 " [%s]\n", roi.width, roi.height,
+	       FreeRDPGetColorFormatName(DstFormat));
+	PROFILER_PRINT_HEADER
+	PROFILER_PRINT(genericProf)
+	PROFILER_PRINT(optProf)
+	PROFILER_PRINT_FOOTER
+	rc = !mismatch;
+fail:
+	PROFILER_FREE(genericProf)
+	PROFILER_FREE(optProf)
+	winpr_aligned_free(r);
+	winpr_aligned_free(g);
+	winpr_aligned_free(b);
+	winpr_aligned_free(out1);
+	winpr_aligned_free(out2);
+	return rc;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Benchmark for RGBToRGB_16s8u_P3AC4R on a 64x64 image, once with the
+ * destination at the start of dst and once shifted by one byte.
+ *
+ * The planes are read starting at element 1, so the element at index 4096 is
+ * part of the input; the mask loop therefore covers the whole arrays.
+ *
+ * Returns TRUE if both benchmark runs succeed, FALSE otherwise. */
+static BOOL test_RGBToRGB_16s8u_P3AC4R_speed(void)
+{
+	/* Converts INT16** to const INT16** without a qualifier-dropping cast. */
+	union
+	{
+		const INT16** cpv;
+		INT16** pv;
+	} cnv;
+	const prim_size_t roi64x64 = { 64, 64 };
+	INT16 r[4096 + 1] = WINPR_C_ARRAY_INIT;
+	INT16 g[4096 + 1] = WINPR_C_ARRAY_INIT;
+	INT16 b[4096 + 1] = WINPR_C_ARRAY_INIT;
+	UINT32 dst[4096 + 1] = WINPR_C_ARRAY_INIT;
+	INT16* ptrs[3] = WINPR_C_ARRAY_INIT;
+	if (winpr_RAND(r, sizeof(r)) < 0)
+		return FALSE;
+	if (winpr_RAND(g, sizeof(g)) < 0)
+		return FALSE;
+	if (winpr_RAND(b, sizeof(b)) < 0)
+		return FALSE;
+
+	/* clear upper bytes of every element, including the last one */
+	for (size_t i = 0; i < sizeof(r) / sizeof(r[0]); ++i)
+	{
+		r[i] &= 0x00FFU;
+		g[i] &= 0x00FFU;
+		b[i] &= 0x00FFU;
+	}
+
+	ptrs[0] = r + 1;
+	ptrs[1] = g + 1;
+	ptrs[2] = b + 1;
+
+	cnv.pv = ptrs;
+	if (!speed_test("RGBToRGB_16s8u_P3AC4R", "aligned", g_Iterations,
+	                (speed_test_fkt)generic->RGBToRGB_16s8u_P3AC4R,
+	                (speed_test_fkt)optimized->RGBToRGB_16s8u_P3AC4R, cnv.cpv, 64 * 2, (BYTE*)dst,
+	                64 * 4, &roi64x64))
+		return FALSE;
+
+	if (!speed_test("RGBToRGB_16s8u_P3AC4R", "unaligned", g_Iterations,
+	                (speed_test_fkt)generic->RGBToRGB_16s8u_P3AC4R,
+	                (speed_test_fkt)optimized->RGBToRGB_16s8u_P3AC4R, cnv.cpv, 64 * 2,
+	                ((BYTE*)dst) + 1, 64 * 4, &roi64x64))
+		return FALSE;
+
+	return TRUE;
+}
+
+/* ========================================================================= */
+/* Functional test for yCbCrToRGB_16s16s_P3P3 on a 64x64 image: converts the
+ * same random planar input with the generic and the optimized implementation
+ * and accepts a difference of at most 1 per channel value.
+ * Returns TRUE on success, FALSE on any failure or larger deviation. */
+static BOOL test_yCbCrToRGB_16s16s_P3P3_func(void)
+{
+	INT16 y[4096] = WINPR_C_ARRAY_INIT;
+	INT16 cb[4096] = WINPR_C_ARRAY_INIT;
+	INT16 cr[4096] = WINPR_C_ARRAY_INIT;
+	INT16 r1[4096] = WINPR_C_ARRAY_INIT;
+	INT16 g1[4096] = WINPR_C_ARRAY_INIT;
+	INT16 b1[4096] = WINPR_C_ARRAY_INIT;
+	INT16 r2[4096] = WINPR_C_ARRAY_INIT;
+	INT16 g2[4096] = WINPR_C_ARRAY_INIT;
+	INT16 b2[4096] = WINPR_C_ARRAY_INIT;
+	const INT16* in[3] = { y, cb, cr };
+	INT16* out1[3] = { r1, g1, b1 };
+	INT16* out2[3] = { r2, g2, b2 };
+	prim_size_t roi = { 64, 64 };
+
+	if (winpr_RAND(y, sizeof(y)) < 0)
+		return FALSE;
+	if (winpr_RAND(cb, sizeof(cb)) < 0)
+		return FALSE;
+	if (winpr_RAND(cr, sizeof(cr)) < 0)
+		return FALSE;
+
+	/* Normalize to 11.5 fixed radix */
+	for (int px = 0; px < 4096; ++px)
+	{
+		y[px] &= 0x1FE0U;
+		cb[px] &= 0x1FE0U;
+		cr[px] &= 0x1FE0U;
+	}
+
+	if (generic->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out1, 64 * 2, &roi) != PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	if (optimized->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out2, 64 * 2, &roi) != PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	for (int px = 0; px < 4096; ++px)
+	{
+		const BOOL redOk = ABS(r1[px] - r2[px]) <= 1;
+		const BOOL greenOk = ABS(g1[px] - g2[px]) <= 1;
+		const BOOL blueOk = ABS(b1[px] - b2[px]) <= 1;
+
+		if (redOk && greenOk && blueOk)
+			continue;
+
+		printf("YCbCrToRGB-SSE FAIL[%d]: %" PRId16 ",%" PRId16 ",%" PRId16 " vs %" PRId16
+		       ",%" PRId16 ",%" PRId16 "\n",
+		       px, r1[px], g1[px], b1[px], r2[px], g2[px], b2[px]);
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Benchmark for yCbCrToRGB_16s16s_P3P3 (generic vs. optimized) on a 64x64
+ * image. Returns the result of speed_test, or FALSE if the random input could
+ * not be generated. */
+static int test_yCbCrToRGB_16s16s_P3P3_speed(void)
+{
+	prim_size_t roi = { 64, 64 };
+	INT16 y[4096] = WINPR_C_ARRAY_INIT;
+	INT16 cb[4096] = WINPR_C_ARRAY_INIT;
+	INT16 cr[4096] = WINPR_C_ARRAY_INIT;
+	INT16 r[4096] = WINPR_C_ARRAY_INIT;
+	INT16 g[4096] = WINPR_C_ARRAY_INIT;
+	INT16 b[4096] = WINPR_C_ARRAY_INIT;
+	const INT16* input[3] = { y, cb, cr };
+	INT16* output[3] = { r, g, b };
+
+	if (winpr_RAND(y, sizeof(y)) < 0)
+		return FALSE;
+	if (winpr_RAND(cb, sizeof(cb)) < 0)
+		return FALSE;
+	if (winpr_RAND(cr, sizeof(cr)) < 0)
+		return FALSE;
+
+	/* Normalize to 11.5 fixed radix */
+	for (int px = 0; px < 4096; ++px)
+	{
+		y[px] &= 0x1FE0U;
+		cb[px] &= 0x1FE0U;
+		cr[px] &= 0x1FE0U;
+	}
+
+	return (speed_test("yCbCrToRGB_16s16s_P3P3", "aligned", g_Iterations,
+	                   (speed_test_fkt)generic->yCbCrToRGB_16s16s_P3P3,
+	                   (speed_test_fkt)optimized->yCbCrToRGB_16s16s_P3P3, input, 64 * 2, output,
+	                   64 * 2, &roi));
+}
+
+/* CTest entry point for the color conversion primitives.
+ * For every listed destination format the RGB and the YCbCr functional tests
+ * are executed, each followed by its benchmark when performance testing is
+ * enabled. Returns 0 on success and 1 on the first failure. */
+int TestPrimitivesColors(int argc, char* argv[])
+{
+	const DWORD formats[] = { PIXEL_FORMAT_ARGB32, PIXEL_FORMAT_XRGB32, PIXEL_FORMAT_ABGR32,
+		                      PIXEL_FORMAT_XBGR32, PIXEL_FORMAT_RGBA32, PIXEL_FORMAT_RGBX32,
+		                      PIXEL_FORMAT_BGRA32, PIXEL_FORMAT_BGRX32 };
+	const size_t formatCount = sizeof(formats) / sizeof(formats[0]);
+	prim_size_t roi = { 1920 / 4, 1080 / 4 };
+	WINPR_UNUSED(argc);
+	WINPR_UNUSED(argv);
+	prim_test_setup(FALSE);
+
+	for (size_t idx = 0; idx < formatCount; ++idx)
+	{
+		const DWORD format = formats[idx];
+
+		if (!test_RGBToRGB_16s8u_P3AC4R_func(roi, format))
+			return 1;
+
+		if (g_TestPrimitivesPerformance && !test_RGBToRGB_16s8u_P3AC4R_speed())
+			return 1;
+
+		if (!test_yCbCrToRGB_16s16s_P3P3_func())
+			return 1;
+
+		if (g_TestPrimitivesPerformance && !test_yCbCrToRGB_16s16s_P3P3_speed())
+			return 1;
+	}
+
+	return 0;
+}
@@ -0,0 +1,296 @@
+/* test_copy.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <stdio.h>
+
+#include <freerdp/config.h>
+#include <winpr/crypto.h>
+
+#include <winpr/sysinfo.h>
+#include "prim_test.h"
+
+#define COPY_TESTSIZE (256 * 2 + 16 * 2 + 15 + 15)
+
+/* ------------------------------------------------------------------------- */
+/* Functional test for copy_8u: copies from every source offset 0..15 to
+ * every destination offset 0..15 with every length from 1 up to
+ * COPY_TESTSIZE - doff and verifies the copied bytes.
+ * Uses the primitives returned by primitives_get().
+ * Returns TRUE on success, FALSE on the first failure. */
+static BOOL test_copy8u_func(void)
+{
+	primitives_t* prims = primitives_get();
+	BYTE data[COPY_TESTSIZE + 15] = WINPR_C_ARRAY_INIT;
+	if (winpr_RAND(data, sizeof(data)) < 0)
+		return FALSE;
+
+	for (int soff = 0; soff < 16; ++soff)
+	{
+		for (int doff = 0; doff < 16; ++doff)
+		{
+			for (int length = 1; length <= COPY_TESTSIZE - doff; ++length)
+			{
+				/* Fresh zeroed destination for every run */
+				BYTE dest[COPY_TESTSIZE + 15] = WINPR_C_ARRAY_INIT;
+
+				if (prims->copy_8u(data + soff, dest + doff, length) != PRIMITIVES_SUCCESS)
+					return FALSE;
+
+				for (int i = 0; i < length; ++i)
+				{
+					if (dest[i + doff] != data[i + soff])
+					{
+						/* Report both offsets and keep the two values separated. */
+						printf("COPY8U FAIL: soff=%d doff=%d len=%d, dest[%d]=0x%02" PRIx8
+						       " data[%d]=0x%02" PRIx8 "\n",
+						       soff, doff, length, i + doff, dest[i + doff], i + soff,
+						       data[i + soff]);
+						return FALSE;
+					}
+				}
+			}
+		}
+	}
+
+	return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Benchmark for copy_8u (generic vs. optimized): one run with the buffers at
+ * offset 0 and one with both shifted by one byte.
+ * Returns FALSE as soon as one run fails. */
+static BOOL test_copy8u_speed(void)
+{
+	BYTE src[MAX_TEST_SIZE + 4] = WINPR_C_ARRAY_INIT;
+	BYTE dst[MAX_TEST_SIZE + 4] = WINPR_C_ARRAY_INIT;
+	const char* const labels[] = { "aligned", "unaligned" };
+
+	/* The index doubles as the byte offset applied to both buffers. */
+	for (size_t shift = 0; shift < sizeof(labels) / sizeof(labels[0]); ++shift)
+	{
+		if (!speed_test("copy_8u", labels[shift], g_Iterations, (speed_test_fkt)generic->copy_8u,
+		                (speed_test_fkt)optimized->copy_8u, src + shift, dst + shift,
+		                MAX_TEST_SIZE))
+			return FALSE;
+	}
+
+	return TRUE;
+}
+
+/* Allocates an image of h rows with a stride of (w * bpp + pad) bytes and
+ * fills it with random data.
+ *
+ * copy - optional; when not null it receives a second heap buffer holding an
+ *        identical copy of the random data.
+ *
+ * Returns the buffer (to be released with free(), as is *copy), or nullptr on
+ * failure, in which case nothing is left allocated and *copy is untouched. */
+static BYTE* rand_alloc(size_t w, size_t h, size_t bpp, size_t pad, BYTE** copy)
+{
+	const size_t stride = w * bpp + pad;
+	const size_t total = stride * h;
+	BYTE* duplicate = nullptr;
+	BYTE* buffer = calloc(stride, h);
+
+	if (!buffer)
+		return nullptr;
+
+	if (winpr_RAND(buffer, total) < 0)
+		goto fail;
+
+	if (!copy)
+		return buffer;
+
+	duplicate = calloc(stride, h);
+	if (!duplicate)
+		goto fail;
+
+	memcpy(duplicate, buffer, total);
+	*copy = duplicate;
+	return buffer;
+
+fail:
+	free(buffer);
+	return nullptr;
+}
+
+static size_t runcount = 0;
+
+/* Runs copy_no_overlap with the generic and the current primitives for one
+ * combination of formats, flags, padding and offsets and checks that
+ *  - the source image is left untouched by both implementations,
+ *  - both destination images are byte-identical afterwards.
+ *
+ * The images are allocated with (h + yoff) rows and the copy targets rows
+ * dyoff .. dyoff + h - 1, so the comparisons cover the complete allocations;
+ * comparing only the first h rows would miss the last dyoff written rows.
+ *
+ * verbose              - print the parameters of the run to stderr
+ * srcFormat, dstFormat - pixel formats of source and destination
+ * flags                - flags passed through to copy_no_overlap
+ * pad                  - extra bytes appended to every row
+ * w, h                 - size of the copied region in pixels
+ * dxoff, dyoff         - destination offset in pixels (must be < w / < h)
+ * sxoff, syoff         - source offset in pixels (must be < w / < h)
+ *
+ * Increments the global runcount. Returns TRUE on success, FALSE otherwise. */
+static BOOL test_copy_no_overlap_off(BOOL verbose, UINT32 srcFormat, UINT32 dstFormat, UINT32 flags,
+                                     UINT32 pad, UINT32 w, UINT32 h, UINT32 dxoff, UINT32 dyoff,
+                                     UINT32 sxoff, UINT32 syoff)
+{
+	BOOL rc = FALSE;
+	primitives_t* gen = primitives_get_generic();
+	primitives_t* prims = primitives_get();
+	if (!gen || !prims)
+		return FALSE;
+
+	runcount++;
+
+	WINPR_ASSERT(dxoff < w);
+	WINPR_ASSERT(sxoff < w);
+	WINPR_ASSERT(dyoff < h);
+	WINPR_ASSERT(syoff < h);
+
+	const UINT32 sbpp = FreeRDPGetBytesPerPixel(srcFormat);
+	const UINT32 dbpp = FreeRDPGetBytesPerPixel(dstFormat);
+
+	if (verbose)
+	{
+		(void)fprintf(stderr,
+		              "run src: %s, dst: %s [flags 0x%08" PRIx32 "] %" PRIu32 "x%" PRIu32
+		              ", soff=%" PRIu32 "x%" PRIu32 ", doff=%" PRIu32 "x%" PRIu32 ", pad=%" PRIu32
+		              "\n",
+		              FreeRDPGetColorFormatName(srcFormat), FreeRDPGetColorFormatName(dstFormat),
+		              flags, w, h, sxoff, syoff, dxoff, dyoff, pad);
+	}
+
+	/* Strides and sizes match what rand_alloc computes for the same arguments */
+	const UINT32 sstride = (w + sxoff) * sbpp + pad;
+	const UINT32 dstride = (w + dxoff) * dbpp + pad;
+	const size_t srcSize = 1ULL * sstride * (h + syoff);
+	const size_t dstSize = 1ULL * dstride * (h + dyoff);
+	BYTE* dst2 = nullptr;
+	BYTE* src2 = nullptr;
+	BYTE* dst1 = rand_alloc(w + dxoff, h + dyoff, dbpp, pad, &dst2);
+	BYTE* src1 = rand_alloc(w + sxoff, h + syoff, sbpp, pad, &src2);
+	if (!dst1 || !dst2 || !src1 || !src2)
+		goto fail;
+
+	if (gen->copy_no_overlap(dst1, dstFormat, dstride, dxoff, dyoff, w, h, src1, srcFormat, sstride,
+	                         sxoff, syoff, nullptr, flags) != PRIMITIVES_SUCCESS)
+		goto fail;
+
+	/* src2 is the pristine copy: the source must not have been modified */
+	if (memcmp(src1, src2, srcSize) != 0)
+		goto fail;
+
+	if (prims->copy_no_overlap(dst2, dstFormat, dstride, dxoff, dyoff, w, h, src1, srcFormat,
+	                           sstride, sxoff, syoff, nullptr, flags) != PRIMITIVES_SUCCESS)
+		goto fail;
+
+	if (memcmp(src1, src2, srcSize) != 0)
+		goto fail;
+
+	/* dst1 and dst2 started out identical, so any difference is caused by
+	 * the two implementations disagreeing. */
+	if (memcmp(dst1, dst2, dstSize) != 0)
+		goto fail;
+
+	if (flags == FREERDP_KEEP_DST_ALPHA)
+	{
+		/* Explicit per-pixel comparison of the alpha channel in the copied region */
+		for (size_t y = 0; y < h; y++)
+		{
+			const BYTE* d1 = &dst1[(y + dyoff) * dstride];
+			const BYTE* d2 = &dst2[(y + dyoff) * dstride];
+			for (size_t x = 0; x < w; x++)
+			{
+				const UINT32 c1 = FreeRDPReadColor(&d1[(x + dxoff) * dbpp], dstFormat);
+				const UINT32 c2 = FreeRDPReadColor(&d2[(x + dxoff) * dbpp], dstFormat);
+				BYTE a1 = 0;
+				BYTE a2 = 0;
+				FreeRDPSplitColor(c1, dstFormat, nullptr, nullptr, nullptr, &a1, nullptr);
+				FreeRDPSplitColor(c2, dstFormat, nullptr, nullptr, nullptr, &a2, nullptr);
+				if (a1 != a2)
+					goto fail;
+			}
+		}
+	}
+	rc = TRUE;
+
+fail:
+	if (!rc)
+	{
+		(void)fprintf(stderr, "failed to compare copy_no_overlap(%s -> %s [0x%08" PRIx32 "])\n",
+		              FreeRDPGetColorFormatName(srcFormat), FreeRDPGetColorFormatName(dstFormat),
+		              flags);
+	}
+	free(dst1);
+	free(dst2);
+	free(src1);
+	free(src2);
+	return rc;
+}
+
+/* Runs test_copy_no_overlap_off for every combination of destination offset,
+ * source offset and row padding (8..12 bytes) for one format/flag pair.
+ * All combinations are executed even after a failure; the result is TRUE only
+ * if every single run passed.
+ *
+ * NOTE(review): the destination x offset stops at maxOffX - 1 while the other
+ * three offsets include their maximum - confirm whether that is intended. */
+static BOOL test_copy_no_overlap(BOOL verbose, UINT32 srcFormat, UINT32 dstFormat, UINT32 flags,
+                                 UINT32 width, UINT32 height)
+{
+	const UINT32 maxOffX = 4;
+	const UINT32 maxOffY = 4;
+	BOOL allPassed = TRUE;
+
+	for (UINT32 dstX = 0; dstX < maxOffX; dstX++)
+	{
+		for (UINT32 dstY = 0; dstY <= maxOffY; dstY++)
+		{
+			for (UINT32 srcX = 0; srcX <= maxOffX; srcX++)
+			{
+				for (UINT32 srcY = 0; srcY <= maxOffY; srcY++)
+				{
+					/* We need minimum alignment of 8 bytes.
+					 * AVX2 can read 8 pixels (at most 8x4=32 bytes) per step
+					 * if we have 24bpp input that is 24 bytes with 8 bytes read
+					 * out of bound */
+					for (UINT32 padding = 8; padding <= 12; padding++)
+					{
+						const BOOL passed = test_copy_no_overlap_off(
+						    verbose, srcFormat, dstFormat, flags, padding, width, height, dstX,
+						    dstY, srcX, srcY);
+
+						if (!passed)
+							allPassed = FALSE;
+					}
+				}
+			}
+		}
+	}
+
+	return allPassed;
+}
+
+/* CTest entry point for the copy primitives.
+ * Any additional command line argument switches on verbose output.
+ * copy_8u failures abort immediately with 1; copy_no_overlap is run for every
+ * flag / source format / destination format combination and yields -1 if any
+ * combination failed, 0 otherwise. */
+int TestPrimitivesCopy(int argc, char* argv[])
+{
+	WINPR_UNUSED(argc);
+	WINPR_UNUSED(argv);
+
+	const BOOL verbose = argc > 1;
+
+	prim_test_setup(FALSE);
+
+	if (!test_copy8u_func())
+		return 1;
+
+	if (g_TestPrimitivesPerformance && !test_copy8u_speed())
+		return 1;
+
+	/* Define TEST_ALL_FLAGS to extend both tables to the full set. */
+	const UINT32 flags[] = {
+		FREERDP_FLIP_NONE,
+		FREERDP_KEEP_DST_ALPHA,
+		FREERDP_FLIP_HORIZONTAL,
+		FREERDP_KEEP_DST_ALPHA | FREERDP_FLIP_HORIZONTAL,
+#if defined(TEST_ALL_FLAGS)
+		FREERDP_FLIP_VERTICAL,
+		FREERDP_FLIP_VERTICAL | FREERDP_FLIP_HORIZONTAL,
+		FREERDP_KEEP_DST_ALPHA | FREERDP_FLIP_VERTICAL,
+		FREERDP_KEEP_DST_ALPHA | FREERDP_FLIP_VERTICAL | FREERDP_FLIP_HORIZONTAL
+#endif
+	};
+	const UINT32 formats[] = { PIXEL_FORMAT_BGRA32,
+		                       PIXEL_FORMAT_BGRX32,
+		                       PIXEL_FORMAT_BGR24
+#if defined(TEST_ALL_FLAGS) /* Only the previous 3 have SIMD optimizations, so skip the rest */
+		                       ,
+		                       PIXEL_FORMAT_RGB24,
+		                       PIXEL_FORMAT_ABGR32,
+		                       PIXEL_FORMAT_ARGB32,
+		                       PIXEL_FORMAT_XBGR32,
+		                       PIXEL_FORMAT_XRGB32,
+		                       PIXEL_FORMAT_RGBA32,
+		                       PIXEL_FORMAT_RGBX32
+#endif
+	};
+
+	int result = 0;
+	for (size_t flagIdx = 0; flagIdx < ARRAYSIZE(flags); flagIdx++)
+	{
+		for (size_t srcIdx = 0; srcIdx < ARRAYSIZE(formats); srcIdx++)
+		{
+			for (size_t dstIdx = 0; dstIdx < ARRAYSIZE(formats); dstIdx++)
+			{
+				/* Keep going after a failure so every combination is reported. */
+				const BOOL passed = test_copy_no_overlap(verbose, formats[srcIdx],
+				                                         formats[dstIdx], flags[flagIdx], 21, 17);
+				if (!passed)
+					result = -1;
+			}
+		}
+	}
+
+	if (verbose)
+		(void)fprintf(stderr, "runcount=%" PRIuz "\n", runcount);
+
+	return result;
+}
@@ -0,0 +1,277 @@
+/* test_set.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include "prim_test.h"
+
+/* ------------------------------------------------------------------------- */
+/*
+ * Check that `length` bytes of `src`, beginning at byte `offset`, all hold
+ * `value`. The first mismatch is printed and FALSE is returned; TRUE means
+ * the whole range matched.
+ */
+static BOOL check8(const BYTE* src, UINT32 length, UINT32 offset, BYTE value)
+{
+	UINT32 idx = 0;
+
+	while (idx < length)
+	{
+		const UINT32 pos = offset + idx;
+
+		if (src[pos] != value)
+		{
+			printf("SET8U FAILED: off=%" PRIu32 " len=%" PRIu32 " dest[%" PRIu32 "]=0x%02" PRIx8
+			       "\n",
+			       offset, length, pos, src[pos]);
+			return FALSE;
+		}
+
+		++idx;
+	}
+
+	return TRUE;
+}
+
+/*
+ * Functional test for set_8u.
+ *
+ * For the generic implementation first and the optimized one second, fills
+ * ranges of a scratch buffer at destination offsets 0..15 with lengths from 1
+ * up to (but excluding) 48 - offset, and verifies every byte of the requested
+ * range carries the fill value.
+ */
+static BOOL test_set8u_func(void)
+{
+	const BYTE fill = 0xa5;
+	const primitives_t* impls[] = { generic, optimized };
+
+	for (size_t p = 0; p < sizeof(impls) / sizeof(impls[0]); p++)
+	{
+		const primitives_t* prims = impls[p];
+
+		for (UINT32 off = 0; off < 16; ++off)
+		{
+			BYTE dest[1024];
+			const UINT32 limit = 48 - off;
+
+			/* Start from a pattern that differs from the fill value */
+			memset(dest, 3, sizeof(dest));
+
+			for (UINT32 len = 1; len < limit; ++len)
+			{
+				if (prims->set_8u(fill, dest + off, len) != PRIMITIVES_SUCCESS)
+					return FALSE;
+
+				if (!check8(dest, len, off, fill))
+					return FALSE;
+			}
+		}
+	}
+
+	return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+/*
+ * Speed comparison of generic vs. optimized set_8u.
+ * Runs 16 rounds; round x uses a fresh random fill byte, destination
+ * dest + x and length x.
+ */
+static BOOL test_set8u_speed(void)
+{
+	BYTE dest[1024];
+	BYTE value = 0;
+	UINT32 round = 0;
+
+	while (round < 16)
+	{
+		const int rnd = winpr_RAND(&value, sizeof(value));
+
+		if (rnd < 0)
+			return FALSE;
+
+		if (!speed_test("set_8u", "", g_Iterations, (speed_test_fkt)generic->set_8u,
+		                (speed_test_fkt)optimized->set_8u, value, dest + round, round))
+			return FALSE;
+
+		round++;
+	}
+
+	return TRUE;
+}
+
+/*
+ * Check that `length` INT32 elements of `src`, beginning at element `offset`,
+ * all equal `value`.
+ *
+ * Prints the first mismatching element and returns FALSE; returns TRUE when
+ * the whole range matches.
+ */
+static BOOL check32s(const INT32* src, UINT32 length, UINT32 offset, INT32 value)
+{
+	for (UINT32 i = 0; i < length; ++i)
+	{
+		if (src[offset + i] != value)
+		{
+			/* PRIx32 expects an unsigned 32 bit argument, hence the cast */
+			printf("SET32S FAILED: off=%" PRIu32 " len=%" PRIu32 " dest[%" PRIu32 "]=0x%08" PRIx32
+			       "\n",
+			       offset, length, i + offset, (UINT32)src[i + offset]);
+			return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+/*
+ * Functional test for set_32s.
+ *
+ * For the generic implementation first and the optimized one second, fills
+ * ranges of a zero-initialized buffer at element offsets 0..15 with lengths
+ * from 1 up to (but excluding) 48 - offset, and verifies every element of the
+ * requested range carries the fill value.
+ */
+static BOOL test_set32s_func(void)
+{
+	const INT32 value = -0x12345678;
+	const primitives_t* impls[] = { generic, optimized };
+
+	for (size_t p = 0; p < sizeof(impls) / sizeof(impls[0]); p++)
+	{
+		const primitives_t* prims = impls[p];
+
+		for (UINT32 off = 0; off < 16; ++off)
+		{
+			INT32 dest[1024] = WINPR_C_ARRAY_INIT;
+			const UINT32 limit = 48 - off;
+
+			for (UINT32 len = 1; len < limit; ++len)
+			{
+				if (prims->set_32s(value, dest + off, len) != PRIMITIVES_SUCCESS)
+					return FALSE;
+
+				if (!check32s(dest, len, off, value))
+					return FALSE;
+			}
+		}
+	}
+
+	return TRUE;
+}
+
+/*
+ * Check that `length` UINT32 elements of `src`, beginning at element
+ * `offset`, all equal `value`.
+ *
+ * Prints the first mismatching element and returns FALSE; returns TRUE when
+ * the whole range matches.
+ */
+static BOOL check32u(const UINT32* src, UINT32 length, UINT32 offset, UINT32 value)
+{
+	for (UINT32 i = 0; i < length; ++i)
+	{
+		if (src[offset + i] != value)
+		{
+			printf("SET32U FAILED: off=%" PRIu32 " len=%" PRIu32 " dest[%" PRIu32 "]=0x%08" PRIx32
+			       "\n",
+			       offset, length, i + offset, src[i + offset]);
+			return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+/*
+ * Functional test for set_32u.
+ *
+ * For the generic implementation first and the optimized one second, fills
+ * ranges of a zero-initialized buffer at element offsets 0..15 with lengths
+ * from 1 up to (but excluding) 48 - offset, and verifies every element of the
+ * requested range carries the fill value.
+ */
+static BOOL test_set32u_func(void)
+{
+	const UINT32 value = 0xABCDEF12;
+	const primitives_t* impls[] = { generic, optimized };
+
+	for (size_t p = 0; p < sizeof(impls) / sizeof(impls[0]); p++)
+	{
+		const primitives_t* prims = impls[p];
+
+		for (UINT32 off = 0; off < 16; ++off)
+		{
+			UINT32 dest[1024] = WINPR_C_ARRAY_INIT;
+			const UINT32 limit = 48 - off;
+
+			for (UINT32 len = 1; len < limit; ++len)
+			{
+				if (prims->set_32u(value, dest + off, len) != PRIMITIVES_SUCCESS)
+					return FALSE;
+
+				if (!check32u(dest, len, off, value))
+					return FALSE;
+			}
+		}
+	}
+
+	return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+/*
+ * Speed comparison of generic vs. optimized set_32u.
+ * Runs 16 rounds; round x uses a fresh random byte as fill value,
+ * destination dest + x and length x.
+ */
+static BOOL test_set32u_speed(void)
+{
+	UINT32 dest[1024];
+	BYTE value = 0;
+	UINT32 round = 0;
+
+	while (round < 16)
+	{
+		const int rnd = winpr_RAND(&value, sizeof(value));
+
+		if (rnd < 0)
+			return FALSE;
+
+		if (!speed_test("set_32u", "", g_Iterations, (speed_test_fkt)generic->set_32u,
+		                (speed_test_fkt)optimized->set_32u, value, dest + round, round))
+			return FALSE;
+
+		round++;
+	}
+
+	return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+/*
+ * Speed comparison of generic vs. optimized set_32s.
+ * Runs 16 rounds; round x uses a fresh random byte as fill value,
+ * destination dest + x and length x.
+ */
+static BOOL test_set32s_speed(void)
+{
+	INT32 dest[1024];
+	BYTE value = 0;
+	UINT32 round = 0;
+
+	while (round < 16)
+	{
+		const int rnd = winpr_RAND(&value, sizeof(value));
+
+		if (rnd < 0)
+			return FALSE;
+
+		if (!speed_test("set_32s", "", g_Iterations, (speed_test_fkt)generic->set_32s,
+		                (speed_test_fkt)optimized->set_32s, value, dest + round, round))
+			return FALSE;
+
+		round++;
+	}
+
+	return TRUE;
+}
+
+/*
+ * Test entry point for the set primitives.
+ *
+ * Runs the functional tests for set_8u, set_32s and set_32u in that order,
+ * followed by the matching speed tests when performance testing is enabled.
+ * Stops at the first failure.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int TestPrimitivesSet(int argc, char* argv[])
+{
+	WINPR_UNUSED(argc);
+	WINPR_UNUSED(argv);
+	prim_test_setup(FALSE);
+
+	/* && short-circuits, so later tests are skipped after a failure */
+	const BOOL funcOk = test_set8u_func() && test_set32s_func() && test_set32u_func();
+
+	if (!funcOk)
+		return -1;
+
+	if (!g_TestPrimitivesPerformance)
+		return 0;
+
+	const BOOL speedOk = test_set8u_speed() && test_set32s_speed() && test_set32u_speed();
+
+	return speedOk ? 0 : -1;
+}
@@ -0,0 +1,470 @@
+/* test_shift.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include "prim_test.h"
+
+#define FUNC_TEST_SIZE 65536
+
+/*
+ * Functional test for lShiftC_16s.
+ *
+ * Both implementations must reject a shift count of 16 and must succeed
+ * for a random shift count in 0..15, once with the destination at d1 + 1
+ * ("aligned" case) and once at d1 + 2 ("unaligned" case).
+ */
+static BOOL test_lShift_16s_func(void)
+{
+	INT16 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	INT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	UINT32 val = 0;
+
+	if ((winpr_RAND(&val, sizeof(val)) < 0) || (winpr_RAND(src, sizeof(src)) < 0))
+		return FALSE;
+
+	val %= 16;
+
+	/* Negative tests: these calls are expected to fail */
+	if (generic->lShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE) == PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	if (optimized->lShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE) == PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	/* Destination offset 1 is the aligned case, offset 2 the unaligned one */
+	for (size_t dstOff = 1; dstOff <= 2; dstOff++)
+	{
+		if (generic->lShiftC_16s(src + 1, val, d1 + dstOff, FUNC_TEST_SIZE) != PRIMITIVES_SUCCESS)
+			return FALSE;
+
+		if (optimized->lShiftC_16s(src + 1, val, d1 + dstOff, FUNC_TEST_SIZE) !=
+		    PRIMITIVES_SUCCESS)
+			return FALSE;
+	}
+
+	return TRUE;
+}
+
+/*
+ * Functional test for lShiftC_16u.
+ *
+ * Both implementations must reject a shift count of 16 and must succeed
+ * for a random shift count in 0..15, once with the destination at d1 + 1
+ * ("aligned" case) and once at d1 + 2 ("unaligned" case).
+ */
+static BOOL test_lShift_16u_func(void)
+{
+	UINT16 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	UINT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	UINT32 val = 0;
+
+	if ((winpr_RAND(&val, sizeof(val)) < 0) || (winpr_RAND(src, sizeof(src)) < 0))
+		return FALSE;
+
+	val %= 16;
+
+	/* Negative tests: these calls are expected to fail */
+	if (generic->lShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE) == PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	if (optimized->lShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE) == PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	/* Destination offset 1 is the aligned case, offset 2 the unaligned one */
+	for (size_t dstOff = 1; dstOff <= 2; dstOff++)
+	{
+		if (generic->lShiftC_16u(src + 1, val, d1 + dstOff, FUNC_TEST_SIZE) != PRIMITIVES_SUCCESS)
+			return FALSE;
+
+		if (optimized->lShiftC_16u(src + 1, val, d1 + dstOff, FUNC_TEST_SIZE) !=
+		    PRIMITIVES_SUCCESS)
+			return FALSE;
+	}
+
+	return TRUE;
+}
+
+/*
+ * Functional test for rShiftC_16s.
+ *
+ * Both implementations must reject a shift count of 16 and must succeed
+ * for a random shift count in 0..15, once with the destination at d1 + 1
+ * ("aligned" case) and once at d1 + 2 ("unaligned" case).
+ */
+static BOOL test_rShift_16s_func(void)
+{
+	INT16 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	INT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	UINT32 val = 0;
+
+	if ((winpr_RAND(&val, sizeof(val)) < 0) || (winpr_RAND(src, sizeof(src)) < 0))
+		return FALSE;
+
+	val %= 16;
+
+	/* Negative tests: these calls are expected to fail */
+	if (generic->rShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE) == PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	if (optimized->rShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE) == PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	/* Destination offset 1 is the aligned case, offset 2 the unaligned one */
+	for (size_t dstOff = 1; dstOff <= 2; dstOff++)
+	{
+		if (generic->rShiftC_16s(src + 1, val, d1 + dstOff, FUNC_TEST_SIZE) != PRIMITIVES_SUCCESS)
+			return FALSE;
+
+		if (optimized->rShiftC_16s(src + 1, val, d1 + dstOff, FUNC_TEST_SIZE) !=
+		    PRIMITIVES_SUCCESS)
+			return FALSE;
+	}
+
+	return TRUE;
+}
+
+/*
+ * Functional test for rShiftC_16u.
+ *
+ * Both implementations must reject a shift count of 16 and must succeed
+ * for a random shift count in 0..15, once with the destination at d1 + 1
+ * ("aligned" case) and once at d1 + 2 ("unaligned" case).
+ */
+static BOOL test_rShift_16u_func(void)
+{
+	UINT16 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	UINT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	UINT32 val = 0;
+
+	if ((winpr_RAND(&val, sizeof(val)) < 0) || (winpr_RAND(src, sizeof(src)) < 0))
+		return FALSE;
+
+	val %= 16;
+
+	/* Negative tests: these calls are expected to fail */
+	if (generic->rShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE) == PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	if (optimized->rShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE) == PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	/* Destination offset 1 is the aligned case, offset 2 the unaligned one */
+	for (size_t dstOff = 1; dstOff <= 2; dstOff++)
+	{
+		if (generic->rShiftC_16u(src + 1, val, d1 + dstOff, FUNC_TEST_SIZE) != PRIMITIVES_SUCCESS)
+			return FALSE;
+
+		if (optimized->rShiftC_16u(src + 1, val, d1 + dstOff, FUNC_TEST_SIZE) !=
+		    PRIMITIVES_SUCCESS)
+			return FALSE;
+	}
+
+	return TRUE;
+}
+
+/*
+ * Functional test for the signed shiftC_16s wrapper.
+ *
+ * Both implementations must reject a shift count of 16. For a random count
+ * val in 0..15 they must succeed for +val and -val, with the destination at
+ * d1 + 1 ("aligned" case) and at d1 + 2 ("unaligned" case).
+ */
+static BOOL test_ShiftWrapper_16s_func(void)
+{
+	INT16 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	INT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	UINT32 tmp = 0;
+
+	if ((winpr_RAND(&tmp, sizeof(tmp)) < 0) || (winpr_RAND(src, sizeof(src)) < 0))
+		return FALSE;
+
+	INT32 val = WINPR_ASSERTING_INT_CAST(int32_t, tmp % 16);
+	const INT32 shifts[] = { val, -val };
+
+	/* Negative tests: these calls are expected to fail */
+	if (generic->shiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE) == PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	if (optimized->shiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE) == PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	/* Destination offset 1 is the aligned case, offset 2 the unaligned one */
+	for (size_t dstOff = 1; dstOff <= 2; dstOff++)
+	{
+		for (size_t s = 0; s < sizeof(shifts) / sizeof(shifts[0]); s++)
+		{
+			const INT32 shift = shifts[s];
+
+			if (generic->shiftC_16s(src + 1, shift, d1 + dstOff, FUNC_TEST_SIZE) !=
+			    PRIMITIVES_SUCCESS)
+				return FALSE;
+
+			if (optimized->shiftC_16s(src + 1, shift, d1 + dstOff, FUNC_TEST_SIZE) !=
+			    PRIMITIVES_SUCCESS)
+				return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+/*
+ * Functional test for the signed shiftC_16u wrapper.
+ *
+ * Both implementations must reject a shift count of 16. For a random count
+ * val in 0..15 they must succeed for +val and -val, with the destination at
+ * d1 + 1 ("aligned" case) and at d1 + 2 ("unaligned" case).
+ */
+static BOOL test_ShiftWrapper_16u_func(void)
+{
+	UINT16 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	UINT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	UINT32 tmp = 0;
+
+	if ((winpr_RAND(&tmp, sizeof(tmp)) < 0) || (winpr_RAND(src, sizeof(src)) < 0))
+		return FALSE;
+
+	INT32 val = WINPR_ASSERTING_INT_CAST(int32_t, tmp % 16);
+	const INT32 shifts[] = { val, -val };
+
+	/* Negative tests: these calls are expected to fail */
+	if (generic->shiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE) == PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	if (optimized->shiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE) == PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	/* Destination offset 1 is the aligned case, offset 2 the unaligned one */
+	for (size_t dstOff = 1; dstOff <= 2; dstOff++)
+	{
+		for (size_t s = 0; s < sizeof(shifts) / sizeof(shifts[0]); s++)
+		{
+			const INT32 shift = shifts[s];
+
+			if (generic->shiftC_16u(src + 1, shift, d1 + dstOff, FUNC_TEST_SIZE) !=
+			    PRIMITIVES_SUCCESS)
+				return FALSE;
+
+			if (optimized->shiftC_16u(src + 1, shift, d1 + dstOff, FUNC_TEST_SIZE) !=
+			    PRIMITIVES_SUCCESS)
+				return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+/*
+ * Speed comparison of generic vs. optimized lShiftC_16s.
+ *
+ * Uses random source data and a random shift count in 0..15, once with the
+ * source at src ("aligned") and once at src + 1 ("unaligned").
+ * Returns FALSE if random data could not be obtained or a speed test fails.
+ */
+static BOOL test_lShift_16s_speed(void)
+{
+	UINT32 val = 0;
+	INT16 src[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
+	INT16 dst[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
+	if (winpr_RAND(src, sizeof(src)) < 0)
+		return FALSE;
+	/* Same failure check (< 0) as every other winpr_RAND call in this file */
+	if (winpr_RAND(&val, sizeof(val)) < 0)
+		return FALSE;
+
+	val = val % 16;
+	if (!speed_test("lShift_16s", "aligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16s,
+	                (speed_test_fkt)optimized->lShiftC_16s, src, val, dst, MAX_TEST_SIZE))
+		return FALSE;
+
+	if (!speed_test("lShift_16s", "unaligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16s,
+	                (speed_test_fkt)optimized->lShiftC_16s, src + 1, val, dst, MAX_TEST_SIZE))
+		return FALSE;
+
+	return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+/*
+ * Speed comparison of generic vs. optimized lShiftC_16u.
+ * Uses random source data and a random shift count in 0..15, once with the
+ * source at src ("aligned") and once at src + 1 ("unaligned").
+ */
+static BOOL test_lShift_16u_speed(void)
+{
+	UINT16 src[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
+	UINT16 dst[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
+	UINT32 val = 0;
+
+	if ((winpr_RAND(&val, sizeof(val)) < 0) || (winpr_RAND(src, sizeof(src)) < 0))
+		return FALSE;
+
+	val %= 16;
+
+	if (!speed_test("lShift_16u", "aligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16u,
+	                (speed_test_fkt)optimized->lShiftC_16u, src, val, dst, MAX_TEST_SIZE))
+		return FALSE;
+
+	if (speed_test("lShift_16u", "unaligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16u,
+	               (speed_test_fkt)optimized->lShiftC_16u, src + 1, val, dst, MAX_TEST_SIZE))
+		return TRUE;
+
+	return FALSE;
+}
+
+/* ------------------------------------------------------------------------- */
+/*
+ * Speed comparison of generic vs. optimized rShiftC_16s.
+ * Uses random source data and a random shift count in 0..15, once with the
+ * source at src ("aligned") and once at src + 1 ("unaligned").
+ */
+static BOOL test_rShift_16s_speed(void)
+{
+	INT16 src[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
+	INT16 dst[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
+	UINT32 val = 0;
+
+	if ((winpr_RAND(src, sizeof(src)) < 0) || (winpr_RAND(&val, sizeof(val)) < 0))
+		return FALSE;
+
+	val %= 16;
+
+	if (!speed_test("rShift_16s", "aligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16s,
+	                (speed_test_fkt)optimized->rShiftC_16s, src, val, dst, MAX_TEST_SIZE))
+		return FALSE;
+
+	if (speed_test("rShift_16s", "unaligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16s,
+	               (speed_test_fkt)optimized->rShiftC_16s, src + 1, val, dst, MAX_TEST_SIZE))
+		return TRUE;
+
+	return FALSE;
+}
+
+/* ------------------------------------------------------------------------- */
+/*
+ * Speed comparison of generic vs. optimized rShiftC_16u.
+ * Uses random source data and a random shift count in 0..15, once with the
+ * source at src ("aligned") and once at src + 1 ("unaligned").
+ */
+static BOOL test_rShift_16u_speed(void)
+{
+	UINT16 src[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
+	UINT16 dst[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
+	UINT32 val = 0;
+
+	if ((winpr_RAND(&val, sizeof(val)) < 0) || (winpr_RAND(src, sizeof(src)) < 0))
+		return FALSE;
+
+	val %= 16;
+
+	if (!speed_test("rShift_16u", "aligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16u,
+	                (speed_test_fkt)optimized->rShiftC_16u, src, val, dst, MAX_TEST_SIZE))
+		return FALSE;
+
+	if (speed_test("rShift_16u", "unaligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16u,
+	               (speed_test_fkt)optimized->rShiftC_16u, src + 1, val, dst, MAX_TEST_SIZE))
+		return TRUE;
+
+	return FALSE;
+}
+
+/*
+ * Test entry point for the shift primitives.
+ *
+ * For each of lShift/rShift in 16s/16u flavour the functional test runs
+ * first, followed by the speed test when performance testing is enabled.
+ * The two shift wrapper functional tests run last.
+ *
+ * Returns 0 on success, 1 at the first failure.
+ */
+int TestPrimitivesShift(int argc, char* argv[])
+{
+	WINPR_UNUSED(argc);
+	WINPR_UNUSED(argv);
+	prim_test_setup(FALSE);
+
+	if (!test_lShift_16s_func())
+		return 1;
+	if (g_TestPrimitivesPerformance && !test_lShift_16s_speed())
+		return 1;
+
+	if (!test_lShift_16u_func())
+		return 1;
+	if (g_TestPrimitivesPerformance && !test_lShift_16u_speed())
+		return 1;
+
+	if (!test_rShift_16s_func())
+		return 1;
+	if (g_TestPrimitivesPerformance && !test_rShift_16s_speed())
+		return 1;
+
+	if (!test_rShift_16u_func())
+		return 1;
+	if (g_TestPrimitivesPerformance && !test_rShift_16u_speed())
+		return 1;
+
+	/* The wrappers have no speed test */
+	const BOOL wrappersOk = test_ShiftWrapper_16s_func() && test_ShiftWrapper_16u_func();
+
+	return wrappersOk ? 0 : 1;
+}
@@ -0,0 +1,95 @@
+/* test_sign.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include "prim_test.h"
+
+#define TEST_BUFFER_SIZE 65535
+
+/* ------------------------------------------------------------------------- */
+/*
+ * Functional test for sign_16s.
+ *
+ * Runs the generic and the optimized implementation on the same random
+ * input, first writing at destination offset 1 and then at offset 2, and
+ * requires both complete output buffers to be identical after each pass.
+ */
+static BOOL test_sign16s_func(void)
+{
+	INT16 src[TEST_BUFFER_SIZE + 16] = WINPR_C_ARRAY_INIT;
+	INT16 d1[TEST_BUFFER_SIZE + 16] = WINPR_C_ARRAY_INIT;
+	INT16 d2[TEST_BUFFER_SIZE + 16] = WINPR_C_ARRAY_INIT;
+
+	if (winpr_RAND(src, sizeof(src)) < 0)
+		return FALSE;
+
+	for (size_t dstOff = 1; dstOff <= 2; dstOff++)
+	{
+		if (generic->sign_16s(src + 1, d1 + dstOff, TEST_BUFFER_SIZE) != PRIMITIVES_SUCCESS)
+			return FALSE;
+
+		if (optimized->sign_16s(src + 1, d2 + dstOff, TEST_BUFFER_SIZE) != PRIMITIVES_SUCCESS)
+			return FALSE;
+
+		/* d1 holds the generic result, d2 the optimized one */
+		if (memcmp(d1, d2, sizeof(d1)) != 0)
+			return FALSE;
+	}
+
+	return TRUE;
+}
+
+/*
+ * Speed comparison of generic vs. optimized sign_16s.
+ *
+ * Uses random source data, once with the destination at dst + 1 ("aligned")
+ * and once at dst + 2 ("unaligned").
+ * Returns TRUE on success, FALSE if random data could not be obtained or a
+ * speed test fails.
+ */
+static BOOL test_sign16s_speed(void)
+{
+	INT16 src[MAX_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	INT16 dst[MAX_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
+	if (winpr_RAND(src, sizeof(src)) < 0)
+		return FALSE;
+
+	if (!speed_test("sign16s", "aligned", g_Iterations, (speed_test_fkt)generic->sign_16s,
+	                (speed_test_fkt)optimized->sign_16s, src + 1, dst + 1, MAX_TEST_SIZE))
+		return FALSE;
+
+	if (!speed_test("sign16s", "unaligned", g_Iterations, (speed_test_fkt)generic->sign_16s,
+	                (speed_test_fkt)optimized->sign_16s, src + 1, dst + 2, MAX_TEST_SIZE))
+		return FALSE;
+
+	return TRUE;
+}
+
+/*
+ * Test entry point for the sign primitive.
+ * Runs the functional test and, when performance testing is enabled, the
+ * speed test. Returns 0 on success, 1 on failure.
+ */
+int TestPrimitivesSign(int argc, char* argv[])
+{
+	WINPR_UNUSED(argc);
+	WINPR_UNUSED(argv);
+
+	prim_test_setup(FALSE);
+
+	const BOOL funcOk = test_sign16s_func();
+
+	if (!funcOk)
+		return 1;
+
+	if (g_TestPrimitivesPerformance && !test_sign16s_speed())
+		return 1;
+
+	return 0;
+}
@@ -0,0 +1,150 @@
+/* test_YCoCg.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include "prim_test.h"
+#include <freerdp/utils/profiler.h>
+
+/* ------------------------------------------------------------------------- */
+/*
+ * Compare the generic and the optimized YCoCgToRGB_8u_AC4R implementation.
+ *
+ * A random input image of width x height pixels (4 bytes per pixel) is
+ * converted by both implementations into each destination format listed in
+ * `formats`. The outputs are compared; mismatching pixels are printed.
+ * Profiler results are printed for every format that was converted.
+ *
+ * Returns TRUE when all formats converted successfully with matching output,
+ * FALSE on allocation failure, random data failure, conversion failure or a
+ * pixel mismatch.
+ */
+static BOOL test_YCoCgRToRGB_8u_AC4R_func(UINT32 width, UINT32 height)
+{
+	/* Starts as failure so the early `goto fail` paths return FALSE */
+	pstatus_t status = -1;
+	BYTE* out_sse = nullptr;
+	BYTE* in = nullptr;
+	BYTE* out_c = nullptr;
+	const UINT32 srcStride = width * 4;
+	const UINT32 size = srcStride * height;
+	const UINT32 formats[] = { PIXEL_FORMAT_ARGB32, PIXEL_FORMAT_ABGR32, PIXEL_FORMAT_RGBA32,
+		                       PIXEL_FORMAT_RGBX32, PIXEL_FORMAT_BGRA32, PIXEL_FORMAT_BGRX32 };
+	PROFILER_DEFINE(genericProf)
+	PROFILER_DEFINE(optProf)
+	/* Input and both output buffers are allocated zeroed with 16 byte alignment */
+	in = winpr_aligned_calloc(1, size, 16);
+	out_c = winpr_aligned_calloc(1, size, 16);
+	out_sse = winpr_aligned_calloc(1, size, 16);
+
+	if (!in || !out_c || !out_sse)
+		goto fail;
+
+	if (winpr_RAND(in, size) < 0)
+		goto fail;
+
+	for (size_t x = 0; x < sizeof(formats) / sizeof(formats[0]); x++)
+	{
+		const UINT32 format = formats[x];
+		const UINT32 dstStride = width * FreeRDPGetBytesPerPixel(format);
+		const char* formatName = FreeRDPGetColorFormatName(format);
+		PROFILER_CREATE(genericProf, "YCoCgRToRGB_8u_AC4R-GENERIC")
+		PROFILER_CREATE(optProf, "YCoCgRToRGB_8u_AC4R-OPT")
+		PROFILER_ENTER(genericProf)
+		status = generic->YCoCgToRGB_8u_AC4R(in, WINPR_ASSERTING_INT_CAST(int, srcStride), out_c,
+		                                     format, WINPR_ASSERTING_INT_CAST(int, dstStride),
+		                                     width, height, 2, TRUE);
+		PROFILER_EXIT(genericProf)
+
+		if (status != PRIMITIVES_SUCCESS)
+			goto loop_fail;
+
+		PROFILER_ENTER(optProf)
+		status = optimized->YCoCgToRGB_8u_AC4R(
+		    in, WINPR_ASSERTING_INT_CAST(int, srcStride), out_sse, format,
+		    WINPR_ASSERTING_INT_CAST(int, dstStride), width, height, 2, TRUE);
+		PROFILER_EXIT(optProf)
+
+		if (status != PRIMITIVES_SUCCESS)
+			goto loop_fail;
+
+		/* Cheap whole-buffer compare first; only walk the pixels on a mismatch */
+		if (memcmp(out_c, out_sse, 1ULL * dstStride * height) != 0)
+		{
+			for (size_t i = 0; i < 1ull * width * height; ++i)
+			{
+				/* Output pixels are read at a fixed 4 bytes per pixel */
+				const UINT32 c = FreeRDPReadColor(out_c + 4 * i, format);
+				const UINT32 sse = FreeRDPReadColor(out_sse + 4 * i, format);
+
+				if (c != sse)
+				{
+					/* NOTE(review): `in[i + 1]` is a single byte indexed by the pixel
+					 * number, not the 4 byte source pixel at in + 4 * i - confirm this
+					 * is the intended diagnostic value. */
+					printf("optimized->YCoCgRToRGB FAIL[%s] [%" PRIuz "]: 0x%08" PRIx32
+					       " -> C 0x%08" PRIx32 " vs optimized 0x%08" PRIx32 "\n",
+					       formatName, i, in[i + 1], c, sse);
+					status = -1;
+				}
+			}
+		}
+
+		printf("--------------------------- [%s] [%" PRIu32 "x%" PRIu32
+		       "] ---------------------------\n",
+		       formatName, width, height);
+		PROFILER_PRINT_HEADER
+		PROFILER_PRINT(genericProf)
+		PROFILER_PRINT(optProf)
+		PROFILER_PRINT_FOOTER
+	/* Reached on success as well: the profilers are released once per format */
+	loop_fail:
+		PROFILER_FREE(genericProf)
+		PROFILER_FREE(optProf)
+
+		if (status != PRIMITIVES_SUCCESS)
+			goto fail;
+	}
+
+/* Common exit: also reached on success, frees all buffers */
+fail:
+	winpr_aligned_free(in);
+	winpr_aligned_free(out_c);
+	winpr_aligned_free(out_sse);
+	return status == PRIMITIVES_SUCCESS;
+}
+
+/*
+ * Test entry point for the YCoCg primitive.
+ *
+ * Without extra command line arguments, ten conversions with random
+ * dimensions (each in the range 16..511) are tested first. A final run with
+ * 480x270 (full HD / 4) is always performed.
+ *
+ * Returns 0 on success, 1 when a conversion test fails and -1 when no random
+ * data could be obtained.
+ */
+int TestPrimitivesYCoCg(int argc, char* argv[])
+{
+	WINPR_UNUSED(argc);
+	WINPR_UNUSED(argv);
+	prim_test_setup(FALSE);
+
+	/* Random resolution tests */
+	if (argc < 2)
+	{
+		UINT32 run = 0;
+
+		while (run < 10)
+		{
+			UINT32 w = 0;
+			UINT32 h = 0;
+
+			/* Draw again until the width is at least 16 */
+			for (;;)
+			{
+				if (winpr_RAND(&w, sizeof(w)) < 0)
+					return -1;
+				w %= 2048 / 4;
+				if (w >= 16)
+					break;
+			}
+
+			/* Draw again until the height is at least 16 */
+			for (;;)
+			{
+				if (winpr_RAND(&h, sizeof(h)) < 0)
+					return -1;
+				h %= 2048 / 4;
+				if (h >= 16)
+					break;
+			}
+
+			if (!test_YCoCgRToRGB_8u_AC4R_func(w, h))
+				return 1;
+
+			run++;
+		}
+	}
+
+	/* Test once with full HD/4 */
+	const BOOL fixedOk = test_YCoCgRToRGB_8u_AC4R_func(1920 / 4, 1080 / 4);
+
+	return fixedOk ? 0 : 1;
+}
@@ -0,0 +1,138 @@
+/* measure.h
+ * Macros to help with performance measurement.
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.  Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ *
+ * MEASURE_LOOP_START("measurement", 2000)
+ *   code to be measured
+ * MEASURE_LOOP_STOP
+ *   buffer flush and such
+ * MEASURE_SHOW_RESULTS(result)   (result: float lvalue that receives iterations per second)
+ *
+ * Define GOOGLE_PROFILER if you want gperftools included.
+ */
+
+#ifndef TEST_MEASURE_H_INCLUDED
+#define TEST_MEASURE_H_INCLUDED
+
+#include <freerdp/config.h>
+
+#include <time.h>
+#include <winpr/string.h>
+#include <winpr/sysinfo.h>
+
+#ifndef _WIN32
+#include <sys/param.h>
+#endif
+
+#include <winpr/crt.h>
+
+#ifdef _WIN32
+
+/* On Windows all measurement and profiling macros expand to nothing. */
+#define PROFILER_START(_prefix_)
+#define PROFILER_STOP
+
+#define MEASURE_LOOP_START(_prefix_, _count_)
+#define MEASURE_LOOP_STOP
+#define MEASURE_GET_RESULTS(_result_)
+#define MEASURE_SHOW_RESULTS(_result_)
+#define MEASURE_SHOW_RESULTS_SCALED(_scale_, _label_)
+#define MEASURE_TIMED(_label_, _init_iter_, _test_time_, _result_, _call_)
+
+#else
+
+/*
+ * With GOOGLE_PROFILER defined, PROFILER_START begins a gperftools profile
+ * written to "./<prefix>.prof" and PROFILER_STOP ends it. Without it both
+ * macros expand to nothing.
+ * Note: the definitions already end in ';', so a use followed by ';' yields
+ * an additional empty statement.
+ */
+#ifdef GOOGLE_PROFILER
+#include <gperftools/profiler.h>
+#define PROFILER_START(_prefix_)                                  \
+	do                                                            \
+	{                                                             \
+		char _path[PATH_MAX];                                     \
+		sprintf_s(_path, sizeof(_path), "./%s.prof", (_prefix_)); \
+		ProfilerStart(_path);                                     \
+	} while (0);
+#define PROFILER_STOP   \
+	do                  \
+	{                   \
+		ProfilerStop(); \
+	} while (0);
+#else
+#define PROFILER_START(_prefix_)
+#define PROFILER_STOP
+#endif // GOOGLE_PROFILER
+
+/* Elapsed time between two tick values t0 and t1; the macros below pass
+ * winpr_GetTickCount64NS() values and treat the result as seconds. */
+extern float measure_delta_time(UINT64 t0, UINT64 t1);
+/* Format t as text into output, which has room for len bytes. */
+extern void measure_floatprint(float t, char* output, size_t len);
+
+/*
+ * Open a measurement scope and start a do/while loop that runs the code
+ * following the macro _count_ times. Records the start tick, duplicates the
+ * prefix string and starts the profiler (if enabled).
+ *
+ * Leaves a '{' scope and a 'do {' open: it must be followed by
+ * MEASURE_LOOP_STOP and then one of MEASURE_GET_RESULTS,
+ * MEASURE_SHOW_RESULTS or MEASURE_SHOW_RESULTS_SCALED, which close them.
+ */
+#define MEASURE_LOOP_START(_prefix_, _count_)          \
+	{                                                  \
+		int _count = (_count_);                        \
+		int _loop;                                     \
+		char str1[32] = WINPR_C_ARRAY_INIT;            \
+		char str2[32] = WINPR_C_ARRAY_INIT;            \
+		char* _prefix = _strdup(_prefix_);             \
+		const UINT64 start = winpr_GetTickCount64NS(); \
+		PROFILER_START(_prefix);                       \
+		_loop = (_count);                              \
+		do                                             \
+		{
+
+/*
+ * Close the do/while loop opened by MEASURE_LOOP_START. The body has run
+ * once before _loop is decremented and tested.
+ */
+#define MEASURE_LOOP_STOP \
+	}                     \
+	while (--_loop)       \
+		;
+
+/*
+ * Stop the profiler, store iterations per elapsed time (_count / delta) in
+ * _result_, free the duplicated prefix and close the scope opened by
+ * MEASURE_LOOP_START. Prints nothing.
+ */
+#define MEASURE_GET_RESULTS(_result_)                    \
+	PROFILER_STOP;                                       \
+	const UINT64 stop = winpr_GetTickCount64NS();        \
+	const float delta = measure_delta_time(start, stop); \
+	(_result_) = (float)_count / delta;                  \
+	free(_prefix);                                       \
+	}
+
+/*
+ * Stop the profiler, store iterations per elapsed time (_count / delta) in
+ * _result_, print a summary line, free the duplicated prefix and close the
+ * scope opened by MEASURE_LOOP_START.
+ *
+ * measure_floatprint takes the output buffer size as third argument, so the
+ * size of str1 (declared by MEASURE_LOOP_START) is passed along.
+ */
+#define MEASURE_SHOW_RESULTS(_result_)                                                     \
+	PROFILER_STOP;                                                                         \
+	const UINT64 stop = winpr_GetTickCount64NS();                                          \
+	const float delta = measure_delta_time(start, stop);                                   \
+	(_result_) = (float)_count / delta;                                                    \
+	measure_floatprint((float)_count / delta, str1, sizeof(str1));                         \
+	printf("%s: %9d iterations in %5.1f seconds = %s/s \n", _prefix, _count, delta, str1); \
+	free(_prefix);                                                                         \
+	}
+
+/*
+ * Like MEASURE_SHOW_RESULTS, but stores no result and additionally prints
+ * the rate multiplied by _scale_, followed by _label_. Frees the duplicated
+ * prefix and closes the scope opened by MEASURE_LOOP_START.
+ *
+ * measure_floatprint takes the output buffer size as third argument, so the
+ * sizes of str1 / str2 (declared by MEASURE_LOOP_START) are passed along.
+ */
+#define MEASURE_SHOW_RESULTS_SCALED(_scale_, _label_)                                            \
+	PROFILER_STOP;                                                                               \
+	const UINT64 stop = winpr_GetTickCount64NS();                                                \
+	const float delta = measure_delta_time(start, stop);                                         \
+	measure_floatprint((float)_count / delta, str1, sizeof(str1));                               \
+	measure_floatprint((float)_count / delta * (_scale_), str2, sizeof(str2));                   \
+	printf("%s: %9d iterations in %5.1f seconds = %s/s = %s%s \n", _prefix, _count, delta, str1, \
+	       str2, _label_);                                                                       \
+	free(_prefix);                                                                               \
+	}
+
+/*
+ * Two-pass measurement of _call_:
+ *  1. run it _init_iter_ times to obtain a rate estimate _r,
+ *  2. run it (_r * _test_time_) times, print the summary and store the
+ *     measured rate in _result_.
+ * NOTE(review): the second iteration count is a float product converted to
+ * int by MEASURE_LOOP_START; presumably _test_time_ is meant to be the
+ * desired run time in seconds - confirm.
+ */
+#define MEASURE_TIMED(_label_, _init_iter_, _test_time_, _result_, _call_) \
+	{                                                                      \
+		float _r;                                                          \
+		MEASURE_LOOP_START(_label_, _init_iter_);                          \
+		_call_;                                                            \
+		MEASURE_LOOP_STOP;                                                 \
+		MEASURE_GET_RESULTS(_r);                                           \
+		MEASURE_LOOP_START(_label_, _r* _test_time_);                      \
+		_call_;                                                            \
+		MEASURE_LOOP_STOP;                                                 \
+		MEASURE_SHOW_RESULTS(_result_);                                    \
+	}
+
+#endif
+
+#endif // TEST_MEASURE_H_INCLUDED
@@ -0,0 +1,94 @@
+/* prim_test.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include "prim_test.h"
+
+#ifndef _WIN32
+#include <fcntl.h>
+#include <math.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#endif
+
+#include <winpr/sysinfo.h>
+#include <winpr/platform.h>
+#include <winpr/crypto.h>
+
+/* Primitives table returned by primitives_get_generic(); set in prim_test_setup(). */
+primitives_t* generic = nullptr;
+/* Primitives table returned by primitives_get(); set in prim_test_setup(). */
+primitives_t* optimized = nullptr;
+/* Copy of the 'performance' argument last passed to prim_test_setup(). */
+BOOL g_TestPrimitivesPerformance = FALSE;
+/*
+ * Iteration count, 1000 by default.
+ * NOTE(review): nothing in this file reads it - presumably consumed by the
+ * individual tests; confirm against the callers.
+ */
+UINT32 g_Iterations = 1000;
+
+/*
+ * Ten sizes, the powers of two from 8 to 4096. Keep the element count and
+ * the largest entry in sync with NUM_TEST_SIZES (10) and MAX_TEST_SIZE (4096)
+ * in prim_test.h.
+ */
+int test_sizes[] = { 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096 };
+
+/* ------------------------------------------------------------------------- */
+
+/*
+ * Elapsed time between two tick values, in seconds.
+ *
+ * The difference t1 - t0 is divided by 1e9, i.e. both arguments are treated
+ * as nanosecond counts. A difference that is negative when read as a signed
+ * 64 bit value (t1 earlier than t0) yields 0.0f instead of a negative time.
+ */
+float measure_delta_time(UINT64 t0, UINT64 t1)
+{
+	const INT64 elapsed_ns = (INT64)(t1 - t0);
+
+	if (elapsed_ns < 0)
+		return 0.0f;
+
+	return (float)((double)elapsed_ns / 1000000000.0);
+}
+
+/* ------------------------------------------------------------------------- */
+/*
+ * Format a rate for display: rounded to three significant digits and written
+ * with ',' as thousands separator (for example "123,000,000").
+ *
+ * t       value to format
+ * output  destination buffer; nothing is written when it is NULL
+ * len     size of output in bytes; nothing is written when it is 0
+ *
+ * Negative values, NaN and everything that is not below 1e12 (including
+ * +infinity) are written unmodified with "%f".
+ */
+void measure_floatprint(float t, char* output, size_t len)
+{
+	/* I don't want to link against -lm, so avoid log,exp,... */
+	float f = 10.0f;
+	INT64 i = 0;
+
+	if (!output || (len == 0))
+		return;
+
+	/* Handle everything outside [0, 1e12) before any float -> integer
+	 * conversion takes place: converting NaN, infinity or an out of range
+	 * value is undefined behaviour. !(t < x) is also true for NaN. */
+	if ((t < 0.0f) || !(t < 1e+12f))
+	{
+		(void)_snprintf(output, len, "%f", t);
+		return;
+	}
+
+	/* Smallest power of ten that is >= t (at least 10) ... */
+	while (t > f)
+		f *= 10.0f;
+
+	/* ... scaled down so that t / f keeps three digits before the point. */
+	f /= 1000.0f;
+
+	/* 64 bit on purpose: the rounded value reaches 1e12, far beyond INT_MAX. */
+	i = ((INT64)(t / f + 0.5f)) * (INT64)f;
+
+	if (i == 0) /* f < 1, i.e. t <= 100: plain rounding is sufficient */
+		(void)_snprintf(output, len, "%d", (int)(t + 0.5f));
+	else if (t < 1e+3f)
+		(void)_snprintf(output, len, "%3d", (int)i);
+	else if (t < 1e+6f)
+		(void)_snprintf(output, len, "%3d,%03d", (int)(i / 1000), (int)(i % 1000));
+	else if (t < 1e+9f)
+		(void)_snprintf(output, len, "%3d,%03d,000", (int)(i / 1000000),
+		                (int)((i % 1000000) / 1000));
+	else
+		(void)_snprintf(output, len, "%3d,%03d,000,000", (int)(i / 1000000000),
+		                (int)((i % 1000000000) / 1000000));
+}
+
+/*
+ * Initialise the state shared by the primitives tests.
+ *
+ * Stores the table from primitives_get_generic() in 'generic', then the
+ * table from primitives_get() in 'optimized', and finally records the
+ * 'performance' argument in g_TestPrimitivesPerformance.
+ */
+void prim_test_setup(BOOL performance)
+{
+	primitives_t* const reference = primitives_get_generic();
+	generic = reference;
+
+	primitives_t* const accelerated = primitives_get();
+	optimized = accelerated;
+
+	g_TestPrimitivesPerformance = performance;
+}
+
+/*
+ * Placeholder for a generic-versus-optimized speed comparison.
+ *
+ * name        test name, must not be NULL
+ * dsc         description, currently unused
+ * iterations  number of rounds, must not be 0
+ * generic     reference implementation, must not be NULL
+ * optimized   implementation under test, must not be NULL
+ * ...         additional arguments, currently unused
+ *
+ * Returns FALSE when an argument is rejected, TRUE otherwise. Neither
+ * function pointer is called yet: the loop has no body.
+ */
+BOOL speed_test(const char* name, const char* dsc, UINT32 iterations, speed_test_fkt generic,
+                speed_test_fkt optimized, ...)
+{
+	if (!name)
+		return FALSE;
+
+	if (!generic || !optimized)
+		return FALSE;
+
+	if (iterations == 0)
+		return FALSE;
+
+	/* Nothing is measured yet, only the rounds are counted. */
+	UINT32 round = 0;
+	while (round < iterations)
+		round++;
+
+	return TRUE;
+}
@@ -0,0 +1,48 @@
+/* prim_test.h
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.  Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ */
+
+#ifndef FREERDP_LIB_PRIMTEST_H
+#define FREERDP_LIB_PRIMTEST_H
+
+#include <winpr/crt.h>
+#include <winpr/spec.h>
+#include <winpr/wtypes.h>
+#include <winpr/platform.h>
+#include <winpr/crypto.h>
+
+#include <freerdp/primitives.h>
+
+#include "measure.h"
+
+/* Absolute value. The argument is evaluated more than once, so it must not
+ * have side effects. */
+#define ABS(_x_) ((_x_) < 0 ? (-(_x_)) : (_x_))
+/* Equals the largest entry of test_sizes[] as defined in prim_test.c. */
+#define MAX_TEST_SIZE 4096
+
+/* Table of test sizes, defined in prim_test.c. */
+extern int test_sizes[];
+/* Number of entries in test_sizes[]. The array is declared without a size
+ * here, so this count must be kept in sync with the definition by hand. */
+#define NUM_TEST_SIZES 10
+
+/* Set by prim_test_setup() to the value of its 'performance' argument. */
+extern BOOL g_TestPrimitivesPerformance;
+/* Iteration count, initialised to 1000 in prim_test.c. */
+extern UINT32 g_Iterations;
+
+/* Set by prim_test_setup() from primitives_get_generic(). */
+extern primitives_t* generic;
+/* Set by prim_test_setup() from primitives_get(). */
+extern primitives_t* optimized;
+
+/* Fills 'generic' and 'optimized' and stores 'performance' in
+ * g_TestPrimitivesPerformance. */
+void prim_test_setup(BOOL performance);
+
+/* Pointer to a function returning pstatus_t; declared with an empty
+ * parameter list. */
+typedef pstatus_t (*speed_test_fkt)();
+
+/*
+ * Returns FALSE when name, generic or optimized is NULL or iterations is 0,
+ * TRUE otherwise. The implementation in prim_test.c does not call either
+ * function yet and ignores dsc and the variadic arguments.
+ * The parameters 'generic' and 'optimized' shadow the global variables of
+ * the same name declared above.
+ */
+BOOL speed_test(const char* name, const char* dsc, UINT32 iterations, speed_test_fkt generic,
+                speed_test_fkt optimized, ...);
+
+#endif /* FREERDP_LIB_PRIMTEST_H */