Milestone 5: deliver embedded RDP sessions and lifecycle hardening

This commit is contained in:
Keith Smith
2026-03-03 18:59:26 -07:00
parent 230a401386
commit 36006bd4aa
2941 changed files with 724359 additions and 77 deletions

View File

@@ -0,0 +1,383 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized YCoCg<->RGB conversion operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_YCoCg.h"
#include "prim_internal.h"
#include "prim_templates.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <tmmintrin.h>
static primitives_t* generic = nullptr;
/* ------------------------------------------------------------------------- */
/**
 * YCoCg-R -> RGB conversion producing BGRX/BGRA ("inverted" channel order)
 * output, 8 pixels per SSSE3 iteration.
 *
 * @param pSrc      source pixels, 4 bytes each laid out as a,y,o(Co),g(Cg)
 * @param srcStep   source line stride in bytes
 * @param pDst      destination pixels, 4 bytes each
 * @param DstFormat destination format (only passed through to the fallback)
 * @param dstStep   destination line stride in bytes
 * @param width     pixels per line
 * @param height    number of lines
 * @param shift     chroma up-shift to apply before the color matrix
 * @param withAlpha TRUE to keep the source alpha, FALSE to force opaque 0xFF
 *
 * Falls back to the generic C implementation for narrow images, misaligned
 * destinations, and per-row remainders of fewer than 8 pixels.
 */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
                                                  BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                                  UINT32 dstStep, UINT32 width, UINT32 height,
                                                  UINT8 shift, BOOL withAlpha)
{
    const BYTE* sptr = pSrc;
    BYTE* dptr = pDst;
    /* Strides must cover at least one full line of 32-bit pixels. */
    WINPR_ASSERT(srcStep / sizeof(UINT32) >= width);
    WINPR_ASSERT(dstStep / sizeof(UINT32) >= width);
    /* Bytes to skip at the end of each row to reach the next line. */
    const size_t sRowBump = srcStep - width * sizeof(UINT32);
    const size_t dRowBump = dstStep - width * sizeof(UINT32);
    /* Shift left by "shift" and divide by two is the same as shift
     * left by "shift-1".
     */
    int dataShift = shift - 1;
    BYTE mask = (BYTE)(0xFFU << dataShift);
    /* Let's say the data is of the form:
     * y0y0o0g0 a1y1o1g1 a2y2o2g2...
     * Apply:
     * |R| | 1 1/2 -1/2 | |y|
     * |G| = | 1 0 1/2 | * |o|
     * |B| | 1 -1/2 -1/2 | |g|
     * where Y is 8-bit unsigned and o & g are 8-bit signed.
     */
    if ((width < 8) || (ULONG_PTR)dptr & 0x03)
    {
        /* Too small, or we'll never hit a 16-byte boundary. Punt. */
        return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
                                           DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
                                           width, height, shift, withAlpha);
    }
    for (UINT32 h = 0; h < height; h++)
    {
        UINT32 w = width;
        while (w >= 8)
        {
            __m128i R0;
            __m128i R1;
            __m128i R2;
            __m128i R3;
            __m128i R4;
            __m128i R5;
            __m128i R6;
            __m128i R7;
            /* Load two vectors = 8 source pixels. */
            R0 = LOAD_SI128(sptr);
            sptr += (128 / 8);
            R1 = LOAD_SI128(sptr);
            sptr += (128 / 8);
            /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
            /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
            /* Shuffle to pack all the like types together. */
            R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
            R3 = _mm_shuffle_epi8(R0, R2);
            R4 = _mm_shuffle_epi8(R1, R2);
            /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
            /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
            R5 = _mm_unpackhi_epi32(R3, R4);
            R6 = _mm_unpacklo_epi32(R3, R4);
            /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
            /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Save alphas aside */
            if (withAlpha)
                R7 = _mm_unpackhi_epi64(R5, R5);
            else
                R7 = mm_set1_epu32(0xFFFFFFFFU);
            /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
            /* Expand Y's from 8-bit unsigned to 16-bit signed. */
            R1 = mm_set1_epu32(0);
            R0 = _mm_unpacklo_epi8(R5, R1);
            /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
            /* Shift Co's and Cg's by (shift-1). -1 covers division by two.
             * Note: this must be done before sign-conversion.
             * Note also there is no slli_epi8, so we have to use a 16-bit
             * version and then mask.
             */
            R6 = _mm_slli_epi16(R6, dataShift);
            R1 = mm_set1_epu8(mask);
            R6 = _mm_and_si128(R6, R1);
            /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Expand Co's from 8-bit signed to 16-bit signed */
            R1 = _mm_unpackhi_epi8(R6, R6);
            R1 = _mm_srai_epi16(R1, 8);
            /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
            /* Expand Cg's form 8-bit signed to 16-bit signed */
            R2 = _mm_unpacklo_epi8(R6, R6);
            R2 = _mm_srai_epi16(R2, 8);
            /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
            /* Get Y - halfCg and save */
            R6 = _mm_subs_epi16(R0, R2);
            /* R = (Y-halfCg) + halfCo */
            R3 = _mm_adds_epi16(R6, R1);
            /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
            /* G = Y + Cg(/2) */
            R4 = _mm_adds_epi16(R0, R2);
            /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
            /* B = (Y-halfCg) - Co(/2) */
            R5 = _mm_subs_epi16(R6, R1);
            /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
            /* Repack R's & B's. */
            R0 = _mm_packus_epi16(R3, R5);
            /* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */
            /* Repack G's. */
            R1 = _mm_packus_epi16(R4, R4);
            /* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */
            /* And add the A's. */
            R1 = _mm_unpackhi_epi64(R1, R7);
            /* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */
            /* Now do interleaving again. */
            R2 = _mm_unpacklo_epi8(R0, R1);
            /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
            R3 = _mm_unpackhi_epi8(R0, R1);
            /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
            R4 = _mm_unpacklo_epi16(R2, R3);
            /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
            R5 = _mm_unpackhi_epi16(R2, R3);
            /* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */
            STORE_SI128(dptr, R4);
            dptr += (128 / 8);
            STORE_SI128(dptr, R5);
            dptr += (128 / 8);
            w -= 8;
        }
        /* Handle any remainder pixels. */
        if (w > 0)
        {
            pstatus_t status = 0;
            status = generic->YCoCgToRGB_8u_AC4R(
                sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
                WINPR_ASSERTING_INT_CAST(INT32, dstStep), w, 1, shift, withAlpha);
            if (status != PRIMITIVES_SUCCESS)
                return status;
            sptr += w * sizeof(UINT32);
            dptr += w * sizeof(UINT32);
        }
        sptr += sRowBump;
        dptr += dRowBump;
    }
    return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/**
 * YCoCg-R -> RGB conversion producing RGBX/RGBA (non-inverted channel order)
 * output, 8 pixels per SSSE3 iteration.
 *
 * Identical to ssse3_YCoCgRToRGB_8u_AC4R_invert except for the final R/B
 * repacking order (see the comment at the packus below); parameters have the
 * same meaning. Falls back to the generic C implementation for narrow
 * images, misaligned destinations, and per-row remainders.
 */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc,
                                                     UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                     UINT32 DstFormat, UINT32 dstStep, UINT32 width,
                                                     UINT32 height, UINT8 shift, BOOL withAlpha)
{
    const BYTE* sptr = pSrc;
    BYTE* dptr = pDst;
    /* Bytes to skip at the end of each row to reach the next line. */
    size_t sRowBump = srcStep - width * sizeof(UINT32);
    size_t dRowBump = dstStep - width * sizeof(UINT32);
    /* Shift left by "shift" and divide by two is the same as shift
     * left by "shift-1".
     */
    int dataShift = shift - 1;
    BYTE mask = (BYTE)(0xFFU << dataShift);
    /* Let's say the data is of the form:
     * y0y0o0g0 a1y1o1g1 a2y2o2g2...
     * Apply:
     * |R| | 1 1/2 -1/2 | |y|
     * |G| = | 1 0 1/2 | * |o|
     * |B| | 1 -1/2 -1/2 | |g|
     * where Y is 8-bit unsigned and o & g are 8-bit signed.
     */
    if ((width < 8) || (ULONG_PTR)dptr & 0x03)
    {
        /* Too small, or we'll never hit a 16-byte boundary. Punt. */
        return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
                                           DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
                                           width, height, shift, withAlpha);
    }
    for (UINT32 h = 0; h < height; h++)
    {
        UINT32 w = width;
        while (w >= 8)
        {
            __m128i R7;
            /* The faster path, 16-byte aligned load. */
            __m128i R0 = LOAD_SI128(sptr);
            sptr += (128 / 8);
            __m128i R1 = LOAD_SI128(sptr);
            sptr += (128 / 8);
            /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
            /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
            /* Shuffle to pack all the like types together. */
            __m128i R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
            __m128i R3 = _mm_shuffle_epi8(R0, R2);
            __m128i R4 = _mm_shuffle_epi8(R1, R2);
            /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
            /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
            __m128i R5 = _mm_unpackhi_epi32(R3, R4);
            __m128i R6 = _mm_unpacklo_epi32(R3, R4);
            /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
            /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Save alphas aside */
            if (withAlpha)
                R7 = _mm_unpackhi_epi64(R5, R5);
            else
                R7 = mm_set1_epu32(0xFFFFFFFFU);
            /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
            /* Expand Y's from 8-bit unsigned to 16-bit signed. */
            R1 = mm_set1_epu32(0);
            R0 = _mm_unpacklo_epi8(R5, R1);
            /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
            /* Shift Co's and Cg's by (shift-1). -1 covers division by two.
             * Note: this must be done before sign-conversion.
             * Note also there is no slli_epi8, so we have to use a 16-bit
             * version and then mask.
             */
            R6 = _mm_slli_epi16(R6, dataShift);
            R1 = mm_set1_epu8(mask);
            R6 = _mm_and_si128(R6, R1);
            /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Expand Co's from 8-bit signed to 16-bit signed */
            R1 = _mm_unpackhi_epi8(R6, R6);
            R1 = _mm_srai_epi16(R1, 8);
            /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
            /* Expand Cg's form 8-bit signed to 16-bit signed */
            R2 = _mm_unpacklo_epi8(R6, R6);
            R2 = _mm_srai_epi16(R2, 8);
            /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
            /* Get Y - halfCg and save */
            R6 = _mm_subs_epi16(R0, R2);
            /* R = (Y-halfCg) + halfCo */
            R3 = _mm_adds_epi16(R6, R1);
            /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
            /* G = Y + Cg(/2) */
            R4 = _mm_adds_epi16(R0, R2);
            /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
            /* B = (Y-halfCg) - Co(/2) */
            R5 = _mm_subs_epi16(R6, R1);
            /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
            /* Repack R's & B's. */
            /* This line is the only diff between inverted and non-inverted.
             * Unfortunately, it would be expensive to check "inverted"
             * every time through this loop.
             */
            R0 = _mm_packus_epi16(R5, R3);
            /* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */
            /* Repack G's. */
            R1 = _mm_packus_epi16(R4, R4);
            /* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */
            /* And add the A's. */
            R1 = _mm_unpackhi_epi64(R1, R7);
            /* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */
            /* Now do interleaving again. */
            R2 = _mm_unpacklo_epi8(R0, R1);
            /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
            R3 = _mm_unpackhi_epi8(R0, R1);
            /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
            R4 = _mm_unpacklo_epi16(R2, R3);
            /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
            R5 = _mm_unpackhi_epi16(R2, R3);
            /* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */
            STORE_SI128(dptr, R4);
            dptr += (128 / 8);
            STORE_SI128(dptr, R5);
            dptr += (128 / 8);
            w -= 8;
        }
        /* Handle any remainder pixels. */
        if (w > 0)
        {
            pstatus_t status = 0;
            status = generic->YCoCgToRGB_8u_AC4R(
                sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
                WINPR_ASSERTING_INT_CAST(INT32, dstStep), WINPR_ASSERTING_INT_CAST(UINT32, w), 1,
                shift, withAlpha);
            if (status != PRIMITIVES_SUCCESS)
                return status;
            sptr += WINPR_ASSERTING_INT_CAST(UINT32, w) * sizeof(UINT32);
            dptr += WINPR_ASSERTING_INT_CAST(UINT32, w) * sizeof(UINT32);
        }
        sptr += sRowBump;
        dptr += dRowBump;
    }
    return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/**
 * Entry point for the SSSE3 YCoCg-R -> RGB conversion.
 *
 * Dispatches on the destination layout: BGR orderings take the
 * channel-swapped ("invert") variant, RGB orderings the straight one, and
 * every other format falls back to the generic C implementation.
 */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
                                           BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                           INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
                                           BOOL withAlpha)
{
    if ((DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32))
    {
        return ssse3_YCoCgRToRGB_8u_AC4R_invert(
            pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat,
            WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha);
    }

    if ((DstFormat == PIXEL_FORMAT_RGBX32) || (DstFormat == PIXEL_FORMAT_RGBA32))
    {
        return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
            pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat,
            WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha);
    }

    /* Unsupported destination layout: use the generic implementation. */
    return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
                                       shift, withAlpha);
}
#endif
/* ------------------------------------------------------------------------- */
/**
 * Install the SSSE3 YCoCg conversion into the primitives table, or leave the
 * table untouched when SIMD intrinsics were compiled out.
 */
void primitives_init_YCoCg_ssse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
    WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
    /* The SIMD path needs the generic implementation as fallback. */
    generic = primitives_get_generic();
    prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
#else
    WINPR_UNUSED(prims);
    WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available");
#endif
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,187 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized add operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_add.h"
#include "prim_internal.h"
#include "prim_templates.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>
static primitives_t* generic = nullptr;
/* ------------------------------------------------------------------------- */
/* sse3_add_16s: saturated add of two INT16 buffers into a destination buffer.
 * Generated by the SSE3_SSD_ROUTINE template: the vector kernel is
 * _mm_adds_epi16, misaligned/short cases go through generic->add_16s, and the
 * trailing expression handles leftover elements one at a time. */
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
                 generic->add_16s(sptr1++, sptr2++, dptr++, 1))
/**
 * In-place saturated add of two INT16 buffers: after the call, BOTH buffers
 * hold the element-wise saturated sum.
 *
 * @param pSrcDst1 first operand, overwritten with the sum
 * @param pSrcDst2 second operand, overwritten with the sum
 * @param ulen     number of INT16 elements in each buffer
 *
 * Falls back to the generic implementation for short buffers and for
 * pointers that are not even INT16-aligned.
 */
static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
                                      INT16* WINPR_RESTRICT pSrcDst2, UINT32 ulen)
{
    const int shifts = 2;
    INT16* dptr1 = pSrcDst1;
    INT16* dptr2 = pSrcDst2;

    if (ulen < 16) /* pointless if too small */
        return generic->add_16s_inplace(pSrcDst1, pSrcDst2, ulen);

    const UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
    if ((ULONG_PTR)pSrcDst1 & offBeatMask)
    {
        /* Incrementing the pointer skips over 16-byte boundary. */
        return generic->add_16s_inplace(pSrcDst1, pSrcDst2, ulen);
    }

    size_t len = ulen;

    /* Get dptr1 to the 16-byte boundary now.
     * A 16-byte vector holds 8 INT16, so "rem" elements into the current
     * vector means (8 - rem) elements remain until the next boundary.
     * BUGFIX: the previous code used (16 - rem) elements here and never
     * subtracted the prologue from "len", so the SIMD loops below read and
     * wrote past the end of both buffers. */
    const size_t rem = ((UINT_PTR)dptr1 & 0x0f) / sizeof(INT16);
    if (rem != 0)
    {
        const UINT32 add = 8 - (UINT32)rem;
        pstatus_t status = generic->add_16s_inplace(dptr1, dptr2, add);
        if (status != PRIMITIVES_SUCCESS)
            return status;
        dptr1 += add;
        dptr2 += add;
        len -= add; /* prologue elements are done; do not process them again */
    }

    /* Use 4 128-bit SSE registers: 32 INT16 per iteration.
     * LOAD_SI128/STORE_SI128 use unaligned-safe instructions, so a single
     * loop serves both the aligned and unaligned dptr2 cases. */
    size_t count = len >> (7 - shifts);
    len -= count << (7 - shifts);
    while (count--)
    {
        const __m128i* vsptr1 = (const __m128i*)dptr1;
        const __m128i* vsptr2 = (const __m128i*)dptr2;
        __m128i* vdptr1 = (__m128i*)dptr1;
        __m128i* vdptr2 = (__m128i*)dptr2;
        __m128i xmm0 = LOAD_SI128(vsptr1++);
        __m128i xmm1 = LOAD_SI128(vsptr1++);
        __m128i xmm2 = LOAD_SI128(vsptr1++);
        __m128i xmm3 = LOAD_SI128(vsptr1++);
        __m128i xmm4 = LOAD_SI128(vsptr2++);
        __m128i xmm5 = LOAD_SI128(vsptr2++);
        __m128i xmm6 = LOAD_SI128(vsptr2++);
        __m128i xmm7 = LOAD_SI128(vsptr2++);
        xmm0 = _mm_adds_epi16(xmm0, xmm4);
        xmm1 = _mm_adds_epi16(xmm1, xmm5);
        xmm2 = _mm_adds_epi16(xmm2, xmm6);
        xmm3 = _mm_adds_epi16(xmm3, xmm7);
        /* The sum goes to both buffers. */
        STORE_SI128(vdptr1++, xmm0);
        STORE_SI128(vdptr1++, xmm1);
        STORE_SI128(vdptr1++, xmm2);
        STORE_SI128(vdptr1++, xmm3);
        STORE_SI128(vdptr2++, xmm0);
        STORE_SI128(vdptr2++, xmm1);
        STORE_SI128(vdptr2++, xmm2);
        STORE_SI128(vdptr2++, xmm3);
        dptr1 = (INT16*)vdptr1;
        dptr2 = (INT16*)vdptr2;
    }

    /* Use a single 128-bit SSE register: 8 INT16 per iteration. */
    count = len >> (5 - shifts);
    len -= count << (5 - shifts);
    while (count--)
    {
        const __m128i* vsptr1 = (const __m128i*)dptr1;
        const __m128i* vsptr2 = (const __m128i*)dptr2;
        __m128i* vdptr1 = (__m128i*)dptr1;
        __m128i* vdptr2 = (__m128i*)dptr2;
        __m128i xmm0 = LOAD_SI128(vsptr1);
        __m128i xmm1 = LOAD_SI128(vsptr2);
        xmm0 = _mm_adds_epi16(xmm0, xmm1);
        STORE_SI128(vdptr1++, xmm0);
        STORE_SI128(vdptr2++, xmm0);
        dptr1 = (INT16*)vdptr1;
        dptr2 = (INT16*)vdptr2;
    }

    /* Finish off the remainder. */
    if (len > 0)
        return generic->add_16s_inplace(dptr1, dptr2, WINPR_ASSERTING_INT_CAST(uint32_t, len));
    return PRIMITIVES_SUCCESS;
}
#endif
/* ------------------------------------------------------------------------- */
/**
 * Install the SSE3 add primitives into the primitives table, or leave the
 * table untouched when SIMD intrinsics were compiled out.
 */
void primitives_init_add_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
    WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
    /* The SIMD paths need the generic implementation as fallback. */
    generic = primitives_get_generic();
    prims->add_16s = sse3_add_16s;
    prims->add_16s_inplace = sse3_add_16s_inplace;
#else
    WINPR_UNUSED(prims);
    WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
#endif
}

View File

@@ -0,0 +1,215 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized alpha blending routines.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
* Note: this code assumes the second operand is fully opaque,
* e.g.
* newval = alpha1*val1 + (1-alpha1)*val2
* rather than
* newval = alpha1*val1 + (1-alpha1)*alpha2*val2
* The IPP gives other options.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_alphaComp.h"
#include "prim_internal.h"
#include "prim_avxsse.h"
/* ------------------------------------------------------------------------- */
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>
static primitives_t* generic = nullptr;
/**
 * Alpha-compose two 32bpp images: newval = alpha1*val1 + (1-alpha1)*val2,
 * assuming the second operand is fully opaque (see file header note).
 * Processes 4 pixels per SSE iteration after aligning the destination to a
 * 16-byte boundary; falls back to the generic implementation for narrow
 * images, lead-in pixels, and per-row remainders.
 *
 * @param pSrc1    foreground image (supplies the alpha)
 * @param src1Step foreground stride in bytes
 * @param pSrc2    background image (treated as opaque)
 * @param src2Step background stride in bytes
 * @param pDst     destination image
 * @param dstStep  destination stride in bytes
 * @param width    pixels per line
 * @param height   number of lines
 */
static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
                                     const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
                                     BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width,
                                     UINT32 height)
{
    const UINT32* sptr1 = (const UINT32*)pSrc1;
    const UINT32* sptr2 = (const UINT32*)pSrc2;
    /* Nothing to do for an empty rectangle. */
    if ((width <= 0) || (height <= 0))
        return PRIMITIVES_SUCCESS;
    if (width < 4) /* pointless if too small */
    {
        return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width,
                                       height);
    }
    UINT32* dptr = (UINT32*)pDst;
    const size_t linebytes = width * sizeof(UINT32);
    /* Per-row pointer advances (in UINT32 units) past the processed pixels. */
    const size_t src1Jump = (src1Step - linebytes) / sizeof(UINT32);
    const size_t src2Jump = (src2Step - linebytes) / sizeof(UINT32);
    const size_t dstJump = (dstStep - linebytes) / sizeof(UINT32);
    /* xmm0 = zero (for byte->word expansion), xmm1 = 1 per word (alpha bias) */
    __m128i xmm0 = mm_set1_epu32(0);
    __m128i xmm1 = _mm_set1_epi16(1);
    for (UINT32 y = 0; y < height; ++y)
    {
        uint32_t pixels = width;
        uint32_t count = 0;
        /* Get to the 16-byte boundary now. */
        uint32_t leadIn = 0;
        switch ((ULONG_PTR)dptr & 0x0f)
        {
            case 0:
                leadIn = 0;
                break;
            case 4:
                leadIn = 3;
                break;
            case 8:
                leadIn = 2;
                break;
            case 12:
                leadIn = 1;
                break;
            default:
                /* We'll never hit a 16-byte boundary, so do the whole
                 * thing the slow way.
                 */
                leadIn = width;
                break;
        }
        if (leadIn)
        {
            pstatus_t status = 0;
            status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
                                             src2Step, (BYTE*)dptr, dstStep, leadIn, 1);
            if (status != PRIMITIVES_SUCCESS)
                return status;
            sptr1 += leadIn;
            sptr2 += leadIn;
            dptr += leadIn;
            pixels -= leadIn;
        }
        /* Use SSE registers to do 4 pixels at a time. */
        count = pixels >> 2;
        pixels -= count << 2;
        while (count--)
        {
            __m128i xmm2;
            __m128i xmm3;
            __m128i xmm4;
            __m128i xmm5;
            __m128i xmm6;
            __m128i xmm7;
            /* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
            xmm2 = LOAD_SI128(sptr1);
            sptr1 += 4;
            /* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
            xmm3 = LOAD_SI128(sptr2);
            sptr2 += 4;
            /* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
            xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
            /* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */
            xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
            /* subtract */
            xmm6 = _mm_subs_epi16(xmm4, xmm5);
            /* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
            xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
            /* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
            xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
            /* Add one to alphas */
            xmm4 = _mm_adds_epi16(xmm4, xmm1);
            /* Multiply and take low word */
            xmm4 = _mm_mullo_epi16(xmm4, xmm6);
            /* Shift 8 right */
            xmm4 = _mm_srai_epi16(xmm4, 8);
            /* Add xmm5 */
            xmm4 = _mm_adds_epi16(xmm4, xmm5);
            /* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
            /* Repeat the same blend for the low two pixels. */
            /* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
            xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
            /* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
            xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
            /* subtract */
            xmm7 = _mm_subs_epi16(xmm5, xmm6);
            /* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
            xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
            /* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
            xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
            /* Add one to alphas */
            xmm5 = _mm_adds_epi16(xmm5, xmm1);
            /* Multiply and take low word */
            xmm5 = _mm_mullo_epi16(xmm5, xmm7);
            /* Shift 8 right */
            xmm5 = _mm_srai_epi16(xmm5, 8);
            /* Add xmm6 */
            xmm5 = _mm_adds_epi16(xmm5, xmm6);
            /* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */
            /* Must mask off remainders or pack gets confused */
            xmm3 = _mm_set1_epi16(0x00ffU);
            xmm4 = _mm_and_si128(xmm4, xmm3);
            xmm5 = _mm_and_si128(xmm5, xmm3);
            /* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
            xmm5 = _mm_packus_epi16(xmm5, xmm4);
            STORE_SI128(dptr, xmm5);
            dptr += 4;
        }
        /* Finish off the remainder. */
        if (pixels)
        {
            pstatus_t status = 0;
            status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
                                             src2Step, (BYTE*)dptr, dstStep, pixels, 1);
            if (status != PRIMITIVES_SUCCESS)
                return status;
            sptr1 += pixels;
            sptr2 += pixels;
            dptr += pixels;
        }
        /* Jump to next row. */
        sptr1 += src1Jump;
        sptr2 += src2Jump;
        dptr += dstJump;
    }
    return PRIMITIVES_SUCCESS;
}
#endif
/* ------------------------------------------------------------------------- */
/**
 * Install the SSE2 alpha composition into the primitives table, or leave the
 * table untouched when SIMD intrinsics were compiled out.
 */
void primitives_init_alphaComp_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
    WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
    /* The SIMD path needs the generic implementation as fallback. */
    generic = primitives_get_generic();
    prims->alphaComp_argb = sse2_alphaComp_argb;
#else
    WINPR_UNUSED(prims);
    WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
#endif
}

View File

@@ -0,0 +1,54 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized Logical operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_andor.h"
#include "prim_internal.h"
#include "prim_templates.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>
static primitives_t* generic = nullptr;
/* ------------------------------------------------------------------------- */
/* sse3_andC_32u / sse3_orC_32u: bitwise AND / OR of a UINT32 buffer with a
 * constant. Generated by the SSE3_SCD_PRE_ROUTINE template: the vector kernel
 * is _mm_and_si128 / _mm_or_si128, misaligned/short cases go through the
 * generic primitive, and the trailing expression handles the scalar tail. */
SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, generic->andC_32u, _mm_and_si128,
                     *dptr++ = *sptr++ & val)
SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, generic->orC_32u, _mm_or_si128, *dptr++ = *sptr++ | val)
#endif
/* ------------------------------------------------------------------------- */
/**
 * Install the SSE3 AND/OR primitives into the primitives table, or leave the
 * table untouched when SIMD intrinsics were compiled out.
 */
void primitives_init_andor_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
    WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
    /* The SIMD paths need the generic implementation as fallback. */
    generic = primitives_get_generic();
    prims->andC_32u = sse3_andC_32u;
    prims->orC_32u = sse3_orC_32u;
#else
    WINPR_UNUSED(prims);
    WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
#endif
}

View File

@@ -0,0 +1,79 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* FreeRDP primitives SSE implementation
*
* Copyright 2025 Armin Novak <armin.novak@thincast.com>
* Copyright 2025 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <winpr/cast.h>
#include "../../core/simd.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>
WINPR_ATTR_NODISCARD
/* Unsigned-argument wrapper for _mm_set_epi32 (avoids implicit-conversion
 * warnings; argument order matches _mm_set_epi32: val1 is the high lane). */
static inline __m128i mm_set_epu32(uint32_t val1, uint32_t val2, uint32_t val3, uint32_t val4)
{
    const int32_t s1 = WINPR_CXX_COMPAT_CAST(int32_t, val1);
    const int32_t s2 = WINPR_CXX_COMPAT_CAST(int32_t, val2);
    const int32_t s3 = WINPR_CXX_COMPAT_CAST(int32_t, val3);
    const int32_t s4 = WINPR_CXX_COMPAT_CAST(int32_t, val4);
    return _mm_set_epi32(s1, s2, s3, s4);
}
WINPR_ATTR_NODISCARD
/* Unsigned-argument wrapper for _mm_set_epi8 (argument order matches
 * _mm_set_epi8: val1 is the highest byte). */
static inline __m128i mm_set_epu8(uint8_t val1, uint8_t val2, uint8_t val3, uint8_t val4,
                                  uint8_t val5, uint8_t val6, uint8_t val7, uint8_t val8,
                                  uint8_t val9, uint8_t val10, uint8_t val11, uint8_t val12,
                                  uint8_t val13, uint8_t val14, uint8_t val15, uint8_t val16)
{
    /* _mm_setr_epi8 takes bytes low-to-high, i.e. the exact reverse of
     * _mm_set_epi8, so the argument list is reversed here. */
    return _mm_setr_epi8(
        WINPR_CXX_COMPAT_CAST(int8_t, val16), WINPR_CXX_COMPAT_CAST(int8_t, val15),
        WINPR_CXX_COMPAT_CAST(int8_t, val14), WINPR_CXX_COMPAT_CAST(int8_t, val13),
        WINPR_CXX_COMPAT_CAST(int8_t, val12), WINPR_CXX_COMPAT_CAST(int8_t, val11),
        WINPR_CXX_COMPAT_CAST(int8_t, val10), WINPR_CXX_COMPAT_CAST(int8_t, val9),
        WINPR_CXX_COMPAT_CAST(int8_t, val8), WINPR_CXX_COMPAT_CAST(int8_t, val7),
        WINPR_CXX_COMPAT_CAST(int8_t, val6), WINPR_CXX_COMPAT_CAST(int8_t, val5),
        WINPR_CXX_COMPAT_CAST(int8_t, val4), WINPR_CXX_COMPAT_CAST(int8_t, val3),
        WINPR_CXX_COMPAT_CAST(int8_t, val2), WINPR_CXX_COMPAT_CAST(int8_t, val1));
}
WINPR_ATTR_NODISCARD
/* Broadcast an unsigned 32-bit value to all four lanes. */
static inline __m128i mm_set1_epu32(uint32_t val)
{
    const int32_t sval = WINPR_CXX_COMPAT_CAST(int32_t, val);
    return _mm_set1_epi32(sval);
}
WINPR_ATTR_NODISCARD
/* Broadcast an unsigned byte to all sixteen lanes. */
static inline __m128i mm_set1_epu8(uint8_t val)
{
    const int8_t sval = WINPR_CXX_COMPAT_CAST(int8_t, val);
    return _mm_set1_epi8(sval);
}
WINPR_ATTR_NODISCARD
/* 128-bit load from an arbitrarily aligned address (lddqu is
 * unaligned-safe). */
static inline __m128i LOAD_SI128(const void* ptr)
{
    return _mm_lddqu_si128(WINPR_CXX_COMPAT_CAST(const __m128i*, ptr));
}
/* 128-bit store to an arbitrarily aligned address (storeu is
 * unaligned-safe). */
static inline void STORE_SI128(void* ptr, __m128i val)
{
    _mm_storeu_si128(WINPR_CXX_COMPAT_CAST(__m128i*, ptr), val);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,278 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Copy operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <winpr/sysinfo.h>
#include <freerdp/config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <freerdp/log.h>
#include "prim_internal.h"
#include "prim_copy.h"
#include "../codec/color.h"
#include <freerdp/codec/color.h>
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <immintrin.h>
/* Unsigned-argument wrapper for _mm256_set_epi32 (argument order matches
 * _mm256_set_epi32: i0 is the highest 32-bit lane). */
static inline __m256i mm256_set_epu32(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3,
                                      uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7)
{
    /* _mm256_setr_epi32 takes lanes low-to-high, the exact reverse of
     * _mm256_set_epi32, so the argument list is reversed here. */
    return _mm256_setr_epi32((int32_t)i7, (int32_t)i6, (int32_t)i5, (int32_t)i4, (int32_t)i3,
                             (int32_t)i2, (int32_t)i1, (int32_t)i0);
}
/**
 * Copy a BGR24 rectangle into a BGRX32/BGRA32 destination, 8 pixels per AVX2
 * iteration, preserving the destination's fourth (X/alpha) byte: the final
 * blend with "mask" takes byte 3 of each 32-bit pixel from the existing
 * destination contents, and the scalar tail writes only 3 bytes per pixel.
 *
 * srcVMultiplier/dstVMultiplier and srcVOffset/dstVOffset encode vertical
 * flipping of the respective image (multiplier presumably +/-1 with a
 * matching byte offset -- confirm against the caller).
 */
static inline pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
                                                     UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
                                                     UINT32 nHeight,
                                                     const BYTE* WINPR_RESTRICT pSrcData,
                                                     UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                     int64_t srcVMultiplier, int64_t srcVOffset,
                                                     int64_t dstVMultiplier, int64_t dstVOffset)
{
    const int64_t srcByte = 3;  /* bytes per source pixel */
    const int64_t dstByte = 4;  /* bytes per destination pixel */
    /* Selects byte 3 (X/alpha) of each destination pixel in the final blend. */
    const __m256i mask = mm256_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
                                         0xFF000000, 0xFF000000, 0xFF000000);
    /* Expands packed 3-byte pixels to 4-byte slots within each 128-bit lane. */
    const __m256i smask = mm256_set_epu32(0xff171615, 0xff141312, 0xff1110ff, 0xffffffff,
                                          0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
    const __m256i shelpmask = mm256_set_epu32(0xffffffff, 0xffffffff, 0xffffff1f, 0xff1e1d1c,
                                              0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
    /* Vector loop handles multiples of 8 pixels; the rest is done scalar. */
    const UINT32 rem = nWidth % 8;
    const int64_t width = nWidth - rem;
    for (int64_t y = 0; y < nHeight; y++)
    {
        const BYTE* WINPR_RESTRICT srcLine =
            &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
        BYTE* WINPR_RESTRICT dstLine =
            &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
        int64_t x = 0;
        /* Ensure alignment requirements can be met */
        for (; x < width; x += 8)
        {
            const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
            __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
            const __m256i s0 = _mm256_loadu_si256(src);
            __m256i s1 = _mm256_shuffle_epi8(s0, smask);
            /* _mm256_shuffle_epi8 can not cross 128bit lanes.
             * manually copy these bytes with extract/insert */
            const __m256i sx = _mm256_broadcastsi128_si256(_mm256_extractf128_si256(s0, 0));
            const __m256i sxx = _mm256_shuffle_epi8(sx, shelpmask);
            /* Selects the bytes that had to come from the other lane. */
            const __m256i bmask = _mm256_set_epi32(0x00000000, 0x00000000, 0x000000FF, 0x00FFFFFF,
                                                   0x00000000, 0x00000000, 0x00000000, 0x00000000);
            const __m256i merged = _mm256_blendv_epi8(s1, sxx, bmask);
            /* Keep the destination's X/alpha byte, take BGR from the source. */
            const __m256i s2 = _mm256_loadu_si256(dst);
            __m256i d0 = _mm256_blendv_epi8(merged, s2, mask);
            _mm256_storeu_si256(dst, d0);
        }
        /* Scalar tail: copy B,G,R and leave the destination's 4th byte as-is. */
        for (; x < nWidth; x++)
        {
            const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
            BYTE* dst = &dstLine[(x + nXDst) * dstByte];
            *dst++ = *src++;
            *dst++ = *src++;
            *dst++ = *src++;
        }
    }
    return PRIMITIVES_SUCCESS;
}
/**
 * Copy a 32bpp rectangle to a same-layout 32bpp destination, 8 pixels per
 * AVX2 iteration, preserving the destination's fourth (X/alpha) byte: the
 * blend mask selects bytes 0-2 from the source and byte 3 from the existing
 * destination, and the scalar tail writes only 3 bytes per pixel.
 * Channel order is irrelevant since bytes are copied positionally (the
 * dispatcher also uses this routine for RGBX/RGBA).
 *
 * srcVMultiplier/dstVMultiplier and srcVOffset/dstVOffset encode vertical
 * flipping of the respective image (multiplier presumably +/-1 with a
 * matching byte offset -- confirm against the caller).
 */
static inline pstatus_t avx2_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData,
                                                      UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
                                                      UINT32 nWidth, UINT32 nHeight,
                                                      const BYTE* WINPR_RESTRICT pSrcData,
                                                      UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                      int64_t srcVMultiplier, int64_t srcVOffset,
                                                      int64_t dstVMultiplier, int64_t dstVOffset)
{
    const int64_t srcByte = 4;  /* bytes per source pixel */
    const int64_t dstByte = 4;  /* bytes per destination pixel */
    /* 0xFF for color bytes (take source), 0x00 for the X/alpha byte (keep
     * destination). */
    const __m256i mask = _mm256_setr_epi8(
        (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
        (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
        (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
        (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00);
    /* Vector loop handles multiples of 8 pixels; the rest is done scalar. */
    const UINT32 rem = nWidth % 8;
    const int64_t width = nWidth - rem;
    for (int64_t y = 0; y < nHeight; y++)
    {
        const BYTE* WINPR_RESTRICT srcLine =
            &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
        BYTE* WINPR_RESTRICT dstLine =
            &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
        int64_t x = 0;
        for (; x < width; x += 8)
        {
            const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
            __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
            const __m256i s0 = _mm256_loadu_si256(src);
            const __m256i s1 = _mm256_loadu_si256(dst);
            __m256i d0 = _mm256_blendv_epi8(s1, s0, mask);
            _mm256_storeu_si256(dst, d0);
        }
        /* Scalar tail: copy 3 color bytes, leave the destination's 4th byte
         * as-is. */
        for (; x < nWidth; x++)
        {
            const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
            BYTE* dst = &dstLine[(x + nXDst) * dstByte];
            *dst++ = *src++;
            *dst++ = *src++;
            *dst++ = *src++;
        }
    }
    return PRIMITIVES_SUCCESS;
}
/* Dispatch an alpha-preserving copy to one of the AVX2 kernels, or fall
 * back to the generic implementation for unsupported format pairs. */
static pstatus_t avx2_image_copy_no_overlap_dst_alpha(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier,
    int64_t dstVOffset)
{
	WINPR_ASSERT(pDstData);
	WINPR_ASSERT(pSrcData);

	const BOOL dstIsBgrx =
	    (DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32);
	const BOOL dstIsRgbx =
	    (DstFormat == PIXEL_FORMAT_RGBX32) || (DstFormat == PIXEL_FORMAT_RGBA32);
	const BOOL srcIsBgrx =
	    (SrcFormat == PIXEL_FORMAT_BGRX32) || (SrcFormat == PIXEL_FORMAT_BGRA32);
	const BOOL srcIsRgbx =
	    (SrcFormat == PIXEL_FORMAT_RGBX32) || (SrcFormat == PIXEL_FORMAT_RGBA32);

	/* BGR24 -> BGRX/BGRA: expand 3 byte pixels, keeping destination alpha. */
	if ((SrcFormat == PIXEL_FORMAT_BGR24) && dstIsBgrx)
		return avx2_image_copy_bgr24_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
		                                    pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
		                                    srcVOffset, dstVMultiplier, dstVOffset);

	/* Same-layout 32bpp copies (channel order untouched by the kernel). */
	if ((srcIsBgrx && dstIsBgrx) || (srcIsRgbx && dstIsRgbx))
		return avx2_image_copy_bgrx32_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
		                                     pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
		                                     srcVOffset, dstVMultiplier, dstVOffset);

	/* Unsupported combination: use the generic implementation. */
	primitives_t* gen = primitives_get_generic();
	return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
	                            pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
}
/* AVX2 entry point for non-overlapping image copies.
 * Validates arguments, derives default pitches, configures the optional
 * vertical flip, then routes to the alpha-preserving path, the straight
 * memcpy path, or the generic fallback. */
static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
                                            UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
                                            UINT32 nWidth, UINT32 nHeight,
                                            const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
                                            UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                            const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
{
	/* Empty area: nothing to do. */
	if ((nWidth == 0) || (nHeight == 0))
		return PRIMITIVES_SUCCESS;

	/* Reject sizes that would overflow the signed coordinate math. */
	if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
		return -1;

	if (!pDstData || !pSrcData)
		return -1;

	/* Derive pitches from the width when the caller passed none. */
	if (nDstStep == 0)
		nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
	if (nSrcStep == 0)
		nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);

	/* A vertical flip walks the source bottom-up. */
	int64_t srcVMultiplier = 1;
	int64_t srcVOffset = 0;
	const int64_t dstVMultiplier = 1;
	const int64_t dstVOffset = 0;
	if ((flags & FREERDP_FLIP_VERTICAL) != 0)
	{
		srcVMultiplier = -1;
		srcVOffset = (nHeight - 1ll) * nSrcStep;
	}

	if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
		return avx2_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
		                                            nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
		                                            nXSrc, nYSrc, palette, flags, srcVMultiplier,
		                                            srcVOffset, dstVMultiplier, dstVOffset);

	if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
		return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
		                                            nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
		                                            nXSrc, nYSrc, palette, srcVMultiplier,
		                                            srcVOffset, dstVMultiplier, dstVOffset, flags);

	primitives_t* gen = primitives_get_generic();
	return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
	                            pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
}
#endif
/* ------------------------------------------------------------------------- */
/* Install the AVX2 copy primitive, or log why it is unavailable. */
void primitives_init_copy_avx2_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "AVX2 optimizations");
	prims->copy_no_overlap = avx2_image_copy_no_overlap;
#else
	WINPR_UNUSED(prims);
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or WITH_AVX2 or AVX2 intrinsics not available");
#endif
}

View File

@@ -0,0 +1,257 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Copy operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <winpr/sysinfo.h>
#include <freerdp/config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <freerdp/log.h>
#include "prim_internal.h"
#include "prim_avxsse.h"
#include "prim_copy.h"
#include "../codec/color.h"
#include <freerdp/codec/color.h>
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <immintrin.h>
/* Expand BGR24 pixels into a 32bpp BGRX destination, four pixels per SSE
 * iteration. The shuffle spreads 12 packed BGR source bytes into 4 dwords
 * (byte 3 of each dword is a placeholder), and the blend then keeps the
 * destination's original byte 3, so an existing X/alpha byte is preserved. */
static inline pstatus_t sse_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
                                                    UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
                                                    UINT32 nHeight,
                                                    const BYTE* WINPR_RESTRICT pSrcData,
                                                    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                    int64_t srcVMultiplier, int64_t srcVOffset,
                                                    int64_t dstVMultiplier, int64_t dstVOffset)
{
	const int64_t srcByte = 3; /* bytes per source pixel (BGR24) */
	const int64_t dstByte = 4; /* bytes per destination pixel (BGRX32) */
	/* 0xFF in byte 3 of each pixel -> keep the destination byte there. */
	const __m128i mask = mm_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000);
	/* Shuffle control: 12 packed BGR bytes -> 4 BGRx dwords (0xff = don't care). */
	const __m128i smask = mm_set_epu32(0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
	const UINT32 rem = nWidth % 4; /* pixels handled by the scalar tail */
	const int64_t width = nWidth - rem;
	for (int64_t y = 0; y < nHeight; y++)
	{
		/* srcVMultiplier/srcVOffset implement an optional vertical flip. */
		const BYTE* WINPR_RESTRICT srcLine =
		    &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstLine =
		    &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
		int64_t x = 0;
		/* Ensure alignment requirements can be met */
		for (; x < width; x += 4)
		{
			const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
			__m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
			const __m128i s0 = LOAD_SI128(src);
			const __m128i s1 = _mm_shuffle_epi8(s0, smask);
			const __m128i s2 = LOAD_SI128(dst);
			/* mask high bit set -> take s2 (dest byte 3), else s1 (source). */
			__m128i d0 = _mm_blendv_epi8(s1, s2, mask);
			STORE_SI128(dst, d0);
		}
		/* Scalar tail: copy B, G, R; leave destination byte 3 untouched. */
		for (; x < nWidth; x++)
		{
			const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
			BYTE* dst = &dstLine[(x + nXDst) * dstByte];
			*dst++ = *src++;
			*dst++ = *src++;
			*dst++ = *src++;
		}
	}
	return PRIMITIVES_SUCCESS;
}
/* Copy 32bpp pixels between same-layout surfaces while preserving the
 * destination's byte 3 (X/alpha): the blend mask takes bytes 0-2 from the
 * source and keeps byte 3 of every destination pixel. Channel order is not
 * touched, so this serves both BGRX<->BGRX and RGBX<->RGBX copies. */
static inline pstatus_t sse_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
                                                     UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
                                                     UINT32 nHeight,
                                                     const BYTE* WINPR_RESTRICT pSrcData,
                                                     UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                     int64_t srcVMultiplier, int64_t srcVOffset,
                                                     int64_t dstVMultiplier, int64_t dstVOffset)
{
	const int64_t srcByte = 4; /* bytes per source pixel */
	const int64_t dstByte = 4; /* bytes per destination pixel */
	/* Per-byte blend selector: 0xFF -> source byte, 0x00 -> destination byte. */
	const __m128i mask = _mm_setr_epi8((char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF,
	                                   (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF,
	                                   (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00);
	const UINT32 rem = nWidth % 4; /* pixels handled by the scalar tail */
	const int64_t width = nWidth - rem;
	for (int64_t y = 0; y < nHeight; y++)
	{
		/* srcVMultiplier/srcVOffset implement an optional vertical flip. */
		const BYTE* WINPR_RESTRICT srcLine =
		    &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstLine =
		    &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
		int64_t x = 0;
		for (; x < width; x += 4)
		{
			const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
			__m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
			const __m128i s0 = LOAD_SI128(src);
			const __m128i s1 = LOAD_SI128(dst);
			/* mask high bit set -> take s0 (source), else keep s1 (dest). */
			__m128i d0 = _mm_blendv_epi8(s1, s0, mask);
			STORE_SI128(dst, d0);
		}
		/* Scalar tail: copy B, G, R; leave destination byte 3 untouched. */
		for (; x < nWidth; x++)
		{
			const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
			BYTE* dst = &dstLine[(x + nXDst) * dstByte];
			*dst++ = *src++;
			*dst++ = *src++;
			*dst++ = *src++;
		}
	}
	return PRIMITIVES_SUCCESS;
}
/* Dispatch an alpha-preserving copy to one of the SSE kernels, or fall
 * back to the generic implementation for unsupported format pairs. */
static pstatus_t sse_image_copy_no_overlap_dst_alpha(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier,
    int64_t dstVOffset)
{
	WINPR_ASSERT(pDstData);
	WINPR_ASSERT(pSrcData);

	const BOOL dstIsBgrx =
	    (DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32);
	const BOOL dstIsRgbx =
	    (DstFormat == PIXEL_FORMAT_RGBX32) || (DstFormat == PIXEL_FORMAT_RGBA32);
	const BOOL srcIsBgrx =
	    (SrcFormat == PIXEL_FORMAT_BGRX32) || (SrcFormat == PIXEL_FORMAT_BGRA32);
	const BOOL srcIsRgbx =
	    (SrcFormat == PIXEL_FORMAT_RGBX32) || (SrcFormat == PIXEL_FORMAT_RGBA32);

	/* BGR24 -> BGRX/BGRA: expand 3 byte pixels, keeping destination alpha. */
	if ((SrcFormat == PIXEL_FORMAT_BGR24) && dstIsBgrx)
		return sse_image_copy_bgr24_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
		                                   pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
		                                   srcVOffset, dstVMultiplier, dstVOffset);

	/* Same-layout 32bpp copies (channel order untouched by the kernel). */
	if ((srcIsBgrx && dstIsBgrx) || (srcIsRgbx && dstIsRgbx))
		return sse_image_copy_bgrx32_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
		                                    pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
		                                    srcVOffset, dstVMultiplier, dstVOffset);

	/* Unsupported combination: use the generic implementation. */
	primitives_t* gen = primitives_get_generic();
	return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
	                            pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
}
/* SSE4.1 entry point for non-overlapping image copies.
 * Validates arguments, derives default pitches, configures the optional
 * vertical flip, then routes to the alpha-preserving path, the straight
 * memcpy path, or the generic fallback. */
static pstatus_t sse_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
                                           UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
                                           UINT32 nWidth, UINT32 nHeight,
                                           const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
                                           UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                           const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
{
	/* Empty area: nothing to do. */
	if ((nWidth == 0) || (nHeight == 0))
		return PRIMITIVES_SUCCESS;

	/* Reject sizes that would overflow the signed coordinate math. */
	if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
		return -1;

	if (!pDstData || !pSrcData)
		return -1;

	/* Derive pitches from the width when the caller passed none. */
	if (nDstStep == 0)
		nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
	if (nSrcStep == 0)
		nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);

	/* A vertical flip walks the source bottom-up. */
	int64_t srcVMultiplier = 1;
	int64_t srcVOffset = 0;
	const int64_t dstVMultiplier = 1;
	const int64_t dstVOffset = 0;
	if ((flags & FREERDP_FLIP_VERTICAL) != 0)
	{
		srcVMultiplier = -1;
		srcVOffset = (nHeight - 1ll) * nSrcStep;
	}

	if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
		return sse_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
		                                           nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
		                                           nXSrc, nYSrc, palette, flags, srcVMultiplier,
		                                           srcVOffset, dstVMultiplier, dstVOffset);

	if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
		return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
		                                            nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
		                                            nXSrc, nYSrc, palette, srcVMultiplier,
		                                            srcVOffset, dstVMultiplier, dstVOffset, flags);

	primitives_t* gen = primitives_get_generic();
	return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
	                            pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
}
#endif
/* ------------------------------------------------------------------------- */
/* Install the SSE4.1 copy primitive, or log why it is unavailable. */
void primitives_init_copy_sse41_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "SSE4.1 optimizations");
	prims->copy_no_overlap = sse_image_copy_no_overlap;
#else
	WINPR_UNUSED(prims);
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE4.1 intrinsics not available");
#endif
}

View File

@@ -0,0 +1,235 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized routines to set a chunk of memory to a constant.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/
#include <freerdp/config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_internal.h"
#include "prim_avxsse.h"
#include "prim_set.h"
/* ========================================================================= */
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
static primitives_t* generic = nullptr;
/* Fill a byte buffer with a constant value using 16-byte SSE stores.
 * Small buffers are delegated to the generic implementation; otherwise the
 * pointer is aligned byte-by-byte, the bulk is written in 256-byte and
 * 16-byte chunks, and the tail is finished with scalar stores. */
static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 ulen)
{
	size_t len = ulen;

	/* Too short to amortize the alignment work. */
	if (len < 16)
		return generic->set_8u(val, pDst, ulen);

	BYTE* dptr = pDst;

	/* Advance to a 16-byte boundary one byte at a time. */
	while ((ULONG_PTR)dptr & 0x0f)
	{
		*dptr++ = val;
		if (--len == 0)
			return PRIMITIVES_SUCCESS;
	}

	const __m128i xmm0 = mm_set1_epu8(val);

	/* 256-byte chunks: sixteen 16-byte stores per round. */
	size_t count = len >> 8;
	len -= count << 8;
	while (count--)
	{
		for (int i = 0; i < 16; i++)
		{
			STORE_SI128(dptr, xmm0);
			dptr += 16;
		}
	}

	/* Remaining 16-byte chunks. */
	count = len >> 4;
	len -= count << 4;
	while (count--)
	{
		STORE_SI128(dptr, xmm0);
		dptr += 16;
	}

	/* Trailing bytes. */
	while (len--)
		*dptr++ = val;

	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Fill a 32-bit buffer with a constant value using 16-byte SSE stores.
 * Very short runs are done scalar; pointers that cannot reach 16-byte
 * alignment by whole-element steps go to the generic routine; the rest is
 * written in 256-byte and 16-byte chunks with a scalar tail. */
static pstatus_t sse2_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 ulen)
{
	size_t len = ulen;
	UINT32* dptr = pDst;

	/* Short runs: plain scalar stores. */
	if (len < 32)
	{
		while (len--)
			*dptr++ = val;
		return PRIMITIVES_SUCCESS;
	}

	/* A pointer that is not 4-byte aligned can never reach 16-byte
	 * alignment by whole-element steps; punt to the generic version. */
	if (((ULONG_PTR)dptr & 0x03) != 0)
	{
		const primitives_t* prim = primitives_get_generic();
		return prim->set_32u(val, pDst, ulen);
	}

	/* Store elements until the pointer is 16-byte aligned. */
	while ((ULONG_PTR)dptr & 0x0f)
	{
		*dptr++ = val;
		if (--len == 0)
			return PRIMITIVES_SUCCESS;
	}

	const __m128i xmm0 = mm_set1_epu32(val);

	/* 256-byte chunks: sixteen 16-byte stores (64 elements) per round. */
	size_t count = len >> 6;
	len -= count << 6;
	while (count--)
	{
		for (int i = 0; i < 16; i++)
		{
			STORE_SI128(dptr, xmm0);
			dptr += 4;
		}
	}

	/* Remaining 4-element (16-byte) chunks. */
	count = len >> 2;
	len -= count << 2;
	while (count--)
	{
		STORE_SI128(dptr, xmm0);
		dptr += 4;
	}

	/* Trailing elements. */
	while (len--)
		*dptr++ = val;

	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Signed variant: forward to the unsigned routine with the same bit
 * pattern (the fill value's representation is all that matters). */
static pstatus_t sse2_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len)
{
	UINT32 uval = 0;
	memcpy(&uval, &val, sizeof(uval));
	return sse2_set_32u(uval, (UINT32*)pDst, len);
}
#endif
/* ------------------------------------------------------------------------- */
/* Install the SSE2 memory-set primitives, or log why they are unavailable. */
void primitives_init_set_sse2_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
	/* Cache the generic implementations used as fallbacks above. */
	generic = primitives_get_generic();
	WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
	prims->set_32u = sse2_set_32u;
	prims->set_32s = sse2_set_32s;
	prims->set_8u = sse2_set_8u;
#else
	WINPR_UNUSED(prims);
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available");
#endif
}

View File

@@ -0,0 +1,160 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Shift operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_shift.h"
#include "prim_internal.h"
#include "prim_templates.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>
static primitives_t* generic = nullptr;
/* ------------------------------------------------------------------------- */
/* Each invocation below expands the SSE3_SCD_ROUTINE template from
 * prim_templates.h into a complete shift-by-constant primitive:
 *   pstatus_t name(const type* pSrc, UINT32 val, type* pDst, UINT32 ulen)
 * The final argument is the scalar expression used for leftover elements
 * that do not fill a full SSE register. */
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16, int16_t,
                 *dptr++ = (INT16)(((UINT16)*sptr++ << val) & 0xFFFF))
/* ------------------------------------------------------------------------- */
/* Signed right shift is arithmetic: _mm_srai_epi16 / C ">>" on INT16. */
SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s, _mm_srai_epi16, int16_t,
                 *dptr++ = *sptr++ >> val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16, int16_t,
                 *dptr++ = (((UINT16)*sptr++ << val) & 0xFFFF))
/* ------------------------------------------------------------------------- */
/* Unsigned right shift is logical: _mm_srli_epi16 / C ">>" on UINT16. */
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16, int16_t,
                 *dptr++ = *sptr++ >> val)
/* In-place left shift of signed 16-bit elements by a constant.
 * Small or hopelessly misaligned buffers are delegated to the generic
 * routine; otherwise a scalar prefix brings the pointer to 16-byte
 * alignment, the bulk is shifted in 64-element (8 register) and 8-element
 * (1 register) chunks with _mm_slli_epi16, and the remainder is finished
 * generically. */
static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 ulen)
{
	size_t len = ulen;
	const INT32 shifts = 2; /* element-size scale used in the chunk math below */
	if (val == 0)
		return PRIMITIVES_SUCCESS;
	if (val >= 16) /* shifting a 16-bit lane by >= 16 is invalid */
		return -1;
	if (len < 16) /* pointless if too small */
		return generic->lShiftC_16s_inplace(pSrcDst, val, ulen);
	UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
	if ((ULONG_PTR)pSrcDst & offBeatMask)
	{
		/* An odd address can never reach 16-byte alignment by whole
		 * elements; incrementing the pointer skips over the boundary. */
		return generic->lShiftC_16s_inplace(pSrcDst, val, ulen);
	}
	/* Get to the 16-byte boundary now. */
	const UINT32 rem = ((UINT_PTR)pSrcDst & 0x0f) / sizeof(INT16);
	if (rem > 0)
	{
		/* 16 - rem elements is 32 - 2*rem bytes; added to an address whose
		 * low nibble is 2*rem this lands on a 16-byte boundary (handling up
		 * to 8 more elements scalar than strictly necessary). len >= 16 was
		 * checked above, so add never exceeds len. */
		const UINT32 add = 16 - rem;
		pstatus_t status = generic->lShiftC_16s_inplace(pSrcDst, val, add);
		if (status != PRIMITIVES_SUCCESS)
			return status;
		pSrcDst += add;
		len -= add;
	}
	/* Use 8 128-bit SSE registers. */
	size_t count = len >> (8 - shifts);
	len -= count << (8 - shifts);
	while (count--)
	{
		const __m128i* src = (const __m128i*)pSrcDst;
		__m128i xmm0 = LOAD_SI128(src++);
		__m128i xmm1 = LOAD_SI128(src++);
		__m128i xmm2 = LOAD_SI128(src++);
		__m128i xmm3 = LOAD_SI128(src++);
		__m128i xmm4 = LOAD_SI128(src++);
		__m128i xmm5 = LOAD_SI128(src++);
		__m128i xmm6 = LOAD_SI128(src++);
		__m128i xmm7 = LOAD_SI128(src);
		xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);
		xmm1 = _mm_slli_epi16(xmm1, (int16_t)val);
		xmm2 = _mm_slli_epi16(xmm2, (int16_t)val);
		xmm3 = _mm_slli_epi16(xmm3, (int16_t)val);
		xmm4 = _mm_slli_epi16(xmm4, (int16_t)val);
		xmm5 = _mm_slli_epi16(xmm5, (int16_t)val);
		xmm6 = _mm_slli_epi16(xmm6, (int16_t)val);
		xmm7 = _mm_slli_epi16(xmm7, (int16_t)val);
		__m128i* dst = (__m128i*)pSrcDst;
		STORE_SI128(dst++, xmm0);
		STORE_SI128(dst++, xmm1);
		STORE_SI128(dst++, xmm2);
		STORE_SI128(dst++, xmm3);
		STORE_SI128(dst++, xmm4);
		STORE_SI128(dst++, xmm5);
		STORE_SI128(dst++, xmm6);
		STORE_SI128(dst++, xmm7);
		pSrcDst = (INT16*)dst;
	}
	/* Use a single 128-bit SSE register. */
	count = len >> (5 - shifts);
	len -= count << (5 - shifts);
	while (count--)
	{
		const __m128i* src = (const __m128i*)pSrcDst;
		__m128i xmm0 = LOAD_SI128(src);
		xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);
		__m128i* dst = (__m128i*)pSrcDst;
		STORE_SI128(dst++, xmm0);
		pSrcDst = (INT16*)dst;
	}
	/* Finish off the remainder. */
	if (len > 0)
		return generic->lShiftC_16s_inplace(pSrcDst, val, WINPR_ASSERTING_INT_CAST(uint32_t, len));
	return PRIMITIVES_SUCCESS;
}
#endif
/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
* depending on the sign of val. To avoid using the deprecated inplace
* routines, a wrapper can use the src for the dest.
*/
/* ------------------------------------------------------------------------- */
/* Install the SSE shift-by-constant primitives, or log why they are
 * unavailable. */
void primitives_init_shift_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
	/* Cache the generic implementations used as fallbacks above. */
	generic = primitives_get_generic();
	WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
	prims->lShiftC_16s = sse2_lShiftC_16s;
	prims->lShiftC_16u = sse2_lShiftC_16u;
	prims->rShiftC_16s = sse2_rShiftC_16s;
	prims->rShiftC_16u = sse2_rShiftC_16u;
	prims->lShiftC_16s_inplace = sse2_lShiftC_16s_inplace;
#else
	WINPR_UNUSED(prims);
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
#endif
}

View File

@@ -0,0 +1,188 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized sign operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_sign.h"
#include "prim_internal.h"
#include "prim_avxsse.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <tmmintrin.h>
static primitives_t* generic = nullptr;
/* ------------------------------------------------------------------------- */
/* Compute the sign (-1, 0, +1) of every 16-bit element of pSrc into pDst.
 * Uses _mm_sign_epi16(1, x), which yields -1/0/+1 per lane. Short or
 * unalignable buffers go to the generic routine; otherwise a scalar prefix
 * aligns the destination, then 32-element and 8-element SIMD chunks run,
 * and a scalar tail finishes.
 *
 * NOTE(review): the previous version had separate "aligned" and
 * "unaligned" source branches whose bodies were byte-identical (both used
 * LOAD_SI128), so the alignment test selected between duplicate code; the
 * dead branch has been collapsed. Behavior is unchanged. */
static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
                                UINT32 ulen)
{
	size_t len = ulen;
	const INT16* sptr = pSrc;
	INT16* dptr = pDst;

	/* Too short to amortize setup cost. */
	if (len < 16)
	{
		return generic->sign_16s(pSrc, pDst, ulen);
	}

	/* An odd destination address can never become 16-byte aligned. */
	if ((ULONG_PTR)pDst & 0x01)
	{
		return generic->sign_16s(pSrc, pDst, ulen);
	}

	/* Scalar until the destination is 16-byte aligned. */
	while ((ULONG_PTR)dptr & 0x0f)
	{
		INT16 src = *sptr++;
		*dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? (-1) : ((src > 0) ? 1 : 0));

		if (--len == 0)
			return PRIMITIVES_SUCCESS;
	}

	/* Do 32-short chunks using 8 XMM registers. */
	size_t count = len >> 5; /* / 32 */
	len -= count << 5;       /* * 32 */

	while (count--)
	{
		__m128i xmm0 = _mm_set1_epi16(0x0001U);
		__m128i xmm1 = _mm_set1_epi16(0x0001U);
		__m128i xmm2 = _mm_set1_epi16(0x0001U);
		__m128i xmm3 = _mm_set1_epi16(0x0001U);
		const __m128i xmm4 = LOAD_SI128(sptr);
		sptr += 8;
		const __m128i xmm5 = LOAD_SI128(sptr);
		sptr += 8;
		const __m128i xmm6 = LOAD_SI128(sptr);
		sptr += 8;
		const __m128i xmm7 = LOAD_SI128(sptr);
		sptr += 8;
		xmm0 = _mm_sign_epi16(xmm0, xmm4);
		xmm1 = _mm_sign_epi16(xmm1, xmm5);
		xmm2 = _mm_sign_epi16(xmm2, xmm6);
		xmm3 = _mm_sign_epi16(xmm3, xmm7);
		STORE_SI128(dptr, xmm0);
		dptr += 8;
		STORE_SI128(dptr, xmm1);
		dptr += 8;
		STORE_SI128(dptr, xmm2);
		dptr += 8;
		STORE_SI128(dptr, xmm3);
		dptr += 8;
	}

	/* Do 8-short chunks using two XMM registers. */
	count = len >> 3;
	len -= count << 3;

	while (count--)
	{
		__m128i xmm0 = _mm_set1_epi16(0x0001U);
		__m128i xmm1 = LOAD_SI128(sptr);
		sptr += 8;
		xmm0 = _mm_sign_epi16(xmm0, xmm1);
		STORE_SI128(dptr, xmm0);
		dptr += 8;
	}

	/* Do leftovers. */
	while (len--)
	{
		INT16 src = *sptr++;
		*dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? -1 : ((src > 0) ? 1 : 0));
	}

	return PRIMITIVES_SUCCESS;
}
#endif /* SSE_AVX_INTRINSICS_ENABLED */
/* ------------------------------------------------------------------------- */
/* Install the SSSE3 sign primitive, or log why it is unavailable. */
void primitives_init_sign_ssse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
	/* Remember the generic fallbacks used for small/unaligned inputs.
	 * No IPP equivalent was identified for this primitive. */
	generic = primitives_get_generic();
	WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
	prims->sign_16s = ssse3_sign_16s;
#else
	WINPR_UNUSED(prims);
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSSE3/SSE3 intrinsics not available");
#endif
}

View File

@@ -0,0 +1,278 @@
/* prim_templates.h
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*/
#pragma once
#include "prim_avxsse.h"
/* These are prototypes for SSE (potentially NEON) routines that do a
* simple SSE operation over an array of data. Since so much of this
* code is shared except for the operation itself, these prototypes are
* used rather than duplicating code. The naming convention depends on
* the parameters: S=Source param; C=Constant; D=Destination.
* All the macros have parameters for a fallback procedure if the data
* is too small and an operation "the slow way" for use at 16-byte edges.
*/
/* SSE3 note: If someone needs to support an SSE2 version of these without
* SSE3 support, an alternative version could be added that merely checks
* that 16-byte alignment on both destination and source(s) can be
* achieved, rather than use LDDQU for unaligned reads.
*/
/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
* It easily can't do that if the value is stored in a variable.
* So don't save it as an intermediate value.
*/
/* ----------------------------------------------------------------------------
* SCD = Source, Constant, Destination
*/
/* Expands to a routine applying SSE op `_op_(xmm, (_op_type_)val)` to every
 * element of pSrc, writing results to pDst. Bulk work runs in 8-register
 * (128 bytes) then 1-register (16 bytes) chunks; `_slowWay_` is the scalar
 * statement used for the remainder. val == 0 is a no-op; val >= 16 is
 * rejected (invalid shift count for 16-bit lanes).
 * NOTE(review): the `_fallback_` parameter is accepted for signature
 * compatibility with the other templates but is not referenced by this
 * expansion. */
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _op_type_, _slowWay_)          \
	WINPR_ATTR_NODISCARD                                                                  \
	static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, UINT32 val,                \
	                        _type_* WINPR_RESTRICT pDst, UINT32 ulen)                     \
	{                                                                                     \
		size_t len = ulen;                                                                \
		INT32 shifts = 0;                                                                 \
		const _type_* sptr = pSrc;                                                        \
		_type_* dptr = pDst;                                                              \
		if (val == 0)                                                                     \
			return PRIMITIVES_SUCCESS;                                                    \
		if (val >= 16)                                                                    \
			return -1;                                                                    \
		if (sizeof(_type_) == 1)                                                          \
			shifts = 1;                                                                   \
		else if (sizeof(_type_) == 2)                                                     \
			shifts = 2;                                                                   \
		else if (sizeof(_type_) == 4)                                                     \
			shifts = 3;                                                                   \
		else if (sizeof(_type_) == 8)                                                     \
			shifts = 4;                                                                   \
		/* Use 8 128-bit SSE registers. */                                                \
		size_t count = len >> (8 - shifts);                                               \
		len -= count << (8 - shifts);                                                     \
		                                                                                  \
		while (count--)                                                                   \
		{                                                                                 \
			__m128i xmm0 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm1 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm2 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm3 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm4 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm5 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm6 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm7 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			xmm0 = _op_(xmm0, (_op_type_)val);                                            \
			xmm1 = _op_(xmm1, (_op_type_)val);                                            \
			xmm2 = _op_(xmm2, (_op_type_)val);                                            \
			xmm3 = _op_(xmm3, (_op_type_)val);                                            \
			xmm4 = _op_(xmm4, (_op_type_)val);                                            \
			xmm5 = _op_(xmm5, (_op_type_)val);                                            \
			xmm6 = _op_(xmm6, (_op_type_)val);                                            \
			xmm7 = _op_(xmm7, (_op_type_)val);                                            \
			STORE_SI128(dptr, xmm0);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm1);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm2);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm3);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm4);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm5);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm6);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm7);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
		}                                                                                 \
		                                                                                  \
		/* Use a single 128-bit SSE register. */                                          \
		count = len >> (5 - shifts);                                                      \
		len -= count << (5 - shifts);                                                     \
		while (count--)                                                                   \
		{                                                                                 \
			__m128i xmm0 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			xmm0 = _op_(xmm0, (_op_type_)val);                                            \
			STORE_SI128(dptr, xmm0);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
		}                                                                                 \
		/* Finish off the remainder. */                                                   \
		while (len--)                                                                     \
		{                                                                                 \
			_slowWay_;                                                                    \
		}                                                                                 \
		return PRIMITIVES_SUCCESS;                                                        \
	}
/* ----------------------------------------------------------------------------
* SCD = Source, Constant, Destination
* PRE = preload xmm0 with the constant.
*/
/* Expands to a routine applying the two-operand SSE op `_op_(xmm, konst)`
 * to every element of pSrc, with the constant preloaded into xmm0.
 * Bulk work runs in 4-register (64 bytes) then 1-register (16 bytes)
 * chunks; `_slowWay_` is the scalar statement used for the remainder.
 * NOTE(review): the constant is broadcast with mm_set1_epu32 regardless of
 * `_type_`; for element sizes below 32 bits this only replicates `val` per
 * 32-bit lane, so callers presumably pass a value already duplicated into
 * the sub-lanes - confirm against the invocation sites.
 * `_fallback_` is accepted for signature compatibility but unused here. */
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_)                 \
	WINPR_ATTR_NODISCARD                                                                  \
	static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, _type_ val,                \
	                        _type_* WINPR_RESTRICT pDst, INT32 ilen)                      \
	{                                                                                     \
		size_t len = WINPR_ASSERTING_INT_CAST(size_t, ilen);                              \
		int shifts = 0;                                                                   \
		const _type_* sptr = pSrc;                                                        \
		_type_* dptr = pDst;                                                              \
		__m128i xmm0;                                                                     \
		if (sizeof(_type_) == 1)                                                          \
			shifts = 1;                                                                   \
		else if (sizeof(_type_) == 2)                                                     \
			shifts = 2;                                                                   \
		else if (sizeof(_type_) == 4)                                                     \
			shifts = 3;                                                                   \
		else if (sizeof(_type_) == 8)                                                     \
			shifts = 4;                                                                   \
		/* Use 4 128-bit SSE registers. */                                                \
		size_t count = len >> (7 - shifts);                                               \
		len -= count << (7 - shifts);                                                     \
		xmm0 = mm_set1_epu32(val);                                                        \
		for (size_t x = 0; x < count; x++)                                                \
		{                                                                                 \
			__m128i xmm1 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm2 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm3 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm4 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			xmm1 = _op_(xmm1, xmm0);                                                      \
			xmm2 = _op_(xmm2, xmm0);                                                      \
			xmm3 = _op_(xmm3, xmm0);                                                      \
			xmm4 = _op_(xmm4, xmm0);                                                      \
			STORE_SI128(dptr, xmm1);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm2);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm3);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm4);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
		}                                                                                 \
		/* Use a single 128-bit SSE register. */                                          \
		count = len >> (5 - shifts);                                                      \
		len -= count << (5 - shifts);                                                     \
		for (size_t x = 0; x < count; x++)                                                \
		{                                                                                 \
			__m128i xmm1 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			xmm1 = _op_(xmm1, xmm0);                                                      \
			STORE_SI128(dptr, xmm1);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
		}                                                                                 \
		/* Finish off the remainder. */                                                   \
		for (size_t x = 0; x < len; x++)                                                  \
		{                                                                                 \
			_slowWay_;                                                                    \
		}                                                                                 \
		return PRIMITIVES_SUCCESS;                                                        \
	}
/* ----------------------------------------------------------------------------
* SSD = Source1, Source2, Destination
*/
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
WINPR_ATTR_NODISCARD \
static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc1, \
const _type_* WINPR_RESTRICT pSrc2, _type_* WINPR_RESTRICT pDst, \
UINT32 ulen) \
{ \
size_t len = ulen; \
int shifts = 0; \
const _type_* sptr1 = pSrc1; \
const _type_* sptr2 = pSrc2; \
_type_* dptr = pDst; \
size_t count; \
if (sizeof(_type_) == 1) \
shifts = 1; \
else if (sizeof(_type_) == 2) \
shifts = 2; \
else if (sizeof(_type_) == 4) \
shifts = 3; \
else if (sizeof(_type_) == 8) \
shifts = 4; \
/* Use 4 128-bit SSE registers. */ \
count = len >> (7 - shifts); \
len -= count << (7 - shifts); \
/* Aligned loads */ \
while (count--) \
{ \
__m128i xmm0 = LOAD_SI128(sptr1); \
sptr1 += (16 / sizeof(_type_)); \
__m128i xmm1 = LOAD_SI128(sptr1); \
sptr1 += (16 / sizeof(_type_)); \
__m128i xmm2 = LOAD_SI128(sptr1); \
sptr1 += (16 / sizeof(_type_)); \
__m128i xmm3 = LOAD_SI128(sptr1); \
sptr1 += (16 / sizeof(_type_)); \
__m128i xmm4 = LOAD_SI128(sptr2); \
sptr2 += (16 / sizeof(_type_)); \
__m128i xmm5 = LOAD_SI128(sptr2); \
sptr2 += (16 / sizeof(_type_)); \
__m128i xmm6 = LOAD_SI128(sptr2); \
sptr2 += (16 / sizeof(_type_)); \
__m128i xmm7 = LOAD_SI128(sptr2); \
sptr2 += (16 / sizeof(_type_)); \
xmm0 = _op_(xmm0, xmm4); \
xmm1 = _op_(xmm1, xmm5); \
xmm2 = _op_(xmm2, xmm6); \
xmm3 = _op_(xmm3, xmm7); \
STORE_SI128(dptr, xmm0); \
dptr += (16 / sizeof(_type_)); \
STORE_SI128(dptr, xmm1); \
dptr += (16 / sizeof(_type_)); \
STORE_SI128(dptr, xmm2); \
dptr += (16 / sizeof(_type_)); \
STORE_SI128(dptr, xmm3); \
dptr += (16 / sizeof(_type_)); \
} \
/* Use a single 128-bit SSE register. */ \
count = len >> (5 - shifts); \
len -= count << (5 - shifts); \
while (count--) \
{ \
__m128i xmm0 = LOAD_SI128(sptr1); \
sptr1 += (16 / sizeof(_type_)); \
__m128i xmm1 = LOAD_SI128(sptr2); \
sptr2 += (16 / sizeof(_type_)); \
xmm0 = _op_(xmm0, xmm1); \
STORE_SI128(dptr, xmm0); \
dptr += (16 / sizeof(_type_)); \
} \
/* Finish off the remainder. */ \
while (len--) \
{ \
const pstatus_t rc = _slowWay_; \
if (rc != PRIMITIVES_SUCCESS) \
return rc; \
} \
return PRIMITIVES_SUCCESS; \
}