Milestone 5: deliver embedded RDP sessions and lifecycle hardening

This commit is contained in:
Keith Smith
2026-03-03 18:59:26 -07:00
parent 230a401386
commit 36006bd4aa
2941 changed files with 724359 additions and 77 deletions

View File

@@ -0,0 +1,383 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized YCoCg<->RGB conversion operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_YCoCg.h"
#include "prim_internal.h"
#include "prim_templates.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <tmmintrin.h>
static primitives_t* generic = nullptr;
/* ------------------------------------------------------------------------- */
/**
 * YCoCg-R -> RGB conversion producing BGRX/BGRA ("inverted" channel order)
 * output, 8 pixels per SSSE3 iteration.
 *
 * @param pSrc      source pixels, 4 bytes each laid out as a,y,o(Co),g(Cg)
 * @param srcStep   source line stride in bytes
 * @param pDst      destination pixels, 4 bytes each
 * @param DstFormat destination format (only passed through to the fallback)
 * @param dstStep   destination line stride in bytes
 * @param width     pixels per line
 * @param height    number of lines
 * @param shift     chroma up-shift to apply before the color matrix
 * @param withAlpha TRUE to keep the source alpha, FALSE to force opaque 0xFF
 *
 * Falls back to the generic C implementation for narrow images, misaligned
 * destinations, and per-row remainders of fewer than 8 pixels.
 */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
                                                  BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                                  UINT32 dstStep, UINT32 width, UINT32 height,
                                                  UINT8 shift, BOOL withAlpha)
{
    const BYTE* sptr = pSrc;
    BYTE* dptr = pDst;
    /* Strides must cover at least one full line of 32-bit pixels. */
    WINPR_ASSERT(srcStep / sizeof(UINT32) >= width);
    WINPR_ASSERT(dstStep / sizeof(UINT32) >= width);
    /* Bytes to skip at the end of each row to reach the next line. */
    const size_t sRowBump = srcStep - width * sizeof(UINT32);
    const size_t dRowBump = dstStep - width * sizeof(UINT32);
    /* Shift left by "shift" and divide by two is the same as shift
     * left by "shift-1".
     */
    int dataShift = shift - 1;
    BYTE mask = (BYTE)(0xFFU << dataShift);
    /* Let's say the data is of the form:
     * y0y0o0g0 a1y1o1g1 a2y2o2g2...
     * Apply:
     * |R| | 1 1/2 -1/2 | |y|
     * |G| = | 1 0 1/2 | * |o|
     * |B| | 1 -1/2 -1/2 | |g|
     * where Y is 8-bit unsigned and o & g are 8-bit signed.
     */
    if ((width < 8) || (ULONG_PTR)dptr & 0x03)
    {
        /* Too small, or we'll never hit a 16-byte boundary. Punt. */
        return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
                                           DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
                                           width, height, shift, withAlpha);
    }
    for (UINT32 h = 0; h < height; h++)
    {
        UINT32 w = width;
        while (w >= 8)
        {
            __m128i R0;
            __m128i R1;
            __m128i R2;
            __m128i R3;
            __m128i R4;
            __m128i R5;
            __m128i R6;
            __m128i R7;
            /* Load two vectors = 8 source pixels. */
            R0 = LOAD_SI128(sptr);
            sptr += (128 / 8);
            R1 = LOAD_SI128(sptr);
            sptr += (128 / 8);
            /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
            /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
            /* Shuffle to pack all the like types together. */
            R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
            R3 = _mm_shuffle_epi8(R0, R2);
            R4 = _mm_shuffle_epi8(R1, R2);
            /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
            /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
            R5 = _mm_unpackhi_epi32(R3, R4);
            R6 = _mm_unpacklo_epi32(R3, R4);
            /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
            /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Save alphas aside */
            if (withAlpha)
                R7 = _mm_unpackhi_epi64(R5, R5);
            else
                R7 = mm_set1_epu32(0xFFFFFFFFU);
            /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
            /* Expand Y's from 8-bit unsigned to 16-bit signed. */
            R1 = mm_set1_epu32(0);
            R0 = _mm_unpacklo_epi8(R5, R1);
            /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
            /* Shift Co's and Cg's by (shift-1). -1 covers division by two.
             * Note: this must be done before sign-conversion.
             * Note also there is no slli_epi8, so we have to use a 16-bit
             * version and then mask.
             */
            R6 = _mm_slli_epi16(R6, dataShift);
            R1 = mm_set1_epu8(mask);
            R6 = _mm_and_si128(R6, R1);
            /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Expand Co's from 8-bit signed to 16-bit signed */
            R1 = _mm_unpackhi_epi8(R6, R6);
            R1 = _mm_srai_epi16(R1, 8);
            /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
            /* Expand Cg's form 8-bit signed to 16-bit signed */
            R2 = _mm_unpacklo_epi8(R6, R6);
            R2 = _mm_srai_epi16(R2, 8);
            /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
            /* Get Y - halfCg and save */
            R6 = _mm_subs_epi16(R0, R2);
            /* R = (Y-halfCg) + halfCo */
            R3 = _mm_adds_epi16(R6, R1);
            /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
            /* G = Y + Cg(/2) */
            R4 = _mm_adds_epi16(R0, R2);
            /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
            /* B = (Y-halfCg) - Co(/2) */
            R5 = _mm_subs_epi16(R6, R1);
            /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
            /* Repack R's & B's. */
            R0 = _mm_packus_epi16(R3, R5);
            /* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */
            /* Repack G's. */
            R1 = _mm_packus_epi16(R4, R4);
            /* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */
            /* And add the A's. */
            R1 = _mm_unpackhi_epi64(R1, R7);
            /* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */
            /* Now do interleaving again. */
            R2 = _mm_unpacklo_epi8(R0, R1);
            /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
            R3 = _mm_unpackhi_epi8(R0, R1);
            /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
            R4 = _mm_unpacklo_epi16(R2, R3);
            /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
            R5 = _mm_unpackhi_epi16(R2, R3);
            /* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */
            STORE_SI128(dptr, R4);
            dptr += (128 / 8);
            STORE_SI128(dptr, R5);
            dptr += (128 / 8);
            w -= 8;
        }
        /* Handle any remainder pixels. */
        if (w > 0)
        {
            pstatus_t status = 0;
            status = generic->YCoCgToRGB_8u_AC4R(
                sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
                WINPR_ASSERTING_INT_CAST(INT32, dstStep), w, 1, shift, withAlpha);
            if (status != PRIMITIVES_SUCCESS)
                return status;
            sptr += w * sizeof(UINT32);
            dptr += w * sizeof(UINT32);
        }
        sptr += sRowBump;
        dptr += dRowBump;
    }
    return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/**
 * YCoCg-R -> RGB conversion producing RGBX/RGBA (non-inverted channel order)
 * output, 8 pixels per SSSE3 iteration.
 *
 * Identical to ssse3_YCoCgRToRGB_8u_AC4R_invert except for the final R/B
 * repacking order (see the comment at the packus below); parameters have the
 * same meaning. Falls back to the generic C implementation for narrow
 * images, misaligned destinations, and per-row remainders.
 */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc,
                                                     UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                     UINT32 DstFormat, UINT32 dstStep, UINT32 width,
                                                     UINT32 height, UINT8 shift, BOOL withAlpha)
{
    const BYTE* sptr = pSrc;
    BYTE* dptr = pDst;
    /* Bytes to skip at the end of each row to reach the next line. */
    size_t sRowBump = srcStep - width * sizeof(UINT32);
    size_t dRowBump = dstStep - width * sizeof(UINT32);
    /* Shift left by "shift" and divide by two is the same as shift
     * left by "shift-1".
     */
    int dataShift = shift - 1;
    BYTE mask = (BYTE)(0xFFU << dataShift);
    /* Let's say the data is of the form:
     * y0y0o0g0 a1y1o1g1 a2y2o2g2...
     * Apply:
     * |R| | 1 1/2 -1/2 | |y|
     * |G| = | 1 0 1/2 | * |o|
     * |B| | 1 -1/2 -1/2 | |g|
     * where Y is 8-bit unsigned and o & g are 8-bit signed.
     */
    if ((width < 8) || (ULONG_PTR)dptr & 0x03)
    {
        /* Too small, or we'll never hit a 16-byte boundary. Punt. */
        return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
                                           DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
                                           width, height, shift, withAlpha);
    }
    for (UINT32 h = 0; h < height; h++)
    {
        UINT32 w = width;
        while (w >= 8)
        {
            __m128i R7;
            /* The faster path, 16-byte aligned load. */
            __m128i R0 = LOAD_SI128(sptr);
            sptr += (128 / 8);
            __m128i R1 = LOAD_SI128(sptr);
            sptr += (128 / 8);
            /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
            /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
            /* Shuffle to pack all the like types together. */
            __m128i R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
            __m128i R3 = _mm_shuffle_epi8(R0, R2);
            __m128i R4 = _mm_shuffle_epi8(R1, R2);
            /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
            /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
            __m128i R5 = _mm_unpackhi_epi32(R3, R4);
            __m128i R6 = _mm_unpacklo_epi32(R3, R4);
            /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
            /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Save alphas aside */
            if (withAlpha)
                R7 = _mm_unpackhi_epi64(R5, R5);
            else
                R7 = mm_set1_epu32(0xFFFFFFFFU);
            /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
            /* Expand Y's from 8-bit unsigned to 16-bit signed. */
            R1 = mm_set1_epu32(0);
            R0 = _mm_unpacklo_epi8(R5, R1);
            /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
            /* Shift Co's and Cg's by (shift-1). -1 covers division by two.
             * Note: this must be done before sign-conversion.
             * Note also there is no slli_epi8, so we have to use a 16-bit
             * version and then mask.
             */
            R6 = _mm_slli_epi16(R6, dataShift);
            R1 = mm_set1_epu8(mask);
            R6 = _mm_and_si128(R6, R1);
            /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Expand Co's from 8-bit signed to 16-bit signed */
            R1 = _mm_unpackhi_epi8(R6, R6);
            R1 = _mm_srai_epi16(R1, 8);
            /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
            /* Expand Cg's form 8-bit signed to 16-bit signed */
            R2 = _mm_unpacklo_epi8(R6, R6);
            R2 = _mm_srai_epi16(R2, 8);
            /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
            /* Get Y - halfCg and save */
            R6 = _mm_subs_epi16(R0, R2);
            /* R = (Y-halfCg) + halfCo */
            R3 = _mm_adds_epi16(R6, R1);
            /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
            /* G = Y + Cg(/2) */
            R4 = _mm_adds_epi16(R0, R2);
            /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
            /* B = (Y-halfCg) - Co(/2) */
            R5 = _mm_subs_epi16(R6, R1);
            /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
            /* Repack R's & B's. */
            /* This line is the only diff between inverted and non-inverted.
             * Unfortunately, it would be expensive to check "inverted"
             * every time through this loop.
             */
            R0 = _mm_packus_epi16(R5, R3);
            /* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */
            /* Repack G's. */
            R1 = _mm_packus_epi16(R4, R4);
            /* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */
            /* And add the A's. */
            R1 = _mm_unpackhi_epi64(R1, R7);
            /* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */
            /* Now do interleaving again. */
            R2 = _mm_unpacklo_epi8(R0, R1);
            /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
            R3 = _mm_unpackhi_epi8(R0, R1);
            /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
            R4 = _mm_unpacklo_epi16(R2, R3);
            /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
            R5 = _mm_unpackhi_epi16(R2, R3);
            /* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */
            STORE_SI128(dptr, R4);
            dptr += (128 / 8);
            STORE_SI128(dptr, R5);
            dptr += (128 / 8);
            w -= 8;
        }
        /* Handle any remainder pixels. */
        if (w > 0)
        {
            pstatus_t status = 0;
            status = generic->YCoCgToRGB_8u_AC4R(
                sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
                WINPR_ASSERTING_INT_CAST(INT32, dstStep), WINPR_ASSERTING_INT_CAST(UINT32, w), 1,
                shift, withAlpha);
            if (status != PRIMITIVES_SUCCESS)
                return status;
            sptr += WINPR_ASSERTING_INT_CAST(UINT32, w) * sizeof(UINT32);
            dptr += WINPR_ASSERTING_INT_CAST(UINT32, w) * sizeof(UINT32);
        }
        sptr += sRowBump;
        dptr += dRowBump;
    }
    return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/**
 * Entry point for the SSSE3 YCoCg-R -> RGB conversion.
 *
 * Dispatches on the destination layout: BGR orderings take the
 * channel-swapped ("invert") variant, RGB orderings the straight one, and
 * every other format falls back to the generic C implementation.
 */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
                                           BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                           INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
                                           BOOL withAlpha)
{
    if ((DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32))
    {
        return ssse3_YCoCgRToRGB_8u_AC4R_invert(
            pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat,
            WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha);
    }

    if ((DstFormat == PIXEL_FORMAT_RGBX32) || (DstFormat == PIXEL_FORMAT_RGBA32))
    {
        return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
            pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat,
            WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha);
    }

    /* Unsupported destination layout: use the generic implementation. */
    return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
                                       shift, withAlpha);
}
#endif
/* ------------------------------------------------------------------------- */
/**
 * Install the SSSE3 YCoCg conversion into the primitives table, or leave the
 * table untouched when SIMD intrinsics were compiled out.
 */
void primitives_init_YCoCg_ssse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
    WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
    /* The SIMD path needs the generic implementation as fallback. */
    generic = primitives_get_generic();
    prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
#else
    WINPR_UNUSED(prims);
    WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available");
#endif
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,187 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized add operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_add.h"
#include "prim_internal.h"
#include "prim_templates.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>
static primitives_t* generic = nullptr;
/* ------------------------------------------------------------------------- */
/* sse3_add_16s: saturated add of two INT16 buffers into a destination buffer.
 * Generated by the SSE3_SSD_ROUTINE template: the vector kernel is
 * _mm_adds_epi16, misaligned/short cases go through generic->add_16s, and the
 * trailing expression handles leftover elements one at a time. */
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
                 generic->add_16s(sptr1++, sptr2++, dptr++, 1))
/**
 * In-place saturated add of two INT16 buffers: after the call, BOTH buffers
 * hold the element-wise saturated sum.
 *
 * @param pSrcDst1 first operand, overwritten with the sum
 * @param pSrcDst2 second operand, overwritten with the sum
 * @param ulen     number of INT16 elements in each buffer
 *
 * Falls back to the generic implementation for short buffers and for
 * pointers that are not even INT16-aligned.
 */
static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
                                      INT16* WINPR_RESTRICT pSrcDst2, UINT32 ulen)
{
    const int shifts = 2;
    INT16* dptr1 = pSrcDst1;
    INT16* dptr2 = pSrcDst2;

    if (ulen < 16) /* pointless if too small */
        return generic->add_16s_inplace(pSrcDst1, pSrcDst2, ulen);

    const UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
    if ((ULONG_PTR)pSrcDst1 & offBeatMask)
    {
        /* Incrementing the pointer skips over 16-byte boundary. */
        return generic->add_16s_inplace(pSrcDst1, pSrcDst2, ulen);
    }

    size_t len = ulen;

    /* Get dptr1 to the 16-byte boundary now.
     * A 16-byte vector holds 8 INT16, so "rem" elements into the current
     * vector means (8 - rem) elements remain until the next boundary.
     * BUGFIX: the previous code used (16 - rem) elements here and never
     * subtracted the prologue from "len", so the SIMD loops below read and
     * wrote past the end of both buffers. */
    const size_t rem = ((UINT_PTR)dptr1 & 0x0f) / sizeof(INT16);
    if (rem != 0)
    {
        const UINT32 add = 8 - (UINT32)rem;
        pstatus_t status = generic->add_16s_inplace(dptr1, dptr2, add);
        if (status != PRIMITIVES_SUCCESS)
            return status;
        dptr1 += add;
        dptr2 += add;
        len -= add; /* prologue elements are done; do not process them again */
    }

    /* Use 4 128-bit SSE registers: 32 INT16 per iteration.
     * LOAD_SI128/STORE_SI128 use unaligned-safe instructions, so a single
     * loop serves both the aligned and unaligned dptr2 cases. */
    size_t count = len >> (7 - shifts);
    len -= count << (7 - shifts);
    while (count--)
    {
        const __m128i* vsptr1 = (const __m128i*)dptr1;
        const __m128i* vsptr2 = (const __m128i*)dptr2;
        __m128i* vdptr1 = (__m128i*)dptr1;
        __m128i* vdptr2 = (__m128i*)dptr2;
        __m128i xmm0 = LOAD_SI128(vsptr1++);
        __m128i xmm1 = LOAD_SI128(vsptr1++);
        __m128i xmm2 = LOAD_SI128(vsptr1++);
        __m128i xmm3 = LOAD_SI128(vsptr1++);
        __m128i xmm4 = LOAD_SI128(vsptr2++);
        __m128i xmm5 = LOAD_SI128(vsptr2++);
        __m128i xmm6 = LOAD_SI128(vsptr2++);
        __m128i xmm7 = LOAD_SI128(vsptr2++);
        xmm0 = _mm_adds_epi16(xmm0, xmm4);
        xmm1 = _mm_adds_epi16(xmm1, xmm5);
        xmm2 = _mm_adds_epi16(xmm2, xmm6);
        xmm3 = _mm_adds_epi16(xmm3, xmm7);
        /* The sum goes to both buffers. */
        STORE_SI128(vdptr1++, xmm0);
        STORE_SI128(vdptr1++, xmm1);
        STORE_SI128(vdptr1++, xmm2);
        STORE_SI128(vdptr1++, xmm3);
        STORE_SI128(vdptr2++, xmm0);
        STORE_SI128(vdptr2++, xmm1);
        STORE_SI128(vdptr2++, xmm2);
        STORE_SI128(vdptr2++, xmm3);
        dptr1 = (INT16*)vdptr1;
        dptr2 = (INT16*)vdptr2;
    }

    /* Use a single 128-bit SSE register: 8 INT16 per iteration. */
    count = len >> (5 - shifts);
    len -= count << (5 - shifts);
    while (count--)
    {
        const __m128i* vsptr1 = (const __m128i*)dptr1;
        const __m128i* vsptr2 = (const __m128i*)dptr2;
        __m128i* vdptr1 = (__m128i*)dptr1;
        __m128i* vdptr2 = (__m128i*)dptr2;
        __m128i xmm0 = LOAD_SI128(vsptr1);
        __m128i xmm1 = LOAD_SI128(vsptr2);
        xmm0 = _mm_adds_epi16(xmm0, xmm1);
        STORE_SI128(vdptr1++, xmm0);
        STORE_SI128(vdptr2++, xmm0);
        dptr1 = (INT16*)vdptr1;
        dptr2 = (INT16*)vdptr2;
    }

    /* Finish off the remainder. */
    if (len > 0)
        return generic->add_16s_inplace(dptr1, dptr2, WINPR_ASSERTING_INT_CAST(uint32_t, len));
    return PRIMITIVES_SUCCESS;
}
#endif
/* ------------------------------------------------------------------------- */
/**
 * Install the SSE3 add primitives into the primitives table, or leave the
 * table untouched when SIMD intrinsics were compiled out.
 */
void primitives_init_add_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
    WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
    /* The SIMD paths need the generic implementation as fallback. */
    generic = primitives_get_generic();
    prims->add_16s = sse3_add_16s;
    prims->add_16s_inplace = sse3_add_16s_inplace;
#else
    WINPR_UNUSED(prims);
    WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
#endif
}

View File

@@ -0,0 +1,215 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized alpha blending routines.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
* Note: this code assumes the second operand is fully opaque,
* e.g.
* newval = alpha1*val1 + (1-alpha1)*val2
* rather than
* newval = alpha1*val1 + (1-alpha1)*alpha2*val2
* The IPP gives other options.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_alphaComp.h"
#include "prim_internal.h"
#include "prim_avxsse.h"
/* ------------------------------------------------------------------------- */
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>
static primitives_t* generic = nullptr;
/**
 * Alpha-compose two 32bpp images: newval = alpha1*val1 + (1-alpha1)*val2,
 * assuming the second operand is fully opaque (see file header note).
 * Processes 4 pixels per SSE iteration after aligning the destination to a
 * 16-byte boundary; falls back to the generic implementation for narrow
 * images, lead-in pixels, and per-row remainders.
 *
 * @param pSrc1    foreground image (supplies the alpha)
 * @param src1Step foreground stride in bytes
 * @param pSrc2    background image (treated as opaque)
 * @param src2Step background stride in bytes
 * @param pDst     destination image
 * @param dstStep  destination stride in bytes
 * @param width    pixels per line
 * @param height   number of lines
 */
static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
                                     const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
                                     BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width,
                                     UINT32 height)
{
    const UINT32* sptr1 = (const UINT32*)pSrc1;
    const UINT32* sptr2 = (const UINT32*)pSrc2;
    /* Nothing to do for an empty rectangle. */
    if ((width <= 0) || (height <= 0))
        return PRIMITIVES_SUCCESS;
    if (width < 4) /* pointless if too small */
    {
        return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width,
                                       height);
    }
    UINT32* dptr = (UINT32*)pDst;
    const size_t linebytes = width * sizeof(UINT32);
    /* Per-row pointer advances (in UINT32 units) past the processed pixels. */
    const size_t src1Jump = (src1Step - linebytes) / sizeof(UINT32);
    const size_t src2Jump = (src2Step - linebytes) / sizeof(UINT32);
    const size_t dstJump = (dstStep - linebytes) / sizeof(UINT32);
    /* xmm0 = zero (for byte->word expansion), xmm1 = 1 per word (alpha bias) */
    __m128i xmm0 = mm_set1_epu32(0);
    __m128i xmm1 = _mm_set1_epi16(1);
    for (UINT32 y = 0; y < height; ++y)
    {
        uint32_t pixels = width;
        uint32_t count = 0;
        /* Get to the 16-byte boundary now. */
        uint32_t leadIn = 0;
        switch ((ULONG_PTR)dptr & 0x0f)
        {
            case 0:
                leadIn = 0;
                break;
            case 4:
                leadIn = 3;
                break;
            case 8:
                leadIn = 2;
                break;
            case 12:
                leadIn = 1;
                break;
            default:
                /* We'll never hit a 16-byte boundary, so do the whole
                 * thing the slow way.
                 */
                leadIn = width;
                break;
        }
        if (leadIn)
        {
            pstatus_t status = 0;
            status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
                                             src2Step, (BYTE*)dptr, dstStep, leadIn, 1);
            if (status != PRIMITIVES_SUCCESS)
                return status;
            sptr1 += leadIn;
            sptr2 += leadIn;
            dptr += leadIn;
            pixels -= leadIn;
        }
        /* Use SSE registers to do 4 pixels at a time. */
        count = pixels >> 2;
        pixels -= count << 2;
        while (count--)
        {
            __m128i xmm2;
            __m128i xmm3;
            __m128i xmm4;
            __m128i xmm5;
            __m128i xmm6;
            __m128i xmm7;
            /* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
            xmm2 = LOAD_SI128(sptr1);
            sptr1 += 4;
            /* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
            xmm3 = LOAD_SI128(sptr2);
            sptr2 += 4;
            /* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
            xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
            /* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */
            xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
            /* subtract */
            xmm6 = _mm_subs_epi16(xmm4, xmm5);
            /* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
            xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
            /* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
            xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
            /* Add one to alphas */
            xmm4 = _mm_adds_epi16(xmm4, xmm1);
            /* Multiply and take low word */
            xmm4 = _mm_mullo_epi16(xmm4, xmm6);
            /* Shift 8 right */
            xmm4 = _mm_srai_epi16(xmm4, 8);
            /* Add xmm5 */
            xmm4 = _mm_adds_epi16(xmm4, xmm5);
            /* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
            /* Repeat the same blend for the low two pixels. */
            /* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
            xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
            /* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
            xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
            /* subtract */
            xmm7 = _mm_subs_epi16(xmm5, xmm6);
            /* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
            xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
            /* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
            xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
            /* Add one to alphas */
            xmm5 = _mm_adds_epi16(xmm5, xmm1);
            /* Multiply and take low word */
            xmm5 = _mm_mullo_epi16(xmm5, xmm7);
            /* Shift 8 right */
            xmm5 = _mm_srai_epi16(xmm5, 8);
            /* Add xmm6 */
            xmm5 = _mm_adds_epi16(xmm5, xmm6);
            /* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */
            /* Must mask off remainders or pack gets confused */
            xmm3 = _mm_set1_epi16(0x00ffU);
            xmm4 = _mm_and_si128(xmm4, xmm3);
            xmm5 = _mm_and_si128(xmm5, xmm3);
            /* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
            xmm5 = _mm_packus_epi16(xmm5, xmm4);
            STORE_SI128(dptr, xmm5);
            dptr += 4;
        }
        /* Finish off the remainder. */
        if (pixels)
        {
            pstatus_t status = 0;
            status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
                                             src2Step, (BYTE*)dptr, dstStep, pixels, 1);
            if (status != PRIMITIVES_SUCCESS)
                return status;
            sptr1 += pixels;
            sptr2 += pixels;
            dptr += pixels;
        }
        /* Jump to next row. */
        sptr1 += src1Jump;
        sptr2 += src2Jump;
        dptr += dstJump;
    }
    return PRIMITIVES_SUCCESS;
}
#endif
/* ------------------------------------------------------------------------- */
/**
 * Install the SSE2 alpha composition into the primitives table, or leave the
 * table untouched when SIMD intrinsics were compiled out.
 */
void primitives_init_alphaComp_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
    WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
    /* The SIMD path needs the generic implementation as fallback. */
    generic = primitives_get_generic();
    prims->alphaComp_argb = sse2_alphaComp_argb;
#else
    WINPR_UNUSED(prims);
    WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
#endif
}

View File

@@ -0,0 +1,54 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized Logical operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_andor.h"
#include "prim_internal.h"
#include "prim_templates.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>
static primitives_t* generic = nullptr;
/* ------------------------------------------------------------------------- */
/* sse3_andC_32u / sse3_orC_32u: bitwise AND / OR of a UINT32 buffer with a
 * constant. Generated by the SSE3_SCD_PRE_ROUTINE template: the vector kernel
 * is _mm_and_si128 / _mm_or_si128, misaligned/short cases go through the
 * generic primitive, and the trailing expression handles the scalar tail. */
SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, generic->andC_32u, _mm_and_si128,
                     *dptr++ = *sptr++ & val)
SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, generic->orC_32u, _mm_or_si128, *dptr++ = *sptr++ | val)
#endif
/* ------------------------------------------------------------------------- */
/**
 * Install the SSE3 AND/OR primitives into the primitives table, or leave the
 * table untouched when SIMD intrinsics were compiled out.
 */
void primitives_init_andor_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
    WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
    /* The SIMD paths need the generic implementation as fallback. */
    generic = primitives_get_generic();
    prims->andC_32u = sse3_andC_32u;
    prims->orC_32u = sse3_orC_32u;
#else
    WINPR_UNUSED(prims);
    WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
#endif
}

View File

@@ -0,0 +1,79 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* FreeRDP primitives SSE implementation
*
* Copyright 2025 Armin Novak <armin.novak@thincast.com>
* Copyright 2025 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <winpr/cast.h>
#include "../../core/simd.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>
WINPR_ATTR_NODISCARD
/* Unsigned-argument wrapper for _mm_set_epi32 (avoids implicit-conversion
 * warnings; argument order matches _mm_set_epi32: val1 is the high lane). */
static inline __m128i mm_set_epu32(uint32_t val1, uint32_t val2, uint32_t val3, uint32_t val4)
{
    const int32_t s1 = WINPR_CXX_COMPAT_CAST(int32_t, val1);
    const int32_t s2 = WINPR_CXX_COMPAT_CAST(int32_t, val2);
    const int32_t s3 = WINPR_CXX_COMPAT_CAST(int32_t, val3);
    const int32_t s4 = WINPR_CXX_COMPAT_CAST(int32_t, val4);
    return _mm_set_epi32(s1, s2, s3, s4);
}
WINPR_ATTR_NODISCARD
/* Unsigned-argument wrapper for _mm_set_epi8 (argument order matches
 * _mm_set_epi8: val1 is the highest byte). */
static inline __m128i mm_set_epu8(uint8_t val1, uint8_t val2, uint8_t val3, uint8_t val4,
                                  uint8_t val5, uint8_t val6, uint8_t val7, uint8_t val8,
                                  uint8_t val9, uint8_t val10, uint8_t val11, uint8_t val12,
                                  uint8_t val13, uint8_t val14, uint8_t val15, uint8_t val16)
{
    /* _mm_setr_epi8 takes bytes low-to-high, i.e. the exact reverse of
     * _mm_set_epi8, so the argument list is reversed here. */
    return _mm_setr_epi8(
        WINPR_CXX_COMPAT_CAST(int8_t, val16), WINPR_CXX_COMPAT_CAST(int8_t, val15),
        WINPR_CXX_COMPAT_CAST(int8_t, val14), WINPR_CXX_COMPAT_CAST(int8_t, val13),
        WINPR_CXX_COMPAT_CAST(int8_t, val12), WINPR_CXX_COMPAT_CAST(int8_t, val11),
        WINPR_CXX_COMPAT_CAST(int8_t, val10), WINPR_CXX_COMPAT_CAST(int8_t, val9),
        WINPR_CXX_COMPAT_CAST(int8_t, val8), WINPR_CXX_COMPAT_CAST(int8_t, val7),
        WINPR_CXX_COMPAT_CAST(int8_t, val6), WINPR_CXX_COMPAT_CAST(int8_t, val5),
        WINPR_CXX_COMPAT_CAST(int8_t, val4), WINPR_CXX_COMPAT_CAST(int8_t, val3),
        WINPR_CXX_COMPAT_CAST(int8_t, val2), WINPR_CXX_COMPAT_CAST(int8_t, val1));
}
WINPR_ATTR_NODISCARD
/* Broadcast an unsigned 32-bit value to all four lanes. */
static inline __m128i mm_set1_epu32(uint32_t val)
{
    const int32_t sval = WINPR_CXX_COMPAT_CAST(int32_t, val);
    return _mm_set1_epi32(sval);
}
WINPR_ATTR_NODISCARD
/* Broadcast an unsigned byte to all sixteen lanes. */
static inline __m128i mm_set1_epu8(uint8_t val)
{
    const int8_t sval = WINPR_CXX_COMPAT_CAST(int8_t, val);
    return _mm_set1_epi8(sval);
}
WINPR_ATTR_NODISCARD
/* 128-bit load from an arbitrarily aligned address (lddqu is
 * unaligned-safe). */
static inline __m128i LOAD_SI128(const void* ptr)
{
    return _mm_lddqu_si128(WINPR_CXX_COMPAT_CAST(const __m128i*, ptr));
}
/* 128-bit store to an arbitrarily aligned address (storeu is
 * unaligned-safe). */
static inline void STORE_SI128(void* ptr, __m128i val)
{
    _mm_storeu_si128(WINPR_CXX_COMPAT_CAST(__m128i*, ptr), val);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,278 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Copy operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <winpr/sysinfo.h>
#include <freerdp/config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <freerdp/log.h>
#include "prim_internal.h"
#include "prim_copy.h"
#include "../codec/color.h"
#include <freerdp/codec/color.h>
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <immintrin.h>
/* Unsigned-argument wrapper for _mm256_set_epi32 (argument order matches
 * _mm256_set_epi32: i0 is the highest 32-bit lane). */
static inline __m256i mm256_set_epu32(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3,
                                      uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7)
{
    /* _mm256_setr_epi32 takes lanes low-to-high, the exact reverse of
     * _mm256_set_epi32, so the argument list is reversed here. */
    return _mm256_setr_epi32((int32_t)i7, (int32_t)i6, (int32_t)i5, (int32_t)i4, (int32_t)i3,
                             (int32_t)i2, (int32_t)i1, (int32_t)i0);
}
/**
 * Copy a BGR24 rectangle into a BGRX32/BGRA32 destination, 8 pixels per AVX2
 * iteration, preserving the destination's fourth (X/alpha) byte: the final
 * blend with "mask" takes byte 3 of each 32-bit pixel from the existing
 * destination contents, and the scalar tail writes only 3 bytes per pixel.
 *
 * srcVMultiplier/dstVMultiplier and srcVOffset/dstVOffset encode vertical
 * flipping of the respective image (multiplier presumably +/-1 with a
 * matching byte offset -- confirm against the caller).
 */
static inline pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
                                                     UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
                                                     UINT32 nHeight,
                                                     const BYTE* WINPR_RESTRICT pSrcData,
                                                     UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                     int64_t srcVMultiplier, int64_t srcVOffset,
                                                     int64_t dstVMultiplier, int64_t dstVOffset)
{
    const int64_t srcByte = 3;  /* bytes per source pixel */
    const int64_t dstByte = 4;  /* bytes per destination pixel */
    /* Selects byte 3 (X/alpha) of each destination pixel in the final blend. */
    const __m256i mask = mm256_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
                                         0xFF000000, 0xFF000000, 0xFF000000);
    /* Expands packed 3-byte pixels to 4-byte slots within each 128-bit lane. */
    const __m256i smask = mm256_set_epu32(0xff171615, 0xff141312, 0xff1110ff, 0xffffffff,
                                          0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
    const __m256i shelpmask = mm256_set_epu32(0xffffffff, 0xffffffff, 0xffffff1f, 0xff1e1d1c,
                                              0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
    /* Vector loop handles multiples of 8 pixels; the rest is done scalar. */
    const UINT32 rem = nWidth % 8;
    const int64_t width = nWidth - rem;
    for (int64_t y = 0; y < nHeight; y++)
    {
        const BYTE* WINPR_RESTRICT srcLine =
            &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
        BYTE* WINPR_RESTRICT dstLine =
            &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
        int64_t x = 0;
        /* Ensure alignment requirements can be met */
        for (; x < width; x += 8)
        {
            const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
            __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
            const __m256i s0 = _mm256_loadu_si256(src);
            __m256i s1 = _mm256_shuffle_epi8(s0, smask);
            /* _mm256_shuffle_epi8 can not cross 128bit lanes.
             * manually copy these bytes with extract/insert */
            const __m256i sx = _mm256_broadcastsi128_si256(_mm256_extractf128_si256(s0, 0));
            const __m256i sxx = _mm256_shuffle_epi8(sx, shelpmask);
            /* Selects the bytes that had to come from the other lane. */
            const __m256i bmask = _mm256_set_epi32(0x00000000, 0x00000000, 0x000000FF, 0x00FFFFFF,
                                                   0x00000000, 0x00000000, 0x00000000, 0x00000000);
            const __m256i merged = _mm256_blendv_epi8(s1, sxx, bmask);
            /* Keep the destination's X/alpha byte, take BGR from the source. */
            const __m256i s2 = _mm256_loadu_si256(dst);
            __m256i d0 = _mm256_blendv_epi8(merged, s2, mask);
            _mm256_storeu_si256(dst, d0);
        }
        /* Scalar tail: copy B,G,R and leave the destination's 4th byte as-is. */
        for (; x < nWidth; x++)
        {
            const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
            BYTE* dst = &dstLine[(x + nXDst) * dstByte];
            *dst++ = *src++;
            *dst++ = *src++;
            *dst++ = *src++;
        }
    }
    return PRIMITIVES_SUCCESS;
}
/**
 * Copy a 32bpp rectangle to a same-layout 32bpp destination, 8 pixels per
 * AVX2 iteration, preserving the destination's fourth (X/alpha) byte: the
 * blend mask selects bytes 0-2 from the source and byte 3 from the existing
 * destination, and the scalar tail writes only 3 bytes per pixel.
 * Channel order is irrelevant since bytes are copied positionally (the
 * dispatcher also uses this routine for RGBX/RGBA).
 *
 * srcVMultiplier/dstVMultiplier and srcVOffset/dstVOffset encode vertical
 * flipping of the respective image (multiplier presumably +/-1 with a
 * matching byte offset -- confirm against the caller).
 */
static inline pstatus_t avx2_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData,
                                                      UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
                                                      UINT32 nWidth, UINT32 nHeight,
                                                      const BYTE* WINPR_RESTRICT pSrcData,
                                                      UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                      int64_t srcVMultiplier, int64_t srcVOffset,
                                                      int64_t dstVMultiplier, int64_t dstVOffset)
{
    const int64_t srcByte = 4;  /* bytes per source pixel */
    const int64_t dstByte = 4;  /* bytes per destination pixel */
    /* 0xFF for color bytes (take source), 0x00 for the X/alpha byte (keep
     * destination). */
    const __m256i mask = _mm256_setr_epi8(
        (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
        (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
        (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
        (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00);
    /* Vector loop handles multiples of 8 pixels; the rest is done scalar. */
    const UINT32 rem = nWidth % 8;
    const int64_t width = nWidth - rem;
    for (int64_t y = 0; y < nHeight; y++)
    {
        const BYTE* WINPR_RESTRICT srcLine =
            &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
        BYTE* WINPR_RESTRICT dstLine =
            &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
        int64_t x = 0;
        for (; x < width; x += 8)
        {
            const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
            __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
            const __m256i s0 = _mm256_loadu_si256(src);
            const __m256i s1 = _mm256_loadu_si256(dst);
            __m256i d0 = _mm256_blendv_epi8(s1, s0, mask);
            _mm256_storeu_si256(dst, d0);
        }
        /* Scalar tail: copy 3 color bytes, leave the destination's 4th byte
         * as-is. */
        for (; x < nWidth; x++)
        {
            const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
            BYTE* dst = &dstLine[(x + nXDst) * dstByte];
            *dst++ = *src++;
            *dst++ = *src++;
            *dst++ = *src++;
        }
    }
    return PRIMITIVES_SUCCESS;
}
/* Dispatch an alpha-preserving copy to one of the AVX2 kernels, or fall
 * back to the generic implementation for unsupported format pairs. */
static pstatus_t avx2_image_copy_no_overlap_dst_alpha(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier,
    int64_t dstVOffset)
{
	WINPR_ASSERT(pDstData);
	WINPR_ASSERT(pSrcData);

	const BOOL dstIsBgrx =
	    (DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32);
	const BOOL dstIsRgbx =
	    (DstFormat == PIXEL_FORMAT_RGBX32) || (DstFormat == PIXEL_FORMAT_RGBA32);
	const BOOL srcIsBgrx =
	    (SrcFormat == PIXEL_FORMAT_BGRX32) || (SrcFormat == PIXEL_FORMAT_BGRA32);
	const BOOL srcIsRgbx =
	    (SrcFormat == PIXEL_FORMAT_RGBX32) || (SrcFormat == PIXEL_FORMAT_RGBA32);

	/* BGR24 -> BGRX/BGRA: expand 3 byte pixels, keeping destination alpha. */
	if ((SrcFormat == PIXEL_FORMAT_BGR24) && dstIsBgrx)
		return avx2_image_copy_bgr24_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
		                                    pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
		                                    srcVOffset, dstVMultiplier, dstVOffset);

	/* Same-layout 32bpp copies (channel order untouched by the kernel). */
	if ((srcIsBgrx && dstIsBgrx) || (srcIsRgbx && dstIsRgbx))
		return avx2_image_copy_bgrx32_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
		                                     pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
		                                     srcVOffset, dstVMultiplier, dstVOffset);

	/* Unsupported combination: use the generic implementation. */
	primitives_t* gen = primitives_get_generic();
	return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
	                            pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
}
/* AVX2 entry point for non-overlapping image copies.
 * Validates arguments, derives default pitches, configures the optional
 * vertical flip, then routes to the alpha-preserving path, the straight
 * memcpy path, or the generic fallback. */
static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
                                            UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
                                            UINT32 nWidth, UINT32 nHeight,
                                            const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
                                            UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                            const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
{
	/* Empty area: nothing to do. */
	if ((nWidth == 0) || (nHeight == 0))
		return PRIMITIVES_SUCCESS;

	/* Reject sizes that would overflow the signed coordinate math. */
	if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
		return -1;

	if (!pDstData || !pSrcData)
		return -1;

	/* Derive pitches from the width when the caller passed none. */
	if (nDstStep == 0)
		nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
	if (nSrcStep == 0)
		nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);

	/* A vertical flip walks the source bottom-up. */
	int64_t srcVMultiplier = 1;
	int64_t srcVOffset = 0;
	const int64_t dstVMultiplier = 1;
	const int64_t dstVOffset = 0;
	if ((flags & FREERDP_FLIP_VERTICAL) != 0)
	{
		srcVMultiplier = -1;
		srcVOffset = (nHeight - 1ll) * nSrcStep;
	}

	if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
		return avx2_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
		                                            nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
		                                            nXSrc, nYSrc, palette, flags, srcVMultiplier,
		                                            srcVOffset, dstVMultiplier, dstVOffset);

	if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
		return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
		                                            nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
		                                            nXSrc, nYSrc, palette, srcVMultiplier,
		                                            srcVOffset, dstVMultiplier, dstVOffset, flags);

	primitives_t* gen = primitives_get_generic();
	return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
	                            pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
}
#endif
/* ------------------------------------------------------------------------- */
/* Install the AVX2 copy primitive, or log why it is unavailable. */
void primitives_init_copy_avx2_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "AVX2 optimizations");
	prims->copy_no_overlap = avx2_image_copy_no_overlap;
#else
	WINPR_UNUSED(prims);
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or WITH_AVX2 or AVX2 intrinsics not available");
#endif
}

View File

@@ -0,0 +1,257 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Copy operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <winpr/sysinfo.h>
#include <freerdp/config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <freerdp/log.h>
#include "prim_internal.h"
#include "prim_avxsse.h"
#include "prim_copy.h"
#include "../codec/color.h"
#include <freerdp/codec/color.h>
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <immintrin.h>
/* Expand BGR24 pixels into a 32bpp BGRX destination, four pixels per SSE
 * iteration. The shuffle spreads 12 packed BGR source bytes into 4 dwords
 * (byte 3 of each dword is a placeholder), and the blend then keeps the
 * destination's original byte 3, so an existing X/alpha byte is preserved. */
static inline pstatus_t sse_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
                                                    UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
                                                    UINT32 nHeight,
                                                    const BYTE* WINPR_RESTRICT pSrcData,
                                                    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                    int64_t srcVMultiplier, int64_t srcVOffset,
                                                    int64_t dstVMultiplier, int64_t dstVOffset)
{
	const int64_t srcByte = 3; /* bytes per source pixel (BGR24) */
	const int64_t dstByte = 4; /* bytes per destination pixel (BGRX32) */
	/* 0xFF in byte 3 of each pixel -> keep the destination byte there. */
	const __m128i mask = mm_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000);
	/* Shuffle control: 12 packed BGR bytes -> 4 BGRx dwords (0xff = don't care). */
	const __m128i smask = mm_set_epu32(0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
	const UINT32 rem = nWidth % 4; /* pixels handled by the scalar tail */
	const int64_t width = nWidth - rem;
	for (int64_t y = 0; y < nHeight; y++)
	{
		/* srcVMultiplier/srcVOffset implement an optional vertical flip. */
		const BYTE* WINPR_RESTRICT srcLine =
		    &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstLine =
		    &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
		int64_t x = 0;
		/* Ensure alignment requirements can be met */
		for (; x < width; x += 4)
		{
			const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
			__m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
			const __m128i s0 = LOAD_SI128(src);
			const __m128i s1 = _mm_shuffle_epi8(s0, smask);
			const __m128i s2 = LOAD_SI128(dst);
			/* mask high bit set -> take s2 (dest byte 3), else s1 (source). */
			__m128i d0 = _mm_blendv_epi8(s1, s2, mask);
			STORE_SI128(dst, d0);
		}
		/* Scalar tail: copy B, G, R; leave destination byte 3 untouched. */
		for (; x < nWidth; x++)
		{
			const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
			BYTE* dst = &dstLine[(x + nXDst) * dstByte];
			*dst++ = *src++;
			*dst++ = *src++;
			*dst++ = *src++;
		}
	}
	return PRIMITIVES_SUCCESS;
}
/* Copy 32bpp pixels between same-layout surfaces while preserving the
 * destination's byte 3 (X/alpha): the blend mask takes bytes 0-2 from the
 * source and keeps byte 3 of every destination pixel. Channel order is not
 * touched, so this serves both BGRX<->BGRX and RGBX<->RGBX copies. */
static inline pstatus_t sse_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
                                                     UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
                                                     UINT32 nHeight,
                                                     const BYTE* WINPR_RESTRICT pSrcData,
                                                     UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                     int64_t srcVMultiplier, int64_t srcVOffset,
                                                     int64_t dstVMultiplier, int64_t dstVOffset)
{
	const int64_t srcByte = 4; /* bytes per source pixel */
	const int64_t dstByte = 4; /* bytes per destination pixel */
	/* Per-byte blend selector: 0xFF -> source byte, 0x00 -> destination byte. */
	const __m128i mask = _mm_setr_epi8((char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF,
	                                   (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF,
	                                   (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00);
	const UINT32 rem = nWidth % 4; /* pixels handled by the scalar tail */
	const int64_t width = nWidth - rem;
	for (int64_t y = 0; y < nHeight; y++)
	{
		/* srcVMultiplier/srcVOffset implement an optional vertical flip. */
		const BYTE* WINPR_RESTRICT srcLine =
		    &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstLine =
		    &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
		int64_t x = 0;
		for (; x < width; x += 4)
		{
			const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
			__m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
			const __m128i s0 = LOAD_SI128(src);
			const __m128i s1 = LOAD_SI128(dst);
			/* mask high bit set -> take s0 (source), else keep s1 (dest). */
			__m128i d0 = _mm_blendv_epi8(s1, s0, mask);
			STORE_SI128(dst, d0);
		}
		/* Scalar tail: copy B, G, R; leave destination byte 3 untouched. */
		for (; x < nWidth; x++)
		{
			const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
			BYTE* dst = &dstLine[(x + nXDst) * dstByte];
			*dst++ = *src++;
			*dst++ = *src++;
			*dst++ = *src++;
		}
	}
	return PRIMITIVES_SUCCESS;
}
/* Dispatch an alpha-preserving copy to one of the SSE kernels, or fall
 * back to the generic implementation for unsupported format pairs. */
static pstatus_t sse_image_copy_no_overlap_dst_alpha(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier,
    int64_t dstVOffset)
{
	WINPR_ASSERT(pDstData);
	WINPR_ASSERT(pSrcData);

	const BOOL dstIsBgrx =
	    (DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32);
	const BOOL dstIsRgbx =
	    (DstFormat == PIXEL_FORMAT_RGBX32) || (DstFormat == PIXEL_FORMAT_RGBA32);
	const BOOL srcIsBgrx =
	    (SrcFormat == PIXEL_FORMAT_BGRX32) || (SrcFormat == PIXEL_FORMAT_BGRA32);
	const BOOL srcIsRgbx =
	    (SrcFormat == PIXEL_FORMAT_RGBX32) || (SrcFormat == PIXEL_FORMAT_RGBA32);

	/* BGR24 -> BGRX/BGRA: expand 3 byte pixels, keeping destination alpha. */
	if ((SrcFormat == PIXEL_FORMAT_BGR24) && dstIsBgrx)
		return sse_image_copy_bgr24_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
		                                   pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
		                                   srcVOffset, dstVMultiplier, dstVOffset);

	/* Same-layout 32bpp copies (channel order untouched by the kernel). */
	if ((srcIsBgrx && dstIsBgrx) || (srcIsRgbx && dstIsRgbx))
		return sse_image_copy_bgrx32_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
		                                    pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
		                                    srcVOffset, dstVMultiplier, dstVOffset);

	/* Unsupported combination: use the generic implementation. */
	primitives_t* gen = primitives_get_generic();
	return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
	                            pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
}
/* SSE4.1 entry point for non-overlapping image copies.
 * Validates arguments, derives default pitches, configures the optional
 * vertical flip, then routes to the alpha-preserving path, the straight
 * memcpy path, or the generic fallback. */
static pstatus_t sse_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
                                           UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
                                           UINT32 nWidth, UINT32 nHeight,
                                           const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
                                           UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                           const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
{
	/* Empty area: nothing to do. */
	if ((nWidth == 0) || (nHeight == 0))
		return PRIMITIVES_SUCCESS;

	/* Reject sizes that would overflow the signed coordinate math. */
	if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
		return -1;

	if (!pDstData || !pSrcData)
		return -1;

	/* Derive pitches from the width when the caller passed none. */
	if (nDstStep == 0)
		nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
	if (nSrcStep == 0)
		nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);

	/* A vertical flip walks the source bottom-up. */
	int64_t srcVMultiplier = 1;
	int64_t srcVOffset = 0;
	const int64_t dstVMultiplier = 1;
	const int64_t dstVOffset = 0;
	if ((flags & FREERDP_FLIP_VERTICAL) != 0)
	{
		srcVMultiplier = -1;
		srcVOffset = (nHeight - 1ll) * nSrcStep;
	}

	if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
		return sse_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
		                                           nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
		                                           nXSrc, nYSrc, palette, flags, srcVMultiplier,
		                                           srcVOffset, dstVMultiplier, dstVOffset);

	if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
		return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
		                                            nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
		                                            nXSrc, nYSrc, palette, srcVMultiplier,
		                                            srcVOffset, dstVMultiplier, dstVOffset, flags);

	primitives_t* gen = primitives_get_generic();
	return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
	                            pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
}
#endif
/* ------------------------------------------------------------------------- */
/* Install the SSE4.1 copy primitive, or log why it is unavailable. */
void primitives_init_copy_sse41_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "SSE4.1 optimizations");
	prims->copy_no_overlap = sse_image_copy_no_overlap;
#else
	WINPR_UNUSED(prims);
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE4.1 intrinsics not available");
#endif
}

View File

@@ -0,0 +1,235 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized routines to set a chunk of memory to a constant.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/
#include <freerdp/config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_internal.h"
#include "prim_avxsse.h"
#include "prim_set.h"
/* ========================================================================= */
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
static primitives_t* generic = nullptr;
/* Fill a byte buffer with a constant value using 16-byte SSE stores.
 * Small buffers are delegated to the generic implementation; otherwise the
 * pointer is aligned byte-by-byte, the bulk is written in 256-byte and
 * 16-byte chunks, and the tail is finished with scalar stores. */
static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 ulen)
{
	size_t len = ulen;

	/* Too short to amortize the alignment work. */
	if (len < 16)
		return generic->set_8u(val, pDst, ulen);

	BYTE* dptr = pDst;

	/* Advance to a 16-byte boundary one byte at a time. */
	while ((ULONG_PTR)dptr & 0x0f)
	{
		*dptr++ = val;
		if (--len == 0)
			return PRIMITIVES_SUCCESS;
	}

	const __m128i xmm0 = mm_set1_epu8(val);

	/* 256-byte chunks: sixteen 16-byte stores per round. */
	size_t count = len >> 8;
	len -= count << 8;
	while (count--)
	{
		for (int i = 0; i < 16; i++)
		{
			STORE_SI128(dptr, xmm0);
			dptr += 16;
		}
	}

	/* Remaining 16-byte chunks. */
	count = len >> 4;
	len -= count << 4;
	while (count--)
	{
		STORE_SI128(dptr, xmm0);
		dptr += 16;
	}

	/* Trailing bytes. */
	while (len--)
		*dptr++ = val;

	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Fill a 32-bit buffer with a constant value using 16-byte SSE stores.
 * Very short runs are done scalar; pointers that cannot reach 16-byte
 * alignment by whole-element steps go to the generic routine; the rest is
 * written in 256-byte and 16-byte chunks with a scalar tail. */
static pstatus_t sse2_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 ulen)
{
	size_t len = ulen;
	UINT32* dptr = pDst;

	/* Short runs: plain scalar stores. */
	if (len < 32)
	{
		while (len--)
			*dptr++ = val;
		return PRIMITIVES_SUCCESS;
	}

	/* A pointer that is not 4-byte aligned can never reach 16-byte
	 * alignment by whole-element steps; punt to the generic version. */
	if (((ULONG_PTR)dptr & 0x03) != 0)
	{
		const primitives_t* prim = primitives_get_generic();
		return prim->set_32u(val, pDst, ulen);
	}

	/* Store elements until the pointer is 16-byte aligned. */
	while ((ULONG_PTR)dptr & 0x0f)
	{
		*dptr++ = val;
		if (--len == 0)
			return PRIMITIVES_SUCCESS;
	}

	const __m128i xmm0 = mm_set1_epu32(val);

	/* 256-byte chunks: sixteen 16-byte stores (64 elements) per round. */
	size_t count = len >> 6;
	len -= count << 6;
	while (count--)
	{
		for (int i = 0; i < 16; i++)
		{
			STORE_SI128(dptr, xmm0);
			dptr += 4;
		}
	}

	/* Remaining 4-element (16-byte) chunks. */
	count = len >> 2;
	len -= count << 2;
	while (count--)
	{
		STORE_SI128(dptr, xmm0);
		dptr += 4;
	}

	/* Trailing elements. */
	while (len--)
		*dptr++ = val;

	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Signed variant: forward to the unsigned routine with the same bit
 * pattern (the fill value's representation is all that matters). */
static pstatus_t sse2_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len)
{
	UINT32 uval = 0;
	memcpy(&uval, &val, sizeof(uval));
	return sse2_set_32u(uval, (UINT32*)pDst, len);
}
#endif
/* ------------------------------------------------------------------------- */
/* Install the SSE2 memory-set primitives, or log why they are unavailable. */
void primitives_init_set_sse2_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
	/* Cache the generic implementations used as fallbacks above. */
	generic = primitives_get_generic();
	WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
	prims->set_32u = sse2_set_32u;
	prims->set_32s = sse2_set_32s;
	prims->set_8u = sse2_set_8u;
#else
	WINPR_UNUSED(prims);
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available");
#endif
}

View File

@@ -0,0 +1,160 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Shift operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_shift.h"
#include "prim_internal.h"
#include "prim_templates.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>
static primitives_t* generic = nullptr;
/* ------------------------------------------------------------------------- */
/* Each invocation below expands the SSE3_SCD_ROUTINE template from
 * prim_templates.h into a complete shift-by-constant primitive:
 *   pstatus_t name(const type* pSrc, UINT32 val, type* pDst, UINT32 ulen)
 * The final argument is the scalar expression used for leftover elements
 * that do not fill a full SSE register. */
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16, int16_t,
                 *dptr++ = (INT16)(((UINT16)*sptr++ << val) & 0xFFFF))
/* ------------------------------------------------------------------------- */
/* Signed right shift is arithmetic: _mm_srai_epi16 / C ">>" on INT16. */
SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s, _mm_srai_epi16, int16_t,
                 *dptr++ = *sptr++ >> val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16, int16_t,
                 *dptr++ = (((UINT16)*sptr++ << val) & 0xFFFF))
/* ------------------------------------------------------------------------- */
/* Unsigned right shift is logical: _mm_srli_epi16 / C ">>" on UINT16. */
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16, int16_t,
                 *dptr++ = *sptr++ >> val)
/* In-place left shift of signed 16-bit elements by a constant.
 * Small or hopelessly misaligned buffers are delegated to the generic
 * routine; otherwise a scalar prefix brings the pointer to 16-byte
 * alignment, the bulk is shifted in 64-element (8 register) and 8-element
 * (1 register) chunks with _mm_slli_epi16, and the remainder is finished
 * generically. */
static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 ulen)
{
	size_t len = ulen;
	const INT32 shifts = 2; /* element-size scale used in the chunk math below */
	if (val == 0)
		return PRIMITIVES_SUCCESS;
	if (val >= 16) /* shifting a 16-bit lane by >= 16 is invalid */
		return -1;
	if (len < 16) /* pointless if too small */
		return generic->lShiftC_16s_inplace(pSrcDst, val, ulen);
	UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
	if ((ULONG_PTR)pSrcDst & offBeatMask)
	{
		/* An odd address can never reach 16-byte alignment by whole
		 * elements; incrementing the pointer skips over the boundary. */
		return generic->lShiftC_16s_inplace(pSrcDst, val, ulen);
	}
	/* Get to the 16-byte boundary now. */
	const UINT32 rem = ((UINT_PTR)pSrcDst & 0x0f) / sizeof(INT16);
	if (rem > 0)
	{
		/* 16 - rem elements is 32 - 2*rem bytes; added to an address whose
		 * low nibble is 2*rem this lands on a 16-byte boundary (handling up
		 * to 8 more elements scalar than strictly necessary). len >= 16 was
		 * checked above, so add never exceeds len. */
		const UINT32 add = 16 - rem;
		pstatus_t status = generic->lShiftC_16s_inplace(pSrcDst, val, add);
		if (status != PRIMITIVES_SUCCESS)
			return status;
		pSrcDst += add;
		len -= add;
	}
	/* Use 8 128-bit SSE registers. */
	size_t count = len >> (8 - shifts);
	len -= count << (8 - shifts);
	while (count--)
	{
		const __m128i* src = (const __m128i*)pSrcDst;
		__m128i xmm0 = LOAD_SI128(src++);
		__m128i xmm1 = LOAD_SI128(src++);
		__m128i xmm2 = LOAD_SI128(src++);
		__m128i xmm3 = LOAD_SI128(src++);
		__m128i xmm4 = LOAD_SI128(src++);
		__m128i xmm5 = LOAD_SI128(src++);
		__m128i xmm6 = LOAD_SI128(src++);
		__m128i xmm7 = LOAD_SI128(src);
		xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);
		xmm1 = _mm_slli_epi16(xmm1, (int16_t)val);
		xmm2 = _mm_slli_epi16(xmm2, (int16_t)val);
		xmm3 = _mm_slli_epi16(xmm3, (int16_t)val);
		xmm4 = _mm_slli_epi16(xmm4, (int16_t)val);
		xmm5 = _mm_slli_epi16(xmm5, (int16_t)val);
		xmm6 = _mm_slli_epi16(xmm6, (int16_t)val);
		xmm7 = _mm_slli_epi16(xmm7, (int16_t)val);
		__m128i* dst = (__m128i*)pSrcDst;
		STORE_SI128(dst++, xmm0);
		STORE_SI128(dst++, xmm1);
		STORE_SI128(dst++, xmm2);
		STORE_SI128(dst++, xmm3);
		STORE_SI128(dst++, xmm4);
		STORE_SI128(dst++, xmm5);
		STORE_SI128(dst++, xmm6);
		STORE_SI128(dst++, xmm7);
		pSrcDst = (INT16*)dst;
	}
	/* Use a single 128-bit SSE register. */
	count = len >> (5 - shifts);
	len -= count << (5 - shifts);
	while (count--)
	{
		const __m128i* src = (const __m128i*)pSrcDst;
		__m128i xmm0 = LOAD_SI128(src);
		xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);
		__m128i* dst = (__m128i*)pSrcDst;
		STORE_SI128(dst++, xmm0);
		pSrcDst = (INT16*)dst;
	}
	/* Finish off the remainder. */
	if (len > 0)
		return generic->lShiftC_16s_inplace(pSrcDst, val, WINPR_ASSERTING_INT_CAST(uint32_t, len));
	return PRIMITIVES_SUCCESS;
}
#endif
/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
* depending on the sign of val. To avoid using the deprecated inplace
* routines, a wrapper can use the src for the dest.
*/
/* ------------------------------------------------------------------------- */
/* Install the SSE shift-by-constant primitives, or log why they are
 * unavailable. */
void primitives_init_shift_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
	/* Cache the generic implementations used as fallbacks above. */
	generic = primitives_get_generic();
	WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
	prims->lShiftC_16s = sse2_lShiftC_16s;
	prims->lShiftC_16u = sse2_lShiftC_16u;
	prims->rShiftC_16s = sse2_rShiftC_16s;
	prims->rShiftC_16u = sse2_rShiftC_16u;
	prims->lShiftC_16s_inplace = sse2_lShiftC_16s_inplace;
#else
	WINPR_UNUSED(prims);
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
#endif
}

View File

@@ -0,0 +1,188 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized sign operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_sign.h"
#include "prim_internal.h"
#include "prim_avxsse.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <tmmintrin.h>
static primitives_t* generic = nullptr;
/* ------------------------------------------------------------------------- */
/* Compute the sign (-1, 0, +1) of every 16-bit element of pSrc into pDst.
 * Uses _mm_sign_epi16(1, x), which yields -1/0/+1 per lane. Short or
 * unalignable buffers go to the generic routine; otherwise a scalar prefix
 * aligns the destination, then 32-element and 8-element SIMD chunks run,
 * and a scalar tail finishes.
 *
 * NOTE(review): the previous version had separate "aligned" and
 * "unaligned" source branches whose bodies were byte-identical (both used
 * LOAD_SI128), so the alignment test selected between duplicate code; the
 * dead branch has been collapsed. Behavior is unchanged. */
static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
                                UINT32 ulen)
{
	size_t len = ulen;
	const INT16* sptr = pSrc;
	INT16* dptr = pDst;

	/* Too short to amortize setup cost. */
	if (len < 16)
	{
		return generic->sign_16s(pSrc, pDst, ulen);
	}

	/* An odd destination address can never become 16-byte aligned. */
	if ((ULONG_PTR)pDst & 0x01)
	{
		return generic->sign_16s(pSrc, pDst, ulen);
	}

	/* Scalar until the destination is 16-byte aligned. */
	while ((ULONG_PTR)dptr & 0x0f)
	{
		INT16 src = *sptr++;
		*dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? (-1) : ((src > 0) ? 1 : 0));

		if (--len == 0)
			return PRIMITIVES_SUCCESS;
	}

	/* Do 32-short chunks using 8 XMM registers. */
	size_t count = len >> 5; /* / 32 */
	len -= count << 5;       /* * 32 */

	while (count--)
	{
		__m128i xmm0 = _mm_set1_epi16(0x0001U);
		__m128i xmm1 = _mm_set1_epi16(0x0001U);
		__m128i xmm2 = _mm_set1_epi16(0x0001U);
		__m128i xmm3 = _mm_set1_epi16(0x0001U);
		const __m128i xmm4 = LOAD_SI128(sptr);
		sptr += 8;
		const __m128i xmm5 = LOAD_SI128(sptr);
		sptr += 8;
		const __m128i xmm6 = LOAD_SI128(sptr);
		sptr += 8;
		const __m128i xmm7 = LOAD_SI128(sptr);
		sptr += 8;
		xmm0 = _mm_sign_epi16(xmm0, xmm4);
		xmm1 = _mm_sign_epi16(xmm1, xmm5);
		xmm2 = _mm_sign_epi16(xmm2, xmm6);
		xmm3 = _mm_sign_epi16(xmm3, xmm7);
		STORE_SI128(dptr, xmm0);
		dptr += 8;
		STORE_SI128(dptr, xmm1);
		dptr += 8;
		STORE_SI128(dptr, xmm2);
		dptr += 8;
		STORE_SI128(dptr, xmm3);
		dptr += 8;
	}

	/* Do 8-short chunks using two XMM registers. */
	count = len >> 3;
	len -= count << 3;

	while (count--)
	{
		__m128i xmm0 = _mm_set1_epi16(0x0001U);
		__m128i xmm1 = LOAD_SI128(sptr);
		sptr += 8;
		xmm0 = _mm_sign_epi16(xmm0, xmm1);
		STORE_SI128(dptr, xmm0);
		dptr += 8;
	}

	/* Do leftovers. */
	while (len--)
	{
		INT16 src = *sptr++;
		*dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? -1 : ((src > 0) ? 1 : 0));
	}

	return PRIMITIVES_SUCCESS;
}
#endif /* SSE_AVX_INTRINSICS_ENABLED */
/* ------------------------------------------------------------------------- */
/* Install the SSSE3 sign primitive, or log why it is unavailable. */
void primitives_init_sign_ssse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
	/* Remember the generic fallbacks used for small/unaligned inputs.
	 * No IPP equivalent was identified for this primitive. */
	generic = primitives_get_generic();
	WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
	prims->sign_16s = ssse3_sign_16s;
#else
	WINPR_UNUSED(prims);
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSSE3/SSE3 intrinsics not available");
#endif
}

View File

@@ -0,0 +1,278 @@
/* prim_templates.h
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*/
#pragma once
#include "prim_avxsse.h"
/* These are prototypes for SSE (potentially NEON) routines that do a
* simple SSE operation over an array of data. Since so much of this
* code is shared except for the operation itself, these prototypes are
* used rather than duplicating code. The naming convention depends on
* the parameters: S=Source param; C=Constant; D=Destination.
* All the macros have parameters for a fallback procedure if the data
* is too small and an operation "the slow way" for use at 16-byte edges.
*/
/* SSE3 note: If someone needs to support an SSE2 version of these without
* SSE3 support, an alternative version could be added that merely checks
* that 16-byte alignment on both destination and source(s) can be
* achieved, rather than use LDDQU for unaligned reads.
*/
/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
* It easily can't do that if the value is stored in a variable.
* So don't save it as an intermediate value.
*/
/* ----------------------------------------------------------------------------
* SCD = Source, Constant, Destination
*/
/* Expands to a routine applying SSE op `_op_(xmm, (_op_type_)val)` to every
 * element of pSrc, writing results to pDst. Bulk work runs in 8-register
 * (128 bytes) then 1-register (16 bytes) chunks; `_slowWay_` is the scalar
 * statement used for the remainder. val == 0 is a no-op; val >= 16 is
 * rejected (invalid shift count for 16-bit lanes).
 * NOTE(review): the `_fallback_` parameter is accepted for signature
 * compatibility with the other templates but is not referenced by this
 * expansion. */
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _op_type_, _slowWay_)          \
	WINPR_ATTR_NODISCARD                                                                  \
	static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, UINT32 val,                \
	                        _type_* WINPR_RESTRICT pDst, UINT32 ulen)                     \
	{                                                                                     \
		size_t len = ulen;                                                                \
		INT32 shifts = 0;                                                                 \
		const _type_* sptr = pSrc;                                                        \
		_type_* dptr = pDst;                                                              \
		if (val == 0)                                                                     \
			return PRIMITIVES_SUCCESS;                                                    \
		if (val >= 16)                                                                    \
			return -1;                                                                    \
		if (sizeof(_type_) == 1)                                                          \
			shifts = 1;                                                                   \
		else if (sizeof(_type_) == 2)                                                     \
			shifts = 2;                                                                   \
		else if (sizeof(_type_) == 4)                                                     \
			shifts = 3;                                                                   \
		else if (sizeof(_type_) == 8)                                                     \
			shifts = 4;                                                                   \
		/* Use 8 128-bit SSE registers. */                                                \
		size_t count = len >> (8 - shifts);                                               \
		len -= count << (8 - shifts);                                                     \
		                                                                                  \
		while (count--)                                                                   \
		{                                                                                 \
			__m128i xmm0 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm1 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm2 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm3 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm4 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm5 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm6 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm7 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			xmm0 = _op_(xmm0, (_op_type_)val);                                            \
			xmm1 = _op_(xmm1, (_op_type_)val);                                            \
			xmm2 = _op_(xmm2, (_op_type_)val);                                            \
			xmm3 = _op_(xmm3, (_op_type_)val);                                            \
			xmm4 = _op_(xmm4, (_op_type_)val);                                            \
			xmm5 = _op_(xmm5, (_op_type_)val);                                            \
			xmm6 = _op_(xmm6, (_op_type_)val);                                            \
			xmm7 = _op_(xmm7, (_op_type_)val);                                            \
			STORE_SI128(dptr, xmm0);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm1);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm2);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm3);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm4);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm5);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm6);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm7);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
		}                                                                                 \
		                                                                                  \
		/* Use a single 128-bit SSE register. */                                          \
		count = len >> (5 - shifts);                                                      \
		len -= count << (5 - shifts);                                                     \
		while (count--)                                                                   \
		{                                                                                 \
			__m128i xmm0 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			xmm0 = _op_(xmm0, (_op_type_)val);                                            \
			STORE_SI128(dptr, xmm0);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
		}                                                                                 \
		/* Finish off the remainder. */                                                   \
		while (len--)                                                                     \
		{                                                                                 \
			_slowWay_;                                                                    \
		}                                                                                 \
		return PRIMITIVES_SUCCESS;                                                        \
	}
/* ----------------------------------------------------------------------------
* SCD = Source, Constant, Destination
* PRE = preload xmm0 with the constant.
*/
/* Expands to a routine applying the two-operand SSE op `_op_(xmm, konst)`
 * to every element of pSrc, with the constant preloaded into xmm0.
 * Bulk work runs in 4-register (64 bytes) then 1-register (16 bytes)
 * chunks; `_slowWay_` is the scalar statement used for the remainder.
 * NOTE(review): the constant is broadcast with mm_set1_epu32 regardless of
 * `_type_`; for element sizes below 32 bits this only replicates `val` per
 * 32-bit lane, so callers presumably pass a value already duplicated into
 * the sub-lanes - confirm against the invocation sites.
 * `_fallback_` is accepted for signature compatibility but unused here. */
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_)                 \
	WINPR_ATTR_NODISCARD                                                                  \
	static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, _type_ val,                \
	                        _type_* WINPR_RESTRICT pDst, INT32 ilen)                      \
	{                                                                                     \
		size_t len = WINPR_ASSERTING_INT_CAST(size_t, ilen);                              \
		int shifts = 0;                                                                   \
		const _type_* sptr = pSrc;                                                        \
		_type_* dptr = pDst;                                                              \
		__m128i xmm0;                                                                     \
		if (sizeof(_type_) == 1)                                                          \
			shifts = 1;                                                                   \
		else if (sizeof(_type_) == 2)                                                     \
			shifts = 2;                                                                   \
		else if (sizeof(_type_) == 4)                                                     \
			shifts = 3;                                                                   \
		else if (sizeof(_type_) == 8)                                                     \
			shifts = 4;                                                                   \
		/* Use 4 128-bit SSE registers. */                                                \
		size_t count = len >> (7 - shifts);                                               \
		len -= count << (7 - shifts);                                                     \
		xmm0 = mm_set1_epu32(val);                                                        \
		for (size_t x = 0; x < count; x++)                                                \
		{                                                                                 \
			__m128i xmm1 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm2 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm3 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			__m128i xmm4 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			xmm1 = _op_(xmm1, xmm0);                                                      \
			xmm2 = _op_(xmm2, xmm0);                                                      \
			xmm3 = _op_(xmm3, xmm0);                                                      \
			xmm4 = _op_(xmm4, xmm0);                                                      \
			STORE_SI128(dptr, xmm1);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm2);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm3);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
			STORE_SI128(dptr, xmm4);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
		}                                                                                 \
		/* Use a single 128-bit SSE register. */                                          \
		count = len >> (5 - shifts);                                                      \
		len -= count << (5 - shifts);                                                     \
		for (size_t x = 0; x < count; x++)                                                \
		{                                                                                 \
			__m128i xmm1 = LOAD_SI128(sptr);                                              \
			sptr += (16 / sizeof(_type_));                                                \
			xmm1 = _op_(xmm1, xmm0);                                                      \
			STORE_SI128(dptr, xmm1);                                                      \
			dptr += (16 / sizeof(_type_));                                                \
		}                                                                                 \
		/* Finish off the remainder. */                                                   \
		for (size_t x = 0; x < len; x++)                                                  \
		{                                                                                 \
			_slowWay_;                                                                    \
		}                                                                                 \
		return PRIMITIVES_SUCCESS;                                                        \
	}
/* ----------------------------------------------------------------------------
* SSD = Source1, Source2, Destination
*/
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
WINPR_ATTR_NODISCARD \
static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc1, \
const _type_* WINPR_RESTRICT pSrc2, _type_* WINPR_RESTRICT pDst, \
UINT32 ulen) \
{ \
size_t len = ulen; \
int shifts = 0; \
const _type_* sptr1 = pSrc1; \
const _type_* sptr2 = pSrc2; \
_type_* dptr = pDst; \
size_t count; \
if (sizeof(_type_) == 1) \
shifts = 1; \
else if (sizeof(_type_) == 2) \
shifts = 2; \
else if (sizeof(_type_) == 4) \
shifts = 3; \
else if (sizeof(_type_) == 8) \
shifts = 4; \
/* Use 4 128-bit SSE registers. */ \
count = len >> (7 - shifts); \
len -= count << (7 - shifts); \
/* Aligned loads */ \
while (count--) \
{ \
__m128i xmm0 = LOAD_SI128(sptr1); \
sptr1 += (16 / sizeof(_type_)); \
__m128i xmm1 = LOAD_SI128(sptr1); \
sptr1 += (16 / sizeof(_type_)); \
__m128i xmm2 = LOAD_SI128(sptr1); \
sptr1 += (16 / sizeof(_type_)); \
__m128i xmm3 = LOAD_SI128(sptr1); \
sptr1 += (16 / sizeof(_type_)); \
__m128i xmm4 = LOAD_SI128(sptr2); \
sptr2 += (16 / sizeof(_type_)); \
__m128i xmm5 = LOAD_SI128(sptr2); \
sptr2 += (16 / sizeof(_type_)); \
__m128i xmm6 = LOAD_SI128(sptr2); \
sptr2 += (16 / sizeof(_type_)); \
__m128i xmm7 = LOAD_SI128(sptr2); \
sptr2 += (16 / sizeof(_type_)); \
xmm0 = _op_(xmm0, xmm4); \
xmm1 = _op_(xmm1, xmm5); \
xmm2 = _op_(xmm2, xmm6); \
xmm3 = _op_(xmm3, xmm7); \
STORE_SI128(dptr, xmm0); \
dptr += (16 / sizeof(_type_)); \
STORE_SI128(dptr, xmm1); \
dptr += (16 / sizeof(_type_)); \
STORE_SI128(dptr, xmm2); \
dptr += (16 / sizeof(_type_)); \
STORE_SI128(dptr, xmm3); \
dptr += (16 / sizeof(_type_)); \
} \
/* Use a single 128-bit SSE register. */ \
count = len >> (5 - shifts); \
len -= count << (5 - shifts); \
while (count--) \
{ \
__m128i xmm0 = LOAD_SI128(sptr1); \
sptr1 += (16 / sizeof(_type_)); \
__m128i xmm1 = LOAD_SI128(sptr2); \
sptr2 += (16 / sizeof(_type_)); \
xmm0 = _op_(xmm0, xmm1); \
STORE_SI128(dptr, xmm0); \
dptr += (16 / sizeof(_type_)); \
} \
/* Finish off the remainder. */ \
while (len--) \
{ \
const pstatus_t rc = _slowWay_; \
if (rc != PRIMITIVES_SUCCESS) \
return rc; \
} \
return PRIMITIVES_SUCCESS; \
}