Milestone 5: deliver embedded RDP sessions and lifecycle hardening
This commit is contained in:
383
third_party/FreeRDP/libfreerdp/primitives/sse/prim_YCoCg_ssse3.c
vendored
Normal file
383
third_party/FreeRDP/libfreerdp/primitives/sse/prim_YCoCg_ssse3.c
vendored
Normal file
@@ -0,0 +1,383 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Optimized YCoCg<->RGB conversion operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_YCoCg.h"
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
#include <tmmintrin.h>
|
||||
|
||||
static primitives_t* generic = nullptr;
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/**
 * SSSE3 YCoCg -> RGB conversion, "inverted" channel order (selected by the
 * dispatcher for PIXEL_FORMAT_BGRX32/BGRA32 destinations).
 * Processes 8 pixels per SIMD iteration; remainders and unsuitable layouts
 * are delegated to the generic scalar implementation.
 *
 * @param pSrc      source YCoCg pixels, 4 bytes per pixel (a,y,o,g lanes)
 * @param srcStep   source row stride in bytes (must cover width pixels)
 * @param pDst      destination pixel buffer
 * @param DstFormat destination pixel format (passed through to the fallback)
 * @param dstStep   destination row stride in bytes
 * @param width     frame width in pixels
 * @param height    frame height in rows
 * @param shift     chroma scale: Co/Cg are shifted left by (shift-1)
 * @param withAlpha TRUE to keep source alpha, FALSE to force opaque 0xFF
 * @return PRIMITIVES_SUCCESS, or the status of the generic fallback
 */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
                                                  BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                                  UINT32 dstStep, UINT32 width, UINT32 height,
                                                  UINT8 shift, BOOL withAlpha)
{
	const BYTE* sptr = pSrc;
	BYTE* dptr = pDst;

	WINPR_ASSERT(srcStep / sizeof(UINT32) >= width);
	WINPR_ASSERT(dstStep / sizeof(UINT32) >= width);
	/* Per-row padding (stride minus payload), added after each row below. */
	const size_t sRowBump = srcStep - width * sizeof(UINT32);
	const size_t dRowBump = dstStep - width * sizeof(UINT32);
	/* Shift left by "shift" and divide by two is the same as shift
	 * left by "shift-1".
	 */
	int dataShift = shift - 1;
	BYTE mask = (BYTE)(0xFFU << dataShift);

	/* Let's say the data is of the form:
	 * y0y0o0g0 a1y1o1g1 a2y2o2g2...
	 * Apply:
	 * |R|   | 1   1/2  -1/2 |   |y|
	 * |G| = | 1   0     1/2 | * |o|
	 * |B|   | 1  -1/2  -1/2 |   |g|
	 * where Y is 8-bit unsigned and o & g are 8-bit signed.
	 */

	if ((width < 8) || (ULONG_PTR)dptr & 0x03)
	{
		/* Too small, or we'll never hit a 16-byte boundary. Punt. */
		return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
		                                   DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
		                                   width, height, shift, withAlpha);
	}

	for (UINT32 h = 0; h < height; h++)
	{
		UINT32 w = width;

		while (w >= 8)
		{
			__m128i R0;
			__m128i R1;
			__m128i R2;
			__m128i R3;
			__m128i R4;
			__m128i R5;
			__m128i R6;
			__m128i R7;

			/* Load 2x16 bytes = 8 pixels. */
			R0 = LOAD_SI128(sptr);
			sptr += (128 / 8);
			R1 = LOAD_SI128(sptr);
			sptr += (128 / 8);

			/* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
			/* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
			/* Shuffle to pack all the like types together. */
			R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
			R3 = _mm_shuffle_epi8(R0, R2);
			R4 = _mm_shuffle_epi8(R1, R2);
			/* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
			/* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
			R5 = _mm_unpackhi_epi32(R3, R4);
			R6 = _mm_unpacklo_epi32(R3, R4);

			/* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
			/* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
			/* Save alphas aside */
			if (withAlpha)
				R7 = _mm_unpackhi_epi64(R5, R5);
			else
				R7 = mm_set1_epu32(0xFFFFFFFFU);

			/* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
			/* Expand Y's from 8-bit unsigned to 16-bit signed. */
			R1 = mm_set1_epu32(0);
			R0 = _mm_unpacklo_epi8(R5, R1);
			/* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
			/* Shift Co's and Cg's by (shift-1).  -1 covers division by two.
			 * Note: this must be done before sign-conversion.
			 * Note also there is no slli_epi8, so we have to use a 16-bit
			 * version and then mask.
			 */
			R6 = _mm_slli_epi16(R6, dataShift);
			R1 = mm_set1_epu8(mask);
			R6 = _mm_and_si128(R6, R1);
			/* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
			/* Expand Co's from 8-bit signed to 16-bit signed */
			R1 = _mm_unpackhi_epi8(R6, R6);
			R1 = _mm_srai_epi16(R1, 8);
			/* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
			/* Expand Cg's form 8-bit signed to 16-bit signed */
			R2 = _mm_unpacklo_epi8(R6, R6);
			R2 = _mm_srai_epi16(R2, 8);
			/* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
			/* Get Y - halfCg and save */
			R6 = _mm_subs_epi16(R0, R2);
			/* R = (Y-halfCg) + halfCo */
			R3 = _mm_adds_epi16(R6, R1);
			/* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
			/* G = Y + Cg(/2) */
			R4 = _mm_adds_epi16(R0, R2);
			/* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
			/* B = (Y-halfCg) - Co(/2) */
			R5 = _mm_subs_epi16(R6, R1);
			/* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
			/* Repack R's & B's.  packus saturates 16-bit to unsigned 8-bit. */
			R0 = _mm_packus_epi16(R3, R5);
			/* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */
			/* Repack G's. */
			R1 = _mm_packus_epi16(R4, R4);
			/* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */
			/* And add the A's. */
			R1 = _mm_unpackhi_epi64(R1, R7);
			/* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */
			/* Now do interleaving again. */
			R2 = _mm_unpacklo_epi8(R0, R1);
			/* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
			R3 = _mm_unpackhi_epi8(R0, R1);
			/* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
			R4 = _mm_unpacklo_epi16(R2, R3);
			/* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
			R5 = _mm_unpackhi_epi16(R2, R3);
			/* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */
			STORE_SI128(dptr, R4);
			dptr += (128 / 8);
			STORE_SI128(dptr, R5);
			dptr += (128 / 8);
			w -= 8;
		}

		/* Handle any remainder pixels of the row with the scalar fallback. */
		if (w > 0)
		{
			pstatus_t status = 0;
			status = generic->YCoCgToRGB_8u_AC4R(
			    sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
			    WINPR_ASSERTING_INT_CAST(INT32, dstStep), w, 1, shift, withAlpha);

			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr += w * sizeof(UINT32);
			dptr += w * sizeof(UINT32);
		}

		sptr += sRowBump;
		dptr += dRowBump;
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/**
 * SSSE3 YCoCg -> RGB conversion, non-inverted channel order (selected by the
 * dispatcher for PIXEL_FORMAT_RGBX32/RGBA32 destinations).
 *
 * Mirrors ssse3_YCoCgRToRGB_8u_AC4R_invert almost line for line; the single
 * intentional difference is the operand order of the R/B repack
 * (_mm_packus_epi16(R5, R3) instead of (R3, R5)), which swaps the R and B
 * output positions.  Kept as a separate function so the hot loop does not
 * test an "inverted" flag on every iteration (see comment inside).
 *
 * Parameters and return are identical to the _invert variant.
 */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc,
                                                     UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                     UINT32 DstFormat, UINT32 dstStep, UINT32 width,
                                                     UINT32 height, UINT8 shift, BOOL withAlpha)
{
	const BYTE* sptr = pSrc;
	BYTE* dptr = pDst;
	/* Per-row padding (stride minus payload), added after each row below. */
	size_t sRowBump = srcStep - width * sizeof(UINT32);
	size_t dRowBump = dstStep - width * sizeof(UINT32);
	/* Shift left by "shift" and divide by two is the same as shift
	 * left by "shift-1".
	 */
	int dataShift = shift - 1;
	BYTE mask = (BYTE)(0xFFU << dataShift);

	/* Let's say the data is of the form:
	 * y0y0o0g0 a1y1o1g1 a2y2o2g2...
	 * Apply:
	 * |R|   | 1   1/2  -1/2 |   |y|
	 * |G| = | 1   0     1/2 | * |o|
	 * |B|   | 1  -1/2  -1/2 |   |g|
	 * where Y is 8-bit unsigned and o & g are 8-bit signed.
	 */

	if ((width < 8) || (ULONG_PTR)dptr & 0x03)
	{
		/* Too small, or we'll never hit a 16-byte boundary. Punt. */
		return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
		                                   DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
		                                   width, height, shift, withAlpha);
	}

	for (UINT32 h = 0; h < height; h++)
	{
		UINT32 w = width;

		while (w >= 8)
		{
			__m128i R7;

			/* The faster path, 16-byte aligned load. */
			__m128i R0 = LOAD_SI128(sptr);
			sptr += (128 / 8);
			__m128i R1 = LOAD_SI128(sptr);
			sptr += (128 / 8);

			/* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
			/* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
			/* Shuffle to pack all the like types together. */
			__m128i R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
			__m128i R3 = _mm_shuffle_epi8(R0, R2);
			__m128i R4 = _mm_shuffle_epi8(R1, R2);
			/* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
			/* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
			__m128i R5 = _mm_unpackhi_epi32(R3, R4);
			__m128i R6 = _mm_unpacklo_epi32(R3, R4);

			/* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
			/* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
			/* Save alphas aside */
			if (withAlpha)
				R7 = _mm_unpackhi_epi64(R5, R5);
			else
				R7 = mm_set1_epu32(0xFFFFFFFFU);

			/* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
			/* Expand Y's from 8-bit unsigned to 16-bit signed. */
			R1 = mm_set1_epu32(0);
			R0 = _mm_unpacklo_epi8(R5, R1);
			/* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
			/* Shift Co's and Cg's by (shift-1).  -1 covers division by two.
			 * Note: this must be done before sign-conversion.
			 * Note also there is no slli_epi8, so we have to use a 16-bit
			 * version and then mask.
			 */
			R6 = _mm_slli_epi16(R6, dataShift);
			R1 = mm_set1_epu8(mask);
			R6 = _mm_and_si128(R6, R1);
			/* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
			/* Expand Co's from 8-bit signed to 16-bit signed */
			R1 = _mm_unpackhi_epi8(R6, R6);
			R1 = _mm_srai_epi16(R1, 8);
			/* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
			/* Expand Cg's form 8-bit signed to 16-bit signed */
			R2 = _mm_unpacklo_epi8(R6, R6);
			R2 = _mm_srai_epi16(R2, 8);
			/* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
			/* Get Y - halfCg and save */
			R6 = _mm_subs_epi16(R0, R2);
			/* R = (Y-halfCg) + halfCo */
			R3 = _mm_adds_epi16(R6, R1);
			/* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
			/* G = Y + Cg(/2) */
			R4 = _mm_adds_epi16(R0, R2);
			/* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
			/* B = (Y-halfCg) - Co(/2) */
			R5 = _mm_subs_epi16(R6, R1);
			/* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
			/* Repack R's & B's. */
			/* This line is the only diff between inverted and non-inverted.
			 * Unfortunately, it would be expensive to check "inverted"
			 * every time through this loop.
			 */
			R0 = _mm_packus_epi16(R5, R3);
			/* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */
			/* Repack G's. */
			R1 = _mm_packus_epi16(R4, R4);
			/* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */
			/* And add the A's. */
			R1 = _mm_unpackhi_epi64(R1, R7);
			/* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */
			/* Now do interleaving again. */
			R2 = _mm_unpacklo_epi8(R0, R1);
			/* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
			R3 = _mm_unpackhi_epi8(R0, R1);
			/* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
			R4 = _mm_unpacklo_epi16(R2, R3);
			/* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
			R5 = _mm_unpackhi_epi16(R2, R3);
			/* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */
			STORE_SI128(dptr, R4);
			dptr += (128 / 8);
			STORE_SI128(dptr, R5);
			dptr += (128 / 8);
			w -= 8;
		}

		/* Handle any remainder pixels of the row with the scalar fallback. */
		if (w > 0)
		{
			pstatus_t status = 0;
			status = generic->YCoCgToRGB_8u_AC4R(
			    sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
			    WINPR_ASSERTING_INT_CAST(INT32, dstStep), WINPR_ASSERTING_INT_CAST(UINT32, w), 1,
			    shift, withAlpha);

			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr += WINPR_ASSERTING_INT_CAST(UINT32, w) * sizeof(UINT32);
			dptr += WINPR_ASSERTING_INT_CAST(UINT32, w) * sizeof(UINT32);
		}

		sptr += sRowBump;
		dptr += dRowBump;
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/**
 * Dispatcher for SSSE3 YCoCg -> RGB conversion.
 *
 * Routes BGR-ordered destination formats to the "invert" kernel, RGB-ordered
 * ones to the "no_invert" kernel, and everything else to the generic scalar
 * implementation.  Step arguments are only narrowed to UINT32 on the SIMD
 * paths, matching what those kernels expect.
 */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
                                           BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                           INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
                                           BOOL withAlpha)
{
	const BOOL bgrOrder =
	    (DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32);
	const BOOL rgbOrder =
	    (DstFormat == PIXEL_FORMAT_RGBX32) || (DstFormat == PIXEL_FORMAT_RGBA32);

	if (bgrOrder)
		return ssse3_YCoCgRToRGB_8u_AC4R_invert(
		    pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat,
		    WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha);

	if (rgbOrder)
		return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
		    pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat,
		    WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha);

	/* Unsupported destination layout: keep the generic implementation. */
	return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
	                                   shift, withAlpha);
}
|
||||
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Install the SSSE3 YCoCg->RGB kernel into the primitives table.  When the
 * build has no SSE/AVX intrinsics this is a no-op and the generic entry in
 * `prims` is left untouched. */
void primitives_init_YCoCg_ssse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
	/* Keep a handle to the generic table for the fallback paths above. */
	generic = primitives_get_generic();

	WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
	prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
#else
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available");
	WINPR_UNUSED(prims);
#endif
}
|
||||
1742
third_party/FreeRDP/libfreerdp/primitives/sse/prim_YUV_sse4.1.c
vendored
Normal file
1742
third_party/FreeRDP/libfreerdp/primitives/sse/prim_YUV_sse4.1.c
vendored
Normal file
File diff suppressed because it is too large
Load Diff
187
third_party/FreeRDP/libfreerdp/primitives/sse/prim_add_sse3.c
vendored
Normal file
187
third_party/FreeRDP/libfreerdp/primitives/sse/prim_add_sse3.c
vendored
Normal file
@@ -0,0 +1,187 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Optimized add operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_add.h"
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
#include <pmmintrin.h>
|
||||
|
||||
static primitives_t* generic = nullptr;
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Expands to sse3_add_16s(): element-wise saturated add of two INT16 arrays
 * into a destination, built from the "source, source, destination" SIMD
 * template (prim_templates.h).  _mm_adds_epi16 is the vector op; the last
 * argument is the per-element scalar fallback used for tails. */
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
                 generic->add_16s(sptr1++, sptr2++, dptr++, 1))
|
||||
|
||||
/**
 * In-place saturated addition of two INT16 buffers: every element of BOTH
 * pSrcDst1 and pSrcDst2 is overwritten with the saturated sum of the pair.
 *
 * Fixes vs. the previous revision:
 *  - The lead-in elements consumed to reach 16-byte alignment were not
 *    subtracted from the remaining length, so the SIMD loops read and wrote
 *    past the end of both buffers.  `len` now starts at `ulen` and is
 *    decremented by the lead-in count.
 *  - The "aligned" and "unaligned" 4-register loops were byte-identical
 *    (both use the LOAD_SI128/STORE_SI128 macros), so the duplicate branch
 *    and its alignment test are collapsed into one loop.  Behavior is
 *    unchanged because the two bodies were the same.
 *
 * @param pSrcDst1 first operand, overwritten with the sums
 * @param pSrcDst2 second operand, also overwritten with the sums
 * @param ulen     number of INT16 elements in each buffer
 * @return PRIMITIVES_SUCCESS or the error from the generic fallback
 */
static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
                                      INT16* WINPR_RESTRICT pSrcDst2, UINT32 ulen)
{
	const int shifts = 2;
	INT16* dptr1 = pSrcDst1;
	INT16* dptr2 = pSrcDst2;

	if (ulen < 16) /* pointless if too small */
		return generic->add_16s_inplace(pSrcDst1, pSrcDst2, ulen);

	const UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
	if ((ULONG_PTR)pSrcDst1 & offBeatMask)
	{
		/* Incrementing the pointer skips over 16-byte boundary. */
		return generic->add_16s_inplace(pSrcDst1, pSrcDst2, ulen);
	}

	size_t len = ulen;

	/* Get to the 16-byte boundary now. */
	const size_t rem = ((UINT_PTR)dptr1 & 0xf) / sizeof(INT16);
	if (rem != 0)
	{
		const UINT32 add = 16 - (UINT32)rem;
		const pstatus_t status = generic->add_16s_inplace(dptr1, dptr2, add);
		if (status != PRIMITIVES_SUCCESS)
			return status;
		dptr1 += add;
		dptr2 += add;
		/* Account for the lead-in; without this the loops below would
		 * process `add` elements beyond the end of both buffers. */
		len -= add;
	}

	/* Use 4 128-bit SSE registers: 64 INT16 elements per iteration. */
	size_t count = len >> (7 - shifts);
	len -= count << (7 - shifts);
	while (count--)
	{
		const __m128i* vsptr1 = (const __m128i*)dptr1;
		const __m128i* vsptr2 = (const __m128i*)dptr2;
		__m128i* vdptr1 = (__m128i*)dptr1;
		__m128i* vdptr2 = (__m128i*)dptr2;

		__m128i xmm0 = LOAD_SI128(vsptr1++);
		__m128i xmm1 = LOAD_SI128(vsptr1++);
		__m128i xmm2 = LOAD_SI128(vsptr1++);
		__m128i xmm3 = LOAD_SI128(vsptr1++);
		__m128i xmm4 = LOAD_SI128(vsptr2++);
		__m128i xmm5 = LOAD_SI128(vsptr2++);
		__m128i xmm6 = LOAD_SI128(vsptr2++);
		__m128i xmm7 = LOAD_SI128(vsptr2++);

		xmm0 = _mm_adds_epi16(xmm0, xmm4);
		xmm1 = _mm_adds_epi16(xmm1, xmm5);
		xmm2 = _mm_adds_epi16(xmm2, xmm6);
		xmm3 = _mm_adds_epi16(xmm3, xmm7);

		/* Both buffers receive the result (in-place on each). */
		STORE_SI128(vdptr1++, xmm0);
		STORE_SI128(vdptr1++, xmm1);
		STORE_SI128(vdptr1++, xmm2);
		STORE_SI128(vdptr1++, xmm3);

		STORE_SI128(vdptr2++, xmm0);
		STORE_SI128(vdptr2++, xmm1);
		STORE_SI128(vdptr2++, xmm2);
		STORE_SI128(vdptr2++, xmm3);

		dptr1 = (INT16*)vdptr1;
		dptr2 = (INT16*)vdptr2;
	}

	/* Use a single 128-bit SSE register: 8 INT16 elements per iteration. */
	count = len >> (5 - shifts);
	len -= count << (5 - shifts);
	while (count--)
	{
		const __m128i* vsptr1 = (const __m128i*)dptr1;
		const __m128i* vsptr2 = (const __m128i*)dptr2;
		__m128i* vdptr1 = (__m128i*)dptr1;
		__m128i* vdptr2 = (__m128i*)dptr2;

		__m128i xmm0 = LOAD_SI128(vsptr1);
		__m128i xmm1 = LOAD_SI128(vsptr2);

		xmm0 = _mm_adds_epi16(xmm0, xmm1);

		STORE_SI128(vdptr1++, xmm0);
		STORE_SI128(vdptr2++, xmm0);

		dptr1 = (INT16*)vdptr1;
		dptr2 = (INT16*)vdptr2;
	}

	/* Finish off the remainder. */
	if (len > 0)
		return generic->add_16s_inplace(dptr1, dptr2, WINPR_ASSERTING_INT_CAST(uint32_t, len));

	return PRIMITIVES_SUCCESS;
}
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Install the SSE3 add primitives into the primitives table; no-op (generic
 * entries untouched) when SSE/AVX intrinsics were not compiled in. */
void primitives_init_add_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
	/* Keep a handle to the generic table for the fallback paths above. */
	generic = primitives_get_generic();

	WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
	prims->add_16s = sse3_add_16s;
	prims->add_16s_inplace = sse3_add_16s_inplace;
#else
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
	WINPR_UNUSED(prims);
#endif
}
|
||||
215
third_party/FreeRDP/libfreerdp/primitives/sse/prim_alphaComp_sse3.c
vendored
Normal file
215
third_party/FreeRDP/libfreerdp/primitives/sse/prim_alphaComp_sse3.c
vendored
Normal file
@@ -0,0 +1,215 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Optimized alpha blending routines.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*
|
||||
* Note: this code assumes the second operand is fully opaque,
|
||||
* e.g.
|
||||
* newval = alpha1*val1 + (1-alpha1)*val2
|
||||
* rather than
|
||||
* newval = alpha1*val1 + (1-alpha1)*alpha2*val2
|
||||
* The IPP gives other options.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_alphaComp.h"
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_avxsse.h"
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
#include <pmmintrin.h>
|
||||
|
||||
static primitives_t* generic = nullptr;
|
||||
|
||||
/**
 * SSE2 alpha composition of two 32bpp ARGB images, 4 pixels per SIMD
 * iteration:  dst = src2 + ((alpha1 + 1) * (src1 - src2)) >> 8,
 * i.e. newval = alpha1*val1 + (1-alpha1)*val2 with src2 treated as fully
 * opaque (see the file-header note).  Rows whose destination cannot reach a
 * 16-byte boundary, and per-row lead-in/remainder pixels, go through the
 * generic scalar implementation.
 *
 * @param pSrc1    foreground image (its alpha channel drives the blend)
 * @param src1Step foreground row stride in bytes
 * @param pSrc2    background image (assumed opaque)
 * @param src2Step background row stride in bytes
 * @param pDst     destination image
 * @param dstStep  destination row stride in bytes
 * @param width    width in pixels
 * @param height   height in rows
 * @return PRIMITIVES_SUCCESS or the error from the generic fallback
 */
static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
                                     const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
                                     BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width,
                                     UINT32 height)
{
	const UINT32* sptr1 = (const UINT32*)pSrc1;
	const UINT32* sptr2 = (const UINT32*)pSrc2;

	/* Nothing to do for an empty area. */
	if ((width <= 0) || (height <= 0))
		return PRIMITIVES_SUCCESS;

	if (width < 4) /* pointless if too small */
	{
		return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width,
		                               height);
	}

	UINT32* dptr = (UINT32*)pDst;
	const size_t linebytes = width * sizeof(UINT32);
	/* Per-row padding (stride minus payload), in UINT32 units. */
	const size_t src1Jump = (src1Step - linebytes) / sizeof(UINT32);
	const size_t src2Jump = (src2Step - linebytes) / sizeof(UINT32);
	const size_t dstJump = (dstStep - linebytes) / sizeof(UINT32);
	/* xmm0 = all zeroes (for byte->word unpack), xmm1 = all words 1. */
	__m128i xmm0 = mm_set1_epu32(0);
	__m128i xmm1 = _mm_set1_epi16(1);

	for (UINT32 y = 0; y < height; ++y)
	{
		uint32_t pixels = width;
		uint32_t count = 0;
		/* Get to the 16-byte boundary now. */
		uint32_t leadIn = 0;

		switch ((ULONG_PTR)dptr & 0x0f)
		{
			case 0:
				leadIn = 0;
				break;

			case 4:
				leadIn = 3;
				break;

			case 8:
				leadIn = 2;
				break;

			case 12:
				leadIn = 1;
				break;

			default:
				/* We'll never hit a 16-byte boundary, so do the whole
				 * thing the slow way.
				 */
				leadIn = width;
				break;
		}

		if (leadIn)
		{
			/* Scalar lead-in until dptr is 16-byte aligned. */
			pstatus_t status = 0;
			status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
			                                 src2Step, (BYTE*)dptr, dstStep, leadIn, 1);
			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr1 += leadIn;
			sptr2 += leadIn;
			dptr += leadIn;
			pixels -= leadIn;
		}

		/* Use SSE registers to do 4 pixels at a time. */
		count = pixels >> 2;
		pixels -= count << 2;

		while (count--)
		{
			__m128i xmm2;
			__m128i xmm3;
			__m128i xmm4;
			__m128i xmm5;
			__m128i xmm6;
			__m128i xmm7;
			/* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
			xmm2 = LOAD_SI128(sptr1);
			sptr1 += 4;
			/* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
			xmm3 = LOAD_SI128(sptr2);
			sptr2 += 4;
			/* High half first: unpack bytes to words against zero. */
			/* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
			xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
			/* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */
			xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
			/* subtract */
			xmm6 = _mm_subs_epi16(xmm4, xmm5);
			/* Broadcast each pixel's alpha word across its 4 channels. */
			/* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
			xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
			/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
			xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
			/* Add one to alphas */
			xmm4 = _mm_adds_epi16(xmm4, xmm1);
			/* Multiply and take low word */
			xmm4 = _mm_mullo_epi16(xmm4, xmm6);
			/* Shift 8 right */
			xmm4 = _mm_srai_epi16(xmm4, 8);
			/* Add xmm5 */
			xmm4 = _mm_adds_epi16(xmm4, xmm5);
			/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
			/* Now the low half, same sequence. */
			/* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
			xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
			/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
			xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
			/* subtract */
			xmm7 = _mm_subs_epi16(xmm5, xmm6);
			/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
			xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
			/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
			xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
			/* Add one to alphas */
			xmm5 = _mm_adds_epi16(xmm5, xmm1);
			/* Multiply and take low word */
			xmm5 = _mm_mullo_epi16(xmm5, xmm7);
			/* Shift 8 right */
			xmm5 = _mm_srai_epi16(xmm5, 8);
			/* Add xmm6 */
			xmm5 = _mm_adds_epi16(xmm5, xmm6);
			/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */
			/* Must mask off remainders or pack gets confused */
			xmm3 = _mm_set1_epi16(0x00ffU);
			xmm4 = _mm_and_si128(xmm4, xmm3);
			xmm5 = _mm_and_si128(xmm5, xmm3);
			/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
			xmm5 = _mm_packus_epi16(xmm5, xmm4);
			STORE_SI128(dptr, xmm5);
			dptr += 4;
		}

		/* Finish off the remainder. */
		if (pixels)
		{
			pstatus_t status = 0;
			status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
			                                 src2Step, (BYTE*)dptr, dstStep, pixels, 1);
			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr1 += pixels;
			sptr2 += pixels;
			dptr += pixels;
		}

		/* Jump to next row. */
		sptr1 += src1Jump;
		sptr2 += src2Jump;
		dptr += dstJump;
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Install the SSE2 alpha-composition primitive into the primitives table;
 * no-op (generic entry untouched) when intrinsics were not compiled in. */
void primitives_init_alphaComp_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
	/* Keep a handle to the generic table for the fallback paths above. */
	generic = primitives_get_generic();
	WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
	prims->alphaComp_argb = sse2_alphaComp_argb;

#else
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
	WINPR_UNUSED(prims);
#endif
}
|
||||
54
third_party/FreeRDP/libfreerdp/primitives/sse/prim_andor_sse3.c
vendored
Normal file
54
third_party/FreeRDP/libfreerdp/primitives/sse/prim_andor_sse3.c
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Optimized Logical operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_andor.h"
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
#include <pmmintrin.h>
|
||||
|
||||
static primitives_t* generic = nullptr;
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Expand sse3_andC_32u()/sse3_orC_32u(): bitwise AND/OR of each 32-bit
 * element with a constant, built from the "source, constant, destination"
 * pre-op SIMD template (prim_templates.h).  The vector op is
 * _mm_and_si128/_mm_or_si128; the trailing expression is the per-element
 * scalar fallback used for tails. */
SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, generic->andC_32u, _mm_and_si128,
                     *dptr++ = *sptr++ & val)
SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, generic->orC_32u, _mm_or_si128, *dptr++ = *sptr++ | val)
|
||||
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Install the SSE3 AND/OR-with-constant primitives into the primitives
 * table; no-op (generic entries untouched) when intrinsics were not
 * compiled in. */
void primitives_init_andor_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
	/* Keep a handle to the generic table for the template fallbacks above. */
	generic = primitives_get_generic();

	WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
	prims->andC_32u = sse3_andC_32u;
	prims->orC_32u = sse3_orC_32u;

#else
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
	WINPR_UNUSED(prims);
#endif
}
|
||||
79
third_party/FreeRDP/libfreerdp/primitives/sse/prim_avxsse.h
vendored
Normal file
79
third_party/FreeRDP/libfreerdp/primitives/sse/prim_avxsse.h
vendored
Normal file
@@ -0,0 +1,79 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* FreeRDP primitives SSE implementation
|
||||
*
|
||||
* Copyright 2025 Armin Novak <armin.novak@thincast.com>
|
||||
* Copyright 2025 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <winpr/cast.h>
|
||||
|
||||
#include "../../core/simd.h"
|
||||
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
#include <pmmintrin.h>
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline __m128i mm_set_epu32(uint32_t val1, uint32_t val2, uint32_t val3, uint32_t val4)
|
||||
{
|
||||
return _mm_set_epi32(WINPR_CXX_COMPAT_CAST(int32_t, val1), WINPR_CXX_COMPAT_CAST(int32_t, val2),
|
||||
WINPR_CXX_COMPAT_CAST(int32_t, val3),
|
||||
WINPR_CXX_COMPAT_CAST(int32_t, val4));
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline __m128i mm_set_epu8(uint8_t val1, uint8_t val2, uint8_t val3, uint8_t val4,
|
||||
uint8_t val5, uint8_t val6, uint8_t val7, uint8_t val8,
|
||||
uint8_t val9, uint8_t val10, uint8_t val11, uint8_t val12,
|
||||
uint8_t val13, uint8_t val14, uint8_t val15, uint8_t val16)
|
||||
{
|
||||
return _mm_set_epi8(WINPR_CXX_COMPAT_CAST(int8_t, val1), WINPR_CXX_COMPAT_CAST(int8_t, val2),
|
||||
WINPR_CXX_COMPAT_CAST(int8_t, val3), WINPR_CXX_COMPAT_CAST(int8_t, val4),
|
||||
WINPR_CXX_COMPAT_CAST(int8_t, val5), WINPR_CXX_COMPAT_CAST(int8_t, val6),
|
||||
WINPR_CXX_COMPAT_CAST(int8_t, val7), WINPR_CXX_COMPAT_CAST(int8_t, val8),
|
||||
WINPR_CXX_COMPAT_CAST(int8_t, val9), WINPR_CXX_COMPAT_CAST(int8_t, val10),
|
||||
WINPR_CXX_COMPAT_CAST(int8_t, val11), WINPR_CXX_COMPAT_CAST(int8_t, val12),
|
||||
WINPR_CXX_COMPAT_CAST(int8_t, val13), WINPR_CXX_COMPAT_CAST(int8_t, val14),
|
||||
WINPR_CXX_COMPAT_CAST(int8_t, val15), WINPR_CXX_COMPAT_CAST(int8_t, val16));
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline __m128i mm_set1_epu32(uint32_t val)
|
||||
{
|
||||
return _mm_set1_epi32(WINPR_CXX_COMPAT_CAST(int32_t, val));
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline __m128i mm_set1_epu8(uint8_t val)
|
||||
{
|
||||
return _mm_set1_epi8(WINPR_CXX_COMPAT_CAST(int8_t, val));
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline __m128i LOAD_SI128(const void* ptr)
|
||||
{
|
||||
const __m128i* mptr = WINPR_CXX_COMPAT_CAST(const __m128i*, ptr);
|
||||
return _mm_lddqu_si128(mptr);
|
||||
}
|
||||
|
||||
static inline void STORE_SI128(void* ptr, __m128i val)
|
||||
{
|
||||
__m128i* mptr = WINPR_CXX_COMPAT_CAST(__m128i*, ptr);
|
||||
_mm_storeu_si128(mptr, val);
|
||||
}
|
||||
|
||||
#endif
|
||||
1056
third_party/FreeRDP/libfreerdp/primitives/sse/prim_colors_sse2.c
vendored
Normal file
1056
third_party/FreeRDP/libfreerdp/primitives/sse/prim_colors_sse2.c
vendored
Normal file
File diff suppressed because it is too large
Load Diff
278
third_party/FreeRDP/libfreerdp/primitives/sse/prim_copy_avx2.c
vendored
Normal file
278
third_party/FreeRDP/libfreerdp/primitives/sse/prim_copy_avx2.c
vendored
Normal file
@@ -0,0 +1,278 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Copy operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <freerdp/log.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_copy.h"
|
||||
#include "../codec/color.h"
|
||||
|
||||
#include <freerdp/codec/color.h>
|
||||
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline __m256i mm256_set_epu32(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3,
|
||||
uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7)
|
||||
{
|
||||
return _mm256_set_epi32((int32_t)i0, (int32_t)i1, (int32_t)i2, (int32_t)i3, (int32_t)i4,
|
||||
(int32_t)i5, (int32_t)i6, (int32_t)i7);
|
||||
}
|
||||
|
||||
static inline pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
|
||||
UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
|
||||
UINT32 nHeight,
|
||||
const BYTE* WINPR_RESTRICT pSrcData,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
|
||||
int64_t srcVMultiplier, int64_t srcVOffset,
|
||||
int64_t dstVMultiplier, int64_t dstVOffset)
|
||||
{
|
||||
|
||||
const int64_t srcByte = 3;
|
||||
const int64_t dstByte = 4;
|
||||
|
||||
const __m256i mask = mm256_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
|
||||
0xFF000000, 0xFF000000, 0xFF000000);
|
||||
const __m256i smask = mm256_set_epu32(0xff171615, 0xff141312, 0xff1110ff, 0xffffffff,
|
||||
0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
|
||||
const __m256i shelpmask = mm256_set_epu32(0xffffffff, 0xffffffff, 0xffffff1f, 0xff1e1d1c,
|
||||
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
|
||||
const UINT32 rem = nWidth % 8;
|
||||
const int64_t width = nWidth - rem;
|
||||
|
||||
for (int64_t y = 0; y < nHeight; y++)
|
||||
{
|
||||
const BYTE* WINPR_RESTRICT srcLine =
|
||||
&pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
|
||||
BYTE* WINPR_RESTRICT dstLine =
|
||||
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
|
||||
|
||||
int64_t x = 0;
|
||||
|
||||
/* Ensure alignment requirements can be met */
|
||||
for (; x < width; x += 8)
|
||||
{
|
||||
const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
|
||||
__m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
|
||||
const __m256i s0 = _mm256_loadu_si256(src);
|
||||
__m256i s1 = _mm256_shuffle_epi8(s0, smask);
|
||||
|
||||
/* _mm256_shuffle_epi8 can not cross 128bit lanes.
|
||||
* manually copy these bytes with extract/insert */
|
||||
const __m256i sx = _mm256_broadcastsi128_si256(_mm256_extractf128_si256(s0, 0));
|
||||
const __m256i sxx = _mm256_shuffle_epi8(sx, shelpmask);
|
||||
const __m256i bmask = _mm256_set_epi32(0x00000000, 0x00000000, 0x000000FF, 0x00FFFFFF,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000);
|
||||
const __m256i merged = _mm256_blendv_epi8(s1, sxx, bmask);
|
||||
|
||||
const __m256i s2 = _mm256_loadu_si256(dst);
|
||||
__m256i d0 = _mm256_blendv_epi8(merged, s2, mask);
|
||||
_mm256_storeu_si256(dst, d0);
|
||||
}
|
||||
|
||||
for (; x < nWidth; x++)
|
||||
{
|
||||
const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
|
||||
BYTE* dst = &dstLine[(x + nXDst) * dstByte];
|
||||
*dst++ = *src++;
|
||||
*dst++ = *src++;
|
||||
*dst++ = *src++;
|
||||
}
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static inline pstatus_t avx2_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData,
|
||||
UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
|
||||
UINT32 nWidth, UINT32 nHeight,
|
||||
const BYTE* WINPR_RESTRICT pSrcData,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
|
||||
int64_t srcVMultiplier, int64_t srcVOffset,
|
||||
int64_t dstVMultiplier, int64_t dstVOffset)
|
||||
{
|
||||
|
||||
const int64_t srcByte = 4;
|
||||
const int64_t dstByte = 4;
|
||||
|
||||
const __m256i mask = _mm256_setr_epi8(
|
||||
(char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
|
||||
(char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
|
||||
(char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
|
||||
(char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00);
|
||||
const UINT32 rem = nWidth % 8;
|
||||
const int64_t width = nWidth - rem;
|
||||
for (int64_t y = 0; y < nHeight; y++)
|
||||
{
|
||||
const BYTE* WINPR_RESTRICT srcLine =
|
||||
&pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
|
||||
BYTE* WINPR_RESTRICT dstLine =
|
||||
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
|
||||
|
||||
int64_t x = 0;
|
||||
for (; x < width; x += 8)
|
||||
{
|
||||
const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
|
||||
__m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
|
||||
const __m256i s0 = _mm256_loadu_si256(src);
|
||||
const __m256i s1 = _mm256_loadu_si256(dst);
|
||||
__m256i d0 = _mm256_blendv_epi8(s1, s0, mask);
|
||||
_mm256_storeu_si256(dst, d0);
|
||||
}
|
||||
|
||||
for (; x < nWidth; x++)
|
||||
{
|
||||
const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
|
||||
BYTE* dst = &dstLine[(x + nXDst) * dstByte];
|
||||
*dst++ = *src++;
|
||||
*dst++ = *src++;
|
||||
*dst++ = *src++;
|
||||
}
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static pstatus_t avx2_image_copy_no_overlap_dst_alpha(
|
||||
BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
|
||||
UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
|
||||
UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier,
|
||||
int64_t dstVOffset)
|
||||
{
|
||||
WINPR_ASSERT(pDstData);
|
||||
WINPR_ASSERT(pSrcData);
|
||||
|
||||
switch (SrcFormat)
|
||||
{
|
||||
case PIXEL_FORMAT_BGR24:
|
||||
switch (DstFormat)
|
||||
{
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
case PIXEL_FORMAT_BGRA32:
|
||||
return avx2_image_copy_bgr24_bgrx32(
|
||||
pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
|
||||
nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
case PIXEL_FORMAT_BGRA32:
|
||||
switch (DstFormat)
|
||||
{
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
case PIXEL_FORMAT_BGRA32:
|
||||
return avx2_image_copy_bgrx32_bgrx32(
|
||||
pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
|
||||
nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case PIXEL_FORMAT_RGBX32:
|
||||
case PIXEL_FORMAT_RGBA32:
|
||||
switch (DstFormat)
|
||||
{
|
||||
case PIXEL_FORMAT_RGBX32:
|
||||
case PIXEL_FORMAT_RGBA32:
|
||||
return avx2_image_copy_bgrx32_bgrx32(
|
||||
pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
|
||||
nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
primitives_t* gen = primitives_get_generic();
|
||||
return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
|
||||
pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
|
||||
}
|
||||
|
||||
static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
|
||||
UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
|
||||
UINT32 nWidth, UINT32 nHeight,
|
||||
const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
|
||||
const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
|
||||
{
|
||||
const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) != 0;
|
||||
int64_t srcVOffset = 0;
|
||||
int64_t srcVMultiplier = 1;
|
||||
int64_t dstVOffset = 0;
|
||||
int64_t dstVMultiplier = 1;
|
||||
|
||||
if ((nWidth == 0) || (nHeight == 0))
|
||||
return PRIMITIVES_SUCCESS;
|
||||
|
||||
if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
|
||||
return -1;
|
||||
|
||||
if (!pDstData || !pSrcData)
|
||||
return -1;
|
||||
|
||||
if (nDstStep == 0)
|
||||
nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
|
||||
|
||||
if (nSrcStep == 0)
|
||||
nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
|
||||
|
||||
if (vSrcVFlip)
|
||||
{
|
||||
srcVOffset = (nHeight - 1ll) * nSrcStep;
|
||||
srcVMultiplier = -1;
|
||||
}
|
||||
|
||||
if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
|
||||
return avx2_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
|
||||
nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
|
||||
nXSrc, nYSrc, palette, flags, srcVMultiplier,
|
||||
srcVOffset, dstVMultiplier, dstVOffset);
|
||||
else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
|
||||
return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
|
||||
nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
|
||||
nXSrc, nYSrc, palette, srcVMultiplier,
|
||||
srcVOffset, dstVMultiplier, dstVOffset, flags);
|
||||
else
|
||||
{
|
||||
primitives_t* gen = primitives_get_generic();
|
||||
return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
|
||||
pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_copy_avx2_int(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
WLog_VRB(PRIM_TAG, "AVX2 optimizations");
|
||||
prims->copy_no_overlap = avx2_image_copy_no_overlap;
|
||||
#else
|
||||
WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or WITH_AVX2 or AVX2 intrinsics not available");
|
||||
WINPR_UNUSED(prims);
|
||||
#endif
|
||||
}
|
||||
257
third_party/FreeRDP/libfreerdp/primitives/sse/prim_copy_sse4_1.c
vendored
Normal file
257
third_party/FreeRDP/libfreerdp/primitives/sse/prim_copy_sse4_1.c
vendored
Normal file
@@ -0,0 +1,257 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Copy operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <freerdp/log.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_avxsse.h"
|
||||
#include "prim_copy.h"
|
||||
#include "../codec/color.h"
|
||||
|
||||
#include <freerdp/codec/color.h>
|
||||
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline pstatus_t sse_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
|
||||
UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
|
||||
UINT32 nHeight,
|
||||
const BYTE* WINPR_RESTRICT pSrcData,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
|
||||
int64_t srcVMultiplier, int64_t srcVOffset,
|
||||
int64_t dstVMultiplier, int64_t dstVOffset)
|
||||
{
|
||||
|
||||
const int64_t srcByte = 3;
|
||||
const int64_t dstByte = 4;
|
||||
|
||||
const __m128i mask = mm_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000);
|
||||
const __m128i smask = mm_set_epu32(0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
|
||||
const UINT32 rem = nWidth % 4;
|
||||
|
||||
const int64_t width = nWidth - rem;
|
||||
for (int64_t y = 0; y < nHeight; y++)
|
||||
{
|
||||
const BYTE* WINPR_RESTRICT srcLine =
|
||||
&pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
|
||||
BYTE* WINPR_RESTRICT dstLine =
|
||||
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
|
||||
|
||||
int64_t x = 0;
|
||||
/* Ensure alignment requirements can be met */
|
||||
for (; x < width; x += 4)
|
||||
{
|
||||
const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
|
||||
__m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
|
||||
const __m128i s0 = LOAD_SI128(src);
|
||||
const __m128i s1 = _mm_shuffle_epi8(s0, smask);
|
||||
const __m128i s2 = LOAD_SI128(dst);
|
||||
|
||||
__m128i d0 = _mm_blendv_epi8(s1, s2, mask);
|
||||
STORE_SI128(dst, d0);
|
||||
}
|
||||
|
||||
for (; x < nWidth; x++)
|
||||
{
|
||||
const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
|
||||
BYTE* dst = &dstLine[(x + nXDst) * dstByte];
|
||||
*dst++ = *src++;
|
||||
*dst++ = *src++;
|
||||
*dst++ = *src++;
|
||||
}
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static inline pstatus_t sse_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
|
||||
UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
|
||||
UINT32 nHeight,
|
||||
const BYTE* WINPR_RESTRICT pSrcData,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
|
||||
int64_t srcVMultiplier, int64_t srcVOffset,
|
||||
int64_t dstVMultiplier, int64_t dstVOffset)
|
||||
{
|
||||
|
||||
const int64_t srcByte = 4;
|
||||
const int64_t dstByte = 4;
|
||||
|
||||
const __m128i mask = _mm_setr_epi8((char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF,
|
||||
(char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF,
|
||||
(char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00);
|
||||
const UINT32 rem = nWidth % 4;
|
||||
const int64_t width = nWidth - rem;
|
||||
for (int64_t y = 0; y < nHeight; y++)
|
||||
{
|
||||
const BYTE* WINPR_RESTRICT srcLine =
|
||||
&pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
|
||||
BYTE* WINPR_RESTRICT dstLine =
|
||||
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
|
||||
|
||||
int64_t x = 0;
|
||||
for (; x < width; x += 4)
|
||||
{
|
||||
const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
|
||||
__m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
|
||||
const __m128i s0 = LOAD_SI128(src);
|
||||
const __m128i s1 = LOAD_SI128(dst);
|
||||
__m128i d0 = _mm_blendv_epi8(s1, s0, mask);
|
||||
STORE_SI128(dst, d0);
|
||||
}
|
||||
|
||||
for (; x < nWidth; x++)
|
||||
{
|
||||
const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
|
||||
BYTE* dst = &dstLine[(x + nXDst) * dstByte];
|
||||
*dst++ = *src++;
|
||||
*dst++ = *src++;
|
||||
*dst++ = *src++;
|
||||
}
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static pstatus_t sse_image_copy_no_overlap_dst_alpha(
|
||||
BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
|
||||
UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
|
||||
UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier,
|
||||
int64_t dstVOffset)
|
||||
{
|
||||
WINPR_ASSERT(pDstData);
|
||||
WINPR_ASSERT(pSrcData);
|
||||
|
||||
switch (SrcFormat)
|
||||
{
|
||||
case PIXEL_FORMAT_BGR24:
|
||||
switch (DstFormat)
|
||||
{
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
case PIXEL_FORMAT_BGRA32:
|
||||
return sse_image_copy_bgr24_bgrx32(
|
||||
pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
|
||||
nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
case PIXEL_FORMAT_BGRA32:
|
||||
switch (DstFormat)
|
||||
{
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
case PIXEL_FORMAT_BGRA32:
|
||||
return sse_image_copy_bgrx32_bgrx32(
|
||||
pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
|
||||
nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case PIXEL_FORMAT_RGBX32:
|
||||
case PIXEL_FORMAT_RGBA32:
|
||||
switch (DstFormat)
|
||||
{
|
||||
case PIXEL_FORMAT_RGBX32:
|
||||
case PIXEL_FORMAT_RGBA32:
|
||||
return sse_image_copy_bgrx32_bgrx32(
|
||||
pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
|
||||
nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
primitives_t* gen = primitives_get_generic();
|
||||
return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
|
||||
pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
|
||||
}
|
||||
|
||||
static pstatus_t sse_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
|
||||
UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
|
||||
UINT32 nWidth, UINT32 nHeight,
|
||||
const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
|
||||
const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
|
||||
{
|
||||
const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) != 0;
|
||||
int64_t srcVOffset = 0;
|
||||
int64_t srcVMultiplier = 1;
|
||||
int64_t dstVOffset = 0;
|
||||
int64_t dstVMultiplier = 1;
|
||||
|
||||
if ((nWidth == 0) || (nHeight == 0))
|
||||
return PRIMITIVES_SUCCESS;
|
||||
|
||||
if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
|
||||
return -1;
|
||||
|
||||
if (!pDstData || !pSrcData)
|
||||
return -1;
|
||||
|
||||
if (nDstStep == 0)
|
||||
nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
|
||||
|
||||
if (nSrcStep == 0)
|
||||
nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
|
||||
|
||||
if (vSrcVFlip)
|
||||
{
|
||||
srcVOffset = (nHeight - 1ll) * nSrcStep;
|
||||
srcVMultiplier = -1;
|
||||
}
|
||||
|
||||
if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
|
||||
return sse_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
|
||||
nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
|
||||
nXSrc, nYSrc, palette, flags, srcVMultiplier,
|
||||
srcVOffset, dstVMultiplier, dstVOffset);
|
||||
else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
|
||||
return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
|
||||
nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
|
||||
nXSrc, nYSrc, palette, srcVMultiplier,
|
||||
srcVOffset, dstVMultiplier, dstVOffset, flags);
|
||||
else
|
||||
{
|
||||
primitives_t* gen = primitives_get_generic();
|
||||
return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
|
||||
pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_copy_sse41_int(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
WLog_VRB(PRIM_TAG, "SSE4.1 optimizations");
|
||||
prims->copy_no_overlap = sse_image_copy_no_overlap;
|
||||
#else
|
||||
WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE4.1 intrinsics not available");
|
||||
WINPR_UNUSED(prims);
|
||||
#endif
|
||||
}
|
||||
235
third_party/FreeRDP/libfreerdp/primitives/sse/prim_set_sse2.c
vendored
Normal file
235
third_party/FreeRDP/libfreerdp/primitives/sse/prim_set_sse2.c
vendored
Normal file
@@ -0,0 +1,235 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Optimized routines to set a chunk of memory to a constant.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_avxsse.h"
|
||||
#include "prim_set.h"
|
||||
|
||||
/* ========================================================================= */
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
|
||||
static primitives_t* generic = nullptr;
|
||||
|
||||
static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 ulen)
|
||||
{
|
||||
size_t len = ulen;
|
||||
BYTE byte = 0;
|
||||
BYTE* dptr = nullptr;
|
||||
__m128i xmm0;
|
||||
size_t count = 0;
|
||||
|
||||
if (len < 16)
|
||||
return generic->set_8u(val, pDst, ulen);
|
||||
|
||||
byte = val;
|
||||
dptr = pDst;
|
||||
|
||||
/* Seek 16-byte alignment. */
|
||||
while ((ULONG_PTR)dptr & 0x0f)
|
||||
{
|
||||
*dptr++ = byte;
|
||||
|
||||
if (--len == 0)
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
xmm0 = mm_set1_epu8(byte);
|
||||
/* Cover 256-byte chunks via SSE register stores. */
|
||||
count = len >> 8;
|
||||
len -= count << 8;
|
||||
|
||||
/* Do 256-byte chunks using one XMM register. */
|
||||
while (count--)
|
||||
{
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 16;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 16;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 16;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 16;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 16;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 16;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 16;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 16;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 16;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 16;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 16;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 16;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 16;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 16;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 16;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 16;
|
||||
}
|
||||
|
||||
/* Cover 16-byte chunks via SSE register stores. */
|
||||
count = len >> 4;
|
||||
len -= count << 4;
|
||||
|
||||
/* Do 16-byte chunks using one XMM register. */
|
||||
while (count--)
|
||||
{
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 16;
|
||||
}
|
||||
|
||||
/* Do leftover bytes. */
|
||||
while (len--)
|
||||
*dptr++ = byte;
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static pstatus_t sse2_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 ulen)
|
||||
{
|
||||
size_t len = ulen;
|
||||
const primitives_t* prim = primitives_get_generic();
|
||||
UINT32* dptr = pDst;
|
||||
__m128i xmm0;
|
||||
size_t count = 0;
|
||||
|
||||
/* If really short, just do it here. */
|
||||
if (len < 32)
|
||||
{
|
||||
while (len--)
|
||||
*dptr++ = val;
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* Assure we can reach 16-byte alignment. */
|
||||
if (((ULONG_PTR)dptr & 0x03) != 0)
|
||||
{
|
||||
return prim->set_32u(val, pDst, ulen);
|
||||
}
|
||||
|
||||
/* Seek 16-byte alignment. */
|
||||
while ((ULONG_PTR)dptr & 0x0f)
|
||||
{
|
||||
*dptr++ = val;
|
||||
|
||||
if (--len == 0)
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
xmm0 = mm_set1_epu32(val);
|
||||
/* Cover 256-byte chunks via SSE register stores. */
|
||||
count = len >> 6;
|
||||
len -= count << 6;
|
||||
|
||||
/* Do 256-byte chunks using one XMM register. */
|
||||
while (count--)
|
||||
{
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 4;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 4;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 4;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 4;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 4;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 4;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 4;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 4;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 4;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 4;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 4;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 4;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 4;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 4;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 4;
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 4;
|
||||
}
|
||||
|
||||
/* Cover 16-byte chunks via SSE register stores. */
|
||||
count = len >> 2;
|
||||
len -= count << 2;
|
||||
|
||||
/* Do 16-byte chunks using one XMM register. */
|
||||
while (count--)
|
||||
{
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 4;
|
||||
}
|
||||
|
||||
/* Do leftover bytes. */
|
||||
while (len--)
|
||||
*dptr++ = val;
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static pstatus_t sse2_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len)
|
||||
{
|
||||
UINT32 uval = *((UINT32*)&val);
|
||||
return sse2_set_32u(uval, (UINT32*)pDst, len);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_set_sse2_int(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
generic = primitives_get_generic();
|
||||
|
||||
/* Pick tuned versions if possible. */
|
||||
|
||||
WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
|
||||
prims->set_8u = sse2_set_8u;
|
||||
prims->set_32s = sse2_set_32s;
|
||||
prims->set_32u = sse2_set_32u;
|
||||
|
||||
#else
|
||||
WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available");
|
||||
WINPR_UNUSED(prims);
|
||||
#endif
|
||||
}
|
||||
160
third_party/FreeRDP/libfreerdp/primitives/sse/prim_shift_sse3.c
vendored
Normal file
160
third_party/FreeRDP/libfreerdp/primitives/sse/prim_shift_sse3.c
vendored
Normal file
@@ -0,0 +1,160 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Shift operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_shift.h"
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
#include <pmmintrin.h>
|
||||
|
||||
static primitives_t* generic = nullptr;
|
||||
|
||||
/* ------------------------------------------------------------------------- */
/* Constant left shift, signed 16-bit. The scalar tail shifts in the
 * unsigned domain and masks back to 16 bits, avoiding signed-shift UB. */
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16, int16_t,
                 *dptr++ = (INT16)(((UINT16)*sptr++ << val) & 0xFFFF))
/* ------------------------------------------------------------------------- */
/* Constant right shift, signed 16-bit: arithmetic (sign-extending) shift. */
SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s, _mm_srai_epi16, int16_t,
                 *dptr++ = *sptr++ >> val)
/* ------------------------------------------------------------------------- */
/* Constant left shift, unsigned 16-bit. */
SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16, int16_t,
                 *dptr++ = (((UINT16)*sptr++ << val) & 0xFFFF))
/* ------------------------------------------------------------------------- */
/* Constant right shift, unsigned 16-bit: logical (zero-filling) shift. */
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16, int16_t,
                 *dptr++ = *sptr++ >> val)
|
||||
|
||||
/* In-place constant left shift of signed 16-bit data.
 *
 * Falls back to the generic implementation when the buffer is too small to
 * benefit from SIMD, or when the pointer is odd (and therefore can never be
 * stepped onto a 16-byte boundary in whole INT16 units).
 *
 * @param pSrcDst buffer shifted in place
 * @param val     shift amount; must be < 16 (returns -1 otherwise)
 * @param ulen    number of INT16 elements
 * @return PRIMITIVES_SUCCESS on success, -1 on invalid shift, or the
 *         status of a failing generic fallback call
 */
static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 ulen)
{
	size_t len = ulen;
	const INT32 shifts = 2; /* element-size factor used in the chunk math below */
	if (val == 0)
		return PRIMITIVES_SUCCESS; /* shift by zero is a no-op */
	if (val >= 16)
		return -1; /* shifting a 16-bit lane by >= 16 is invalid */
	if (len < 16) /* pointless if too small */
		return generic->lShiftC_16s_inplace(pSrcDst, val, ulen);

	/* Odd base address: element-sized steps can never reach 16-byte
	 * alignment, so hand the whole job to the generic version. */
	UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
	if ((ULONG_PTR)pSrcDst & offBeatMask)
	{
		/* Incrementing the pointer skips over 16-byte boundary. */
		return generic->lShiftC_16s_inplace(pSrcDst, val, ulen);
	}
	/* Get to the 16-byte boundary now. */
	const UINT32 rem = ((UINT_PTR)pSrcDst & 0x0f) / sizeof(INT16);
	if (rem > 0)
	{
		/* 16 - rem elements == 32 - 2*rem bytes; since the address is even,
		 * this lands exactly on a 16-byte boundary. len >= 16 > add, so the
		 * subtraction below cannot underflow. */
		const UINT32 add = 16 - rem;
		pstatus_t status = generic->lShiftC_16s_inplace(pSrcDst, val, add);
		if (status != PRIMITIVES_SUCCESS)
			return status;
		pSrcDst += add;
		len -= add;
	}

	/* Use 8 128-bit SSE registers. */
	size_t count = len >> (8 - shifts); /* 64 elements per iteration */
	len -= count << (8 - shifts);

	while (count--)
	{
		const __m128i* src = (const __m128i*)pSrcDst;

		__m128i xmm0 = LOAD_SI128(src++);
		__m128i xmm1 = LOAD_SI128(src++);
		__m128i xmm2 = LOAD_SI128(src++);
		__m128i xmm3 = LOAD_SI128(src++);
		__m128i xmm4 = LOAD_SI128(src++);
		__m128i xmm5 = LOAD_SI128(src++);
		__m128i xmm6 = LOAD_SI128(src++);
		__m128i xmm7 = LOAD_SI128(src);

		xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);
		xmm1 = _mm_slli_epi16(xmm1, (int16_t)val);
		xmm2 = _mm_slli_epi16(xmm2, (int16_t)val);
		xmm3 = _mm_slli_epi16(xmm3, (int16_t)val);
		xmm4 = _mm_slli_epi16(xmm4, (int16_t)val);
		xmm5 = _mm_slli_epi16(xmm5, (int16_t)val);
		xmm6 = _mm_slli_epi16(xmm6, (int16_t)val);
		xmm7 = _mm_slli_epi16(xmm7, (int16_t)val);

		__m128i* dst = (__m128i*)pSrcDst;

		STORE_SI128(dst++, xmm0);
		STORE_SI128(dst++, xmm1);
		STORE_SI128(dst++, xmm2);
		STORE_SI128(dst++, xmm3);
		STORE_SI128(dst++, xmm4);
		STORE_SI128(dst++, xmm5);
		STORE_SI128(dst++, xmm6);
		STORE_SI128(dst++, xmm7);

		pSrcDst = (INT16*)dst;
	}

	/* Use a single 128-bit SSE register. */
	count = len >> (5 - shifts); /* 8 elements per iteration */
	len -= count << (5 - shifts);
	while (count--)
	{
		const __m128i* src = (const __m128i*)pSrcDst;
		__m128i xmm0 = LOAD_SI128(src);

		xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);

		__m128i* dst = (__m128i*)pSrcDst;
		STORE_SI128(dst++, xmm0);
		pSrcDst = (INT16*)dst;
	}

	/* Finish off the remainder (< 8 elements) via the generic version. */
	if (len > 0)
		return generic->lShiftC_16s_inplace(pSrcDst, val, WINPR_ASSERTING_INT_CAST(uint32_t, len));

	return PRIMITIVES_SUCCESS;
}
|
||||
#endif
|
||||
|
||||
/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
|
||||
* depending on the sign of val. To avoid using the deprecated inplace
|
||||
* routines, a wrapper can use the src for the dest.
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_shift_sse3_int(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
generic = primitives_get_generic();
|
||||
|
||||
WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
|
||||
prims->lShiftC_16s_inplace = sse2_lShiftC_16s_inplace;
|
||||
prims->lShiftC_16s = sse2_lShiftC_16s;
|
||||
prims->rShiftC_16s = sse2_rShiftC_16s;
|
||||
prims->lShiftC_16u = sse2_lShiftC_16u;
|
||||
prims->rShiftC_16u = sse2_rShiftC_16u;
|
||||
|
||||
#else
|
||||
WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
|
||||
WINPR_UNUSED(prims);
|
||||
#endif
|
||||
}
|
||||
188
third_party/FreeRDP/libfreerdp/primitives/sse/prim_sign_ssse3.c
vendored
Normal file
188
third_party/FreeRDP/libfreerdp/primitives/sse/prim_sign_ssse3.c
vendored
Normal file
@@ -0,0 +1,188 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Optimized sign operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_sign.h"
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_avxsse.h"
|
||||
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
#include <tmmintrin.h>
|
||||
|
||||
static primitives_t* generic = nullptr;
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
|
||||
UINT32 ulen)
|
||||
{
|
||||
size_t len = ulen;
|
||||
const INT16* sptr = pSrc;
|
||||
INT16* dptr = pDst;
|
||||
size_t count = 0;
|
||||
|
||||
if (len < 16)
|
||||
{
|
||||
return generic->sign_16s(pSrc, pDst, ulen);
|
||||
}
|
||||
|
||||
/* Check for 16-byte alignment (eventually). */
|
||||
if ((ULONG_PTR)pDst & 0x01)
|
||||
{
|
||||
return generic->sign_16s(pSrc, pDst, ulen);
|
||||
}
|
||||
|
||||
/* Seek 16-byte alignment. */
|
||||
while ((ULONG_PTR)dptr & 0x0f)
|
||||
{
|
||||
INT16 src = *sptr++;
|
||||
*dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? (-1) : ((src > 0) ? 1 : 0));
|
||||
|
||||
if (--len == 0)
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* Do 32-short chunks using 8 XMM registers. */
|
||||
count = len >> 5; /* / 32 */
|
||||
len -= count << 5; /* * 32 */
|
||||
|
||||
if ((ULONG_PTR)sptr & 0x0f)
|
||||
{
|
||||
/* Unaligned */
|
||||
while (count--)
|
||||
{
|
||||
__m128i xmm0;
|
||||
__m128i xmm1;
|
||||
__m128i xmm2;
|
||||
__m128i xmm3;
|
||||
__m128i xmm4;
|
||||
__m128i xmm5;
|
||||
__m128i xmm6;
|
||||
__m128i xmm7;
|
||||
xmm0 = _mm_set1_epi16(0x0001U);
|
||||
xmm1 = _mm_set1_epi16(0x0001U);
|
||||
xmm2 = _mm_set1_epi16(0x0001U);
|
||||
xmm3 = _mm_set1_epi16(0x0001U);
|
||||
xmm4 = LOAD_SI128(sptr);
|
||||
sptr += 8;
|
||||
xmm5 = LOAD_SI128(sptr);
|
||||
sptr += 8;
|
||||
xmm6 = LOAD_SI128(sptr);
|
||||
sptr += 8;
|
||||
xmm7 = LOAD_SI128(sptr);
|
||||
sptr += 8;
|
||||
xmm0 = _mm_sign_epi16(xmm0, xmm4);
|
||||
xmm1 = _mm_sign_epi16(xmm1, xmm5);
|
||||
xmm2 = _mm_sign_epi16(xmm2, xmm6);
|
||||
xmm3 = _mm_sign_epi16(xmm3, xmm7);
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 8;
|
||||
STORE_SI128(dptr, xmm1);
|
||||
dptr += 8;
|
||||
STORE_SI128(dptr, xmm2);
|
||||
dptr += 8;
|
||||
STORE_SI128(dptr, xmm3);
|
||||
dptr += 8;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Aligned */
|
||||
while (count--)
|
||||
{
|
||||
__m128i xmm0;
|
||||
__m128i xmm1;
|
||||
__m128i xmm2;
|
||||
__m128i xmm3;
|
||||
__m128i xmm4;
|
||||
__m128i xmm5;
|
||||
__m128i xmm6;
|
||||
__m128i xmm7;
|
||||
xmm0 = _mm_set1_epi16(0x0001U);
|
||||
xmm1 = _mm_set1_epi16(0x0001U);
|
||||
xmm2 = _mm_set1_epi16(0x0001U);
|
||||
xmm3 = _mm_set1_epi16(0x0001U);
|
||||
xmm4 = LOAD_SI128(sptr);
|
||||
sptr += 8;
|
||||
xmm5 = LOAD_SI128(sptr);
|
||||
sptr += 8;
|
||||
xmm6 = LOAD_SI128(sptr);
|
||||
sptr += 8;
|
||||
xmm7 = LOAD_SI128(sptr);
|
||||
sptr += 8;
|
||||
xmm0 = _mm_sign_epi16(xmm0, xmm4);
|
||||
xmm1 = _mm_sign_epi16(xmm1, xmm5);
|
||||
xmm2 = _mm_sign_epi16(xmm2, xmm6);
|
||||
xmm3 = _mm_sign_epi16(xmm3, xmm7);
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 8;
|
||||
STORE_SI128(dptr, xmm1);
|
||||
dptr += 8;
|
||||
STORE_SI128(dptr, xmm2);
|
||||
dptr += 8;
|
||||
STORE_SI128(dptr, xmm3);
|
||||
dptr += 8;
|
||||
}
|
||||
}
|
||||
|
||||
/* Do 8-short chunks using two XMM registers. */
|
||||
count = len >> 3;
|
||||
len -= count << 3;
|
||||
|
||||
while (count--)
|
||||
{
|
||||
__m128i xmm0 = _mm_set1_epi16(0x0001U);
|
||||
__m128i xmm1 = LOAD_SI128(sptr);
|
||||
sptr += 8;
|
||||
xmm0 = _mm_sign_epi16(xmm0, xmm1);
|
||||
STORE_SI128(dptr, xmm0);
|
||||
dptr += 8;
|
||||
}
|
||||
|
||||
/* Do leftovers. */
|
||||
while (len--)
|
||||
{
|
||||
INT16 src = *sptr++;
|
||||
*dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? -1 : ((src > 0) ? 1 : 0));
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
#endif /* SSE_AVX_INTRINSICS_ENABLED */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_sign_ssse3_int(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
generic = primitives_get_generic();
|
||||
|
||||
/* Pick tuned versions if possible. */
|
||||
/* I didn't spot an IPP version of this. */
|
||||
|
||||
WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
|
||||
prims->sign_16s = ssse3_sign_16s;
|
||||
|
||||
#else
|
||||
WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSSE3/SSE3 intrinsics not available");
|
||||
WINPR_UNUSED(prims);
|
||||
#endif
|
||||
}
|
||||
278
third_party/FreeRDP/libfreerdp/primitives/sse/prim_templates.h
vendored
Normal file
278
third_party/FreeRDP/libfreerdp/primitives/sse/prim_templates.h
vendored
Normal file
@@ -0,0 +1,278 @@
|
||||
/* prim_templates.h
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License. Algorithms used by
|
||||
* this code may be covered by patents by HP, Microsoft, or other parties.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "prim_avxsse.h"
|
||||
|
||||
/* These are prototypes for SSE (potentially NEON) routines that do a
|
||||
* simple SSE operation over an array of data. Since so much of this
|
||||
* code is shared except for the operation itself, these prototypes are
|
||||
* used rather than duplicating code. The naming convention depends on
|
||||
* the parameters: S=Source param; C=Constant; D=Destination.
|
||||
* All the macros have parameters for a fallback procedure if the data
|
||||
* is too small and an operation "the slow way" for use at 16-byte edges.
|
||||
*/
|
||||
|
||||
/* SSE3 note: If someone needs to support an SSE2 version of these without
|
||||
* SSE3 support, an alternative version could be added that merely checks
|
||||
* that 16-byte alignment on both destination and source(s) can be
|
||||
* achieved, rather than use LDDQU for unaligned reads.
|
||||
*/
|
||||
|
||||
/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
|
||||
* It easily can't do that if the value is stored in a variable.
|
||||
* So don't save it as an intermediate value.
|
||||
*/
|
||||
|
||||
/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 */
/* Expands to:
 *   static pstatus_t _name_(const _type_* pSrc, UINT32 val,
 *                           _type_* pDst, UINT32 ulen)
 * applying the SIMD shift _op_ (e.g. _mm_slli_epi16) by constant `val`
 * to `ulen` elements, in 8-register chunks, then 1-register chunks, then
 * a scalar `_slowWay_` tail. Returns PRIMITIVES_SUCCESS, or -1 when
 * val >= 16; val == 0 is a no-op.
 * NOTE(review): the _fallback_ argument is accepted but not referenced in
 * this expansion — presumably kept for signature compatibility; confirm.
 * `shifts` encodes the element size so that (8 - shifts) / (5 - shifts)
 * convert element counts into chunk counts. */
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _op_type_, _slowWay_) \
	WINPR_ATTR_NODISCARD \
	static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, UINT32 val, \
	                        _type_* WINPR_RESTRICT pDst, UINT32 ulen) \
	{ \
		size_t len = ulen; \
		INT32 shifts = 0; \
		const _type_* sptr = pSrc; \
		_type_* dptr = pDst; \
		if (val == 0) \
			return PRIMITIVES_SUCCESS; \
		if (val >= 16) \
			return -1; \
		if (sizeof(_type_) == 1) \
			shifts = 1; \
		else if (sizeof(_type_) == 2) \
			shifts = 2; \
		else if (sizeof(_type_) == 4) \
			shifts = 3; \
		else if (sizeof(_type_) == 8) \
			shifts = 4; \
		/* Use 8 128-bit SSE registers. */ \
		size_t count = len >> (8 - shifts); \
		len -= count << (8 - shifts); \
		\
		while (count--) \
		{ \
			__m128i xmm0 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm1 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm2 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm3 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm4 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm5 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm6 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm7 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			xmm0 = _op_(xmm0, (_op_type_)val); \
			xmm1 = _op_(xmm1, (_op_type_)val); \
			xmm2 = _op_(xmm2, (_op_type_)val); \
			xmm3 = _op_(xmm3, (_op_type_)val); \
			xmm4 = _op_(xmm4, (_op_type_)val); \
			xmm5 = _op_(xmm5, (_op_type_)val); \
			xmm6 = _op_(xmm6, (_op_type_)val); \
			xmm7 = _op_(xmm7, (_op_type_)val); \
			STORE_SI128(dptr, xmm0); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm1); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm2); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm3); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm4); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm5); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm6); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm7); \
			dptr += (16 / sizeof(_type_)); \
		} \
		\
		/* Use a single 128-bit SSE register. */ \
		count = len >> (5 - shifts); \
		len -= count << (5 - shifts); \
		while (count--) \
		{ \
			__m128i xmm0 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			xmm0 = _op_(xmm0, (_op_type_)val); \
			STORE_SI128(dptr, xmm0); \
			dptr += (16 / sizeof(_type_)); \
		} \
		/* Finish off the remainder. */ \
		while (len--) \
		{ \
			_slowWay_; \
		} \
		return PRIMITIVES_SUCCESS; \
	}
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* SCD = Source, Constant, Destination
|
||||
* PRE = preload xmm0 with the constant.
|
||||
*/
|
||||
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
|
||||
WINPR_ATTR_NODISCARD \
|
||||
static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, _type_ val, \
|
||||
_type_* WINPR_RESTRICT pDst, INT32 ilen) \
|
||||
{ \
|
||||
size_t len = WINPR_ASSERTING_INT_CAST(size_t, ilen); \
|
||||
int shifts = 0; \
|
||||
const _type_* sptr = pSrc; \
|
||||
_type_* dptr = pDst; \
|
||||
__m128i xmm0; \
|
||||
if (sizeof(_type_) == 1) \
|
||||
shifts = 1; \
|
||||
else if (sizeof(_type_) == 2) \
|
||||
shifts = 2; \
|
||||
else if (sizeof(_type_) == 4) \
|
||||
shifts = 3; \
|
||||
else if (sizeof(_type_) == 8) \
|
||||
shifts = 4; \
|
||||
/* Use 4 128-bit SSE registers. */ \
|
||||
size_t count = len >> (7 - shifts); \
|
||||
len -= count << (7 - shifts); \
|
||||
xmm0 = mm_set1_epu32(val); \
|
||||
for (size_t x = 0; x < count; x++) \
|
||||
{ \
|
||||
__m128i xmm1 = LOAD_SI128(sptr); \
|
||||
sptr += (16 / sizeof(_type_)); \
|
||||
__m128i xmm2 = LOAD_SI128(sptr); \
|
||||
sptr += (16 / sizeof(_type_)); \
|
||||
__m128i xmm3 = LOAD_SI128(sptr); \
|
||||
sptr += (16 / sizeof(_type_)); \
|
||||
__m128i xmm4 = LOAD_SI128(sptr); \
|
||||
sptr += (16 / sizeof(_type_)); \
|
||||
xmm1 = _op_(xmm1, xmm0); \
|
||||
xmm2 = _op_(xmm2, xmm0); \
|
||||
xmm3 = _op_(xmm3, xmm0); \
|
||||
xmm4 = _op_(xmm4, xmm0); \
|
||||
STORE_SI128(dptr, xmm1); \
|
||||
dptr += (16 / sizeof(_type_)); \
|
||||
STORE_SI128(dptr, xmm2); \
|
||||
dptr += (16 / sizeof(_type_)); \
|
||||
STORE_SI128(dptr, xmm3); \
|
||||
dptr += (16 / sizeof(_type_)); \
|
||||
STORE_SI128(dptr, xmm4); \
|
||||
dptr += (16 / sizeof(_type_)); \
|
||||
} \
|
||||
/* Use a single 128-bit SSE register. */ \
|
||||
count = len >> (5 - shifts); \
|
||||
len -= count << (5 - shifts); \
|
||||
for (size_t x = 0; x < count; x++) \
|
||||
{ \
|
||||
__m128i xmm1 = LOAD_SI128(sptr); \
|
||||
sptr += (16 / sizeof(_type_)); \
|
||||
xmm1 = _op_(xmm1, xmm0); \
|
||||
STORE_SI128(dptr, xmm1); \
|
||||
dptr += (16 / sizeof(_type_)); \
|
||||
} \
|
||||
/* Finish off the remainder. */ \
|
||||
for (size_t x = 0; x < len; x++) \
|
||||
{ \
|
||||
_slowWay_; \
|
||||
} \
|
||||
return PRIMITIVES_SUCCESS; \
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------------
 * SSD = Source1, Source2, Destination
 */
/* Expands to:
 *   static pstatus_t _name_(const _type_* pSrc1, const _type_* pSrc2,
 *                           _type_* pDst, UINT32 ulen)
 * combining the two sources element-wise with the SIMD _op_ (e.g.
 * _mm_adds_epi16). Processes 4-register chunks, then 1-register chunks,
 * then a scalar `_slowWay_` tail whose pstatus_t result is checked so the
 * first failing tail element aborts the loop.
 * NOTE(review): the _fallback_ argument is accepted but not referenced in
 * this expansion; the "Aligned loads" comment below looks stale — no
 * alignment is checked, and LOAD_SI128 is used throughout; confirm. */
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
	WINPR_ATTR_NODISCARD \
	static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc1, \
	                        const _type_* WINPR_RESTRICT pSrc2, _type_* WINPR_RESTRICT pDst, \
	                        UINT32 ulen) \
	{ \
		size_t len = ulen; \
		int shifts = 0; \
		const _type_* sptr1 = pSrc1; \
		const _type_* sptr2 = pSrc2; \
		_type_* dptr = pDst; \
		size_t count; \
		if (sizeof(_type_) == 1) \
			shifts = 1; \
		else if (sizeof(_type_) == 2) \
			shifts = 2; \
		else if (sizeof(_type_) == 4) \
			shifts = 3; \
		else if (sizeof(_type_) == 8) \
			shifts = 4; \
		/* Use 4 128-bit SSE registers. */ \
		count = len >> (7 - shifts); \
		len -= count << (7 - shifts); \
		/* Aligned loads */ \
		while (count--) \
		{ \
			__m128i xmm0 = LOAD_SI128(sptr1); \
			sptr1 += (16 / sizeof(_type_)); \
			__m128i xmm1 = LOAD_SI128(sptr1); \
			sptr1 += (16 / sizeof(_type_)); \
			__m128i xmm2 = LOAD_SI128(sptr1); \
			sptr1 += (16 / sizeof(_type_)); \
			__m128i xmm3 = LOAD_SI128(sptr1); \
			sptr1 += (16 / sizeof(_type_)); \
			__m128i xmm4 = LOAD_SI128(sptr2); \
			sptr2 += (16 / sizeof(_type_)); \
			__m128i xmm5 = LOAD_SI128(sptr2); \
			sptr2 += (16 / sizeof(_type_)); \
			__m128i xmm6 = LOAD_SI128(sptr2); \
			sptr2 += (16 / sizeof(_type_)); \
			__m128i xmm7 = LOAD_SI128(sptr2); \
			sptr2 += (16 / sizeof(_type_)); \
			xmm0 = _op_(xmm0, xmm4); \
			xmm1 = _op_(xmm1, xmm5); \
			xmm2 = _op_(xmm2, xmm6); \
			xmm3 = _op_(xmm3, xmm7); \
			STORE_SI128(dptr, xmm0); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm1); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm2); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm3); \
			dptr += (16 / sizeof(_type_)); \
		} \
		/* Use a single 128-bit SSE register. */ \
		count = len >> (5 - shifts); \
		len -= count << (5 - shifts); \
		while (count--) \
		{ \
			__m128i xmm0 = LOAD_SI128(sptr1); \
			sptr1 += (16 / sizeof(_type_)); \
			__m128i xmm1 = LOAD_SI128(sptr2); \
			sptr2 += (16 / sizeof(_type_)); \
			xmm0 = _op_(xmm0, xmm1); \
			STORE_SI128(dptr, xmm0); \
			dptr += (16 / sizeof(_type_)); \
		} \
		/* Finish off the remainder. */ \
		while (len--) \
		{ \
			const pstatus_t rc = _slowWay_; \
			if (rc != PRIMITIVES_SUCCESS) \
				return rc; \
		} \
		return PRIMITIVES_SUCCESS; \
	}
|
||||
Reference in New Issue
Block a user