Milestone 5: deliver embedded RDP sessions and lifecycle hardening

This commit is contained in:
Keith Smith
2026-03-03 18:59:26 -07:00
parent 230a401386
commit 36006bd4aa
2941 changed files with 724359 additions and 77 deletions

View File

@@ -0,0 +1,104 @@
# primitives

# Portable C implementations and the private headers that accompany them.
set(PRIMITIVES_SRCS
    prim_add.c
    prim_add.h
    prim_andor.c
    prim_andor.h
    prim_alphaComp.c
    prim_alphaComp.h
    prim_colors.c
    prim_colors.h
    prim_copy.c
    prim_copy.h
    prim_set.c
    prim_set.h
    prim_shift.c
    prim_shift.h
    prim_sign.c
    prim_sign.h
    prim_YUV.c
    prim_YUV.h
    prim_YCoCg.c
    prim_YCoCg.h
    primitives.c
    prim_internal.h
)

# x86 sources, grouped by the instruction-set level named in the variable.
# Note that the SSE3 group also carries the shared sse/ headers and the *_sse2.c files.
set(PRIMITIVES_SSE3_SRCS
    sse/prim_avxsse.h
    sse/prim_templates.h
    sse/prim_colors_sse2.c
    sse/prim_set_sse2.c
    sse/prim_add_sse3.c
    sse/prim_alphaComp_sse3.c
    sse/prim_andor_sse3.c
    sse/prim_shift_sse3.c
)

set(PRIMITIVES_SSSE3_SRCS
    sse/prim_sign_ssse3.c
    sse/prim_YCoCg_ssse3.c
)

set(PRIMITIVES_SSE4_1_SRCS
    sse/prim_copy_sse4_1.c
    sse/prim_YUV_sse4.1.c
)

# No SSE4.2 specific sources are listed; the variable is kept as a placeholder.
set(PRIMITIVES_SSE4_2_SRCS)

set(PRIMITIVES_AVX2_SRCS
    sse/prim_copy_avx2.c
)

# ARM NEON sources.
set(PRIMITIVES_NEON_SRCS
    neon/prim_colors_neon.c
    neon/prim_YCoCg_neon.c
    neon/prim_YUV_neon.c
)

# OpenCL host source; extended below when WITH_OPENCL is enabled.
set(PRIMITIVES_OPENCL_SRCS
    opencl/prim_YUV_opencl.c
)
# Optional OpenCL backend: embeds the kernel source into a generated header
# and registers the OpenCL include paths / library with the FreeRDP build.
if(WITH_OPENCL)
  include(WarnUnmaintained)
  warn_unmaintained("OpenCL support for primitives" "-DWITH_OPENCL=OFF")

  # The .cl kernel file is listed with the sources but flagged HEADER_FILE_ONLY.
  set(FILENAME "opencl/primitives.cl")
  set_source_files_properties(${FILENAME} PROPERTIES HEADER_FILE_ONLY ON)
  list(APPEND PRIMITIVES_OPENCL_SRCS ${FILENAME})

  # NOTE(review): file_to_hex_array presumably stores the file contents as a hex
  # array in FILEDATA for use by primitives.h.in - confirm in ConvertFileToHexArray.
  include(ConvertFileToHexArray)
  file_to_hex_array(${FILENAME} FILEDATA)

  # Generated header lives in the binary directory.
  set(HDR_FILE "${CMAKE_CURRENT_BINARY_DIR}/opencl/primitives-opencl-program.h")
  cleaning_configure_file("${CMAKE_CURRENT_SOURCE_DIR}/opencl/primitives.h.in" ${HDR_FILE} @ONLY)
  list(APPEND PRIMITIVES_OPENCL_SRCS ${HDR_FILE})

  include_directories(${CMAKE_CURRENT_BINARY_DIR}/opencl)
  freerdp_include_directory_add(${OpenCL_INCLUDE_DIRS})
  freerdp_library_add(OpenCL::OpenCL)
  freerdp_pc_add_requires_private("OpenCL")
endif()
# Collect every optimized source group. All groups are listed regardless of the
# target architecture; only the AVX2 group is additionally gated by WITH_AVX2.
set(PRIMITIVES_OPT_SRCS
    ${PRIMITIVES_NEON_SRCS}
    ${PRIMITIVES_SSE3_SRCS}
    ${PRIMITIVES_SSSE3_SRCS}
    ${PRIMITIVES_SSE4_1_SRCS}
    ${PRIMITIVES_SSE4_2_SRCS}
    ${PRIMITIVES_OPENCL_SRCS}
)

if(WITH_AVX2)
  list(APPEND PRIMITIVES_OPT_SRCS ${PRIMITIVES_AVX2_SRCS})
endif()

# Generic sources first, optimized sources after them.
list(APPEND PRIMITIVES_SRCS ${PRIMITIVES_OPT_SRCS})

include_directories(${CMAKE_CURRENT_SOURCE_DIR})

# Built as an OBJECT library.
add_library(freerdp-primitives OBJECT ${PRIMITIVES_SRCS})
include(CompilerDetect)
include(DetectIntrinsicSupport)

# Tag each source group with the SIMD level it is written for.
if(WITH_SIMD)
  set_simd_source_file_properties("sse3" ${PRIMITIVES_SSE3_SRCS})
  set_simd_source_file_properties("ssse3" ${PRIMITIVES_SSSE3_SRCS})
  set_simd_source_file_properties("sse4.1" ${PRIMITIVES_SSE4_1_SRCS})
  set_simd_source_file_properties("sse4.2" ${PRIMITIVES_SSE4_2_SRCS})
  set_simd_source_file_properties("avx2" ${PRIMITIVES_AVX2_SRCS})
  # NOTE(review): the "neon" call receives the whole optimized list, not just
  # PRIMITIVES_NEON_SRCS - confirm against set_simd_source_file_properties
  # whether this is intentional.
  set_simd_source_file_properties("neon" ${PRIMITIVES_OPT_SRCS})
endif()
# Register the object library with the enclosing FreeRDP build.
freerdp_object_library_add(freerdp-primitives)

# Optional benchmark executable.
if(BUILD_BENCHMARK)
  add_subdirectory(benchmark)
endif()

# Optional internal unit tests.
if(BUILD_TESTING_INTERNAL)
  add_subdirectory(test)
endif()

View File

@@ -0,0 +1,101 @@
The Primitives Library
Introduction
------------
The purpose of the primitives library is to give the freerdp code easy
access to *run-time* optimization via SIMD operations. When the library
is initialized, dynamic checks of processor features are run (such as
the support of SSE3 or Neon), and entrypoints are bound through
function pointers to provide the fastest possible operations. All
routines offer generic C alternatives as fallbacks.
Run-time optimization has the advantage of allowing a single executable
to run fast on multiple platforms with different SIMD capabilities.
Use In Code
-----------
A singleton pointing to a structure containing the function pointers
is accessed through primitives_get(). The function pointers can then
be used from that structure, e.g.
primitives_t *prims = primitives_get();
prims->shiftC_16s(buffer, shifts, buffer, 256);
Of course, there is some overhead in calling through the function pointer
and setting up the SIMD operations, so it would be counterproductive to
call the primitives library for very small operations, e.g. initializing an
array of eight values to a constant. The primitives library is intended
for larger-scale operations, e.g. arrays of size 64 and larger.
Initialization and Cleanup
--------------------------
Library initialization is done the first time primitives_init() is called
or the first time primitives_get() is used. Cleanup (if any) is done by
primitives_deinit().
Intel Integrated Performance Primitives (IPP)
---------------------------------------------
If freerdp is compiled with IPP support (-DWITH_IPP=ON), the IPP function
calls will be used (where available) to fill the function pointers.
Where possible, function names and parameter lists match IPP format so
that the IPP functions can be plugged into the function pointers without
a wrapper layer. Use of IPP is completely optional, and in many cases
the SSE operations in the primitives library itself are faster or similar
in performance.
Coverage
--------
The primitives library is not meant to be comprehensive, offering
entrypoints for every operation and operand type. Instead, the coverage
is focused on operations known to be performance bottlenecks in the code.
For instance, 16-bit signed operations are used widely in the RemoteFX
software, so you'll find 16s versions of several operations, but there
is no attempt to provide (unused) copies of the same code for 8u, 16u,
32s, etc.
New Optimizations
-----------------
As the need arises, new optimizations can be added to the library,
including NEON, AVX, and perhaps OpenCL or other SIMD implementations.
The CPU feature detection is done in winpr/sysinfo.
Adding Entrypoints
------------------
As the need for new operations or operands arises, new entrypoints can
be added.
1) Function prototypes and pointers are added to
include/freerdp/primitives.h
2) New module initialization and cleanup function prototypes are added
to prim_internal.h and called in primitives.c (primitives_init()
and primitives_deinit()).
3) Operation names and parameter lists should be compatible with the IPP.
IPP manuals are available online at software.intel.com.
4) A generic C entrypoint must be available as a fallback.
5) prim_templates.h contains macro-based templates for simple operations,
such as applying a single SSE operation to arrays of data.
The template functions can frequently be used to extend the
operations without writing a lot of new code.
Cache Management
----------------
I haven't found a lot of speed improvement by attempting prefetch, and
in fact it seems to have a negative impact in some cases. Done correctly,
the routines could perhaps be further accelerated by proper use of prefetch,
fences, etc.
Testing
-------
In the test subdirectory is an executable (prim_test) that tests both
functionality and speed of primitives library operations. Any new
modules should be added to that test, following the conventions already
established in that directory. The program can be executed on various
target hardware to compare generic C, optimized, and IPP performance
with various array sizes.

View File

@@ -0,0 +1,20 @@
# FreeRDP: A Remote Desktop Protocol Implementation
# FreeRDP cmake build script
#
# Copyright 2025 Armin Novak <anovak@thincast.com>
# Copyright 2025 Thincast Technologies GmbH
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Stand-alone benchmark executable for the primitives library.
add_executable(primitives-benchmark
    benchmark.c
)

target_link_libraries(primitives-benchmark
    PRIVATE
        winpr
        freerdp
)

View File

@@ -0,0 +1,254 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* primitives benchmarking tool
*
* Copyright 2025 Armin Novak <anovak@thincast.com>
* Copyright 2025 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdio.h>
#include <winpr/crypto.h>
#include <winpr/sysinfo.h>
#include <freerdp/primitives.h>
/* Working set shared by all benchmark runs in this file. */
typedef struct
{
/* Source planes for the YUV -> RGB runs, filled with random data at init. */
BYTE* channels[3];
/* Per-plane stride in bytes; init sets each entry to roi.width. */
UINT32 steps[3];
/* Dimensions of the image being converted. */
prim_size_t roi;
/* Destination of the YUV -> RGB conversions. */
BYTE* outputBuffer;
/* Destination planes of the RGB -> YUV conversions. */
BYTE* outputChannels[3];
/* Source of the RGB -> YUV conversions, filled with random data at init. */
BYTE* rgbBuffer;
/* Stride in bytes of outputBuffer and rgbBuffer (roi.width * 4). */
UINT32 outputStride;
/* Pixel format passed to the conversion primitives (PIXEL_FORMAT_BGRA32). */
UINT32 testedFormat;
} primitives_YUV_benchmark;
/* Release every buffer owned by the benchmark context and reset the context
 * to an all-zero state. A NULL context is ignored. */
static void primitives_YUV_benchmark_free(primitives_YUV_benchmark* bench)
{
	if (bench)
	{
		for (size_t plane = 0; plane < 3; plane++)
		{
			free(bench->channels[plane]);
			free(bench->outputChannels[plane]);
		}

		free(bench->rgbBuffer);
		free(bench->outputBuffer);

		const primitives_YUV_benchmark cleared = WINPR_C_ARRAY_INIT;
		*bench = cleared;
	}
}
/* Build the benchmark context: a 15360x8640 region, BGRA32 format, zeroed
 * output buffers and randomly filled input buffers.
 *
 * On any allocation or random-fill failure everything is released again and
 * the returned context is all-zero. */
static primitives_YUV_benchmark primitives_YUV_benchmark_init(void)
{
	primitives_YUV_benchmark ctx = WINPR_C_ARRAY_INIT;

	ctx.testedFormat = PIXEL_FORMAT_BGRA32;
	ctx.roi.width = 3840 * 4;
	ctx.roi.height = 2160 * 4;
	ctx.outputStride = ctx.roi.width * 4;

	/* Packed 32 bit buffers: conversion target and RGB source. */
	ctx.outputBuffer = calloc(ctx.outputStride, ctx.roi.height);
	if (!ctx.outputBuffer)
		goto cleanup;

	ctx.rgbBuffer = calloc(ctx.outputStride, ctx.roi.height);
	if (!ctx.rgbBuffer)
		goto cleanup;

	if (winpr_RAND(ctx.rgbBuffer, 1ULL * ctx.outputStride * ctx.roi.height) < 0)
		goto cleanup;

	/* One full-resolution byte plane per channel, for input and output. */
	for (size_t plane = 0; plane < 3; plane++)
	{
		ctx.channels[plane] = calloc(ctx.roi.width, ctx.roi.height);
		ctx.outputChannels[plane] = calloc(ctx.roi.width, ctx.roi.height);

		if (!ctx.channels[plane] || !ctx.outputChannels[plane])
			goto cleanup;

		if (winpr_RAND(ctx.channels[plane], 1ull * ctx.roi.width * ctx.roi.height) < 0)
			goto cleanup;

		ctx.steps[plane] = ctx.roi.width;
	}

	return ctx;

cleanup:
	primitives_YUV_benchmark_free(&ctx);
	return ctx;
}
/* Format a nanosecond count as "<s>.<ms>.<us>.<ns>" into buffer (each group
 * after the first zero padded to three digits) and return buffer. */
static const char* print_time(UINT64 t, char* buffer, size_t size)
{
	const unsigned nanos = (unsigned)((t) % 1000);
	const unsigned micros = (unsigned)((t / 1000ull) % 1000);
	const unsigned millis = (unsigned)((t / 1000000ull) % 1000);
	const unsigned seconds = (unsigned)(t / 1000000000ull);

	(void)_snprintf(buffer, size, "%u.%03u.%03u.%03u", seconds, millis, micros, nanos);
	return buffer;
}
/* Time ten YUV420ToRGB_8u_P3AC4R conversions with the given primitives and
 * print the duration of each. Returns FALSE as soon as one conversion fails. */
static BOOL primitives_YUV420_benchmark_run(primitives_YUV_benchmark* bench, primitives_t* prims)
{
	const BYTE* planes[3] = { bench->channels[0], bench->channels[1], bench->channels[2] };

	for (size_t run = 0; run < 10; run++)
	{
		const UINT64 begin = winpr_GetTickCount64NS();
		const pstatus_t rc =
		    prims->YUV420ToRGB_8u_P3AC4R(planes, bench->steps, bench->outputBuffer,
		                                 bench->outputStride, bench->testedFormat, &bench->roi);
		const UINT64 finish = winpr_GetTickCount64NS();

		if (rc != PRIMITIVES_SUCCESS)
		{
			(void)fprintf(stderr, "Running YUV420ToRGB_8u_P3AC4R failed\n");
			return FALSE;
		}

		char timestr[32] = WINPR_C_ARRAY_INIT;
		const UINT64 elapsed = finish - begin;
		printf("[%" PRIuz "] YUV420ToRGB_8u_P3AC4R %" PRIu32 "x%" PRIu32 " took %sns\n", run,
		       bench->roi.width, bench->roi.height, print_time(elapsed, timestr, sizeof(timestr)));
	}

	return TRUE;
}
/* Time ten YUV444ToRGB_8u_P3AC4R conversions with the given primitives and
 * print the duration of each. Returns FALSE as soon as one conversion fails. */
static BOOL primitives_YUV444_benchmark_run(primitives_YUV_benchmark* bench, primitives_t* prims)
{
	const BYTE* planes[3] = { bench->channels[0], bench->channels[1], bench->channels[2] };

	for (size_t run = 0; run < 10; run++)
	{
		const UINT64 begin = winpr_GetTickCount64NS();
		const pstatus_t rc =
		    prims->YUV444ToRGB_8u_P3AC4R(planes, bench->steps, bench->outputBuffer,
		                                 bench->outputStride, bench->testedFormat, &bench->roi);
		const UINT64 finish = winpr_GetTickCount64NS();

		if (rc != PRIMITIVES_SUCCESS)
		{
			(void)fprintf(stderr, "Running YUV444ToRGB_8u_P3AC4R failed\n");
			return FALSE;
		}

		char timestr[32] = WINPR_C_ARRAY_INIT;
		const UINT64 elapsed = finish - begin;
		printf("[%" PRIuz "] YUV444ToRGB_8u_P3AC4R %" PRIu32 "x%" PRIu32 " took %sns\n", run,
		       bench->roi.width, bench->roi.height, print_time(elapsed, timestr, sizeof(timestr)));
	}

	return TRUE;
}
/* Time ten RGBToYUV420_8u_P3AC4R conversions with the given primitives and
 * print the duration of each. Returns FALSE as soon as one conversion fails. */
static BOOL primitives_RGB2420_benchmark_run(primitives_YUV_benchmark* bench, primitives_t* prims)
{
	size_t run = 0;

	while (run < 10)
	{
		const UINT64 begin = winpr_GetTickCount64NS();
		const pstatus_t rc =
		    prims->RGBToYUV420_8u_P3AC4R(bench->rgbBuffer, bench->testedFormat, bench->outputStride,
		                                 bench->outputChannels, bench->steps, &bench->roi);
		const UINT64 finish = winpr_GetTickCount64NS();

		if (rc != PRIMITIVES_SUCCESS)
		{
			(void)fprintf(stderr, "Running RGBToYUV420_8u_P3AC4R failed\n");
			return FALSE;
		}

		char timestr[32] = WINPR_C_ARRAY_INIT;
		const UINT64 elapsed = finish - begin;
		printf("[%" PRIuz "] RGBToYUV420_8u_P3AC4R %" PRIu32 "x%" PRIu32 " took %sns\n", run,
		       bench->roi.width, bench->roi.height, print_time(elapsed, timestr, sizeof(timestr)));
		run++;
	}

	return TRUE;
}
/* Time ten RGBToYUV444_8u_P3AC4R conversions with the given primitives and
 * print the duration of each. Returns FALSE as soon as one conversion fails. */
static BOOL primitives_RGB2444_benchmark_run(primitives_YUV_benchmark* bench, primitives_t* prims)
{
	size_t run = 0;

	while (run < 10)
	{
		const UINT64 begin = winpr_GetTickCount64NS();
		const pstatus_t rc =
		    prims->RGBToYUV444_8u_P3AC4R(bench->rgbBuffer, bench->testedFormat, bench->outputStride,
		                                 bench->outputChannels, bench->steps, &bench->roi);
		const UINT64 finish = winpr_GetTickCount64NS();

		if (rc != PRIMITIVES_SUCCESS)
		{
			(void)fprintf(stderr, "Running RGBToYUV444_8u_P3AC4R failed\n");
			return FALSE;
		}

		char timestr[32] = WINPR_C_ARRAY_INIT;
		const UINT64 elapsed = finish - begin;
		printf("[%" PRIuz "] RGBToYUV444_8u_P3AC4R %" PRIu32 "x%" PRIu32 " took %sns\n", run,
		       bench->roi.width, bench->roi.height, print_time(elapsed, timestr, sizeof(timestr)));
		run++;
	}

	return TRUE;
}
int main(int argc, char* argv[])
{
WINPR_UNUSED(argc);
WINPR_UNUSED(argv);
primitives_YUV_benchmark bench = primitives_YUV_benchmark_init();
for (primitive_hints hint = PRIMITIVES_PURE_SOFT; hint < PRIMITIVES_AUTODETECT; hint++)
{
const char* hintstr = primtives_hint_str(hint);
primitives_t* prim = primitives_get_by_type(hint);
if (!prim)
{
(void)fprintf(stderr, "failed to get primitives: %s\n", hintstr);
goto fail;
}
printf("Running YUV420 -> RGB benchmark on %s implementation:\n", hintstr);
if (!primitives_YUV420_benchmark_run(&bench, prim))
{
(void)fprintf(stderr, "YUV420 -> RGB benchmark failed\n");
goto fail;
}
printf("\n");
printf("Running RGB -> YUV420 benchmark on %s implementation:\n", hintstr);
if (!primitives_RGB2420_benchmark_run(&bench, prim))
{
(void)fprintf(stderr, "RGB -> YUV420 benchmark failed\n");
goto fail;
}
printf("\n");
printf("Running YUV444 -> RGB benchmark on %s implementation:\n", hintstr);
if (!primitives_YUV444_benchmark_run(&bench, prim))
{
(void)fprintf(stderr, "YUV444 -> RGB benchmark failed\n");
goto fail;
}
printf("\n");
printf("Running RGB -> YUV444 benchmark on %s implementation:\n", hintstr);
if (!primitives_RGB2444_benchmark_run(&bench, prim))
{
(void)fprintf(stderr, "RGB -> YUV444 benchmark failed\n");
goto fail;
}
printf("\n");
}
fail:
primitives_YUV_benchmark_free(&bench);
return 0;
}

View File

@@ -0,0 +1,168 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized YCoCg<->RGB conversion operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_internal.h"
#include "prim_YCoCg.h"
#if defined(NEON_INTRINSICS_ENABLED)
#include <arm_neon.h>
/* Generic primitives table; assigned in primitives_init_YCoCg_neon_int() and
 * used as fallback for destination formats without a NEON path. */
static primitives_t* generic = nullptr;
/* Convert a YCoCg(+A) image to a 32 bit RGB layout using NEON.
 *
 * The source holds 4 bytes per pixel in the order Cg, Co, Y, A. Eight pixels
 * are converted per vector iteration; the remaining (width % 8) pixels of
 * each row are converted with scalar code.
 *
 * pSrc/srcStep   source pixels and source row stride in bytes
 * pDst/dstStep   destination pixels and destination row stride in bytes
 * DstFormat      only used to derive the bytes per pixel for the row padding
 * width/height   image size in pixels
 * shift          chroma shift; (shift - 1) is applied to Cg and Co
 * bPos..aPos     byte index (0..3) of each channel inside a destination pixel
 * alpha          TRUE: copy source alpha, FALSE: write 0xFF
 *
 * Both code paths always write 4 bytes per destination pixel.
 * Always returns PRIMITIVES_SUCCESS. */
static pstatus_t neon_YCoCgToRGB_8u_X(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
UINT32 width, UINT32 height, UINT8 shift, BYTE bPos,
BYTE gPos, BYTE rPos, BYTE aPos, BOOL alpha)
{
BYTE* dptr = pDst;
const BYTE* sptr = pSrc;
const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
const int8_t cll = shift - 1; /* -1 builds in the /2's */
/* Bytes to skip at the end of each source / destination row. */
const UINT32 srcPad = srcStep - (width * 4);
const UINT32 dstPad = dstStep - (width * formatSize);
/* Number of trailing pixels per row handled by the scalar loop. */
const UINT32 pad = width % 8;
const uint8x8_t aVal = vdup_n_u8(0xFF);
const int8x8_t cllv = vdup_n_s8(cll);
for (UINT32 y = 0; y < height; y++)
{
/* Vector path: 8 pixels (32 source bytes) per iteration. */
for (UINT32 x = 0; x < width - pad; x += 8)
{
/* Note: shifts must be done before sign-conversion. */
/* De-interleaving load: val[0]=Cg, val[1]=Co, val[2]=Y, val[3]=A. */
const uint8x8x4_t raw = vld4_u8(sptr);
const int8x8_t CgRaw = vreinterpret_s8_u8(vshl_u8(raw.val[0], cllv));
const int8x8_t CoRaw = vreinterpret_s8_u8(vshl_u8(raw.val[1], cllv));
const int16x8_t Cg = vmovl_s8(CgRaw);
const int16x8_t Co = vmovl_s8(CoRaw);
const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(raw.val[2])); /* UINT8 -> INT16 */
const int16x8_t T = vsubq_s16(Y, Cg);
const int16x8_t R = vaddq_s16(T, Co);
const int16x8_t G = vaddq_s16(Y, Cg);
const int16x8_t B = vsubq_s16(T, Co);
uint8x8x4_t bgrx;
/* Saturating narrow to 0..255, placed at the requested channel index. */
bgrx.val[bPos] = vqmovun_s16(B);
bgrx.val[gPos] = vqmovun_s16(G);
bgrx.val[rPos] = vqmovun_s16(R);
if (alpha)
bgrx.val[aPos] = raw.val[3];
else
bgrx.val[aPos] = aVal;
/* Interleaving store of 8 destination pixels. */
vst4_u8(dptr, bgrx);
sptr += sizeof(raw);
dptr += sizeof(bgrx);
}
/* Scalar tail: same arithmetic, one pixel at a time. */
for (UINT32 x = 0; x < pad; x++)
{
/* Note: shifts must be done before sign-conversion. */
const INT16 Cg = (INT16)((INT8)((*sptr++) << cll));
const INT16 Co = (INT16)((INT8)((*sptr++) << cll));
const INT16 Y = (INT16)(*sptr++); /* UINT8->INT16 */
const INT16 T = Y - Cg;
const INT16 R = T + Co;
const INT16 G = Y + Cg;
const INT16 B = T - Co;
BYTE bgra[4];
bgra[bPos] = CLIP(B);
bgra[gPos] = CLIP(G);
bgra[rPos] = CLIP(R);
/* The alpha byte is always consumed from the source, then optionally overridden. */
bgra[aPos] = *sptr++;
if (!alpha)
bgra[aPos] = 0xFF;
*dptr++ = bgra[0];
*dptr++ = bgra[1];
*dptr++ = bgra[2];
*dptr++ = bgra[3];
}
/* Advance to the start of the next row. */
sptr += srcPad;
dptr += dstPad;
}
return PRIMITIVES_SUCCESS;
}
/* NEON entry point for YCoCg -> RGB conversion.
 *
 * Selects the channel byte positions for the supported 32 bit destination
 * formats and forwards to neon_YCoCgToRGB_8u_X(); every other format is
 * handed to the generic implementation. */
static pstatus_t neon_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
                                         BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
                                         UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha)
{
	switch (DstFormat)
	{
		case PIXEL_FORMAT_BGRA32:
		case PIXEL_FORMAT_BGRX32:
			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
			                            shift, 2, 1, 0, 3, withAlpha);

		case PIXEL_FORMAT_RGBA32:
		case PIXEL_FORMAT_RGBX32:
			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
			                            shift, 0, 1, 2, 3, withAlpha);

		case PIXEL_FORMAT_ARGB32:
		case PIXEL_FORMAT_XRGB32:
			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
			                            shift, 1, 2, 3, 0, withAlpha);

		case PIXEL_FORMAT_ABGR32:
		case PIXEL_FORMAT_XBGR32:
			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
			                            shift, 3, 2, 1, 0, withAlpha);

		default:
			break;
	}

	/* No NEON path for this format. */
	return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
	                                   shift, withAlpha);
}
#endif
/* ------------------------------------------------------------------------- */
/* Install the NEON YCoCg -> RGB routine into prims when NEON intrinsics are
 * enabled at compile time; otherwise only log and leave prims untouched. */
void primitives_init_YCoCg_neon_int(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(NEON_INTRINSICS_ENABLED)
	WINPR_UNUSED(prims);
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or neon intrinsics not available");
#else
	WLog_VRB(PRIM_TAG, "NEON optimizations");

	/* Keep the generic table around for unsupported destination formats. */
	generic = primitives_get_generic();
	prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R;
#endif
}

View File

@@ -0,0 +1,837 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Optimized YUV/RGB conversion operations
*
* Copyright 2014 Thomas Erbesdobler
* Copyright 2016-2017 Armin Novak <armin.novak@thincast.com>
* Copyright 2016-2017 Norbert Federa <norbert.federa@thincast.com>
* Copyright 2016-2017 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <freerdp/config.h>
#include <winpr/sysinfo.h>
#include <winpr/crt.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include "prim_YUV.h"
#if defined(NEON_INTRINSICS_ENABLED)
#include <arm_neon.h>
/* Generic primitives table used as fallback by the NEON routines in this file. */
static primitives_t* generic = nullptr;

/* Red channel for 8 pixels.
 *
 * C is the luma term already multiplied by 256, E is (V - 128).
 * D is accepted for a uniform signature but not used. */
static inline uint8x8_t neon_YUV2R_single(uint16x8_t C, int16x8_t D, int16x8_t E)
{
	/* R = (256 * Y + 403 * (V - 128)) >> 8 */

	/* Lower four lanes, computed in 32 bit. */
	const int32x4_t lumaLo = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(C)));
	const int32x4_t chromaLo = vmull_n_s16(vget_low_s16(E), 403);
	const int32x4_t redLo = vshrq_n_s32(vaddq_s32(lumaLo, chromaLo), 8);

	/* Upper four lanes. */
	const int32x4_t lumaHi = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(C)));
	const int32x4_t chromaHi = vmull_n_s16(vget_high_s16(E), 403);
	const int32x4_t redHi = vshrq_n_s32(vaddq_s32(lumaHi, chromaHi), 8);

	/* Saturating narrow: 32 -> 16 bit signed, then 16 bit signed -> 8 bit unsigned. */
	const int16x8_t red16 = vcombine_s16(vqmovn_s32(redLo), vqmovn_s32(redHi));
	return vqmovun_s16(red16);
}
/* Red channel for 16 pixels: applies neon_YUV2R_single() to both halves. */
static inline uint8x8x2_t neon_YUV2R(uint16x8x2_t C, int16x8x2_t D, int16x8x2_t E)
{
	uint8x8x2_t red;
	red.val[0] = neon_YUV2R_single(C.val[0], D.val[0], E.val[0]);
	red.val[1] = neon_YUV2R_single(C.val[1], D.val[1], E.val[1]);
	return red;
}
/* Green channel for 8 pixels.
 *
 * C is the luma term already multiplied by 256, D is (U - 128) and
 * E is (V - 128). */
static inline uint8x8_t neon_YUV2G_single(uint16x8_t C, int16x8_t D, int16x8_t E)
{
	/* G = (256L * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8 */
	const int16x8_t u48 = vmulq_n_s16(D, 48);
	const int16x8_t v120 = vmulq_n_s16(E, 120);

	/* Lower four lanes: widen, subtract the chroma sum from luma, scale down. */
	const int32x4_t chromaLo = vaddl_s16(vget_low_s16(u48), vget_low_s16(v120));
	const int32x4_t lumaLo = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(C)));
	const int16x4_t greenLo = vqmovn_s32(vshrq_n_s32(vsubq_s32(lumaLo, chromaLo), 8));

	/* Upper four lanes. */
	const int32x4_t chromaHi = vaddl_s16(vget_high_s16(u48), vget_high_s16(v120));
	const int32x4_t lumaHi = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(C)));
	const int16x4_t greenHi = vqmovn_s32(vshrq_n_s32(vsubq_s32(lumaHi, chromaHi), 8));

	/* Saturate to 0..255. */
	return vqmovun_s16(vcombine_s16(greenLo, greenHi));
}
/* Green channel for 16 pixels: applies neon_YUV2G_single() to both halves. */
static inline uint8x8x2_t neon_YUV2G(uint16x8x2_t C, int16x8x2_t D, int16x8x2_t E)
{
	uint8x8x2_t green;
	green.val[0] = neon_YUV2G_single(C.val[0], D.val[0], E.val[0]);
	green.val[1] = neon_YUV2G_single(C.val[1], D.val[1], E.val[1]);
	return green;
}
/* Blue channel for 8 pixels.
 *
 * C is the luma term already multiplied by 256, D is (U - 128).
 * E is accepted for a uniform signature but not used. */
static inline uint8x8_t neon_YUV2B_single(uint16x8_t C, int16x8_t D, int16x8_t E)
{
	/* B = (256L * Y + 475 * (U - 128)) >> 8*/

	/* Lower four lanes, computed in 32 bit. */
	const int32x4_t lumaLo = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(C)));
	const int32x4_t chromaLo = vmull_n_s16(vget_low_s16(D), 475);
	const int32x4_t blueLo = vshrq_n_s32(vaddq_s32(lumaLo, chromaLo), 8);

	/* Upper four lanes. */
	const int32x4_t lumaHi = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(C)));
	const int32x4_t chromaHi = vmull_n_s16(vget_high_s16(D), 475);
	const int32x4_t blueHi = vshrq_n_s32(vaddq_s32(lumaHi, chromaHi), 8);

	/* Saturating narrow down to 8 bit unsigned. */
	const int16x8_t blue16 = vcombine_s16(vqmovn_s32(blueLo), vqmovn_s32(blueHi));
	return vqmovun_s16(blue16);
}
/* Blue channel for 16 pixels: applies neon_YUV2B_single() to both halves. */
static inline uint8x8x2_t neon_YUV2B(uint16x8x2_t C, int16x8x2_t D, int16x8x2_t E)
{
	uint8x8x2_t blue;
	blue.val[0] = neon_YUV2B_single(C.val[0], D.val[0], E.val[0]);
	blue.val[1] = neon_YUV2B_single(C.val[1], D.val[1], E.val[1]);
	return blue;
}
/* Write r/g/b for 8 pixels into the 32 bit pixels at pRGB.
 *
 * The existing pixels are loaded first and only the three colour lanes are
 * replaced, so the fourth byte of every pixel is stored back unchanged.
 * aPos is not used. */
static inline void neon_store_bgrx(BYTE* WINPR_RESTRICT pRGB, uint8x8_t r, uint8x8_t g, uint8x8_t b,
                                   uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
{
	uint8x8x4_t pixels = vld4_u8(pRGB);

	pixels.val[rPos] = r;
	pixels.val[gPos] = g;
	pixels.val[bPos] = b;

	vst4_u8(pRGB, pixels);
}
/* Convert 16 pixels from YUV to RGB and store them at pRGB.
 *
 * Y holds 16 luma bytes; D and E hold the matching (U - 128) and (V - 128)
 * values as 16 bit lanes. */
static inline void neon_YuvToRgbPixel(BYTE* pRGB, uint8x8x2_t Y, int16x8x2_t D, int16x8x2_t E,
                                      const uint8_t rPos, const uint8_t gPos, const uint8_t bPos,
                                      const uint8_t aPos)
{
	/* Y * 256 == Y << 8 */
	uint16x8x2_t luma256;
	luma256.val[0] = vshlq_n_u16(vmovl_u8(Y.val[0]), 8);
	luma256.val[1] = vshlq_n_u16(vmovl_u8(Y.val[1]), 8);

	const uint8x8x2_t red = neon_YUV2R(luma256, D, E);
	const uint8x8x2_t green = neon_YUV2G(luma256, D, E);
	const uint8x8x2_t blue = neon_YUV2B(luma256, D, E);

	/* First 8 pixels, then the following 8 pixels (32 bytes further). */
	BYTE* firstHalf = pRGB;
	BYTE* secondHalf = pRGB + sizeof(uint8x8x4_t);
	neon_store_bgrx(firstHalf, red.val[0], green.val[0], blue.val[0], rPos, gPos, bPos, aPos);
	neon_store_bgrx(secondHalf, red.val[1], green.val[1], blue.val[1], rPos, gPos, bPos, aPos);
}
/* Load 8 subsampled chroma bytes starting at pV[x / 2], subtract 128 and
 * duplicate every value so the result covers 16 horizontally adjacent pixels. */
static inline int16x8x2_t loadUV(const BYTE* WINPR_RESTRICT pV, size_t x)
{
	const uint8x8_t raw = vld1_u8(pV + (x / 2));
	const int16x8_t widened = vreinterpretq_s16_u16(vmovl_u8(raw));
	const int16x8_t centered = vsubq_s16(widened, vdupq_n_s16(128));

	/* Zipping the vector with itself yields a0 a0 a1 a1 ... */
	return vzipq_s16(centered, centered);
}
/* Scalar conversion of one pixel: writes the r/g/b bytes at their positions
 * inside the pixel at pRGB. The byte at aPos is not written. */
static inline void neon_write_pixel(BYTE* pRGB, BYTE Y, BYTE U, BYTE V, const uint8_t rPos,
                                    const uint8_t gPos, const uint8_t bPos, const uint8_t aPos)
{
	const BYTE red = YUV2R(Y, U, V);
	const BYTE green = YUV2G(Y, U, V);
	const BYTE blue = YUV2B(Y, U, V);

	BYTE* const pixel = pRGB;
	pixel[rPos] = red;
	pixel[gPos] = green;
	pixel[bPos] = blue;
}
/* Convert two vertically adjacent YUV420 rows that share one chroma row.
 *
 * pY[0]/pY[1] and pRGB[0]/pRGB[1] are the upper and lower row. Blocks of 16
 * pixels use NEON, the remainder is converted pixel pair by pixel pair and,
 * for an odd width, a final single column. */
static inline void neon_YUV420ToX_DOUBLE_ROW(const BYTE* WINPR_RESTRICT pY[2],
                                             const BYTE* WINPR_RESTRICT pU,
                                             const BYTE* WINPR_RESTRICT pV,
                                             BYTE* WINPR_RESTRICT pRGB[2], size_t width,
                                             const uint8_t rPos, const uint8_t gPos,
                                             const uint8_t bPos, const uint8_t aPos)
{
	const size_t vectorEnd = width - width % 16;
	const size_t pairEnd = width - width % 2;
	UINT32 x = 0;

	/* NEON: 16 pixels of both rows per iteration. */
	while (x < vectorEnd)
	{
		const uint8x16_t upperRaw = vld1q_u8(&pY[0][x]);
		const uint8x8x2_t upper = { { vget_low_u8(upperRaw), vget_high_u8(upperRaw) } };
		const int16x8x2_t D = loadUV(pU, x);
		const int16x8x2_t E = loadUV(pV, x);
		neon_YuvToRgbPixel(&pRGB[0][4ULL * x], upper, D, E, rPos, gPos, bPos, aPos);

		const uint8x16_t lowerRaw = vld1q_u8(&pY[1][x]);
		const uint8x8x2_t lower = { { vget_low_u8(lowerRaw), vget_high_u8(lowerRaw) } };
		neon_YuvToRgbPixel(&pRGB[1][4ULL * x], lower, D, E, rPos, gPos, bPos, aPos);

		x += 16;
	}

	/* Scalar: one 2x2 block (single chroma sample) per iteration. */
	while (x < pairEnd)
	{
		const BYTE u = pU[x / 2];
		const BYTE v = pV[x / 2];

		neon_write_pixel(&pRGB[0][4 * x], pY[0][x], u, v, rPos, gPos, bPos, aPos);
		neon_write_pixel(&pRGB[0][4 * (1ULL + x)], pY[0][1ULL + x], u, v, rPos, gPos, bPos, aPos);
		neon_write_pixel(&pRGB[1][4 * x], pY[1][x], u, v, rPos, gPos, bPos, aPos);
		neon_write_pixel(&pRGB[1][4 * (1ULL + x)], pY[1][1ULL + x], u, v, rPos, gPos, bPos, aPos);

		x += 2;
	}

	/* Odd width: last column of both rows. */
	while (x < width)
	{
		const BYTE u = pU[x / 2];
		const BYTE v = pV[x / 2];

		neon_write_pixel(&pRGB[0][4 * x], pY[0][x], u, v, rPos, gPos, bPos, aPos);
		neon_write_pixel(&pRGB[1][4 * x], pY[1][x], u, v, rPos, gPos, bPos, aPos);

		x++;
	}
}
/* Convert a single YUV420 row.
 *
 * Blocks of 16 pixels use NEON, the remainder is converted pixel pair by
 * pixel pair and, for an odd width, a final single pixel. */
static inline void neon_YUV420ToX_SINGLE_ROW(const BYTE* WINPR_RESTRICT pY,
                                             const BYTE* WINPR_RESTRICT pU,
                                             const BYTE* WINPR_RESTRICT pV,
                                             BYTE* WINPR_RESTRICT pRGB, size_t width,
                                             const uint8_t rPos, const uint8_t gPos,
                                             const uint8_t bPos, const uint8_t aPos)
{
	const size_t vectorEnd = width - width % 16;
	const size_t pairEnd = width - width % 2;
	UINT32 x = 0;

	/* NEON: 16 pixels per iteration. */
	while (x < vectorEnd)
	{
		const uint8x16_t lumaRaw = vld1q_u8(&pY[x]);
		const uint8x8x2_t luma = { { vget_low_u8(lumaRaw), vget_high_u8(lumaRaw) } };
		const int16x8x2_t D = loadUV(pU, x);
		const int16x8x2_t E = loadUV(pV, x);
		neon_YuvToRgbPixel(&pRGB[4ULL * x], luma, D, E, rPos, gPos, bPos, aPos);

		x += 16;
	}

	/* Scalar: two pixels sharing one chroma sample per iteration. */
	while (x < pairEnd)
	{
		const BYTE u = pU[x / 2];
		const BYTE v = pV[x / 2];

		neon_write_pixel(&pRGB[4 * x], pY[x], u, v, rPos, gPos, bPos, aPos);
		neon_write_pixel(&pRGB[4 * (1ULL + x)], pY[1ULL + x], u, v, rPos, gPos, bPos, aPos);

		x += 2;
	}

	/* Odd width: last pixel. */
	while (x < width)
	{
		const BYTE u = pU[x / 2];
		const BYTE v = pV[x / 2];

		neon_write_pixel(&pRGB[4 * x], pY[x], u, v, rPos, gPos, bPos, aPos);

		x++;
	}
}
/* Convert a planar YUV420 image to a 32 bit RGB layout.
 *
 * pSrc/srcStep   Y, U and V planes with their row strides in bytes
 * pDst/dstStep   destination pixels and destination row stride in bytes
 * roi            width and height of the region in pixels
 * rPos..aPos     byte index of each channel inside a destination pixel
 *
 * Rows are processed in pairs because two luma rows share one chroma row;
 * a trailing single row is handled separately for odd heights.
 * Always returns PRIMITIVES_SUCCESS. */
static inline pstatus_t neon_YUV420ToX(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
                                       BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                       const prim_size_t* WINPR_RESTRICT roi, const uint8_t rPos,
                                       const uint8_t gPos, const uint8_t bPos, const uint8_t aPos)
{
	const UINT32 nWidth = roi->width;
	const UINT32 nHeight = roi->height;

	WINPR_ASSERT(nHeight > 0);

	UINT32 y = 0;

	/* "1 + y < nHeight" instead of "y < nHeight - 1": does not wrap around
	 * when nHeight is 0 and assertions are compiled out. */
	for (; (1ULL + y) < nHeight; y += 2)
	{
		/* All row offsets are computed in 64 bit so that large surfaces cannot
		 * wrap a 32 bit product (the odd rows already did this). */
		const uint8_t* pY[2] = { pSrc[0] + (1ULL * y) * srcStep[0],
			                     pSrc[0] + (1ULL + y) * srcStep[0] };
		const uint8_t* pU = pSrc[1] + (1ULL * (y / 2)) * srcStep[1];
		const uint8_t* pV = pSrc[2] + (1ULL * (y / 2)) * srcStep[2];
		uint8_t* pRGB[2] = { pDst + (1ULL * y) * dstStep, pDst + (1ULL + y) * dstStep };

		neon_YUV420ToX_DOUBLE_ROW(pY, pU, pV, pRGB, nWidth, rPos, gPos, bPos, aPos);
	}

	/* Odd height: the last row has no partner. */
	for (; y < nHeight; y++)
	{
		const uint8_t* pY = pSrc[0] + (1ULL * y) * srcStep[0];
		const uint8_t* pU = pSrc[1] + (1ULL * (y / 2)) * srcStep[1];
		const uint8_t* pV = pSrc[2] + (1ULL * (y / 2)) * srcStep[2];
		uint8_t* pRGB = pDst + (1ULL * y) * dstStep;

		neon_YUV420ToX_SINGLE_ROW(pY, pU, pV, pRGB, nWidth, rPos, gPos, bPos, aPos);
	}

	return PRIMITIVES_SUCCESS;
}
/* NEON entry point for YUV420 -> RGB conversion.
 *
 * Maps the supported 32 bit destination formats to channel byte positions
 * and forwards to neon_YUV420ToX(); every other format is handed to the
 * generic implementation. */
static pstatus_t neon_YUV420ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[3],
                                            const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
                                            UINT32 dstStep, UINT32 DstFormat,
                                            const prim_size_t* WINPR_RESTRICT roi)
{
	uint8_t rPos = 0;
	uint8_t gPos = 0;
	uint8_t bPos = 0;
	uint8_t aPos = 0;

	switch (DstFormat)
	{
		case PIXEL_FORMAT_BGRA32:
		case PIXEL_FORMAT_BGRX32:
			rPos = 2;
			gPos = 1;
			bPos = 0;
			aPos = 3;
			break;

		case PIXEL_FORMAT_RGBA32:
		case PIXEL_FORMAT_RGBX32:
			rPos = 0;
			gPos = 1;
			bPos = 2;
			aPos = 3;
			break;

		case PIXEL_FORMAT_ARGB32:
		case PIXEL_FORMAT_XRGB32:
			rPos = 1;
			gPos = 2;
			bPos = 3;
			aPos = 0;
			break;

		case PIXEL_FORMAT_ABGR32:
		case PIXEL_FORMAT_XBGR32:
			rPos = 3;
			gPos = 2;
			bPos = 1;
			aPos = 0;
			break;

		default:
			/* No NEON path for this format. */
			return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
	}

	return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, rPos, gPos, bPos, aPos);
}
/* Widen 8 chroma bytes to 16 bit lanes and subtract 128 from each. */
static inline int16x8_t loadUVreg(uint8x8_t Vraw)
{
	const int16x8_t widened = vreinterpretq_s16_u16(vmovl_u8(Vraw));
	return vsubq_s16(widened, vdupq_n_s16(128));
}
/* Split 16 chroma bytes into two halves and convert each half with
 * loadUVreg(): val[0] holds the lower 8 values, val[1] the upper 8. */
static inline int16x8x2_t loadUV444(uint8x16_t Vld)
{
	int16x8x2_t centered;
	centered.val[0] = loadUVreg(vget_low_u8(Vld));
	centered.val[1] = loadUVreg(vget_high_u8(Vld));
	return centered;
}
/* Scalar chroma filter for one 2x2 block: computes 4 * U[0][0] minus the
 * sum of the three other samples and stores CONDITIONAL_CLIP() of that
 * value (with the original sample as reference) back into U[0][0]. */
static inline void avgUV(BYTE U[2][2])
{
	const BYTE anchor = U[0][0];
	const INT16 others = (INT16)U[0][1] + U[1][0] + U[1][1];
	const INT16 scaled = (INT16)anchor << 2;
	const INT16 wavg = scaled - others;

	U[0][0] = CONDITIONAL_CLIP(wavg, anchor);
}
/* Vector chroma filter for eight 2x2 blocks.
 *
 * pU[0] and pU[1] hold 16 chroma samples of two vertically adjacent rows.
 * For every 2x2 block the top-left sample (even column of pU[0]) is replaced
 * by the saturated value of 4 * U00 - (U01 + U10 + U11), unless that value
 * differs from U00 by less than 30, in which case U00 is kept.
 * Only pU[0] is rewritten; its odd columns and all of pU[1] stay unchanged. */
static inline void neon_avgUV(uint8x16_t pU[2])
{
/* put even and odd values into different registers.
* U 0/0 is in lower half */
const uint8x16x2_t usplit = vuzpq_u8(pU[0], pU[1]);
const uint8x16_t ueven = usplit.val[0];
const uint8x16_t uodd = usplit.val[1];
/* Low halves come from pU[0] (upper row), high halves from pU[1] (lower row). */
const uint8x8_t u00 = vget_low_u8(ueven);
const uint8x8_t u01 = vget_low_u8(uodd);
const uint8x8_t u10 = vget_high_u8(ueven);
const uint8x8_t u11 = vget_high_u8(uodd);
/* Create sum of U01 + U10 + U11 */
const uint16x8_t uoddsum = vaddl_u8(u01, u10);
const uint16x8_t usum = vaddq_u16(uoddsum, vmovl_u8(u11));
/* U00 * 4 */
const uint16x8_t umul = vshll_n_u8(u00, 2);
/* U00 * 4 - (U01 + U10 + U11) */
const int16x8_t wavg = vsubq_s16(vreinterpretq_s16_u16(umul), vreinterpretq_s16_u16(usum));
/* saturate to 0..255 */
const uint8x8_t avg = vqmovun_s16(wavg);
/* abs(u00 - avg) */
const uint8x8_t absdiff = vabd_u8(avg, u00);
/* (diff < 30) ? u00 : avg */
const uint8x8_t mask = vclt_u8(absdiff, vdup_n_u8(30));
/* out1 = u00 & mask */
const uint8x8_t out1 = vand_u8(u00, mask);
/* invmask = ~mask */
const uint8x8_t notmask = vmvn_u8(mask);
/* out2 = avg & invmask */
const uint8x8_t out2 = vand_u8(avg, notmask);
/* out = out1 | out2 */
const uint8x8_t out = vorr_u8(out1, out2);
/* re-interleave the filtered even columns with the untouched odd columns */
const uint8x8x2_t ua = vzip_u8(out, u01);
const uint8x16_t u = vcombine_u8(ua.val[0], ua.val[1]);
pU[0] = u;
}
/* Convert a single YUV444 row (one chroma sample per pixel).
 *
 * The width is asserted to be even. Blocks of 16 pixels use NEON, the
 * remainder is converted two pixels at a time with scalar code.
 * Always returns PRIMITIVES_SUCCESS. */
static inline pstatus_t neon_YUV444ToX_SINGLE_ROW(const BYTE* WINPR_RESTRICT pY,
                                                  const BYTE* WINPR_RESTRICT pU,
                                                  const BYTE* WINPR_RESTRICT pV,
                                                  BYTE* WINPR_RESTRICT pRGB, size_t width,
                                                  const uint8_t rPos, const uint8_t gPos,
                                                  const uint8_t bPos, const uint8_t aPos)
{
	WINPR_ASSERT(width % 2 == 0);

	const size_t vectorEnd = width - width % 16;
	size_t x = 0;

	/* NEON: 16 pixels per iteration. */
	while (x < vectorEnd)
	{
		const uint8x16_t chromaU = vld1q_u8(&pU[x]);
		const uint8x16_t chromaV = vld1q_u8(&pV[x]);
		const uint8x16_t lumaRaw = vld1q_u8(&pY[x]);
		const uint8x8x2_t luma = { { vget_low_u8(lumaRaw), vget_high_u8(lumaRaw) } };
		const int16x8x2_t D0 = loadUV444(chromaU);
		const int16x8x2_t E0 = loadUV444(chromaV);

		neon_YuvToRgbPixel(&pRGB[4ULL * x], luma, D0, E0, rPos, gPos, bPos, aPos);
		x += 16;
	}

	/* Scalar: the pixel at x followed by the pixel at x + 1. */
	while (x < width)
	{
		BYTE* pair = &pRGB[x * 4];

		neon_write_pixel(&pair[0], pY[x], pU[x], pV[x], rPos, gPos, bPos, aPos);
		neon_write_pixel(&pair[4], pY[x + 1], pU[x + 1], pV[x + 1], rPos, gPos, bPos, aPos);

		x += 2;
	}

	return PRIMITIVES_SUCCESS;
}
/* Convert two consecutive rows of planar YUV444 to 32-bit pixels.
 * Before conversion the U and V samples are passed through the chroma filter
 * (neon_avgUV for 16-pixel blocks, avgUV for the 2-pixel tail), which rewrites the
 * top-left sample of every 2x2 block in the first row only.
 * pY/pU/pV/pRGB each hold the pointers for row 0 and row 1; rPos/gPos/bPos/aPos are the
 * byte positions of the channels inside each output pixel.
 * Always returns PRIMITIVES_SUCCESS. */
static inline pstatus_t neon_YUV444ToX_DOUBLE_ROW(const BYTE* WINPR_RESTRICT pY[2],
const BYTE* WINPR_RESTRICT pU[2],
const BYTE* WINPR_RESTRICT pV[2],
BYTE* WINPR_RESTRICT pRGB[2], size_t width,
const uint8_t rPos, const uint8_t gPos,
const uint8_t bPos, const uint8_t aPos)
{
WINPR_ASSERT(width % 2 == 0);
size_t x = 0;
/* Vector part: 16 pixels of both rows per iteration */
for (; x < width - width % 16; x += 16)
{
uint8x16_t U[2] = { vld1q_u8(&pU[0][x]), vld1q_u8(&pU[1][x]) };
/* filters U[0] in place, U[1] is only read */
neon_avgUV(U);
uint8x16_t V[2] = { vld1q_u8(&pV[0][x]), vld1q_u8(&pV[1][x]) };
neon_avgUV(V);
/* first row */
const uint8x16_t Y0raw = vld1q_u8(&pY[0][x]);
const uint8x8x2_t Y0 = { { vget_low_u8(Y0raw), vget_high_u8(Y0raw) } };
const int16x8x2_t D0 = loadUV444(U[0]);
const int16x8x2_t E0 = loadUV444(V[0]);
neon_YuvToRgbPixel(&pRGB[0][4ULL * x], Y0, D0, E0, rPos, gPos, bPos, aPos);
/* second row */
const uint8x16_t Y1raw = vld1q_u8(&pY[1][x]);
const uint8x8x2_t Y1 = { { vget_low_u8(Y1raw), vget_high_u8(Y1raw) } };
const int16x8x2_t D1 = loadUV444(U[1]);
const int16x8x2_t E1 = loadUV444(V[1]);
neon_YuvToRgbPixel(&pRGB[1][4ULL * x], Y1, D1, E1, rPos, gPos, bPos, aPos);
}
/* Scalar tail: one 2x2 block per iteration */
for (; x < width; x += 2)
{
BYTE* rgb[2] = { &pRGB[0][x * 4], &pRGB[1][x * 4] };
BYTE U[2][2] = { { pU[0][x], pU[0][x + 1] }, { pU[1][x], pU[1][x + 1] } };
avgUV(U);
BYTE V[2][2] = { { pV[0][x], pV[0][x + 1] }, { pV[1][x], pV[1][x + 1] } };
avgUV(V);
/* i selects the row, j the column inside the 2x2 block */
for (size_t i = 0; i < 2; i++)
{
for (size_t j = 0; j < 2; j++)
{
const BYTE y = pY[i][x + j];
const BYTE u = U[i][j];
const BYTE v = V[i][j];
neon_write_pixel(&rgb[i][4 * (j)], y, u, v, rPos, gPos, bPos, aPos);
}
}
}
return PRIMITIVES_SUCCESS;
}
/* Convert a planar YUV444 image to 32-bit pixels.
 * Rows are processed in pairs through neon_YUV444ToX_DOUBLE_ROW; if the height is odd
 * the last row goes through neon_YUV444ToX_SINGLE_ROW.
 * srcStep[] / dstStep are the row strides in bytes, roi supplies width and height and
 * rPos/gPos/bPos/aPos are the byte positions of the channels inside each output pixel.
 * Returns the first non-success status of a row helper, else PRIMITIVES_SUCCESS. */
static inline pstatus_t neon_YUV444ToX(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
                                       BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                       const prim_size_t* WINPR_RESTRICT roi, const uint8_t rPos,
                                       const uint8_t gPos, const uint8_t bPos, const uint8_t aPos)
{
	WINPR_ASSERT(roi);

	const UINT32 nWidth = roi->width;
	const UINT32 nHeight = roi->height;
	size_t row = 0;

	/* row pairs */
	while (row < nHeight - nHeight % 2)
	{
		const size_t below = row + 1;
		const uint8_t* WINPR_RESTRICT pY[2] = { pSrc[0] + row * srcStep[0],
			                                    pSrc[0] + below * srcStep[0] };
		const uint8_t* WINPR_RESTRICT pU[2] = { pSrc[1] + row * srcStep[1],
			                                    pSrc[1] + below * srcStep[1] };
		const uint8_t* WINPR_RESTRICT pV[2] = { pSrc[2] + row * srcStep[2],
			                                    pSrc[2] + below * srcStep[2] };
		uint8_t* WINPR_RESTRICT pRGB[2] = { &pDst[row * dstStep], &pDst[below * dstStep] };

		const pstatus_t status =
		    neon_YUV444ToX_DOUBLE_ROW(pY, pU, pV, pRGB, nWidth, rPos, gPos, bPos, aPos);
		if (status != PRIMITIVES_SUCCESS)
			return status;
		row += 2;
	}

	/* trailing row for odd heights */
	while (row < nHeight)
	{
		const uint8_t* WINPR_RESTRICT pY = pSrc[0] + row * srcStep[0];
		const uint8_t* WINPR_RESTRICT pU = pSrc[1] + row * srcStep[1];
		const uint8_t* WINPR_RESTRICT pV = pSrc[2] + row * srcStep[2];
		uint8_t* WINPR_RESTRICT pRGB = &pDst[row * dstStep];

		const pstatus_t status =
		    neon_YUV444ToX_SINGLE_ROW(pY, pU, pV, pRGB, nWidth, rPos, gPos, bPos, aPos);
		if (status != PRIMITIVES_SUCCESS)
			return status;
		row++;
	}

	return PRIMITIVES_SUCCESS;
}
/* YUV444 -> 32-bit pixel entry point.
 * Maps the destination format to the byte positions of R, G, B and A inside a pixel and
 * runs the NEON converter; any format not listed here is forwarded to the generic
 * implementation. */
static pstatus_t neon_YUV444ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[3],
                                            const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
                                            UINT32 dstStep, UINT32 DstFormat,
                                            const prim_size_t* WINPR_RESTRICT roi)
{
	/* byte order B,G,R,A */
	if ((DstFormat == PIXEL_FORMAT_BGRA32) || (DstFormat == PIXEL_FORMAT_BGRX32))
		return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);

	/* byte order R,G,B,A */
	if ((DstFormat == PIXEL_FORMAT_RGBA32) || (DstFormat == PIXEL_FORMAT_RGBX32))
		return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);

	/* byte order A,R,G,B */
	if ((DstFormat == PIXEL_FORMAT_ARGB32) || (DstFormat == PIXEL_FORMAT_XRGB32))
		return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);

	/* byte order A,B,G,R */
	if ((DstFormat == PIXEL_FORMAT_ABGR32) || (DstFormat == PIXEL_FORMAT_XBGR32))
		return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);

	return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
}
/* Merge the main (luma) YUV420 frame into the YUV444 destination planes.
 * - The Y plane of the ROI is copied row by row.
 * - Every source U/V sample is replicated into a 2x2 block of the destination U/V
 *   planes (two columns in row 2*y and two columns in the row below).
 * pSrcRaw/srcStep describe the YUV420 input, pDstRaw/dstStep the YUV444 output; roi is
 * the rectangle to process in destination coordinates.
 * Always returns PRIMITIVES_SUCCESS. */
static pstatus_t neon_LumaToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[3], const UINT32 srcStep[3],
BYTE* WINPR_RESTRICT pDstRaw[3], const UINT32 dstStep[3],
const RECTANGLE_16* WINPR_RESTRICT roi)
{
const UINT32 nWidth = roi->right - roi->left;
const UINT32 nHeight = roi->bottom - roi->top;
/* chroma dimensions, rounded up */
const UINT32 halfWidth = (nWidth + 1) / 2;
const UINT32 halfHeight = (nHeight + 1) / 2;
const UINT32 evenY = 0;
/* Source chroma planes are addressed at half the ROI offset, destination planes at
 * the full offset. */
const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
pDstRaw[1] + roi->top * dstStep[1] + roi->left,
pDstRaw[2] + roi->top * dstStep[2] + roi->left };
/* Y data is already here... */
/* B1 */
for (UINT32 y = 0; y < nHeight; y++)
{
const BYTE* Ym = pSrc[0] + srcStep[0] * y;
BYTE* pY = pDst[0] + dstStep[0] * y;
memcpy(pY, Ym, nWidth);
}
/* The first half of U, V are already here part of this frame. */
/* B2 and B3 */
for (UINT32 y = 0; y < halfHeight; y++)
{
const UINT32 val2y = (2 * y + evenY);
const BYTE* Um = pSrc[1] + srcStep[1] * y;
const BYTE* Vm = pSrc[2] + srcStep[2] * y;
BYTE* pU = pDst[1] + dstStep[1] * val2y;
BYTE* pV = pDst[2] + dstStep[2] * val2y;
/* pU1/pV1 address the destination row directly below pU/pV */
BYTE* pU1 = pU + dstStep[1];
BYTE* pV1 = pV + dstStep[2];
UINT32 x = 0;
/* 16 source samples -> 32 destination samples per row. The strict '<' leaves a
 * final block of exactly 16 samples to the scalar loop below. */
for (; x + 16 < halfWidth; x += 16)
{
{
const uint8x16_t u = vld1q_u8(Um);
uint8x16x2_t u2x;
u2x.val[0] = u;
u2x.val[1] = u;
/* interleaving u with itself duplicates every sample horizontally */
vst2q_u8(pU, u2x);
vst2q_u8(pU1, u2x);
Um += 16;
pU += 32;
pU1 += 32;
}
{
const uint8x16_t v = vld1q_u8(Vm);
uint8x16x2_t v2x;
v2x.val[0] = v;
v2x.val[1] = v;
vst2q_u8(pV, v2x);
vst2q_u8(pV1, v2x);
Vm += 16;
pV += 32;
pV1 += 32;
}
}
/* scalar remainder: write each sample to a 2x2 block */
for (; x < halfWidth; x++)
{
const BYTE u = *Um++;
const BYTE v = *Vm++;
*pU++ = u;
*pU++ = u;
*pU1++ = u;
*pU1++ = u;
*pV++ = v;
*pV++ = v;
*pV1++ = v;
*pV1++ = v;
}
}
return PRIMITIVES_SUCCESS;
}
/* Merge an AVC444 (v1) auxiliary chroma frame into the YUV444 destination planes.
 * - B4/B5: rows of the auxiliary Y plane are copied to the odd rows of the destination
 *   U plane (first 8 rows of every group of 16) or V plane (last 8 rows of the group).
 * - B6/B7: the auxiliary U and V planes are written to the odd columns of the even
 *   rows of the destination U and V planes.
 * pSrcRaw/srcStep describe the auxiliary frame, pDstRaw/dstStep the YUV444 output; roi
 * is the rectangle to process in destination coordinates.
 * Always returns PRIMITIVES_SUCCESS. */
static pstatus_t neon_ChromaV1ToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[3],
const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
const UINT32 dstStep[3],
const RECTANGLE_16* WINPR_RESTRICT roi)
{
const UINT32 mod = 16;
/* running row counters for the U and V destination planes in the B4/B5 pass */
UINT32 uY = 0;
UINT32 vY = 0;
const UINT32 nWidth = roi->right - roi->left;
const UINT32 nHeight = roi->bottom - roi->top;
const UINT32 halfWidth = (nWidth) / 2;
const UINT32 halfHeight = (nHeight) / 2;
const UINT32 oddY = 1;
const UINT32 evenY = 0;
const UINT32 oddX = 1;
/* The auxiliary frame is aligned to multiples of 16x16.
 * We need the padded height for B4 and B5 conversion. */
const UINT32 padHeigth = nHeight + 16 - nHeight % 16;
/* number of trailing chroma samples per row that do not fill a 16-lane vector */
const UINT32 halfPad = halfWidth % 16;
const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
pDstRaw[1] + roi->top * dstStep[1] + roi->left,
pDstRaw[2] + roi->top * dstStep[2] + roi->left };
/* The second half of U and V is a bit more tricky... */
/* B4 and B5 */
for (UINT32 y = 0; y < padHeigth; y++)
{
const BYTE* Ya = pSrc[0] + srcStep[0] * y;
BYTE* pX;
/* first half of each 16-row group feeds U, second half feeds V */
if ((y) % mod < (mod + 1) / 2)
{
const UINT32 pos = (2 * uY++ + oddY);
/* rows that fall outside the ROI are skipped (the counter still advances) */
if (pos >= nHeight)
continue;
pX = pDst[1] + dstStep[1] * pos;
}
else
{
const UINT32 pos = (2 * vY++ + oddY);
if (pos >= nHeight)
continue;
pX = pDst[2] + dstStep[2] * pos;
}
memcpy(pX, Ya, nWidth);
}
/* B6 and B7 */
for (UINT32 y = 0; y < halfHeight; y++)
{
const UINT32 val2y = (y * 2 + evenY);
const BYTE* Ua = pSrc[1] + srcStep[1] * y;
const BYTE* Va = pSrc[2] + srcStep[2] * y;
BYTE* pU = pDst[1] + dstStep[1] * val2y;
BYTE* pV = pDst[2] + dstStep[2] * val2y;
UINT32 x = 0;
/* Load 32 destination bytes de-interleaved, replace the odd lanes with 16
 * auxiliary samples and store them back interleaved. */
for (; x < halfWidth - halfPad; x += 16)
{
{
uint8x16x2_t u = vld2q_u8(&pU[2 * x]);
u.val[1] = vld1q_u8(&Ua[x]);
vst2q_u8(&pU[2 * x], u);
}
{
uint8x16x2_t v = vld2q_u8(&pV[2 * x]);
v.val[1] = vld1q_u8(&Va[x]);
vst2q_u8(&pV[2 * x], v);
}
}
/* scalar remainder: write the odd columns one by one */
for (; x < halfWidth; x++)
{
const UINT32 val2x1 = (x * 2 + oddX);
pU[val2x1] = Ua[x];
pV[val2x1] = Va[x];
}
}
return PRIMITIVES_SUCCESS;
}
/* Merge an AVC444v2 auxiliary chroma frame into the YUV444 destination planes.
 * - B4/B5: every row of the auxiliary Y plane carries U samples in its left half and V
 *   samples starting at nTotalWidth / 2; both are written to the odd columns of the
 *   matching destination row.
 * - B6-B9: the auxiliary U and V planes each carry two blocks per row (the second one
 *   starting at nTotalWidth / 4); they are written to columns 4x+0 and 4x+2 of the odd
 *   destination rows.
 * nTotalHeight is not used by this implementation.
 * Always returns PRIMITIVES_SUCCESS. */
static pstatus_t neon_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
UINT32 nTotalWidth, UINT32 nTotalHeight,
BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
const RECTANGLE_16* WINPR_RESTRICT roi)
{
const UINT32 nWidth = roi->right - roi->left;
const UINT32 nHeight = roi->bottom - roi->top;
const UINT32 halfWidth = (nWidth + 1) / 2;
/* samples per row that do not fill a 16-lane vector (half resolution) */
const UINT32 halfPad = halfWidth % 16;
const UINT32 halfHeight = (nHeight + 1) / 2;
const UINT32 quaterWidth = (nWidth + 3) / 4;
/* samples per row that do not fill a 16-lane vector (quarter resolution) */
const UINT32 quaterPad = quaterWidth % 16;
/* B4 and B5: odd UV values for width/2, height */
for (UINT32 y = 0; y < nHeight; y++)
{
const UINT32 yTop = y + roi->top;
const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
const BYTE* pYaV = pYaU + nTotalWidth / 2;
BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left;
BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left;
UINT32 x = 0;
/* replace the odd lanes of 32 destination bytes with 16 auxiliary samples */
for (; x < halfWidth - halfPad; x += 16)
{
{
uint8x16x2_t u = vld2q_u8(&pU[2 * x]);
u.val[1] = vld1q_u8(&pYaU[x]);
vst2q_u8(&pU[2 * x], u);
}
{
uint8x16x2_t v = vld2q_u8(&pV[2 * x]);
v.val[1] = vld1q_u8(&pYaV[x]);
vst2q_u8(&pV[2 * x], v);
}
}
/* scalar remainder */
for (; x < halfWidth; x++)
{
const UINT32 odd = 2 * x + 1;
pU[odd] = pYaU[x];
pV[odd] = pYaV[x];
}
}
/* B6 - B9 */
for (UINT32 y = 0; y < halfHeight; y++)
{
const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
const BYTE* pUaV = pUaU + nTotalWidth / 4;
const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
const BYTE* pVaV = pVaU + nTotalWidth / 4;
/* destination is the odd row 2*y+1 of the ROI */
BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;
UINT32 x = 0;
/* Load 64 destination bytes split into 4 lanes, replace lanes 0 and 2 and store
 * them back; lanes 1 and 3 keep their previous content. */
for (; x < quaterWidth - quaterPad; x += 16)
{
{
uint8x16x4_t u = vld4q_u8(&pU[4 * x]);
u.val[0] = vld1q_u8(&pUaU[x]);
u.val[2] = vld1q_u8(&pVaU[x]);
vst4q_u8(&pU[4 * x], u);
}
{
uint8x16x4_t v = vld4q_u8(&pV[4 * x]);
v.val[0] = vld1q_u8(&pUaV[x]);
v.val[2] = vld1q_u8(&pVaV[x]);
vst4q_u8(&pV[4 * x], v);
}
}
/* scalar remainder */
for (; x < quaterWidth; x++)
{
pU[4 * x + 0] = pUaU[x];
pV[4 * x + 0] = pUaV[x];
pU[4 * x + 2] = pVaU[x];
pV[4 * x + 2] = pVaV[x];
}
}
return PRIMITIVES_SUCCESS;
}
/* Dispatch an AVC444 sub-frame to the matching NEON merge routine.
 * Returns -1 when the source array, the destination array, any of their three plane
 * pointers or roi is NULL, and also for an unknown frame type. */
static pstatus_t neon_YUV420CombineToYUV444(avc444_frame_type type,
                                            const BYTE* WINPR_RESTRICT pSrc[3],
                                            const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight,
                                            BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
                                            const RECTANGLE_16* WINPR_RESTRICT roi)
{
	if (!pSrc || !pDst || !roi)
		return -1;

	for (size_t plane = 0; plane < 3; plane++)
	{
		if (!pSrc[plane] || !pDst[plane])
			return -1;
	}

	if (type == AVC444_LUMA)
		return neon_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);
	if (type == AVC444_CHROMAv1)
		return neon_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);
	if (type == AVC444_CHROMAv2)
		return neon_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);

	return -1;
}
#endif
/* Install the NEON YUV primitives into prims.
 * Without NEON_INTRINSICS_ENABLED this only emits a verbose log message and leaves
 * prims untouched. */
void primitives_init_YUV_neon_int(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(NEON_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or neon intrinsics not available");
	WINPR_UNUSED(prims);
#else
	/* the generic table is the fallback for formats the NEON code does not cover */
	generic = primitives_get_generic();
	WLog_VRB(PRIM_TAG, "NEON optimizations");
	prims->YUV420CombineToYUV444 = neon_YUV420CombineToYUV444;
	prims->YUV444ToRGB_8u_P3AC4R = neon_YUV444ToRGB_8u_P3AC4R;
	prims->YUV420ToRGB_8u_P3AC4R = neon_YUV420ToRGB_8u_P3AC4R;
#endif
}

View File

@@ -0,0 +1,274 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized Color conversion operations.
* vi:ts=4 sw=4:
*
* Copyright 2011 Stephen Erisman
* Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
* Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_internal.h"
#include "prim_colors.h"
/*---------------------------------------------------------------------------*/
#if defined(NEON_INTRINSICS_ENABLED)
#include <arm_neon.h>
static primitives_t* generic = nullptr;
/* Convert three planes of 16-bit Y, Cb and Cr samples to interleaved 32-bit pixels.
 *
 * pSrc     the Y, Cb and Cr planes (all share srcStep)
 * srcStep  bytes between source rows
 * pDst     destination buffer, 4 bytes per pixel
 * dstStep  bytes between destination rows
 * roi      width and height to convert
 * rPos, gPos, bPos, aPos  byte positions of the channels inside each output pixel
 *
 * Eight pixels per iteration are converted with NEON, the remaining width % 8 pixels
 * of every row with scalar code. Alpha is always written as 0xFF.
 * Always returns PRIMITIVES_SUCCESS. */
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
                                                BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                                const prim_size_t* WINPR_RESTRICT roi, uint8_t rPos,
                                                uint8_t gPos, uint8_t bPos, uint8_t aPos)
{
	BYTE* pRGB = pDst;
	const INT16* pY = pSrc[0];
	const INT16* pCb = pSrc[1];
	const INT16* pCr = pSrc[2];
	/* source pointers are INT16*, so their row padding is counted in samples */
	const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16);
	/* pRGB is a BYTE*, so the destination row padding must be counted in bytes.
	 * (Dividing by the pixel size here made every row after the first start too early
	 * whenever dstStep was larger than width * 4.) */
	const size_t dstPad = dstStep - (roi->width * 4);
	const size_t pad = roi->width % 8;
	const int16x4_t c4096 = vdup_n_s16(4096);

	for (UINT32 y = 0; y < roi->height; y++)
	{
		for (UINT32 x = 0; x < roi->width - pad; x += 8)
		{
			const int16x8_t Y = vld1q_s16(pY);
			const int16x4_t Yh = vget_high_s16(Y);
			const int16x4_t Yl = vget_low_s16(Y);
			const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */
			const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */
			const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
			const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
			const int16x8_t Cr = vld1q_s16(pCr);
			const int16x4_t Crh = vget_high_s16(Cr);
			const int16x4_t Crl = vget_low_s16(Cr);
			const int16x8_t Cb = vld1q_s16(pCb);
			const int16x4_t Cbh = vget_high_s16(Cb);
			const int16x4_t Cbl = vget_low_s16(Cb);
			uint8x8x4_t bgrx;
			{
				/* R */
				const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */
				const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */
				const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
				const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
				/* >> 16 removes the fixed point scale, >> 5 the sample scale */
				const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
				const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
				const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
				bgrx.val[rPos] = vqmovun_s16(Rs);
			}
			{
				/* G */
				const int32x4_t CbGh = vmull_n_s16(Cbh, 22527); /* 0.343730 * 2^16 */
				const int32x4_t CbGl = vmull_n_s16(Cbl, 22527); /* 0.343730 * 2^16 */
				const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */
				const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */
				const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
				const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
				const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
				const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
				const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
				const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
				const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
				const uint8x8_t G = vqmovun_s16(Gs);
				bgrx.val[gPos] = G;
			}
			{
				/* B */
				const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */
				const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */
				const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
				const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
				const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
				const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
				const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
				const uint8x8_t B = vqmovun_s16(Bs);
				bgrx.val[bPos] = B;
			}
			/* A */
			{
				bgrx.val[aPos] = vdup_n_u8(0xFF);
			}
			/* interleaved store of 8 pixels */
			vst4_u8(pRGB, bgrx);
			pY += 8;
			pCb += 8;
			pCr += 8;
			pRGB += 32;
		}

		/* scalar tail for the last width % 8 pixels of the row */
		for (UINT32 x = 0; x < pad; x++)
		{
			const INT32 divisor = 16;
			const INT32 Y = ((*pY++) + 4096) << divisor;
			const INT32 Cb = (*pCb++);
			const INT32 Cr = (*pCr++);
			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
			INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
			INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
			INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
			BYTE bgrx[4];
			bgrx[bPos] = CLIP(B);
			bgrx[gPos] = CLIP(G);
			bgrx[rPos] = CLIP(R);
			bgrx[aPos] = 0xFF;
			*pRGB++ = bgrx[0];
			*pRGB++ = bgrx[1];
			*pRGB++ = bgrx[2];
			*pRGB++ = bgrx[3];
		}

		/* skip the row padding of all planes */
		pY += srcPad;
		pCb += srcPad;
		pCr += srcPad;
		pRGB += dstPad;
	}
	return PRIMITIVES_SUCCESS;
}
/* yCbCr -> 32-bit pixel entry point.
 * Maps the destination format to the byte positions of R, G, B and A inside a pixel and
 * runs the NEON converter; any format not listed here is forwarded to the generic
 * implementation. */
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
                                              BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                              UINT32 DstFormat,
                                              const prim_size_t* WINPR_RESTRICT roi)
{
	/* byte order B,G,R,A */
	if ((DstFormat == PIXEL_FORMAT_BGRA32) || (DstFormat == PIXEL_FORMAT_BGRX32))
		return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);

	/* byte order R,G,B,A */
	if ((DstFormat == PIXEL_FORMAT_RGBA32) || (DstFormat == PIXEL_FORMAT_RGBX32))
		return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);

	/* byte order A,R,G,B */
	if ((DstFormat == PIXEL_FORMAT_ARGB32) || (DstFormat == PIXEL_FORMAT_XRGB32))
		return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);

	/* byte order A,B,G,R */
	if ((DstFormat == PIXEL_FORMAT_ABGR32) || (DstFormat == PIXEL_FORMAT_XBGR32))
		return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);

	return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
}
/* Pack three planes of 16-bit R, G and B samples into interleaved 32-bit pixels.
 * Eight pixels per iteration go through NEON (saturating narrow to 8 bit), the last
 * width % 8 pixels of each row are written by scalar code. Alpha is always 0xFF.
 * NOTE(review): the scalar tail converts INT16 to BYTE by plain assignment while the
 * vector path saturates; the two only agree for inputs in 0..255 - confirm callers.
 * Always returns PRIMITIVES_SUCCESS. */
static pstatus_t
neon_RGBToRGB_16s8u_P3AC4R_X(const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
                             UINT32 srcStep, /* bytes between rows in source data */
                             BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
                             UINT32 dstStep, /* bytes between rows in dest data */
                             const prim_size_t* WINPR_RESTRICT roi, /* region of interest */
                             uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
{
	UINT32 tail = roi->width % 8;

	for (UINT32 row = 0; row < roi->height; row++)
	{
		/* strides are in bytes, so step through the planes as byte pointers */
		const INT16* red = (const INT16*)(((BYTE*)pSrc[0]) + row * srcStep);
		const INT16* green = (const INT16*)(((BYTE*)pSrc[1]) + row * srcStep);
		const INT16* blue = (const INT16*)(((BYTE*)pSrc[2]) + row * srcStep);
		BYTE* out = pDst + row * dstStep;

		/* 8 pixels at a time */
		for (UINT32 col = 0; col < roi->width - tail; col += 8)
		{
			int16x8_t r16 = vld1q_s16(red);
			int16x8_t g16 = vld1q_s16(green);
			int16x8_t b16 = vld1q_s16(blue);
			uint8x8x4_t packed;
			packed.val[aPos] = vdup_n_u8(0xFF);
			packed.val[rPos] = vqmovun_s16(r16);
			packed.val[gPos] = vqmovun_s16(g16);
			packed.val[bPos] = vqmovun_s16(b16);
			vst4_u8(out, packed);
			red += 8;
			green += 8;
			blue += 8;
			out += 32;
		}

		/* leftover pixels */
		for (UINT32 k = 0; k < tail; k++)
		{
			BYTE pixel[4];
			pixel[bPos] = *blue++;
			pixel[gPos] = *green++;
			pixel[rPos] = *red++;
			pixel[aPos] = 0xFF;
			for (size_t c = 0; c < 4; c++)
				*out++ = pixel[c];
		}
	}

	return PRIMITIVES_SUCCESS;
}
/* Planar 16-bit RGB -> 32-bit pixel entry point.
 * Maps the destination format to the byte positions of R, G, B and A inside a pixel and
 * runs the NEON packer; any format not listed here is forwarded to the generic
 * implementation. */
static pstatus_t
neon_RGBToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
                           UINT32 srcStep, /* bytes between rows in source data */
                           BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
                           UINT32 dstStep, /* bytes between rows in dest data */
                           UINT32 DstFormat,
                           const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
{
	/* byte order B,G,R,A */
	if ((DstFormat == PIXEL_FORMAT_BGRA32) || (DstFormat == PIXEL_FORMAT_BGRX32))
		return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);

	/* byte order R,G,B,A */
	if ((DstFormat == PIXEL_FORMAT_RGBA32) || (DstFormat == PIXEL_FORMAT_RGBX32))
		return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);

	/* byte order A,R,G,B */
	if ((DstFormat == PIXEL_FORMAT_ARGB32) || (DstFormat == PIXEL_FORMAT_XRGB32))
		return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);

	/* byte order A,B,G,R */
	if ((DstFormat == PIXEL_FORMAT_ABGR32) || (DstFormat == PIXEL_FORMAT_XBGR32))
		return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);

	return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
}
#endif /* NEON_INTRINSICS_ENABLED */
/* ------------------------------------------------------------------------- */
/* Install the NEON color conversion primitives into prims.
 * Without NEON_INTRINSICS_ENABLED this only emits a verbose log message and leaves
 * prims untouched. */
void primitives_init_colors_neon_int(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(NEON_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or neon intrinsics not available");
	WINPR_UNUSED(prims);
#else
	/* the generic table is the fallback for formats the NEON code does not cover */
	generic = primitives_get_generic();
	WLog_VRB(PRIM_TAG, "NEON optimizations");
	prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
	prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
#endif
}

View File

@@ -0,0 +1,501 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Optimized YUV/RGB conversion operations using openCL
*
* Copyright 2019 David Fort <contact@hardening-consulting.com>
* Copyright 2019 Rangee Gmbh
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
#if defined(WITH_OPENCL)
#ifdef __APPLE__
#include "OpenCL/opencl.h"
#else
#include <CL/cl.h>
#endif
#include "primitives-opencl-program.h"
#include <freerdp/log.h>
#define TAG FREERDP_TAG("primitives")
/* Process-wide OpenCL state shared by all OpenCL primitives. */
typedef struct
{
/* TRUE once the context was fully initialized; reset by cl_context_free */
BOOL support;
/* platform the selected device was found on */
cl_platform_id platformId;
/* the selected GPU device */
cl_device_id deviceId;
/* context created for deviceId */
cl_context context;
/* queue used to enqueue kernels and read back results */
cl_command_queue commandQueue;
/* program built from the embedded OpenCL source */
cl_program program;
} primitives_opencl_context;
/* State of one kernel invocation (created by cl_kernel_new, released by cl_kernel_free). */
typedef struct
{
/* shared OpenCL context, not owned by this structure */
primitives_opencl_context* cl;
/* kernel object created for this invocation */
cl_kernel kernel;
/* device buffers for the three source planes */
cl_mem srcObjs[3];
/* device buffer receiving the converted pixels */
cl_mem dstObj;
/* copy of the region of interest passed to cl_kernel_new */
prim_size_t roi;
/* bytes per destination row, used to size the read-back */
size_t dstStep;
} primitives_cl_kernel;
static primitives_opencl_context* primitives_get_opencl_context(void);
/* Release every OpenCL object owned by kernel and then the structure itself.
 * Accepts NULL. The shared context (kernel->cl) is not released. */
static void cl_kernel_free(primitives_cl_kernel* kernel)
{
	if (kernel)
	{
		if (kernel->dstObj)
			clReleaseMemObject(kernel->dstObj);

		for (size_t plane = 0; plane < ARRAYSIZE(kernel->srcObjs); plane++)
		{
			/* detach the buffer from the structure before releasing it */
			cl_mem buffer = kernel->srcObjs[plane];
			kernel->srcObjs[plane] = nullptr;
			if (buffer)
				clReleaseMemObject(buffer);
		}

		if (kernel->kernel)
			clReleaseKernel(kernel->kernel);

		free(kernel);
	}
}
/* Allocate a kernel invocation for the program entry point kernelName.
 * Stores a copy of *roi and a pointer to the shared OpenCL context.
 * Returns nullptr if the allocation, the context lookup or clCreateKernel fails. */
static primitives_cl_kernel* cl_kernel_new(const char* kernelName, const prim_size_t* roi)
{
	WINPR_ASSERT(kernelName);
	WINPR_ASSERT(roi);

	primitives_cl_kernel* created = calloc(1, sizeof(primitives_cl_kernel));
	if (!created)
		return nullptr;

	created->roi = *roi;
	created->cl = primitives_get_opencl_context();
	if (created->cl)
	{
		cl_int status = CL_INVALID_VALUE;
		created->kernel = clCreateKernel(created->cl->program, kernelName, &status);
		if (status == CL_SUCCESS)
			return created;

		WLog_ERR(TAG, "openCL: unable to create kernel %s", kernelName);
	}

	/* something failed: release whatever was set up so far */
	cl_kernel_free(created);
	return nullptr;
}
/* Wrap the three source planes in OpenCL buffers and bind them to the kernel.
 * Plane i becomes kernel argument 2 * i, its stride argument 2 * i + 1.
 * Each buffer uses the host memory directly (CL_MEM_USE_HOST_PTR) and is sized
 * srcStep[i] * roi.height bytes.
 * Returns FALSE (after logging) on the first OpenCL error; buffers created so far stay
 * in ctx->srcObjs and are released by cl_kernel_free. */
static BOOL cl_kernel_set_sources(primitives_cl_kernel* ctx, const BYTE* WINPR_RESTRICT pSrc[3],
                                  const UINT32 srcStep[3])
{
	const char* sourceNames[] = { "Y", "U", "V" };

	WINPR_ASSERT(ctx);
	WINPR_ASSERT(pSrc);
	WINPR_ASSERT(srcStep);

	for (cl_uint plane = 0; plane < ARRAYSIZE(ctx->srcObjs); plane++)
	{
		const cl_uint bufferArg = plane * 2;
		const cl_uint strideArg = plane * 2 + 1;
		const BYTE* hostConst = pSrc[plane];
		/* clCreateBuffer wants a non-const host pointer even for read-only buffers */
		void* WINPR_RESTRICT host = WINPR_CAST_CONST_PTR_AWAY(hostConst, void* WINPR_RESTRICT);
		cl_int status = CL_INVALID_VALUE;

		ctx->srcObjs[plane] =
		    clCreateBuffer(ctx->cl->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
		                   1ull * srcStep[plane] * ctx->roi.height, host, &status);
		if (status != CL_SUCCESS)
		{
			WLog_ERR(TAG, "unable to create %sobj", sourceNames[plane]);
			return FALSE;
		}

		status = clSetKernelArg(ctx->kernel, bufferArg, sizeof(cl_mem),
		                        (const void*)&ctx->srcObjs[plane]);
		if (status != CL_SUCCESS)
		{
			WLog_ERR(TAG, "unable to set arg for %sobj", sourceNames[plane]);
			return FALSE;
		}

		status = clSetKernelArg(ctx->kernel, strideArg, sizeof(cl_uint), &srcStep[plane]);
		if (status != CL_SUCCESS)
		{
			WLog_ERR(TAG, "unable to set arg stride for %sobj", sourceNames[plane]);
			return FALSE;
		}
	}

	return TRUE;
}
/* Create the device-side output buffer (dstStep * roi.height bytes) and bind it as
 * kernel argument 6, followed by dstStep as argument 7. dstStep is also remembered in
 * ctx for the later read-back.
 * Returns FALSE (after logging) on the first OpenCL error. */
static BOOL cl_kernel_set_destination(primitives_cl_kernel* ctx, UINT32 dstStep)
{
	WINPR_ASSERT(ctx);

	cl_int status = CL_INVALID_VALUE;
	ctx->dstStep = dstStep;
	ctx->dstObj = clCreateBuffer(ctx->cl->context, CL_MEM_WRITE_ONLY,
	                             1ull * dstStep * ctx->roi.height, nullptr, &status);
	if (status != CL_SUCCESS)
	{
		WLog_ERR(TAG, "unable to create dest obj");
		return FALSE;
	}

	status = clSetKernelArg(ctx->kernel, 6, sizeof(cl_mem), (const void*)&ctx->dstObj);
	if (status != CL_SUCCESS)
	{
		WLog_ERR(TAG, "unable to set arg destObj");
		return FALSE;
	}

	status = clSetKernelArg(ctx->kernel, 7, sizeof(cl_uint), &dstStep);
	if (status != CL_SUCCESS)
	{
		WLog_ERR(TAG, "unable to set arg dstStep");
		return FALSE;
	}

	return TRUE;
}
/* Run the kernel over a roi.width x roi.height 2D range and copy the result into pDst
 * with a blocking read of roi.height * dstStep bytes.
 * Returns FALSE (after logging) if enqueueing or reading back fails. */
static BOOL cl_kernel_process(primitives_cl_kernel* ctx, BYTE* pDst)
{
	WINPR_ASSERT(ctx);
	WINPR_ASSERT(pDst);

	/* one work item per pixel */
	const size_t globalSize[2] = { ctx->roi.width, ctx->roi.height };

	cl_int status = clEnqueueNDRangeKernel(ctx->cl->commandQueue, ctx->kernel, 2, nullptr,
	                                       globalSize, nullptr, 0, nullptr, nullptr);
	if (status != CL_SUCCESS)
	{
		WLog_ERR(TAG, "unable to enqueue call kernel");
		return FALSE;
	}

	/* Transfer result to host (CL_TRUE: wait until the copy has finished) */
	status = clEnqueueReadBuffer(ctx->cl->commandQueue, ctx->dstObj, CL_TRUE, 0,
	                             ctx->roi.height * ctx->dstStep, pDst, 0, nullptr, nullptr);
	if (status != CL_SUCCESS)
	{
		WLog_ERR(TAG, "unable to read back buffer");
		return FALSE;
	}

	return TRUE;
}
/* Run the OpenCL kernel kernelName on the given YUV planes and store the converted
 * pixels in pDst.
 * Returns PRIMITIVES_SUCCESS when every step succeeded and -1 otherwise; the temporary
 * kernel state is released in both cases. */
static pstatus_t opencl_YUVToRGB(const char* kernelName, const BYTE* WINPR_RESTRICT pSrc[3],
                                 const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                 const prim_size_t* WINPR_RESTRICT roi)
{
	pstatus_t status = -1;
	primitives_cl_kernel* job = cl_kernel_new(kernelName, roi);

	/* the steps are evaluated left to right and stop at the first failure */
	if (job && cl_kernel_set_sources(job, pSrc, srcStep) &&
	    cl_kernel_set_destination(job, dstStep) && cl_kernel_process(job, pDst))
	{
		status = PRIMITIVES_SUCCESS;
	}

	cl_kernel_free(job);
	return status;
}
/* The single, statically allocated OpenCL context of this module. */
static primitives_opencl_context openclContext = WINPR_C_ARRAY_INIT;
/* Return the module's OpenCL context. The pointer is never NULL; whether the context
 * is usable is tracked in its 'support' member. */
static primitives_opencl_context* primitives_get_opencl_context(void)
{
return &openclContext;
}
/* Release the OpenCL objects held by ctx and mark the context as unsupported.
 * Accepts NULL. Handles that were never created are skipped, and every released handle
 * is reset to nullptr so the context can be freed again or re-initialized safely.
 * This matters on the failure path of primitives_init_opencl_context, which calls this
 * function before the program has been created. */
static void cl_context_free(primitives_opencl_context* ctx)
{
	if (!ctx)
		return;

	if (ctx->program)
	{
		clReleaseProgram(ctx->program);
		ctx->program = nullptr;
	}
	if (ctx->commandQueue)
	{
		clReleaseCommandQueue(ctx->commandQueue);
		ctx->commandQueue = nullptr;
	}
	if (ctx->context)
	{
		clReleaseContext(ctx->context);
		ctx->context = nullptr;
	}
	if (ctx->deviceId)
	{
		clReleaseDevice(ctx->deviceId);
		ctx->deviceId = nullptr;
	}

	ctx->support = FALSE;
}
/* Tear down the module's OpenCL context if it was successfully initialized.
 * Always returns PRIMITIVES_SUCCESS. */
static pstatus_t primitives_uninit_opencl(void)
{
	if (openclContext.support)
		cl_context_free(&openclContext);

	return PRIMITIVES_SUCCESS;
}
/* Initialize the OpenCL context used by the OpenCL primitives.
 * Walks all platforms and takes the first one that offers a GPU device for which a
 * context and a command queue can be created. Then builds the embedded program and
 * verifies that the "yuv420_to_bgra_1b" kernel can be created.
 * On success prims->support is set and TRUE is returned. On failure FALSE is returned;
 * failures after a GPU was selected release the context via cl_context_free. */
static BOOL primitives_init_opencl_context(primitives_opencl_context* WINPR_RESTRICT prims)
{
cl_uint ndevices = 0;
cl_uint nplatforms = 0;
cl_kernel kernel = nullptr;
BOOL gotGPU = FALSE;
size_t programLen = 0;
/* first call only queries the number of platforms */
cl_int ret = clGetPlatformIDs(0, nullptr, &nplatforms);
if (ret != CL_SUCCESS || nplatforms < 1)
return FALSE;
cl_platform_id* platform_ids = (cl_platform_id*)calloc(nplatforms, sizeof(cl_platform_id));
if (!platform_ids)
return FALSE;
ret = clGetPlatformIDs(nplatforms, platform_ids, &nplatforms);
if (ret != CL_SUCCESS)
{
free((void*)platform_ids);
return FALSE;
}
/* stop at the first platform that yields a working GPU setup */
for (cl_uint i = 0; (i < nplatforms) && !gotGPU; i++)
{
cl_device_id device_id = nullptr;
cl_context context = nullptr;
char platformName[1000] = WINPR_C_ARRAY_INIT;
char deviceName[1000] = WINPR_C_ARRAY_INIT;
ret = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, sizeof(platformName),
platformName, nullptr);
if (ret != CL_SUCCESS)
continue;
/* only the first GPU device of the platform is requested */
ret = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, 1, &device_id, &ndevices);
if (ret != CL_SUCCESS)
continue;
ret = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(deviceName), deviceName, nullptr);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "openCL: unable get device name for platform %s", platformName);
clReleaseDevice(device_id);
continue;
}
context = clCreateContext(nullptr, 1, &device_id, nullptr, nullptr, &ret);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "openCL: unable to create context for platform %s, device %s",
platformName, deviceName);
clReleaseDevice(device_id);
continue;
}
/* clCreateCommandQueue is replaced by the WithProperties variant in OpenCL 2.0 */
#if defined(CL_VERSION_2_0)
prims->commandQueue = clCreateCommandQueueWithProperties(context, device_id, nullptr, &ret);
#else
prims->commandQueue = clCreateCommandQueue(context, device_id, 0, &ret);
#endif
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "openCL: unable to create command queue");
clReleaseContext(context);
clReleaseDevice(device_id);
continue;
}
WLog_INFO(TAG, "openCL: using platform=%s device=%s", platformName, deviceName);
prims->platformId = platform_ids[i];
prims->deviceId = device_id;
prims->context = context;
gotGPU = TRUE;
}
free((void*)platform_ids);
if (!gotGPU)
{
WLog_ERR(TAG, "openCL: no GPU found");
return FALSE;
}
/* openclProgram is the embedded kernel source text */
programLen = strnlen(openclProgram, sizeof(openclProgram));
const char* ptr = openclProgram;
prims->program = clCreateProgramWithSource(prims->context, 1, &ptr, &programLen, &ret);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "openCL: unable to create program");
goto fail;
}
ret = clBuildProgram(prims->program, 1, &prims->deviceId, nullptr, nullptr, nullptr);
if (ret != CL_SUCCESS)
{
size_t length = 0;
/* the build log is only printed if it fits into this buffer */
char buffer[2048];
ret = clGetProgramBuildInfo(prims->program, prims->deviceId, CL_PROGRAM_BUILD_LOG,
sizeof(buffer), buffer, &length);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG,
"openCL: building program failed but unable to retrieve buildLog, error=%d",
ret);
}
else
{
WLog_ERR(TAG, "openCL: unable to build program, errorLog=%s", buffer);
}
goto fail;
}
/* probe one kernel to make sure the built program is usable */
kernel = clCreateKernel(prims->program, "yuv420_to_bgra_1b", &ret);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "openCL: unable to create yuv420_to_bgra_1b kernel");
goto fail;
}
clReleaseKernel(kernel);
prims->support = TRUE;
return TRUE;
fail:
cl_context_free(prims);
return FALSE;
}
/* YUV420 -> 32-bit pixel conversion on the GPU.
 * Selects the OpenCL kernel named after the destination format and runs it through
 * opencl_YUVToRGB. Formats without a kernel are handled by the CPU primitives.
 * Returns the status of the conversion, or -1 if no CPU fallback is available. */
static pstatus_t opencl_YUV420ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[3],
                                              const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
                                              UINT32 dstStep, UINT32 DstFormat,
                                              const prim_size_t* WINPR_RESTRICT roi)
{
	const char* kernel_name = nullptr;

	switch (DstFormat)
	{
		case PIXEL_FORMAT_ABGR32:
			kernel_name = "yuv420_to_abgr_1b";
			break;
		case PIXEL_FORMAT_XBGR32:
			kernel_name = "yuv420_to_xbgr_1b";
			break;
		/* RGBX/RGBA previously selected each other's kernel; every other format
		 * uses the kernel carrying its own name. */
		case PIXEL_FORMAT_RGBX32:
			kernel_name = "yuv420_to_rgbx_1b";
			break;
		case PIXEL_FORMAT_RGBA32:
			kernel_name = "yuv420_to_rgba_1b";
			break;
		case PIXEL_FORMAT_BGRA32:
			kernel_name = "yuv420_to_bgra_1b";
			break;
		case PIXEL_FORMAT_BGRX32:
			kernel_name = "yuv420_to_bgrx_1b";
			break;
		case PIXEL_FORMAT_XRGB32:
			kernel_name = "yuv420_to_xrgb_1b";
			break;
		case PIXEL_FORMAT_ARGB32:
			kernel_name = "yuv420_to_argb_1b";
			break;
		default:
		{
			/* no kernel for this format: fall back to the CPU implementation */
			primitives_t* p = primitives_get_by_type(PRIMITIVES_ONLY_CPU);
			if (!p)
				return -1;
			return p->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
		}
	}

	return opencl_YUVToRGB(kernel_name, pSrc, srcStep, pDst, dstStep, roi);
}
/* YUV444 -> 32-bit pixel conversion on the GPU.
 * Selects the OpenCL kernel named after the destination format and runs it through
 * opencl_YUVToRGB. Formats without a kernel are handled by the CPU primitives.
 * Returns the status of the conversion, or -1 if no CPU fallback is available. */
static pstatus_t opencl_YUV444ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[3],
                                              const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
                                              UINT32 dstStep, UINT32 DstFormat,
                                              const prim_size_t* WINPR_RESTRICT roi)
{
	const char* kernel_name = nullptr;

	switch (DstFormat)
	{
		case PIXEL_FORMAT_ABGR32:
			kernel_name = "yuv444_to_abgr_1b";
			break;
		case PIXEL_FORMAT_XBGR32:
			kernel_name = "yuv444_to_xbgr_1b";
			break;
		/* RGBX/RGBA previously selected each other's kernel; every other format
		 * uses the kernel carrying its own name. */
		case PIXEL_FORMAT_RGBX32:
			kernel_name = "yuv444_to_rgbx_1b";
			break;
		case PIXEL_FORMAT_RGBA32:
			kernel_name = "yuv444_to_rgba_1b";
			break;
		case PIXEL_FORMAT_BGRA32:
			kernel_name = "yuv444_to_bgra_1b";
			break;
		case PIXEL_FORMAT_BGRX32:
			kernel_name = "yuv444_to_bgrx_1b";
			break;
		case PIXEL_FORMAT_XRGB32:
			kernel_name = "yuv444_to_xrgb_1b";
			break;
		case PIXEL_FORMAT_ARGB32:
			kernel_name = "yuv444_to_argb_1b";
			break;
		default:
		{
			/* no kernel for this format: fall back to the CPU implementation */
			primitives_t* p = primitives_get_by_type(PRIMITIVES_ONLY_CPU);
			if (!p)
				return -1;
			return p->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
		}
	}

	return opencl_YUVToRGB(kernel_name, pSrc, srcStep, pDst, dstStep, roi);
}
/* Fill prims with the CPU primitives and, if OpenCL can be initialized, replace the
 * two YUV -> RGB converters with their OpenCL versions.
 * Returns FALSE only when prims is NULL or no CPU primitives are available; a failed
 * OpenCL initialization still returns TRUE with the CPU table in place. */
BOOL primitives_init_opencl(primitives_t* prims)
{
	primitives_t* cpu = primitives_get_by_type(PRIMITIVES_ONLY_CPU);
	if (!prims || !cpu)
		return FALSE;

	/* start from the CPU implementation */
	*prims = *cpu;

	if (primitives_init_opencl_context(&openclContext))
	{
		prims->YUV420ToRGB_8u_P3AC4R = opencl_YUV420ToRGB_8u_P3AC4R;
		prims->YUV444ToRGB_8u_P3AC4R = opencl_YUV444ToRGB_8u_P3AC4R;
		prims->flags |= PRIM_FLAGS_HAVE_EXTGPU;
		prims->uninit = primitives_uninit_opencl;
	}

	return TRUE;
}
#endif

View File

@@ -0,0 +1,474 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Optimized operations using openCL
* vi:ts=4 sw=4
*
* Copyright 2019 David Fort <contact@hardening-consulting.com>
* Copyright 2019 Rangee Gmbh
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
/* Limit v to the range [l, h] and return it as an unsigned char.
 * The upper bound is applied first, then the lower bound.
 */
uchar clamp_uc(int v, short l, short h)
{
    const int capped = (v > h) ? h : v;
    const int bounded = (capped < l) ? l : capped;
    return (uchar)bounded;
}
/* Fetch the chroma sample for pixel (x, y) from a full resolution plane.
 *
 * buf:    start of the chroma plane
 * stride: distance in bytes between two rows of the plane
 * x, y:   pixel coordinates
 *
 * The sample is read from buf[y * stride + x]. Previously the column was
 * left out of the address, so every pixel of a row received the value of
 * column 0.
 *
 * For pixel (0, 0) the value 4 * U00 - U01 - U10 - U11 is computed from the
 * 2x2 neighbourhood, clamped to [0, 255], and returned instead of the raw
 * sample when it differs from it by 30 or more.
 *
 * NOTE(review): only (0, 0) takes the filtered path - confirm whether the
 * filter was meant to run for every even (x, y) position.
 */
short avgUV(__global const uchar* buf, unsigned stride, unsigned x, unsigned y)
{
    const unsigned idx = y * stride + x;
    const short U00 = buf[idx];
    if ((x != 0) || (y != 0))
        return U00;
    const short U01 = buf[idx + 1];
    const short U10 = buf[idx + stride];
    const short U11 = buf[idx + stride + 1];
    const short avg = U00 * 4 - U01 - U10 - U11;
    const short avgU = clamp_uc(avg, 0, 255);
    const short diff = abs(U00 - avgU);
    if (diff < 30)
        return U00;
    return avgU;
}
/* YUV 4:2:0 -> R, G, B in bytes 0..2 of each 4 byte pixel; byte 3 (alpha)
 * is not written. One work-item handles the pixel at
 * (get_global_id(0), get_global_id(1)); chroma is read at (x / 2, y / 2).
 */
__kernel void yuv420_to_rgba_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);
    const unsigned int chromaCol = col / 2;
    const unsigned int chromaRow = row / 2;

    const short luma = bufY[row * strideY + col];
    const short cb = bufU[chromaRow * strideU + chromaCol] - 128;
    const short cr = bufV[chromaRow * strideV + chromaCol] - 128;

    /* Fixed point conversion, coefficients scaled by 256:
     *   R = (256 * Y + 403 * (V - 128)) >> 8
     *   G = (256 * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8
     *   B = (256 * Y + 475 * (U - 128)) >> 8
     */
    const int scaledLuma = 256 * luma;
    const uchar red = clamp_uc((scaledLuma + (403 * cr)) >> 8, 0, 255);
    const uchar green = clamp_uc((scaledLuma - (48 * cb) - (120 * cr)) >> 8, 0, 255);
    const uchar blue = clamp_uc((scaledLuma + (475 * cb)) >> 8, 0, 255);

    __global uchar* pixel = dest + (strideDest * row) + (col * 4);
    pixel[0] = red;
    pixel[1] = green;
    pixel[2] = blue;
    /* pixel[3] (alpha) is left untouched */
}
/* YUV 4:2:0 -> B, G, R in bytes 1..3 of each 4 byte pixel; byte 0 (alpha)
 * is not written. One work-item handles the pixel at
 * (get_global_id(0), get_global_id(1)); chroma is read at (x / 2, y / 2).
 */
__kernel void yuv420_to_abgr_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);
    const unsigned int chromaCol = col / 2;
    const unsigned int chromaRow = row / 2;

    const short luma = bufY[row * strideY + col];
    const short cb = bufU[chromaRow * strideU + chromaCol] - 128;
    const short cr = bufV[chromaRow * strideV + chromaCol] - 128;

    /* Fixed point conversion, coefficients scaled by 256:
     *   R = (256 * Y + 403 * (V - 128)) >> 8
     *   G = (256 * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8
     *   B = (256 * Y + 475 * (U - 128)) >> 8
     */
    const int scaledLuma = 256 * luma;
    const uchar blue = clamp_uc((scaledLuma + (475 * cb)) >> 8, 0, 255);
    const uchar green = clamp_uc((scaledLuma - (48 * cb) - (120 * cr)) >> 8, 0, 255);
    const uchar red = clamp_uc((scaledLuma + (403 * cr)) >> 8, 0, 255);

    __global uchar* pixel = dest + (strideDest * row) + (col * 4);
    /* pixel[0] (alpha) is left untouched */
    pixel[1] = blue;
    pixel[2] = green;
    pixel[3] = red;
}
/* YUV 4:4:4 -> B, G, R in bytes 1..3 of each 4 byte pixel; byte 0 (alpha)
 * is not written. One work-item handles the pixel at
 * (get_global_id(0), get_global_id(1)); chroma comes from avgUV().
 */
__kernel void yuv444_to_abgr_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);

    const short luma = bufY[row * strideY + col];
    const short cb = avgUV(bufU, strideU, col, row) - 128;
    const short cr = avgUV(bufV, strideV, col, row) - 128;

    /* Fixed point conversion, coefficients scaled by 256:
     *   R = (256 * Y + 403 * (V - 128)) >> 8
     *   G = (256 * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8
     *   B = (256 * Y + 475 * (U - 128)) >> 8
     */
    const int scaledLuma = 256 * luma;
    const uchar blue = clamp_uc((scaledLuma + (475 * cb)) >> 8, 0, 255);
    const uchar green = clamp_uc((scaledLuma - (48 * cb) - (120 * cr)) >> 8, 0, 255);
    const uchar red = clamp_uc((scaledLuma + (403 * cr)) >> 8, 0, 255);

    __global uchar* pixel = dest + (strideDest * row) + (col * 4);
    /* pixel[0] (alpha) is left untouched */
    pixel[1] = blue;
    pixel[2] = green;
    pixel[3] = red;
}
/* YUV 4:4:4 -> R, G, B in bytes 0..2 of each 4 byte pixel; byte 3 (alpha)
 * is not written. One work-item handles the pixel at
 * (get_global_id(0), get_global_id(1)); chroma comes from avgUV().
 */
__kernel void yuv444_to_rgba_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);

    const short luma = bufY[row * strideY + col];
    const short cb = avgUV(bufU, strideU, col, row) - 128;
    const short cr = avgUV(bufV, strideV, col, row) - 128;

    /* Fixed point conversion, coefficients scaled by 256:
     *   R = (256 * Y + 403 * (V - 128)) >> 8
     *   G = (256 * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8
     *   B = (256 * Y + 475 * (U - 128)) >> 8
     */
    const int scaledLuma = 256 * luma;
    const uchar red = clamp_uc((scaledLuma + (403 * cr)) >> 8, 0, 255);
    const uchar green = clamp_uc((scaledLuma - (48 * cb) - (120 * cr)) >> 8, 0, 255);
    const uchar blue = clamp_uc((scaledLuma + (475 * cb)) >> 8, 0, 255);

    __global uchar* pixel = dest + (strideDest * row) + (col * 4);
    pixel[0] = red;
    pixel[1] = green;
    pixel[2] = blue;
    /* pixel[3] (alpha) is left untouched */
}
/* YUV 4:2:0 -> R, G, B in bytes 0..2 of each 4 byte pixel; byte 3 is set
 * to 0xff. One work-item handles the pixel at
 * (get_global_id(0), get_global_id(1)); chroma is read at (x / 2, y / 2).
 */
__kernel void yuv420_to_rgbx_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);
    const unsigned int chromaCol = col / 2;
    const unsigned int chromaRow = row / 2;

    const short luma = bufY[row * strideY + col];
    const short cb = bufU[chromaRow * strideU + chromaCol] - 128;
    const short cr = bufV[chromaRow * strideV + chromaCol] - 128;

    /* Fixed point conversion, coefficients scaled by 256:
     *   R = (256 * Y + 403 * (V - 128)) >> 8
     *   G = (256 * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8
     *   B = (256 * Y + 475 * (U - 128)) >> 8
     */
    const int scaledLuma = 256 * luma;
    const uchar red = clamp_uc((scaledLuma + (403 * cr)) >> 8, 0, 255);
    const uchar green = clamp_uc((scaledLuma - (48 * cb) - (120 * cr)) >> 8, 0, 255);
    const uchar blue = clamp_uc((scaledLuma + (475 * cb)) >> 8, 0, 255);

    __global uchar* pixel = dest + (strideDest * row) + (col * 4);
    pixel[0] = red;
    pixel[1] = green;
    pixel[2] = blue;
    pixel[3] = 0xff; /* opaque */
}
/* YUV 4:2:0 -> byte 0 set to 0xff, then B, G, R in bytes 1..3 of each
 * 4 byte pixel. One work-item handles the pixel at
 * (get_global_id(0), get_global_id(1)); chroma is read at (x / 2, y / 2).
 */
__kernel void yuv420_to_xbgr_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);
    const unsigned int chromaCol = col / 2;
    const unsigned int chromaRow = row / 2;

    const short luma = bufY[row * strideY + col];
    const short cb = bufU[chromaRow * strideU + chromaCol] - 128;
    const short cr = bufV[chromaRow * strideV + chromaCol] - 128;

    /* Fixed point conversion, coefficients scaled by 256:
     *   R = (256 * Y + 403 * (V - 128)) >> 8
     *   G = (256 * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8
     *   B = (256 * Y + 475 * (U - 128)) >> 8
     */
    const int scaledLuma = 256 * luma;
    const uchar blue = clamp_uc((scaledLuma + (475 * cb)) >> 8, 0, 255);
    const uchar green = clamp_uc((scaledLuma - (48 * cb) - (120 * cr)) >> 8, 0, 255);
    const uchar red = clamp_uc((scaledLuma + (403 * cr)) >> 8, 0, 255);

    __global uchar* pixel = dest + (strideDest * row) + (col * 4);
    pixel[0] = 0xff; /* opaque */
    pixel[1] = blue;
    pixel[2] = green;
    pixel[3] = red;
}
/* YUV 4:4:4 -> byte 0 set to 0xff, then B, G, R in bytes 1..3 of each
 * 4 byte pixel. One work-item handles the pixel at
 * (get_global_id(0), get_global_id(1)); chroma comes from avgUV().
 *
 * The blue channel is computed from the bias-corrected chroma (U - 128),
 * like green and red. It previously used the raw U sample, which pushed
 * blue far too high.
 */
__kernel void yuv444_to_xbgr_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    unsigned int x = get_global_id(0);
    unsigned int y = get_global_id(1);
    short Y = bufY[y * strideY + x];
    short U = avgUV(bufU, strideU, x, y);
    short V = avgUV(bufV, strideV, x, y);
    short D = U - 128; /* chroma samples are stored with a +128 bias */
    short E = V - 128;
    __global uchar* destPtr = dest + (strideDest * y) + (x * 4);

    /**
     * | R |   ( | 256     0    403 | |    Y    | )
     * | G | = ( | 256   -48   -120 | | U - 128 | ) >> 8
     * | B |   ( | 256   475      0 | | V - 128 | )
     */
    int y256 = 256 * Y;
    destPtr[0] = 0xff;                                                    /* A */
    destPtr[1] = clamp_uc((y256 + (475 * D)) >> 8, 0, 255);               /* B */
    destPtr[2] = clamp_uc((y256 - (48 * D) - (120 * E)) >> 8, 0, 255);    /* G */
    destPtr[3] = clamp_uc((y256 + (403 * E)) >> 8, 0, 255);               /* R */
}
/* YUV 4:4:4 -> R, G, B in bytes 0..2 of each 4 byte pixel; byte 3 is set
 * to 0xff. One work-item handles the pixel at
 * (get_global_id(0), get_global_id(1)); chroma comes from avgUV().
 */
__kernel void yuv444_to_rgbx_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);

    const short luma = bufY[row * strideY + col];
    const short cb = avgUV(bufU, strideU, col, row) - 128;
    const short cr = avgUV(bufV, strideV, col, row) - 128;

    /* Fixed point conversion, coefficients scaled by 256:
     *   R = (256 * Y + 403 * (V - 128)) >> 8
     *   G = (256 * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8
     *   B = (256 * Y + 475 * (U - 128)) >> 8
     */
    const int scaledLuma = 256 * luma;
    const uchar red = clamp_uc((scaledLuma + (403 * cr)) >> 8, 0, 255);
    const uchar green = clamp_uc((scaledLuma - (48 * cb) - (120 * cr)) >> 8, 0, 255);
    const uchar blue = clamp_uc((scaledLuma + (475 * cb)) >> 8, 0, 255);

    __global uchar* pixel = dest + (strideDest * row) + (col * 4);
    pixel[0] = red;
    pixel[1] = green;
    pixel[2] = blue;
    pixel[3] = 0xff; /* opaque */
}
/* YUV 4:2:0 -> R, G, B in bytes 1..3 of each 4 byte pixel; byte 0 (alpha)
 * is not written. One work-item handles the pixel at
 * (get_global_id(0), get_global_id(1)); chroma is read at (x / 2, y / 2).
 */
__kernel void yuv420_to_argb_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);
    const unsigned int chromaCol = col / 2;
    const unsigned int chromaRow = row / 2;

    const short luma = bufY[row * strideY + col];
    const short cb = bufU[chromaRow * strideU + chromaCol] - 128;
    const short cr = bufV[chromaRow * strideV + chromaCol] - 128;

    /* Fixed point conversion, coefficients scaled by 256:
     *   R = (256 * Y + 403 * (V - 128)) >> 8
     *   G = (256 * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8
     *   B = (256 * Y + 475 * (U - 128)) >> 8
     */
    const int scaledLuma = 256 * luma;
    const uchar red = clamp_uc((scaledLuma + (403 * cr)) >> 8, 0, 255);
    const uchar green = clamp_uc((scaledLuma - (48 * cb) - (120 * cr)) >> 8, 0, 255);
    const uchar blue = clamp_uc((scaledLuma + (475 * cb)) >> 8, 0, 255);

    __global uchar* pixel = dest + (strideDest * row) + (col * 4);
    /* pixel[0] (alpha) is left untouched */
    pixel[1] = red;
    pixel[2] = green;
    pixel[3] = blue;
}
/* YUV 4:2:0 -> B, G, R in bytes 0..2 of each 4 byte pixel; byte 3 (alpha)
 * is not written. One work-item handles the pixel at
 * (get_global_id(0), get_global_id(1)); chroma is read at (x / 2, y / 2).
 */
__kernel void yuv420_to_bgra_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);
    const unsigned int chromaCol = col / 2;
    const unsigned int chromaRow = row / 2;

    const short luma = bufY[row * strideY + col];
    const short cb = bufU[chromaRow * strideU + chromaCol] - 128;
    const short cr = bufV[chromaRow * strideV + chromaCol] - 128;

    /* Fixed point conversion, coefficients scaled by 256:
     *   R = (256 * Y + 403 * (V - 128)) >> 8
     *   G = (256 * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8
     *   B = (256 * Y + 475 * (U - 128)) >> 8
     */
    const int scaledLuma = 256 * luma;
    const uchar blue = clamp_uc((scaledLuma + (475 * cb)) >> 8, 0, 255);
    const uchar green = clamp_uc((scaledLuma - (48 * cb) - (120 * cr)) >> 8, 0, 255);
    const uchar red = clamp_uc((scaledLuma + (403 * cr)) >> 8, 0, 255);

    __global uchar* pixel = dest + (strideDest * row) + (col * 4);
    pixel[0] = blue;
    pixel[1] = green;
    pixel[2] = red;
    /* pixel[3] (alpha) is left untouched */
}
/* YUV 4:4:4 -> B, G, R in bytes 0..2 of each 4 byte pixel; byte 3 (alpha)
 * is not written. One work-item handles the pixel at
 * (get_global_id(0), get_global_id(1)); chroma comes from avgUV().
 */
__kernel void yuv444_to_bgra_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);

    const short luma = bufY[row * strideY + col];
    const short cb = avgUV(bufU, strideU, col, row) - 128;
    const short cr = avgUV(bufV, strideV, col, row) - 128;

    /* Fixed point conversion, coefficients scaled by 256:
     *   R = (256 * Y + 403 * (V - 128)) >> 8
     *   G = (256 * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8
     *   B = (256 * Y + 475 * (U - 128)) >> 8
     */
    const int scaledLuma = 256 * luma;
    const uchar blue = clamp_uc((scaledLuma + (475 * cb)) >> 8, 0, 255);
    const uchar green = clamp_uc((scaledLuma - (48 * cb) - (120 * cr)) >> 8, 0, 255);
    const uchar red = clamp_uc((scaledLuma + (403 * cr)) >> 8, 0, 255);

    __global uchar* pixel = dest + (strideDest * row) + (col * 4);
    pixel[0] = blue;
    pixel[1] = green;
    pixel[2] = red;
    /* pixel[3] (alpha) is left untouched */
}
/* YUV 4:4:4 -> R, G, B in bytes 1..3 of each 4 byte pixel; byte 0 (alpha)
 * is not written. One work-item handles the pixel at
 * (get_global_id(0), get_global_id(1)); chroma comes from avgUV().
 */
__kernel void yuv444_to_argb_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);

    const short luma = bufY[row * strideY + col];
    const short cb = avgUV(bufU, strideU, col, row) - 128;
    const short cr = avgUV(bufV, strideV, col, row) - 128;

    /* Fixed point conversion, coefficients scaled by 256:
     *   R = (256 * Y + 403 * (V - 128)) >> 8
     *   G = (256 * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8
     *   B = (256 * Y + 475 * (U - 128)) >> 8
     */
    const int scaledLuma = 256 * luma;
    const uchar red = clamp_uc((scaledLuma + (403 * cr)) >> 8, 0, 255);
    const uchar green = clamp_uc((scaledLuma - (48 * cb) - (120 * cr)) >> 8, 0, 255);
    const uchar blue = clamp_uc((scaledLuma + (475 * cb)) >> 8, 0, 255);

    __global uchar* pixel = dest + (strideDest * row) + (col * 4);
    /* pixel[0] (alpha) is left untouched */
    pixel[1] = red;
    pixel[2] = green;
    pixel[3] = blue;
}
/* YUV 4:2:0 -> byte 0 set to 0xff, then R, G, B in bytes 1..3 of each
 * 4 byte pixel. One work-item handles the pixel at
 * (get_global_id(0), get_global_id(1)); chroma is read at (x / 2, y / 2).
 */
__kernel void yuv420_to_xrgb_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);
    const unsigned int chromaCol = col / 2;
    const unsigned int chromaRow = row / 2;

    const short luma = bufY[row * strideY + col];
    const short cb = bufU[chromaRow * strideU + chromaCol] - 128;
    const short cr = bufV[chromaRow * strideV + chromaCol] - 128;

    /* Fixed point conversion, coefficients scaled by 256:
     *   R = (256 * Y + 403 * (V - 128)) >> 8
     *   G = (256 * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8
     *   B = (256 * Y + 475 * (U - 128)) >> 8
     */
    const int scaledLuma = 256 * luma;
    const uchar red = clamp_uc((scaledLuma + (403 * cr)) >> 8, 0, 255);
    const uchar green = clamp_uc((scaledLuma - (48 * cb) - (120 * cr)) >> 8, 0, 255);
    const uchar blue = clamp_uc((scaledLuma + (475 * cb)) >> 8, 0, 255);

    __global uchar* pixel = dest + (strideDest * row) + (col * 4);
    pixel[0] = 0xff; /* opaque */
    pixel[1] = red;
    pixel[2] = green;
    pixel[3] = blue;
}
/* YUV 4:2:0 -> B, G, R in bytes 0..2 of each 4 byte pixel; byte 3 is set
 * to 0xff. One work-item handles the pixel at
 * (get_global_id(0), get_global_id(1)); chroma is read at (x / 2, y / 2).
 */
__kernel void yuv420_to_bgrx_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);
    const unsigned int chromaCol = col / 2;
    const unsigned int chromaRow = row / 2;

    const short luma = bufY[row * strideY + col];
    const short cb = bufU[chromaRow * strideU + chromaCol] - 128;
    const short cr = bufV[chromaRow * strideV + chromaCol] - 128;

    /* Fixed point conversion, coefficients scaled by 256:
     *   R = (256 * Y + 403 * (V - 128)) >> 8
     *   G = (256 * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8
     *   B = (256 * Y + 475 * (U - 128)) >> 8
     */
    const int scaledLuma = 256 * luma;
    const uchar blue = clamp_uc((scaledLuma + (475 * cb)) >> 8, 0, 255);
    const uchar green = clamp_uc((scaledLuma - (48 * cb) - (120 * cr)) >> 8, 0, 255);
    const uchar red = clamp_uc((scaledLuma + (403 * cr)) >> 8, 0, 255);

    __global uchar* pixel = dest + (strideDest * row) + (col * 4);
    pixel[0] = blue;
    pixel[1] = green;
    pixel[2] = red;
    pixel[3] = 0xff; /* opaque */
}
/* YUV 4:4:4 -> B, G, R in bytes 0..2 of each 4 byte pixel; byte 3 is set
 * to 0xff. One work-item handles the pixel at
 * (get_global_id(0), get_global_id(1)); chroma comes from avgUV().
 */
__kernel void yuv444_to_bgrx_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);

    const short luma = bufY[row * strideY + col];
    const short cb = avgUV(bufU, strideU, col, row) - 128;
    const short cr = avgUV(bufV, strideV, col, row) - 128;

    /* Fixed point conversion, coefficients scaled by 256:
     *   R = (256 * Y + 403 * (V - 128)) >> 8
     *   G = (256 * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8
     *   B = (256 * Y + 475 * (U - 128)) >> 8
     */
    const int scaledLuma = 256 * luma;
    const uchar blue = clamp_uc((scaledLuma + (475 * cb)) >> 8, 0, 255);
    const uchar green = clamp_uc((scaledLuma - (48 * cb) - (120 * cr)) >> 8, 0, 255);
    const uchar red = clamp_uc((scaledLuma + (403 * cr)) >> 8, 0, 255);

    __global uchar* pixel = dest + (strideDest * row) + (col * 4);
    pixel[0] = blue;
    pixel[1] = green;
    pixel[2] = red;
    pixel[3] = 0xff; /* opaque */
}
/* YUV 4:4:4 -> byte 0 set to 0xff, then R, G, B in bytes 1..3 of each
 * 4 byte pixel. One work-item handles the pixel at
 * (get_global_id(0), get_global_id(1)); chroma comes from avgUV().
 */
__kernel void yuv444_to_xrgb_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);

    const short luma = bufY[row * strideY + col];
    const short cb = avgUV(bufU, strideU, col, row) - 128;
    const short cr = avgUV(bufV, strideV, col, row) - 128;

    /* Fixed point conversion, coefficients scaled by 256:
     *   R = (256 * Y + 403 * (V - 128)) >> 8
     *   G = (256 * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8
     *   B = (256 * Y + 475 * (U - 128)) >> 8
     */
    const int scaledLuma = 256 * luma;
    const uchar red = clamp_uc((scaledLuma + (403 * cr)) >> 8, 0, 255);
    const uchar green = clamp_uc((scaledLuma - (48 * cb) - (120 * cr)) >> 8, 0, 255);
    const uchar blue = clamp_uc((scaledLuma + (475 * cb)) >> 8, 0, 255);

    __global uchar* pixel = dest + (strideDest * row) + (col * 4);
    pixel[0] = 0xff; /* opaque */
    pixel[1] = red;
    pixel[2] = green;
    pixel[3] = blue;
}

View File

@@ -0,0 +1,11 @@
/* AUTOGENERATED file, do not edit
*
* part of @PROJECT_NAME@
* generated from libfreerdp/primitives/opencl/primitives.h.in
*
* with file contents of @FILENAME@
*/
#pragma once
/* Contents of the file named above, embedded as a char array initializer.
 * The FILEDATA placeholder is substituted when this template is configured.
 */
static const char openclProgram[] = { @FILEDATA@ };

View File

@@ -0,0 +1,82 @@
/* FreeRDP: A Remote Desktop Protocol Client
* YCoCg<->RGB Color conversion operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include "prim_YCoCg.h"
/* Turn a raw 8 bit chroma sample into a signed 16 bit value.
 *
 * The sample is shifted left by (shift - 1), the extra halving being folded
 * into the shift amount, then truncated to a signed 8 bit value and widened.
 * NOTE(review): assumes shift >= 1, otherwise the shift count is negative.
 */
static INT16 convert(UINT8 raw, int shift)
{
    const int shifted = raw << (shift - 1);
    const INT8 narrowed = (INT8)shifted;
    return (INT16)narrowed;
}
/* ------------------------------------------------------------------------- */
/* Portable YCoCg -> RGB conversion.
 *
 * Every source pixel is four bytes in the order Cg, Co, Y, A. The two chroma
 * bytes go through convert() with 'shift'; the resulting colour channels are
 * clipped and written with the pixel writer selected for DstFormat.
 * When withAlpha is FALSE the source alpha is replaced by 0xFF.
 * srcStep and dstStep are the byte distances between rows.
 */
static pstatus_t general_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
                                            BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                            INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
                                            BOOL withAlpha)
{
    const DWORD bytesPerPixel = FreeRDPGetBytesPerPixel(DstFormat);
    fkt_writePixel putPixel = getPixelWriteFunction(DstFormat, TRUE);

    for (size_t row = 0; row < height; row++)
    {
        const BYTE* in = &pSrc[row * WINPR_ASSERTING_INT_CAST(uint32_t, srcStep)];
        BYTE* out = &pDst[row * WINPR_ASSERTING_INT_CAST(uint32_t, dstStep)];

        for (size_t col = 0; col < width; col++, in += 4)
        {
            /* The shift has to happen before the sign conversion. */
            const INT16 cg = convert(in[0], shift);
            const INT16 co = convert(in[1], shift);
            const INT16 luma = in[2]; /* UINT8 -> INT16 */
            const BYTE srcAlpha = in[3];
            const BYTE alpha = withAlpha ? srcAlpha : 0xFFU;

            const INT16 tmp = (INT16)(luma - cg);
            const INT16 red = (INT16)(tmp - co);
            const INT16 green = (INT16)(luma + cg);
            const INT16 blue = (INT16)(tmp + co);

            out = putPixel(out, bytesPerPixel, DstFormat, CLIP(red), CLIP(green), CLIP(blue),
                           alpha);
        }
    }

    return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Install the generic C YCoCg -> RGB converter into the primitives table. */
void primitives_init_YCoCg(primitives_t* WINPR_RESTRICT prims)
{
prims->YCoCgToRGB_8u_AC4R = general_YCoCgToRGB_8u_AC4R;
}
/* Install the generic converter first, then run the SSSE3 and NEON
 * initializers. The generic entry is set before the optimized initializers
 * are called, so it remains in place if they leave the table untouched.
 */
void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_YCoCg(prims);
primitives_init_YCoCg_ssse3(prims);
primitives_init_YCoCg_neon(prims);
}

View File

@@ -0,0 +1,53 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives YCoCg
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_YCoCg_H
#define FREERDP_LIB_PRIM_YCoCg_H
#include <winpr/wtypes.h>
#include <winpr/sysinfo.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
FREERDP_LOCAL void primitives_init_YCoCg_ssse3_int(primitives_t* WINPR_RESTRICT prims);
/* Run the SSSE3 YCoCg initializer only when the CPU reports SSSE3, SSE2 and
 * SSE3 support; otherwise 'prims' is left unchanged.
 */
static inline void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prims)
{
    if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
        IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
        IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
    {
        primitives_init_YCoCg_ssse3_int(prims);
    }
}
FREERDP_LOCAL void primitives_init_YCoCg_neon_int(primitives_t* WINPR_RESTRICT prims);
/* Run the NEON YCoCg initializer only when the CPU reports NEON support;
 * otherwise 'prims' is left unchanged.
 */
static inline void primitives_init_YCoCg_neon(primitives_t* WINPR_RESTRICT prims)
{
    if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
    {
        primitives_init_YCoCg_neon_int(prims);
    }
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,51 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives YUV
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_YUV_H
#define FREERDP_LIB_PRIM_YUV_H
#include <winpr/wtypes.h>
#include <winpr/sysinfo.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
FREERDP_LOCAL void primitives_init_YUV_sse41_int(primitives_t* WINPR_RESTRICT prims);
/* Run the SSE4.1 YUV initializer only when both feature queries report
 * SSE4.1 support; otherwise 'prims' is left unchanged.
 */
static inline void primitives_init_YUV_sse41(primitives_t* WINPR_RESTRICT prims)
{
    if (IsProcessorFeaturePresentEx(PF_EX_SSE41) &&
        IsProcessorFeaturePresent(PF_SSE4_1_INSTRUCTIONS_AVAILABLE))
    {
        primitives_init_YUV_sse41_int(prims);
    }
}
FREERDP_LOCAL void primitives_init_YUV_neon_int(primitives_t* WINPR_RESTRICT prims);
/* Run the NEON YUV initializer only when the CPU reports NEON support;
 * otherwise 'prims' is left unchanged.
 */
static inline void primitives_init_YUV_neon(primitives_t* WINPR_RESTRICT prims)
{
    if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
    {
        primitives_init_YUV_neon_int(prims);
    }
}
#endif

View File

@@ -0,0 +1,83 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Add operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/
#include <freerdp/config.h>
#include <stdint.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include "prim_add.h"
/* ----------------------------------------------------------------------------
 * Saturating 16-bit signed addition: the sum is formed in 32 bits and
 * limited to the range [INT16_MIN, INT16_MAX].
 */
static inline INT16 add(INT16 a, INT16 b)
{
    const INT32 sum = (INT32)a + (INT32)b;

    if (sum < INT16_MIN)
        return INT16_MIN;
    if (sum > INT16_MAX)
        return INT16_MAX;
    return (INT16)sum;
}
/* Element-wise saturating addition: pDst[i] = add(pSrc1[i], pSrc2[i]) for
 * 'len' elements. The work is split into a part that is a multiple of 16
 * elements followed by the remainder. Always returns PRIMITIVES_SUCCESS.
 */
static pstatus_t general_add_16s(const INT16* WINPR_RESTRICT pSrc1,
                                 const INT16* WINPR_RESTRICT pSrc2, INT16* WINPR_RESTRICT pDst,
                                 UINT32 len)
{
    const UINT32 tail = len % 16;
    const UINT32 bulk = len - tail;
    UINT32 i = 0;

    /* multiple-of-16 portion */
    for (; i < bulk; i++)
        pDst[i] = add(pSrc1[i], pSrc2[i]);

    /* leftover elements */
    for (; i < len; i++)
        pDst[i] = add(pSrc1[i], pSrc2[i]);

    return PRIMITIVES_SUCCESS;
}
/* In-place saturating addition: each of the 'len' element pairs is summed
 * and the result is stored back into BOTH buffers.
 * Always returns PRIMITIVES_SUCCESS.
 */
static pstatus_t general_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
                                         INT16* WINPR_RESTRICT pSrcDst2, UINT32 len)
{
    INT16* first = pSrcDst1;
    INT16* second = pSrcDst2;
    UINT32 remaining = len;

    while (remaining > 0)
    {
        const INT16 sum = add(*first, *second);
        *first++ = sum;
        *second++ = sum;
        remaining--;
    }

    return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Register the generic C add routines in the primitives table. */
void primitives_init_add(primitives_t* WINPR_RESTRICT prims)
{
    prims->add_16s_inplace = general_add_16s_inplace;
    prims->add_16s = general_add_16s;
}
/* Install the generic add routines first, then run the SSE3 initializer.
 * The generic entries are set before the SSE3 initializer is called, so they
 * remain in place if it leaves the table untouched.
 */
void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_add(prims);
primitives_init_add_sse3(prims);
}

View File

@@ -0,0 +1,42 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives add
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_ADD_H
#define FREERDP_LIB_PRIM_ADD_H
#include <winpr/wtypes.h>
#include <winpr/sysinfo.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
FREERDP_LOCAL void primitives_init_add_sse3_int(primitives_t* WINPR_RESTRICT prims);
/* Run the SSE3 add initializer only when the CPU reports SSE2 and SSE3
 * (SSE3 is needed for LDDQU); otherwise 'prims' is left unchanged.
 */
static inline void primitives_init_add_sse3(primitives_t* WINPR_RESTRICT prims)
{
    if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
        IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
    {
        primitives_init_add_sse3_int(prims);
    }
}
#endif

View File

@@ -0,0 +1,98 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Alpha blending routines.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
* Note: this code assumes the second operand is fully opaque,
* e.g.
* newval = alpha1*val1 + (1-alpha1)*val2
* rather than
* newval = alpha1*val1 + (1-alpha1)*alpha2*val2
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include "prim_alphaComp.h"
/* Extract the top byte (bits 24..31) of a 32 bit pixel value. */
#define ALPHA(_k_) (((_k_)&0xFF000000U) >> 24)
/* ------------------------------------------------------------------------- */
/* Blend pSrc1 over pSrc2 into pDst, one 32 bit pixel at a time.
 *
 * The alpha of the first source decides the result: 255 copies the first
 * source, 0 copies the second, anything else mixes the two. The second
 * source is treated as fully opaque. The *Step arguments are byte distances
 * between rows. Always returns PRIMITIVES_SUCCESS.
 */
static pstatus_t general_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
                                        const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
                                        BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width,
                                        UINT32 height)
{
    for (size_t row = 0; row < height; row++)
    {
        const UINT32* upper = (const UINT32*)(pSrc1 + row * src1Step);
        const UINT32* lower = (const UINT32*)(pSrc2 + row * src2Step);
        UINT32* out = (UINT32*)(pDst + row * dstStep);

        for (size_t col = 0; col < width; col++)
        {
            const UINT32 top = upper[col];
            const UINT32 bottom = lower[col];
            /* alpha + 1, so that the blend can divide by 256 with a shift */
            const UINT32 weight = ALPHA(top) + 1;
            UINT32 pixel = 0;

            if (weight == 256)
            {
                /* fully opaque: take the first source */
                pixel = top;
            }
            else if (weight <= 1)
            {
                /* fully transparent: take the second source */
                pixel = bottom;
            }
            else
            {
                /* Two channels are processed per multiplication: red/blue in
                 * one word and alpha/green in the other. Dividing by 256
                 * instead of 255 is faster and is off by at most one.
                 */
                const UINT32 bottomRB = bottom & 0x00FF00FFU;
                const UINT32 bottomAG = (bottom >> 8) & 0x00FF00FFU;
                const UINT32 deltaRB = ((top & 0x00FF00FFU) - bottomRB) * weight;
                const UINT32 deltaAG = (((top >> 8) & 0x00FF00FFU) - bottomAG) * weight;
                const UINT32 mixedRB = ((deltaRB >> 8) + bottomRB) & 0x00FF00FFU;
                const UINT32 mixedAG = (((deltaAG >> 8) + bottomAG) << 8) & 0xFF00FF00U;
                pixel = mixedRB | mixedAG;
            }

            out[col] = pixel;
        }
    }

    return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Install the generic C alpha compositing routine into the primitives table. */
void primitives_init_alphaComp(primitives_t* WINPR_RESTRICT prims)
{
prims->alphaComp_argb = general_alphaComp_argb;
}
/* Install the generic routine first, then run the SSE3 initializer.
 * The generic entry is set before the SSE3 initializer is called, so it
 * remains in place if that initializer leaves the table untouched.
 */
void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_alphaComp(prims);
primitives_init_alphaComp_sse3(prims);
}

View File

@@ -0,0 +1,42 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives alpha compositing
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_ALPHA_COMP_H
#define FREERDP_LIB_PRIM_ALPHA_COMP_H
#include <winpr/wtypes.h>
#include <winpr/sysinfo.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
FREERDP_LOCAL void primitives_init_alphaComp_sse3_int(primitives_t* WINPR_RESTRICT prims);
/* Run the SSE3 alpha compositing initializer only when the CPU reports SSE2
 * and SSE3 (SSE3 is needed for LDDQU); otherwise 'prims' is left unchanged.
 */
static inline void primitives_init_alphaComp_sse3(primitives_t* WINPR_RESTRICT prims)
{
    if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
        IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
    {
        primitives_init_alphaComp_sse3_int(prims);
    }
}
#endif

View File

@@ -0,0 +1,66 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Logical operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include "prim_andor.h"
/* ----------------------------------------------------------------------------
 * 32-bit AND with a constant: pDst[i] = pSrc[i] & val for 'len' elements.
 *
 * A zero or negative 'len' writes nothing. The previous countdown loop
 * never terminated properly for a negative length and ran past the buffers.
 * Always returns PRIMITIVES_SUCCESS.
 *
 * NOTE(review): val == 0 returns without writing pDst (it is not zeroed) -
 * confirm callers rely on this before changing it.
 */
static pstatus_t general_andC_32u(const UINT32* WINPR_RESTRICT pSrc, UINT32 val,
                                  UINT32* WINPR_RESTRICT pDst, INT32 len)
{
    if (val == 0)
        return PRIMITIVES_SUCCESS;

    for (INT32 i = 0; i < len; i++)
        pDst[i] = pSrc[i] & val;

    return PRIMITIVES_SUCCESS;
}
/* ----------------------------------------------------------------------------
 * 32-bit OR with a constant: pDst[i] = pSrc[i] | val for 'len' elements.
 *
 * A zero or negative 'len' writes nothing. The previous countdown loop
 * never terminated properly for a negative length and ran past the buffers.
 * Always returns PRIMITIVES_SUCCESS.
 *
 * NOTE(review): val == 0 returns without writing pDst (the source is not
 * copied) - confirm callers rely on this before changing it.
 */
static pstatus_t general_orC_32u(const UINT32* WINPR_RESTRICT pSrc, UINT32 val,
                                 UINT32* WINPR_RESTRICT pDst, INT32 len)
{
    if (val == 0)
        return PRIMITIVES_SUCCESS;

    for (INT32 i = 0; i < len; i++)
        pDst[i] = pSrc[i] | val;

    return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Register the generic C AND/OR routines in the primitives table. */
void primitives_init_andor(primitives_t* WINPR_RESTRICT prims)
{
    /* Generic implementations are the baseline. */
    prims->orC_32u = general_orC_32u;
    prims->andC_32u = general_andC_32u;
}
/* Install the generic AND/OR routines first, then run the SSE3 initializer.
 * The generic entries are set before the SSE3 initializer is called, so they
 * remain in place if it leaves the table untouched.
 */
void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_andor(prims);
primitives_init_andor_sse3(prims);
}

View File

@@ -0,0 +1,42 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives and/or
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_ANDOR_H
#define FREERDP_LIB_PRIM_ANDOR_H
#include <winpr/wtypes.h>
#include <winpr/sysinfo.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
FREERDP_LOCAL void primitives_init_andor_sse3_int(primitives_t* WINPR_RESTRICT prims);
/* Run the SSE3 AND/OR initializer only when the CPU reports SSE2 and SSE3
 * support; otherwise 'prims' is left unchanged.
 */
static inline void primitives_init_andor_sse3(primitives_t* WINPR_RESTRICT prims)
{
    if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
        IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
    {
        primitives_init_andor_sse3_int(prims);
    }
}
#endif

View File

@@ -0,0 +1,576 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Color conversion operations.
* vi:ts=4 sw=4:
*
* Copyright 2011 Stephen Erisman
* Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
* Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <math.h>
#include <freerdp/config.h>
#include <winpr/assert.h>
#include <winpr/cast.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <freerdp/codec/color.h>
#include "prim_internal.h"
#include "prim_colors.h"
#ifndef MINMAX
/* Clamp _v_ to the closed range [_l_, _h_].
 * Function-like macro: each argument may be evaluated more than once, so do not
 * pass expressions with side effects. */
#define MINMAX(_v_, _l_, _h_) ((_v_) < (_l_) ? (_l_) : ((_v_) > (_h_) ? (_h_) : (_v_)))
#endif /* !MINMAX */
/* ------------------------------------------------------------------------- */
/* pregenerated table for ycbcr constants, indexed by divisor.
*
* The table holds 31 rows, i.e. one row per divisor value 0..30; the code in
* this file only reads row 16.
*
* rounded integer values derived from the following formula:
*
* { (1.402525f * 2^divisor), (0.714401f * 2^divisor), (0.343730f * 2^divisor), (1.769905f *
* 2^divisor) }
*
* Column use in the conversions of this file:
* [0] Cr -> R, [1] Cr -> G, [2] Cb -> G, [3] Cb -> B
*/
static const INT32 ycbcr_constants[][4] = { { 1, 1, 0, 2 },
{ 3, 1, 1, 4 },
{ 6, 3, 1, 7 },
{ 11, 6, 3, 14 },
{ 22, 11, 5, 28 },
{ 45, 23, 11, 57 },
{ 90, 46, 22, 113 },
{ 180, 91, 44, 227 },
{ 359, 183, 88, 453 },
{ 718, 366, 176, 906 },
{ 1436, 732, 352, 1812 },
{ 2872, 1463, 704, 3625 },
{ 5745, 2926, 1408, 7250 },
{ 11489, 5852, 2816, 14499 },
{ 22979, 11705, 5632, 28998 },
{ 45958, 23409, 11263, 57996 },
{ 91916, 46819, 22527, 115992 },
{ 183832, 93638, 45053, 231985 },
{ 367664, 187276, 90107, 463970 },
{ 735327, 374552, 180214, 927940 },
{ 1470654, 749104, 360427, 1855880 },
{ 2941308, 1498207, 720854, 3711760 },
{ 5882616, 2996415, 1441708, 7423520 },
{ 11765232, 5992830, 2883416, 14847039 },
{ 23530465, 11985660, 5766832, 29694078 },
{ 47060930, 23971320, 11533665, 59388157 },
{ 94121859, 47942640, 23067330, 118776314 },
{ 188243719, 95885279, 46134660, 237552628 },
{ 376487438, 191770558, 92269319, 475105256 },
{ 752974876, 383541116, 184538639, 950210512 },
{ 1505949752, 767082233, 369077277, 1900421023 } };
/* Convert three planar 16-bit Y/Cb/Cr planes into interleaved pixels written by
 * writePixelBGRX(). This is the variant the dispatcher selects for
 * PIXEL_FORMAT_BGRA32 / PIXEL_FORMAT_BGRX32.
 *
 * pSrc    : Y, Cb and Cr planes (in that order)
 * srcStep : bytes between source rows
 * pDst    : destination pixels, dstStep bytes between rows
 * roi     : width/height of the area to convert
 *
 * The arithmetic uses row 16 of ycbcr_constants (factors scaled by 2^16); the
 * result is shifted back by 16 and then by a further 5 bits. Always returns
 * PRIMITIVES_SUCCESS.
 */
static pstatus_t general_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3],
                                                      UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                      UINT32 dstStep, UINT32 DstFormat,
                                                      const prim_size_t* WINPR_RESTRICT roi)
{
	const INT32 divisor = 16; /* loop invariant: which table row / shift to use */
	const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
	/* Elements (source) and bytes (destination) to skip after each row. */
	const size_t srcPad = (srcStep - (roi->width * 2)) / 2;
	const size_t dstPad = (dstStep - (roi->width * 4));
	const INT16* lumaPlane = pSrc[0];
	const INT16* cbPlane = pSrc[1];
	const INT16* crPlane = pSrc[2];
	BYTE* out = pDst;

	for (UINT32 row = 0; row < roi->height; row++)
	{
		for (UINT32 col = 0; col < roi->width; col++)
		{
			/* Bias luma by 4096 and bring it to the same 2^16 scale as the products. */
			const INT32 Y = (INT32)((UINT32)((*lumaPlane++) + 4096) << divisor);
			const INT32 Cb = (*cbPlane++);
			const INT32 Cr = (*crPlane++);
			const INT32 CrR = WINPR_ASSERTING_INT_CAST(int32_t, Cr * ycbcr_constants[divisor][0]);
			const INT32 CrG = WINPR_ASSERTING_INT_CAST(int32_t, Cr * ycbcr_constants[divisor][1]);
			const INT32 CbG = WINPR_ASSERTING_INT_CAST(int32_t, Cb * ycbcr_constants[divisor][2]);
			const INT32 CbB = WINPR_ASSERTING_INT_CAST(int32_t, Cb * ycbcr_constants[divisor][3]);
			const INT16 R = WINPR_ASSERTING_INT_CAST(int16_t, ((CrR + Y) >> divisor) >> 5);
			const INT16 G = WINPR_ASSERTING_INT_CAST(int16_t, ((Y - CbG - CrG) >> divisor) >> 5);
			const INT16 B = WINPR_ASSERTING_INT_CAST(int16_t, ((CbB + Y) >> divisor) >> 5);

			out = writePixelBGRX(out, formatSize, DstFormat, CLIP(R), CLIP(G), CLIP(B), 0);
		}

		lumaPlane += srcPad;
		cbPlane += srcPad;
		crPlane += srcPad;
		out += dstPad;
	}

	return PRIMITIVES_SUCCESS;
}
/* Convert three planar 16-bit Y/Cb/Cr planes into interleaved pixels of an
 * arbitrary destination format. This is the fallback the dispatcher uses for
 * every format other than BGRA32/BGRX32.
 *
 * pSrc    : Y, Cb and Cr planes (in that order)
 * srcStep : bytes between source rows
 * pDst    : destination pixels, dstStep bytes between rows
 * roi     : width/height of the area to convert
 *
 * Uses row 16 of ycbcr_constants (factors scaled by 2^16) and shifts the sums
 * back by 16 + 5 bits. Always returns PRIMITIVES_SUCCESS.
 */
static pstatus_t general_yCbCrToRGB_16s8u_P3AC4R_general(const INT16* WINPR_RESTRICT pSrc[3],
                                                         UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                         UINT32 dstStep, UINT32 DstFormat,
                                                         const prim_size_t* WINPR_RESTRICT roi)
{
	BYTE* pRGB = pDst;
	const INT16* pY = pSrc[0];
	const INT16* pCb = pSrc[1];
	const INT16* pCr = pSrc[2];
	const fkt_writePixel writePixel = getPixelWriteFunction(DstFormat, FALSE);
	const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
	const size_t srcPad = (srcStep - (roi->width * 2)) / 2;
	/* The row padding must be derived from the real pixel size of DstFormat.
	 * A hard-coded 4 is only right for 32-bit formats; for narrower formats the
	 * unsigned subtraction could wrap and the row pointer would run away.
	 * NOTE(review): assumes writePixel advances the pointer by formatSize bytes
	 * per pixel - confirm against getPixelWriteFunction(). */
	const size_t dstPad = (dstStep - (roi->width * formatSize));

	for (UINT32 y = 0; y < roi->height; y++)
	{
		for (UINT32 x = 0; x < roi->width; x++)
		{
			const INT32 divisor = 16;
			/* Bias luma by 4096 and bring it to the same 2^16 scale as the products. */
			const INT32 Y = (INT32)((UINT32)((*pY++) + 4096) << divisor);
			const INT32 Cb = (*pCb++);
			const INT32 Cr = (*pCr++);
			const INT32 CrR = Cr * ycbcr_constants[divisor][0];
			const INT32 CrG = Cr * ycbcr_constants[divisor][1];
			const INT32 CbG = Cb * ycbcr_constants[divisor][2];
			const INT32 CbB = Cb * ycbcr_constants[divisor][3];
			const INT32 R = (CrR + Y) >> (divisor + 5);
			const INT32 G = (Y - CbG - CrG) >> (divisor + 5);
			const INT32 B = (CbB + Y) >> (divisor + 5);
			pRGB = writePixel(pRGB, formatSize, DstFormat, CLIP(R), CLIP(G), CLIP(B), 0);
		}

		pY += srcPad;
		pCb += srcPad;
		pCr += srcPad;
		pRGB += dstPad;
	}

	return PRIMITIVES_SUCCESS;
}
/* Dispatch the planar YCbCr -> interleaved RGB conversion on the destination
 * format: BGRA32/BGRX32 use the dedicated BGRX routine, everything else the
 * generic one. The result of the selected routine is returned unchanged. */
static pstatus_t general_yCbCrToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3],
                                                 UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                 UINT32 dstStep, UINT32 DstFormat,
                                                 const prim_size_t* WINPR_RESTRICT roi)
{
	if ((DstFormat == PIXEL_FORMAT_BGRA32) || (DstFormat == PIXEL_FORMAT_BGRX32))
	{
		return general_yCbCrToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, DstFormat,
		                                            roi);
	}

	return general_yCbCrToRGB_16s8u_P3AC4R_general(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
}
/* ------------------------------------------------------------------------- */
/* Convert planar 16-bit YCbCr (pSrc[0..2] = Y, Cb, Cr) to planar 16-bit RGB
 * (pDst[0..2] = R, G, B).
 *
 * srcStep / dstStep are the bytes between rows of the source / destination
 * planes, roi gives the width and height to convert.
 * Always returns PRIMITIVES_SUCCESS.
 */
static pstatus_t
general_yCbCrToRGB_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], INT32 srcStep,
                               INT16* WINPR_RESTRICT pDst[3], INT32 dstStep,
                               const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
{
	/**
	 * The decoded YCbCr coefficients are represented as 11.5 fixed-point
	 * numbers:
	 *
	 * 1 sign bit + 10 integer bits + 5 fractional bits
	 *
	 * However only 7 integer bits will be actually used since the value range
	 * is [-128.0, 127.0]. In other words, the decoded coefficients are scaled
	 * by << 5 when interpreted as INT16.
	 * It was scaled in the quantization phase, so we must scale it back here.
	 */
	const INT16* yptr = pSrc[0];
	const INT16* cbptr = pSrc[1];
	const INT16* crptr = pSrc[2];
	INT16* rptr = pDst[0];
	INT16* gptr = pDst[1];
	INT16* bptr = pDst[2];
	/* Elements to skip at the end of every source / destination row. */
	UINT32 srcbump = (WINPR_ASSERTING_INT_CAST(uint32_t, srcStep) - (roi->width * sizeof(UINT16))) /
	                 sizeof(UINT16);
	UINT32 dstbump = (WINPR_ASSERTING_INT_CAST(uint32_t, dstStep) - (roi->width * sizeof(UINT16))) /
	                 sizeof(UINT16);

	for (UINT32 y = 0; y < roi->height; y++)
	{
		for (UINT32 x = 0; x < roi->width; ++x)
		{
			/* INT32 is used intentionally because we calculate
			 * with shifted factors!
			 */
			INT32 cy = (INT32)(*yptr++);
			INT32 cb = (INT32)(*cbptr++);
			INT32 cr = (INT32)(*crptr++);
			INT64 r = 0;
			INT64 g = 0;
			INT64 b = 0;
			/*
			 * This is the slow floating point version kept here for reference.
			 * y = y + 4096; // 128<<5=4096 so that we can scale the sum by>>5
			 * r = y + cr*1.403f;
			 * g = y - cb*0.344f - cr*0.714f;
			 * b = y + cb*1.770f;
			 * y_r_buf[i] = CLIP(r>>5);
			 * cb_g_buf[i] = CLIP(g>>5);
			 * cr_b_buf[i] = CLIP(b>>5);
			 */
			/*
			 * The factors are taken from row 16 of ycbcr_constants, i.e. they
			 * are scaled by << 16 into integers in order to avoid slower
			 * floating point multiplications. Since the final result needs to
			 * be scaled by >> 5 we extract only the upper bits (>> 21) from
			 * the final sum. Hence the luma term is scaled by << 16 as well.
			 *
			 * Table columns: [0] Cr -> R (1.402525), [1] Cr -> G (0.714401),
			 *                [2] Cb -> G (0.343730), [3] Cb -> B (1.769905)
			 */
			cy = (INT32)((UINT32)(cy + 4096) << 16);
			r = 1LL * cy + 1LL * cr * ycbcr_constants[16][0];
			/* Cb must be weighted with column [2] and Cr with column [1];
			 * using them the other way round swaps the two green terms. */
			g = 1LL * cy - 1LL * cb * ycbcr_constants[16][2] - 1LL * cr * ycbcr_constants[16][1];
			b = 1LL * cy + 1LL * cb * ycbcr_constants[16][3];
			*rptr++ = CLIP(r >> 21);
			*gptr++ = CLIP(g >> 21);
			*bptr++ = CLIP(b >> 21);
		}

		yptr += srcbump;
		cbptr += srcbump;
		crptr += srcbump;
		rptr += dstbump;
		gptr += dstbump;
		bptr += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Convert planar 16-bit RGB (pSrc[0..2] = R, G, B) to planar 16-bit YCbCr
 * (pDst[0..2] = Y, Cb, Cr).
 *
 * srcStep / dstStep are the bytes between rows of the source / destination
 * planes, roi gives the width and height to convert.
 * Always returns PRIMITIVES_SUCCESS.
 */
static pstatus_t
general_RGBToYCbCr_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], INT32 srcStep,
                               INT16* WINPR_RESTRICT pDst[3], INT32 dstStep,
                               const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
{
	/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
	 * numbers:
	 *
	 * 1 sign bit + 10 integer bits + 5 fractional bits
	 *
	 * However only 7 integer bits will be actually used since the value
	 * range is [-128.0, 127.0]. In other words, the encoded coefficients
	 * are scaled by << 5 when interpreted as INT16.
	 * They will be scaled down to the original range during the quantization
	 * phase.
	 */
	const INT16* rptr = pSrc[0];
	const INT16* gptr = pSrc[1];
	const INT16* bptr = pSrc[2];
	INT16* yptr = pDst[0];
	INT16* cbptr = pDst[1];
	INT16* crptr = pDst[2];
	/* Elements to skip at the end of every source / destination row. */
	UINT32 srcbump = (WINPR_ASSERTING_INT_CAST(uint32_t, srcStep) - (roi->width * sizeof(UINT16))) /
	                 sizeof(UINT16);
	UINT32 dstbump = (WINPR_ASSERTING_INT_CAST(uint32_t, dstStep) - (roi->width * sizeof(UINT16))) /
	                 sizeof(UINT16);

	for (UINT32 y = 0; y < roi->height; y++)
	{
		for (UINT32 x = 0; x < roi->width; ++x)
		{
			/* INT32 is used intentionally because we calculate with
			 * shifted factors!
			 */
			INT32 r = (INT32)(*rptr++);
			INT32 g = (INT32)(*gptr++);
			INT32 b = (INT32)(*bptr++);
			/* We scale the factors by << 15 into 32-bit integers in order
			 * to avoid slower floating point multiplications. Since the
			 * terms need to be scaled by << 5 we simply scale the final
			 * sum by >> 10
			 *
			 * Y: 0.299000 << 15 = 9798, 0.587000 << 15 = 19235,
			 * 0.114000 << 15 = 3735
			 * Cb: 0.168935 << 15 = 5535, 0.331665 << 15 = 10868,
			 * 0.500590 << 15 = 16403
			 * Cr: 0.499813 << 15 = 16377, 0.418531 << 15 = 13714,
			 * 0.081282 << 15 = 2663
			 */
			INT32 cy = (r * 9798 + g * 19235 + b * 3735) >> 10;
			INT32 cb = (r * -5535 + g * -10868 + b * 16403) >> 10;
			INT32 cr = (r * 16377 + g * -13714 + b * -2663) >> 10;
			*yptr++ = (INT16)MINMAX(cy - 4096, -4096, 4095);
			*cbptr++ = (INT16)MINMAX(cb, -4096, 4095);
			*crptr++ = (INT16)MINMAX(cr, -4096, 4095);
		}

		/* R/G/B are the source planes and Y/Cb/Cr the destination planes, so
		 * the source pointers take the source row padding and the destination
		 * pointers the destination row padding. */
		rptr += srcbump;
		gptr += srcbump;
		bptr += srcbump;
		yptr += dstbump;
		cbptr += dstbump;
		crptr += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}
/* Write one scanline of planar 16-bit R/G/B samples to dst using the pixel
 * writer returned by getPixelWriteFunction() for DstFormat.
 *
 * Each sample is passed through CLIP before it is narrowed to a byte, exactly
 * like the format specific writeScanline* siblings do; an asserting narrowing
 * cast would instead reject (or wrap) any sample outside of 0..255.
 * The alpha argument handed to the pixel writer is 0.
 */
static inline void writeScanlineGeneric(BYTE* dst, DWORD formatSize, UINT32 DstFormat,
                                        const INT16* r, const INT16* g, const INT16* b, DWORD width)
{
	fkt_writePixel writePixel = getPixelWriteFunction(DstFormat, FALSE);

	for (UINT32 x = 0; x < width; x++)
	{
		const BYTE R = CLIP(*r++);
		const BYTE G = CLIP(*g++);
		const BYTE B = CLIP(*b++);
		/* The writer returns the position of the next pixel. */
		dst = writePixel(dst, formatSize, DstFormat, R, G, B, 0);
	}
}
/* Write one scanline as 3 bytes per pixel in the order R, G, B.
 * Every sample is passed through CLIP before it is stored; formatSize and
 * DstFormat are unused. */
static inline void writeScanlineRGB(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
                                    const INT16* g, const INT16* b, DWORD width)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(DstFormat);

	for (DWORD i = 0; i < width; i++)
	{
		/* Read all three samples before the first store. */
		const BYTE red = CLIP(r[i]);
		const BYTE green = CLIP(g[i]);
		const BYTE blue = CLIP(b[i]);
		dst[0] = red;
		dst[1] = green;
		dst[2] = blue;
		dst += 3;
	}
}
/* Write one scanline as 3 bytes per pixel in the order B, G, R.
 * Every sample is passed through CLIP before it is stored; formatSize and
 * DstFormat are unused. */
static inline void writeScanlineBGR(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
                                    const INT16* g, const INT16* b, DWORD width)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(DstFormat);

	for (DWORD i = 0; i < width; i++)
	{
		/* Read all three samples before the first store. */
		const BYTE red = CLIP(r[i]);
		const BYTE green = CLIP(g[i]);
		const BYTE blue = CLIP(b[i]);
		dst[0] = blue;
		dst[1] = green;
		dst[2] = red;
		dst += 3;
	}
}
/* Write one scanline as 4 bytes per pixel in the order B, G, R, 0xFF.
 * Every colour sample is passed through CLIP before it is stored; formatSize
 * and DstFormat are unused. */
static inline void writeScanlineBGRX(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
                                     const INT16* g, const INT16* b, DWORD width)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(DstFormat);

	for (DWORD i = 0; i < width; i++)
	{
		/* Read all three samples before the first store. */
		const BYTE red = CLIP(r[i]);
		const BYTE green = CLIP(g[i]);
		const BYTE blue = CLIP(b[i]);
		dst[0] = blue;
		dst[1] = green;
		dst[2] = red;
		dst[3] = 0xFF;
		dst += 4;
	}
}
/* Write one scanline as 4 bytes per pixel in the order R, G, B, 0xFF.
 * Every colour sample is passed through CLIP before it is stored; formatSize
 * and DstFormat are unused. */
static inline void writeScanlineRGBX(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
                                     const INT16* g, const INT16* b, DWORD width)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(DstFormat);

	for (DWORD i = 0; i < width; i++)
	{
		/* Read all three samples before the first store. */
		const BYTE red = CLIP(r[i]);
		const BYTE green = CLIP(g[i]);
		const BYTE blue = CLIP(b[i]);
		dst[0] = red;
		dst[1] = green;
		dst[2] = blue;
		dst[3] = 0xFF;
		dst += 4;
	}
}
/* Write one scanline as 4 bytes per pixel in the order 0xFF, B, G, R.
 * Every colour sample is passed through CLIP before it is stored; formatSize
 * and DstFormat are unused. */
static inline void writeScanlineXBGR(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
                                     const INT16* g, const INT16* b, DWORD width)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(DstFormat);

	for (DWORD i = 0; i < width; i++)
	{
		/* Read all three samples before the first store. */
		const BYTE red = CLIP(r[i]);
		const BYTE green = CLIP(g[i]);
		const BYTE blue = CLIP(b[i]);
		dst[0] = 0xFF;
		dst[1] = blue;
		dst[2] = green;
		dst[3] = red;
		dst += 4;
	}
}
/* Write one scanline as 4 bytes per pixel in the order 0xFF, R, G, B.
 * Every colour sample is passed through CLIP before it is stored; formatSize
 * and DstFormat are unused. */
static inline void writeScanlineXRGB(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
                                     const INT16* g, const INT16* b, DWORD width)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(DstFormat);

	for (DWORD i = 0; i < width; i++)
	{
		/* Read all three samples before the first store. */
		const BYTE red = CLIP(r[i]);
		const BYTE green = CLIP(g[i]);
		const BYTE blue = CLIP(b[i]);
		dst[0] = 0xFF;
		dst[1] = red;
		dst[2] = green;
		dst[3] = blue;
		dst += 4;
	}
}
/* Signature shared by all writeScanline* helpers:
 * (dst, formatSize, DstFormat, r, g, b, width). */
typedef void (*fkt_writeScanline)(BYTE*, DWORD, UINT32, const INT16*, const INT16*, const INT16*,
                                  DWORD);

/* Select the scanline writer for a destination pixel format. The listed
 * 32-bit and 24-bit formats get a dedicated writer, every other format falls
 * back to writeScanlineGeneric. */
static inline fkt_writeScanline getScanlineWriteFunction(DWORD format)
{
	fkt_writeScanline writer = writeScanlineGeneric;

	switch (format)
	{
		case PIXEL_FORMAT_RGB24:
			writer = writeScanlineRGB;
			break;
		case PIXEL_FORMAT_BGR24:
			writer = writeScanlineBGR;
			break;
		case PIXEL_FORMAT_BGRA32:
		case PIXEL_FORMAT_BGRX32:
			writer = writeScanlineBGRX;
			break;
		case PIXEL_FORMAT_RGBA32:
		case PIXEL_FORMAT_RGBX32:
			writer = writeScanlineRGBX;
			break;
		case PIXEL_FORMAT_ABGR32:
		case PIXEL_FORMAT_XBGR32:
			writer = writeScanlineXBGR;
			break;
		case PIXEL_FORMAT_ARGB32:
		case PIXEL_FORMAT_XRGB32:
			writer = writeScanlineXRGB;
			break;
		default:
			break;
	}

	return writer;
}
/* ------------------------------------------------------------------------- */
/* Interleave planar 16-bit R/G/B data into pixels of DstFormat, one scanline
 * at a time, using the writer chosen by getScanlineWriteFunction().
 * Always returns PRIMITIVES_SUCCESS. */
static pstatus_t general_RGBToRGB_16s8u_P3AC4R_general(
    const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
    UINT32 srcStep,                      /* bytes between rows in source data */
    BYTE* WINPR_RESTRICT pDst,           /* 32-bit interleaved ARGB (ABGR?) data */
    UINT32 dstStep,                      /* bytes between rows in dest data */
    UINT32 DstFormat, const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
{
	const DWORD bytesPerPixel = FreeRDPGetBytesPerPixel(DstFormat);
	const fkt_writeScanline writeRow = getScanlineWriteFunction(DstFormat);
	/* The source step is given in bytes, the planes are addressed in INT16 elements. */
	const DWORD samplesPerRow = srcStep / sizeof(INT16);
	const INT16* redRow = pSrc[0];
	const INT16* greenRow = pSrc[1];
	const INT16* blueRow = pSrc[2];
	BYTE* dstRow = pDst;

	for (UINT32 row = 0; row < roi->height; ++row)
	{
		writeRow(dstRow, bytesPerPixel, DstFormat, redRow, greenRow, blueRow, roi->width);
		dstRow += dstStep;
		redRow += samplesPerRow;
		greenRow += samplesPerRow;
		blueRow += samplesPerRow;
	}

	return PRIMITIVES_SUCCESS;
}
/* Interleave planar 16-bit R/G/B data into B, G, R, 0xFF pixels by calling
 * writeScanlineBGRX() for every row of the region of interest.
 * Always returns PRIMITIVES_SUCCESS. */
static pstatus_t general_RGBToRGB_16s8u_P3AC4R_BGRX(
    const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
    UINT32 srcStep,                      /* bytes between rows in source data */
    BYTE* WINPR_RESTRICT pDst,           /* 32-bit interleaved ARGB (ABGR?) data */
    UINT32 dstStep,                      /* bytes between rows in dest data */
    UINT32 DstFormat, const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
{
	const DWORD bytesPerPixel = FreeRDPGetBytesPerPixel(DstFormat);
	/* The source step is given in bytes, the planes are addressed in INT16 elements. */
	const DWORD samplesPerRow = srcStep / sizeof(INT16);
	const INT16* redRow = pSrc[0];
	const INT16* greenRow = pSrc[1];
	const INT16* blueRow = pSrc[2];
	BYTE* dstRow = pDst;

	for (UINT32 row = 0; row < roi->height; ++row)
	{
		writeScanlineBGRX(dstRow, bytesPerPixel, DstFormat, redRow, greenRow, blueRow,
		                  roi->width);
		dstRow += dstStep;
		redRow += samplesPerRow;
		greenRow += samplesPerRow;
		blueRow += samplesPerRow;
	}

	return PRIMITIVES_SUCCESS;
}
/* Dispatch the planar RGB -> interleaved RGB conversion on the destination
 * format: BGRA32/BGRX32 use the dedicated BGRX routine, everything else the
 * generic one. The result of the selected routine is returned unchanged. */
static pstatus_t
general_RGBToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
                              UINT32 srcStep,            /* bytes between rows in source data */
                              BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
                              UINT32 dstStep,            /* bytes between rows in dest data */
                              UINT32 DstFormat,
                              const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
{
	if ((DstFormat == PIXEL_FORMAT_BGRA32) || (DstFormat == PIXEL_FORMAT_BGRX32))
		return general_RGBToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, DstFormat, roi);

	return general_RGBToRGB_16s8u_P3AC4R_general(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
}
/* ------------------------------------------------------------------------- */
/* Install the portable C implementations of the colour conversion primitives.
 * The four slots are independent of each other. */
void primitives_init_colors(primitives_t* WINPR_RESTRICT prims)
{
	/* planar RGB -> interleaved RGB / planar YCbCr */
	prims->RGBToRGB_16s8u_P3AC4R = general_RGBToRGB_16s8u_P3AC4R;
	prims->RGBToYCbCr_16s16s_P3P3 = general_RGBToYCbCr_16s16s_P3P3;
	/* planar YCbCr -> planar / interleaved RGB */
	prims->yCbCrToRGB_16s16s_P3P3 = general_yCbCrToRGB_16s16s_P3P3;
	prims->yCbCrToRGB_16s8u_P3AC4R = general_yCbCrToRGB_16s8u_P3AC4R;
}
/* ------------------------------------------------------------------------- */
/* Initialise the colour primitives for optimised use.
 * The generic initialiser runs first; the SSE2 and NEON initialisers run
 * afterwards, so whatever they assign takes precedence over the portable
 * defaults. */
void primitives_init_colors_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_colors(prims);
primitives_init_colors_sse2(prims);
primitives_init_colors_neon(prims);
}

View File

@@ -0,0 +1,51 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives colors
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_COLORS_H
#define FREERDP_LIB_PRIM_COLORS_H
#include <winpr/wtypes.h>
#include <winpr/sysinfo.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
FREERDP_LOCAL void primitives_init_colors_sse2_int(primitives_t* WINPR_RESTRICT prims);
/* Runtime gate for the SSE colour primitives: primitives_init_colors_sse2_int()
 * is only called when the CPU reports both SSE2 and SSE3; otherwise prims is
 * left untouched. */
static inline void primitives_init_colors_sse2(primitives_t* WINPR_RESTRICT prims)
{
	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
	{
		primitives_init_colors_sse2_int(prims);
	}
}
FREERDP_LOCAL void primitives_init_colors_neon_int(primitives_t* WINPR_RESTRICT prims);
/* Runtime gate for the NEON colour primitives: primitives_init_colors_neon_int()
 * is only called when the CPU reports NEON support; otherwise prims is left
 * untouched. */
static inline void primitives_init_colors_neon(primitives_t* WINPR_RESTRICT prims)
{
	if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
		primitives_init_colors_neon_int(prims);
}
#endif

View File

@@ -0,0 +1,439 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Copy operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <freerdp/log.h>
#include "prim_internal.h"
#include "prim_copy.h"
#include "../codec/color.h"
#include <freerdp/codec/color.h>
/* Primitives table consulted by general_copy_8u_AC4r() for overlapping copies.
 * NOTE(review): initialised to nullptr and no assignment to it is visible in
 * this file - confirm it is set before it is dereferenced. */
static primitives_t* generic = nullptr;
/* ------------------------------------------------------------------------- */
/* Report whether two byte ranges of the same length overlap.
 * Returns TRUE when the range starting at the lower address, extended by
 * 'bytes', reaches beyond the start of the higher one; FALSE otherwise. */
static BOOL memory_regions_overlap_1d(const BYTE* p1, const BYTE* p2, size_t bytes)
{
	const ULONG_PTR a = (const ULONG_PTR)p1;
	const ULONG_PTR b = (const ULONG_PTR)p2;
	/* Order the two start addresses, then a single comparison is enough. */
	const ULONG_PTR lower = (a <= b) ? a : b;
	const ULONG_PTR upper = (a <= b) ? b : a;

	return (lower + bytes > upper) ? TRUE : FALSE;
}
/* ------------------------------------------------------------------------- */
/*static inline BOOL memory_regions_overlap_2d( */
static BOOL memory_regions_overlap_2d(const BYTE* p1, int p1Step, int p1Size, const BYTE* p2,
int p2Step, int p2Size, int width, int height)
{
ULONG_PTR p1m = (ULONG_PTR)p1;
ULONG_PTR p2m = (ULONG_PTR)p2;
if (p1m <= p2m)
{
ULONG_PTR p1mEnd = p1m +
1ull * (WINPR_ASSERTING_INT_CAST(uint32_t, height - 1)) *
WINPR_ASSERTING_INT_CAST(uint32_t, p1Step) +
1ull * WINPR_ASSERTING_INT_CAST(uint32_t, width* p1Size);
if (p1mEnd > p2m)
return TRUE;
}
else
{
ULONG_PTR p2mEnd = p2m +
1ull * (WINPR_ASSERTING_INT_CAST(uintptr_t, height - 1)) *
WINPR_ASSERTING_INT_CAST(uintptr_t, p2Step) +
1ull * WINPR_ASSERTING_INT_CAST(uintptr_t, width* p2Size);
if (p2mEnd > p1m)
return TRUE;
}
/* else */
return FALSE;
}
/* ------------------------------------------------------------------------- */
/* Copy len bytes from pSrc to pDst. memmove() is used when the two ranges
 * overlap according to memory_regions_overlap_1d(), memcpy() otherwise.
 * Always returns PRIMITIVES_SUCCESS. */
static pstatus_t general_copy_8u(const BYTE* WINPR_RESTRICT pSrc, BYTE* WINPR_RESTRICT pDst,
                                 INT32 len)
{
	const size_t count = (size_t)len;

	if (!memory_regions_overlap_1d(pSrc, pDst, count))
		memcpy((void*)pDst, (const void*)pSrc, count);
	else
		memmove((void*)pDst, (const void*)pSrc, count);

	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Copy a block of 32-bit pixels from one buffer to another.
 * The addresses are assumed to have been already offset to the upper-left
 * corners of the source and destination region of interest.
 *
 * srcStep / dstStep : bytes between rows of the source / destination
 * width / height    : size of the block in pixels / rows
 *
 * Returns PRIMITIVES_SUCCESS, or the status of the row copy that failed when
 * the regions overlap.
 */
static pstatus_t general_copy_8u_AC4r(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
                                      BYTE* WINPR_RESTRICT pDst, INT32 dstStep, INT32 width,
                                      INT32 height)
{
	const BYTE* src = pSrc;
	BYTE* dst = pDst;
	const size_t rowbytes = WINPR_ASSERTING_INT_CAST(size_t, width) * sizeof(UINT32);

	if ((width == 0) || (height == 0))
		return PRIMITIVES_SUCCESS;

	if (memory_regions_overlap_2d(pSrc, srcStep, sizeof(UINT32), pDst, dstStep, sizeof(UINT32),
	                              width, height))
	{
		const INT32 count = WINPR_ASSERTING_INT_CAST(int32_t, rowbytes);

		do
		{
			/* 'generic' is a file-local pointer that starts out as a null
			 * pointer and is not assigned anywhere in this file, so it must
			 * not be dereferenced blindly. Fall back to the overlap-aware
			 * byte copy of this file when it is unset. */
			const pstatus_t rc =
			    generic ? generic->copy(src, dst, count) : general_copy_8u(src, dst, count);

			if (rc != PRIMITIVES_SUCCESS)
				return rc;

			src += srcStep;
			dst += dstStep;
		} while (--height);
	}
	else
	{
		/* TODO: do it in one operation when the rowdata is adjacent. */
		do
		{
			/* If we find a replacement for memcpy that is consistently
			 * faster, this could be replaced with that.
			 */
			memcpy(dst, src, rowbytes);
			src += srcStep;
			dst += dstStep;
		} while (--height);
	}

	return PRIMITIVES_SUCCESS;
}
/* Copy the first three bytes of every 3-byte source pixel into the first three
 * bytes of the matching 4-byte destination pixel; the fourth destination byte
 * is not written.
 *
 * Row addresses are computed as
 *   data + vMultiplier * (y + nY) * step + vOffset
 * for source and destination alike. Always returns PRIMITIVES_SUCCESS.
 */
static inline pstatus_t generic_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData,
                                                        UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
                                                        UINT32 nWidth, UINT32 nHeight,
                                                        const BYTE* WINPR_RESTRICT pSrcData,
                                                        UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                        int64_t srcVMultiplier, int64_t srcVOffset,
                                                        int64_t dstVMultiplier, int64_t dstVOffset)
{
	const int64_t srcBpp = 3;
	const int64_t dstBpp = 4;
	/* Largest multiple of 8 not exceeding nWidth: bound of the unrolled loop. */
	const UINT32 unrolledWidth = nWidth - nWidth % 8;

	for (int64_t row = 0; row < nHeight; row++)
	{
		const BYTE* WINPR_RESTRICT srcRow =
		    &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstRow =
		    &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];
		int64_t col = 0;

		WINPR_PRAGMA_UNROLL_LOOP
		for (; col < unrolledWidth; col++)
		{
			const BYTE* in = &srcRow[(col + nXSrc) * srcBpp];
			BYTE* out = &dstRow[(col + nXDst) * dstBpp];
			out[0] = in[0];
			out[1] = in[1];
			out[2] = in[2];
		}

		/* Remaining pixels of the row. */
		for (; col < nWidth; col++)
		{
			const BYTE* in = &srcRow[(col + nXSrc) * srcBpp];
			BYTE* out = &dstRow[(col + nXDst) * dstBpp];
			out[0] = in[0];
			out[1] = in[1];
			out[2] = in[2];
		}
	}

	return PRIMITIVES_SUCCESS;
}
/* Copy the first three bytes of every 4-byte source pixel into the first three
 * bytes of the matching 4-byte destination pixel; the fourth destination byte
 * is not written.
 *
 * Row addresses are computed as
 *   data + vMultiplier * (y + nY) * step + vOffset
 * for source and destination alike. Always returns PRIMITIVES_SUCCESS.
 */
static inline pstatus_t
generic_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep, UINT32 nXDst,
                                 UINT32 nYDst, UINT32 nWidth, UINT32 nHeight,
                                 const BYTE* WINPR_RESTRICT pSrcData, UINT32 nSrcStep, UINT32 nXSrc,
                                 UINT32 nYSrc, int64_t srcVMultiplier, int64_t srcVOffset,
                                 int64_t dstVMultiplier, int64_t dstVOffset)
{
	const int64_t srcBpp = 4;
	const int64_t dstBpp = 4;
	/* Largest multiple of 8 not exceeding nWidth: bound of the unrolled loop. */
	const UINT32 unrolledWidth = nWidth - nWidth % 8;

	for (int64_t row = 0; row < nHeight; row++)
	{
		const BYTE* WINPR_RESTRICT srcRow =
		    &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstRow =
		    &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];
		int64_t col = 0;

		WINPR_PRAGMA_UNROLL_LOOP
		for (; col < unrolledWidth; col++)
		{
			const BYTE* in = &srcRow[(col + nXSrc) * srcBpp];
			BYTE* out = &dstRow[(col + nXDst) * dstBpp];
			out[0] = in[0];
			out[1] = in[1];
			out[2] = in[2];
		}

		/* Remaining pixels of the row. */
		for (; col < nWidth; col++)
		{
			const BYTE* in = &srcRow[(col + nXSrc) * srcBpp];
			BYTE* out = &dstRow[(col + nXDst) * dstBpp];
			out[0] = in[0];
			out[1] = in[1];
			out[2] = in[2];
		}
	}

	return PRIMITIVES_SUCCESS;
}
/* Copy a rectangle pixel by pixel while converting from SrcFormat to
 * DstFormat: every pixel is read with FreeRDPReadColor_int(), converted with
 * FreeRDPConvertColor() (using 'palette') and stored with
 * FreeRDPWriteColor_int().
 *
 * Row addresses are computed as
 *   data + vMultiplier * (y + nY) * step + vOffset
 * for source and destination alike.
 *
 * Returns PRIMITIVES_SUCCESS, or -1 as soon as a pixel cannot be written.
 */
pstatus_t generic_image_copy_no_overlap_convert(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier, int64_t dstVOffset)
{
	const int64_t srcBpp = FreeRDPGetBytesPerPixel(SrcFormat);
	const int64_t dstBpp = FreeRDPGetBytesPerPixel(DstFormat);

	for (int64_t row = 0; row < nHeight; row++)
	{
		const BYTE* WINPR_RESTRICT srcRow =
		    &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstRow =
		    &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];

		/* One pass over the whole row, left to right. */
		for (int64_t col = 0; col < nWidth; col++)
		{
			const BYTE* in = &srcRow[(col + nXSrc) * srcBpp];
			BYTE* out = &dstRow[(col + nXDst) * dstBpp];
			const UINT32 srcColor = FreeRDPReadColor_int(in, SrcFormat);
			const UINT32 converted = FreeRDPConvertColor(srcColor, SrcFormat, DstFormat, palette);

			if (!FreeRDPWriteColor_int(out, DstFormat, converted))
				return -1;
		}
	}

	return PRIMITIVES_SUCCESS;
}
/* Copy a rectangle row by row with memcpy(), without any pixel conversion.
 * Each row copies nWidth * bytes-per-pixel(DstFormat) bytes; the horizontal
 * offsets are nXSrc / nXDst pixels of the respective format.
 *
 * Row addresses are computed as
 *   data + vMultiplier * (y + nY) * step + vOffset
 * for source and destination alike. 'palette' and 'flags' are unused.
 * Always returns PRIMITIVES_SUCCESS.
 */
pstatus_t generic_image_copy_no_overlap_memcpy(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
    WINPR_ATTR_UNUSED const gdiPalette* WINPR_RESTRICT palette, int64_t srcVMultiplier,
    int64_t srcVOffset, int64_t dstVMultiplier, int64_t dstVOffset, WINPR_ATTR_UNUSED UINT32 flags)
{
	const int64_t dstBpp = FreeRDPGetBytesPerPixel(DstFormat);
	const int64_t srcBpp = FreeRDPGetBytesPerPixel(SrcFormat);
	const int64_t rowBytes = nWidth * dstBpp;
	const int64_t srcColumn = nXSrc * srcBpp;
	const int64_t dstColumn = nXDst * dstBpp;

	for (int64_t row = 0; row < nHeight; row++)
	{
		const int64_t srcRowStart = srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset;
		const int64_t dstRowStart = dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset;
		const BYTE* WINPR_RESTRICT from = &pSrcData[srcRowStart];
		BYTE* WINPR_RESTRICT to = &pDstData[dstRowStart];

		memcpy(to + dstColumn, from + srcColumn, WINPR_ASSERTING_INT_CAST(size_t, rowBytes));
	}

	return PRIMITIVES_SUCCESS;
}
/* Copy a rectangle for the "keep destination alpha" case.
 *
 * Format pairs with a matching byte layout are served by fast paths that only
 * write the three colour bytes of every destination pixel:
 *   BGR24            -> BGRX32 / BGRA32
 *   RGB24            -> RGBX32 / RGBA32
 *   BGRX32 / BGRA32  -> BGRX32 / BGRA32
 *   RGBX32 / RGBA32  -> RGBX32 / RGBA32
 * Every other pair is handled by generic_image_copy_no_overlap_convert().
 *
 * The 24 -> 32 bit helper reads 3-byte source pixels and writes 4-byte
 * destination pixels, so it may only be used with a 24-bit SOURCE. Calling it
 * for a 32-bit source and an RGB24 destination would read and write with the
 * wrong pixel strides; that pair therefore goes through the generic converter.
 *
 * Returns the status of the routine that performed the copy.
 */
static inline pstatus_t generic_image_copy_no_overlap_dst_alpha(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier, int64_t dstVOffset)
{
	WINPR_ASSERT(pDstData);
	WINPR_ASSERT(pSrcData);

	switch (SrcFormat)
	{
		case PIXEL_FORMAT_BGR24:
			switch (DstFormat)
			{
				case PIXEL_FORMAT_BGRX32:
				case PIXEL_FORMAT_BGRA32:
					return generic_image_copy_bgr24_bgrx32(
					    pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
					    nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
				default:
					break;
			}
			break;
		case PIXEL_FORMAT_RGB24:
			switch (DstFormat)
			{
				case PIXEL_FORMAT_RGBX32:
				case PIXEL_FORMAT_RGBA32:
					/* Same byte-wise copy as BGR24 -> BGRX32, the helper is
					 * agnostic of the channel order. */
					return generic_image_copy_bgr24_bgrx32(
					    pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
					    nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
				default:
					break;
			}
			break;
		case PIXEL_FORMAT_BGRX32:
		case PIXEL_FORMAT_BGRA32:
			switch (DstFormat)
			{
				case PIXEL_FORMAT_BGRX32:
				case PIXEL_FORMAT_BGRA32:
					return generic_image_copy_bgrx32_bgrx32(
					    pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
					    nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
				default:
					break;
			}
			break;
		case PIXEL_FORMAT_RGBX32:
		case PIXEL_FORMAT_RGBA32:
			switch (DstFormat)
			{
				case PIXEL_FORMAT_RGBX32:
				case PIXEL_FORMAT_RGBA32:
					return generic_image_copy_bgrx32_bgrx32(
					    pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
					    nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
				default:
					break;
			}
			break;
		default:
			break;
	}

	/* No fast path: convert pixel by pixel. */
	return generic_image_copy_no_overlap_convert(
	    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
	    nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
}
/* Copy a rectangle when the destination alpha does not have to be kept.
 * Formats that FreeRDPAreColorFormatsEqualNoAlpha() reports as equal are
 * copied row-wise with memcpy, all others are converted pixel by pixel.
 * Returns the status of the routine that performed the copy. */
static inline pstatus_t generic_image_copy_no_overlap_no_alpha(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier, int64_t dstVOffset,
    UINT32 flags)
{
	if (!FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
	{
		return generic_image_copy_no_overlap_convert(
		    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat,
		    nSrcStep, nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier,
		    dstVOffset);
	}

	return generic_image_copy_no_overlap_memcpy(
	    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
	    nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset, flags);
}
/* Copy a rectangle between two images.
 *
 * - An empty rectangle (nWidth or nHeight of 0) succeeds without doing anything.
 * - A dimension above INT32_MAX or a missing buffer yields -1.
 * - A step of 0 is replaced by nWidth * bytes-per-pixel of the respective format.
 * - FREERDP_FLIP_VERTICAL makes the source rows be read bottom-up.
 * - FREERDP_KEEP_DST_ALPHA together with a destination format that has alpha
 *   selects the alpha preserving copy, otherwise the plain copy is used.
 *
 * Returns the status of the routine that performed the copy.
 */
static pstatus_t generic_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
                                               UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
                                               UINT32 nWidth, UINT32 nHeight,
                                               const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
                                               UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                               const gdiPalette* WINPR_RESTRICT palette,
                                               UINT32 flags)
{
	/* The destination is always walked top-down from its first row. */
	const int64_t dstVOffset = 0;
	const int64_t dstVMultiplier = 1;

	if ((nWidth == 0) || (nHeight == 0))
		return PRIMITIVES_SUCCESS;

	if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
		return -1;

	if (!(pDstData && pSrcData))
		return -1;

	if (nDstStep == 0)
		nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);

	if (nSrcStep == 0)
		nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);

	/* For a vertical flip start at the last source row and walk upwards. */
	const BOOL flipSource = (flags & FREERDP_FLIP_VERTICAL) != 0;
	const int64_t srcVMultiplier = flipSource ? -1 : 1;
	const int64_t srcVOffset = flipSource ? (nHeight - 1ll) * nSrcStep : 0;

	const BOOL keepDstAlpha =
	    ((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat);

	if (!keepDstAlpha)
	{
		return generic_image_copy_no_overlap_no_alpha(
		    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat,
		    nSrcStep, nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset,
		    flags);
	}

	return generic_image_copy_no_overlap_dst_alpha(
	    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
	    nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
}
/* ------------------------------------------------------------------------- */
/* Install the portable C implementations of the copy primitives. */
void primitives_init_copy(primitives_t* WINPR_RESTRICT prims)
{
/* Start with the default. */
prims->copy_8u = general_copy_8u;
prims->copy_8u_AC4r = general_copy_8u_AC4r;
/* 'copy' reuses the copy_8u slot assigned above, cast to fn_copy_t, so this
 * line has to stay after the copy_8u assignment. */
prims->copy = WINPR_FUNC_PTR_CAST(prims->copy_8u, fn_copy_t);
prims->copy_no_overlap = generic_image_copy_no_overlap;
}
/* Install the copy primitives for the optimized table: first the generic
 * defaults, then the SSE4.1 initializer and (only when built WITH_AVX2) the
 * AVX2 initializer. The order matters because each call operates on the same
 * table after the previous one. */
void primitives_init_copy_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_copy(prims);
primitives_init_copy_sse41(prims);
#if defined(WITH_AVX2)
primitives_init_copy_avx2(prims);
#endif
}

View File

@@ -0,0 +1,63 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives copy
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_COPY_H
#define FREERDP_LIB_PRIM_COPY_H
#include <winpr/wtypes.h>
#include <winpr/sysinfo.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
/* Library-local copy helper; judging by its name it handles copies that need a
 * pixel format conversion (NOTE(review): confirm against the implementation).
 * The src/dst V multiplier and offset parameters are the vertical walk
 * direction and start offset computed by the caller. */
WINPR_ATTR_NODISCARD FREERDP_LOCAL pstatus_t generic_image_copy_no_overlap_convert(
BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier, int64_t dstVOffset);
/* Library-local copy helper with the same parameters plus the caller's flags;
 * presumably the plain memcpy path for matching formats (NOTE(review): confirm). */
WINPR_ATTR_NODISCARD FREERDP_LOCAL pstatus_t generic_image_copy_no_overlap_memcpy(
BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier, int64_t dstVOffset,
UINT32 flags);
/* SSE4.1 copy initializer, implemented in a separate translation unit. */
FREERDP_LOCAL void primitives_init_copy_sse41_int(primitives_t* WINPR_RESTRICT prims);

/* Run the SSE4.1 copy initializer only when the CPU reports SSE4.1 support;
 * otherwise leave the table untouched. */
static inline void primitives_init_copy_sse41(primitives_t* WINPR_RESTRICT prims)
{
	if (IsProcessorFeaturePresent(PF_SSE4_1_INSTRUCTIONS_AVAILABLE))
		primitives_init_copy_sse41_int(prims);
}
#if defined(WITH_AVX2)
/* AVX2 copy initializer, implemented in a separate translation unit. */
FREERDP_LOCAL void primitives_init_copy_avx2_int(primitives_t* WINPR_RESTRICT prims);

/* Run the AVX2 copy initializer only when the CPU reports AVX2 support;
 * otherwise leave the table untouched. */
static inline void primitives_init_copy_avx2(primitives_t* WINPR_RESTRICT prims)
{
	if (IsProcessorFeaturePresent(PF_AVX2_INSTRUCTIONS_AVAILABLE))
		primitives_init_copy_avx2_int(prims);
}
#endif
#endif

View File

@@ -0,0 +1,352 @@
/* prim_internal.h
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*
*/
#ifndef FREERDP_LIB_PRIM_INTERNAL_H
#define FREERDP_LIB_PRIM_INTERNAL_H
#include <winpr/platform.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
#include <freerdp/api.h>
#include "../core/simd.h"
/* 16 byte (128 bit) alignment attribute. */
#define PRIM_ALIGN_128 DECLSPEC_ALIGN(16)
/* Defined when any accelerated backend (SSE/AVX, NEON or OpenCL) is built in. */
#if defined(SSE_AVX_INTRINSICS_ENABLED) || defined(NEON_INTRINSICS_ENABLED) || defined(WITH_OPENCL)
#define HAVE_OPTIMIZED_PRIMITIVES 1
#endif
/* Defined when a CPU SIMD backend (SSE/AVX or NEON) is built in; OpenCL alone
 * does not set it. */
#if defined(SSE_AVX_INTRINSICS_ENABLED) || defined(NEON_INTRINSICS_ENABLED)
#define HAVE_CPU_OPTIMIZED_PRIMITIVES 1
#endif
/* Store one pixel as the byte sequence B, G, R, A and return the address just
 * past it. formatSize and format are ignored. */
WINPR_ATTR_NODISCARD
static inline BYTE* writePixelBGRA(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
                                   BYTE B, BYTE A)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(format);

	dst[0] = B;
	dst[1] = G;
	dst[2] = R;
	dst[3] = A;
	return &dst[4];
}
/* Store one pixel as B, G, R and skip the fourth byte without writing it.
 * Returns the address just past the 4 byte pixel; formatSize, format and A are
 * ignored. */
WINPR_ATTR_NODISCARD
static inline BYTE* writePixelBGRX(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
                                   BYTE B, BYTE A)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(format);
	WINPR_UNUSED(A);

	dst[0] = B;
	dst[1] = G;
	dst[2] = R;
	/* dst[3] is intentionally left as it is */
	return &dst[4];
}
/* Store one pixel as the byte sequence R, G, B, A and return the address just
 * past it. formatSize and format are ignored. */
WINPR_ATTR_NODISCARD
static inline BYTE* writePixelRGBA(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
                                   BYTE B, BYTE A)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(format);

	dst[0] = R;
	dst[1] = G;
	dst[2] = B;
	dst[3] = A;
	return &dst[4];
}
/* Store one pixel as R, G, B and skip the fourth byte without writing it.
 * Returns the address just past the 4 byte pixel; formatSize, format and A are
 * ignored. */
WINPR_ATTR_NODISCARD
static inline BYTE* writePixelRGBX(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
                                   BYTE B, BYTE A)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(format);
	WINPR_UNUSED(A);

	dst[0] = R;
	dst[1] = G;
	dst[2] = B;
	/* dst[3] is intentionally left as it is */
	return &dst[4];
}
/* Store one pixel as the byte sequence A, B, G, R and return the address just
 * past it. formatSize and format are ignored. */
WINPR_ATTR_NODISCARD
static inline BYTE* writePixelABGR(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
                                   BYTE B, BYTE A)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(format);

	dst[0] = A;
	dst[1] = B;
	dst[2] = G;
	dst[3] = R;
	return &dst[4];
}
/* Skip the first byte without writing it, then store B, G, R. Returns the
 * address just past the 4 byte pixel; formatSize, format and A are ignored. */
WINPR_ATTR_NODISCARD
static inline BYTE* writePixelXBGR(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
                                   BYTE B, BYTE A)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(format);
	WINPR_UNUSED(A);

	/* dst[0] is intentionally left as it is */
	dst[1] = B;
	dst[2] = G;
	dst[3] = R;
	return &dst[4];
}
/* Store one pixel as the byte sequence A, R, G, B and return the address just
 * past it. formatSize and format are ignored. */
WINPR_ATTR_NODISCARD
static inline BYTE* writePixelARGB(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
                                   BYTE B, BYTE A)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(format);

	dst[0] = A;
	dst[1] = R;
	dst[2] = G;
	dst[3] = B;
	return &dst[4];
}
/* Skip the first byte without writing it, then store R, G, B. Returns the
 * address just past the 4 byte pixel; formatSize, format and A are ignored. */
WINPR_ATTR_NODISCARD
static inline BYTE* writePixelXRGB(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
                                   BYTE B, BYTE A)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(format);
	WINPR_UNUSED(A);

	/* dst[0] is intentionally left as it is */
	dst[1] = R;
	dst[2] = G;
	dst[3] = B;
	return &dst[4];
}
/* Format-agnostic pixel writer: builds the color value for 'format' from
 * R, G, B, A, stores it with FreeRDPWriteColor and returns dst advanced by
 * formatSize bytes. */
WINPR_ATTR_NODISCARD
static inline BYTE* writePixelGenericAlpha(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R,
                                           BYTE G, BYTE B, BYTE A)
{
	const UINT32 packed = FreeRDPGetColor(format, R, G, B, A);

	FreeRDPWriteColor(dst, format, packed);
	return &dst[formatSize];
}
/* Format-agnostic pixel writer: builds the color value for 'format' from
 * R, G, B, A, stores it with FreeRDPWriteColorIgnoreAlpha and returns dst
 * advanced by formatSize bytes. */
WINPR_ATTR_NODISCARD
static inline BYTE* writePixelGeneric(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
                                      BYTE B, BYTE A)
{
	const UINT32 packed = FreeRDPGetColor(format, R, G, B, A);

	FreeRDPWriteColorIgnoreAlpha(dst, format, packed);
	return &dst[formatSize];
}
/* Signature shared by all writePixel* helpers:
 * (dst, formatSize, format, R, G, B, A) -> pointer just past the written pixel. */
typedef BYTE* (*fkt_writePixel)(BYTE*, DWORD, UINT32, BYTE, BYTE, BYTE, BYTE);

/* Pick the pixel writer for 'format'. The eight 32 bit layouts get a dedicated
 * writer, everything else uses the generic ones. With useAlpha the alpha
 * writing variant is returned, otherwise the variant that leaves alpha alone. */
WINPR_ATTR_NODISCARD
static inline fkt_writePixel getPixelWriteFunction(DWORD format, BOOL useAlpha)
{
	switch (format)
	{
		case PIXEL_FORMAT_ARGB32:
		case PIXEL_FORMAT_XRGB32:
			if (useAlpha)
				return writePixelARGB;
			return writePixelXRGB;

		case PIXEL_FORMAT_ABGR32:
		case PIXEL_FORMAT_XBGR32:
			if (useAlpha)
				return writePixelABGR;
			return writePixelXBGR;

		case PIXEL_FORMAT_RGBA32:
		case PIXEL_FORMAT_RGBX32:
			if (useAlpha)
				return writePixelRGBA;
			return writePixelRGBX;

		case PIXEL_FORMAT_BGRA32:
		case PIXEL_FORMAT_BGRX32:
			if (useAlpha)
				return writePixelBGRA;
			return writePixelBGRX;

		default:
			if (useAlpha)
				return writePixelGenericAlpha;
			return writePixelGeneric;
	}
}
/* Saturate X to the range [0, 255] and return it as a byte. */
WINPR_ATTR_NODISCARD
static inline BYTE CLIP(INT64 X)
{
	if (X < 0L)
		return 0L;
	if (X > 255L)
		return 255L;
	return (BYTE)X;
}
/* Clip 'in' to a byte, but keep 'original' when the clipped value differs from
 * it by less than 30. */
WINPR_ATTR_NODISCARD
static inline BYTE CONDITIONAL_CLIP(INT32 in, BYTE original)
{
	const BYTE clipped = CLIP(in);
	/* absolute difference of the two bytes */
	const BYTE delta = (clipped > original) ? (BYTE)(clipped - original)
	                                        : (BYTE)(original - clipped);

	return (delta < 30) ? original : clipped;
}
/**
* | R |   ( | 256     0    403 |   |    Y    | )
* | G | = ( | 256   -48   -120 | * | U - 128 | ) >> 8
* | B |   ( | 256   475      0 |   | V - 128 | )
*/
/* Luma term of the YUV -> RGB conversion: Y with an offset of 0. */
static inline INT32 C(INT32 Y)
{
	const INT32 offset = 0;
	return Y - offset;
}
/* U term of the YUV -> RGB conversion: U with the 128 bias removed. */
static inline INT32 D(INT32 U)
{
	const INT32 bias = 128;
	return U - bias;
}
/* V term of the YUV -> RGB conversion: V with the 128 bias removed. */
static inline INT32 E(INT32 V)
{
	const INT32 bias = 128;
	return V - bias;
}
/* Red channel: (256 * Y + 0 * (U - 128) + 403 * (V - 128)) >> 8, clipped to a byte. */
WINPR_ATTR_NODISCARD
static inline BYTE YUV2R(INT32 Y, INT32 U, INT32 V)
{
	const INT32 yTerm = 256 * C(Y);
	const INT32 uTerm = 0 * D(U);
	const INT32 vTerm = 403 * E(V);
	const INT32 scaled = yTerm + uTerm + vTerm;

	return CLIP(scaled >> 8);
}
/* Green channel: (256 * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8, clipped to a byte. */
WINPR_ATTR_NODISCARD
static inline BYTE YUV2G(INT32 Y, INT32 U, INT32 V)
{
	const INT32 yTerm = 256 * C(Y);
	const INT32 uTerm = 48 * D(U);
	const INT32 vTerm = 120 * E(V);
	const INT32 scaled = yTerm - uTerm - vTerm;

	return CLIP(scaled >> 8);
}
/* Blue channel: (256 * Y + 475 * (U - 128) + 0 * (V - 128)) >> 8, clipped to a byte. */
WINPR_ATTR_NODISCARD
static inline BYTE YUV2B(INT32 Y, INT32 U, INT32 V)
{
	const INT32 yTerm = 256 * C(Y);
	const INT32 uTerm = 475 * D(U);
	const INT32 vTerm = 0 * E(V);
	const INT32 scaled = yTerm + uTerm + vTerm;

	return CLIP(scaled >> 8);
}
/**
* | Y |   ( |  54   183    18 |   | R | )        |   0 |
* | U | = ( | -29   -99   128 | * | G | ) >> 8 + | 128 |
* | V |   ( | 128  -116   -12 |   | B | )        | 128 |
*/
/* Luma: (54 * R + 183 * G + 18 * B) >> 8; the cast asserts the result fits a byte. */
WINPR_ATTR_NODISCARD
static inline BYTE RGB2Y(INT32 R, INT32 G, INT32 B)
{
	const INT32 weighted = 54 * R + 183 * G + 18 * B;
	const INT32 val = weighted >> 8;

	return WINPR_ASSERTING_INT_CAST(BYTE, val);
}
/* U: ((-29 * R - 99 * G + 128 * B) >> 8) + 128; the cast asserts the result
 * fits a byte. */
WINPR_ATTR_NODISCARD
static inline BYTE RGB2U(INT32 R, INT32 G, INT32 B)
{
	const INT32 weighted = -29 * R - 99 * G + 128 * B;
	const INT32 val = (weighted >> 8) + 128;

	return WINPR_ASSERTING_INT_CAST(BYTE, val);
}
/* V: ((128 * R - 116 * G - 12 * B) >> 8) + 128; the cast asserts the result
 * fits a byte. */
WINPR_ATTR_NODISCARD
static inline BYTE RGB2V(INT32 R, INT32 G, INT32 B)
{
	const INT32 weighted = 128 * R - 116 * G - 12 * B;
	const INT32 val = (weighted >> 8) + 128;

	return WINPR_ASSERTING_INT_CAST(BYTE, val);
}
/* Convert one YUV sample to RGB and emit it through the given pixel writer.
 * The writer receives an alpha value of 0; the return value is whatever the
 * writer returns (the position after the written pixel). */
static inline BYTE* writeYUVPixel(BYTE* dst, UINT32 DstFormat, INT32 y, INT32 u, INT32 v,
                                  fkt_writePixel fkt)
{
	WINPR_ASSERT(fkt);

	const DWORD bytesPerPixel = FreeRDPGetBytesPerPixel(DstFormat);
	const BYTE red = YUV2R(y, u, v);
	const BYTE green = YUV2G(y, u, v);
	const BYTE blue = YUV2B(y, u, v);

	return fkt(dst, bytesPerPixel, DstFormat, red, green, blue, 0);
}
/* Library-local generic routine; by its name it converts two BGRX source rows
 * (even and odd) into the AVC444 YUV output planes b1..b7
 * (NOTE(review): plane meaning is not visible here, confirm in prim_YUV.c). */
FREERDP_LOCAL void general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
size_t offset, const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width);
/* Library-local generic routine; by its name the AVC444v2 counterpart of the
 * function above, writing separate luma and chroma destination rows
 * (NOTE(review): confirm in prim_YUV.c). */
FREERDP_LOCAL void general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
size_t offset, const BYTE* WINPR_RESTRICT pSrcEven, const BYTE* WINPR_RESTRICT pSrcOdd,
BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2,
BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width);
/* Function prototypes for all the init/deinit routines. */
/* primitives_init_<group>: install the generic (plain C) routines of one
 * primitive group into the given table. */
FREERDP_LOCAL void primitives_init_copy(primitives_t* WINPR_RESTRICT prims);
FREERDP_LOCAL void primitives_init_set(primitives_t* WINPR_RESTRICT prims);
FREERDP_LOCAL void primitives_init_add(primitives_t* WINPR_RESTRICT prims);
FREERDP_LOCAL void primitives_init_andor(primitives_t* WINPR_RESTRICT prims);
FREERDP_LOCAL void primitives_init_shift(primitives_t* WINPR_RESTRICT prims);
FREERDP_LOCAL void primitives_init_sign(primitives_t* WINPR_RESTRICT prims);
FREERDP_LOCAL void primitives_init_alphaComp(primitives_t* WINPR_RESTRICT prims);
FREERDP_LOCAL void primitives_init_colors(primitives_t* WINPR_RESTRICT prims);
FREERDP_LOCAL void primitives_init_YCoCg(primitives_t* WINPR_RESTRICT prims);
FREERDP_LOCAL void primitives_init_YUV(primitives_t* WINPR_RESTRICT prims);
/* primitives_init_<group>_opt: install the optimized routines of one primitive
 * group; called by primitives_init_optimized() in builds with CPU
 * optimizations. */
FREERDP_LOCAL void primitives_init_copy_opt(primitives_t* WINPR_RESTRICT prims);
FREERDP_LOCAL void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims);
FREERDP_LOCAL void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims);
FREERDP_LOCAL void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims);
FREERDP_LOCAL void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims);
FREERDP_LOCAL void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims);
FREERDP_LOCAL void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims);
FREERDP_LOCAL void primitives_init_colors_opt(primitives_t* WINPR_RESTRICT prims);
FREERDP_LOCAL void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims);
FREERDP_LOCAL void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims);
/* OpenCL backend initializer; only available when built WITH_OPENCL. The
 * result must be checked. */
#if defined(WITH_OPENCL)
WINPR_ATTR_NODISCARD
FREERDP_LOCAL BOOL primitives_init_opencl(primitives_t* WINPR_RESTRICT prims);
#endif
#endif /* FREERDP_LIB_PRIM_INTERNAL_H */

View File

@@ -0,0 +1,137 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Routines to set a chunk of memory to a constant.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/
#include <freerdp/config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include "prim_set.h"
/* ========================================================================= */
/* Fill len bytes at pDst with val. Always succeeds. */
static pstatus_t general_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 len)
{
	const int fill = (int)val;
	const size_t count = (size_t)len;

	memset((void*)pDst, fill, count);
	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Clear len bytes at pDst to zero. Always succeeds. */
static pstatus_t general_zero(void* WINPR_RESTRICT pDst, size_t len)
{
	const int zero = 0;

	(void)memset(pDst, zero, len);
	return PRIMITIVES_SUCCESS;
}
/* ========================================================================= */
/**
 * Fill len INT32 elements at pDst with val.
 *
 * Short buffers (< 256 elements) are written with a plain loop. Longer ones
 * are seeded with one element and then grown by copying the already filled
 * prefix behind itself through the copy_8u primitive, doubling the filled
 * length each round.
 *
 * @return PRIMITIVES_SUCCESS, or the first failing status of copy_8u.
 */
static pstatus_t general_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len)
{
	INT32* dptr = pDst;

	if (len < 256)
	{
		while (len--)
			*dptr++ = val;
		return PRIMITIVES_SUCCESS;
	}

	/* else doubling memcpy algorithm */

	/* copy_8u takes its length as INT32 bytes: cap every chunk so that the
	 * byte count can never be truncated by the cast below. */
	const size_t maxChunk = (size_t)INT32_MAX / sizeof(INT32);
	primitives_t* prims = primitives_get();
	size_t filled = 1; /* number of elements already holding val */
	*dptr = val;

	while (filled < len)
	{
		size_t chunk = filled;
		if (chunk > len - filled)
			chunk = len - filled;
		if (chunk > maxChunk)
			chunk = maxChunk;

		const size_t bytes = chunk * sizeof(INT32);
		WINPR_ASSERT(bytes <= INT32_MAX);

		/* source [0, chunk) and destination [filled, filled + chunk) never
		 * overlap because chunk <= filled */
		const pstatus_t rc = prims->copy_8u((BYTE*)dptr, (BYTE*)(dptr + filled), (INT32)bytes);
		if (rc != PRIMITIVES_SUCCESS)
			return rc;

		filled += chunk;
	}

	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/**
 * Fill len UINT32 elements at pDst with val.
 *
 * Short buffers (< 256 elements) are written with a plain loop. Longer ones
 * are seeded with one element and then grown by copying the already filled
 * prefix behind itself through the copy_8u primitive, doubling the filled
 * length each round.
 *
 * @return PRIMITIVES_SUCCESS, or the first failing status of copy_8u.
 */
static pstatus_t general_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 len)
{
	UINT32* dptr = pDst;

	if (len < 256)
	{
		while (len--)
			*dptr++ = val;
		return PRIMITIVES_SUCCESS;
	}

	/* else doubling memcpy algorithm */

	/* copy_8u takes its length as INT32 bytes: cap every chunk so that the
	 * byte count can never be truncated by the cast below. */
	const size_t maxChunk = (size_t)INT32_MAX / sizeof(UINT32);
	primitives_t* prims = primitives_get();
	size_t filled = 1; /* number of elements already holding val */
	*dptr = val;

	while (filled < len)
	{
		size_t chunk = filled;
		if (chunk > len - filled)
			chunk = len - filled;
		if (chunk > maxChunk)
			chunk = maxChunk;

		const size_t bytes = chunk * sizeof(UINT32);
		WINPR_ASSERT(bytes <= INT32_MAX);

		/* source [0, chunk) and destination [filled, filled + chunk) never
		 * overlap because chunk <= filled */
		const pstatus_t rc = prims->copy_8u((BYTE*)dptr, (BYTE*)(dptr + filled), (INT32)bytes);
		if (rc != PRIMITIVES_SUCCESS)
			return rc;

		filled += chunk;
	}

	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Install the generic (plain C) set/zero routines into the primitives table. */
void primitives_init_set(primitives_t* WINPR_RESTRICT prims)
{
	/* byte oriented fills */
	prims->zero = general_zero;
	prims->set_8u = general_set_8u;

	/* 32 bit element fills */
	prims->set_32u = general_set_32u;
	prims->set_32s = general_set_32s;
}
/* Install the set primitives for the optimized table: generic defaults first,
 * then the SSE2 initializer (which only runs when the CPU supports it). */
void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_set(prims);
primitives_init_set_sse2(prims);
}

View File

@@ -0,0 +1,42 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives set
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_SET_H
#define FREERDP_LIB_PRIM_SET_H
#include <winpr/wtypes.h>
#include <winpr/sysinfo.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
/* SSE2 set initializer, implemented in a separate translation unit. */
FREERDP_LOCAL void primitives_init_set_sse2_int(primitives_t* WINPR_RESTRICT prims);

/* Run the SSE2 set initializer only when the CPU reports both SSE2 and SSE3
 * support; otherwise leave the table untouched. */
static inline void primitives_init_set_sse2(primitives_t* WINPR_RESTRICT prims)
{
	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
		primitives_init_set_sse2_int(prims);
}
#endif

View File

@@ -0,0 +1,150 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Shift operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <winpr/assert.h>
#include <winpr/cast.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include "prim_shift.h"
/* ------------------------------------------------------------------------- */
/* Left shift a 16 bit value by sh bits. The shift is done on the value
 * converted to UINT32 and the result is masked back to its low 16 bits. */
static inline INT16 shift(INT16 val, UINT32 sh)
{
	const UINT32 widened = (UINT32)val;
	const UINT32 low16 = (widened << sh) & 0xFFFF;
	const INT16 rc = (int16_t)low16;

	return WINPR_ASSERTING_INT_CAST(INT16, rc);
}
/* Left shift len signed 16 bit values in place by val bits.
 * A shift of 0 is a no-op; shifts of 16 or more are rejected with -1. */
static inline pstatus_t general_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val,
                                                    UINT32 len)
{
	if (val >= 16)
		return -1;
	if (val == 0)
		return PRIMITIVES_SUCCESS;

	UINT32 idx = 0;
	while (idx < len)
	{
		pSrcDst[idx] = shift(pSrcDst[idx], val);
		idx++;
	}
	return PRIMITIVES_SUCCESS;
}
/* Left shift len signed 16 bit values from pSrc by val bits into pDst.
 * Shifts of 16 or more are rejected with -1.
 * NOTE: a shift of 0 returns success without writing pDst. */
static inline pstatus_t general_lShiftC_16s(const INT16* WINPR_RESTRICT pSrc, UINT32 val,
                                            INT16* WINPR_RESTRICT pDst, UINT32 len)
{
	if (val >= 16)
		return -1;
	if (val == 0)
		return PRIMITIVES_SUCCESS;

	UINT32 idx = 0;
	while (idx < len)
	{
		pDst[idx] = shift(pSrc[idx], val);
		idx++;
	}
	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Right shift len signed 16 bit values from pSrc by val bits into pDst.
 * Shifts of 16 or more are rejected with -1.
 * NOTE: a shift of 0 returns success without writing pDst. */
static inline pstatus_t general_rShiftC_16s(const INT16* WINPR_RESTRICT pSrc, UINT32 val,
                                            INT16* WINPR_RESTRICT pDst, UINT32 len)
{
	if (val >= 16)
		return -1;
	if (val == 0)
		return PRIMITIVES_SUCCESS;

	UINT32 idx = 0;
	while (idx < len)
	{
		pDst[idx] = WINPR_ASSERTING_INT_CAST(int16_t, pSrc[idx] >> val);
		idx++;
	}
	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Left shift len unsigned 16 bit values from pSrc by val bits into pDst,
 * keeping the low 16 bits of each result. Shifts of 16 or more are rejected
 * with -1.
 * NOTE: a shift of 0 returns success without writing pDst. */
static inline pstatus_t general_lShiftC_16u(const UINT16* WINPR_RESTRICT pSrc, UINT32 val,
                                            UINT16* WINPR_RESTRICT pDst, UINT32 len)
{
	if (val >= 16)
		return -1;
	if (val == 0)
		return PRIMITIVES_SUCCESS;

	UINT32 idx = 0;
	while (idx < len)
	{
		pDst[idx] = WINPR_ASSERTING_INT_CAST(UINT16, ((pSrc[idx] << val) & 0xFFFF));
		idx++;
	}
	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Right shift len unsigned 16 bit values from pSrc by val bits into pDst.
 * Shifts of 16 or more are rejected with -1.
 * NOTE: a shift of 0 returns success without writing pDst. */
static inline pstatus_t general_rShiftC_16u(const UINT16* WINPR_RESTRICT pSrc, UINT32 val,
                                            UINT16* WINPR_RESTRICT pDst, UINT32 len)
{
	if (val >= 16)
		return -1;
	if (val == 0)
		return PRIMITIVES_SUCCESS;

	UINT32 idx = 0;
	while (idx < len)
	{
		pDst[idx] = pSrc[idx] >> val;
		idx++;
	}
	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/**
 * Signed-direction shift wrapper for INT16 data: a positive val shifts left,
 * a negative val shifts right by |val|, 0 does nothing.
 *
 * @return PRIMITIVES_SUCCESS, or the status of the left/right shift routine
 *         (-1 for shift amounts of 16 or more).
 */
static inline pstatus_t general_shiftC_16s(const INT16* WINPR_RESTRICT pSrc, INT32 val,
                                           INT16* WINPR_RESTRICT pDst, UINT32 len)
{
	if (val == 0)
		return PRIMITIVES_SUCCESS;

	if (val < 0)
	{
		/* Compute |val| in unsigned arithmetic: negating INT32_MIN as a signed
		 * value would overflow. For INT32_MIN this yields 2^31, which the
		 * right shift routine rejects like any other amount >= 16. */
		const UINT32 magnitude = 0u - (UINT32)val;
		return general_rShiftC_16s(pSrc, magnitude, pDst, len);
	}

	return general_lShiftC_16s(pSrc, WINPR_ASSERTING_INT_CAST(UINT32, val), pDst, len);
}
/* ------------------------------------------------------------------------- */
/**
 * Signed-direction shift wrapper for UINT16 data: a positive val shifts left,
 * a negative val shifts right by |val|, 0 does nothing.
 *
 * @return PRIMITIVES_SUCCESS, or the status of the left/right shift routine
 *         (-1 for shift amounts of 16 or more).
 */
static inline pstatus_t general_shiftC_16u(const UINT16* WINPR_RESTRICT pSrc, INT32 val,
                                           UINT16* WINPR_RESTRICT pDst, UINT32 len)
{
	if (val == 0)
		return PRIMITIVES_SUCCESS;

	if (val < 0)
	{
		/* Compute |val| in unsigned arithmetic: negating INT32_MIN as a signed
		 * value would overflow. For INT32_MIN this yields 2^31, which the
		 * right shift routine rejects like any other amount >= 16. */
		const UINT32 magnitude = 0u - (UINT32)val;
		return general_rShiftC_16u(pSrc, magnitude, pDst, len);
	}

	return general_lShiftC_16u(pSrc, WINPR_ASSERTING_INT_CAST(UINT32, val), pDst, len);
}
/* ------------------------------------------------------------------------- */
/* Install the generic (plain C) shift routines into the primitives table. */
void primitives_init_shift(primitives_t* WINPR_RESTRICT prims)
{
	/* signed 16 bit shifts */
	prims->lShiftC_16s_inplace = general_lShiftC_16s_inplace;
	prims->lShiftC_16s = general_lShiftC_16s;
	prims->rShiftC_16s = general_rShiftC_16s;
	prims->shiftC_16s = general_shiftC_16s; /* direction chosen by sign of val */

	/* unsigned 16 bit shifts */
	prims->lShiftC_16u = general_lShiftC_16u;
	prims->rShiftC_16u = general_rShiftC_16u;
	prims->shiftC_16u = general_shiftC_16u; /* direction chosen by sign of val */
}
/* Install the shift primitives for the optimized table: generic defaults
 * first, then the SSE3 initializer (which only runs when the CPU supports it). */
void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_shift(prims);
primitives_init_shift_sse3(prims);
}

View File

@@ -0,0 +1,41 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives shift
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_SHIFT_H
#define FREERDP_LIB_PRIM_SHIFT_H
#include <winpr/wtypes.h>
#include <winpr/sysinfo.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
/* SSE3 shift initializer, implemented in a separate translation unit. */
FREERDP_LOCAL void primitives_init_shift_sse3_int(primitives_t* WINPR_RESTRICT prims);

/* Run the SSE3 shift initializer only when the CPU reports both SSE2 and SSE3
 * support; otherwise leave the table untouched. */
static inline void primitives_init_shift_sse3(primitives_t* WINPR_RESTRICT prims)
{
	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
		primitives_init_shift_sse3_int(prims);
}
#endif

View File

@@ -0,0 +1,50 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Sign operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include "prim_sign.h"
/* ----------------------------------------------------------------------------
 * Write the sign of each of the len 16-bit values in pSrc to pDst:
 * -1 for negative, 1 for positive and 0 for zero input. Always succeeds.
 */
static pstatus_t general_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
                                  UINT32 len)
{
	for (UINT32 idx = 0; idx < len; idx++)
	{
		const INT16 src = pSrc[idx];
		int sign = 0;

		if (src < 0)
			sign = -1;
		else if (src > 0)
			sign = 1;

		pDst[idx] = WINPR_ASSERTING_INT_CAST(int16_t, sign);
	}

	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Install the generic (plain C) sign routine into the primitives table. */
void primitives_init_sign(primitives_t* WINPR_RESTRICT prims)
{
/* Start with the default. */
prims->sign_16s = general_sign_16s;
}
/* Install the sign primitives for the optimized table: generic default first,
 * then the SSSE3 initializer (which only runs when the CPU supports it). */
void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_sign(prims);
primitives_init_sign_ssse3(prims);
}

View File

@@ -0,0 +1,42 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives sign
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_SIGN_H
#define FREERDP_LIB_PRIM_SIGN_H
#include <winpr/wtypes.h>
#include <winpr/sysinfo.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
/* SSSE3 sign initializer, implemented in a separate translation unit. */
FREERDP_LOCAL void primitives_init_sign_ssse3_int(primitives_t* WINPR_RESTRICT prims);

/* Run the SSSE3 sign initializer only when the CPU reports both SSSE3 and SSE3
 * support; otherwise leave the table untouched. */
static inline void primitives_init_sign_ssse3(primitives_t* WINPR_RESTRICT prims)
{
	if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
		primitives_init_sign_ssse3_int(prims);
}
#endif

View File

@@ -0,0 +1,455 @@
/* primitives.c
* This code queries processor features and calls the init/deinit routines.
* vi:ts=4 sw=4
*
* Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Copyright 2019 David Fort <contact@hardening-consulting.com>
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <string.h>
#include <stdlib.h>
#include <winpr/synch.h>
#include <winpr/sysinfo.h>
#include <winpr/crypto.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include <freerdp/log.h>
#define TAG FREERDP_TAG("primitives")
/* hints to know which kind of primitives to use */
/* Process-wide setting, changed through primitives_set_hints() and passed to
 * primitives_init() by primitives_auto_init_cb(); defaults to autodetection. */
static primitive_hints primitivesHints = PRIMITIVES_AUTODETECT;
/* Forward declaration; defined further down in this file. */
static BOOL primitives_init_optimized(primitives_t* prims);
/* Store the backend selection hint in the file-level primitivesHints variable.
 * No locking is done here. */
void primitives_set_hints(primitive_hints hints)
{
primitivesHints = hints;
}
/* Return the currently stored backend selection hint. */
primitive_hints primitives_get_hints(void)
{
return primitivesHints;
}
/* Singleton pointer used throughout the program when requested. */
/* One statically allocated, zero-initialized table per backend, each paired
 * with an INIT_ONCE guard for its one-time initialization. */
/* Generic (plain C) table, filled by primitives_init_generic_cb(). */
static primitives_t pPrimitivesGeneric = WINPR_C_ARRAY_INIT;
static INIT_ONCE generic_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
/* CPU SIMD table, filled by primitives_init_cpu_cb(). */
static primitives_t pPrimitivesCpu = WINPR_C_ARRAY_INIT;
static INIT_ONCE cpu_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
#endif
#if defined(WITH_OPENCL)
/* OpenCL table, filled by primitives_init_gpu_cb(). */
static primitives_t pPrimitivesGpu = WINPR_C_ARRAY_INIT;
static INIT_ONCE gpu_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
#endif
/* Table selected according to primitivesHints, filled by primitives_auto_init_cb(). */
static INIT_ONCE auto_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
static primitives_t pPrimitives = WINPR_C_ARRAY_INIT;
/* ------------------------------------------------------------------------- */
/* Fill the table with the generic (plain C) routines of every primitive group
 * and clear the uninit hook. Always returns TRUE. */
static BOOL primitives_init_generic(primitives_t* prims)
{
primitives_init_add(prims);
primitives_init_andor(prims);
primitives_init_alphaComp(prims);
primitives_init_copy(prims);
primitives_init_set(prims);
primitives_init_shift(prims);
primitives_init_sign(prims);
primitives_init_colors(prims);
primitives_init_YCoCg(prims);
primitives_init_YUV(prims);
/* the generic backend has nothing to tear down */
prims->uninit = nullptr;
return TRUE;
}
/* INIT_ONCE callback: initialize the generic singleton table. None of the
 * callback arguments are used. */
static BOOL CALLBACK primitives_init_generic_cb(PINIT_ONCE once, PVOID param, PVOID* context)
{
	WINPR_UNUSED(context);
	WINPR_UNUSED(param);
	WINPR_UNUSED(once);

	const BOOL initialized = primitives_init_generic(&pPrimitivesGeneric);
	return initialized;
}
/* Fill the table with the generic routines and, in builds with CPU
 * optimizations, run the optimized initializer of every primitive group on top
 * and flag the table with PRIM_FLAGS_HAVE_EXTCPU. Always returns TRUE. */
static BOOL primitives_init_optimized(primitives_t* prims)
{
/* generic baseline first (its BOOL result is not checked) */
primitives_init_generic(prims);
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
primitives_init_add_opt(prims);
primitives_init_andor_opt(prims);
primitives_init_alphaComp_opt(prims);
primitives_init_copy_opt(prims);
primitives_init_set_opt(prims);
primitives_init_shift_opt(prims);
primitives_init_sign_opt(prims);
primitives_init_colors_opt(prims);
primitives_init_YCoCg_opt(prims);
primitives_init_YUV_opt(prims);
prims->flags |= PRIM_FLAGS_HAVE_EXTCPU;
#endif
return TRUE;
}
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) && defined(WITH_OPENCL)
/* Working set of the YUV420 -> RGB benchmark used for backend autodetection. */
typedef struct
{
/* three input planes, one heap buffer each (passed as the YUV420 source) */
BYTE* channels[3];
/* stride of each input plane in bytes */
UINT32 steps[3];
/* size of the converted region in pixels */
prim_size_t roi;
/* heap buffer receiving the converted pixels */
BYTE* outputBuffer;
/* stride of outputBuffer in bytes */
UINT32 outputStride;
/* pixel format the benchmark converts to */
UINT32 testedFormat;
} primitives_YUV_benchmark;
/* Release all buffers owned by the benchmark and reset the structure to all
 * zero. The structure itself is not freed. A NULL argument is ignored. */
static void primitives_YUV_benchmark_free(primitives_YUV_benchmark* bench)
{
	if (!bench)
		return;

	for (int plane = 0; plane < 3; plane++)
		free(bench->channels[plane]);
	free(bench->outputBuffer);

	memset(bench, 0, sizeof(*bench));
}
/**
 * Prepare the caller-provided benchmark structure: a 1024x768 BGRA32 output
 * buffer and three 1024x768 input planes filled with random bytes.
 *
 * @param ret structure to initialize (may be NULL)
 * @return ret on success; nullptr if ret is NULL, an allocation fails or the
 *         random fill fails. On failure everything allocated so far is
 *         released and *ret is zeroed.
 */
static primitives_YUV_benchmark* primitives_YUV_benchmark_init(primitives_YUV_benchmark* ret)
{
	prim_size_t* roi = nullptr;

	if (!ret)
		return nullptr;

	memset(ret, 0, sizeof(primitives_YUV_benchmark));
	roi = &ret->roi;
	roi->width = 1024;
	roi->height = 768;
	ret->outputStride = roi->width * 4;
	ret->testedFormat = PIXEL_FORMAT_BGRA32;

	ret->outputBuffer = calloc(ret->outputStride, roi->height);
	if (!ret->outputBuffer)
		goto fail;

	for (int i = 0; i < 3; i++)
	{
		BYTE* buf = ret->channels[i] = calloc(roi->width, roi->height);
		if (!buf)
			goto fail;

		if (winpr_RAND(buf, 1ull * roi->width * roi->height) < 0)
			goto fail;

		ret->steps[i] = roi->width;
	}

	return ret;

fail:
	primitives_YUV_benchmark_free(ret);
	/* Report the failure: the caller tests the result against NULL and must
	 * not run the benchmark on the released buffers. */
	return nullptr;
}
/* Run the YUV420 -> RGB conversion of 'prims' repeatedly for runTime
 * GetTickCount64() ticks and report the number of completed conversions in
 * *computations. One untimed warm-up conversion is done first. Returns FALSE
 * as soon as any conversion fails. */
static BOOL primitives_YUV_benchmark_run(primitives_YUV_benchmark* bench, primitives_t* prims,
                                         UINT64 runTime, UINT32* computations)
{
	const BYTE* planes[3] = { bench->channels[0], bench->channels[1], bench->channels[2] };

	*computations = 0;

	/* do a first dry run to initialize cache and such */
	const pstatus_t warmup =
	    prims->YUV420ToRGB_8u_P3AC4R(planes, bench->steps, bench->outputBuffer,
	                                 bench->outputStride, bench->testedFormat, &bench->roi);
	if (warmup != PRIMITIVES_SUCCESS)
		return FALSE;

	/* let's run the benchmark */
	const ULONGLONG deadline = GetTickCount64() + runTime;
	for (; GetTickCount64() < deadline; *computations = *computations + 1)
	{
		const pstatus_t rc =
		    prims->YUV420ToRGB_8u_P3AC4R(planes, bench->steps, bench->outputBuffer,
		                                 bench->outputStride, bench->testedFormat, &bench->roi);
		if (rc != PRIMITIVES_SUCCESS)
			return FALSE;
	}

	return TRUE;
}
#endif
/* Choose the backend for PRIMITIVES_AUTODETECT and copy its table into *prims.
 *
 * When at most one accelerated backend is compiled in, that backend (or the
 * generic one if there is none) is taken without benchmarking. When both CPU
 * optimizations and OpenCL are compiled in, every backend runs the YUV
 * benchmark for 150 ms and the one with the most completed conversions wins.
 *
 * Returns TRUE when a backend was selected. On the 'out' failure path *prims is
 * set to the generic table. */
static BOOL primitives_autodetect_best(primitives_t* prims)
{
BOOL ret = FALSE;
/* one candidate backend with its benchmark score */
struct prim_benchmark
{
const char* name;
primitives_t* prims;
primitive_hints flags;
UINT32 count;
};
/* index 0 is always the generic backend; accelerated ones follow */
struct prim_benchmark testcases[] = {
{ "generic", nullptr, PRIMITIVES_PURE_SOFT, 0 },
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
{ "optimized", nullptr, PRIMITIVES_ONLY_CPU, 0 },
#endif
#if defined(WITH_OPENCL)
{ "opencl", nullptr, PRIMITIVES_ONLY_GPU, 0 },
#endif
};
const struct prim_benchmark* best = nullptr;
#if !defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) || !defined(WITH_OPENCL)
{
/* at most one accelerated backend: take it (index 1) or fall back to generic */
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) || defined(WITH_OPENCL)
struct prim_benchmark* cur = &testcases[1];
#else
struct prim_benchmark* cur = &testcases[0];
#endif
cur->prims = primitives_get_by_type(cur->flags);
if (!cur->prims)
{
WLog_WARN(TAG, "Failed to initialize %s primitives", cur->name);
/* NOTE(review): this early return bypasses the 'out' fallback, so *prims
 * is left untouched here - confirm this is intended */
return FALSE;
}
WLog_DBG(TAG, "primitives benchmark: only one backend, skipping...");
best = cur;
}
#else
{
UINT64 benchDuration = 150; /* 150 ms */
primitives_YUV_benchmark bench = WINPR_C_ARRAY_INIT;
primitives_YUV_benchmark* yuvBench = primitives_YUV_benchmark_init(&bench);
/* NOTE(review): like above, this return does not go through 'out' */
if (!yuvBench)
return FALSE;
WLog_DBG(TAG, "primitives benchmark result:");
for (size_t x = 0; x < ARRAYSIZE(testcases); x++)
{
struct prim_benchmark* cur = &testcases[x];
cur->prims = primitives_get_by_type(cur->flags);
/* a backend that fails to initialize or to run is skipped, not fatal */
if (!cur->prims)
{
WLog_WARN(TAG, "Failed to initialize %s primitives", cur->name);
continue;
}
if (!primitives_YUV_benchmark_run(yuvBench, cur->prims, benchDuration, &cur->count))
{
WLog_WARN(TAG, "error running %s YUV bench", cur->name);
continue;
}
WLog_DBG(TAG, " * %s= %" PRIu32, cur->name, cur->count);
/* keep the backend with the highest count; ties keep the earlier entry */
if (!best || (best->count < cur->count))
best = cur;
}
primitives_YUV_benchmark_free(yuvBench);
}
#endif
if (!best)
{
WLog_ERR(TAG, "No primitives to test, aborting.");
goto out;
}
/* finally compute the results */
*prims = *best->prims;
WLog_DBG(TAG, "primitives autodetect, using %s", best->name);
ret = TRUE;
out:
/* NOTE(review): assumes pPrimitivesGeneric has already been initialized when
 * this fallback is taken - verify against primitives_get_by_type() */
if (!ret)
*prims = pPrimitivesGeneric;
return ret;
}
#if defined(WITH_OPENCL)
/* InitOnce callback for the OpenCL backend: fills pPrimitivesGpu through
 * primitives_init_opencl() and returns its result. None of the InitOnce
 * arguments are used. */
static BOOL CALLBACK primitives_init_gpu_cb(PINIT_ONCE once, PVOID param, PVOID* context)
{
WINPR_UNUSED(once);
WINPR_UNUSED(param);
WINPR_UNUSED(context);
return primitives_init_opencl(&pPrimitivesGpu);
}
#endif
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
/* InitOnce callback for the CPU-optimized backend: fills pPrimitivesCpu through
 * primitives_init_optimized() and returns its result. None of the InitOnce
 * arguments are used. */
static BOOL CALLBACK primitives_init_cpu_cb(PINIT_ONCE once, PVOID param, PVOID* context)
{
WINPR_UNUSED(once);
WINPR_UNUSED(param);
WINPR_UNUSED(context);
return (primitives_init_optimized(&pPrimitivesCpu));
}
#endif
/* InitOnce callback that populates the process-wide pPrimitives table by calling
 * primitives_init() with the file-level primitivesHints value. None of the
 * InitOnce arguments are used. */
static BOOL CALLBACK primitives_auto_init_cb(PINIT_ONCE once, PVOID param, PVOID* context)
{
WINPR_UNUSED(once);
WINPR_UNUSED(param);
WINPR_UNUSED(context);
return primitives_init(&pPrimitives, primitivesHints);
}
/**
 * Fill @p p with the primitives table selected by @p hints.
 *
 * - PRIMITIVES_AUTODETECT: delegate to primitives_autodetect_best().
 * - PRIMITIVES_PURE_SOFT:  copy the generic table.
 * - PRIMITIVES_ONLY_CPU:   copy the CPU-optimized table, only when the build
 *                          defines HAVE_CPU_OPTIMIZED_PRIMITIVES.
 * - PRIMITIVES_ONLY_GPU:   copy the OpenCL table, only when the build defines
 *                          WITH_OPENCL.
 *
 * A hint whose backend is not compiled in is rejected instead of silently
 * falling through into the next case label (which would hand out the OpenCL
 * table for a CPU-only request).
 *
 * @param p     destination table
 * @param hints requested backend
 * @return TRUE if @p p was filled, FALSE for unknown or unsupported hints
 */
BOOL primitives_init(primitives_t* p, primitive_hints hints)
{
    switch (hints)
    {
        case PRIMITIVES_AUTODETECT:
            return primitives_autodetect_best(p);

        case PRIMITIVES_PURE_SOFT:
            *p = pPrimitivesGeneric;
            return TRUE;

        case PRIMITIVES_ONLY_CPU:
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
            *p = pPrimitivesCpu;
            return TRUE;
#else
            break; /* backend not compiled in */
#endif

        case PRIMITIVES_ONLY_GPU:
#if defined(WITH_OPENCL)
            *p = pPrimitivesGpu;
            return TRUE;
#else
            break; /* backend not compiled in */
#endif

        default:
            break;
    }

    /* Explicit cast: an enum passed through varargs need not be unsigned int. */
    WLog_ERR(TAG, "unknown or unsupported hint %u", (unsigned int)hints);
    return FALSE;
}
/* Invoke the optional uninit hook of every compiled-in backend table, in the
 * order GPU (OpenCL), CPU-optimized, generic. A table whose uninit pointer is
 * not set is skipped. */
void primitives_uninit(void)
{
#if defined(WITH_OPENCL)
if (pPrimitivesGpu.uninit)
pPrimitivesGpu.uninit();
#endif
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
if (pPrimitivesCpu.uninit)
pPrimitivesCpu.uninit();
#endif
if (pPrimitivesGeneric.uninit)
pPrimitivesGeneric.uninit();
}
/* ------------------------------------------------------------------------- */
static void setup(void)
{
if (!InitOnceExecuteOnce(&generic_primitives_InitOnce, primitives_init_generic_cb, nullptr,
nullptr))
return;
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
if (!InitOnceExecuteOnce(&cpu_primitives_InitOnce, primitives_init_cpu_cb, nullptr, nullptr))
return;
#endif
#if defined(WITH_OPENCL)
if (!InitOnceExecuteOnce(&gpu_primitives_InitOnce, primitives_init_gpu_cb, nullptr, nullptr))
return;
#endif
if (!InitOnceExecuteOnce(&auto_primitives_InitOnce, primitives_auto_init_cb, nullptr, nullptr))
return;
}
/* Return the process-wide primitives table after making sure the one-time
 * setup() has run. The address of pPrimitives is returned regardless of whether
 * setup() succeeded (setup() has no return value). */
primitives_t* primitives_get(void)
{
setup();
return &pPrimitives;
}
/* Return the generic primitives table, running its one-time initializer first.
 * Returns nullptr if that initializer reports failure. */
primitives_t* primitives_get_generic(void)
{
if (!InitOnceExecuteOnce(&generic_primitives_InitOnce, primitives_init_generic_cb, nullptr,
nullptr))
return nullptr;
return &pPrimitivesGeneric;
}
/* Return the backend table for the requested type, initializing it on first use.
 *
 * The generic backend is always initialized first; nullptr is returned if that
 * fails. The switch relies on deliberate case fall-through: when the code for a
 * requested backend is compiled out, control continues into the next label, so
 * a GPU request degrades to CPU and a CPU request degrades to generic. When the
 * backend is compiled in but its one-time initializer fails, nullptr is
 * returned instead of degrading. Unknown types yield the generic table. */
primitives_t* primitives_get_by_type(primitive_hints type)
{
if (!InitOnceExecuteOnce(&generic_primitives_InitOnce, primitives_init_generic_cb, nullptr,
nullptr))
return nullptr;
switch (type)
{
case PRIMITIVES_ONLY_GPU:
#if defined(WITH_OPENCL)
if (!InitOnceExecuteOnce(&gpu_primitives_InitOnce, primitives_init_gpu_cb, nullptr,
nullptr))
return nullptr;
return &pPrimitivesGpu;
#endif
/* without WITH_OPENCL: falls through to the CPU case */
case PRIMITIVES_ONLY_CPU:
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
if (!InitOnceExecuteOnce(&cpu_primitives_InitOnce, primitives_init_cpu_cb, nullptr,
nullptr))
return nullptr;
return &pPrimitivesCpu;
#endif
/* without HAVE_CPU_OPTIMIZED_PRIMITIVES: falls through to generic */
case PRIMITIVES_PURE_SOFT:
default:
return &pPrimitivesGeneric;
}
}
/* Return the flags field of the given primitives table. p is dereferenced
 * without a null check, so it must be a valid pointer. */
DWORD primitives_flags(primitives_t* p)
{
return p->flags;
}
/* Map an avc444_frame_type value to its enumerator name. Values without a
 * matching enumerator yield "INVALID_FRAME_TYPE". The returned string is a
 * literal and must not be freed. */
const char* primitives_avc444_frame_type_str(avc444_frame_type type)
{
    const char* label = "INVALID_FRAME_TYPE";

    switch (type)
    {
        case AVC444_LUMA:
            label = "AVC444_LUMA";
            break;
        case AVC444_CHROMAv1:
            label = "AVC444_CHROMAv1";
            break;
        case AVC444_CHROMAv2:
            label = "AVC444_CHROMAv2";
            break;
        default:
            break;
    }

    return label;
}
/* Map a primitive_hints value to its enumerator name. Values without a matching
 * enumerator yield "PRIMITIVES_UNKNOWN". The returned string is a literal and
 * must not be freed. */
const char* primtives_hint_str(primitive_hints hint)
{
    const char* label = "PRIMITIVES_UNKNOWN";

    switch (hint)
    {
        case PRIMITIVES_PURE_SOFT:
            label = "PRIMITIVES_PURE_SOFT";
            break;
        case PRIMITIVES_ONLY_CPU:
            label = "PRIMITIVES_ONLY_CPU";
            break;
        case PRIMITIVES_ONLY_GPU:
            label = "PRIMITIVES_ONLY_GPU";
            break;
        case PRIMITIVES_AUTODETECT:
            label = "PRIMITIVES_AUTODETECT";
            break;
        default:
            break;
    }

    return label;
}

View File

@@ -0,0 +1,383 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized YCoCg<->RGB conversion operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_YCoCg.h"
#include "prim_internal.h"
#include "prim_templates.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <tmmintrin.h>
static primitives_t* generic = nullptr;
/* ------------------------------------------------------------------------- */
/* SSSE3 YCoCg -> RGB conversion, 8 pixels (two 128-bit registers) per iteration.
 * The dispatcher in this file selects this variant for PIXEL_FORMAT_BGRX32 and
 * PIXEL_FORMAT_BGRA32 destinations.
 *
 * pSrc/srcStep, pDst/dstStep: source and destination buffers and their row
 *   strides in bytes; each stride must cover at least width 32-bit pixels
 *   (asserted below).
 * shift: left shift applied to the Co/Cg samples; the code shifts by shift-1
 *   to fold in the division by two.
 *   NOTE(review): shift == 0 makes dataShift negative - presumably callers
 *   always pass shift >= 1; confirm.
 * withAlpha: TRUE keeps the source alpha byte, FALSE writes 0xFF.
 *
 * Widths below 8 or a destination that is not 4-byte aligned are handed to the
 * generic implementation entirely; the per-row tail (width % 8 pixels) is also
 * handed to it. Returns PRIMITIVES_SUCCESS or the generic routine's status. */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
UINT32 dstStep, UINT32 width, UINT32 height,
UINT8 shift, BOOL withAlpha)
{
const BYTE* sptr = pSrc;
BYTE* dptr = pDst;
WINPR_ASSERT(srcStep / sizeof(UINT32) >= width);
WINPR_ASSERT(dstStep / sizeof(UINT32) >= width);
/* Bytes between the end of one row's pixels and the start of the next row. */
const size_t sRowBump = srcStep - width * sizeof(UINT32);
const size_t dRowBump = dstStep - width * sizeof(UINT32);
/* Shift left by "shift" and divide by two is the same as shift
 * left by "shift-1".
 */
int dataShift = shift - 1;
/* Clears the low bits that the 16-bit shift drags in from the neighbouring byte. */
BYTE mask = (BYTE)(0xFFU << dataShift);
/* Let's say the data is of the form:
 * a0y0o0g0 a1y1o1g1 a2y2o2g2...
 * Apply:
 * |R| | 1 1/2 -1/2 | |y|
 * |G| = | 1 0 1/2 | * |o|
 * |B| | 1 -1/2 -1/2 | |g|
 * where Y is 8-bit unsigned and o & g are 8-bit signed.
 */
if ((width < 8) || (ULONG_PTR)dptr & 0x03)
{
/* Too small, or we'll never hit a 16-byte boundary. Punt. */
return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
width, height, shift, withAlpha);
}
for (UINT32 h = 0; h < height; h++)
{
UINT32 w = width;
while (w >= 8)
{
__m128i R0;
__m128i R1;
__m128i R2;
__m128i R3;
__m128i R4;
__m128i R5;
__m128i R6;
__m128i R7;
R0 = LOAD_SI128(sptr);
sptr += (128 / 8);
R1 = LOAD_SI128(sptr);
sptr += (128 / 8);
/* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
/* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
/* Shuffle to pack all the like types together. */
R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
R3 = _mm_shuffle_epi8(R0, R2);
R4 = _mm_shuffle_epi8(R1, R2);
/* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
/* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
R5 = _mm_unpackhi_epi32(R3, R4);
R6 = _mm_unpacklo_epi32(R3, R4);
/* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
/* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
/* Save alphas aside */
if (withAlpha)
R7 = _mm_unpackhi_epi64(R5, R5);
else
R7 = mm_set1_epu32(0xFFFFFFFFU);
/* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
/* Expand Y's from 8-bit unsigned to 16-bit signed. */
R1 = mm_set1_epu32(0);
R0 = _mm_unpacklo_epi8(R5, R1);
/* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
/* Shift Co's and Cg's by (shift-1). -1 covers division by two.
 * Note: this must be done before sign-conversion.
 * Note also there is no slli_epi8, so we have to use a 16-bit
 * version and then mask.
 */
R6 = _mm_slli_epi16(R6, dataShift);
R1 = mm_set1_epu8(mask);
R6 = _mm_and_si128(R6, R1);
/* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
/* Expand Co's from 8-bit signed to 16-bit signed */
R1 = _mm_unpackhi_epi8(R6, R6);
R1 = _mm_srai_epi16(R1, 8);
/* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
/* Expand Cg's from 8-bit signed to 16-bit signed */
R2 = _mm_unpacklo_epi8(R6, R6);
R2 = _mm_srai_epi16(R2, 8);
/* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
/* Get Y - halfCg and save */
R6 = _mm_subs_epi16(R0, R2);
/* R = (Y-halfCg) + halfCo */
R3 = _mm_adds_epi16(R6, R1);
/* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
/* G = Y + Cg(/2) */
R4 = _mm_adds_epi16(R0, R2);
/* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
/* B = (Y-halfCg) - Co(/2) */
R5 = _mm_subs_epi16(R6, R1);
/* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
/* Repack R's & B's. */
R0 = _mm_packus_epi16(R3, R5);
/* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */
/* Repack G's. */
R1 = _mm_packus_epi16(R4, R4);
/* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
/* And add the A's. */
R1 = _mm_unpackhi_epi64(R1, R7);
/* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */
/* Now do interleaving again. */
R2 = _mm_unpacklo_epi8(R0, R1);
/* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
R3 = _mm_unpackhi_epi8(R0, R1);
/* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
R4 = _mm_unpacklo_epi16(R2, R3);
/* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
R5 = _mm_unpackhi_epi16(R2, R3);
/* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
STORE_SI128(dptr, R4);
dptr += (128 / 8);
STORE_SI128(dptr, R5);
dptr += (128 / 8);
w -= 8;
}
/* Handle any remainder pixels. */
if (w > 0)
{
pstatus_t status = 0;
/* One row of w pixels; the strides are passed through but only one row is read. */
status = generic->YCoCgToRGB_8u_AC4R(
sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
WINPR_ASSERTING_INT_CAST(INT32, dstStep), w, 1, shift, withAlpha);
if (status != PRIMITIVES_SUCCESS)
return status;
sptr += w * sizeof(UINT32);
dptr += w * sizeof(UINT32);
}
/* Skip the row padding to reach the start of the next row. */
sptr += sRowBump;
dptr += dRowBump;
}
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/**
 * SSSE3 YCoCg -> RGB conversion, 8 pixels (two 128-bit registers) per iteration.
 * The dispatcher in this file selects this variant for PIXEL_FORMAT_RGBX32 and
 * PIXEL_FORMAT_RGBA32 destinations. It differs from the _invert variant only in
 * the operand order of the R/B repack.
 *
 * @param pSrc      source pixels, 4 bytes each
 * @param srcStep   source row stride in bytes, must cover at least @p width pixels
 * @param pDst      destination pixels, 4 bytes each
 * @param DstFormat destination pixel format, forwarded to the generic fallback
 * @param dstStep   destination row stride in bytes, must cover at least @p width pixels
 * @param width     pixels per row
 * @param height    number of rows
 * @param shift     left shift for the Co/Cg samples; shift-1 is applied to fold
 *                  in the division by two.
 *                  NOTE(review): shift == 0 makes dataShift negative - presumably
 *                  callers always pass shift >= 1; confirm.
 * @param withAlpha TRUE keeps the source alpha byte, FALSE writes 0xFF
 * @return PRIMITIVES_SUCCESS, or the status of the generic routine on failure
 */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc,
                                                     UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                     UINT32 DstFormat, UINT32 dstStep, UINT32 width,
                                                     UINT32 height, UINT8 shift, BOOL withAlpha)
{
    const BYTE* sptr = pSrc;
    BYTE* dptr = pDst;

    /* Same precondition the _invert variant asserts: without it the row bump
     * subtraction below wraps around and the pointers run wild. */
    WINPR_ASSERT(srcStep / sizeof(UINT32) >= width);
    WINPR_ASSERT(dstStep / sizeof(UINT32) >= width);

    /* Bytes between the end of one row's pixels and the start of the next row. */
    const size_t sRowBump = srcStep - width * sizeof(UINT32);
    const size_t dRowBump = dstStep - width * sizeof(UINT32);

    /* Shift left by "shift" and divide by two is the same as shift
     * left by "shift-1".
     */
    int dataShift = shift - 1;
    /* Clears the low bits that the 16-bit shift drags in from the neighbouring byte. */
    BYTE mask = (BYTE)(0xFFU << dataShift);

    /* Let's say the data is of the form:
     * a0y0o0g0 a1y1o1g1 a2y2o2g2...
     * Apply:
     * |R| | 1 1/2 -1/2 | |y|
     * |G| = | 1 0 1/2 | * |o|
     * |B| | 1 -1/2 -1/2 | |g|
     * where Y is 8-bit unsigned and o & g are 8-bit signed.
     */
    if ((width < 8) || (ULONG_PTR)dptr & 0x03)
    {
        /* Too small, or we'll never hit a 16-byte boundary. Punt. */
        return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
                                           DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
                                           width, height, shift, withAlpha);
    }

    for (UINT32 h = 0; h < height; h++)
    {
        UINT32 w = width;

        while (w >= 8)
        {
            __m128i R7;

            /* LOAD_SI128 has no alignment requirement. */
            __m128i R0 = LOAD_SI128(sptr);
            sptr += (128 / 8);
            __m128i R1 = LOAD_SI128(sptr);
            sptr += (128 / 8);
            /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
            /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */

            /* Shuffle to pack all the like types together. */
            __m128i R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
            __m128i R3 = _mm_shuffle_epi8(R0, R2);
            __m128i R4 = _mm_shuffle_epi8(R1, R2);
            /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
            /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
            __m128i R5 = _mm_unpackhi_epi32(R3, R4);
            __m128i R6 = _mm_unpacklo_epi32(R3, R4);
            /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
            /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */

            /* Save alphas aside */
            if (withAlpha)
                R7 = _mm_unpackhi_epi64(R5, R5);
            else
                R7 = mm_set1_epu32(0xFFFFFFFFU);
            /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */

            /* Expand Y's from 8-bit unsigned to 16-bit signed. */
            R1 = mm_set1_epu32(0);
            R0 = _mm_unpacklo_epi8(R5, R1);
            /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */

            /* Shift Co's and Cg's by (shift-1). -1 covers division by two.
             * Note: this must be done before sign-conversion.
             * Note also there is no slli_epi8, so we have to use a 16-bit
             * version and then mask.
             */
            R6 = _mm_slli_epi16(R6, dataShift);
            R1 = mm_set1_epu8(mask);
            R6 = _mm_and_si128(R6, R1);
            /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */

            /* Expand Co's from 8-bit signed to 16-bit signed */
            R1 = _mm_unpackhi_epi8(R6, R6);
            R1 = _mm_srai_epi16(R1, 8);
            /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */

            /* Expand Cg's from 8-bit signed to 16-bit signed */
            R2 = _mm_unpacklo_epi8(R6, R6);
            R2 = _mm_srai_epi16(R2, 8);
            /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */

            /* Get Y - halfCg and save */
            R6 = _mm_subs_epi16(R0, R2);
            /* R = (Y-halfCg) + halfCo */
            R3 = _mm_adds_epi16(R6, R1);
            /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
            /* G = Y + Cg(/2) */
            R4 = _mm_adds_epi16(R0, R2);
            /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
            /* B = (Y-halfCg) - Co(/2) */
            R5 = _mm_subs_epi16(R6, R1);
            /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */

            /* Repack R's & B's. */
            /* This line is the only diff between inverted and non-inverted.
             * Unfortunately, it would be expensive to check "inverted"
             * every time through this loop.
             */
            R0 = _mm_packus_epi16(R5, R3);
            /* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */
            /* Repack G's. */
            R1 = _mm_packus_epi16(R4, R4);
            /* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
            /* And add the A's. */
            R1 = _mm_unpackhi_epi64(R1, R7);
            /* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */

            /* Now do interleaving again. */
            R2 = _mm_unpacklo_epi8(R0, R1);
            /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
            R3 = _mm_unpackhi_epi8(R0, R1);
            /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
            R4 = _mm_unpacklo_epi16(R2, R3);
            /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
            R5 = _mm_unpackhi_epi16(R2, R3);
            /* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */

            STORE_SI128(dptr, R4);
            dptr += (128 / 8);
            STORE_SI128(dptr, R5);
            dptr += (128 / 8);
            w -= 8;
        }

        /* Handle any remainder pixels. */
        if (w > 0)
        {
            /* One row of w pixels; w is already UINT32, no cast needed. */
            const pstatus_t status = generic->YCoCgToRGB_8u_AC4R(
                sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
                WINPR_ASSERTING_INT_CAST(INT32, dstStep), w, 1, shift, withAlpha);
            if (status != PRIMITIVES_SUCCESS)
                return status;
            sptr += w * sizeof(UINT32);
            dptr += w * sizeof(UINT32);
        }

        /* Skip the row padding to reach the start of the next row. */
        sptr += sRowBump;
        dptr += dRowBump;
    }

    return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Entry point installed into the primitives table: routes the conversion to the
 * SSSE3 variant matching the destination byte order, or to the generic
 * implementation for every other destination format. The signed strides are
 * converted to unsigned (with an asserting cast) only on the SSSE3 paths. */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
                                           BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                           INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
                                           BOOL withAlpha)
{
    const BOOL isBgr =
        (DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32);
    const BOOL isRgb =
        (DstFormat == PIXEL_FORMAT_RGBX32) || (DstFormat == PIXEL_FORMAT_RGBA32);

    if (!isBgr && !isRgb)
    {
        /* No accelerated path for this format. */
        return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
                                           height, shift, withAlpha);
    }

    const UINT32 uSrcStep = WINPR_ASSERTING_INT_CAST(UINT32, srcStep);
    const UINT32 uDstStep = WINPR_ASSERTING_INT_CAST(UINT32, dstStep);

    if (isBgr)
        return ssse3_YCoCgRToRGB_8u_AC4R_invert(pSrc, uSrcStep, pDst, DstFormat, uDstStep, width,
                                                height, shift, withAlpha);

    return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(pSrc, uSrcStep, pDst, DstFormat, uDstStep, width,
                                               height, shift, withAlpha);
}
#endif
/* ------------------------------------------------------------------------- */
/* Install the SSSE3 YCoCg->RGB routine into prims.
 * With SSE_AVX_INTRINSICS_ENABLED: caches the generic table (used by the SSSE3
 * code as fallback) and overrides prims->YCoCgToRGB_8u_AC4R.
 * Otherwise: only logs and leaves prims untouched. */
void primitives_init_YCoCg_ssse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
generic = primitives_get_generic();
WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
#else
WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available");
WINPR_UNUSED(prims);
#endif
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,187 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized add operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_add.h"
#include "prim_internal.h"
#include "prim_templates.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>
static primitives_t* generic = nullptr;
/* ------------------------------------------------------------------------- */
/* Instantiates sse3_add_16s from the SSE3_SSD_ROUTINE template, which is not
 * defined in this file (presumably prim_templates.h - confirm). The arguments
 * passed are: the new function's name, the element type, the generic fallback,
 * the SSE intrinsic (saturating 16-bit add) and a single-element scalar
 * statement that calls the generic routine for one element. */
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
generic->add_16s(sptr1++, sptr2++, dptr++, 1))
/**
 * In-place saturating addition of two INT16 vectors using SSE.
 *
 * For every index i < ulen the signed-saturated sum of pSrcDst1[i] and
 * pSrcDst2[i] is written back to BOTH buffers.
 *
 * Strategy: a scalar lead-in via the generic routine brings pSrcDst1 to a
 * 16-byte boundary, then blocks of 32 elements (4 registers) and 8 elements
 * (1 register) are processed, and the generic routine finishes the tail.
 * Inputs shorter than 16 elements, or with pSrcDst1 not 2-byte aligned, are
 * handed to the generic routine entirely.
 *
 * @param pSrcDst1 first operand and destination
 * @param pSrcDst2 second operand and destination
 * @param ulen     number of INT16 elements in each buffer
 * @return PRIMITIVES_SUCCESS, or the status of the generic routine on failure
 */
static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
                                      INT16* WINPR_RESTRICT pSrcDst2, UINT32 ulen)
{
    const int shifts = 2;
    INT16* dptr1 = pSrcDst1;
    INT16* dptr2 = pSrcDst2;

    if (ulen < 16) /* pointless if too small */
        return generic->add_16s_inplace(pSrcDst1, pSrcDst2, ulen);

    const UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
    if ((ULONG_PTR)pSrcDst1 & offBeatMask)
    {
        /* Incrementing the pointer skips over 16-byte boundary. */
        return generic->add_16s_inplace(pSrcDst1, pSrcDst2, ulen);
    }

    /* Elements still to be processed; every stage below must deduct its share. */
    size_t len = ulen;

    /* Get to the 16-byte boundary now. rem is the number of INT16 elements
     * dptr1 sits past the previous boundary (1..7 when non-zero). */
    const size_t rem = ((UINT_PTR)dptr1 & 0xf) / sizeof(INT16);
    if (rem != 0)
    {
        /* One 128-bit register holds 16 / sizeof(INT16) = 8 elements. */
        const UINT32 add = (UINT32)(16 / sizeof(INT16)) - (UINT32)rem;
        pstatus_t status = generic->add_16s_inplace(dptr1, dptr2, add);
        if (status != PRIMITIVES_SUCCESS)
            return status;
        dptr1 += add;
        dptr2 += add;
        /* Without this the vector loops would process ulen further elements
         * and run `add` elements past the end of both buffers. */
        len -= add;
    }

    /* Use 4 128-bit SSE registers: 32 elements per iteration.
     * LOAD_SI128 / STORE_SI128 have no alignment requirement, so the same loop
     * serves regardless of how dptr2 is aligned. */
    size_t count = len >> (7 - shifts);
    len -= count << (7 - shifts);
    while (count--)
    {
        const __m128i* vsptr1 = (const __m128i*)dptr1;
        const __m128i* vsptr2 = (const __m128i*)dptr2;
        __m128i* vdptr1 = (__m128i*)dptr1;
        __m128i* vdptr2 = (__m128i*)dptr2;

        __m128i xmm0 = LOAD_SI128(vsptr1++);
        __m128i xmm1 = LOAD_SI128(vsptr1++);
        __m128i xmm2 = LOAD_SI128(vsptr1++);
        __m128i xmm3 = LOAD_SI128(vsptr1++);
        __m128i xmm4 = LOAD_SI128(vsptr2++);
        __m128i xmm5 = LOAD_SI128(vsptr2++);
        __m128i xmm6 = LOAD_SI128(vsptr2++);
        __m128i xmm7 = LOAD_SI128(vsptr2++);

        xmm0 = _mm_adds_epi16(xmm0, xmm4);
        xmm1 = _mm_adds_epi16(xmm1, xmm5);
        xmm2 = _mm_adds_epi16(xmm2, xmm6);
        xmm3 = _mm_adds_epi16(xmm3, xmm7);

        /* The sum goes to both buffers. */
        STORE_SI128(vdptr1++, xmm0);
        STORE_SI128(vdptr1++, xmm1);
        STORE_SI128(vdptr1++, xmm2);
        STORE_SI128(vdptr1++, xmm3);
        STORE_SI128(vdptr2++, xmm0);
        STORE_SI128(vdptr2++, xmm1);
        STORE_SI128(vdptr2++, xmm2);
        STORE_SI128(vdptr2++, xmm3);

        dptr1 = (INT16*)vdptr1;
        dptr2 = (INT16*)vdptr2;
    }

    /* Use a single 128-bit SSE register: 8 elements per iteration. */
    count = len >> (5 - shifts);
    len -= count << (5 - shifts);
    while (count--)
    {
        const __m128i* vsptr1 = (const __m128i*)dptr1;
        const __m128i* vsptr2 = (const __m128i*)dptr2;
        __m128i* vdptr1 = (__m128i*)dptr1;
        __m128i* vdptr2 = (__m128i*)dptr2;

        __m128i xmm0 = LOAD_SI128(vsptr1);
        __m128i xmm1 = LOAD_SI128(vsptr2);
        xmm0 = _mm_adds_epi16(xmm0, xmm1);
        STORE_SI128(vdptr1++, xmm0);
        STORE_SI128(vdptr2++, xmm0);

        dptr1 = (INT16*)vdptr1;
        dptr2 = (INT16*)vdptr2;
    }

    /* Finish off the remainder. */
    if (len > 0)
        return generic->add_16s_inplace(dptr1, dptr2, WINPR_ASSERTING_INT_CAST(uint32_t, len));

    return PRIMITIVES_SUCCESS;
}
#endif
/* ------------------------------------------------------------------------- */
/* Install the SSE add routines into prims.
 * With SSE_AVX_INTRINSICS_ENABLED: caches the generic table (used by the SSE
 * code as fallback) and overrides prims->add_16s and prims->add_16s_inplace.
 * Otherwise: only logs and leaves prims untouched. */
void primitives_init_add_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
generic = primitives_get_generic();
WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
prims->add_16s = sse3_add_16s;
prims->add_16s_inplace = sse3_add_16s_inplace;
#else
WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
WINPR_UNUSED(prims);
#endif
}

View File

@@ -0,0 +1,215 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized alpha blending routines.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
* Note: this code assumes the second operand is fully opaque,
* e.g.
* newval = alpha1*val1 + (1-alpha1)*val2
* rather than
* newval = alpha1*val1 + (1-alpha1)*alpha2*val2
* The IPP gives other options.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_alphaComp.h"
#include "prim_internal.h"
#include "prim_avxsse.h"
/* ------------------------------------------------------------------------- */
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>
static primitives_t* generic = nullptr;
/* SSE alpha composition of two 32-bit-per-pixel images, 4 pixels per iteration.
 *
 * Per channel the loop computes ((alpha1 + 1) * (src1 - src2) >> 8) + src2,
 * i.e. the "second operand is fully opaque" blend described in the file header.
 *
 * pSrc1/src1Step, pSrc2/src2Step, pDst/dstStep: the two sources and the
 *   destination with their row strides in bytes.
 *   NOTE(review): the jump computations assume each stride is >= width * 4 and
 *   a multiple of 4 - confirm against callers.
 * width, height: size in pixels; a zero dimension returns success immediately.
 *
 * Rows narrower than 4 pixels go to the generic routine entirely. In each row a
 * scalar lead-in (generic routine) brings dptr to a 16-byte boundary, and the
 * row tail (fewer than 4 pixels) is finished by the generic routine as well.
 * Returns PRIMITIVES_SUCCESS or the generic routine's status. */
static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width,
UINT32 height)
{
const UINT32* sptr1 = (const UINT32*)pSrc1;
const UINT32* sptr2 = (const UINT32*)pSrc2;
if ((width <= 0) || (height <= 0))
return PRIMITIVES_SUCCESS;
if (width < 4) /* pointless if too small */
{
return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width,
height);
}
UINT32* dptr = (UINT32*)pDst;
const size_t linebytes = width * sizeof(UINT32);
/* Row padding expressed in UINT32 elements, since the pointers are UINT32*. */
const size_t src1Jump = (src1Step - linebytes) / sizeof(UINT32);
const size_t src2Jump = (src2Step - linebytes) / sizeof(UINT32);
const size_t dstJump = (dstStep - linebytes) / sizeof(UINT32);
/* xmm0: all zero, used to widen bytes to 16 bit; xmm1: 1 in every 16-bit lane. */
__m128i xmm0 = mm_set1_epu32(0);
__m128i xmm1 = _mm_set1_epi16(1);
for (UINT32 y = 0; y < height; ++y)
{
uint32_t pixels = width;
uint32_t count = 0;
/* Get to the 16-byte boundary now. */
uint32_t leadIn = 0;
switch ((ULONG_PTR)dptr & 0x0f)
{
case 0:
leadIn = 0;
break;
case 4:
leadIn = 3;
break;
case 8:
leadIn = 2;
break;
case 12:
leadIn = 1;
break;
default:
/* We'll never hit a 16-byte boundary, so do the whole
 * thing the slow way.
 */
leadIn = width;
break;
}
if (leadIn)
{
pstatus_t status = 0;
/* One row of leadIn pixels through the generic routine. */
status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
src2Step, (BYTE*)dptr, dstStep, leadIn, 1);
if (status != PRIMITIVES_SUCCESS)
return status;
sptr1 += leadIn;
sptr2 += leadIn;
dptr += leadIn;
pixels -= leadIn;
}
/* Use SSE registers to do 4 pixels at a time. */
count = pixels >> 2;
pixels -= count << 2;
while (count--)
{
__m128i xmm2;
__m128i xmm3;
__m128i xmm4;
__m128i xmm5;
__m128i xmm6;
__m128i xmm7;
/* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
xmm2 = LOAD_SI128(sptr1);
sptr1 += 4;
/* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
xmm3 = LOAD_SI128(sptr2);
sptr2 += 4;
/* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
/* 00Bf00Gf00Rf00Af00Be00Ge00Re00Ae */
xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
/* subtract */
xmm6 = _mm_subs_epi16(xmm4, xmm5);
/* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
/* Add one to alphas */
xmm4 = _mm_adds_epi16(xmm4, xmm1);
/* Multiply and take low word */
xmm4 = _mm_mullo_epi16(xmm4, xmm6);
/* Shift 8 right */
xmm4 = _mm_srai_epi16(xmm4, 8);
/* Add xmm5 */
xmm4 = _mm_adds_epi16(xmm4, xmm5);
/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
/* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
/* subtract */
xmm7 = _mm_subs_epi16(xmm5, xmm6);
/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
/* Add one to alphas */
xmm5 = _mm_adds_epi16(xmm5, xmm1);
/* Multiply and take low word */
xmm5 = _mm_mullo_epi16(xmm5, xmm7);
/* Shift 8 right */
xmm5 = _mm_srai_epi16(xmm5, 8);
/* Add xmm6 */
xmm5 = _mm_adds_epi16(xmm5, xmm6);
/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk00Ak */
/* Must mask off remainders or pack gets confused */
xmm3 = _mm_set1_epi16(0x00ffU);
xmm4 = _mm_and_si128(xmm4, xmm3);
xmm5 = _mm_and_si128(xmm5, xmm3);
/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
xmm5 = _mm_packus_epi16(xmm5, xmm4);
STORE_SI128(dptr, xmm5);
dptr += 4;
}
/* Finish off the remainder. */
if (pixels)
{
pstatus_t status = 0;
/* One row of the remaining (< 4) pixels through the generic routine. */
status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
src2Step, (BYTE*)dptr, dstStep, pixels, 1);
if (status != PRIMITIVES_SUCCESS)
return status;
sptr1 += pixels;
sptr2 += pixels;
dptr += pixels;
}
/* Jump to next row. */
sptr1 += src1Jump;
sptr2 += src2Jump;
dptr += dstJump;
}
return PRIMITIVES_SUCCESS;
}
#endif
/* ------------------------------------------------------------------------- */
/* Install the SSE alpha composition routine into prims.
 * With SSE_AVX_INTRINSICS_ENABLED: caches the generic table (used by the SSE
 * code as fallback) and overrides prims->alphaComp_argb.
 * Otherwise: only logs and leaves prims untouched. */
void primitives_init_alphaComp_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
generic = primitives_get_generic();
WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
prims->alphaComp_argb = sse2_alphaComp_argb;
#else
WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
WINPR_UNUSED(prims);
#endif
}

View File

@@ -0,0 +1,54 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized Logical operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_andor.h"
#include "prim_internal.h"
#include "prim_templates.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>
static primitives_t* generic = nullptr;
/* ------------------------------------------------------------------------- */
/* Instantiate sse3_andC_32u and sse3_orC_32u from the SSE3_SCD_PRE_ROUTINE
 * template, which is not defined in this file (presumably prim_templates.h -
 * confirm). The arguments passed are: the new function's name, the element
 * type, the generic fallback, the SSE intrinsic (bitwise AND / OR) and a
 * single-element scalar statement combining one source element with `val`. */
SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, generic->andC_32u, _mm_and_si128,
*dptr++ = *sptr++ & val)
SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, generic->orC_32u, _mm_or_si128, *dptr++ = *sptr++ | val)
#endif
/* ------------------------------------------------------------------------- */
/* Install the SSE and/or-with-constant routines into prims.
 * With SSE_AVX_INTRINSICS_ENABLED: caches the generic table (used by the SSE
 * code as fallback) and overrides prims->andC_32u and prims->orC_32u.
 * Otherwise: only logs and leaves prims untouched. */
void primitives_init_andor_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
generic = primitives_get_generic();
WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
prims->andC_32u = sse3_andC_32u;
prims->orC_32u = sse3_orC_32u;
#else
WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
WINPR_UNUSED(prims);
#endif
}

View File

@@ -0,0 +1,79 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* FreeRDP primitives SSE implementation
*
* Copyright 2025 Armin Novak <armin.novak@thincast.com>
* Copyright 2025 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <winpr/cast.h>
#include "../../core/simd.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>
/* Unsigned front end for _mm_set_epi32: each uint32_t argument is cast to
 * int32_t and forwarded in the same order (val1 first, val4 last). */
WINPR_ATTR_NODISCARD
static inline __m128i mm_set_epu32(uint32_t val1, uint32_t val2, uint32_t val3, uint32_t val4)
{
return _mm_set_epi32(WINPR_CXX_COMPAT_CAST(int32_t, val1), WINPR_CXX_COMPAT_CAST(int32_t, val2),
WINPR_CXX_COMPAT_CAST(int32_t, val3),
WINPR_CXX_COMPAT_CAST(int32_t, val4));
}
/* Unsigned front end for _mm_set_epi8: each uint8_t argument is cast to int8_t
 * and forwarded in the same order (val1 first, val16 last). */
WINPR_ATTR_NODISCARD
static inline __m128i mm_set_epu8(uint8_t val1, uint8_t val2, uint8_t val3, uint8_t val4,
uint8_t val5, uint8_t val6, uint8_t val7, uint8_t val8,
uint8_t val9, uint8_t val10, uint8_t val11, uint8_t val12,
uint8_t val13, uint8_t val14, uint8_t val15, uint8_t val16)
{
return _mm_set_epi8(WINPR_CXX_COMPAT_CAST(int8_t, val1), WINPR_CXX_COMPAT_CAST(int8_t, val2),
WINPR_CXX_COMPAT_CAST(int8_t, val3), WINPR_CXX_COMPAT_CAST(int8_t, val4),
WINPR_CXX_COMPAT_CAST(int8_t, val5), WINPR_CXX_COMPAT_CAST(int8_t, val6),
WINPR_CXX_COMPAT_CAST(int8_t, val7), WINPR_CXX_COMPAT_CAST(int8_t, val8),
WINPR_CXX_COMPAT_CAST(int8_t, val9), WINPR_CXX_COMPAT_CAST(int8_t, val10),
WINPR_CXX_COMPAT_CAST(int8_t, val11), WINPR_CXX_COMPAT_CAST(int8_t, val12),
WINPR_CXX_COMPAT_CAST(int8_t, val13), WINPR_CXX_COMPAT_CAST(int8_t, val14),
WINPR_CXX_COMPAT_CAST(int8_t, val15), WINPR_CXX_COMPAT_CAST(int8_t, val16));
}
/* Unsigned front end for _mm_set1_epi32: casts val to int32_t and broadcasts it
 * to every 32-bit lane. */
WINPR_ATTR_NODISCARD
static inline __m128i mm_set1_epu32(uint32_t val)
{
return _mm_set1_epi32(WINPR_CXX_COMPAT_CAST(int32_t, val));
}
/* Unsigned front end for _mm_set1_epi8: casts val to int8_t and broadcasts it
 * to every 8-bit lane. */
WINPR_ATTR_NODISCARD
static inline __m128i mm_set1_epu8(uint8_t val)
{
return _mm_set1_epi8(WINPR_CXX_COMPAT_CAST(int8_t, val));
}
/* Load 128 bits from ptr with _mm_lddqu_si128, an unaligned load, so ptr may
 * have any alignment. Accepts any pointer type via const void*. */
WINPR_ATTR_NODISCARD
static inline __m128i LOAD_SI128(const void* ptr)
{
    return _mm_lddqu_si128(WINPR_CXX_COMPAT_CAST(const __m128i*, ptr));
}
/* Store the 128 bits of val to ptr with _mm_storeu_si128, an unaligned store,
 * so ptr may have any alignment. Accepts any pointer type via void*. */
static inline void STORE_SI128(void* ptr, __m128i val)
{
    _mm_storeu_si128(WINPR_CXX_COMPAT_CAST(__m128i*, ptr), val);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,278 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Copy operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <winpr/sysinfo.h>
#include <freerdp/config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <freerdp/log.h>
#include "prim_internal.h"
#include "prim_copy.h"
#include "../codec/color.h"
#include <freerdp/codec/color.h>
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <immintrin.h>
/* Unsigned-argument wrapper around _mm256_set_epi32: the eight values are
 * forwarded in the same order (i0 first), each cast to int32_t. */
static inline __m256i mm256_set_epu32(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3,
                                      uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7)
{
	const int32_t a = (int32_t)i0;
	const int32_t b = (int32_t)i1;
	const int32_t c = (int32_t)i2;
	const int32_t d = (int32_t)i3;
	const int32_t e = (int32_t)i4;
	const int32_t f = (int32_t)i5;
	const int32_t g = (int32_t)i6;
	const int32_t h = (int32_t)i7;
	return _mm256_set_epi32(a, b, c, d, e, f, g, h);
}
/* Copy a rectangle of 3-byte BGR pixels into a 4-byte-per-pixel destination,
 * leaving the fourth (alpha) byte of every destination pixel untouched.
 *
 * pDstData/pSrcData       base pointers of the destination/source images
 * nDstStep/nSrcStep       row strides in bytes
 * nXDst,nYDst/nXSrc,nYSrc top-left pixel of the rectangle in each image
 * nWidth,nHeight          rectangle size in pixels
 * *VMultiplier/*VOffset   row r is addressed at
 *                         VMultiplier * (r + nY) * step + VOffset
 *
 * Always returns PRIMITIVES_SUCCESS.
 */
static inline pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
                                                     UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
                                                     UINT32 nHeight,
                                                     const BYTE* WINPR_RESTRICT pSrcData,
                                                     UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                     int64_t srcVMultiplier, int64_t srcVOffset,
                                                     int64_t dstVMultiplier, int64_t dstVOffset)
{
	const int64_t srcByte = 3;
	const int64_t dstByte = 4;

	/* Selects the alpha byte of each 32-bit destination pixel. */
	const __m256i mask = mm256_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
	                                     0xFF000000, 0xFF000000, 0xFF000000);
	/* Per-lane shuffle expanding 3-byte pixels to 4-byte pixels. Pixel 4 and the
	 * first byte of pixel 5 live in the low lane and are zeroed here. */
	const __m256i smask = mm256_set_epu32(0xff171615, 0xff141312, 0xff1110ff, 0xffffffff,
	                                      0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
	/* Picks source bytes 12..15 out of a broadcast copy of the low lane. */
	const __m256i shelpmask = mm256_set_epu32(0xffffffff, 0xffffffff, 0xffffff1f, 0xff1e1d1c,
	                                          0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
	/* Marks the four bytes that must come from the helper shuffle (loop invariant). */
	const __m256i bmask = _mm256_set_epi32(0x00000000, 0x00000000, 0x000000FF, 0x00FFFFFF,
	                                       0x00000000, 0x00000000, 0x00000000, 0x00000000);

	/* Each vector step converts 8 pixels (24 source bytes) but loads 32 bytes,
	 * i.e. it touches ceil(32 / 3) = 11 source pixels. Only run the vector loop
	 * while all 11 lie inside the rectangle, so the load never reads beyond the
	 * pixels being copied (and therefore never beyond the source buffer). */
	const int64_t loadPixels = 11;
	const int64_t pixels = nWidth;

	for (int64_t y = 0; y < nHeight; y++)
	{
		const BYTE* WINPR_RESTRICT srcLine =
		    &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstLine =
		    &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];

		int64_t x = 0;
		for (; x + loadPixels <= pixels; x += 8)
		{
			const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
			__m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
			const __m256i s0 = _mm256_loadu_si256(src);
			const __m256i s1 = _mm256_shuffle_epi8(s0, smask);

			/* _mm256_shuffle_epi8 can not cross 128bit lanes.
			 * manually copy these bytes with extract/insert */
			const __m256i sx = _mm256_broadcastsi128_si256(_mm256_extractf128_si256(s0, 0));
			const __m256i sxx = _mm256_shuffle_epi8(sx, shelpmask);
			const __m256i merged = _mm256_blendv_epi8(s1, sxx, bmask);

			/* Keep the destination's alpha bytes, replace everything else. */
			const __m256i s2 = _mm256_loadu_si256(dst);
			const __m256i d0 = _mm256_blendv_epi8(merged, s2, mask);
			_mm256_storeu_si256(dst, d0);
		}

		/* Scalar tail: copy the three colour bytes, skip the alpha byte. */
		for (; x < pixels; x++)
		{
			const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
			BYTE* dst = &dstLine[(x + nXDst) * dstByte];
			*dst++ = *src++;
			*dst++ = *src++;
			*dst++ = *src++;
		}
	}

	return PRIMITIVES_SUCCESS;
}
/* Copy a rectangle between two 4-byte-per-pixel images, replacing the first
 * three bytes of every destination pixel and leaving its fourth (alpha) byte
 * untouched. Rows are addressed as VMultiplier * (row + nY) * step + VOffset.
 * Always returns PRIMITIVES_SUCCESS.
 */
static inline pstatus_t avx2_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData,
                                                      UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
                                                      UINT32 nWidth, UINT32 nHeight,
                                                      const BYTE* WINPR_RESTRICT pSrcData,
                                                      UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                      int64_t srcVMultiplier, int64_t srcVOffset,
                                                      int64_t dstVMultiplier, int64_t dstVOffset)
{
	const int64_t bytesPerPixel = 4;

	/* Bytes FF FF FF 00 per pixel: the blend takes the source where the byte is
	 * 0xFF and keeps the destination where it is 0x00. */
	const __m256i colourBytes = _mm256_set1_epi32(0x00FFFFFF);

	/* Number of pixels handled 8 at a time; the rest goes through the tail loop. */
	const int64_t vectorEnd = nWidth - (nWidth % 8);

	for (int64_t row = 0; row < nHeight; row++)
	{
		const BYTE* WINPR_RESTRICT srcRow =
		    &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstRow =
		    &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];

		int64_t col = 0;
		while (col < vectorEnd)
		{
			const __m256i* in = (const __m256i*)&srcRow[(col + nXSrc) * bytesPerPixel];
			__m256i* out = (__m256i*)&dstRow[(col + nXDst) * bytesPerPixel];
			const __m256i fresh = _mm256_loadu_si256(in);
			const __m256i current = _mm256_loadu_si256(out);
			_mm256_storeu_si256(out, _mm256_blendv_epi8(current, fresh, colourBytes));
			col += 8;
		}

		while (col < nWidth)
		{
			const BYTE* in = &srcRow[(col + nXSrc) * bytesPerPixel];
			BYTE* out = &dstRow[(col + nXDst) * bytesPerPixel];
			out[0] = in[0];
			out[1] = in[1];
			out[2] = in[2];
			col++;
		}
	}

	return PRIMITIVES_SUCCESS;
}
/* Alpha-preserving copy dispatcher (AVX2).
 *
 * Handles three format combinations with vector code:
 *   BGR24            -> BGRX32/BGRA32
 *   BGRX32/BGRA32    -> BGRX32/BGRA32
 *   RGBX32/RGBA32    -> RGBX32/RGBA32
 * Every other combination is forwarded to the generic copy_no_overlap
 * primitive with the caller's original arguments and flags.
 */
static pstatus_t avx2_image_copy_no_overlap_dst_alpha(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier,
    int64_t dstVOffset)
{
	WINPR_ASSERT(pDstData);
	WINPR_ASSERT(pSrcData);

	const BOOL dstIsBgr32 =
	    (DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32);
	const BOOL dstIsRgb32 =
	    (DstFormat == PIXEL_FORMAT_RGBX32) || (DstFormat == PIXEL_FORMAT_RGBA32);
	const BOOL srcIsBgr32 =
	    (SrcFormat == PIXEL_FORMAT_BGRX32) || (SrcFormat == PIXEL_FORMAT_BGRA32);
	const BOOL srcIsRgb32 =
	    (SrcFormat == PIXEL_FORMAT_RGBX32) || (SrcFormat == PIXEL_FORMAT_RGBA32);

	if ((SrcFormat == PIXEL_FORMAT_BGR24) && dstIsBgr32)
	{
		return avx2_image_copy_bgr24_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
		                                    pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
		                                    srcVOffset, dstVMultiplier, dstVOffset);
	}

	/* Same channel order on both sides: the 32 -> 32 bit routine only cares
	 * about byte positions, so it serves both the BGR and the RGB family. */
	if ((srcIsBgr32 && dstIsBgr32) || (srcIsRgb32 && dstIsRgb32))
	{
		return avx2_image_copy_bgrx32_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
		                                     pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
		                                     srcVOffset, dstVMultiplier, dstVOffset);
	}

	primitives_t* fallback = primitives_get_generic();
	return fallback->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth,
	                                 nHeight, pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc,
	                                 palette, flags);
}
/* AVX2 entry point for the copy_no_overlap primitive.
 *
 * Returns PRIMITIVES_SUCCESS immediately for an empty rectangle and -1 for
 * dimensions above INT32_MAX or a NULL image pointer. A stride of 0 is replaced
 * by nWidth * bytes-per-pixel of the respective format. Then:
 *   - FREERDP_KEEP_DST_ALPHA set and DstFormat has alpha: alpha-preserving path
 *   - formats equal ignoring alpha: generic_image_copy_no_overlap_memcpy
 *   - otherwise: generic copy_no_overlap primitive
 */
static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
                                            UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
                                            UINT32 nWidth, UINT32 nHeight,
                                            const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
                                            UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                            const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
{
	if (!nWidth || !nHeight)
		return PRIMITIVES_SUCCESS;
	if ((nWidth > INT32_MAX) || (nHeight > INT32_MAX))
		return -1;
	if (!pSrcData || !pDstData)
		return -1;

	if (!nDstStep)
		nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
	if (!nSrcStep)
		nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);

	/* With a vertical flip the source is walked with a negative row multiplier,
	 * offset by (nHeight - 1) rows. */
	const BOOL flip = (flags & FREERDP_FLIP_VERTICAL) != 0;
	const int64_t srcVMultiplier = flip ? -1 : 1;
	const int64_t srcVOffset = flip ? (nHeight - 1ll) * nSrcStep : 0;
	const int64_t dstVMultiplier = 1;
	const int64_t dstVOffset = 0;

	const BOOL keepAlpha = (flags & FREERDP_KEEP_DST_ALPHA) != 0;
	if (keepAlpha && FreeRDPColorHasAlpha(DstFormat))
	{
		return avx2_image_copy_no_overlap_dst_alpha(
		    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat,
		    nSrcStep, nXSrc, nYSrc, palette, flags, srcVMultiplier, srcVOffset, dstVMultiplier,
		    dstVOffset);
	}

	if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
	{
		return generic_image_copy_no_overlap_memcpy(
		    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat,
		    nSrcStep, nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier,
		    dstVOffset, flags);
	}

	primitives_t* fallback = primitives_get_generic();
	return fallback->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth,
	                                 nHeight, pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc,
	                                 palette, flags);
}
#endif
/* ------------------------------------------------------------------------- */
/* Install the AVX2 copy_no_overlap implementation into `prims` when the SIMD
 * intrinsics are compiled in; otherwise only log and leave `prims` untouched. */
void primitives_init_copy_avx2_int(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(SSE_AVX_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or WITH_AVX2 or AVX2 intrinsics not available");
	WINPR_UNUSED(prims);
#else
	WLog_VRB(PRIM_TAG, "AVX2 optimizations");
	prims->copy_no_overlap = avx2_image_copy_no_overlap;
#endif
}

View File

@@ -0,0 +1,257 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Copy operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <winpr/sysinfo.h>
#include <freerdp/config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <freerdp/log.h>
#include "prim_internal.h"
#include "prim_avxsse.h"
#include "prim_copy.h"
#include "../codec/color.h"
#include <freerdp/codec/color.h>
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <immintrin.h>
/* Copy a rectangle of 3-byte BGR pixels into a 4-byte-per-pixel destination,
 * leaving the fourth (alpha) byte of every destination pixel untouched.
 *
 * pDstData/pSrcData       base pointers of the destination/source images
 * nDstStep/nSrcStep       row strides in bytes
 * nXDst,nYDst/nXSrc,nYSrc top-left pixel of the rectangle in each image
 * nWidth,nHeight          rectangle size in pixels
 * *VMultiplier/*VOffset   row r is addressed at
 *                         VMultiplier * (r + nY) * step + VOffset
 *
 * Always returns PRIMITIVES_SUCCESS.
 */
static inline pstatus_t sse_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
                                                    UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
                                                    UINT32 nHeight,
                                                    const BYTE* WINPR_RESTRICT pSrcData,
                                                    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                    int64_t srcVMultiplier, int64_t srcVOffset,
                                                    int64_t dstVMultiplier, int64_t dstVOffset)
{
	const int64_t srcByte = 3;
	const int64_t dstByte = 4;

	/* Selects the alpha byte of each 32-bit destination pixel. */
	const __m128i mask = mm_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000);
	/* Shuffle expanding four 3-byte pixels to four 4-byte pixels (4th byte zeroed). */
	const __m128i smask = mm_set_epu32(0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);

	/* Each vector step converts 4 pixels (12 source bytes) but loads 16 bytes,
	 * i.e. it touches ceil(16 / 3) = 6 source pixels. Only run the vector loop
	 * while all 6 lie inside the rectangle, so the load never reads beyond the
	 * pixels being copied (and therefore never beyond the source buffer). */
	const int64_t loadPixels = 6;
	const int64_t pixels = nWidth;

	for (int64_t y = 0; y < nHeight; y++)
	{
		const BYTE* WINPR_RESTRICT srcLine =
		    &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstLine =
		    &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];

		int64_t x = 0;
		for (; x + loadPixels <= pixels; x += 4)
		{
			const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
			__m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
			const __m128i s0 = LOAD_SI128(src);
			const __m128i s1 = _mm_shuffle_epi8(s0, smask);

			/* Keep the destination's alpha bytes, replace everything else. */
			const __m128i s2 = LOAD_SI128(dst);
			const __m128i d0 = _mm_blendv_epi8(s1, s2, mask);
			STORE_SI128(dst, d0);
		}

		/* Scalar tail: copy the three colour bytes, skip the alpha byte. */
		for (; x < pixels; x++)
		{
			const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
			BYTE* dst = &dstLine[(x + nXDst) * dstByte];
			*dst++ = *src++;
			*dst++ = *src++;
			*dst++ = *src++;
		}
	}

	return PRIMITIVES_SUCCESS;
}
/* Copy a rectangle between two 4-byte-per-pixel images, replacing the first
 * three bytes of every destination pixel and leaving its fourth (alpha) byte
 * untouched. Rows are addressed as VMultiplier * (row + nY) * step + VOffset.
 * Always returns PRIMITIVES_SUCCESS.
 */
static inline pstatus_t sse_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
                                                     UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
                                                     UINT32 nHeight,
                                                     const BYTE* WINPR_RESTRICT pSrcData,
                                                     UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                     int64_t srcVMultiplier, int64_t srcVOffset,
                                                     int64_t dstVMultiplier, int64_t dstVOffset)
{
	const int64_t bytesPerPixel = 4;

	/* Bytes FF FF FF 00 per pixel: the blend takes the source where the byte is
	 * 0xFF and keeps the destination where it is 0x00. */
	const __m128i colourBytes = _mm_set1_epi32(0x00FFFFFF);

	/* Number of pixels handled 4 at a time; the rest goes through the tail loop. */
	const int64_t vectorEnd = nWidth - (nWidth % 4);

	for (int64_t row = 0; row < nHeight; row++)
	{
		const BYTE* WINPR_RESTRICT srcRow =
		    &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstRow =
		    &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];

		int64_t col = 0;
		while (col < vectorEnd)
		{
			const __m128i* in = (const __m128i*)&srcRow[(col + nXSrc) * bytesPerPixel];
			__m128i* out = (__m128i*)&dstRow[(col + nXDst) * bytesPerPixel];
			const __m128i fresh = LOAD_SI128(in);
			const __m128i current = LOAD_SI128(out);
			STORE_SI128(out, _mm_blendv_epi8(current, fresh, colourBytes));
			col += 4;
		}

		while (col < nWidth)
		{
			const BYTE* in = &srcRow[(col + nXSrc) * bytesPerPixel];
			BYTE* out = &dstRow[(col + nXDst) * bytesPerPixel];
			out[0] = in[0];
			out[1] = in[1];
			out[2] = in[2];
			col++;
		}
	}

	return PRIMITIVES_SUCCESS;
}
/* Alpha-preserving copy dispatcher (SSE4.1).
 *
 * Handles three format combinations with vector code:
 *   BGR24            -> BGRX32/BGRA32
 *   BGRX32/BGRA32    -> BGRX32/BGRA32
 *   RGBX32/RGBA32    -> RGBX32/RGBA32
 * Every other combination is forwarded to the generic copy_no_overlap
 * primitive with the caller's original arguments and flags.
 */
static pstatus_t sse_image_copy_no_overlap_dst_alpha(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier,
    int64_t dstVOffset)
{
	WINPR_ASSERT(pDstData);
	WINPR_ASSERT(pSrcData);

	const BOOL dstIsBgr32 =
	    (DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32);
	const BOOL dstIsRgb32 =
	    (DstFormat == PIXEL_FORMAT_RGBX32) || (DstFormat == PIXEL_FORMAT_RGBA32);
	const BOOL srcIsBgr32 =
	    (SrcFormat == PIXEL_FORMAT_BGRX32) || (SrcFormat == PIXEL_FORMAT_BGRA32);
	const BOOL srcIsRgb32 =
	    (SrcFormat == PIXEL_FORMAT_RGBX32) || (SrcFormat == PIXEL_FORMAT_RGBA32);

	if ((SrcFormat == PIXEL_FORMAT_BGR24) && dstIsBgr32)
	{
		return sse_image_copy_bgr24_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
		                                   pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
		                                   srcVOffset, dstVMultiplier, dstVOffset);
	}

	/* Same channel order on both sides: the 32 -> 32 bit routine only cares
	 * about byte positions, so it serves both the BGR and the RGB family. */
	if ((srcIsBgr32 && dstIsBgr32) || (srcIsRgb32 && dstIsRgb32))
	{
		return sse_image_copy_bgrx32_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
		                                    pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
		                                    srcVOffset, dstVMultiplier, dstVOffset);
	}

	primitives_t* fallback = primitives_get_generic();
	return fallback->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth,
	                                 nHeight, pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc,
	                                 palette, flags);
}
/* SSE4.1 entry point for the copy_no_overlap primitive.
 *
 * Returns PRIMITIVES_SUCCESS immediately for an empty rectangle and -1 for
 * dimensions above INT32_MAX or a NULL image pointer. A stride of 0 is replaced
 * by nWidth * bytes-per-pixel of the respective format. Then:
 *   - FREERDP_KEEP_DST_ALPHA set and DstFormat has alpha: alpha-preserving path
 *   - formats equal ignoring alpha: generic_image_copy_no_overlap_memcpy
 *   - otherwise: generic copy_no_overlap primitive
 */
static pstatus_t sse_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
                                           UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
                                           UINT32 nWidth, UINT32 nHeight,
                                           const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
                                           UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                           const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
{
	if (!nWidth || !nHeight)
		return PRIMITIVES_SUCCESS;
	if ((nWidth > INT32_MAX) || (nHeight > INT32_MAX))
		return -1;
	if (!pSrcData || !pDstData)
		return -1;

	if (!nDstStep)
		nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
	if (!nSrcStep)
		nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);

	/* With a vertical flip the source is walked with a negative row multiplier,
	 * offset by (nHeight - 1) rows. */
	const BOOL flip = (flags & FREERDP_FLIP_VERTICAL) != 0;
	const int64_t srcVMultiplier = flip ? -1 : 1;
	const int64_t srcVOffset = flip ? (nHeight - 1ll) * nSrcStep : 0;
	const int64_t dstVMultiplier = 1;
	const int64_t dstVOffset = 0;

	const BOOL keepAlpha = (flags & FREERDP_KEEP_DST_ALPHA) != 0;
	if (keepAlpha && FreeRDPColorHasAlpha(DstFormat))
	{
		return sse_image_copy_no_overlap_dst_alpha(
		    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat,
		    nSrcStep, nXSrc, nYSrc, palette, flags, srcVMultiplier, srcVOffset, dstVMultiplier,
		    dstVOffset);
	}

	if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
	{
		return generic_image_copy_no_overlap_memcpy(
		    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat,
		    nSrcStep, nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier,
		    dstVOffset, flags);
	}

	primitives_t* fallback = primitives_get_generic();
	return fallback->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth,
	                                 nHeight, pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc,
	                                 palette, flags);
}
#endif
/* ------------------------------------------------------------------------- */
/* Install the SSE4.1 copy_no_overlap implementation into `prims` when the SIMD
 * intrinsics are compiled in; otherwise only log and leave `prims` untouched. */
void primitives_init_copy_sse41_int(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(SSE_AVX_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE4.1 intrinsics not available");
	WINPR_UNUSED(prims);
#else
	WLog_VRB(PRIM_TAG, "SSE4.1 optimizations");
	prims->copy_no_overlap = sse_image_copy_no_overlap;
#endif
}

View File

@@ -0,0 +1,235 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized routines to set a chunk of memory to a constant.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/
#include <freerdp/config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_internal.h"
#include "prim_avxsse.h"
#include "prim_set.h"
/* ========================================================================= */
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
static primitives_t* generic = nullptr;
/* Fill `ulen` bytes at `pDst` with `val`.
 *
 * Buffers shorter than 16 bytes are delegated to the generic set_8u primitive.
 * Otherwise bytes are written one at a time until the pointer is 16-byte
 * aligned, then in 256-byte blocks, then in 16-byte registers, then the
 * remaining bytes individually. Returns PRIMITIVES_SUCCESS on this path.
 */
static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 ulen)
{
	if (ulen < 16)
		return generic->set_8u(val, pDst, ulen);

	size_t remaining = ulen;
	BYTE* out = pDst;

	/* Scalar prologue up to the next 16-byte boundary. */
	while (((ULONG_PTR)out & 0x0f) != 0)
	{
		*out++ = val;
		remaining--;
		if (remaining == 0)
			return PRIMITIVES_SUCCESS;
	}

	const __m128i fill = mm_set1_epu8(val);

	/* 256-byte blocks: sixteen 16-byte stores each. */
	for (size_t blocks = remaining >> 8; blocks > 0; blocks--)
	{
		for (int reg = 0; reg < 16; reg++)
		{
			STORE_SI128(out, fill);
			out += 16;
		}
	}
	remaining &= 0xff;

	/* Single 16-byte stores. */
	for (size_t regs = remaining >> 4; regs > 0; regs--)
	{
		STORE_SI128(out, fill);
		out += 16;
	}
	remaining &= 0x0f;

	/* Trailing bytes. */
	for (; remaining > 0; remaining--)
		*out++ = val;

	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Fill `ulen` 32-bit elements at `pDst` with `val`.
 *
 * Fewer than 32 elements are written with a plain loop. A destination that is
 * not 4-byte aligned is delegated to the generic set_32u primitive. Otherwise
 * elements are written one at a time until the pointer is 16-byte aligned,
 * then in 64-element (256-byte) blocks, then in 4-element registers, then the
 * remaining elements individually. Returns PRIMITIVES_SUCCESS on these paths.
 */
static pstatus_t sse2_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 ulen)
{
	/* Looked up before any early return, as the original code did. */
	const primitives_t* fallback = primitives_get_generic();
	size_t remaining = ulen;
	UINT32* out = pDst;

	if (remaining < 32)
	{
		for (; remaining > 0; remaining--)
			*out++ = val;
		return PRIMITIVES_SUCCESS;
	}

	/* Stepping by 4 bytes can only reach a 16-byte boundary from a 4-byte
	 * aligned start. */
	if (((ULONG_PTR)out & 0x03) != 0)
		return fallback->set_32u(val, pDst, ulen);

	/* Scalar prologue up to the next 16-byte boundary. */
	while (((ULONG_PTR)out & 0x0f) != 0)
	{
		*out++ = val;
		remaining--;
		if (remaining == 0)
			return PRIMITIVES_SUCCESS;
	}

	const __m128i fill = mm_set1_epu32(val);

	/* 64-element blocks: sixteen 4-element stores each. */
	for (size_t blocks = remaining >> 6; blocks > 0; blocks--)
	{
		for (int reg = 0; reg < 16; reg++)
		{
			STORE_SI128(out, fill);
			out += 4;
		}
	}
	remaining &= 0x3f;

	/* Single 4-element stores. */
	for (size_t regs = remaining >> 2; regs > 0; regs--)
	{
		STORE_SI128(out, fill);
		out += 4;
	}
	remaining &= 0x03;

	/* Trailing elements. */
	for (; remaining > 0; remaining--)
		*out++ = val;

	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Signed variant of sse2_set_32u: the value and the destination are handed to
 * the unsigned routine with the same 32-bit pattern. */
static pstatus_t sse2_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len)
{
	UINT32* dst = (UINT32*)pDst;
	const UINT32 bits = (UINT32)val;
	return sse2_set_32u(bits, dst, len);
}
#endif
/* ------------------------------------------------------------------------- */
/* Install the SSE2 set_8u/set_32s/set_32u implementations into `prims` and
 * cache the generic primitives used as fallback, when the SIMD intrinsics are
 * compiled in; otherwise only log and leave `prims` untouched. */
void primitives_init_set_sse2_int(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(SSE_AVX_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available");
	WINPR_UNUSED(prims);
#else
	generic = primitives_get_generic();

	/* Replace the generic routines with the tuned ones. */
	WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
	prims->set_8u = sse2_set_8u;
	prims->set_32s = sse2_set_32s;
	prims->set_32u = sse2_set_32u;
#endif
}

View File

@@ -0,0 +1,160 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Shift operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_shift.h"
#include "prim_internal.h"
#include "prim_templates.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>
static primitives_t* generic = nullptr;
/* ------------------------------------------------------------------------- */
/* Each SSE3_SCD_ROUTINE instantiation below (macro from prim_templates.h)
 * defines a static function
 *     pstatus_t name(const T* pSrc, UINT32 val, T* pDst, UINT32 ulen)
 * Macro arguments, in order: function name, element type T, fallback
 * primitive, SSE intrinsic applied to each 128-bit register, the type `val`
 * is cast to before being handed to that intrinsic, and the scalar statement
 * used for single elements.
 * The generated function returns PRIMITIVES_SUCCESS without doing anything
 * when val == 0 and returns -1 when val >= 16.
 */
/* Left shift of signed 16-bit elements. */
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16, int16_t,
*dptr++ = (INT16)(((UINT16)*sptr++ << val) & 0xFFFF))
/* ------------------------------------------------------------------------- */
/* Right shift of signed 16-bit elements (_mm_srai_epi16: arithmetic shift). */
SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s, _mm_srai_epi16, int16_t,
*dptr++ = *sptr++ >> val)
/* ------------------------------------------------------------------------- */
/* Left shift of unsigned 16-bit elements. */
SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16, int16_t,
*dptr++ = (((UINT16)*sptr++ << val) & 0xFFFF))
/* ------------------------------------------------------------------------- */
/* Right shift of unsigned 16-bit elements (_mm_srli_epi16: logical shift). */
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16, int16_t,
*dptr++ = *sptr++ >> val)
/* Shift `ulen` signed 16-bit elements at `pSrcDst` left by `val` bits, in place.
 *
 * Returns PRIMITIVES_SUCCESS immediately when val == 0 and -1 when val >= 16.
 * Buffers shorter than 16 elements, and buffers that are not 2-byte aligned,
 * are delegated to the generic lShiftC_16s_inplace primitive. Otherwise a
 * scalar prologue advances to the next 16-byte boundary, the bulk is processed
 * 64 elements (8 registers) and then 8 elements (1 register) at a time, and
 * any remaining elements go through the generic primitive again.
 */
static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 ulen)
{
	size_t len = ulen;
	const INT32 shifts = 2;

	if (val == 0)
		return PRIMITIVES_SUCCESS;
	if (val >= 16)
		return -1;
	if (len < 16) /* pointless if too small */
		return generic->lShiftC_16s_inplace(pSrcDst, val, ulen);

	UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
	if ((ULONG_PTR)pSrcDst & offBeatMask)
	{
		/* Stepping by whole INT16 elements from an odd address can never land
		 * on a 16-byte boundary. */
		return generic->lShiftC_16s_inplace(pSrcDst, val, ulen);
	}

	/* Get to the 16-byte boundary now. `rem` is the number of INT16 elements
	 * the pointer sits past the previous boundary (0..7); a 16-byte register
	 * holds 8 of them, so 8 - rem elements reach the next boundary. len >= 16
	 * here, so the subtraction below cannot underflow. */
	const UINT32 rem = ((UINT_PTR)pSrcDst & 0x0f) / sizeof(INT16);
	if (rem > 0)
	{
		const UINT32 add = 8 - rem;
		pstatus_t status = generic->lShiftC_16s_inplace(pSrcDst, val, add);
		if (status != PRIMITIVES_SUCCESS)
			return status;
		pSrcDst += add;
		len -= add;
	}

	/* Use 8 128-bit SSE registers. */
	size_t count = len >> (8 - shifts);
	len -= count << (8 - shifts);

	while (count--)
	{
		const __m128i* src = (const __m128i*)pSrcDst;

		__m128i xmm0 = LOAD_SI128(src++);
		__m128i xmm1 = LOAD_SI128(src++);
		__m128i xmm2 = LOAD_SI128(src++);
		__m128i xmm3 = LOAD_SI128(src++);
		__m128i xmm4 = LOAD_SI128(src++);
		__m128i xmm5 = LOAD_SI128(src++);
		__m128i xmm6 = LOAD_SI128(src++);
		__m128i xmm7 = LOAD_SI128(src);

		xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);
		xmm1 = _mm_slli_epi16(xmm1, (int16_t)val);
		xmm2 = _mm_slli_epi16(xmm2, (int16_t)val);
		xmm3 = _mm_slli_epi16(xmm3, (int16_t)val);
		xmm4 = _mm_slli_epi16(xmm4, (int16_t)val);
		xmm5 = _mm_slli_epi16(xmm5, (int16_t)val);
		xmm6 = _mm_slli_epi16(xmm6, (int16_t)val);
		xmm7 = _mm_slli_epi16(xmm7, (int16_t)val);

		__m128i* dst = (__m128i*)pSrcDst;

		STORE_SI128(dst++, xmm0);
		STORE_SI128(dst++, xmm1);
		STORE_SI128(dst++, xmm2);
		STORE_SI128(dst++, xmm3);
		STORE_SI128(dst++, xmm4);
		STORE_SI128(dst++, xmm5);
		STORE_SI128(dst++, xmm6);
		STORE_SI128(dst++, xmm7);

		pSrcDst = (INT16*)dst;
	}

	/* Use a single 128-bit SSE register. */
	count = len >> (5 - shifts);
	len -= count << (5 - shifts);

	while (count--)
	{
		const __m128i* src = (const __m128i*)pSrcDst;
		__m128i xmm0 = LOAD_SI128(src);

		xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);

		__m128i* dst = (__m128i*)pSrcDst;
		STORE_SI128(dst++, xmm0);
		pSrcDst = (INT16*)dst;
	}

	/* Finish off the remainder. */
	if (len > 0)
		return generic->lShiftC_16s_inplace(pSrcDst, val, WINPR_ASSERTING_INT_CAST(uint32_t, len));

	return PRIMITIVES_SUCCESS;
}
#endif
/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
* depending on the sign of val. To avoid using the deprecated inplace
* routines, a wrapper can use the src for the dest.
*/
/* ------------------------------------------------------------------------- */
/* Install the SSE 16-bit shift implementations into `prims` and cache the
 * generic primitives used as fallback, when the SIMD intrinsics are compiled
 * in; otherwise only log and leave `prims` untouched. */
void primitives_init_shift_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(SSE_AVX_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
	WINPR_UNUSED(prims);
#else
	generic = primitives_get_generic();

	WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
	prims->lShiftC_16s_inplace = sse2_lShiftC_16s_inplace;
	prims->lShiftC_16s = sse2_lShiftC_16s;
	prims->rShiftC_16s = sse2_rShiftC_16s;
	prims->lShiftC_16u = sse2_lShiftC_16u;
	prims->rShiftC_16u = sse2_rShiftC_16u;
#endif
}

View File

@@ -0,0 +1,188 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized sign operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_sign.h"
#include "prim_internal.h"
#include "prim_avxsse.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <tmmintrin.h>
static primitives_t* generic = nullptr;
/* ------------------------------------------------------------------------- */
/* Write the sign of each of the `ulen` signed 16-bit elements of `pSrc` to
 * `pDst`: -1 for negative, 0 for zero, 1 for positive.
 *
 * Buffers shorter than 16 elements, and destinations that are not 2-byte
 * aligned, are delegated to the generic sign_16s primitive. Otherwise a scalar
 * prologue advances the destination to a 16-byte boundary, the bulk is
 * processed 32 and then 8 elements at a time with _mm_sign_epi16, and the
 * remaining elements are handled one by one. Returns PRIMITIVES_SUCCESS on
 * this path.
 *
 * The source is read with LOAD_SI128 regardless of its alignment; the former
 * separate "aligned" and "unaligned" loops were identical and are merged.
 */
static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
                                UINT32 ulen)
{
	size_t len = ulen;
	const INT16* sptr = pSrc;
	INT16* dptr = pDst;

	if (len < 16)
		return generic->sign_16s(pSrc, pDst, ulen);

	/* Stepping by whole INT16 elements from an odd address can never reach a
	 * 16-byte boundary. */
	if ((ULONG_PTR)pDst & 0x01)
		return generic->sign_16s(pSrc, pDst, ulen);

	/* Seek 16-byte alignment of the destination. */
	while ((ULONG_PTR)dptr & 0x0f)
	{
		INT16 src = *sptr++;
		*dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? (-1) : ((src > 0) ? 1 : 0));

		if (--len == 0)
			return PRIMITIVES_SUCCESS;
	}

	/* _mm_sign_epi16(ones, x) yields -1, 0 or 1 per lane according to the
	 * sign of x. */
	const __m128i ones = _mm_set1_epi16(0x0001);

	/* Do 32-short chunks using four source registers. */
	size_t count = len >> 5; /* / 32 */
	len -= count << 5;       /* * 32 */

	while (count--)
	{
		const __m128i in0 = LOAD_SI128(sptr);
		sptr += 8;
		const __m128i in1 = LOAD_SI128(sptr);
		sptr += 8;
		const __m128i in2 = LOAD_SI128(sptr);
		sptr += 8;
		const __m128i in3 = LOAD_SI128(sptr);
		sptr += 8;

		STORE_SI128(dptr, _mm_sign_epi16(ones, in0));
		dptr += 8;
		STORE_SI128(dptr, _mm_sign_epi16(ones, in1));
		dptr += 8;
		STORE_SI128(dptr, _mm_sign_epi16(ones, in2));
		dptr += 8;
		STORE_SI128(dptr, _mm_sign_epi16(ones, in3));
		dptr += 8;
	}

	/* Do 8-short chunks using one source register. */
	count = len >> 3;
	len -= count << 3;

	while (count--)
	{
		const __m128i in = LOAD_SI128(sptr);
		sptr += 8;
		STORE_SI128(dptr, _mm_sign_epi16(ones, in));
		dptr += 8;
	}

	/* Do leftovers. */
	while (len--)
	{
		INT16 src = *sptr++;
		*dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? -1 : ((src > 0) ? 1 : 0));
	}

	return PRIMITIVES_SUCCESS;
}
#endif /* SSE_AVX_INTRINSICS_ENABLED */
/* ------------------------------------------------------------------------- */
/* Install the SSSE3 sign_16s implementation into `prims` and cache the generic
 * primitives used as fallback, when the SIMD intrinsics are compiled in;
 * otherwise only log and leave `prims` untouched. */
void primitives_init_sign_ssse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(SSE_AVX_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSSE3/SSE3 intrinsics not available");
	WINPR_UNUSED(prims);
#else
	generic = primitives_get_generic();

	/* Replace the generic routine with the tuned one. */
	WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
	prims->sign_16s = ssse3_sign_16s;
#endif
}

View File

@@ -0,0 +1,278 @@
/* prim_templates.h
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*/
#pragma once
#include "prim_avxsse.h"
/* These are prototypes for SSE (potentially NEON) routines that do a
* simple SSE operation over an array of data. Since so much of this
* code is shared except for the operation itself, these prototypes are
* used rather than duplicating code. The naming convention depends on
* the parameters: S=Source param; C=Constant; D=Destination.
* All the macros have parameters for a fallback procedure if the data
* is too small and an operation "the slow way" for use at 16-byte edges.
*/
/* SSE3 note: If someone needs to support an SSE2 version of these without
* SSE3 support, an alternative version could be added that merely checks
* that 16-byte alignment on both destination and source(s) can be
* achieved, rather than use LDDQU for unaligned reads.
*/
/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
* It can't easily do that if the value is stored in a variable.
* So don't save it as an intermediate value.
*/
/* ----------------------------------------------------------------------------
* SCD = Source, Constant, Destination
*/
/* Expands to a static function
 *   pstatus_t _name_(const _type_* pSrc, UINT32 val, _type_* pDst, UINT32 ulen)
 * that applies _op_(register, (_op_type_)val) to ulen elements read from pSrc
 * and stores the results to pDst.
 *
 * - val == 0 returns PRIMITIVES_SUCCESS without writing pDst; val >= 16
 *   returns -1.
 * - shifts encodes the element size (1, 2, 3, 4 for 1-, 2-, 4-, 8-byte types)
 *   so that len >> (8 - shifts) counts 128-byte chunks (8 registers) and
 *   len >> (5 - shifts) counts 16-byte chunks (1 register). For any other
 *   element size shifts stays 0 and these counts are wrong.
 * - Elements left over after the register loops are handled by expanding
 *   _slowWay_ once per element. The macro does not advance sptr/dptr in that
 *   loop, so _slowWay_ presumably has to do so itself - verify at the
 *   instantiation sites.
 * - _fallback_ is accepted but not referenced by the generated body.
 */
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _op_type_, _slowWay_) \
WINPR_ATTR_NODISCARD \
static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, UINT32 val, \
_type_* WINPR_RESTRICT pDst, UINT32 ulen) \
{ \
size_t len = ulen; \
INT32 shifts = 0; \
const _type_* sptr = pSrc; \
_type_* dptr = pDst; \
if (val == 0) \
return PRIMITIVES_SUCCESS; \
if (val >= 16) \
return -1; \
if (sizeof(_type_) == 1) \
shifts = 1; \
else if (sizeof(_type_) == 2) \
shifts = 2; \
else if (sizeof(_type_) == 4) \
shifts = 3; \
else if (sizeof(_type_) == 8) \
shifts = 4; \
/* Use 8 128-bit SSE registers. */ \
size_t count = len >> (8 - shifts); \
len -= count << (8 - shifts); \
\
while (count--) \
{ \
__m128i xmm0 = LOAD_SI128(sptr); \
sptr += (16 / sizeof(_type_)); \
__m128i xmm1 = LOAD_SI128(sptr); \
sptr += (16 / sizeof(_type_)); \
__m128i xmm2 = LOAD_SI128(sptr); \
sptr += (16 / sizeof(_type_)); \
__m128i xmm3 = LOAD_SI128(sptr); \
sptr += (16 / sizeof(_type_)); \
__m128i xmm4 = LOAD_SI128(sptr); \
sptr += (16 / sizeof(_type_)); \
__m128i xmm5 = LOAD_SI128(sptr); \
sptr += (16 / sizeof(_type_)); \
__m128i xmm6 = LOAD_SI128(sptr); \
sptr += (16 / sizeof(_type_)); \
__m128i xmm7 = LOAD_SI128(sptr); \
sptr += (16 / sizeof(_type_)); \
xmm0 = _op_(xmm0, (_op_type_)val); \
xmm1 = _op_(xmm1, (_op_type_)val); \
xmm2 = _op_(xmm2, (_op_type_)val); \
xmm3 = _op_(xmm3, (_op_type_)val); \
xmm4 = _op_(xmm4, (_op_type_)val); \
xmm5 = _op_(xmm5, (_op_type_)val); \
xmm6 = _op_(xmm6, (_op_type_)val); \
xmm7 = _op_(xmm7, (_op_type_)val); \
STORE_SI128(dptr, xmm0); \
dptr += (16 / sizeof(_type_)); \
STORE_SI128(dptr, xmm1); \
dptr += (16 / sizeof(_type_)); \
STORE_SI128(dptr, xmm2); \
dptr += (16 / sizeof(_type_)); \
STORE_SI128(dptr, xmm3); \
dptr += (16 / sizeof(_type_)); \
STORE_SI128(dptr, xmm4); \
dptr += (16 / sizeof(_type_)); \
STORE_SI128(dptr, xmm5); \
dptr += (16 / sizeof(_type_)); \
STORE_SI128(dptr, xmm6); \
dptr += (16 / sizeof(_type_)); \
STORE_SI128(dptr, xmm7); \
dptr += (16 / sizeof(_type_)); \
} \
\
/* Use a single 128-bit SSE register. */ \
count = len >> (5 - shifts); \
len -= count << (5 - shifts); \
while (count--) \
{ \
__m128i xmm0 = LOAD_SI128(sptr); \
sptr += (16 / sizeof(_type_)); \
xmm0 = _op_(xmm0, (_op_type_)val); \
STORE_SI128(dptr, xmm0); \
dptr += (16 / sizeof(_type_)); \
} \
/* Finish off the remainder. */ \
while (len--) \
{ \
_slowWay_; \
} \
return PRIMITIVES_SUCCESS; \
}
/* ----------------------------------------------------------------------------
* SCD = Source, Constant, Destination
* PRE = preload xmm0 with the constant.
*/
/* Expands to a static function
 *   pstatus_t _name_(const _type_* pSrc, _type_ val, _type_* pDst, INT32 ilen)
 * that computes _op_(source register, xmm0) for ilen elements, where xmm0 is
 * val broadcast once up front with mm_set1_epu32().
 *
 * - ilen is converted to size_t through WINPR_ASSERTING_INT_CAST.
 * - shifts encodes the element size (1, 2, 3, 4 for 1-, 2-, 4-, 8-byte types)
 *   so that len >> (7 - shifts) counts 64-byte chunks (4 registers) and
 *   len >> (5 - shifts) counts 16-byte chunks (1 register).
 * - Leftover elements are handled by expanding _slowWay_ once per element;
 *   the macro itself does not advance sptr/dptr in that loop.
 * - NOTE(review): the constant is always broadcast with mm_set1_epu32()
 *   regardless of _type_, so this looks intended for 32-bit element types
 *   only - confirm before instantiating with another width.
 * - _fallback_ is accepted but not referenced by the generated body.
 */
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
WINPR_ATTR_NODISCARD \
static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, _type_ val, \
_type_* WINPR_RESTRICT pDst, INT32 ilen) \
{ \
size_t len = WINPR_ASSERTING_INT_CAST(size_t, ilen); \
int shifts = 0; \
const _type_* sptr = pSrc; \
_type_* dptr = pDst; \
__m128i xmm0; \
if (sizeof(_type_) == 1) \
shifts = 1; \
else if (sizeof(_type_) == 2) \
shifts = 2; \
else if (sizeof(_type_) == 4) \
shifts = 3; \
else if (sizeof(_type_) == 8) \
shifts = 4; \
/* Use 4 128-bit SSE registers. */ \
size_t count = len >> (7 - shifts); \
len -= count << (7 - shifts); \
xmm0 = mm_set1_epu32(val); \
for (size_t x = 0; x < count; x++) \
{ \
__m128i xmm1 = LOAD_SI128(sptr); \
sptr += (16 / sizeof(_type_)); \
__m128i xmm2 = LOAD_SI128(sptr); \
sptr += (16 / sizeof(_type_)); \
__m128i xmm3 = LOAD_SI128(sptr); \
sptr += (16 / sizeof(_type_)); \
__m128i xmm4 = LOAD_SI128(sptr); \
sptr += (16 / sizeof(_type_)); \
xmm1 = _op_(xmm1, xmm0); \
xmm2 = _op_(xmm2, xmm0); \
xmm3 = _op_(xmm3, xmm0); \
xmm4 = _op_(xmm4, xmm0); \
STORE_SI128(dptr, xmm1); \
dptr += (16 / sizeof(_type_)); \
STORE_SI128(dptr, xmm2); \
dptr += (16 / sizeof(_type_)); \
STORE_SI128(dptr, xmm3); \
dptr += (16 / sizeof(_type_)); \
STORE_SI128(dptr, xmm4); \
dptr += (16 / sizeof(_type_)); \
} \
/* Use a single 128-bit SSE register. */ \
count = len >> (5 - shifts); \
len -= count << (5 - shifts); \
for (size_t x = 0; x < count; x++) \
{ \
__m128i xmm1 = LOAD_SI128(sptr); \
sptr += (16 / sizeof(_type_)); \
xmm1 = _op_(xmm1, xmm0); \
STORE_SI128(dptr, xmm1); \
dptr += (16 / sizeof(_type_)); \
} \
/* Finish off the remainder. */ \
for (size_t x = 0; x < len; x++) \
{ \
_slowWay_; \
} \
return PRIMITIVES_SUCCESS; \
}
/* ----------------------------------------------------------------------------
* SSD = Source1, Source2, Destination
*/
/* Expands to a static function
 *   pstatus_t _name_(const _type_* pSrc1, const _type_* pSrc2, _type_* pDst,
 *                    UINT32 ulen)
 * that computes _op_(register from pSrc1, register from pSrc2) for ulen
 * elements and stores the results to pDst.
 *
 * - shifts encodes the element size (1, 2, 3, 4 for 1-, 2-, 4-, 8-byte types)
 *   so that len >> (7 - shifts) counts 64-byte chunks (4 registers per
 *   source) and len >> (5 - shifts) counts 16-byte chunks (1 register).
 * - Leftover elements are handled by _slowWay_, which is used here as an
 *   expression yielding a pstatus_t; the first value that is not
 *   PRIMITIVES_SUCCESS is returned to the caller. The macro itself does not
 *   advance sptr1/sptr2/dptr in that loop.
 * - _fallback_ is accepted but not referenced by the generated body.
 */
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
WINPR_ATTR_NODISCARD \
static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc1, \
const _type_* WINPR_RESTRICT pSrc2, _type_* WINPR_RESTRICT pDst, \
UINT32 ulen) \
{ \
size_t len = ulen; \
int shifts = 0; \
const _type_* sptr1 = pSrc1; \
const _type_* sptr2 = pSrc2; \
_type_* dptr = pDst; \
size_t count; \
if (sizeof(_type_) == 1) \
shifts = 1; \
else if (sizeof(_type_) == 2) \
shifts = 2; \
else if (sizeof(_type_) == 4) \
shifts = 3; \
else if (sizeof(_type_) == 8) \
shifts = 4; \
/* Use 4 128-bit SSE registers. */ \
count = len >> (7 - shifts); \
len -= count << (7 - shifts); \
/* Aligned loads */ \
while (count--) \
{ \
__m128i xmm0 = LOAD_SI128(sptr1); \
sptr1 += (16 / sizeof(_type_)); \
__m128i xmm1 = LOAD_SI128(sptr1); \
sptr1 += (16 / sizeof(_type_)); \
__m128i xmm2 = LOAD_SI128(sptr1); \
sptr1 += (16 / sizeof(_type_)); \
__m128i xmm3 = LOAD_SI128(sptr1); \
sptr1 += (16 / sizeof(_type_)); \
__m128i xmm4 = LOAD_SI128(sptr2); \
sptr2 += (16 / sizeof(_type_)); \
__m128i xmm5 = LOAD_SI128(sptr2); \
sptr2 += (16 / sizeof(_type_)); \
__m128i xmm6 = LOAD_SI128(sptr2); \
sptr2 += (16 / sizeof(_type_)); \
__m128i xmm7 = LOAD_SI128(sptr2); \
sptr2 += (16 / sizeof(_type_)); \
xmm0 = _op_(xmm0, xmm4); \
xmm1 = _op_(xmm1, xmm5); \
xmm2 = _op_(xmm2, xmm6); \
xmm3 = _op_(xmm3, xmm7); \
STORE_SI128(dptr, xmm0); \
dptr += (16 / sizeof(_type_)); \
STORE_SI128(dptr, xmm1); \
dptr += (16 / sizeof(_type_)); \
STORE_SI128(dptr, xmm2); \
dptr += (16 / sizeof(_type_)); \
STORE_SI128(dptr, xmm3); \
dptr += (16 / sizeof(_type_)); \
} \
/* Use a single 128-bit SSE register. */ \
count = len >> (5 - shifts); \
len -= count << (5 - shifts); \
while (count--) \
{ \
__m128i xmm0 = LOAD_SI128(sptr1); \
sptr1 += (16 / sizeof(_type_)); \
__m128i xmm1 = LOAD_SI128(sptr2); \
sptr2 += (16 / sizeof(_type_)); \
xmm0 = _op_(xmm0, xmm1); \
STORE_SI128(dptr, xmm0); \
dptr += (16 / sizeof(_type_)); \
} \
/* Finish off the remainder. */ \
while (len--) \
{ \
const pstatus_t rc = _slowWay_; \
if (rc != PRIMITIVES_SUCCESS) \
return rc; \
} \
return PRIMITIVES_SUCCESS; \
}

View File

@@ -0,0 +1,39 @@
# Builds the TestPrimitives driver executable and registers one CTest case per
# test source listed in ${MODULE_PREFIX}_TESTS.
set(MODULE_NAME "TestPrimitives")
set(MODULE_PREFIX "TEST_FREERDP_PRIMITIVES")

disable_warnings_for_directory(${CMAKE_CURRENT_BINARY_DIR})

set(${MODULE_PREFIX}_DRIVER ${MODULE_NAME}.c)

# Each entry is compiled into the driver and is also the name of a CTest case
# (file name without extension).
set(${MODULE_PREFIX}_TESTS
    TestPrimitivesAdd.c
    TestPrimitivesAlphaComp.c
    TestPrimitivesAndOr.c
    TestPrimitivesColors.c
    TestPrimitivesCopy.c
    TestPrimitivesSet.c
    TestPrimitivesShift.c
    TestPrimitivesSign.c
    TestPrimitivesYUV.c
    TestPrimitivesYCbCr.c
    TestPrimitivesYCoCg.c
)

# Generates the driver source that dispatches on the test name given on the
# command line.
create_test_sourcelist(${MODULE_PREFIX}_SRCS ${${MODULE_PREFIX}_DRIVER} ${${MODULE_PREFIX}_TESTS})

# Shared helpers that are not tests themselves.
set(${MODULE_PREFIX}_EXTRA_SRCS prim_test.c prim_test.h measure.h)

add_executable(${MODULE_NAME} ${${MODULE_PREFIX}_SRCS} ${${MODULE_PREFIX}_EXTRA_SRCS})

set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} winpr freerdp)
target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS})

set_target_properties(${MODULE_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${TESTING_OUTPUT_DIRECTORY}")

foreach(test ${${MODULE_PREFIX}_TESTS})
  get_filename_component(TestName ${test} NAME_WE)
  # Naming the target in COMMAND lets CMake substitute the real location of the
  # built executable (per-configuration directory, platform suffix) instead of
  # relying on a hand-assembled path.
  add_test(NAME ${TestName} COMMAND ${MODULE_NAME} ${TestName})
endforeach()

set_property(TARGET ${MODULE_NAME} PROPERTY FOLDER "FreeRDP/Test")

View File

@@ -0,0 +1,80 @@
/* test_add.c
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <winpr/sysinfo.h>
#include "prim_test.h"
#define FUNC_TEST_SIZE 65536
/* ========================================================================= */
static BOOL test_add16s_func(void)
{
pstatus_t status = 0;
INT16 src1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
INT16 src2[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
INT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
INT16 d2[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
if (winpr_RAND(src1, sizeof(src1)) < 0)
return FALSE;
if (winpr_RAND(src2, sizeof(src2)) < 0)
return FALSE;
status = generic->add_16s(src1 + 1, src2 + 1, d1 + 1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
/* Unaligned */
status = optimized->add_16s(src1 + 1, src2 + 1, d2 + 2, FUNC_TEST_SIZE);
return (status == PRIMITIVES_SUCCESS);
}
/* ------------------------------------------------------------------------- */
static BOOL test_add16s_speed(void)
{
BYTE src1[MAX_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
BYTE src2[MAX_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
BYTE dst[MAX_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
if (!g_TestPrimitivesPerformance)
return TRUE;
if (winpr_RAND(src1, sizeof(src1)) < 0)
return FALSE;
if (winpr_RAND(src2, sizeof(src2)) < 0)
return FALSE;
return (speed_test("add16s", "aligned", g_Iterations, (speed_test_fkt)generic->add_16s,
(speed_test_fkt)optimized->add_16s, src1, src2, dst, FUNC_TEST_SIZE));
}
/* CTest entry point for the add primitives: 0 on success, -1 on failure. */
int TestPrimitivesAdd(int argc, char* argv[])
{
	WINPR_UNUSED(argc);
	WINPR_UNUSED(argv);
	prim_test_setup(FALSE);

	if (!test_add16s_func())
		return -1;

	/* The benchmark only runs when performance testing is enabled. */
	if (g_TestPrimitivesPerformance && !test_add16s_speed())
		return -1;

	return 0;
}

View File

@@ -0,0 +1,203 @@
/* test_alphaComp.c
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <winpr/sysinfo.h>
#include "prim_test.h"
#define MAX_BLOCK_SIZE 256
#define SIZE_SQUARED (MAX_BLOCK_SIZE * MAX_BLOCK_SIZE)
/* ========================================================================= */
#define ALF(_c_) (((_c_)&0xFF000000U) >> 24)
#define RED(_c_) (((_c_)&0x00FF0000U) >> 16)
#define GRN(_c_) (((_c_)&0x0000FF00U) >> 8)
#define BLU(_c_) ((_c_)&0x000000FFU)
#define TOLERANCE 1
/* Returns the address of the 32-bit pixel at column _x_, row _y_ in a buffer
 * starting at _addr_ whose rows are _bytes_ bytes apart. Offsets are computed
 * in 64-bit arithmetic.
 */
static inline const UINT32* PIXEL(const BYTE* _addr_, UINT32 _bytes_, UINT32 _x_, UINT32 _y_)
{
	const unsigned long long columnOffset = 1ULL * _x_ * sizeof(UINT32);
	const unsigned long long rowOffset = 1ULL * _y_ * _bytes_;
	return (const UINT32*)(_addr_ + columnOffset + rowOffset);
}
#define SRC1_WIDTH 6
#define SRC1_HEIGHT 6
#define SRC2_WIDTH 7
#define SRC2_HEIGHT 7
#define DST_WIDTH 9
#define DST_HEIGHT 9
#define TEST_WIDTH 4
#define TEST_HEIGHT 5
/* ------------------------------------------------------------------------- */
/* Reference blend used to validate alphaComp_argb: every channel of the
 * result (alpha included) is (a1 * ch1 + (255 - a1) * ch2) / 255, where a1 is
 * the alpha of c1. Channels are repacked as alpha, red, green, blue from the
 * most significant byte down.
 */
static UINT32 alpha_add(UINT32 c1, UINT32 c2)
{
	const UINT32 weight = ALF(c1);
	const UINT32 inverse = 255 - weight;

	const UINT32 alpha = ((weight * ALF(c1) + inverse * ALF(c2)) / 255) & 0xff;
	const UINT32 red = ((weight * RED(c1) + inverse * RED(c2)) / 255) & 0xff;
	const UINT32 green = ((weight * GRN(c1) + inverse * GRN(c2)) / 255) & 0xff;
	const UINT32 blue = ((weight * BLU(c1) + inverse * BLU(c2)) / 255) & 0xff;

	return (alpha << 24) | (red << 16) | (green << 8) | blue;
}
/* ------------------------------------------------------------------------- */
/* Returns the largest absolute per-channel difference (alpha, red, green,
 * blue) between two packed 32-bit colors.
 */
static UINT32 colordist(UINT32 c1, UINT32 c2)
{
	const int deltas[4] = { ABS((INT32)(ALF(c1) - ALF(c2))), ABS((INT32)(RED(c1) - RED(c2))),
		                    ABS((INT32)(GRN(c1) - GRN(c2))), ABS((INT32)(BLU(c1) - BLU(c2))) };
	int worst = 0;

	for (size_t i = 0; i < sizeof(deltas) / sizeof(deltas[0]); ++i)
	{
		if (deltas[i] > worst)
			worst = deltas[i];
	}
	return worst;
}
/* ------------------------------------------------------------------------- */
/* Verifies a width x height region of pDst against the alpha_add() reference
 * blend of pSrc1 and pSrc2. A pixel passes when colordist() to the reference
 * is at most TOLERANCE; the first failing pixel is reported and FALSE is
 * returned.
 */
static BOOL check(const BYTE* pSrc1, UINT32 src1Step, const BYTE* pSrc2, UINT32 src2Step,
                  BYTE* pDst, UINT32 dstStep, UINT32 width, UINT32 height)
{
	for (UINT32 row = 0; row < height; ++row)
	{
		for (UINT32 col = 0; col < width; ++col)
		{
			const UINT32 first = *PIXEL(pSrc1, src1Step, col, row);
			const UINT32 second = *PIXEL(pSrc2, src2Step, col, row);
			const UINT32 expected = alpha_add(first, second);
			const UINT32 actual = *PIXEL(pDst, dstStep, col, row);

			if (colordist(expected, actual) <= TOLERANCE)
				continue;

			printf("alphaComp-general: [%" PRIu32 ",%" PRIu32 "] 0x%08" PRIx32 "+0x%08" PRIx32
			       "=0x%08" PRIx32 ", got 0x%08" PRIx32 "\n",
			       col, row, first, second, expected, actual);
			return FALSE;
		}
	}
	return TRUE;
}
/* Functional test for alphaComp_argb: blends a TEST_WIDTH x TEST_HEIGHT
 * region with the generic and the optimized implementation and validates each
 * result against the alpha_add() reference via check().
 */
static BOOL test_alphaComp_func(void)
{
	pstatus_t status = 0;
	BYTE src1[SRC1_WIDTH * SRC1_HEIGHT * 4] = WINPR_C_ARRAY_INIT;
	BYTE src2[SRC2_WIDTH * SRC2_HEIGHT * 4] = WINPR_C_ARRAY_INIT;
	BYTE dst1[DST_WIDTH * DST_HEIGHT * 4] = WINPR_C_ARRAY_INIT;
	BYTE dst2[DST_WIDTH * DST_HEIGHT * 4] = WINPR_C_ARRAY_INIT;
	UINT32* ptr = nullptr;

	if (winpr_RAND(src1, sizeof(src1)) < 0)
		return FALSE;

	/* Special-case the first two pixels: alpha 0 and alpha 0xFF. The masks
	 * are 32 bit wide, so they have to be applied to whole pixels; applied
	 * to single bytes they would change nothing. */
	ptr = (UINT32*)src1;
	ptr[0] &= 0x00FFFFFFU;
	ptr[1] |= 0xFF000000U;

	if (winpr_RAND(src2, sizeof(src2)) < 0)
		return FALSE;

	/* Set the second operand to fully-opaque. */
	ptr = (UINT32*)src2;
	for (UINT32 i = 0; i < sizeof(src2) / 4; ++i)
		*ptr++ |= 0xFF000000U;

	status = generic->alphaComp_argb(src1, 4 * SRC1_WIDTH, src2, 4 * SRC2_WIDTH, dst1,
	                                 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
	if (status != PRIMITIVES_SUCCESS)
		return FALSE;
	if (!check(src1, 4 * SRC1_WIDTH, src2, 4 * SRC2_WIDTH, dst1, 4 * DST_WIDTH, TEST_WIDTH,
	           TEST_HEIGHT))
		return FALSE;

	/* The optimized run gets its own zeroed destination so that it cannot
	 * pass merely because the generic result is still in the buffer. */
	status = optimized->alphaComp_argb((const BYTE*)src1, 4 * SRC1_WIDTH, (const BYTE*)src2,
	                                   4 * SRC2_WIDTH, (BYTE*)dst2, 4 * DST_WIDTH, TEST_WIDTH,
	                                   TEST_HEIGHT);
	if (status != PRIMITIVES_SUCCESS)
		return FALSE;
	if (!check(src1, 4 * SRC1_WIDTH, src2, 4 * SRC2_WIDTH, dst2, 4 * DST_WIDTH, TEST_WIDTH,
	           TEST_HEIGHT))
		return FALSE;

	return TRUE;
}
static int test_alphaComp_speed(void)
{
BYTE src1[SRC1_WIDTH * SRC1_HEIGHT] = WINPR_C_ARRAY_INIT;
BYTE src2[SRC2_WIDTH * SRC2_HEIGHT] = WINPR_C_ARRAY_INIT;
BYTE dst1[DST_WIDTH * DST_HEIGHT] = WINPR_C_ARRAY_INIT;
UINT32* ptr = nullptr;
if (winpr_RAND(src1, sizeof(src1)) < 0)
return -1;
/* Special-case the first two values */
src1[0] &= 0x00FFFFFFU;
src1[1] |= 0xFF000000U;
if (winpr_RAND(src2, sizeof(src2)) < 0)
return -1;
/* Set the second operand to fully-opaque. */
ptr = (UINT32*)src2;
for (UINT32 i = 0; i < sizeof(src2) / 4; ++i)
*ptr++ |= 0xFF000000U;
return (speed_test("add16s", "aligned", g_Iterations, (speed_test_fkt)generic->alphaComp_argb,
(speed_test_fkt)optimized->alphaComp_argb, src1, 4 * SRC1_WIDTH, src2,
4 * SRC2_WIDTH, dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT));
}
/* CTest entry point for the alpha composition primitive: 0 on success, -1 on
 * failure.
 */
int TestPrimitivesAlphaComp(int argc, char* argv[])
{
	WINPR_UNUSED(argc);
	WINPR_UNUSED(argv);
	prim_test_setup(FALSE);

	if (!test_alphaComp_func())
		return -1;

	/* The benchmark only runs when performance testing is enabled. */
	if (g_TestPrimitivesPerformance && !test_alphaComp_speed())
		return -1;

	return 0;
}

View File

@@ -0,0 +1,171 @@
/* test_andor.c
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <winpr/sysinfo.h>
#include "prim_test.h"
#define FUNC_TEST_SIZE 65536
#define VALUE (0xA5A5A5A5U)
/* ========================================================================= */
/* Runs one andC_32u implementation (fkt) over size elements and checks every
 * output element against src[i] & val. name only labels the failure message.
 */
static BOOL test_and_32u_impl(const char* name, fn_andC_32u_t fkt, const UINT32* src,
                              const UINT32 val, UINT32* dst, size_t size)
{
	const pstatus_t rc = fkt(src, val, dst, WINPR_ASSERTING_INT_CAST(int32_t, size));
	if (rc != PRIMITIVES_SUCCESS)
		return FALSE;

	for (size_t idx = 0; idx < size; ++idx)
	{
		const UINT32 expected = src[idx] & val;
		if (dst[idx] == expected)
			continue;

		printf("AND %s FAIL[%" PRIuz "] 0x%08" PRIx32 "&0x%08" PRIx32 "=0x%08" PRIx32
		       ", got 0x%08" PRIx32 "\n",
		       name, idx, src[idx], val, expected, dst[idx]);
		return FALSE;
	}
	return TRUE;
}
static BOOL test_and_32u_func(void)
{
UINT32 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
UINT32 dst[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
if (winpr_RAND(src, sizeof(src)) < 0)
return FALSE;
if (!test_and_32u_impl("generic->andC_32u aligned", generic->andC_32u, src + 1, VALUE, dst + 1,
FUNC_TEST_SIZE))
return FALSE;
if (!test_and_32u_impl("generic->andC_32u unaligned", generic->andC_32u, src + 1, VALUE,
dst + 2, FUNC_TEST_SIZE))
return FALSE;
if (!test_and_32u_impl("optimized->andC_32u aligned", optimized->andC_32u, src + 1, VALUE,
dst + 1, FUNC_TEST_SIZE))
return FALSE;
if (!test_and_32u_impl("optimized->andC_32u unaligned", optimized->andC_32u, src + 1, VALUE,
dst + 2, FUNC_TEST_SIZE))
return FALSE;
return TRUE;
}
/* ------------------------------------------------------------------------- */
/* Benchmarks generic vs. optimized andC_32u with two destination offsets. */
static BOOL test_and_32u_speed(void)
{
	UINT32 src[MAX_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
	UINT32 dst[MAX_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;

	if (winpr_RAND(src, sizeof(src)) < 0)
		return FALSE;

	/* The second benchmark only runs if the first one succeeded. */
	if (!speed_test("andC_32u", "aligned", g_Iterations, (speed_test_fkt)generic->andC_32u,
	                (speed_test_fkt)optimized->andC_32u, src + 1, VALUE, dst + 1,
	                MAX_TEST_SIZE) ||
	    !speed_test("andC_32u", "unaligned", g_Iterations, (speed_test_fkt)generic->andC_32u,
	                (speed_test_fkt)optimized->andC_32u, src + 1, VALUE, dst + 2,
	                MAX_TEST_SIZE))
		return FALSE;

	return TRUE;
}
/* ========================================================================= */
/* Verifies that dst[i] == (src[i] | value) for the first size elements.
 * Reports the first mismatch and returns FALSE; returns TRUE otherwise.
 * Used for both the generic and the optimized orC_32u results.
 */
static BOOL check(const UINT32* src, const UINT32* dst, UINT32 size, UINT32 value)
{
	for (UINT32 i = 0; i < size; ++i)
	{
		const UINT32 expected = src[i] | value;
		if (dst[i] != expected)
		{
			/* The operation under test is OR, so print '|'. */
			printf("OR FAIL[%" PRIu32 "] 0x%08" PRIx32 "|0x%08" PRIx32 "=0x%08" PRIx32
			       ", got 0x%08" PRIx32 "\n",
			       i, src[i], value, expected, dst[i]);
			return FALSE;
		}
	}
	return TRUE;
}
/* Functional test for orC_32u: runs the generic and then the optimized
 * implementation and validates each result with check().
 */
static BOOL test_or_32u_func(void)
{
	UINT32 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
	UINT32 dst[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;

	if (winpr_RAND(src, sizeof(src)) < 0)
		return FALSE;

	if (generic->orC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE) != PRIMITIVES_SUCCESS)
		return FALSE;
	if (!check(src + 1, dst + 1, FUNC_TEST_SIZE, VALUE))
		return FALSE;

	if (optimized->orC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE) != PRIMITIVES_SUCCESS)
		return FALSE;

	return check(src + 1, dst + 1, FUNC_TEST_SIZE, VALUE);
}
/* ------------------------------------------------------------------------- */
/* Benchmarks generic vs. optimized orC_32u over FUNC_TEST_SIZE elements. */
static BOOL test_or_32u_speed(void)
{
	UINT32 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
	UINT32 dst[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;

	if (winpr_RAND(src, sizeof(src)) < 0)
		return FALSE;

	/* Label the run with the primitive that is actually measured. */
	return (speed_test("orC_32u", "aligned", g_Iterations, (speed_test_fkt)generic->orC_32u,
	                   (speed_test_fkt)optimized->orC_32u, src + 1, VALUE, dst + 1,
	                   FUNC_TEST_SIZE));
}
/* CTest entry point for the AND/OR primitives: 0 on success, -1 on failure. */
int TestPrimitivesAndOr(int argc, char* argv[])
{
	WINPR_UNUSED(argc);
	WINPR_UNUSED(argv);
	prim_test_setup(FALSE);

	/* Functional tests always run, AND before OR. */
	if (!test_and_32u_func() || !test_or_32u_func())
		return -1;

	/* Benchmarks only run when performance testing is enabled. */
	if (g_TestPrimitivesPerformance && (!test_and_32u_speed() || !test_or_32u_speed()))
		return -1;

	return 0;
}

View File

@@ -0,0 +1,291 @@
/* test_colors.c
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <winpr/sysinfo.h>
#include <freerdp/utils/profiler.h>
#include "prim_test.h"
/* ------------------------------------------------------------------------- */
/* Functional test for RGBToRGB_16s8u_P3AC4R: converts random planar 16-bit
 * R/G/B data of size roi with the generic and the optimized implementation
 * and requires byte-identical output in DstFormat.
 * Returns TRUE only if both conversions succeed and the outputs match; any
 * allocation, random-fill or conversion failure returns FALSE.
 */
static BOOL test_RGBToRGB_16s8u_P3AC4R_func(prim_size_t roi, DWORD DstFormat)
{
	INT16* r = nullptr;
	INT16* g = nullptr;
	INT16* b = nullptr;
	BYTE* out1 = nullptr;
	BYTE* out2 = nullptr;
	/* Stays FALSE on every early exit through the fail label. */
	BOOL rc = FALSE;
	BOOL mismatch = FALSE;
	const INT16* ptrs[3];
	const UINT32 rgbStride = roi.width * 2;
	const UINT32 dstStride = roi.width * 4;
	PROFILER_DEFINE(genericProf)
	PROFILER_DEFINE(optProf)
	PROFILER_CREATE(genericProf, "RGBToRGB_16s8u_P3AC4R-GENERIC")
	PROFILER_CREATE(optProf, "RGBToRGB_16s8u_P3AC4R-OPTIMIZED")
	r = winpr_aligned_calloc(1, 1ULL * rgbStride * roi.height, 16);
	g = winpr_aligned_calloc(1, 1ULL * rgbStride * roi.height, 16);
	b = winpr_aligned_calloc(1, 1ULL * rgbStride * roi.height, 16);
	out1 = winpr_aligned_calloc(1, 1ULL * dstStride * roi.height, 16);
	out2 = winpr_aligned_calloc(1, 1ULL * dstStride * roi.height, 16);
	if (!r || !g || !b || !out1 || !out2)
		goto fail;
	if (winpr_RAND(r, 1ULL * rgbStride * roi.height) < 0)
		goto fail;
	if (winpr_RAND(g, 1ULL * rgbStride * roi.height) < 0)
		goto fail;
	if (winpr_RAND(b, 1ULL * rgbStride * roi.height) < 0)
		goto fail;
	ptrs[0] = r;
	ptrs[1] = g;
	ptrs[2] = b;
	PROFILER_ENTER(genericProf)
	if (generic->RGBToRGB_16s8u_P3AC4R(ptrs, rgbStride, out1, dstStride, DstFormat, &roi) !=
	    PRIMITIVES_SUCCESS)
		goto fail;
	PROFILER_EXIT(genericProf)
	PROFILER_ENTER(optProf)
	if (optimized->RGBToRGB_16s8u_P3AC4R(ptrs, rgbStride, out2, dstStride, DstFormat, &roi) !=
	    PRIMITIVES_SUCCESS)
		goto fail;
	PROFILER_EXIT(optProf)
	if (memcmp(out1, out2, 1ULL * dstStride * roi.height) != 0)
	{
		/* Locate and report every differing pixel. */
		for (UINT64 i = 0; i < 1ull * roi.width * roi.height; ++i)
		{
			const UINT32 o1 = FreeRDPReadColor(out1 + 4 * i, DstFormat);
			const UINT32 o2 = FreeRDPReadColor(out2 + 4 * i, DstFormat);
			if (o1 != o2)
			{
				/* Print the pixel values that were compared. */
				printf("RGBToRGB_16s8u_P3AC4R FAIL: out1[%" PRIu64 "]=0x%08" PRIx32
				       " out2[%" PRIu64 "]=0x%08" PRIx32 "\n",
				       i, o1, i, o2);
				mismatch = TRUE;
			}
		}
	}
	printf("Results for %" PRIu32 "x%" PRIu32 " [%s]\n", roi.width, roi.height,
	       FreeRDPGetColorFormatName(DstFormat));
	PROFILER_PRINT_HEADER
	PROFILER_PRINT(genericProf)
	PROFILER_PRINT(optProf)
	PROFILER_PRINT_FOOTER
	rc = !mismatch;
fail:
	PROFILER_FREE(genericProf)
	PROFILER_FREE(optProf)
	winpr_aligned_free(r);
	winpr_aligned_free(g);
	winpr_aligned_free(b);
	winpr_aligned_free(out1);
	winpr_aligned_free(out2);
	return rc;
}
/* ------------------------------------------------------------------------- */
static BOOL test_RGBToRGB_16s8u_P3AC4R_speed(void)
{
union
{
const INT16** cpv;
INT16** pv;
} cnv;
const prim_size_t roi64x64 = { 64, 64 };
INT16 r[4096 + 1] = WINPR_C_ARRAY_INIT;
INT16 g[4096 + 1] = WINPR_C_ARRAY_INIT;
INT16 b[4096 + 1] = WINPR_C_ARRAY_INIT;
UINT32 dst[4096 + 1] = WINPR_C_ARRAY_INIT;
INT16* ptrs[3] = WINPR_C_ARRAY_INIT;
if (winpr_RAND(r, sizeof(r)) < 0)
return FALSE;
if (winpr_RAND(g, sizeof(g)) < 0)
return FALSE;
if (winpr_RAND(b, sizeof(b)) < 0)
return FALSE;
/* clear upper bytes */
for (int i = 0; i < 4096; ++i)
{
r[i] &= 0x00FFU;
g[i] &= 0x00FFU;
b[i] &= 0x00FFU;
}
ptrs[0] = r + 1;
ptrs[1] = g + 1;
ptrs[2] = b + 1;
cnv.pv = ptrs;
if (!speed_test("RGBToRGB_16s8u_P3AC4R", "aligned", g_Iterations,
(speed_test_fkt)generic->RGBToRGB_16s8u_P3AC4R,
(speed_test_fkt)optimized->RGBToRGB_16s8u_P3AC4R, cnv.cpv, 64 * 2, (BYTE*)dst,
64 * 4, &roi64x64))
return FALSE;
if (!speed_test("RGBToRGB_16s8u_P3AC4R", "unaligned", g_Iterations,
(speed_test_fkt)generic->RGBToRGB_16s8u_P3AC4R,
(speed_test_fkt)optimized->RGBToRGB_16s8u_P3AC4R, cnv.cpv, 64 * 2,
((BYTE*)dst) + 1, 64 * 4, &roi64x64))
return FALSE;
return TRUE;
}
/* ========================================================================= */
/* Functional test for yCbCrToRGB_16s16s_P3P3: converts a random 64x64 block
 * with the generic and the optimized implementation and accepts a difference
 * of at most 1 per channel value.
 */
static BOOL test_yCbCrToRGB_16s16s_P3P3_func(void)
{
	INT16 y[4096] = WINPR_C_ARRAY_INIT;
	INT16 cb[4096] = WINPR_C_ARRAY_INIT;
	INT16 cr[4096] = WINPR_C_ARRAY_INIT;
	INT16 r1[4096] = WINPR_C_ARRAY_INIT;
	INT16 g1[4096] = WINPR_C_ARRAY_INIT;
	INT16 b1[4096] = WINPR_C_ARRAY_INIT;
	INT16 r2[4096] = WINPR_C_ARRAY_INIT;
	INT16 g2[4096] = WINPR_C_ARRAY_INIT;
	INT16 b2[4096] = WINPR_C_ARRAY_INIT;
	const INT16* in[3] = { y, cb, cr };
	INT16* out1[3] = { r1, g1, b1 };
	INT16* out2[3] = { r2, g2, b2 };
	prim_size_t roi = { 64, 64 };

	if (winpr_RAND(y, sizeof(y)) < 0)
		return FALSE;
	if (winpr_RAND(cb, sizeof(cb)) < 0)
		return FALSE;
	if (winpr_RAND(cr, sizeof(cr)) < 0)
		return FALSE;

	/* Normalize to 11.5 fixed radix */
	for (int i = 0; i < 4096; ++i)
	{
		y[i] &= 0x1FE0U;
		cb[i] &= 0x1FE0U;
		cr[i] &= 0x1FE0U;
	}

	if (generic->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out1, 64 * 2, &roi) != PRIMITIVES_SUCCESS)
		return FALSE;
	if (optimized->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out2, 64 * 2, &roi) != PRIMITIVES_SUCCESS)
		return FALSE;

	for (int i = 0; i < 4096; ++i)
	{
		const BOOL redOk = ABS(r1[i] - r2[i]) <= 1;
		const BOOL greenOk = ABS(g1[i] - g2[i]) <= 1;
		const BOOL blueOk = ABS(b1[i] - b2[i]) <= 1;
		if (redOk && greenOk && blueOk)
			continue;

		printf("YCbCrToRGB-SSE FAIL[%d]: %" PRId16 ",%" PRId16 ",%" PRId16 " vs %" PRId16
		       ",%" PRId16 ",%" PRId16 "\n",
		       i, r1[i], g1[i], b1[i], r2[i], g2[i], b2[i]);
		return FALSE;
	}
	return TRUE;
}
/* ------------------------------------------------------------------------- */
/* Benchmarks generic vs. optimized yCbCrToRGB_16s16s_P3P3 on a random 64x64
 * block. Returns FALSE if the random fill fails, otherwise the result of
 * speed_test().
 */
static int test_yCbCrToRGB_16s16s_P3P3_speed(void)
{
	prim_size_t roi = { 64, 64 };
	INT16 y[4096] = WINPR_C_ARRAY_INIT;
	INT16 cb[4096] = WINPR_C_ARRAY_INIT;
	INT16 cr[4096] = WINPR_C_ARRAY_INIT;
	INT16 r[4096] = WINPR_C_ARRAY_INIT;
	INT16 g[4096] = WINPR_C_ARRAY_INIT;
	INT16 b[4096] = WINPR_C_ARRAY_INIT;
	const INT16* input[3] = { y, cb, cr };
	INT16* output[3] = { r, g, b };

	if (winpr_RAND(y, sizeof(y)) < 0)
		return FALSE;
	if (winpr_RAND(cb, sizeof(cb)) < 0)
		return FALSE;
	if (winpr_RAND(cr, sizeof(cr)) < 0)
		return FALSE;

	/* Normalize to 11.5 fixed radix */
	for (int idx = 0; idx < 4096; ++idx)
	{
		y[idx] &= 0x1FE0U;
		cb[idx] &= 0x1FE0U;
		cr[idx] &= 0x1FE0U;
	}

	return (speed_test("yCbCrToRGB_16s16s_P3P3", "aligned", g_Iterations,
	                   (speed_test_fkt)generic->yCbCrToRGB_16s16s_P3P3,
	                   (speed_test_fkt)optimized->yCbCrToRGB_16s16s_P3P3, input, 64 * 2, output,
	                   64 * 2, &roi));
}
/* CTest entry point for the color conversion primitives: 0 on success, 1 on
 * failure. All checks are repeated once per destination pixel format.
 */
int TestPrimitivesColors(int argc, char* argv[])
{
	const DWORD formats[] = { PIXEL_FORMAT_ARGB32, PIXEL_FORMAT_XRGB32, PIXEL_FORMAT_ABGR32,
		                      PIXEL_FORMAT_XBGR32, PIXEL_FORMAT_RGBA32, PIXEL_FORMAT_RGBX32,
		                      PIXEL_FORMAT_BGRA32, PIXEL_FORMAT_BGRX32 };
	const UINT32 formatCount = sizeof(formats) / sizeof(formats[0]);
	prim_size_t roi = { 1920 / 4, 1080 / 4 };

	WINPR_UNUSED(argc);
	WINPR_UNUSED(argv);
	prim_test_setup(FALSE);

	for (UINT32 idx = 0; idx < formatCount; idx++)
	{
		if (!test_RGBToRGB_16s8u_P3AC4R_func(roi, formats[idx]))
			return 1;
		if (g_TestPrimitivesPerformance && !test_RGBToRGB_16s8u_P3AC4R_speed())
			return 1;
		if (!test_yCbCrToRGB_16s16s_P3P3_func())
			return 1;
		if (g_TestPrimitivesPerformance && !test_yCbCrToRGB_16s16s_P3P3_speed())
			return 1;
	}
	return 0;
}

View File

@@ -0,0 +1,296 @@
/* test_copy.c
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <stdio.h>
#include <freerdp/config.h>
#include <winpr/crypto.h>
#include <winpr/sysinfo.h>
#include "prim_test.h"
#define COPY_TESTSIZE (256 * 2 + 16 * 2 + 15 + 15)
/* ------------------------------------------------------------------------- */
/* Functional test for copy_8u: copies every length from 1 to
 * COPY_TESTSIZE - doff for all 16 x 16 combinations of source and destination
 * offset and verifies the copied bytes.
 */
static BOOL test_copy8u_func(void)
{
	primitives_t* prims = primitives_get();
	BYTE data[COPY_TESTSIZE + 15] = WINPR_C_ARRAY_INIT;

	if (!prims)
		return FALSE;
	if (winpr_RAND(data, sizeof(data)) < 0)
		return FALSE;

	for (int soff = 0; soff < 16; ++soff)
	{
		for (int doff = 0; doff < 16; ++doff)
		{
			for (int length = 1; length <= COPY_TESTSIZE - doff; ++length)
			{
				/* Fresh zeroed destination for every copy. */
				BYTE dest[COPY_TESTSIZE + 15] = WINPR_C_ARRAY_INIT;

				if (prims->copy_8u(data + soff, dest + doff, length) != PRIMITIVES_SUCCESS)
					return FALSE;

				for (int i = 0; i < length; ++i)
				{
					if (dest[i + doff] != data[i + soff])
					{
						/* Report both offsets and keep the two values apart. */
						printf("COPY8U FAIL: soff=%d doff=%d len=%d, dest[%d]=0x%02" PRIx8
						       ", data[%d]=0x%02" PRIx8 "\n",
						       soff, doff, length, i + doff, dest[i + doff], i + soff,
						       data[i + soff]);
						return FALSE;
					}
				}
			}
		}
	}
	return TRUE;
}
/* ------------------------------------------------------------------------- */
/* Benchmarks generic vs. optimized copy_8u, first from the start of the
 * buffers and then with both pointers offset by one byte.
 */
static BOOL test_copy8u_speed(void)
{
	BYTE src[MAX_TEST_SIZE + 4] = WINPR_C_ARRAY_INIT;
	BYTE dst[MAX_TEST_SIZE + 4] = WINPR_C_ARRAY_INIT;

	/* The second benchmark only runs if the first one succeeded. */
	if (!speed_test("copy_8u", "aligned", g_Iterations, (speed_test_fkt)generic->copy_8u,
	                (speed_test_fkt)optimized->copy_8u, src, dst, MAX_TEST_SIZE) ||
	    !speed_test("copy_8u", "unaligned", g_Iterations, (speed_test_fkt)generic->copy_8u,
	                (speed_test_fkt)optimized->copy_8u, src + 1, dst + 1, MAX_TEST_SIZE))
		return FALSE;

	return TRUE;
}
/* Allocates h rows of (w * bpp + pad) bytes and fills them with random data.
 * If copy is not NULL, a second buffer with identical content is allocated
 * and stored in *copy. Returns the first buffer, or nullptr on failure (no
 * buffer is left allocated in that case). Buffers are released with free().
 */
static BYTE* rand_alloc(size_t w, size_t h, size_t bpp, size_t pad, BYTE** copy)
{
	const size_t stride = w * bpp + pad;
	const size_t total = stride * h;
	BYTE* duplicate = nullptr;
	BYTE* buffer = calloc(stride, h);

	if (!buffer)
		return nullptr;
	if (winpr_RAND(buffer, total) < 0)
		goto fail;
	if (!copy)
		return buffer;

	duplicate = calloc(stride, h);
	if (!duplicate)
		goto fail;

	memcpy(duplicate, buffer, total);
	*copy = duplicate;
	return buffer;

fail:
	free(buffer);
	return nullptr;
}
static size_t runcount = 0;
/* Runs copy_no_overlap with the generic and the active implementation on
 * identical random buffers and requires that
 *  - the source buffer is left untouched by both runs, and
 *  - both destination buffers are byte-identical afterwards.
 * w x h is the copied area, (dxoff, dyoff) / (sxoff, syoff) are the offsets
 * inside the destination / source buffers and pad is the number of extra
 * bytes per row. Returns TRUE on success.
 */
static BOOL test_copy_no_overlap_off(BOOL verbose, UINT32 srcFormat, UINT32 dstFormat, UINT32 flags,
                                     UINT32 pad, UINT32 w, UINT32 h, UINT32 dxoff, UINT32 dyoff,
                                     UINT32 sxoff, UINT32 syoff)
{
	BOOL rc = FALSE;
	primitives_t* gen = primitives_get_generic();
	primitives_t* prims = primitives_get();
	if (!gen || !prims)
		return FALSE;

	runcount++;

	WINPR_ASSERT(dxoff < w);
	WINPR_ASSERT(sxoff < w);
	WINPR_ASSERT(dyoff < h);
	WINPR_ASSERT(syoff < h);

	const UINT32 sbpp = FreeRDPGetBytesPerPixel(srcFormat);
	const UINT32 dbpp = FreeRDPGetBytesPerPixel(dstFormat);

	if (verbose)
	{
		(void)fprintf(stderr,
		              "run src: %s, dst: %s [flags 0x%08" PRIx32 "] %" PRIu32 "x%" PRIu32
		              ", soff=%" PRIu32 "x%" PRIu32 ", doff=%" PRIu32 "x%" PRIu32 ", pad=%" PRIu32
		              "\n",
		              FreeRDPGetColorFormatName(srcFormat), FreeRDPGetColorFormatName(dstFormat),
		              flags, w, h, sxoff, syoff, dxoff, dyoff, pad);
	}

	const UINT32 sstride = (w + sxoff) * sbpp + pad;
	const UINT32 dstride = (w + dxoff) * dbpp + pad;
	/* The buffers hold h + yoff rows. The copied area starts at row yoff, so
	 * the whole buffer has to be compared to cover its last rows. */
	const size_t srcSize = 1ULL * sstride * (1ULL * h + syoff);
	const size_t dstSize = 1ULL * dstride * (1ULL * h + dyoff);

	BYTE* dst2 = nullptr;
	BYTE* src2 = nullptr;
	BYTE* dst1 = rand_alloc(w + dxoff, h + dyoff, dbpp, pad, &dst2);
	BYTE* src1 = rand_alloc(w + sxoff, h + syoff, sbpp, pad, &src2);
	if (!dst1 || !dst2 || !src1 || !src2)
		goto fail;

	if (gen->copy_no_overlap(dst1, dstFormat, dstride, dxoff, dyoff, w, h, src1, srcFormat, sstride,
	                         sxoff, syoff, nullptr, flags) != PRIMITIVES_SUCCESS)
		goto fail;

	/* The source must not be modified by the copy. */
	if (memcmp(src1, src2, srcSize) != 0)
		goto fail;

	if (prims->copy_no_overlap(dst2, dstFormat, dstride, dxoff, dyoff, w, h, src1, srcFormat,
	                           sstride, sxoff, syoff, nullptr, flags) != PRIMITIVES_SUCCESS)
		goto fail;

	if (memcmp(src1, src2, srcSize) != 0)
		goto fail;

	/* Both destinations started out identical, so they must still match. */
	if (memcmp(dst1, dst2, dstSize) != 0)
		goto fail;

	if (flags == FREERDP_KEEP_DST_ALPHA)
	{
		/* Explicitly compare the alpha channel of every copied pixel. */
		for (size_t y = 0; y < h; y++)
		{
			const BYTE* d1 = &dst1[(y + dyoff) * dstride];
			const BYTE* d2 = &dst2[(y + dyoff) * dstride];
			for (size_t x = 0; x < w; x++)
			{
				const UINT32 c1 = FreeRDPReadColor(&d1[(x + dxoff) * dbpp], dstFormat);
				const UINT32 c2 = FreeRDPReadColor(&d2[(x + dxoff) * dbpp], dstFormat);
				BYTE a1 = 0;
				BYTE a2 = 0;
				FreeRDPSplitColor(c1, dstFormat, nullptr, nullptr, nullptr, &a1, nullptr);
				FreeRDPSplitColor(c2, dstFormat, nullptr, nullptr, nullptr, &a2, nullptr);
				if (a1 != a2)
					goto fail;
			}
		}
	}

	rc = TRUE;

fail:
	if (!rc)
	{
		(void)fprintf(stderr, "failed to compare copy_no_overlap(%s -> %s [0x%08" PRIx32 "])\n",
		              FreeRDPGetColorFormatName(srcFormat), FreeRDPGetColorFormatName(dstFormat),
		              flags);
	}
	free(dst1);
	free(dst2);
	free(src1);
	free(src2);
	return rc;
}
/**
 * Exercise test_copy_no_overlap_off() for one format/flag combination with all
 * combinations of small source/destination offsets and row paddings.
 *
 * All combinations are executed even after a failure.
 *
 * @return TRUE if every single run succeeded, FALSE otherwise
 */
static BOOL test_copy_no_overlap(BOOL verbose, UINT32 srcFormat, UINT32 dstFormat, UINT32 flags,
                                 UINT32 width, UINT32 height)
{
	const UINT32 maxXOffset = 4;
	const UINT32 maxYOffset = 4;
	/* We need minimum alignment of 8 bytes.
	 * AVX2 can read 8 pixels (at most 8x4=32 bytes) per step
	 * if we have 24bpp input that is 24 bytes with 8 bytes read
	 * out of bound */
	const UINT32 minPad = 8;
	const UINT32 maxPad = 12;
	BOOL success = TRUE;

	for (UINT32 dx = 0; dx < maxXOffset; dx++)
	{
		for (UINT32 dy = 0; dy <= maxYOffset; dy++)
		{
			for (UINT32 sx = 0; sx <= maxXOffset; sx++)
			{
				for (UINT32 sy = 0; sy <= maxYOffset; sy++)
				{
					for (UINT32 padding = minPad; padding <= maxPad; padding++)
					{
						const BOOL ok =
						    test_copy_no_overlap_off(verbose, srcFormat, dstFormat, flags,
						                             padding, width, height, dx, dy, sx, sy);
						if (!ok)
							success = FALSE;
					}
				}
			}
		}
	}

	return success;
}
/**
 * Test entry point for the copy primitives.
 *
 * Runs the copy_8u tests first (returning 1 on failure), then compares the
 * generic and optimized copy_no_overlap for every flag / source format /
 * destination format combination (returning -1 if any comparison failed).
 * Any extra command line argument enables verbose output.
 */
int TestPrimitivesCopy(int argc, char* argv[])
{
	WINPR_UNUSED(argc);
	WINPR_UNUSED(argv);

	const BOOL verbose = argc > 1;

	prim_test_setup(FALSE);

	if (!test_copy8u_func())
		return 1;

	if (g_TestPrimitivesPerformance && !test_copy8u_speed())
		return 1;

	const UINT32 flags[] = {
		FREERDP_FLIP_NONE,
		FREERDP_KEEP_DST_ALPHA,
		FREERDP_FLIP_HORIZONTAL,
		FREERDP_KEEP_DST_ALPHA | FREERDP_FLIP_HORIZONTAL,
#if defined(TEST_ALL_FLAGS)
		FREERDP_FLIP_VERTICAL,
		FREERDP_FLIP_VERTICAL | FREERDP_FLIP_HORIZONTAL,
		FREERDP_KEEP_DST_ALPHA | FREERDP_FLIP_VERTICAL,
		FREERDP_KEEP_DST_ALPHA | FREERDP_FLIP_VERTICAL | FREERDP_FLIP_HORIZONTAL
#endif
	};
	const UINT32 formats[] = { PIXEL_FORMAT_BGRA32,
		                       PIXEL_FORMAT_BGRX32,
		                       PIXEL_FORMAT_BGR24
#if defined(TEST_ALL_FLAGS) /* Only the previous 3 have SIMD optimizations, so skip the rest */
		                       ,
		                       PIXEL_FORMAT_RGB24,
		                       PIXEL_FORMAT_ABGR32,
		                       PIXEL_FORMAT_ARGB32,
		                       PIXEL_FORMAT_XBGR32,
		                       PIXEL_FORMAT_XRGB32,
		                       PIXEL_FORMAT_RGBA32,
		                       PIXEL_FORMAT_RGBX32
#endif
	};
	const size_t flagCount = ARRAYSIZE(flags);
	const size_t formatCount = ARRAYSIZE(formats);

	int rc = 0;
	for (size_t f = 0; f < flagCount; f++)
	{
		/* Walk all (source, destination) format pairs, source index changing slowest */
		for (size_t pair = 0; pair < formatCount * formatCount; pair++)
		{
			const UINT32 sformat = formats[pair / formatCount];
			const UINT32 dformat = formats[pair % formatCount];
			if (!test_copy_no_overlap(verbose, sformat, dformat, flags[f], 21, 17))
				rc = -1;
		}
	}

	if (verbose)
		(void)fprintf(stderr, "runcount=%" PRIuz "\n", runcount);

	return rc;
}

View File

@@ -0,0 +1,277 @@
/* test_set.c
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <winpr/sysinfo.h>
#include "prim_test.h"
/* ------------------------------------------------------------------------- */
/**
 * Verify that src[offset] .. src[offset + length - 1] all equal value.
 * The first mismatch is reported on stdout.
 *
 * @return TRUE if all bytes match, FALSE otherwise
 */
static BOOL check8(const BYTE* src, UINT32 length, UINT32 offset, BYTE value)
{
	UINT32 i = 0;
	while (i < length)
	{
		const UINT32 pos = offset + i;
		const BYTE actual = src[pos];
		if (actual != value)
		{
			printf("SET8U FAILED: off=%" PRIu32 " len=%" PRIu32 " dest[%" PRIu32 "]=0x%02" PRIx8
			       "\n",
			       offset, length, pos, actual);
			return FALSE;
		}
		i++;
	}
	return TRUE;
}
static BOOL test_set8u_func(void)
{
pstatus_t status = 0;
for (UINT32 off = 0; off < 16; ++off)
{
BYTE dest[1024];
memset(dest, 3, sizeof(dest));
for (UINT32 len = 1; len < 48 - off; ++len)
{
status = generic->set_8u(0xa5, dest + off, len);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
if (!check8(dest, len, off, 0xa5))
return FALSE;
}
}
for (UINT32 off = 0; off < 16; ++off)
{
BYTE dest[1024];
memset(dest, 3, sizeof(dest));
for (UINT32 len = 1; len < 48 - off; ++len)
{
status = optimized->set_8u(0xa5, dest + off, len);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
if (!check8(dest, len, off, 0xa5))
return FALSE;
}
}
return TRUE;
}
/* ------------------------------------------------------------------------- */
/**
 * Speed test for set_8u with a random fill byte, run for x = 0..15 where x is
 * used both as destination offset and as length.
 */
static BOOL test_set8u_speed(void)
{
	BYTE dest[1024];
	BYTE value = 0;
	UINT32 x = 0;

	while (x < 16)
	{
		const BOOL haveRandom = (winpr_RAND(&value, sizeof(value)) >= 0);
		if (!haveRandom)
			return FALSE;

		const BOOL ok = speed_test("set_8u", "", g_Iterations, (speed_test_fkt)generic->set_8u,
		                           (speed_test_fkt)optimized->set_8u, value, dest + x, x);
		if (!ok)
			return FALSE;
		x++;
	}
	return TRUE;
}
/**
 * Verify that src[offset] .. src[offset + length - 1] all equal value.
 * The first mismatch is reported on stdout.
 *
 * @return TRUE if all elements match, FALSE otherwise
 */
static BOOL check32s(const INT32* src, UINT32 length, UINT32 offset, INT32 value)
{
	for (UINT32 i = 0; i < length; ++i)
	{
		const UINT32 pos = offset + i;
		if (src[pos] != value)
		{
			/* Name the primitive actually under test (was a copy of the SET8U message);
			 * the cast makes the argument match the unsigned PRIx32 conversion. */
			printf("SET32S FAILED: off=%" PRIu32 " len=%" PRIu32 " dest[%" PRIu32 "]=0x%08" PRIx32
			       "\n",
			       offset, length, pos, (UINT32)src[pos]);
			return FALSE;
		}
	}
	return TRUE;
}
/* ------------------------------------------------------------------------- */
static BOOL test_set32s_func(void)
{
pstatus_t status = 0;
const INT32 value = -0x12345678;
for (UINT32 off = 0; off < 16; ++off)
{
INT32 dest[1024] = WINPR_C_ARRAY_INIT;
for (UINT32 len = 1; len < 48 - off; ++len)
{
status = generic->set_32s(value, dest + off, len);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
if (!check32s(dest, len, off, value))
return FALSE;
}
}
for (UINT32 off = 0; off < 16; ++off)
{
INT32 dest[1024] = WINPR_C_ARRAY_INIT;
for (UINT32 len = 1; len < 48 - off; ++len)
{
status = optimized->set_32s(value, dest + off, len);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
if (!check32s(dest, len, off, value))
return FALSE;
}
}
return TRUE;
}
/**
 * Verify that src[offset] .. src[offset + length - 1] all equal value.
 * The first mismatch is reported on stdout.
 *
 * @return TRUE if all elements match, FALSE otherwise
 */
static BOOL check32u(const UINT32* src, UINT32 length, UINT32 offset, UINT32 value)
{
	for (UINT32 i = 0; i < length; ++i)
	{
		const UINT32 pos = offset + i;
		if (src[pos] != value)
		{
			/* Name the primitive actually under test (was a copy of the SET8U message) */
			printf("SET32U FAILED: off=%" PRIu32 " len=%" PRIu32 " dest[%" PRIu32 "]=0x%08" PRIx32
			       "\n",
			       offset, length, pos, src[pos]);
			return FALSE;
		}
	}
	return TRUE;
}
/* ------------------------------------------------------------------------- */
static BOOL test_set32u_func(void)
{
pstatus_t status = 0;
const UINT32 value = 0xABCDEF12;
for (UINT32 off = 0; off < 16; ++off)
{
UINT32 dest[1024] = WINPR_C_ARRAY_INIT;
for (UINT32 len = 1; len < 48 - off; ++len)
{
status = generic->set_32u(value, dest + off, len);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
if (!check32u(dest, len, off, value))
return FALSE;
}
}
for (UINT32 off = 0; off < 16; ++off)
{
UINT32 dest[1024] = WINPR_C_ARRAY_INIT;
for (UINT32 len = 1; len < 48 - off; ++len)
{
status = optimized->set_32u(value, dest + off, len);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
if (!check32u(dest, len, off, value))
return FALSE;
}
}
return TRUE;
}
/* ------------------------------------------------------------------------- */
/**
 * Speed test for set_32u with a random fill value, run for x = 0..15 where x
 * is used both as destination offset and as length.
 */
static BOOL test_set32u_speed(void)
{
	UINT32 dest[1024];
	/* Full width random value: a BYTE would only ever exercise values 0..255 */
	UINT32 value = 0;

	for (UINT32 x = 0; x < 16; x++)
	{
		if (winpr_RAND(&value, sizeof(value)) < 0)
			return FALSE;

		if (!speed_test("set_32u", "", g_Iterations, (speed_test_fkt)generic->set_32u,
		                (speed_test_fkt)optimized->set_32u, value, dest + x, x))
			return FALSE;
	}
	return TRUE;
}
/* ------------------------------------------------------------------------- */
/**
 * Speed test for set_32s with a random fill value, run for x = 0..15 where x
 * is used both as destination offset and as length.
 */
static BOOL test_set32s_speed(void)
{
	INT32 dest[1024];
	/* Full width random value: a BYTE would only ever exercise values 0..255 */
	INT32 value = 0;

	for (UINT32 x = 0; x < 16; x++)
	{
		if (winpr_RAND(&value, sizeof(value)) < 0)
			return FALSE;

		if (!speed_test("set_32s", "", g_Iterations, (speed_test_fkt)generic->set_32s,
		                (speed_test_fkt)optimized->set_32s, value, dest + x, x))
			return FALSE;
	}
	return TRUE;
}
int TestPrimitivesSet(int argc, char* argv[])
{
WINPR_UNUSED(argc);
WINPR_UNUSED(argv);
prim_test_setup(FALSE);
if (!test_set8u_func())
return -1;
if (!test_set32s_func())
return -1;
if (!test_set32u_func())
return -1;
if (g_TestPrimitivesPerformance)
{
if (!test_set8u_speed())
return -1;
if (!test_set32s_speed())
return -1;
if (!test_set32u_speed())
return -1;
}
return 0;
}

View File

@@ -0,0 +1,470 @@
/* test_shift.c
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <winpr/sysinfo.h>
#include "prim_test.h"
#define FUNC_TEST_SIZE 65536
static BOOL test_lShift_16s_func(void)
{
pstatus_t status = 0;
INT16 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
INT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
UINT32 val = 0;
if (winpr_RAND(&val, sizeof(val)) < 0)
return FALSE;
if (winpr_RAND(src, sizeof(src)) < 0)
return FALSE;
val = val % 16;
/* Negative tests */
status = generic->lShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
if (status == PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->lShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
if (status == PRIMITIVES_SUCCESS)
return FALSE;
/* Aligned */
status = generic->lShiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->lShiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
/* Unaligned */
status = generic->lShiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->lShiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
return (status == PRIMITIVES_SUCCESS);
}
static BOOL test_lShift_16u_func(void)
{
pstatus_t status = 0;
UINT16 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
UINT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
UINT32 val = 0;
if (winpr_RAND(&val, sizeof(val)) < 0)
return FALSE;
if (winpr_RAND(src, sizeof(src)) < 0)
return FALSE;
val = val % 16;
/* Negative tests */
status = generic->lShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
if (status == PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->lShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
if (status == PRIMITIVES_SUCCESS)
return FALSE;
/* Aligned */
status = generic->lShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->lShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
/* Unaligned */
status = generic->lShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->lShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
return (status == PRIMITIVES_SUCCESS);
}
static BOOL test_rShift_16s_func(void)
{
pstatus_t status = 0;
INT16 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
INT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
UINT32 val = 0;
if (winpr_RAND(&val, sizeof(val)) < 0)
return FALSE;
if (winpr_RAND(src, sizeof(src)) < 0)
return FALSE;
val = val % 16;
/* Negative Tests */
status = generic->rShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
if (status == PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->rShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
if (status == PRIMITIVES_SUCCESS)
return FALSE;
/* Aligned */
status = generic->rShiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->rShiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
/* Unaligned */
status = generic->rShiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->rShiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
return (status == PRIMITIVES_SUCCESS);
}
static BOOL test_rShift_16u_func(void)
{
pstatus_t status = 0;
UINT16 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
UINT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
UINT32 val = 0;
if (winpr_RAND(&val, sizeof(val)) < 0)
return FALSE;
if (winpr_RAND(src, sizeof(src)) < 0)
return FALSE;
val = val % 16;
/* Negative tests */
status = generic->rShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
if (status == PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->rShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
if (status == PRIMITIVES_SUCCESS)
return FALSE;
/* Aligned */
status = generic->rShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->rShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
/* Unaligned */
status = generic->rShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->rShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
return (status == PRIMITIVES_SUCCESS);
}
static BOOL test_ShiftWrapper_16s_func(void)
{
pstatus_t status = 0;
INT16 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
INT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
UINT32 tmp = 0;
if (winpr_RAND(&tmp, sizeof(tmp)) < 0)
return FALSE;
if (winpr_RAND(src, sizeof(src)) < 0)
return FALSE;
INT32 val = WINPR_ASSERTING_INT_CAST(int32_t, tmp % 16);
/* Negative tests */
status = generic->shiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
if (status == PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
if (status == PRIMITIVES_SUCCESS)
return FALSE;
/* Aligned */
status = generic->shiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = generic->shiftC_16s(src + 1, -val, d1 + 1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16s(src + 1, -val, d1 + 1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
/* Unaligned */
status = generic->shiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = generic->shiftC_16s(src + 1, -val, d1 + 2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16s(src + 1, -val, d1 + 2, FUNC_TEST_SIZE);
return (status == PRIMITIVES_SUCCESS);
}
static BOOL test_ShiftWrapper_16u_func(void)
{
pstatus_t status = 0;
UINT16 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
UINT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
UINT32 tmp = 0;
if (winpr_RAND(&tmp, sizeof(tmp)) < 0)
return FALSE;
if (winpr_RAND(src, sizeof(src)) < 0)
return FALSE;
INT32 val = WINPR_ASSERTING_INT_CAST(int32_t, tmp % 16);
/* Negative */
status = generic->shiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
if (status == PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
if (status == PRIMITIVES_SUCCESS)
return FALSE;
/* Aligned */
status = generic->shiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = generic->shiftC_16u(src + 1, -val, d1 + 1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16u(src + 1, -val, d1 + 1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
/* Unaligned */
status = generic->shiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = generic->shiftC_16u(src + 1, -val, d1 + 2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16u(src + 1, -val, d1 + 2, FUNC_TEST_SIZE);
return (status == PRIMITIVES_SUCCESS);
}
/* ------------------------------------------------------------------------- */
/**
 * Speed test for lShiftC_16s with a random shift in 0..15, once starting at
 * src ("aligned") and once at src + 1 ("unaligned").
 */
static BOOL test_lShift_16s_speed(void)
{
	UINT32 val = 0;
	INT16 src[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
	INT16 dst[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;

	if (winpr_RAND(src, sizeof(src)) < 0)
		return FALSE;
	/* Check for a negative result like every other winpr_RAND call in the tests */
	if (winpr_RAND(&val, sizeof(val)) < 0)
		return FALSE;
	val = val % 16;

	if (!speed_test("lShift_16s", "aligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16s,
	                (speed_test_fkt)optimized->lShiftC_16s, src, val, dst, MAX_TEST_SIZE))
		return FALSE;
	if (!speed_test("lShift_16s", "unaligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16s,
	                (speed_test_fkt)optimized->lShiftC_16s, src + 1, val, dst, MAX_TEST_SIZE))
		return FALSE;
	return TRUE;
}
/* ------------------------------------------------------------------------- */
/**
 * Speed test for lShiftC_16u with a random shift in 0..15, once starting at
 * src ("aligned") and once at src + 1 ("unaligned").
 */
static BOOL test_lShift_16u_speed(void)
{
	UINT16 src[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
	UINT16 dst[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
	UINT32 val = 0;

	if ((winpr_RAND(&val, sizeof(val)) < 0) || (winpr_RAND(src, sizeof(src)) < 0))
		return FALSE;
	val %= 16;

	const BOOL aligned =
	    speed_test("lShift_16u", "aligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16u,
	               (speed_test_fkt)optimized->lShiftC_16u, src, val, dst, MAX_TEST_SIZE);
	if (!aligned)
		return FALSE;

	const BOOL unaligned =
	    speed_test("lShift_16u", "unaligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16u,
	               (speed_test_fkt)optimized->lShiftC_16u, src + 1, val, dst, MAX_TEST_SIZE);
	return unaligned ? TRUE : FALSE;
}
/* ------------------------------------------------------------------------- */
/**
 * Speed test for rShiftC_16s with a random shift in 0..15, once starting at
 * src ("aligned") and once at src + 1 ("unaligned").
 */
static BOOL test_rShift_16s_speed(void)
{
	INT16 src[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
	INT16 dst[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
	UINT32 val = 0;

	if ((winpr_RAND(src, sizeof(src)) < 0) || (winpr_RAND(&val, sizeof(val)) < 0))
		return FALSE;
	val %= 16;

	const BOOL aligned =
	    speed_test("rShift_16s", "aligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16s,
	               (speed_test_fkt)optimized->rShiftC_16s, src, val, dst, MAX_TEST_SIZE);
	if (!aligned)
		return FALSE;

	const BOOL unaligned =
	    speed_test("rShift_16s", "unaligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16s,
	               (speed_test_fkt)optimized->rShiftC_16s, src + 1, val, dst, MAX_TEST_SIZE);
	return unaligned ? TRUE : FALSE;
}
/* ------------------------------------------------------------------------- */
/**
 * Speed test for rShiftC_16u with a random shift in 0..15, once starting at
 * src ("aligned") and once at src + 1 ("unaligned").
 */
static BOOL test_rShift_16u_speed(void)
{
	UINT16 src[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
	UINT16 dst[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
	UINT32 val = 0;

	if ((winpr_RAND(&val, sizeof(val)) < 0) || (winpr_RAND(src, sizeof(src)) < 0))
		return FALSE;
	val %= 16;

	const BOOL aligned =
	    speed_test("rShift_16u", "aligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16u,
	               (speed_test_fkt)optimized->rShiftC_16u, src, val, dst, MAX_TEST_SIZE);
	if (!aligned)
		return FALSE;

	const BOOL unaligned =
	    speed_test("rShift_16u", "unaligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16u,
	               (speed_test_fkt)optimized->rShiftC_16u, src + 1, val, dst, MAX_TEST_SIZE);
	return unaligned ? TRUE : FALSE;
}
int TestPrimitivesShift(int argc, char* argv[])
{
WINPR_UNUSED(argc);
WINPR_UNUSED(argv);
prim_test_setup(FALSE);
if (!test_lShift_16s_func())
return 1;
if (g_TestPrimitivesPerformance)
{
if (!test_lShift_16s_speed())
return 1;
}
if (!test_lShift_16u_func())
return 1;
if (g_TestPrimitivesPerformance)
{
if (!test_lShift_16u_speed())
return 1;
}
if (!test_rShift_16s_func())
return 1;
if (g_TestPrimitivesPerformance)
{
if (!test_rShift_16s_speed())
return 1;
}
if (!test_rShift_16u_func())
return 1;
if (g_TestPrimitivesPerformance)
{
if (!test_rShift_16u_speed())
return 1;
}
if (!test_ShiftWrapper_16s_func())
return 1;
if (!test_ShiftWrapper_16u_func())
return 1;
return 0;
}

View File

@@ -0,0 +1,95 @@
/* test_sign.c
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <winpr/sysinfo.h>
#include "prim_test.h"
#define TEST_BUFFER_SIZE 65535
/* ------------------------------------------------------------------------- */
/**
 * Functional test for sign_16s: run the generic and the optimized
 * implementation on the same random input for two destination offsets and
 * require both complete destination buffers to be identical afterwards.
 */
static BOOL test_sign16s_func(void)
{
	INT16 src[TEST_BUFFER_SIZE + 16] = WINPR_C_ARRAY_INIT;
	INT16 d1[TEST_BUFFER_SIZE + 16] = WINPR_C_ARRAY_INIT;
	INT16 d2[TEST_BUFFER_SIZE + 16] = WINPR_C_ARRAY_INIT;

	if (winpr_RAND(src, sizeof(src)) < 0)
		return FALSE;

	for (size_t dstOffset = 1; dstOffset <= 2; dstOffset++)
	{
		/* d1 receives the generic result, d2 the optimized one */
		if (generic->sign_16s(src + 1, d1 + dstOffset, TEST_BUFFER_SIZE) != PRIMITIVES_SUCCESS)
			return FALSE;
		if (optimized->sign_16s(src + 1, d2 + dstOffset, TEST_BUFFER_SIZE) != PRIMITIVES_SUCCESS)
			return FALSE;
		if (memcmp(d1, d2, sizeof(d1)) != 0)
			return FALSE;
	}
	return TRUE;
}
static int test_sign16s_speed(void)
{
INT16 src[MAX_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
INT16 dst[MAX_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
if (winpr_RAND(src, sizeof(src)) < 0)
return FALSE;
if (!speed_test("sign16s", "aligned", g_Iterations, (speed_test_fkt)generic->sign_16s,
(speed_test_fkt)optimized->sign_16s, src + 1, dst + 1, MAX_TEST_SIZE))
return FALSE;
if (!speed_test("sign16s", "unaligned", g_Iterations, (speed_test_fkt)generic->sign_16s,
(speed_test_fkt)optimized->sign_16s, src + 1, dst + 2, MAX_TEST_SIZE))
return FALSE;
return TRUE;
}
int TestPrimitivesSign(int argc, char* argv[])
{
WINPR_UNUSED(argc);
WINPR_UNUSED(argv);
prim_test_setup(FALSE);
if (!test_sign16s_func())
return 1;
if (g_TestPrimitivesPerformance)
{
if (!test_sign16s_speed())
return 1;
}
return 0;
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,150 @@
/* test_YCoCg.c
* vi:ts=4 sw=4
*
* (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <freerdp/config.h>
#include <winpr/sysinfo.h>
#include "prim_test.h"
#include <freerdp/utils/profiler.h>
/* ------------------------------------------------------------------------- */
/**
 * Convert a random width x height buffer (4 bytes per pixel) with the generic
 * and the optimized YCoCgToRGB_8u_AC4R implementation for several 32 bit
 * destination formats and require both results to be identical.
 *
 * Every differing pixel is reported on stdout together with the 4 source
 * bytes it was converted from. Profiler results are printed per format.
 *
 * @param width  image width in pixels
 * @param height image height in pixels
 *
 * @return TRUE if both implementations succeeded and agreed for all formats
 */
static BOOL test_YCoCgRToRGB_8u_AC4R_func(UINT32 width, UINT32 height)
{
	pstatus_t status = -1;
	BYTE* out_sse = nullptr;
	BYTE* in = nullptr;
	BYTE* out_c = nullptr;
	const UINT32 srcStride = width * 4;
	const UINT32 size = srcStride * height;
	const UINT32 formats[] = { PIXEL_FORMAT_ARGB32, PIXEL_FORMAT_ABGR32, PIXEL_FORMAT_RGBA32,
		                       PIXEL_FORMAT_RGBX32, PIXEL_FORMAT_BGRA32, PIXEL_FORMAT_BGRX32 };
	PROFILER_DEFINE(genericProf)
	PROFILER_DEFINE(optProf)
	in = winpr_aligned_calloc(1, size, 16);
	out_c = winpr_aligned_calloc(1, size, 16);
	out_sse = winpr_aligned_calloc(1, size, 16);

	if (!in || !out_c || !out_sse)
		goto fail;

	if (winpr_RAND(in, size) < 0)
		goto fail;

	for (size_t x = 0; x < sizeof(formats) / sizeof(formats[0]); x++)
	{
		const UINT32 format = formats[x];
		const UINT32 dstStride = width * FreeRDPGetBytesPerPixel(format);
		const char* formatName = FreeRDPGetColorFormatName(format);
		PROFILER_CREATE(genericProf, "YCoCgRToRGB_8u_AC4R-GENERIC")
		PROFILER_CREATE(optProf, "YCoCgRToRGB_8u_AC4R-OPT")
		PROFILER_ENTER(genericProf)
		status = generic->YCoCgToRGB_8u_AC4R(in, WINPR_ASSERTING_INT_CAST(int, srcStride), out_c,
		                                     format, WINPR_ASSERTING_INT_CAST(int, dstStride),
		                                     width, height, 2, TRUE);
		PROFILER_EXIT(genericProf)

		if (status != PRIMITIVES_SUCCESS)
			goto loop_fail;

		PROFILER_ENTER(optProf)
		status = optimized->YCoCgToRGB_8u_AC4R(
		    in, WINPR_ASSERTING_INT_CAST(int, srcStride), out_sse, format,
		    WINPR_ASSERTING_INT_CAST(int, dstStride), width, height, 2, TRUE);
		PROFILER_EXIT(optProf)

		if (status != PRIMITIVES_SUCCESS)
			goto loop_fail;

		if (memcmp(out_c, out_sse, 1ULL * dstStride * height) != 0)
		{
			/* Locate and report every differing pixel */
			for (size_t i = 0; i < 1ull * width * height; ++i)
			{
				const UINT32 c = FreeRDPReadColor(out_c + 4 * i, format);
				const UINT32 sse = FreeRDPReadColor(out_sse + 4 * i, format);

				if (c != sse)
				{
					/* Source pixel i occupies bytes 4*i .. 4*i+3 of the input;
					 * report those instead of the single unrelated byte in[i + 1]. */
					UINT32 srcPixel = 0;
					memcpy(&srcPixel, in + 4 * i, sizeof(srcPixel));
					printf("optimized->YCoCgRToRGB FAIL[%s] [%" PRIuz "]: 0x%08" PRIx32
					       " -> C 0x%08" PRIx32 " vs optimized 0x%08" PRIx32 "\n",
					       formatName, i, srcPixel, c, sse);
					status = -1;
				}
			}
		}

		printf("--------------------------- [%s] [%" PRIu32 "x%" PRIu32
		       "] ---------------------------\n",
		       formatName, width, height);
		PROFILER_PRINT_HEADER
		PROFILER_PRINT(genericProf)
		PROFILER_PRINT(optProf)
		PROFILER_PRINT_FOOTER
	loop_fail:
		PROFILER_FREE(genericProf)
		PROFILER_FREE(optProf)

		if (status != PRIMITIVES_SUCCESS)
			goto fail;
	}

fail:
	winpr_aligned_free(in);
	winpr_aligned_free(out_c);
	winpr_aligned_free(out_sse);
	return status == PRIMITIVES_SUCCESS;
}
int TestPrimitivesYCoCg(int argc, char* argv[])
{
WINPR_UNUSED(argc);
WINPR_UNUSED(argv);
prim_test_setup(FALSE);
/* Random resolution tests */
if (argc < 2)
{
for (UINT32 x = 0; x < 10; x++)
{
UINT32 w = 0;
UINT32 h = 0;
do
{
if (winpr_RAND(&w, sizeof(w)) < 0)
return -1;
w %= 2048 / 4;
} while (w < 16);
do
{
if (winpr_RAND(&h, sizeof(h)) < 0)
return -1;
h %= 2048 / 4;
} while (h < 16);
if (!test_YCoCgRToRGB_8u_AC4R_func(w, h))
return 1;
}
}
/* Test once with full HD/4 */
if (!test_YCoCgRToRGB_8u_AC4R_func(1920 / 4, 1080 / 4))
return 1;
return 0;
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,138 @@
/* measure.h
* Macros to help with performance measurement.
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*
* MEASURE_LOOP_START("measurement", 2000)
* code to be measured
* MEASURE_LOOP_STOP
* buffer flush and such
* MEASURE_SHOW_RESULTS
*
* Define GOOGLE_PROFILER if you want gperftools included.
*/
#ifndef TEST_MEASURE_H_INCLUDED
#define TEST_MEASURE_H_INCLUDED
#include <freerdp/config.h>
#include <time.h>
#include <winpr/string.h>
#include <winpr/sysinfo.h>
#ifndef _WIN32
#include <sys/param.h>
#endif
#include <winpr/crt.h>
#ifdef _WIN32
/* With _WIN32 defined all profiling and measurement macros expand to nothing. */
#define PROFILER_START(_prefix_)
#define PROFILER_STOP
#define MEASURE_LOOP_START(_prefix_, _count_)
#define MEASURE_LOOP_STOP
#define MEASURE_GET_RESULTS(_result_)
#define MEASURE_SHOW_RESULTS(_result_)
#define MEASURE_SHOW_RESULTS_SCALED(_scale_, _label_)
#define MEASURE_TIMED(_label_, _init_iter_, _test_time_, _result_, _call_)
#else
#ifdef GOOGLE_PROFILER
#include <gperftools/profiler.h>
/* Start the gperftools profiler; the profile is written to "./<_prefix_>.prof".
 * NOTE(review): the expansion already ends with ';' after 'while (0)'. */
#define PROFILER_START(_prefix_) \
do \
{ \
char _path[PATH_MAX]; \
sprintf_s(_path, sizeof(_path), "./%s.prof", (_prefix_)); \
ProfilerStart(_path); \
} while (0);
/* Stop the profiler started by PROFILER_START. */
#define PROFILER_STOP \
do \
{ \
ProfilerStop(); \
} while (0);
#else
/* Without GOOGLE_PROFILER the profiler hooks expand to nothing. */
#define PROFILER_START(_prefix_)
#define PROFILER_STOP
#endif // GOOGLE_PROFILER
/* Helpers used by the MEASURE_* macros below. */
/* Elapsed time between two tick values; the macros pass winpr_GetTickCount64NS() results. */
extern float measure_delta_time(UINT64 t0, UINT64 t1);
/* Format t as text into output, which has room for len bytes. */
extern void measure_floatprint(float t, char* output, size_t len);
/* Open a measurement scope: copies the label, records the start tick, starts
 * the profiler and begins a do-loop that is executed _count_ times.
 * The scope declares _count, _loop, str1, str2, _prefix and start; it is left
 * open and must be completed by MEASURE_LOOP_STOP followed by one of
 * MEASURE_GET_RESULTS / MEASURE_SHOW_RESULTS / MEASURE_SHOW_RESULTS_SCALED. */
#define MEASURE_LOOP_START(_prefix_, _count_) \
{ \
int _count = (_count_); \
int _loop; \
char str1[32] = WINPR_C_ARRAY_INIT; \
char str2[32] = WINPR_C_ARRAY_INIT; \
char* _prefix = _strdup(_prefix_); \
const UINT64 start = winpr_GetTickCount64NS(); \
PROFILER_START(_prefix); \
_loop = (_count); \
do \
{
/* Close the do-loop opened by MEASURE_LOOP_START; the body repeats until the
 * pre-decremented _loop counter reaches zero. */
#define MEASURE_LOOP_STOP \
} \
while (--_loop) \
;
/* Finish a measurement without printing: stops the profiler, stores
 * iterations per measured time unit (_count / delta) in _result_, frees the
 * label copy and closes the scope opened by MEASURE_LOOP_START. */
#define MEASURE_GET_RESULTS(_result_) \
PROFILER_STOP; \
const UINT64 stop = winpr_GetTickCount64NS(); \
const float delta = measure_delta_time(start, stop); \
(_result_) = (float)_count / delta; \
free(_prefix); \
}
/* Finish a measurement and print it: stops the profiler, stores _count / delta
 * in _result_, prints label, iteration count, elapsed time and rate, frees the
 * label copy and closes the scope opened by MEASURE_LOOP_START.
 * measure_floatprint() takes the buffer size as third argument. */
#define MEASURE_SHOW_RESULTS(_result_) \
PROFILER_STOP; \
const UINT64 stop = winpr_GetTickCount64NS(); \
const float delta = measure_delta_time(start, stop); \
(_result_) = (float)_count / delta; \
measure_floatprint((float)_count / delta, str1, sizeof(str1)); \
printf("%s: %9d iterations in %5.1f seconds = %s/s \n", _prefix, _count, delta, str1); \
free(_prefix); \
}
/* Like MEASURE_SHOW_RESULTS, but stores no result and additionally prints the
 * rate multiplied by _scale_, followed by _label_.
 * measure_floatprint() takes the buffer size as third argument. */
#define MEASURE_SHOW_RESULTS_SCALED(_scale_, _label_) \
PROFILER_STOP; \
const UINT64 stop = winpr_GetTickCount64NS(); \
const float delta = measure_delta_time(start, stop); \
measure_floatprint((float)_count / delta, str1, sizeof(str1)); \
measure_floatprint((float)_count / delta * (_scale_), str2, sizeof(str2)); \
printf("%s: %9d iterations in %5.1f seconds = %s/s = %s%s \n", _prefix, _count, delta, str1, \
str2, _label_); \
free(_prefix); \
}
/* Two pass measurement of _call_: a first pass with _init_iter_ iterations
 * determines the rate _r, the second pass then runs _r * _test_time_
 * iterations and prints and stores its rate in _result_.
 * NOTE(review): _test_time_ is expanded without parentheses and the float
 * product is converted to the int iteration count of MEASURE_LOOP_START. */
#define MEASURE_TIMED(_label_, _init_iter_, _test_time_, _result_, _call_) \
{ \
float _r; \
MEASURE_LOOP_START(_label_, _init_iter_); \
_call_; \
MEASURE_LOOP_STOP; \
MEASURE_GET_RESULTS(_r); \
MEASURE_LOOP_START(_label_, _r* _test_time_); \
_call_; \
MEASURE_LOOP_STOP; \
MEASURE_SHOW_RESULTS(_result_); \
}
#endif
#endif // TEST_MEASURE_H_INCLUDED

View File

@@ -0,0 +1,94 @@
/* prim_test.c
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include "prim_test.h"
#ifndef _WIN32
#include <fcntl.h>
#include <math.h>
#include <sys/types.h>
#include <sys/stat.h>
#endif
#include <winpr/sysinfo.h>
#include <winpr/platform.h>
#include <winpr/crypto.h>
/* Shared state of the primitives tests */
/* Generic implementation, assigned by prim_test_setup() */
primitives_t* generic = nullptr;
/* Implementation returned by primitives_get(), assigned by prim_test_setup() */
primitives_t* optimized = nullptr;
/* Set by prim_test_setup(); the tests run their speed tests only when TRUE */
BOOL g_TestPrimitivesPerformance = FALSE;
/* Iteration count the tests pass to speed_test() */
UINT32 g_Iterations = 1000;
int test_sizes[] = { 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096 };
/* ------------------------------------------------------------------------- */
/**
 * Convert the difference t1 - t0 from units of 1e-9 to units of 1
 * (nanoseconds to seconds for winpr_GetTickCount64NS() values).
 *
 * @return the scaled difference, or 0 if t1 lies before t0
 */
float measure_delta_time(UINT64 t0, UINT64 t1)
{
	const INT64 elapsed = (INT64)(t1 - t0);
	const double seconds = (double)elapsed / 1000000000.0;

	if (seconds < 0.0)
		return 0.0f;
	return (float)seconds;
}
/* ------------------------------------------------------------------------- */
/**
 * Format a value with three significant digits and thousands separators,
 * e.g. 1234567.0f is written as "  1,230,000".
 *
 * Formatting rules:
 *  - negative values, NaN and values >= 1e12 are written verbatim with "%f"
 *  - values for which the three-digit rounding yields 0 (roughly t <= 100)
 *    are written as the nearest integer
 *  - everything else is rounded to three significant digits and grouped
 *
 * @param t      value to format
 * @param output destination buffer; nothing is written if it is NULL
 * @param len    size of the destination buffer in bytes; nothing is written
 *               if it is 0
 */
void measure_floatprint(float t, char* output, size_t len)
{
    /* I don't want to link against -lm, so avoid log,exp,... */
    float f = 10.0f;
    long long i = 0;

    if (!output || (len == 0))
        return;

    /* Handle everything that cannot be grouped before doing any
     * float -> integer conversion: the negated comparisons also catch NaN,
     * and out-of-range conversions would be undefined behaviour. */
    if (!(t >= 0.0f) || !(t < 1e+12f))
    {
        (void)_snprintf(output, len, "%f", t);
        return;
    }

    /* Find the smallest power of ten that is >= t, then scale it down so
     * that t / f keeps three digits in front of the decimal point. */
    while (t > f)
        f *= 10.0f;
    f /= 1000.0f;

    /* 64-bit arithmetic: the rounded value can reach 1e12, which does not
     * fit into an int. The individual groups printed below are all < 1000
     * (or at most 1000 in the un-grouped case) and therefore fit into int. */
    i = (long long)(t / f + 0.5f) * (long long)f;

    if (i == 0)
        (void)_snprintf(output, len, "%d", (int)(t + 0.5f));
    else if (t < 1e+3f)
        (void)_snprintf(output, len, "%3d", (int)i);
    else if (t < 1e+6f)
        (void)_snprintf(output, len, "%3d,%03d", (int)(i / 1000), (int)(i % 1000));
    else if (t < 1e+9f)
        (void)_snprintf(output, len, "%3d,%03d,000", (int)(i / 1000000),
                        (int)((i % 1000000) / 1000));
    else
        (void)_snprintf(output, len, "%3d,%03d,000,000", (int)(i / 1000000000),
                        (int)((i % 1000000000) / 1000000));
}
/**
 * Initialise the shared test state: remember whether performance testing
 * was requested and fetch the generic and the default primitives tables.
 *
 * @param performance value stored in g_TestPrimitivesPerformance
 */
void prim_test_setup(BOOL performance)
{
    g_TestPrimitivesPerformance = performance;

    generic = primitives_get_generic();
    optimized = primitives_get();
}
/**
 * Validate the arguments of a speed comparison between two implementations.
 *
 * The iteration loop is currently an empty placeholder: neither callback is
 * invoked, and `dsc` as well as the variadic arguments are not used.
 *
 * @param name       test name, must not be NULL
 * @param dsc        description (currently unused)
 * @param iterations number of iterations, must be non-zero
 * @param generic    generic implementation, must not be NULL
 * @param optimized  optimized implementation, must not be NULL
 * @return TRUE if all mandatory arguments are valid, FALSE otherwise
 */
BOOL speed_test(const char* name, const char* dsc, UINT32 iterations, speed_test_fkt generic,
                speed_test_fkt optimized, ...)
{
    if (!name)
        return FALSE;
    if (!generic || !optimized)
        return FALSE;
    if (iterations == 0)
        return FALSE;

    for (UINT32 run = 0; run < iterations; ++run)
    {
        /* intentionally empty */
    }

    return TRUE;
}

View File

@@ -0,0 +1,48 @@
/* prim_test.h
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*/
#ifndef FREERDP_LIB_PRIMTEST_H
#define FREERDP_LIB_PRIMTEST_H
#include <winpr/crt.h>
#include <winpr/spec.h>
#include <winpr/wtypes.h>
#include <winpr/platform.h>
#include <winpr/crypto.h>
#include <freerdp/primitives.h>
#include "measure.h"
/* Absolute value. The argument is evaluated more than once, so it must not
 * have side effects. */
#define ABS(_x_) ((_x_) < 0 ? (-(_x_)) : (_x_))
/* Largest entry of test_sizes[] (see prim_test.c). */
#define MAX_TEST_SIZE 4096
extern int test_sizes[];
/* Number of entries in test_sizes[]; must match its initializer in
 * prim_test.c. */
#define NUM_TEST_SIZES 10
/* Shared test state, defined in prim_test.c; generic, optimized and
 * g_TestPrimitivesPerformance are assigned by prim_test_setup(). */
extern BOOL g_TestPrimitivesPerformance;
extern UINT32 g_Iterations;
extern primitives_t* generic;
extern primitives_t* optimized;
/* Fetches the generic and default primitives tables and stores the
 * `performance` flag in g_TestPrimitivesPerformance. */
void prim_test_setup(BOOL performance);
/* Pointer to a function under test that returns a pstatus_t.
 * NOTE(review): the parameter list is left empty -- before C23 this means
 * "unspecified parameters", in C23 it means "no parameters"; confirm which
 * is intended. */
typedef pstatus_t (*speed_test_fkt)();
/* Returns FALSE if name, generic or optimized is NULL or iterations is 0,
 * TRUE otherwise. */
BOOL speed_test(const char* name, const char* dsc, UINT32 iterations, speed_test_fkt generic,
speed_test_fkt optimized, ...);
#endif /* FREERDP_LIB_PRIMTEST_H */