Milestone 5: deliver embedded RDP sessions and lifecycle hardening
This commit is contained in:
104
third_party/FreeRDP/libfreerdp/primitives/CMakeLists.txt
vendored
Normal file
104
third_party/FreeRDP/libfreerdp/primitives/CMakeLists.txt
vendored
Normal file
@@ -0,0 +1,104 @@
|
||||
# primitives
|
||||
|
||||
# Generic C implementations plus their private headers; these are always built.
set(PRIMITIVES_SRCS
    prim_add.c prim_add.h
    prim_andor.c prim_andor.h
    prim_alphaComp.c prim_alphaComp.h
    prim_colors.c prim_colors.h
    prim_copy.c prim_copy.h
    prim_set.c prim_set.h
    prim_shift.c prim_shift.h
    prim_sign.c prim_sign.h
    prim_YUV.c prim_YUV.h
    prim_YCoCg.c prim_YCoCg.h
    primitives.c
    prim_internal.h
)

# Optimized sources, one list per instruction set so that each list can be
# given its own compiler flags further down.
set(PRIMITIVES_SSE3_SRCS
    sse/prim_avxsse.h
    sse/prim_templates.h
    sse/prim_colors_sse2.c
    sse/prim_set_sse2.c
    sse/prim_add_sse3.c
    sse/prim_alphaComp_sse3.c
    sse/prim_andor_sse3.c
    sse/prim_shift_sse3.c
)

set(PRIMITIVES_SSSE3_SRCS
    sse/prim_sign_ssse3.c
    sse/prim_YCoCg_ssse3.c
)

set(PRIMITIVES_SSE4_1_SRCS
    sse/prim_copy_sse4_1.c
    sse/prim_YUV_sse4.1.c
)

# No SSE4.2 specific sources exist; the list is intentionally left without a value.
set(PRIMITIVES_SSE4_2_SRCS)

set(PRIMITIVES_AVX2_SRCS
    sse/prim_copy_avx2.c
)

set(PRIMITIVES_NEON_SRCS
    neon/prim_colors_neon.c
    neon/prim_YCoCg_neon.c
    neon/prim_YUV_neon.c
)

set(PRIMITIVES_OPENCL_SRCS
    opencl/prim_YUV_opencl.c
)
|
||||
|
||||
# Optional OpenCL backend: embeds the kernel source (opencl/primitives.cl) into
# a generated header and registers the OpenCL include/link requirements.
if(WITH_OPENCL)
  include(WarnUnmaintained)
  warn_unmaintained("OpenCL support for primitives" "-DWITH_OPENCL=OFF")

  # The kernel file is listed with the sources but marked header-only so it is
  # not handed to the C compiler.
  set(FILENAME "opencl/primitives.cl")
  set_source_files_properties("${FILENAME}" PROPERTIES HEADER_FILE_ONLY ON)

  # FILEDATA receives the hex-array form of the kernel file.
  include(ConvertFileToHexArray)
  file_to_hex_array("${FILENAME}" FILEDATA)

  # Generate the header in the binary tree from the primitives.h.in template.
  set(HDR_FILE "${CMAKE_CURRENT_BINARY_DIR}/opencl/primitives-opencl-program.h")
  cleaning_configure_file("${CMAKE_CURRENT_SOURCE_DIR}/opencl/primitives.h.in" "${HDR_FILE}" @ONLY)

  # Kernel first, generated header second (same order as two separate appends).
  list(APPEND PRIMITIVES_OPENCL_SRCS "${FILENAME}" "${HDR_FILE}")

  include_directories("${CMAKE_CURRENT_BINARY_DIR}/opencl")
  freerdp_include_directory_add(${OpenCL_INCLUDE_DIRS})
  freerdp_library_add(OpenCL::OpenCL)
  freerdp_pc_add_requires_private("OpenCL")
endif()
|
||||
|
||||
# Gather every optimized source list into one. AVX2 sources are only added when
# WITH_AVX2 is enabled.
set(PRIMITIVES_OPT_SRCS
    ${PRIMITIVES_NEON_SRCS}
    ${PRIMITIVES_SSE3_SRCS}
    ${PRIMITIVES_SSSE3_SRCS}
    ${PRIMITIVES_SSE4_1_SRCS}
    ${PRIMITIVES_SSE4_2_SRCS}
    ${PRIMITIVES_OPENCL_SRCS}
)

if(WITH_AVX2)
  list(APPEND PRIMITIVES_OPT_SRCS ${PRIMITIVES_AVX2_SRCS})
endif()

# Generic sources first, optimized sources after them.
list(APPEND PRIMITIVES_SRCS ${PRIMITIVES_OPT_SRCS})

include_directories("${CMAKE_CURRENT_SOURCE_DIR}")
add_library(freerdp-primitives OBJECT ${PRIMITIVES_SRCS})
|
||||
|
||||
include(CompilerDetect)
include(DetectIntrinsicSupport)

# Attach instruction-set specific compiler flags to the sources written for
# that instruction set. Every call receives only its own source list.
if(WITH_SIMD)
  set_simd_source_file_properties("sse3" ${PRIMITIVES_SSE3_SRCS})
  set_simd_source_file_properties("ssse3" ${PRIMITIVES_SSSE3_SRCS})
  set_simd_source_file_properties("sse4.1" ${PRIMITIVES_SSE4_1_SRCS})
  set_simd_source_file_properties("sse4.2" ${PRIMITIVES_SSE4_2_SRCS})
  set_simd_source_file_properties("avx2" ${PRIMITIVES_AVX2_SRCS})
  # NEON flags go on the NEON sources only. Handing the combined
  # PRIMITIVES_OPT_SRCS list to this call would also tag the SSE and OpenCL
  # files, which have their own flags (or need none).
  # NOTE(review): assumes set_simd_source_file_properties applies its flags to
  # exactly the files it is given - confirm against DetectIntrinsicSupport.
  set_simd_source_file_properties("neon" ${PRIMITIVES_NEON_SRCS})
endif()

freerdp_object_library_add(freerdp-primitives)

# Optional sub-projects.
if(BUILD_BENCHMARK)
  add_subdirectory(benchmark)
endif()

if(BUILD_TESTING_INTERNAL)
  add_subdirectory(test)
endif()
|
||||
101
third_party/FreeRDP/libfreerdp/primitives/README.txt
vendored
Normal file
101
third_party/FreeRDP/libfreerdp/primitives/README.txt
vendored
Normal file
@@ -0,0 +1,101 @@
|
||||
The Primitives Library
|
||||
|
||||
Introduction
|
||||
------------
|
||||
The purpose of the primitives library is to give the freerdp code easy
|
||||
access to *run-time* optimization via SIMD operations. When the library
|
||||
is initialized, dynamic checks of processor features are run (such as
|
||||
the support of SSE3 or Neon), and entrypoints are linked to through
|
||||
function pointers to provide the fastest possible operations. All
|
||||
routines offer generic C alternatives as fallbacks.
|
||||
|
||||
Run-time optimization has the advantage of allowing a single executable
|
||||
to run fast on multiple platforms with different SIMD capabilities.
|
||||
|
||||
|
||||
Use In Code
|
||||
-----------
|
||||
A singleton pointing to a structure containing the function pointers
|
||||
is accessed through primitives_get(). The function pointers can then
|
||||
be used from that structure, e.g.
|
||||
|
||||
primitives_t *prims = primitives_get();
|
||||
prims->shiftC_16s(buffer, shifts, buffer, 256);
|
||||
|
||||
Of course, there is some overhead in calling through the function pointer
|
||||
and setting up the SIMD operations, so it would be counterproductive to
|
||||
call the primitives library for very small operations, e.g. initializing an
|
||||
array of eight values to a constant. The primitives library is intended
|
||||
for larger-scale operations, e.g. arrays of size 64 and larger.
|
||||
|
||||
|
||||
Initialization and Cleanup
|
||||
--------------------------
|
||||
Library initialization is done the first time primitives_init() is called
|
||||
or the first time primitives_get() is used. Cleanup (if any) is done by
|
||||
primitives_deinit().
|
||||
|
||||
|
||||
Intel Integrated Performance Primitives (IPP)
|
||||
---------------------------------------------
|
||||
If freerdp is compiled with IPP support (-DWITH_IPP=ON), the IPP function
|
||||
calls will be used (where available) to fill the function pointers.
|
||||
Where possible, function names and parameter lists match IPP format so
|
||||
that the IPP functions can be plugged into the function pointers without
|
||||
a wrapper layer. Use of IPP is completely optional, and in many cases
|
||||
the SSE operations in the primitives library itself are faster or similar
|
||||
in performance.
|
||||
|
||||
|
||||
Coverage
|
||||
--------
|
||||
The primitives library is not meant to be comprehensive, offering
|
||||
entrypoints for every operation and operand type. Instead, the coverage
|
||||
is focused on operations known to be performance bottlenecks in the code.
|
||||
For instance, 16-bit signed operations are used widely in the RemoteFX
|
||||
software, so you'll find 16s versions of several operations, but there
|
||||
is no attempt to provide (unused) copies of the same code for 8u, 16u,
|
||||
32s, etc.
|
||||
|
||||
|
||||
New Optimizations
|
||||
-----------------
|
||||
As the need arises, new optimizations can be added to the library,
|
||||
including NEON, AVX, and perhaps OpenCL or other SIMD implementations.
|
||||
The CPU feature detection is done in winpr/sysinfo.
|
||||
|
||||
|
||||
Adding Entrypoints
|
||||
------------------
|
||||
As the need for new operations or operands arises, new entrypoints can
|
||||
be added.
|
||||
1) Function prototypes and pointers are added to
|
||||
include/freerdp/primitives.h
|
||||
2) New module initialization and cleanup function prototypes are added
|
||||
to prim_internal.h and called in primitives.c (primitives_init()
|
||||
and primitives_deinit()).
|
||||
3) Operation names and parameter lists should be compatible with the IPP.
|
||||
IPP manuals are available online at software.intel.com.
|
||||
4) A generic C entrypoint must be available as a fallback.
|
||||
5) prim_templates.h contains macro-based templates for simple operations,
|
||||
such as applying a single SSE operation to arrays of data.
|
||||
The template functions can frequently be used to extend the
|
||||
operations without writing a lot of new code.
|
||||
|
||||
Cache Management
|
||||
----------------
|
||||
I haven't found a lot of speed improvement by attempting prefetch, and
|
||||
in fact it seems to have a negative impact in some cases.  Done correctly,
|
||||
perhaps the routines could be further accelerated by proper use of prefetch,
|
||||
fences, etc.
|
||||
|
||||
|
||||
Testing
|
||||
-------
|
||||
In the test subdirectory is an executable (prim_test) that tests both
|
||||
functionality and speed of primitives library operations. Any new
|
||||
modules should be added to that test, following the conventions already
|
||||
established in that directory. The program can be executed on various
|
||||
target hardware to compare generic C, optimized, and IPP performance
|
||||
with various array sizes.
|
||||
|
||||
20
third_party/FreeRDP/libfreerdp/primitives/benchmark/CMakeLists.txt
vendored
Normal file
20
third_party/FreeRDP/libfreerdp/primitives/benchmark/CMakeLists.txt
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
# FreeRDP: A Remote Desktop Protocol Implementation
|
||||
# FreeRDP cmake build script
|
||||
#
|
||||
# Copyright 2025 Armin Novak <anovak@thincast.com>
|
||||
# Copyright 2025 Thincast Technologies GmbH
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Stand-alone benchmark executable for the primitives, built from benchmark.c
# and linked privately against winpr and freerdp.
add_executable(
  primitives-benchmark
  benchmark.c
)

target_link_libraries(
  primitives-benchmark
  PRIVATE
    winpr
    freerdp
)
|
||||
254
third_party/FreeRDP/libfreerdp/primitives/benchmark/benchmark.c
vendored
Normal file
254
third_party/FreeRDP/libfreerdp/primitives/benchmark/benchmark.c
vendored
Normal file
@@ -0,0 +1,254 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* primitives benchmarking tool
|
||||
*
|
||||
* Copyright 2025 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2025 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include <winpr/crypto.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
typedef struct
|
||||
{
|
||||
BYTE* channels[3];
|
||||
UINT32 steps[3];
|
||||
prim_size_t roi;
|
||||
BYTE* outputBuffer;
|
||||
BYTE* outputChannels[3];
|
||||
BYTE* rgbBuffer;
|
||||
UINT32 outputStride;
|
||||
UINT32 testedFormat;
|
||||
} primitives_YUV_benchmark;
|
||||
|
||||
/**
 * Release every buffer owned by the benchmark state and reset the structure
 * to all-zero so it can be freed again safely.
 *
 * @param bench benchmark state to release; a null pointer is ignored
 */
static void primitives_YUV_benchmark_free(primitives_YUV_benchmark* bench)
{
	/* zeroed template copied over the caller's struct once everything is freed */
	const primitives_YUV_benchmark cleared = WINPR_C_ARRAY_INIT;

	if (!bench)
		return;

	for (size_t plane = 0; plane < 3; plane++)
	{
		free(bench->channels[plane]);
		free(bench->outputChannels[plane]);
	}

	free(bench->rgbBuffer);
	free(bench->outputBuffer);

	*bench = cleared;
}
|
||||
|
||||
static primitives_YUV_benchmark primitives_YUV_benchmark_init(void)
|
||||
{
|
||||
primitives_YUV_benchmark ret = WINPR_C_ARRAY_INIT;
|
||||
ret.roi.width = 3840 * 4;
|
||||
ret.roi.height = 2160 * 4;
|
||||
ret.outputStride = ret.roi.width * 4;
|
||||
ret.testedFormat = PIXEL_FORMAT_BGRA32;
|
||||
|
||||
ret.outputBuffer = calloc(ret.outputStride, ret.roi.height);
|
||||
if (!ret.outputBuffer)
|
||||
goto fail;
|
||||
ret.rgbBuffer = calloc(ret.outputStride, ret.roi.height);
|
||||
if (!ret.rgbBuffer)
|
||||
goto fail;
|
||||
if (winpr_RAND(ret.rgbBuffer, 1ULL * ret.outputStride * ret.roi.height) < 0)
|
||||
goto fail;
|
||||
|
||||
for (size_t i = 0; i < 3; i++)
|
||||
{
|
||||
ret.channels[i] = calloc(ret.roi.width, ret.roi.height);
|
||||
ret.outputChannels[i] = calloc(ret.roi.width, ret.roi.height);
|
||||
if (!ret.channels[i] || !ret.outputChannels[i])
|
||||
goto fail;
|
||||
|
||||
if (winpr_RAND(ret.channels[i], 1ull * ret.roi.width * ret.roi.height) < 0)
|
||||
goto fail;
|
||||
ret.steps[i] = ret.roi.width;
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
||||
fail:
|
||||
primitives_YUV_benchmark_free(&ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
 * Format a value as four dot separated groups: the part above 10^9 followed
 * by three zero padded groups of three decimal digits each.
 *
 * @param t      value to format (callers in this file pass a nanosecond difference)
 * @param buffer destination buffer
 * @param size   size of the destination buffer in bytes
 * @return buffer, so the call can be used directly as a printf argument
 */
static const char* print_time(UINT64 t, char* buffer, size_t size)
{
	const unsigned units = (unsigned)((t) % 1000);
	const unsigned thousands = (unsigned)((t / 1000ull) % 1000);
	const unsigned millions = (unsigned)((t / 1000000ull) % 1000);
	const unsigned billions = (unsigned)(t / 1000000000ull);

	(void)_snprintf(buffer, size, "%u.%03u.%03u.%03u", billions, millions, thousands, units);
	return buffer;
}
|
||||
|
||||
/**
 * Run the YUV420ToRGB_8u_P3AC4R primitive ten times on the benchmark planes
 * and print the duration of every run.
 *
 * @param bench initialized benchmark state
 * @param prims primitives implementation to measure
 * @return TRUE if every run reported PRIMITIVES_SUCCESS, FALSE on the first failure
 */
static BOOL primitives_YUV420_benchmark_run(primitives_YUV_benchmark* bench, primitives_t* prims)
{
	/* the primitive takes pointers to const planes, so build a const view of the inputs */
	const BYTE* planes[3] = { bench->channels[0], bench->channels[1], bench->channels[2] };

	for (size_t run = 0; run < 10; run++)
	{
		char timestr[32] = WINPR_C_ARRAY_INIT;

		/* only the conversion itself sits between the two clock reads */
		const UINT64 begin = winpr_GetTickCount64NS();
		const pstatus_t rc =
		    prims->YUV420ToRGB_8u_P3AC4R(planes, bench->steps, bench->outputBuffer,
		                                 bench->outputStride, bench->testedFormat, &bench->roi);
		const UINT64 finish = winpr_GetTickCount64NS();

		if (rc != PRIMITIVES_SUCCESS)
		{
			(void)fprintf(stderr, "Running YUV420ToRGB_8u_P3AC4R failed\n");
			return FALSE;
		}

		printf("[%" PRIuz "] YUV420ToRGB_8u_P3AC4R %" PRIu32 "x%" PRIu32 " took %sns\n", run,
		       bench->roi.width, bench->roi.height,
		       print_time(finish - begin, timestr, sizeof(timestr)));
	}

	return TRUE;
}
|
||||
|
||||
/**
 * Run the YUV444ToRGB_8u_P3AC4R primitive ten times on the benchmark planes
 * and print the duration of every run.
 *
 * @param bench initialized benchmark state
 * @param prims primitives implementation to measure
 * @return TRUE if every run reported PRIMITIVES_SUCCESS, FALSE on the first failure
 */
static BOOL primitives_YUV444_benchmark_run(primitives_YUV_benchmark* bench, primitives_t* prims)
{
	/* the primitive takes pointers to const planes, so build a const view of the inputs */
	const BYTE* planes[3] = { bench->channels[0], bench->channels[1], bench->channels[2] };

	for (size_t run = 0; run < 10; run++)
	{
		char timestr[32] = WINPR_C_ARRAY_INIT;

		/* only the conversion itself sits between the two clock reads */
		const UINT64 begin = winpr_GetTickCount64NS();
		const pstatus_t rc =
		    prims->YUV444ToRGB_8u_P3AC4R(planes, bench->steps, bench->outputBuffer,
		                                 bench->outputStride, bench->testedFormat, &bench->roi);
		const UINT64 finish = winpr_GetTickCount64NS();

		if (rc != PRIMITIVES_SUCCESS)
		{
			(void)fprintf(stderr, "Running YUV444ToRGB_8u_P3AC4R failed\n");
			return FALSE;
		}

		printf("[%" PRIuz "] YUV444ToRGB_8u_P3AC4R %" PRIu32 "x%" PRIu32 " took %sns\n", run,
		       bench->roi.width, bench->roi.height,
		       print_time(finish - begin, timestr, sizeof(timestr)));
	}

	return TRUE;
}
|
||||
|
||||
/**
 * Run the RGBToYUV420_8u_P3AC4R primitive ten times on the packed benchmark
 * buffer and print the duration of every run.
 *
 * @param bench initialized benchmark state
 * @param prims primitives implementation to measure
 * @return TRUE if every run reported PRIMITIVES_SUCCESS, FALSE on the first failure
 */
static BOOL primitives_RGB2420_benchmark_run(primitives_YUV_benchmark* bench, primitives_t* prims)
{
	for (size_t run = 0; run < 10; run++)
	{
		char timestr[32] = WINPR_C_ARRAY_INIT;

		/* only the conversion itself sits between the two clock reads */
		const UINT64 begin = winpr_GetTickCount64NS();
		const pstatus_t rc =
		    prims->RGBToYUV420_8u_P3AC4R(bench->rgbBuffer, bench->testedFormat, bench->outputStride,
		                                 bench->outputChannels, bench->steps, &bench->roi);
		const UINT64 finish = winpr_GetTickCount64NS();

		if (rc != PRIMITIVES_SUCCESS)
		{
			(void)fprintf(stderr, "Running RGBToYUV420_8u_P3AC4R failed\n");
			return FALSE;
		}

		printf("[%" PRIuz "] RGBToYUV420_8u_P3AC4R %" PRIu32 "x%" PRIu32 " took %sns\n", run,
		       bench->roi.width, bench->roi.height,
		       print_time(finish - begin, timestr, sizeof(timestr)));
	}

	return TRUE;
}
|
||||
|
||||
/**
 * Run the RGBToYUV444_8u_P3AC4R primitive ten times on the packed benchmark
 * buffer and print the duration of every run.
 *
 * @param bench initialized benchmark state
 * @param prims primitives implementation to measure
 * @return TRUE if every run reported PRIMITIVES_SUCCESS, FALSE on the first failure
 */
static BOOL primitives_RGB2444_benchmark_run(primitives_YUV_benchmark* bench, primitives_t* prims)
{
	for (size_t run = 0; run < 10; run++)
	{
		char timestr[32] = WINPR_C_ARRAY_INIT;

		/* only the conversion itself sits between the two clock reads */
		const UINT64 begin = winpr_GetTickCount64NS();
		const pstatus_t rc =
		    prims->RGBToYUV444_8u_P3AC4R(bench->rgbBuffer, bench->testedFormat, bench->outputStride,
		                                 bench->outputChannels, bench->steps, &bench->roi);
		const UINT64 finish = winpr_GetTickCount64NS();

		if (rc != PRIMITIVES_SUCCESS)
		{
			(void)fprintf(stderr, "Running RGBToYUV444_8u_P3AC4R failed\n");
			return FALSE;
		}

		printf("[%" PRIuz "] RGBToYUV444_8u_P3AC4R %" PRIu32 "x%" PRIu32 " took %sns\n", run,
		       bench->roi.width, bench->roi.height,
		       print_time(finish - begin, timestr, sizeof(timestr)));
	}

	return TRUE;
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
WINPR_UNUSED(argc);
|
||||
WINPR_UNUSED(argv);
|
||||
primitives_YUV_benchmark bench = primitives_YUV_benchmark_init();
|
||||
|
||||
for (primitive_hints hint = PRIMITIVES_PURE_SOFT; hint < PRIMITIVES_AUTODETECT; hint++)
|
||||
{
|
||||
const char* hintstr = primtives_hint_str(hint);
|
||||
primitives_t* prim = primitives_get_by_type(hint);
|
||||
if (!prim)
|
||||
{
|
||||
(void)fprintf(stderr, "failed to get primitives: %s\n", hintstr);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
printf("Running YUV420 -> RGB benchmark on %s implementation:\n", hintstr);
|
||||
if (!primitives_YUV420_benchmark_run(&bench, prim))
|
||||
{
|
||||
(void)fprintf(stderr, "YUV420 -> RGB benchmark failed\n");
|
||||
goto fail;
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("Running RGB -> YUV420 benchmark on %s implementation:\n", hintstr);
|
||||
if (!primitives_RGB2420_benchmark_run(&bench, prim))
|
||||
{
|
||||
(void)fprintf(stderr, "RGB -> YUV420 benchmark failed\n");
|
||||
goto fail;
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("Running YUV444 -> RGB benchmark on %s implementation:\n", hintstr);
|
||||
if (!primitives_YUV444_benchmark_run(&bench, prim))
|
||||
{
|
||||
(void)fprintf(stderr, "YUV444 -> RGB benchmark failed\n");
|
||||
goto fail;
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("Running RGB -> YUV444 benchmark on %s implementation:\n", hintstr);
|
||||
if (!primitives_RGB2444_benchmark_run(&bench, prim))
|
||||
{
|
||||
(void)fprintf(stderr, "RGB -> YUV444 benchmark failed\n");
|
||||
goto fail;
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
fail:
|
||||
primitives_YUV_benchmark_free(&bench);
|
||||
return 0;
|
||||
}
|
||||
168
third_party/FreeRDP/libfreerdp/primitives/neon/prim_YCoCg_neon.c
vendored
Normal file
168
third_party/FreeRDP/libfreerdp/primitives/neon/prim_YCoCg_neon.c
vendored
Normal file
@@ -0,0 +1,168 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Optimized YCoCg<->RGB conversion operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_YCoCg.h"
|
||||
|
||||
#if defined(NEON_INTRINSICS_ENABLED)
|
||||
#include <arm_neon.h>
|
||||
|
||||
static primitives_t* generic = nullptr;
|
||||
|
||||
/**
 * Convert 4 byte per pixel YCoCg data (byte order Cg, Co, Y, alpha) to a
 * 4 byte per pixel RGB layout described by the channel positions.
 *
 * Each row is processed in groups of eight pixels with NEON; the remaining
 * (width % 8) pixels of the row are handled by scalar code.
 *
 * @param pSrc      source pixels
 * @param srcStep   source row stride in bytes
 * @param pDst      destination pixels
 * @param DstFormat destination pixel format, used to query the bytes per pixel
 * @param dstStep   destination row stride in bytes
 * @param width     pixels per row
 * @param height    number of rows
 * @param shift     chroma shift; (shift - 1) is applied to Cg and Co
 * @param bPos      byte index of blue inside a destination pixel
 * @param gPos      byte index of green inside a destination pixel
 * @param rPos      byte index of red inside a destination pixel
 * @param aPos      byte index of alpha inside a destination pixel
 * @param alpha     TRUE copies the source alpha, FALSE writes 0xFF
 * @return PRIMITIVES_SUCCESS
 */
static pstatus_t neon_YCoCgToRGB_8u_X(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
                                      BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
                                      UINT32 width, UINT32 height, UINT8 shift, BYTE bPos,
                                      BYTE gPos, BYTE rPos, BYTE aPos, BOOL alpha)
{
	const DWORD bytesPerPixel = FreeRDPGetBytesPerPixel(DstFormat);
	const int8_t chromaShift = shift - 1; /* -1 builds in the /2's */
	const UINT32 srcSkip = srcStep - (width * 4);
	const UINT32 dstSkip = dstStep - (width * bytesPerPixel);
	const UINT32 tail = width % 8;
	const UINT32 vectorWidth = width - tail;
	const uint8x8_t opaque = vdup_n_u8(0xFF);
	const int8x8_t chromaShiftVec = vdup_n_s8(chromaShift);
	const BYTE* in = pSrc;
	BYTE* out = pDst;

	for (UINT32 row = 0; row < height; row++)
	{
		/* vector part: eight pixels per iteration */
		for (UINT32 col = 0; col < vectorWidth; col += 8)
		{
			/* Note: shifts must be done before sign-conversion. */
			const uint8x8x4_t px = vld4_u8(in);
			const int16x8_t Cg = vmovl_s8(vreinterpret_s8_u8(vshl_u8(px.val[0], chromaShiftVec)));
			const int16x8_t Co = vmovl_s8(vreinterpret_s8_u8(vshl_u8(px.val[1], chromaShiftVec)));
			const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(px.val[2])); /* UINT8 -> INT16 */
			const int16x8_t T = vsubq_s16(Y, Cg);

			uint8x8x4_t result;
			result.val[bPos] = vqmovun_s16(vsubq_s16(T, Co));
			result.val[gPos] = vqmovun_s16(vaddq_s16(Y, Cg));
			result.val[rPos] = vqmovun_s16(vaddq_s16(T, Co));
			result.val[aPos] = alpha ? px.val[3] : opaque;

			vst4_u8(out, result);
			in += sizeof(uint8x8x4_t);
			out += sizeof(uint8x8x4_t);
		}

		/* scalar part: the remaining width % 8 pixels */
		for (UINT32 col = 0; col < tail; col++)
		{
			/* Note: shifts must be done before sign-conversion. */
			const BYTE rawCg = *in++;
			const BYTE rawCo = *in++;
			const BYTE rawY = *in++;
			const BYTE rawA = *in++;

			const INT16 Cg = (INT16)((INT8)(rawCg << chromaShift));
			const INT16 Co = (INT16)((INT8)(rawCo << chromaShift));
			const INT16 Y = (INT16)rawY; /* UINT8->INT16 */
			const INT16 T = Y - Cg;
			const INT16 R = T + Co;
			const INT16 G = Y + Cg;
			const INT16 B = T - Co;

			BYTE pixel[4];
			pixel[bPos] = CLIP(B);
			pixel[gPos] = CLIP(G);
			pixel[rPos] = CLIP(R);
			pixel[aPos] = rawA;

			if (!alpha)
				pixel[aPos] = 0xFF;

			for (size_t i = 0; i < 4; i++)
				*out++ = pixel[i];
		}

		/* skip the stride padding of both buffers */
		in += srcSkip;
		out += dstSkip;
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/**
 * YCoCg to RGB conversion entry point for the NEON implementation.
 *
 * Picks the channel byte positions for the known 32 bit destination formats
 * and hands the work to neon_YCoCgToRGB_8u_X. Any other format is forwarded
 * to the generic implementation.
 *
 * @return the status of the selected implementation
 */
static pstatus_t neon_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
                                         BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
                                         UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha)
{
	BYTE bPos = 0;
	BYTE gPos = 0;
	BYTE rPos = 0;
	BYTE aPos = 0;

	/* alpha and padding variants of a format share the same byte positions */
	switch (DstFormat)
	{
		case PIXEL_FORMAT_BGRA32:
		case PIXEL_FORMAT_BGRX32:
			bPos = 2;
			gPos = 1;
			rPos = 0;
			aPos = 3;
			break;

		case PIXEL_FORMAT_RGBA32:
		case PIXEL_FORMAT_RGBX32:
			bPos = 0;
			gPos = 1;
			rPos = 2;
			aPos = 3;
			break;

		case PIXEL_FORMAT_ARGB32:
		case PIXEL_FORMAT_XRGB32:
			bPos = 1;
			gPos = 2;
			rPos = 3;
			aPos = 0;
			break;

		case PIXEL_FORMAT_ABGR32:
		case PIXEL_FORMAT_XBGR32:
			bPos = 3;
			gPos = 2;
			rPos = 1;
			aPos = 0;
			break;

		default:
			return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
			                                   height, shift, withAlpha);
	}

	return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, shift,
	                            bPos, gPos, rPos, aPos, withAlpha);
}
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/**
 * Install the NEON YCoCg conversion into the primitives table.
 *
 * When NEON_INTRINSICS_ENABLED is defined, the generic primitives are
 * remembered as fallback and YCoCgToRGB_8u_AC4R is replaced by the NEON
 * version. Otherwise the table is left untouched and only a verbose log
 * entry is written.
 *
 * @param prims primitives table to update
 */
void primitives_init_YCoCg_neon_int(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(NEON_INTRINSICS_ENABLED)
	WINPR_UNUSED(prims);
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or neon intrinsics not available");
#else
	/* fallback used by the NEON entry point for formats it does not handle */
	generic = primitives_get_generic();

	prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R;
	WLog_VRB(PRIM_TAG, "NEON optimizations");
#endif
}
|
||||
837
third_party/FreeRDP/libfreerdp/primitives/neon/prim_YUV_neon.c
vendored
Normal file
837
third_party/FreeRDP/libfreerdp/primitives/neon/prim_YUV_neon.c
vendored
Normal file
@@ -0,0 +1,837 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* Optimized YUV/RGB conversion operations
|
||||
*
|
||||
* Copyright 2014 Thomas Erbesdobler
|
||||
* Copyright 2016-2017 Armin Novak <armin.novak@thincast.com>
|
||||
* Copyright 2016-2017 Norbert Federa <norbert.federa@thincast.com>
|
||||
* Copyright 2016-2017 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <winpr/sysinfo.h>
|
||||
#include <winpr/crt.h>
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_YUV.h"
|
||||
|
||||
#if defined(NEON_INTRINSICS_ENABLED)
|
||||
#include <arm_neon.h>
|
||||
|
||||
static primitives_t* generic = nullptr;
|
||||
|
||||
/**
 * Compute the red channel for eight pixels.
 *
 * @param C luma values already multiplied by 256
 * @param D unused by this channel
 * @param E values scaled by 403 and added to C
 * @return eight saturated 8 bit red values
 */
static inline uint8x8_t neon_YUV2R_single(uint16x8_t C, int16x8_t D, int16x8_t E)
{
	/* R = (256 * Y + 403 * (V - 128)) >> 8 */
	const int32x4_t lumaLo = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(C)));
	const int32x4_t lumaHi = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(C)));

	/* widen to 32 bit while multiplying */
	const int32x4_t chromaLo = vmull_n_s16(vget_low_s16(E), 403);
	const int32x4_t chromaHi = vmull_n_s16(vget_high_s16(E), 403);

	const int32x4_t redLo = vshrq_n_s32(vaddq_s32(lumaLo, chromaLo), 8);
	const int32x4_t redHi = vshrq_n_s32(vaddq_s32(lumaHi, chromaHi), 8);

	/* saturating narrow: 32 -> 16 bit signed, then 16 bit signed -> 8 bit unsigned */
	return vqmovun_s16(vcombine_s16(vqmovn_s32(redLo), vqmovn_s32(redHi)));
}
|
||||
|
||||
/** Compute the red channel for sixteen pixels as two groups of eight. */
static inline uint8x8x2_t neon_YUV2R(uint16x8x2_t C, int16x8x2_t D, int16x8x2_t E)
{
	uint8x8x2_t red;

	red.val[0] = neon_YUV2R_single(C.val[0], D.val[0], E.val[0]);
	red.val[1] = neon_YUV2R_single(C.val[1], D.val[1], E.val[1]);
	return red;
}
|
||||
|
||||
/**
 * Compute the green channel for eight pixels.
 *
 * @param C luma values already multiplied by 256
 * @param D values scaled by 48 and subtracted from C
 * @param E values scaled by 120 and subtracted from C
 * @return eight saturated 8 bit green values
 */
static inline uint8x8_t neon_YUV2G_single(uint16x8_t C, int16x8_t D, int16x8_t E)
{
	/* G = (256L * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8 */
	const int16x8_t uTerm = vmulq_n_s16(D, 48);
	const int16x8_t vTerm = vmulq_n_s16(E, 120);

	/* low four lanes */
	const int32x4_t chromaLo = vaddl_s16(vget_low_s16(uTerm), vget_low_s16(vTerm));
	const int32x4_t lumaLo = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(C)));
	const int16x4_t greenLo = vqmovn_s32(vshrq_n_s32(vsubq_s32(lumaLo, chromaLo), 8));

	/* high four lanes */
	const int32x4_t chromaHi = vaddl_s16(vget_high_s16(uTerm), vget_high_s16(vTerm));
	const int32x4_t lumaHi = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(C)));
	const int16x4_t greenHi = vqmovn_s32(vshrq_n_s32(vsubq_s32(lumaHi, chromaHi), 8));

	/* saturating narrow to unsigned 8 bit */
	return vqmovun_s16(vcombine_s16(greenLo, greenHi));
}
|
||||
|
||||
/** Compute the green channel for sixteen pixels as two groups of eight. */
static inline uint8x8x2_t neon_YUV2G(uint16x8x2_t C, int16x8x2_t D, int16x8x2_t E)
{
	uint8x8x2_t green;

	green.val[0] = neon_YUV2G_single(C.val[0], D.val[0], E.val[0]);
	green.val[1] = neon_YUV2G_single(C.val[1], D.val[1], E.val[1]);
	return green;
}
|
||||
|
||||
/**
 * Compute the blue channel for eight pixels.
 *
 * @param C luma values already multiplied by 256
 * @param D values scaled by 475 and added to C
 * @param E unused by this channel
 * @return eight saturated 8 bit blue values
 */
static inline uint8x8_t neon_YUV2B_single(uint16x8_t C, int16x8_t D, int16x8_t E)
{
	/* B = (256L * Y + 475 * (U - 128)) >> 8*/
	const int32x4_t lumaLo = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(C)));
	const int32x4_t lumaHi = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(C)));

	/* widen to 32 bit while multiplying */
	const int32x4_t chromaLo = vmull_n_s16(vget_low_s16(D), 475);
	const int32x4_t chromaHi = vmull_n_s16(vget_high_s16(D), 475);

	const int32x4_t blueLo = vshrq_n_s32(vaddq_s32(lumaLo, chromaLo), 8);
	const int32x4_t blueHi = vshrq_n_s32(vaddq_s32(lumaHi, chromaHi), 8);

	/* saturating narrow: 32 -> 16 bit signed, then 16 bit signed -> 8 bit unsigned */
	return vqmovun_s16(vcombine_s16(vqmovn_s32(blueLo), vqmovn_s32(blueHi)));
}
|
||||
|
||||
/** Compute the blue channel for sixteen pixels as two groups of eight. */
static inline uint8x8x2_t neon_YUV2B(uint16x8x2_t C, int16x8x2_t D, int16x8x2_t E)
{
	uint8x8x2_t blue;

	blue.val[0] = neon_YUV2B_single(C.val[0], D.val[0], E.val[0]);
	blue.val[1] = neon_YUV2B_single(C.val[1], D.val[1], E.val[1]);
	return blue;
}
|
||||
|
||||
/**
 * Write the red, green and blue channels of eight 4 byte pixels.
 *
 * The eight pixels are loaded first and stored back after the three colour
 * channels were replaced, so the fourth channel keeps whatever the
 * destination already contained. aPos is not used.
 *
 * @param pRGB destination, eight pixels (32 bytes) are read and written
 * @param r    red values
 * @param g    green values
 * @param b    blue values
 * @param rPos channel index of red inside a pixel
 * @param gPos channel index of green inside a pixel
 * @param bPos channel index of blue inside a pixel
 * @param aPos unused
 */
static inline void neon_store_bgrx(BYTE* WINPR_RESTRICT pRGB, uint8x8_t r, uint8x8_t g, uint8x8_t b,
                                   uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
{
	uint8x8x4_t pixels = vld4_u8(pRGB);

	pixels.val[rPos] = r;
	pixels.val[gPos] = g;
	pixels.val[bPos] = b;

	vst4_u8(pRGB, pixels);
}
|
||||
|
||||
/**
 * Convert sixteen pixels and write them to pRGB as 4 byte pixels.
 *
 * @param pRGB destination for sixteen 4 byte pixels
 * @param Y    sixteen luma values as two groups of eight
 * @param D    first chroma input, forwarded to the channel helpers
 * @param E    second chroma input, forwarded to the channel helpers
 * @param rPos channel index of red inside a pixel
 * @param gPos channel index of green inside a pixel
 * @param bPos channel index of blue inside a pixel
 * @param aPos channel index of alpha, forwarded to the store helper
 */
static inline void neon_YuvToRgbPixel(BYTE* pRGB, uint8x8x2_t Y, int16x8x2_t D, int16x8x2_t E,
                                      const uint8_t rPos, const uint8_t gPos, const uint8_t bPos,
                                      const uint8_t aPos)
{
	/* Y * 256 == Y << 8 */
	uint16x8x2_t scaledY;
	scaledY.val[0] = vshlq_n_u16(vmovl_u8(Y.val[0]), 8);
	scaledY.val[1] = vshlq_n_u16(vmovl_u8(Y.val[1]), 8);

	const uint8x8x2_t red = neon_YUV2R(scaledY, D, E);
	const uint8x8x2_t green = neon_YUV2G(scaledY, D, E);
	const uint8x8x2_t blue = neon_YUV2B(scaledY, D, E);

	/* the second group of eight pixels starts one uint8x8x4_t (32 bytes) further */
	BYTE* firstHalf = pRGB;
	BYTE* secondHalf = pRGB + sizeof(uint8x8x4_t);

	neon_store_bgrx(firstHalf, red.val[0], green.val[0], blue.val[0], rPos, gPos, bPos, aPos);
	neon_store_bgrx(secondHalf, red.val[1], green.val[1], blue.val[1], rPos, gPos, bPos, aPos);
}
|
||||
|
||||
/**
 * Load eight chroma samples starting at index x / 2, subtract 128 and
 * duplicate every sample so that two neighbouring pixels share one value.
 *
 * @param pV chroma plane row
 * @param x  pixel column; the plane is read at x / 2
 * @return sixteen signed 16 bit values as two groups of eight
 */
static inline int16x8x2_t loadUV(const BYTE* WINPR_RESTRICT pV, size_t x)
{
	const int16x8_t bias = vdupq_n_s16(128);
	const uint8x8_t samples = vld1_u8(&pV[x / 2]);
	const int16x8_t widened = vreinterpretq_s16_u16(vmovl_u8(samples));
	const int16x8_t centered = vsubq_s16(widened, bias);

	/* zipping the vector with itself yields every sample twice in a row */
	return vzipq_s16(centered, centered);
}
|
||||
|
||||
/**
 * Scalar conversion of a single pixel; writes red, green and blue at the
 * given byte positions. The alpha byte is not written, aPos is unused.
 *
 * @param pRGB destination pixel
 * @param Y    luma sample
 * @param U    first chroma sample
 * @param V    second chroma sample
 * @param rPos byte index of red inside the pixel
 * @param gPos byte index of green inside the pixel
 * @param bPos byte index of blue inside the pixel
 * @param aPos unused
 */
static inline void neon_write_pixel(BYTE* pRGB, BYTE Y, BYTE U, BYTE V, const uint8_t rPos,
                                    const uint8_t gPos, const uint8_t bPos, const uint8_t aPos)
{
	const BYTE red = YUV2R(Y, U, V);
	const BYTE green = YUV2G(Y, U, V);
	const BYTE blue = YUV2B(Y, U, V);

	pRGB[rPos] = red;
	pRGB[gPos] = green;
	pRGB[bPos] = blue;
}
|
||||
|
||||
/* Convert two vertically adjacent rows that share one row of subsampled chroma.
 *
 * pY[0]/pY[1] and pRGB[0]/pRGB[1] are the upper and lower luma/output rows,
 * pU/pV the common chroma row. The bulk is processed 16 pixels at a time,
 * followed by a pairwise scalar loop and finally a single pixel loop for an
 * odd width. */
static inline void neon_YUV420ToX_DOUBLE_ROW(const BYTE* WINPR_RESTRICT pY[2],
                                             const BYTE* WINPR_RESTRICT pU,
                                             const BYTE* WINPR_RESTRICT pV,
                                             BYTE* WINPR_RESTRICT pRGB[2], size_t width,
                                             const uint8_t rPos, const uint8_t gPos,
                                             const uint8_t bPos, const uint8_t aPos)
{
	const size_t simdEnd = width - width % 16;
	const size_t pairEnd = width - width % 2;
	UINT32 col = 0;

	/* Vector part: chroma is loaded once and reused for both rows */
	while (col < simdEnd)
	{
		const uint8x16_t upper = vld1q_u8(&pY[0][col]);
		const uint8x8x2_t Y0 = { { vget_low_u8(upper), vget_high_u8(upper) } };
		const int16x8x2_t D = loadUV(pU, col);
		const int16x8x2_t E = loadUV(pV, col);
		neon_YuvToRgbPixel(&pRGB[0][4ULL * col], Y0, D, E, rPos, gPos, bPos, aPos);

		const uint8x16_t lower = vld1q_u8(&pY[1][col]);
		const uint8x8x2_t Y1 = { { vget_low_u8(lower), vget_high_u8(lower) } };
		neon_YuvToRgbPixel(&pRGB[1][4ULL * col], Y1, D, E, rPos, gPos, bPos, aPos);

		col += 16;
	}

	/* Scalar part: one chroma sample covers a 2x2 pixel square */
	while (col < pairEnd)
	{
		const BYTE U = pU[col / 2];
		const BYTE V = pV[col / 2];

		neon_write_pixel(&pRGB[0][4 * col], pY[0][col], U, V, rPos, gPos, bPos, aPos);
		neon_write_pixel(&pRGB[0][4 * (1ULL + col)], pY[0][1ULL + col], U, V, rPos, gPos, bPos,
		                 aPos);
		neon_write_pixel(&pRGB[1][4 * col], pY[1][col], U, V, rPos, gPos, bPos, aPos);
		neon_write_pixel(&pRGB[1][4 * (1ULL + col)], pY[1][1ULL + col], U, V, rPos, gPos, bPos,
		                 aPos);

		col += 2;
	}

	/* Trailing column of an odd width */
	while (col < width)
	{
		const BYTE U = pU[col / 2];
		const BYTE V = pV[col / 2];

		neon_write_pixel(&pRGB[0][4 * col], pY[0][col], U, V, rPos, gPos, bPos, aPos);
		neon_write_pixel(&pRGB[1][4 * col], pY[1][col], U, V, rPos, gPos, bPos, aPos);

		col++;
	}
}
|
||||
|
||||
/* Convert a single luma row together with its subsampled chroma row.
 *
 * Same three stage scheme as the double row variant: 16 pixels per vector
 * iteration, then pixel pairs, then a possible last odd pixel. */
static inline void neon_YUV420ToX_SINGLE_ROW(const BYTE* WINPR_RESTRICT pY,
                                             const BYTE* WINPR_RESTRICT pU,
                                             const BYTE* WINPR_RESTRICT pV,
                                             BYTE* WINPR_RESTRICT pRGB, size_t width,
                                             const uint8_t rPos, const uint8_t gPos,
                                             const uint8_t bPos, const uint8_t aPos)
{
	const size_t simdEnd = width - width % 16;
	const size_t pairEnd = width - width % 2;
	UINT32 col = 0;

	while (col < simdEnd)
	{
		const uint8x16_t luma = vld1q_u8(&pY[col]);
		const uint8x8x2_t Y0 = { { vget_low_u8(luma), vget_high_u8(luma) } };
		const int16x8x2_t D = loadUV(pU, col);
		const int16x8x2_t E = loadUV(pV, col);
		neon_YuvToRgbPixel(&pRGB[4ULL * col], Y0, D, E, rPos, gPos, bPos, aPos);

		col += 16;
	}

	/* Two horizontally adjacent pixels share one chroma sample */
	while (col < pairEnd)
	{
		const BYTE U = pU[col / 2];
		const BYTE V = pV[col / 2];

		neon_write_pixel(&pRGB[4 * col], pY[col], U, V, rPos, gPos, bPos, aPos);
		neon_write_pixel(&pRGB[4 * (1ULL + col)], pY[1ULL + col], U, V, rPos, gPos, bPos, aPos);

		col += 2;
	}

	while (col < width)
	{
		const BYTE U = pU[col / 2];
		const BYTE V = pV[col / 2];

		neon_write_pixel(&pRGB[4 * col], pY[col], U, V, rPos, gPos, bPos, aPos);

		col++;
	}
}
|
||||
|
||||
/* Convert a planar YUV420 image to a 32 bit packed destination.
 *
 * pSrc/srcStep: Y, U and V planes and their strides in bytes.
 * pDst/dstStep: destination buffer and its stride in bytes.
 * roi:          width and height of the area to convert.
 * rPos..aPos:   byte offsets of the channels inside one destination pixel.
 *
 * Rows are consumed in pairs sharing one chroma row; a trailing odd row is
 * handled on its own. An empty region (height 0) is a no-op: the pair loop is
 * bounded by `y + 1 < nHeight`, which cannot wrap around the way the former
 * `nHeight - 1` did when the assertion was compiled out.
 *
 * The row counter is a size_t so row offsets are not computed in 32 bit
 * arithmetic on 64 bit targets (matches neon_YUV444ToX).
 *
 * Always returns PRIMITIVES_SUCCESS. */
static inline pstatus_t neon_YUV420ToX(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
                                       BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                       const prim_size_t* WINPR_RESTRICT roi, const uint8_t rPos,
                                       const uint8_t gPos, const uint8_t bPos, const uint8_t aPos)
{
	WINPR_ASSERT(roi);
	const UINT32 nWidth = roi->width;
	const UINT32 nHeight = roi->height;

	size_t y = 0;
	for (; y + 1 < nHeight; y += 2)
	{
		const uint8_t* pY[2] = { pSrc[0] + y * srcStep[0], pSrc[0] + (y + 1) * srcStep[0] };
		const uint8_t* pU = pSrc[1] + (y / 2) * srcStep[1];
		const uint8_t* pV = pSrc[2] + (y / 2) * srcStep[2];
		uint8_t* pRGB[2] = { pDst + y * dstStep, pDst + (y + 1) * dstStep };

		neon_YUV420ToX_DOUBLE_ROW(pY, pU, pV, pRGB, nWidth, rPos, gPos, bPos, aPos);
	}

	/* At most one row is left when the height is odd */
	for (; y < nHeight; y++)
	{
		const uint8_t* pY = pSrc[0] + y * srcStep[0];
		const uint8_t* pU = pSrc[1] + (y / 2) * srcStep[1];
		const uint8_t* pV = pSrc[2] + (y / 2) * srcStep[2];
		uint8_t* pRGB = pDst + y * dstStep;

		neon_YUV420ToX_SINGLE_ROW(pY, pU, pV, pRGB, nWidth, rPos, gPos, bPos, aPos);
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* NEON entry point for YUV420 -> 32 bit RGB conversion.
 *
 * Maps DstFormat to the byte offsets (r, g, b, a) of the channels inside one
 * destination pixel and runs the NEON converter with them. The offsets are
 * passed as literals on purpose so each call can be specialised by the
 * compiler. Any other format is delegated to the generic implementation. */
static pstatus_t neon_YUV420ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[3],
                                            const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
                                            UINT32 dstStep, UINT32 DstFormat,
                                            const prim_size_t* WINPR_RESTRICT roi)
{
	if ((DstFormat == PIXEL_FORMAT_BGRA32) || (DstFormat == PIXEL_FORMAT_BGRX32))
		return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);

	if ((DstFormat == PIXEL_FORMAT_RGBA32) || (DstFormat == PIXEL_FORMAT_RGBX32))
		return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);

	if ((DstFormat == PIXEL_FORMAT_ARGB32) || (DstFormat == PIXEL_FORMAT_XRGB32))
		return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);

	if ((DstFormat == PIXEL_FORMAT_ABGR32) || (DstFormat == PIXEL_FORMAT_XBGR32))
		return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);

	/* No NEON path for this layout */
	return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
}
|
||||
|
||||
/* Widen 8 chroma bytes to signed 16 bit and subtract 128 from each. */
static inline int16x8_t loadUVreg(uint8x8_t Vraw)
{
	const int16x8_t bias = vdupq_n_s16(128);
	const int16x8_t widened = vreinterpretq_s16_u16(vmovl_u8(Vraw));

	return vsubq_s16(widened, bias);
}
|
||||
|
||||
/* Split 16 full resolution chroma bytes into a low and a high half and convert
 * each with loadUVreg() (widen to 16 bit, subtract 128). */
static inline int16x8x2_t loadUV444(uint8x16_t Vld)
{
	int16x8x2_t centered;

	centered.val[0] = loadUVreg(vget_low_u8(Vld));
	centered.val[1] = loadUVreg(vget_high_u8(Vld));
	return centered;
}
|
||||
|
||||
/* Scalar chroma filter for one 2x2 block.
 *
 * Computes 4 * U[0][0] - (U[0][1] + U[1][0] + U[1][1]) and lets
 * CONDITIONAL_CLIP() choose between that value and the original sample.
 * Only U[0][0] is overwritten. */
static inline void avgUV(BYTE U[2][2])
{
	const BYTE original = U[0][0];
	const INT16 neighbours = (INT16)U[0][1] + U[1][0] + U[1][1];
	const INT16 scaled = (INT16)original << 2;
	const INT16 candidate = scaled - neighbours;

	U[0][0] = CONDITIONAL_CLIP(candidate, original);
}
|
||||
|
||||
/* Vector chroma filter for eight 2x2 blocks at once.
 *
 * pU[0] and pU[1] hold 16 samples of the upper and lower row. For every block
 * the upper left sample is replaced by
 *     sat_u8(4 * U00 - (U01 + U10 + U11))
 * unless that value is closer than 30 to the original, in which case the
 * original is kept. Only pU[0] is rewritten; pU[1] stays untouched. */
static inline void neon_avgUV(uint8x16_t pU[2])
{
	/* De-interleave: even columns end up in val[0], odd columns in val[1];
	 * in both, the low half stems from the upper row, the high half from the
	 * lower row. */
	const uint8x16x2_t split = vuzpq_u8(pU[0], pU[1]);

	const uint8x8_t u00 = vget_low_u8(split.val[0]);
	const uint8x8_t u01 = vget_low_u8(split.val[1]);
	const uint8x8_t u10 = vget_high_u8(split.val[0]);
	const uint8x8_t u11 = vget_high_u8(split.val[1]);

	/* U01 + U10 + U11, widened to 16 bit */
	const uint16x8_t neighbours = vaddq_u16(vaddl_u8(u01, u10), vmovl_u8(u11));

	/* 4 * U00, widened to 16 bit */
	const uint16x8_t scaled = vshll_n_u8(u00, 2);

	/* 4 * U00 - (U01 + U10 + U11), saturated back to 8 bit */
	const int16x8_t diff =
	    vsubq_s16(vreinterpretq_s16_u16(scaled), vreinterpretq_s16_u16(neighbours));
	const uint8x8_t candidate = vqmovun_s16(diff);

	/* keep = (|candidate - u00| < 30), all-ones or all-zeros per lane */
	const uint8x8_t keep = vclt_u8(vabd_u8(candidate, u00), vdup_n_u8(30));

	/* keep ? u00 : candidate */
	const uint8x8_t filtered = vbsl_u8(keep, u00, candidate);

	/* Re-interleave with the untouched odd columns of the upper row */
	const uint8x8x2_t merged = vzip_u8(filtered, u01);
	pU[0] = vcombine_u8(merged.val[0], merged.val[1]);
}
|
||||
|
||||
/* Convert one row of full resolution (4:4:4) YUV data to 32 bit pixels.
 *
 * pY/pU/pV: the three source rows, `width` samples each.
 * pRGB:     destination row, 4 bytes per pixel.
 * rPos..aPos: byte offsets of the channels inside one destination pixel.
 *
 * 16 pixels are converted per vector iteration; the remainder is converted one
 * pixel at a time. No chroma filtering happens for a single row, so the tail
 * does not need to work on pixel pairs: exactly `width` pixels are read and
 * written, also for odd widths (the former pairwise tail touched one pixel
 * past the end in that case once the assertion was compiled out).
 *
 * Always returns PRIMITIVES_SUCCESS. */
static inline pstatus_t neon_YUV444ToX_SINGLE_ROW(const BYTE* WINPR_RESTRICT pY,
                                                  const BYTE* WINPR_RESTRICT pU,
                                                  const BYTE* WINPR_RESTRICT pV,
                                                  BYTE* WINPR_RESTRICT pRGB, size_t width,
                                                  const uint8_t rPos, const uint8_t gPos,
                                                  const uint8_t bPos, const uint8_t aPos)
{
	size_t x = 0;

	for (; x < width - width % 16; x += 16)
	{
		const uint8x16_t U = vld1q_u8(&pU[x]);
		const uint8x16_t V = vld1q_u8(&pV[x]);
		const uint8x16_t Y0raw = vld1q_u8(&pY[x]);
		const uint8x8x2_t Y0 = { { vget_low_u8(Y0raw), vget_high_u8(Y0raw) } };
		const int16x8x2_t D0 = loadUV444(U);
		const int16x8x2_t E0 = loadUV444(V);
		neon_YuvToRgbPixel(&pRGB[4ULL * x], Y0, D0, E0, rPos, gPos, bPos, aPos);
	}

	/* Scalar tail, one pixel per iteration */
	for (; x < width; x++)
		neon_write_pixel(&pRGB[4 * x], pY[x], pU[x], pV[x], rPos, gPos, bPos, aPos);

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* Convert two adjacent rows of 4:4:4 data, filtering the upper left chroma
 * sample of every 2x2 block before the colour conversion.
 *
 * The width is expected to be even (asserted). 16 columns are handled per
 * vector iteration via neon_avgUV(); the rest is processed as scalar 2x2
 * blocks via avgUV().
 *
 * Always returns PRIMITIVES_SUCCESS. */
static inline pstatus_t neon_YUV444ToX_DOUBLE_ROW(const BYTE* WINPR_RESTRICT pY[2],
                                                  const BYTE* WINPR_RESTRICT pU[2],
                                                  const BYTE* WINPR_RESTRICT pV[2],
                                                  BYTE* WINPR_RESTRICT pRGB[2], size_t width,
                                                  const uint8_t rPos, const uint8_t gPos,
                                                  const uint8_t bPos, const uint8_t aPos)
{
	WINPR_ASSERT(width % 2 == 0);

	const size_t simdEnd = width - width % 16;
	size_t x = 0;

	while (x < simdEnd)
	{
		/* Filter both chroma planes; only element [0] of each pair changes */
		uint8x16_t U[2] = { vld1q_u8(&pU[0][x]), vld1q_u8(&pU[1][x]) };
		neon_avgUV(U);

		uint8x16_t V[2] = { vld1q_u8(&pV[0][x]), vld1q_u8(&pV[1][x]) };
		neon_avgUV(V);

		/* Upper row */
		const uint8x16_t upper = vld1q_u8(&pY[0][x]);
		const uint8x8x2_t Y0 = { { vget_low_u8(upper), vget_high_u8(upper) } };
		const int16x8x2_t D0 = loadUV444(U[0]);
		const int16x8x2_t E0 = loadUV444(V[0]);
		neon_YuvToRgbPixel(&pRGB[0][4ULL * x], Y0, D0, E0, rPos, gPos, bPos, aPos);

		/* Lower row */
		const uint8x16_t lower = vld1q_u8(&pY[1][x]);
		const uint8x8x2_t Y1 = { { vget_low_u8(lower), vget_high_u8(lower) } };
		const int16x8x2_t D1 = loadUV444(U[1]);
		const int16x8x2_t E1 = loadUV444(V[1]);
		neon_YuvToRgbPixel(&pRGB[1][4ULL * x], Y1, D1, E1, rPos, gPos, bPos, aPos);

		x += 16;
	}

	/* Scalar remainder, one 2x2 block per iteration */
	while (x < width)
	{
		BYTE U[2][2] = { { pU[0][x], pU[0][x + 1] }, { pU[1][x], pU[1][x + 1] } };
		avgUV(U);

		BYTE V[2][2] = { { pV[0][x], pV[0][x + 1] }, { pV[1][x], pV[1][x + 1] } };
		avgUV(V);

		for (size_t row = 0; row < 2; row++)
		{
			BYTE* out = &pRGB[row][x * 4];

			for (size_t k = 0; k < 2; k++)
				neon_write_pixel(&out[4 * k], pY[row][x + k], U[row][k], V[row][k], rPos, gPos,
				                 bPos, aPos);
		}

		x += 2;
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* Convert a planar 4:4:4 image to a 32 bit packed destination.
 *
 * Rows are processed in pairs through the double row converter; if the height
 * is odd the last row goes through the single row converter. The first non
 * successful status of a row converter is returned immediately. */
static inline pstatus_t neon_YUV444ToX(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
                                       BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                       const prim_size_t* WINPR_RESTRICT roi, const uint8_t rPos,
                                       const uint8_t gPos, const uint8_t bPos, const uint8_t aPos)
{
	WINPR_ASSERT(roi);
	const UINT32 nWidth = roi->width;
	const UINT32 nHeight = roi->height;
	const UINT32 pairedRows = nHeight - nHeight % 2;

	size_t y = 0;
	while (y < pairedRows)
	{
		const size_t next = y + 1;
		const uint8_t* WINPR_RESTRICT pY[2] = { pSrc[0] + y * srcStep[0],
			                                    pSrc[0] + next * srcStep[0] };
		const uint8_t* WINPR_RESTRICT pU[2] = { pSrc[1] + y * srcStep[1],
			                                    pSrc[1] + next * srcStep[1] };
		const uint8_t* WINPR_RESTRICT pV[2] = { pSrc[2] + y * srcStep[2],
			                                    pSrc[2] + next * srcStep[2] };
		uint8_t* WINPR_RESTRICT pRGB[2] = { &pDst[y * dstStep], &pDst[next * dstStep] };

		const pstatus_t status =
		    neon_YUV444ToX_DOUBLE_ROW(pY, pU, pV, pRGB, nWidth, rPos, gPos, bPos, aPos);
		if (status != PRIMITIVES_SUCCESS)
			return status;

		y += 2;
	}

	/* Left over row of an odd height */
	while (y < nHeight)
	{
		const uint8_t* WINPR_RESTRICT pY = pSrc[0] + y * srcStep[0];
		const uint8_t* WINPR_RESTRICT pU = pSrc[1] + y * srcStep[1];
		const uint8_t* WINPR_RESTRICT pV = pSrc[2] + y * srcStep[2];
		uint8_t* WINPR_RESTRICT pRGB = &pDst[y * dstStep];

		const pstatus_t status =
		    neon_YUV444ToX_SINGLE_ROW(pY, pU, pV, pRGB, nWidth, rPos, gPos, bPos, aPos);
		if (status != PRIMITIVES_SUCCESS)
			return status;

		y++;
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* NEON entry point for YUV444 -> 32 bit RGB conversion.
 *
 * Selects the channel byte offsets (r, g, b, a) that belong to DstFormat and
 * runs the NEON converter; formats without a NEON path are forwarded to the
 * generic implementation. */
static pstatus_t neon_YUV444ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[3],
                                            const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
                                            UINT32 dstStep, UINT32 DstFormat,
                                            const prim_size_t* WINPR_RESTRICT roi)
{
	if ((DstFormat == PIXEL_FORMAT_BGRA32) || (DstFormat == PIXEL_FORMAT_BGRX32))
		return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);

	if ((DstFormat == PIXEL_FORMAT_RGBA32) || (DstFormat == PIXEL_FORMAT_RGBX32))
		return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);

	if ((DstFormat == PIXEL_FORMAT_ARGB32) || (DstFormat == PIXEL_FORMAT_XRGB32))
		return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);

	if ((DstFormat == PIXEL_FORMAT_ABGR32) || (DstFormat == PIXEL_FORMAT_XBGR32))
		return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);

	/* No NEON path for this layout */
	return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
}
|
||||
|
||||
/* Merge the luma (main) frame of an AVC444 stream into a YUV444 buffer.
 *
 * pSrcRaw/srcStep: Y, U, V planes of the 4:2:0 main frame and their strides.
 * pDstRaw/dstStep: Y, U, V planes of the 4:4:4 target and their strides.
 * roi:             rectangle (in full resolution pixels) to process.
 *
 * B1:      the luma rows of the region are copied unchanged.
 * B2/B3:   every source chroma sample is replicated into a 2x2 block of the
 *          target chroma planes.
 *
 * The vector loop runs as long as a complete group of 16 source samples is
 * left (`x + 16 <= halfWidth`); previously the last complete group was left to
 * the scalar loop, which produced the same bytes but slower.
 *
 * Always returns PRIMITIVES_SUCCESS. */
static pstatus_t neon_LumaToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[3], const UINT32 srcStep[3],
                                   BYTE* WINPR_RESTRICT pDstRaw[3], const UINT32 dstStep[3],
                                   const RECTANGLE_16* WINPR_RESTRICT roi)
{
	const UINT32 nWidth = roi->right - roi->left;
	const UINT32 nHeight = roi->bottom - roi->top;
	const UINT32 halfWidth = (nWidth + 1) / 2;
	const UINT32 halfHeight = (nHeight + 1) / 2;
	const UINT32 evenY = 0;
	/* Source chroma is subsampled, hence the halved offsets */
	const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
		                    pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
		                    pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
	BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
		              pDstRaw[1] + roi->top * dstStep[1] + roi->left,
		              pDstRaw[2] + roi->top * dstStep[2] + roi->left };

	/* Y data is already here... */
	/* B1 */
	for (UINT32 y = 0; y < nHeight; y++)
	{
		const BYTE* Ym = pSrc[0] + srcStep[0] * y;
		BYTE* pY = pDst[0] + dstStep[0] * y;
		memcpy(pY, Ym, nWidth);
	}

	/* The first half of U, V are already here part of this frame. */
	/* B2 and B3 */
	for (UINT32 y = 0; y < halfHeight; y++)
	{
		const UINT32 val2y = (2 * y + evenY);
		const BYTE* Um = pSrc[1] + srcStep[1] * y;
		const BYTE* Vm = pSrc[2] + srcStep[2] * y;
		BYTE* pU = pDst[1] + dstStep[1] * val2y;
		BYTE* pV = pDst[2] + dstStep[2] * val2y;
		BYTE* pU1 = pU + dstStep[1];
		BYTE* pV1 = pV + dstStep[2];

		UINT32 x = 0;
		for (; x + 16 <= halfWidth; x += 16)
		{
			{
				/* Interleaving a vector with itself doubles every sample */
				const uint8x16_t u = vld1q_u8(Um);
				uint8x16x2_t u2x;
				u2x.val[0] = u;
				u2x.val[1] = u;
				vst2q_u8(pU, u2x);
				vst2q_u8(pU1, u2x);
				Um += 16;
				pU += 32;
				pU1 += 32;
			}
			{
				const uint8x16_t v = vld1q_u8(Vm);
				uint8x16x2_t v2x;
				v2x.val[0] = v;
				v2x.val[1] = v;
				vst2q_u8(pV, v2x);
				vst2q_u8(pV1, v2x);
				Vm += 16;
				pV += 32;
				pV1 += 32;
			}
		}

		/* Scalar remainder: each sample is written to a 2x2 block */
		for (; x < halfWidth; x++)
		{
			const BYTE u = *Um++;
			const BYTE v = *Vm++;
			*pU++ = u;
			*pU++ = u;
			*pU1++ = u;
			*pU1++ = u;
			*pV++ = v;
			*pV++ = v;
			*pV1++ = v;
			*pV1++ = v;
		}
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* Merge the auxiliary (chroma, version 1) frame of an AVC444 stream into a
 * YUV444 buffer.
 *
 * B4/B5: the luma plane of the auxiliary frame carries chroma rows in blocks
 *        of 16: the first 8 rows of each block go to odd rows of the target U
 *        plane, the remaining 8 to odd rows of the target V plane.
 * B6/B7: the chroma planes of the auxiliary frame provide the odd columns of
 *        the even target rows.
 *
 * Always returns PRIMITIVES_SUCCESS. */
static pstatus_t neon_ChromaV1ToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[3],
                                       const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
                                       const UINT32 dstStep[3],
                                       const RECTANGLE_16* WINPR_RESTRICT roi)
{
	const UINT32 mod = 16;
	const UINT32 oddY = 1;
	const UINT32 evenY = 0;
	const UINT32 oddX = 1;
	const UINT32 nWidth = roi->right - roi->left;
	const UINT32 nHeight = roi->bottom - roi->top;
	const UINT32 halfWidth = (nWidth) / 2;
	const UINT32 halfHeight = (nHeight) / 2;
	/* The auxiliary frame is aligned to multiples of 16x16.
	 * We need the padded height for B4 and B5 conversion. */
	const UINT32 padHeight = nHeight + 16 - nHeight % 16;
	const UINT32 halfPad = halfWidth % 16;
	const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
		                    pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
		                    pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
	BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
		              pDstRaw[1] + roi->top * dstStep[1] + roi->left,
		              pDstRaw[2] + roi->top * dstStep[2] + roi->left };
	UINT32 uRows = 0;
	UINT32 vRows = 0;

	/* The second half of U and V is a bit more tricky... */
	/* B4 and B5 */
	for (UINT32 y = 0; y < padHeight; y++)
	{
		const BYTE* Ya = pSrc[0] + srcStep[0] * y;
		const int toU = (y % mod) < ((mod + 1) / 2);
		/* The per plane row counter advances even if the row is skipped */
		UINT32* counter = toU ? &uRows : &vRows;
		const UINT32 pos = (2 * (*counter)++ + oddY);

		if (pos >= nHeight)
			continue;

		BYTE* pX = toU ? (pDst[1] + dstStep[1] * pos) : (pDst[2] + dstStep[2] * pos);
		memcpy(pX, Ya, nWidth);
	}

	/* B6 and B7 */
	for (UINT32 y = 0; y < halfHeight; y++)
	{
		const UINT32 val2y = (y * 2 + evenY);
		const BYTE* Ua = pSrc[1] + srcStep[1] * y;
		const BYTE* Va = pSrc[2] + srcStep[2] * y;
		BYTE* pU = pDst[1] + dstStep[1] * val2y;
		BYTE* pV = pDst[2] + dstStep[2] * val2y;
		UINT32 x = 0;

		/* Load de-interleaved, replace the odd lane set, store interleaved */
		while (x < halfWidth - halfPad)
		{
			uint8x16x2_t u = vld2q_u8(&pU[2 * x]);
			u.val[1] = vld1q_u8(&Ua[x]);
			vst2q_u8(&pU[2 * x], u);

			uint8x16x2_t v = vld2q_u8(&pV[2 * x]);
			v.val[1] = vld1q_u8(&Va[x]);
			vst2q_u8(&pV[2 * x], v);

			x += 16;
		}

		while (x < halfWidth)
		{
			const UINT32 val2x1 = (x * 2 + oddX);
			pU[val2x1] = Ua[x];
			pV[val2x1] = Va[x];
			x++;
		}
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* Merge the auxiliary (chroma, version 2) frame of an AVC444 stream into a
 * YUV444 buffer.
 *
 * B4/B5: the luma plane of the auxiliary frame holds, side by side (second
 *        half starting nTotalWidth / 2 bytes in), the odd column chroma
 *        samples for U and V of every row.
 * B6-B9: the two chroma planes of the auxiliary frame hold (again side by
 *        side, offset nTotalWidth / 4) the samples for columns 4k and 4k + 2
 *        of the odd target rows.
 *
 * nTotalHeight is not used by this implementation.
 * Always returns PRIMITIVES_SUCCESS. */
static pstatus_t neon_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
                                       UINT32 nTotalWidth, UINT32 nTotalHeight,
                                       BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
                                       const RECTANGLE_16* WINPR_RESTRICT roi)
{
	const UINT32 nWidth = roi->right - roi->left;
	const UINT32 nHeight = roi->bottom - roi->top;
	const UINT32 halfWidth = (nWidth + 1) / 2;
	const UINT32 halfPad = halfWidth % 16;
	const UINT32 halfHeight = (nHeight + 1) / 2;
	const UINT32 quarterWidth = (nWidth + 3) / 4;
	const UINT32 quarterPad = quarterWidth % 16;

	/* B4 and B5: odd UV values for width/2, height */
	for (UINT32 y = 0; y < nHeight; y++)
	{
		const UINT32 yTop = y + roi->top;
		const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
		const BYTE* pYaV = pYaU + nTotalWidth / 2;
		BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left;
		BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left;
		UINT32 x = 0;

		/* Replace the odd lane set of 32 target bytes per plane */
		while (x < halfWidth - halfPad)
		{
			uint8x16x2_t u = vld2q_u8(&pU[2 * x]);
			u.val[1] = vld1q_u8(&pYaU[x]);
			vst2q_u8(&pU[2 * x], u);

			uint8x16x2_t v = vld2q_u8(&pV[2 * x]);
			v.val[1] = vld1q_u8(&pYaV[x]);
			vst2q_u8(&pV[2 * x], v);

			x += 16;
		}

		while (x < halfWidth)
		{
			const UINT32 odd = 2 * x + 1;
			pU[odd] = pYaU[x];
			pV[odd] = pYaV[x];
			x++;
		}
	}

	/* B6 - B9 */
	for (UINT32 y = 0; y < halfHeight; y++)
	{
		const UINT32 srcRow = y + roi->top / 2;
		const UINT32 dstRow = 2 * y + 1 + roi->top;
		const BYTE* pUaU = pSrc[1] + srcStep[1] * srcRow + roi->left / 4;
		const BYTE* pUaV = pUaU + nTotalWidth / 4;
		const BYTE* pVaU = pSrc[2] + srcStep[2] * srcRow + roi->left / 4;
		const BYTE* pVaV = pVaU + nTotalWidth / 4;
		BYTE* pU = pDst[1] + dstStep[1] * dstRow + roi->left;
		BYTE* pV = pDst[2] + dstStep[2] * dstRow + roi->left;
		UINT32 x = 0;

		/* Of every 4 target bytes, lanes 0 and 2 are replaced */
		while (x < quarterWidth - quarterPad)
		{
			uint8x16x4_t u = vld4q_u8(&pU[4 * x]);
			u.val[0] = vld1q_u8(&pUaU[x]);
			u.val[2] = vld1q_u8(&pVaU[x]);
			vst4q_u8(&pU[4 * x], u);

			uint8x16x4_t v = vld4q_u8(&pV[4 * x]);
			v.val[0] = vld1q_u8(&pUaV[x]);
			v.val[2] = vld1q_u8(&pVaV[x]);
			vst4q_u8(&pV[4 * x], v);

			x += 16;
		}

		while (x < quarterWidth)
		{
			pU[4 * x + 0] = pUaU[x];
			pV[4 * x + 0] = pUaV[x];
			pU[4 * x + 2] = pVaU[x];
			pV[4 * x + 2] = pVaV[x];
			x++;
		}
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
static pstatus_t neon_YUV420CombineToYUV444(avc444_frame_type type,
|
||||
const BYTE* WINPR_RESTRICT pSrc[3],
|
||||
const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight,
|
||||
BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
|
||||
const RECTANGLE_16* WINPR_RESTRICT roi)
|
||||
{
|
||||
if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])
|
||||
return -1;
|
||||
|
||||
if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])
|
||||
return -1;
|
||||
|
||||
if (!roi)
|
||||
return -1;
|
||||
|
||||
switch (type)
|
||||
{
|
||||
case AVC444_LUMA:
|
||||
return neon_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);
|
||||
|
||||
case AVC444_CHROMAv1:
|
||||
return neon_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);
|
||||
|
||||
case AVC444_CHROMAv2:
|
||||
return neon_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);
|
||||
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Install the NEON YUV primitives into `prims`.
 *
 * With NEON intrinsics enabled the generic primitives are looked up (they back
 * the fallback paths of the format dispatchers) and the three YUV entry points
 * are overridden. Otherwise only a verbose log message is emitted and `prims`
 * stays untouched. */
void primitives_init_YUV_neon_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(NEON_INTRINSICS_ENABLED)
	generic = primitives_get_generic();
	WLog_VRB(PRIM_TAG, "NEON optimizations");

	/* colour conversion */
	prims->YUV420ToRGB_8u_P3AC4R = neon_YUV420ToRGB_8u_P3AC4R;
	prims->YUV444ToRGB_8u_P3AC4R = neon_YUV444ToRGB_8u_P3AC4R;
	/* AVC444 frame merging */
	prims->YUV420CombineToYUV444 = neon_YUV420CombineToYUV444;
#else
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or neon intrinsics not available");
	WINPR_UNUSED(prims);
#endif
}
|
||||
274
third_party/FreeRDP/libfreerdp/primitives/neon/prim_colors_neon.c
vendored
Normal file
274
third_party/FreeRDP/libfreerdp/primitives/neon/prim_colors_neon.c
vendored
Normal file
@@ -0,0 +1,274 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Optimized Color conversion operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* Copyright 2011 Stephen Erisman
|
||||
* Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
|
||||
* Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_colors.h"
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
#if defined(NEON_INTRINSICS_ENABLED)
|
||||
#include <arm_neon.h>
|
||||
|
||||
/* Table of the generic (non-NEON) primitives. The default: branches of the
 * format dispatchers in this file call through it for destination formats
 * that have no NEON path.
 * NOTE(review): presumably assigned by this file's init routine before the
 * first conversion runs - confirm. */
static primitives_t* generic = nullptr;
|
||||
|
||||
/* Convert planar 16 bit YCbCr to packed 32 bit pixels.
 *
 * pSrc:      Y, Cb and Cr planes (INT16 samples).
 * srcStep:   bytes between two rows of a source plane.
 * pDst:      destination buffer, 4 bytes per pixel.
 * dstStep:   bytes between two destination rows.
 * roi:       width and height of the area to convert.
 * rPos..aPos: byte offsets of the channels inside one destination pixel; the
 *            byte at aPos is set to 0xFF.
 *
 * Row start addresses are derived from the strides for every row. The former
 * implementation advanced the BYTE destination pointer by a padding counted
 * in 4 byte pixels, so rows were misplaced whenever dstStep exceeded
 * width * 4.
 *
 * Eight pixels are converted per vector iteration using 16.16 fixed point
 * coefficients, followed by a scalar loop for the remaining width % 8 pixels.
 *
 * Always returns PRIMITIVES_SUCCESS. */
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
                                                BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                                const prim_size_t* WINPR_RESTRICT roi, uint8_t rPos,
                                                uint8_t gPos, uint8_t bPos, uint8_t aPos)
{
	const size_t pad = roi->width % 8;
	const int16x4_t c4096 = vdup_n_s16(4096);

	for (UINT32 y = 0; y < roi->height; y++)
	{
		const size_t srcOffset = (size_t)y * srcStep;
		const INT16* pY = (const INT16*)(((const BYTE*)pSrc[0]) + srcOffset);
		const INT16* pCb = (const INT16*)(((const BYTE*)pSrc[1]) + srcOffset);
		const INT16* pCr = (const INT16*)(((const BYTE*)pSrc[2]) + srcOffset);
		BYTE* pRGB = pDst + (size_t)y * dstStep;

		for (UINT32 x = 0; x < roi->width - pad; x += 8)
		{
			const int16x8_t Y = vld1q_s16(pY);
			const int16x4_t Yh = vget_high_s16(Y);
			const int16x4_t Yl = vget_low_s16(Y);
			const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */
			const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */
			const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
			const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
			const int16x8_t Cr = vld1q_s16(pCr);
			const int16x4_t Crh = vget_high_s16(Cr);
			const int16x4_t Crl = vget_low_s16(Cr);
			const int16x8_t Cb = vld1q_s16(pCb);
			const int16x4_t Cbh = vget_high_s16(Cb);
			const int16x4_t Cbl = vget_low_s16(Cb);
			uint8x8x4_t bgrx;
			{
				/* R */
				const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */
				const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */
				const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
				const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
				/* >> 21 == drop the 16 fractional bits, then divide by 32 */
				const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
				const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
				const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
				bgrx.val[rPos] = vqmovun_s16(Rs);
			}
			{
				/* G */
				const int32x4_t CbGh = vmull_n_s16(Cbh, 22527);                /* 0.343730 * 2^16 */
				const int32x4_t CbGl = vmull_n_s16(Cbl, 22527);                /* 0.343730 * 2^16 */
				const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819);     /* 0.714401 * 2^16 */
				const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819);     /* 0.714401 * 2^16 */
				const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
				const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
				const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
				const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
				const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
				const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
				const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
				const uint8x8_t G = vqmovun_s16(Gs);
				bgrx.val[gPos] = G;
			}
			{
				/* B */
				const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */
				const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */
				const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
				const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
				const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
				const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
				const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
				const uint8x8_t B = vqmovun_s16(Bs);
				bgrx.val[bPos] = B;
			}
			/* A */
			{
				bgrx.val[aPos] = vdup_n_u8(0xFF);
			}
			vst4_u8(pRGB, bgrx);
			pY += 8;
			pCb += 8;
			pCr += 8;
			pRGB += 32;
		}

		/* Scalar remainder, same fixed point math as above */
		for (UINT32 x = 0; x < pad; x++)
		{
			const INT32 divisor = 16;
			const INT32 Y = ((*pY++) + 4096) << divisor;
			const INT32 Cb = (*pCb++);
			const INT32 Cr = (*pCr++);
			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
			INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
			INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
			INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
			BYTE bgrx[4];
			bgrx[bPos] = CLIP(B);
			bgrx[gPos] = CLIP(G);
			bgrx[rPos] = CLIP(R);
			bgrx[aPos] = 0xFF;
			*pRGB++ = bgrx[0];
			*pRGB++ = bgrx[1];
			*pRGB++ = bgrx[2];
			*pRGB++ = bgrx[3];
		}
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* NEON entry point for 16 bit planar YCbCr -> 32 bit RGB conversion.
 *
 * Picks the channel byte offsets (r, g, b, a) matching DstFormat and runs the
 * NEON converter; any other format is handled by the generic implementation. */
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
                                              BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                              UINT32 DstFormat,
                                              const prim_size_t* WINPR_RESTRICT roi)
{
	if ((DstFormat == PIXEL_FORMAT_BGRA32) || (DstFormat == PIXEL_FORMAT_BGRX32))
		return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);

	if ((DstFormat == PIXEL_FORMAT_RGBA32) || (DstFormat == PIXEL_FORMAT_RGBX32))
		return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);

	if ((DstFormat == PIXEL_FORMAT_ARGB32) || (DstFormat == PIXEL_FORMAT_XRGB32))
		return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);

	if ((DstFormat == PIXEL_FORMAT_ABGR32) || (DstFormat == PIXEL_FORMAT_XBGR32))
		return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);

	/* No NEON path for this layout */
	return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
}
|
||||
|
||||
/* Pack three planar 16 bit channels into 32 bit pixels.
 *
 * pSrc:      R, G and B planes (INT16 samples).
 * srcStep:   bytes between rows in source data.
 * pDst:      32 bit interleaved destination.
 * dstStep:   bytes between rows in dest data.
 * roi:       region of interest (width/height).
 * rPos..aPos: byte offsets of the channels inside one destination pixel; the
 *            byte at aPos is set to 0xFF.
 *
 * Every sample is saturated to 0..255. The scalar tail (width % 8 pixels) now
 * clamps with CLIP() as well; it used to truncate the INT16 value, so the last
 * pixels of a row could differ from what the vector path (vqmovun_s16)
 * produces for out of range input.
 *
 * Always returns PRIMITIVES_SUCCESS. */
static pstatus_t
neon_RGBToRGB_16s8u_P3AC4R_X(const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
                             UINT32 srcStep,            /* bytes between rows in source data */
                             BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
                             UINT32 dstStep,            /* bytes between rows in dest data */
                             const prim_size_t* WINPR_RESTRICT roi, /* region of interest */
                             uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
{
	const UINT32 pad = roi->width % 8;

	for (UINT32 y = 0; y < roi->height; y++)
	{
		const INT16* pr = (const INT16*)(((const BYTE*)pSrc[0]) + y * srcStep);
		const INT16* pg = (const INT16*)(((const BYTE*)pSrc[1]) + y * srcStep);
		const INT16* pb = (const INT16*)(((const BYTE*)pSrc[2]) + y * srcStep);
		BYTE* dst = pDst + y * dstStep;

		for (UINT32 x = 0; x < roi->width - pad; x += 8)
		{
			int16x8_t r = vld1q_s16(pr);
			int16x8_t g = vld1q_s16(pg);
			int16x8_t b = vld1q_s16(pb);
			uint8x8x4_t bgrx;
			bgrx.val[aPos] = vdup_n_u8(0xFF);
			/* saturating narrow: negative -> 0, > 255 -> 255 */
			bgrx.val[rPos] = vqmovun_s16(r);
			bgrx.val[gPos] = vqmovun_s16(g);
			bgrx.val[bPos] = vqmovun_s16(b);
			vst4_u8(dst, bgrx);
			pr += 8;
			pg += 8;
			pb += 8;
			dst += 32;
		}

		for (UINT32 x = 0; x < pad; x++)
		{
			/* Read into locals first so CLIP() never sees a side effect */
			const INT16 b = *pb++;
			const INT16 g = *pg++;
			const INT16 r = *pr++;
			BYTE bgrx[4];
			bgrx[bPos] = CLIP(b);
			bgrx[gPos] = CLIP(g);
			bgrx[rPos] = CLIP(r);
			bgrx[aPos] = 0xFF;
			*dst++ = bgrx[0];
			*dst++ = bgrx[1];
			*dst++ = bgrx[2];
			*dst++ = bgrx[3];
		}
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
static pstatus_t
|
||||
neon_RGBToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
|
||||
UINT32 srcStep, /* bytes between rows in source data */
|
||||
BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
||||
UINT32 dstStep, /* bytes between rows in dest data */
|
||||
UINT32 DstFormat,
|
||||
const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
|
||||
{
|
||||
switch (DstFormat)
|
||||
{
|
||||
case PIXEL_FORMAT_BGRA32:
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
|
||||
|
||||
case PIXEL_FORMAT_RGBA32:
|
||||
case PIXEL_FORMAT_RGBX32:
|
||||
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
|
||||
|
||||
case PIXEL_FORMAT_ARGB32:
|
||||
case PIXEL_FORMAT_XRGB32:
|
||||
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
|
||||
|
||||
case PIXEL_FORMAT_ABGR32:
|
||||
case PIXEL_FORMAT_XBGR32:
|
||||
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
|
||||
|
||||
default:
|
||||
return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
|
||||
}
|
||||
}
|
||||
#endif /* NEON_INTRINSICS_ENABLED */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/**
 * Install the NEON colour-conversion primitives into @p prims.
 *
 * When NEON_INTRINSICS_ENABLED is not defined the table is left untouched and
 * only a verbose log line is emitted.
 */
void primitives_init_colors_neon_int(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(NEON_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or neon intrinsics not available");
	WINPR_UNUSED(prims);
#else
	/* Keep the generic table around: the NEON dispatchers fall back to it. */
	generic = primitives_get_generic();

	WLog_VRB(PRIM_TAG, "NEON optimizations");
	prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
	prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
#endif
}
|
||||
501
third_party/FreeRDP/libfreerdp/primitives/opencl/prim_YUV_opencl.c
vendored
Normal file
501
third_party/FreeRDP/libfreerdp/primitives/opencl/prim_YUV_opencl.c
vendored
Normal file
@@ -0,0 +1,501 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* Optimized YUV/RGB conversion operations using openCL
|
||||
*
|
||||
* Copyright 2019 David Fort <contact@hardening-consulting.com>
|
||||
* Copyright 2019 Rangee Gmbh
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include "prim_internal.h"
|
||||
|
||||
#if defined(WITH_OPENCL)
|
||||
#ifdef __APPLE__
|
||||
#include "OpenCL/opencl.h"
|
||||
#else
|
||||
#include <CL/cl.h>
|
||||
#endif
|
||||
#include "primitives-opencl-program.h"
|
||||
|
||||
#include <freerdp/log.h>
|
||||
#define TAG FREERDP_TAG("primitives")
|
||||
|
||||
typedef struct
|
||||
{
|
||||
BOOL support;
|
||||
cl_platform_id platformId;
|
||||
cl_device_id deviceId;
|
||||
cl_context context;
|
||||
cl_command_queue commandQueue;
|
||||
cl_program program;
|
||||
} primitives_opencl_context;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
primitives_opencl_context* cl;
|
||||
cl_kernel kernel;
|
||||
cl_mem srcObjs[3];
|
||||
cl_mem dstObj;
|
||||
prim_size_t roi;
|
||||
size_t dstStep;
|
||||
} primitives_cl_kernel;
|
||||
|
||||
static primitives_opencl_context* primitives_get_opencl_context(void);
|
||||
|
||||
/**
 * Release every OpenCL object owned by @p kernel and free the structure.
 * Accepts a null pointer, in which case nothing happens.
 */
static void cl_kernel_free(primitives_cl_kernel* kernel)
{
	if (kernel)
	{
		if (kernel->dstObj)
			clReleaseMemObject(kernel->dstObj);

		for (size_t idx = 0; idx < ARRAYSIZE(kernel->srcObjs); idx++)
		{
			/* Clear the slot before releasing the buffer it held. */
			cl_mem* slot = &kernel->srcObjs[idx];
			cl_mem buffer = *slot;
			*slot = nullptr;

			if (buffer)
				clReleaseMemObject(buffer);
		}

		if (kernel->kernel)
			clReleaseKernel(kernel->kernel);

		free(kernel);
	}
}
|
||||
|
||||
static primitives_cl_kernel* cl_kernel_new(const char* kernelName, const prim_size_t* roi)
|
||||
{
|
||||
WINPR_ASSERT(kernelName);
|
||||
WINPR_ASSERT(roi);
|
||||
|
||||
primitives_cl_kernel* kernel = calloc(1, sizeof(primitives_cl_kernel));
|
||||
if (!kernel)
|
||||
goto fail;
|
||||
|
||||
kernel->roi = *roi;
|
||||
kernel->cl = primitives_get_opencl_context();
|
||||
if (!kernel->cl)
|
||||
goto fail;
|
||||
|
||||
cl_int ret = CL_INVALID_VALUE;
|
||||
kernel->kernel = clCreateKernel(kernel->cl->program, kernelName, &ret);
|
||||
if (ret != CL_SUCCESS)
|
||||
{
|
||||
WLog_ERR(TAG, "openCL: unable to create kernel %s", kernelName);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
return kernel;
|
||||
fail:
|
||||
cl_kernel_free(kernel);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
/**
 * Wrap the three source planes in OpenCL buffers and bind them to the kernel.
 *
 * Plane i is bound as kernel argument 2*i and its stride as argument 2*i + 1.
 *
 * NOTE(review): every plane is sized as srcStep[i] * roi.height. For vertically
 * subsampled chroma planes that may exceed the memory the caller actually
 * owns - confirm against the callers before relying on CL_MEM_USE_HOST_PTR here.
 *
 * @return TRUE on success, FALSE (after logging) on the first failing call.
 *         Buffers created so far stay in ctx and are released by cl_kernel_free.
 */
static BOOL cl_kernel_set_sources(primitives_cl_kernel* ctx, const BYTE* WINPR_RESTRICT pSrc[3],
                                  const UINT32 srcStep[3])
{
	const char* sourceNames[] = { "Y", "U", "V" };

	WINPR_ASSERT(ctx);
	WINPR_ASSERT(pSrc);
	WINPR_ASSERT(srcStep);

	for (cl_uint plane = 0; plane < ARRAYSIZE(ctx->srcObjs); plane++)
	{
		const cl_uint bufferArg = plane * 2;
		const cl_uint strideArg = plane * 2 + 1;
		const BYTE* planeData = pSrc[plane];
		void* WINPR_RESTRICT hostPtr = WINPR_CAST_CONST_PTR_AWAY(planeData, void* WINPR_RESTRICT);
		cl_int status = CL_INVALID_VALUE;

		ctx->srcObjs[plane] =
		    clCreateBuffer(ctx->cl->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
		                   1ull * srcStep[plane] * ctx->roi.height, hostPtr, &status);
		if (status != CL_SUCCESS)
		{
			WLog_ERR(TAG, "unable to create %sobj", sourceNames[plane]);
			return FALSE;
		}

		status = clSetKernelArg(ctx->kernel, bufferArg, sizeof(cl_mem),
		                        (const void*)&ctx->srcObjs[plane]);
		if (status != CL_SUCCESS)
		{
			WLog_ERR(TAG, "unable to set arg for %sobj", sourceNames[plane]);
			return FALSE;
		}

		status = clSetKernelArg(ctx->kernel, strideArg, sizeof(cl_uint), &srcStep[plane]);
		if (status != CL_SUCCESS)
		{
			WLog_ERR(TAG, "unable to set arg stride for %sobj", sourceNames[plane]);
			return FALSE;
		}
	}

	return TRUE;
}
|
||||
|
||||
/**
 * Create the device-side destination buffer (dstStep * roi.height bytes) and
 * bind it and its stride as kernel arguments 6 and 7.
 *
 * @return TRUE on success, FALSE (after logging) on the first failing call.
 */
static BOOL cl_kernel_set_destination(primitives_cl_kernel* ctx, UINT32 dstStep)
{
	WINPR_ASSERT(ctx);

	/* Remembered so the read-back can compute the same size later. */
	ctx->dstStep = dstStep;

	cl_int status = CL_INVALID_VALUE;
	ctx->dstObj = clCreateBuffer(ctx->cl->context, CL_MEM_WRITE_ONLY,
	                             1ull * dstStep * ctx->roi.height, nullptr, &status);
	if (status != CL_SUCCESS)
	{
		WLog_ERR(TAG, "unable to create dest obj");
		return FALSE;
	}

	status = clSetKernelArg(ctx->kernel, 6, sizeof(cl_mem), (const void*)&ctx->dstObj);
	if (status != CL_SUCCESS)
	{
		WLog_ERR(TAG, "unable to set arg destObj");
		return FALSE;
	}

	status = clSetKernelArg(ctx->kernel, 7, sizeof(cl_uint), &dstStep);
	if (status != CL_SUCCESS)
	{
		WLog_ERR(TAG, "unable to set arg dstStep");
		return FALSE;
	}

	return TRUE;
}
|
||||
|
||||
/**
 * Launch the kernel over a width x height grid (one work-item per pixel) and
 * copy the whole destination buffer back into @p pDst with a blocking read.
 *
 * @return TRUE on success, FALSE (after logging) if enqueueing or reading fails.
 */
static BOOL cl_kernel_process(primitives_cl_kernel* ctx, BYTE* pDst)
{
	WINPR_ASSERT(ctx);
	WINPR_ASSERT(pDst);

	const size_t globalSize[2] = { ctx->roi.width, ctx->roi.height };

	cl_int status = clEnqueueNDRangeKernel(ctx->cl->commandQueue, ctx->kernel, 2, nullptr,
	                                       globalSize, nullptr, 0, nullptr, nullptr);
	if (status != CL_SUCCESS)
	{
		WLog_ERR(TAG, "unable to enqueue call kernel");
		return FALSE;
	}

	/* Transfer result to host; CL_TRUE makes the call wait for completion. */
	const size_t resultBytes = ctx->roi.height * ctx->dstStep;
	status = clEnqueueReadBuffer(ctx->cl->commandQueue, ctx->dstObj, CL_TRUE, 0, resultBytes, pDst,
	                             0, nullptr, nullptr);
	if (status != CL_SUCCESS)
	{
		WLog_ERR(TAG, "unable to read back buffer");
		return FALSE;
	}

	return TRUE;
}
|
||||
|
||||
/**
 * Run one named YUV -> RGB kernel: create the launch descriptor, bind sources
 * and destination, execute, and read the result into @p pDst.
 *
 * @return PRIMITIVES_SUCCESS when every step succeeded, -1 otherwise. The
 *         descriptor is always released before returning.
 */
static pstatus_t opencl_YUVToRGB(const char* kernelName, const BYTE* WINPR_RESTRICT pSrc[3],
                                 const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
                                 const prim_size_t* WINPR_RESTRICT roi)
{
	pstatus_t status = -1;
	primitives_cl_kernel* launch = cl_kernel_new(kernelName, roi);

	/* Short-circuit evaluation stops at the first step that fails. */
	if (launch && cl_kernel_set_sources(launch, pSrc, srcStep) &&
	    cl_kernel_set_destination(launch, dstStep) && cl_kernel_process(launch, pDst))
	{
		status = PRIMITIVES_SUCCESS;
	}

	cl_kernel_free(launch);
	return status;
}
|
||||
|
||||
/* Single file-scope OpenCL context shared by all OpenCL primitives. */
static primitives_opencl_context openclContext = WINPR_C_ARRAY_INIT;

/** Return the address of the file-scope context; never null. */
static primitives_opencl_context* primitives_get_opencl_context(void)
{
	primitives_opencl_context* shared = &openclContext;
	return shared;
}
|
||||
|
||||
/**
 * Release the OpenCL objects held by @p ctx and mark it unsupported.
 *
 * Only handles that are actually set are released: the initialisation failure
 * path calls this function before the program exists, and passing a null
 * handle to clRelease* is an error. Every handle is cleared afterwards so a
 * second call, or a later re-initialisation, never sees a stale handle.
 *
 * @param ctx context to tear down; a null pointer is ignored
 */
static void cl_context_free(primitives_opencl_context* ctx)
{
	if (!ctx)
		return;

	if (ctx->program)
		clReleaseProgram(ctx->program);
	ctx->program = nullptr;

	if (ctx->commandQueue)
		clReleaseCommandQueue(ctx->commandQueue);
	ctx->commandQueue = nullptr;

	if (ctx->context)
		clReleaseContext(ctx->context);
	ctx->context = nullptr;

	if (ctx->deviceId)
		clReleaseDevice(ctx->deviceId);
	ctx->deviceId = nullptr;

	/* The platform id is not a reference-counted object; just forget it. */
	ctx->platformId = nullptr;
	ctx->support = FALSE;
}
|
||||
|
||||
static pstatus_t primitives_uninit_opencl(void)
|
||||
{
|
||||
if (!openclContext.support)
|
||||
return PRIMITIVES_SUCCESS;
|
||||
|
||||
cl_context_free(&openclContext);
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Find the first platform that offers a GPU device, create a context and a
 * command queue for it, then build the embedded program and verify that the
 * "yuv420_to_bgra_1b" kernel can be created.
 *
 * @param prims context structure to fill in
 *
 * @return TRUE when everything succeeded (prims->support is set), FALSE
 *         otherwise. Failures after a GPU was selected release the context
 *         through cl_context_free().
 */
static BOOL primitives_init_opencl_context(primitives_opencl_context* WINPR_RESTRICT prims)
{
	cl_uint platformCount = 0;
	cl_uint deviceCount = 0;
	cl_kernel probe = nullptr;
	BOOL gotGPU = FALSE;

	cl_int ret = clGetPlatformIDs(0, nullptr, &platformCount);
	if ((platformCount < 1) || (ret != CL_SUCCESS))
		return FALSE;

	cl_platform_id* platforms = (cl_platform_id*)calloc(platformCount, sizeof(cl_platform_id));
	if (!platforms)
		return FALSE;

	ret = clGetPlatformIDs(platformCount, platforms, &platformCount);
	if (ret != CL_SUCCESS)
	{
		free((void*)platforms);
		return FALSE;
	}

	/* Stop at the first platform for which every setup step succeeds. */
	for (cl_uint idx = 0; idx < platformCount; idx++)
	{
		cl_device_id gpu = nullptr;
		cl_context gpuContext = nullptr;
		char platformName[1000] = WINPR_C_ARRAY_INIT;
		char deviceName[1000] = WINPR_C_ARRAY_INIT;

		ret = clGetPlatformInfo(platforms[idx], CL_PLATFORM_NAME, sizeof(platformName),
		                        platformName, nullptr);
		if (ret != CL_SUCCESS)
			continue;

		ret = clGetDeviceIDs(platforms[idx], CL_DEVICE_TYPE_GPU, 1, &gpu, &deviceCount);
		if (ret != CL_SUCCESS)
			continue;

		ret = clGetDeviceInfo(gpu, CL_DEVICE_NAME, sizeof(deviceName), deviceName, nullptr);
		if (ret != CL_SUCCESS)
		{
			WLog_ERR(TAG, "openCL: unable get device name for platform %s", platformName);
			clReleaseDevice(gpu);
			continue;
		}

		gpuContext = clCreateContext(nullptr, 1, &gpu, nullptr, nullptr, &ret);
		if (ret != CL_SUCCESS)
		{
			WLog_ERR(TAG, "openCL: unable to create context for platform %s, device %s",
			         platformName, deviceName);
			clReleaseDevice(gpu);
			continue;
		}

#if defined(CL_VERSION_2_0)
		prims->commandQueue = clCreateCommandQueueWithProperties(gpuContext, gpu, nullptr, &ret);
#else
		prims->commandQueue = clCreateCommandQueue(gpuContext, gpu, 0, &ret);
#endif
		if (ret != CL_SUCCESS)
		{
			WLog_ERR(TAG, "openCL: unable to create command queue");
			clReleaseContext(gpuContext);
			clReleaseDevice(gpu);
			continue;
		}

		WLog_INFO(TAG, "openCL: using platform=%s device=%s", platformName, deviceName);

		prims->platformId = platforms[idx];
		prims->deviceId = gpu;
		prims->context = gpuContext;
		gotGPU = TRUE;
		break;
	}

	free((void*)platforms);

	if (!gotGPU)
	{
		WLog_ERR(TAG, "openCL: no GPU found");
		return FALSE;
	}

	/* Compile the kernel source that is embedded in the binary. */
	const char* programSource = openclProgram;
	size_t programLen = strnlen(openclProgram, sizeof(openclProgram));
	prims->program =
	    clCreateProgramWithSource(prims->context, 1, &programSource, &programLen, &ret);
	if (ret != CL_SUCCESS)
	{
		WLog_ERR(TAG, "openCL: unable to create program");
		goto fail;
	}

	ret = clBuildProgram(prims->program, 1, &prims->deviceId, nullptr, nullptr, nullptr);
	if (ret != CL_SUCCESS)
	{
		size_t logLength = 0;
		char buildLog[2048];

		ret = clGetProgramBuildInfo(prims->program, prims->deviceId, CL_PROGRAM_BUILD_LOG,
		                            sizeof(buildLog), buildLog, &logLength);
		if (ret == CL_SUCCESS)
		{
			WLog_ERR(TAG, "openCL: unable to build program, errorLog=%s", buildLog);
		}
		else
		{
			WLog_ERR(TAG,
			         "openCL: building program failed but unable to retrieve buildLog, error=%d",
			         ret);
		}
		goto fail;
	}

	/* Smoke test: make sure a known kernel can be instantiated. */
	probe = clCreateKernel(prims->program, "yuv420_to_bgra_1b", &ret);
	if (ret != CL_SUCCESS)
	{
		WLog_ERR(TAG, "openCL: unable to create yuv420_to_bgra_1b kernel");
		goto fail;
	}
	clReleaseKernel(probe);

	prims->support = TRUE;
	return TRUE;

fail:
	cl_context_free(prims);
	return FALSE;
}
|
||||
|
||||
/**
 * YUV420 -> 32-bit RGB conversion on the GPU.
 *
 * Maps the destination format to a kernel name and runs it; formats without a
 * kernel are handled by the CPU-only primitives.
 *
 * NOTE(review): RGBX32 selects the "rgba" kernel and RGBA32 the "rgbx" kernel,
 * whereas every other format selects the kernel of the same name. Confirm
 * which alpha behaviour is intended before touching this table.
 *
 * @return the kernel run status, the CPU fallback's status, or -1 when the CPU
 *         primitives are unavailable.
 */
static pstatus_t opencl_YUV420ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[3],
                                              const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
                                              UINT32 dstStep, UINT32 DstFormat,
                                              const prim_size_t* WINPR_RESTRICT roi)
{
	const char* kernel_name = nullptr;

	switch (DstFormat)
	{
		case PIXEL_FORMAT_ABGR32:
			kernel_name = "yuv420_to_abgr_1b";
			break;
		case PIXEL_FORMAT_XBGR32:
			kernel_name = "yuv420_to_xbgr_1b";
			break;
		case PIXEL_FORMAT_RGBX32:
			kernel_name = "yuv420_to_rgba_1b";
			break;
		case PIXEL_FORMAT_RGBA32:
			kernel_name = "yuv420_to_rgbx_1b";
			break;
		case PIXEL_FORMAT_BGRA32:
			kernel_name = "yuv420_to_bgra_1b";
			break;
		case PIXEL_FORMAT_BGRX32:
			kernel_name = "yuv420_to_bgrx_1b";
			break;
		case PIXEL_FORMAT_XRGB32:
			kernel_name = "yuv420_to_xrgb_1b";
			break;
		case PIXEL_FORMAT_ARGB32:
			kernel_name = "yuv420_to_argb_1b";
			break;
		default:
			break;
	}

	if (!kernel_name)
	{
		/* No kernel for this layout: use the CPU implementation instead. */
		primitives_t* cpu = primitives_get_by_type(PRIMITIVES_ONLY_CPU);
		if (!cpu)
			return -1;
		return cpu->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
	}

	return opencl_YUVToRGB(kernel_name, pSrc, srcStep, pDst, dstStep, roi);
}
|
||||
|
||||
/**
 * YUV444 -> 32-bit RGB conversion on the GPU.
 *
 * Maps the destination format to a kernel name and runs it; formats without a
 * kernel are handled by the CPU-only primitives.
 *
 * NOTE(review): RGBX32 selects the "rgba" kernel and RGBA32 the "rgbx" kernel,
 * whereas every other format selects the kernel of the same name. Confirm
 * which alpha behaviour is intended before touching this table.
 *
 * @return the kernel run status, the CPU fallback's status, or -1 when the CPU
 *         primitives are unavailable.
 */
static pstatus_t opencl_YUV444ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[3],
                                              const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
                                              UINT32 dstStep, UINT32 DstFormat,
                                              const prim_size_t* WINPR_RESTRICT roi)
{
	const char* kernel_name = nullptr;

	switch (DstFormat)
	{
		case PIXEL_FORMAT_ABGR32:
			kernel_name = "yuv444_to_abgr_1b";
			break;
		case PIXEL_FORMAT_XBGR32:
			kernel_name = "yuv444_to_xbgr_1b";
			break;
		case PIXEL_FORMAT_RGBX32:
			kernel_name = "yuv444_to_rgba_1b";
			break;
		case PIXEL_FORMAT_RGBA32:
			kernel_name = "yuv444_to_rgbx_1b";
			break;
		case PIXEL_FORMAT_BGRA32:
			kernel_name = "yuv444_to_bgra_1b";
			break;
		case PIXEL_FORMAT_BGRX32:
			kernel_name = "yuv444_to_bgrx_1b";
			break;
		case PIXEL_FORMAT_XRGB32:
			kernel_name = "yuv444_to_xrgb_1b";
			break;
		case PIXEL_FORMAT_ARGB32:
			kernel_name = "yuv444_to_argb_1b";
			break;
		default:
			break;
	}

	if (!kernel_name)
	{
		/* No kernel for this layout: use the CPU implementation instead. */
		primitives_t* cpu = primitives_get_by_type(PRIMITIVES_ONLY_CPU);
		if (!cpu)
			return -1;
		return cpu->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
	}

	return opencl_YUVToRGB(kernel_name, pSrc, srcStep, pDst, dstStep, roi);
}
|
||||
|
||||
/**
 * Fill @p prims with the CPU primitives and, when an OpenCL context can be
 * set up, override the two YUV -> RGB entries with the GPU versions.
 *
 * @return FALSE when @p prims is null or the CPU primitives are unavailable;
 *         TRUE otherwise, including when OpenCL could not be initialised (the
 *         table then simply contains the CPU primitives).
 */
BOOL primitives_init_opencl(primitives_t* prims)
{
	primitives_t* cpu = primitives_get_by_type(PRIMITIVES_ONLY_CPU);
	if (!(prims && cpu))
		return FALSE;

	/* Start from a full copy of the CPU table. */
	*prims = *cpu;

	if (primitives_init_opencl_context(&openclContext))
	{
		prims->YUV420ToRGB_8u_P3AC4R = opencl_YUV420ToRGB_8u_P3AC4R;
		prims->YUV444ToRGB_8u_P3AC4R = opencl_YUV444ToRGB_8u_P3AC4R;
		prims->flags |= PRIM_FLAGS_HAVE_EXTGPU;
		prims->uninit = primitives_uninit_opencl;
	}

	return TRUE;
}
|
||||
#endif
|
||||
474
third_party/FreeRDP/libfreerdp/primitives/opencl/primitives.cl
vendored
Normal file
474
third_party/FreeRDP/libfreerdp/primitives/opencl/primitives.cl
vendored
Normal file
@@ -0,0 +1,474 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* Optimized operations using openCL
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* Copyright 2019 David Fort <contact@hardening-consulting.com>
|
||||
* Copyright 2019 Rangee Gmbh
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
/* Limit v to at most h, then to at least l, and return it as an unsigned byte. */
uchar clamp_uc(int v, short l, short h)
{
	const int capped = (v > h) ? h : v;
	const int bounded = (capped < l) ? l : capped;
	return (uchar)bounded;
}
|
||||
|
||||
/**
 * Fetch the chroma sample for pixel (x, y) from a full-resolution plane.
 *
 * For every pixel except (0, 0) the sample at (x, y) is returned as is. At
 * (0, 0) the value 4*U00 - U01 - U10 - U11 is computed from the 2x2
 * neighbourhood, clamped to 0..255, and returned instead of U00 when it
 * differs from U00 by 30 or more.
 *
 * The column offset x is part of every index; without it all pixels of a row
 * would read the sample of column 0.
 *
 * NOTE(review): the filter condition only matches pixel (0, 0). The intent may
 * have been "x and y both even" - confirm against the CPU implementation.
 * Widening the condition also needs bounds checks for the last row/column.
 */
short avgUV(__global const uchar* buf, unsigned stride, unsigned x, unsigned y)
{
	const short U00 = buf[y * stride + x];
	if ((x != 0) || (y != 0))
		return U00;
	const short U01 = buf[y * stride + x + 1];
	const short U10 = buf[(y + 1) * stride + x];
	const short U11 = buf[(y + 1) * stride + x + 1];
	const short avg = U00 * 4 - U01 - U10 - U11;
	const short avgU = clamp_uc(avg, 0, 255);
	const short diff = abs(U00 - avgU);
	if (diff < 30)
		return U00;
	return avgU;
}
|
||||
|
||||
/*
 * One work-item per pixel: YUV420 -> bytes R, G, B at offsets 0, 1, 2.
 * Offset 3 (alpha) is not written by this kernel.
 *
 *   R = (256*Y           + 403*(V-128)) >> 8
 *   G = (256*Y -  48*(U-128) - 120*(V-128)) >> 8
 *   B = (256*Y + 475*(U-128)          ) >> 8
 */
__kernel void yuv420_to_rgba_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
	const unsigned int col = get_global_id(0);
	const unsigned int row = get_global_id(1);

	/* Chroma planes are sampled at half resolution in both directions. */
	const short luma = bufY[row * strideY + col];
	const short cb = bufU[(row / 2) * strideU + (col / 2)] - 128;
	const short cr = bufV[(row / 2) * strideV + (col / 2)] - 128;

	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
	const int y256 = 256 * luma;

	pixel[0] = clamp_uc((y256 + (403 * cr)) >> 8, 0, 255);              /* R */
	pixel[1] = clamp_uc((y256 - (48 * cb) - (120 * cr)) >> 8, 0, 255); /* G */
	pixel[2] = clamp_uc((y256 + (475 * cb)) >> 8, 0, 255);              /* B */
}
|
||||
|
||||
/*
 * One work-item per pixel: YUV420 -> bytes B, G, R at offsets 1, 2, 3.
 * Offset 0 (alpha) is not written by this kernel.
 */
__kernel void yuv420_to_abgr_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
	const unsigned int col = get_global_id(0);
	const unsigned int row = get_global_id(1);

	/* Chroma planes are sampled at half resolution in both directions. */
	const short luma = bufY[row * strideY + col];
	const short cb = bufU[(row / 2) * strideU + (col / 2)] - 128;
	const short cr = bufV[(row / 2) * strideV + (col / 2)] - 128;

	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
	const int y256 = 256 * luma;

	pixel[1] = clamp_uc((y256 + (475 * cb)) >> 8, 0, 255);              /* B */
	pixel[2] = clamp_uc((y256 - (48 * cb) - (120 * cr)) >> 8, 0, 255); /* G */
	pixel[3] = clamp_uc((y256 + (403 * cr)) >> 8, 0, 255);              /* R */
}
|
||||
|
||||
/*
 * One work-item per pixel: YUV444 -> bytes B, G, R at offsets 1, 2, 3.
 * Offset 0 (alpha) is not written by this kernel.
 */
__kernel void yuv444_to_abgr_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
	const unsigned int col = get_global_id(0);
	const unsigned int row = get_global_id(1);

	const short luma = bufY[row * strideY + col];
	/* Centre the chroma samples around zero. */
	const short cb = avgUV(bufU, strideU, col, row) - 128;
	const short cr = avgUV(bufV, strideV, col, row) - 128;

	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
	const int y256 = 256 * luma;

	pixel[1] = clamp_uc((y256 + (475 * cb)) >> 8, 0, 255);              /* B */
	pixel[2] = clamp_uc((y256 - (48 * cb) - (120 * cr)) >> 8, 0, 255); /* G */
	pixel[3] = clamp_uc((y256 + (403 * cr)) >> 8, 0, 255);              /* R */
}
|
||||
|
||||
/*
 * One work-item per pixel: YUV444 -> bytes R, G, B at offsets 0, 1, 2.
 * Offset 3 (alpha) is not written by this kernel.
 */
__kernel void yuv444_to_rgba_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
	const unsigned int col = get_global_id(0);
	const unsigned int row = get_global_id(1);

	const short luma = bufY[row * strideY + col];
	/* Centre the chroma samples around zero. */
	const short cb = avgUV(bufU, strideU, col, row) - 128;
	const short cr = avgUV(bufV, strideV, col, row) - 128;

	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
	const int y256 = 256 * luma;

	pixel[0] = clamp_uc((y256 + (403 * cr)) >> 8, 0, 255);              /* R */
	pixel[1] = clamp_uc((y256 - (48 * cb) - (120 * cr)) >> 8, 0, 255); /* G */
	pixel[2] = clamp_uc((y256 + (475 * cb)) >> 8, 0, 255);              /* B */
}
|
||||
|
||||
/*
 * One work-item per pixel: YUV420 -> bytes R, G, B at offsets 0, 1, 2,
 * with offset 3 set to 0xff.
 */
__kernel void yuv420_to_rgbx_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
	const unsigned int col = get_global_id(0);
	const unsigned int row = get_global_id(1);

	/* Chroma planes are sampled at half resolution in both directions. */
	const short luma = bufY[row * strideY + col];
	const short cb = bufU[(row / 2) * strideU + (col / 2)] - 128;
	const short cr = bufV[(row / 2) * strideV + (col / 2)] - 128;

	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
	const int y256 = 256 * luma;

	pixel[0] = clamp_uc((y256 + (403 * cr)) >> 8, 0, 255);              /* R */
	pixel[1] = clamp_uc((y256 - (48 * cb) - (120 * cr)) >> 8, 0, 255); /* G */
	pixel[2] = clamp_uc((y256 + (475 * cb)) >> 8, 0, 255);              /* B */
	pixel[3] = 0xff;                                                    /* A */
}
|
||||
|
||||
/*
 * One work-item per pixel: YUV420 -> bytes B, G, R at offsets 1, 2, 3,
 * with offset 0 set to 0xff.
 */
__kernel void yuv420_to_xbgr_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
	const unsigned int col = get_global_id(0);
	const unsigned int row = get_global_id(1);

	/* Chroma planes are sampled at half resolution in both directions. */
	const short luma = bufY[row * strideY + col];
	const short cb = bufU[(row / 2) * strideU + (col / 2)] - 128;
	const short cr = bufV[(row / 2) * strideV + (col / 2)] - 128;

	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
	const int y256 = 256 * luma;

	pixel[0] = 0xff;                                                    /* A */
	pixel[1] = clamp_uc((y256 + (475 * cb)) >> 8, 0, 255);              /* B */
	pixel[2] = clamp_uc((y256 - (48 * cb) - (120 * cr)) >> 8, 0, 255); /* G */
	pixel[3] = clamp_uc((y256 + (403 * cr)) >> 8, 0, 255);              /* R */
}
|
||||
|
||||
/*
 * One work-item per pixel: YUV444 -> bytes B, G, R at offsets 1, 2, 3,
 * with offset 0 set to 0xff.
 *
 *   R = (256*Y           + 403*(V-128)) >> 8
 *   G = (256*Y -  48*(U-128) - 120*(V-128)) >> 8
 *   B = (256*Y + 475*(U-128)          ) >> 8
 */
__kernel void yuv444_to_xbgr_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
	unsigned int x = get_global_id(0);
	unsigned int y = get_global_id(1);

	short Y = bufY[y * strideY + x];
	short U = avgUV(bufU, strideU, x, y);
	short V = avgUV(bufV, strideV, x, y);
	/* Chroma centred around zero; all three channels must use D and E. */
	short D = U - 128;
	short E = V - 128;

	__global uchar* destPtr = dest + (strideDest * y) + (x * 4);

	int y256 = 256 * Y;
	destPtr[0] = 0xff;                                                  /* A */
	/* Blue uses the centred D, as green does here and the other 444 kernels do. */
	destPtr[1] = clamp_uc((y256 + (475 * D)) >> 8, 0, 255);             /* B */
	destPtr[2] = clamp_uc((y256 - (48 * D) - (120 * E)) >> 8, 0, 255); /* G */
	destPtr[3] = clamp_uc((y256 + (403 * E)) >> 8, 0, 255);             /* R */
}
|
||||
|
||||
/*
 * One work-item per pixel: YUV444 -> bytes R, G, B at offsets 0, 1, 2,
 * with offset 3 set to 0xff.
 */
__kernel void yuv444_to_rgbx_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
	const unsigned int col = get_global_id(0);
	const unsigned int row = get_global_id(1);

	const short luma = bufY[row * strideY + col];
	/* Centre the chroma samples around zero. */
	const short cb = avgUV(bufU, strideU, col, row) - 128;
	const short cr = avgUV(bufV, strideV, col, row) - 128;

	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
	const int y256 = 256 * luma;

	pixel[0] = clamp_uc((y256 + (403 * cr)) >> 8, 0, 255);              /* R */
	pixel[1] = clamp_uc((y256 - (48 * cb) - (120 * cr)) >> 8, 0, 255); /* G */
	pixel[2] = clamp_uc((y256 + (475 * cb)) >> 8, 0, 255);              /* B */
	pixel[3] = 0xff;                                                    /* A */
}
|
||||
|
||||
/*
 * One work-item per pixel: YUV420 -> bytes R, G, B at offsets 1, 2, 3.
 * Offset 0 (alpha) is not written by this kernel.
 */
__kernel void yuv420_to_argb_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
	const unsigned int col = get_global_id(0);
	const unsigned int row = get_global_id(1);

	/* Chroma planes are sampled at half resolution in both directions. */
	const short luma = bufY[row * strideY + col];
	const short cb = bufU[(row / 2) * strideU + (col / 2)] - 128;
	const short cr = bufV[(row / 2) * strideV + (col / 2)] - 128;

	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
	const int y256 = 256 * luma;

	pixel[1] = clamp_uc((y256 + (403 * cr)) >> 8, 0, 255);              /* R */
	pixel[2] = clamp_uc((y256 - (48 * cb) - (120 * cr)) >> 8, 0, 255); /* G */
	pixel[3] = clamp_uc((y256 + (475 * cb)) >> 8, 0, 255);              /* B */
}
|
||||
|
||||
/*
 * One work-item per pixel: YUV420 -> bytes B, G, R at offsets 0, 1, 2.
 * Offset 3 (alpha) is not written by this kernel.
 */
__kernel void yuv420_to_bgra_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
	const unsigned int col = get_global_id(0);
	const unsigned int row = get_global_id(1);

	/* Chroma planes are sampled at half resolution in both directions. */
	const short luma = bufY[row * strideY + col];
	const short cb = bufU[(row / 2) * strideU + (col / 2)] - 128;
	const short cr = bufV[(row / 2) * strideV + (col / 2)] - 128;

	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
	const int y256 = 256 * luma;

	pixel[0] = clamp_uc((y256 + (475 * cb)) >> 8, 0, 255);              /* B */
	pixel[1] = clamp_uc((y256 - (48 * cb) - (120 * cr)) >> 8, 0, 255); /* G */
	pixel[2] = clamp_uc((y256 + (403 * cr)) >> 8, 0, 255);              /* R */
}
|
||||
|
||||
/*
 * One work-item per pixel: YUV444 -> bytes B, G, R at offsets 0, 1, 2.
 * Offset 3 (alpha) is not written by this kernel.
 */
__kernel void yuv444_to_bgra_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
	const unsigned int col = get_global_id(0);
	const unsigned int row = get_global_id(1);

	const short luma = bufY[row * strideY + col];
	/* Centre the chroma samples around zero. */
	const short cb = avgUV(bufU, strideU, col, row) - 128;
	const short cr = avgUV(bufV, strideV, col, row) - 128;

	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
	const int y256 = 256 * luma;

	pixel[0] = clamp_uc((y256 + (475 * cb)) >> 8, 0, 255);              /* B */
	pixel[1] = clamp_uc((y256 - (48 * cb) - (120 * cr)) >> 8, 0, 255); /* G */
	pixel[2] = clamp_uc((y256 + (403 * cr)) >> 8, 0, 255);              /* R */
}
|
||||
|
||||
/*
 * One work-item per pixel: YUV444 -> bytes R, G, B at offsets 1, 2, 3.
 * Offset 0 (alpha) is not written by this kernel.
 */
__kernel void yuv444_to_argb_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
	const unsigned int col = get_global_id(0);
	const unsigned int row = get_global_id(1);

	const short luma = bufY[row * strideY + col];
	/* Centre the chroma samples around zero. */
	const short cb = avgUV(bufU, strideU, col, row) - 128;
	const short cr = avgUV(bufV, strideV, col, row) - 128;

	__global uchar* pixel = dest + (strideDest * row) + (col * 4);
	const int y256 = 256 * luma;

	pixel[3] = clamp_uc((y256 + (475 * cb)) >> 8, 0, 255);              /* B */
	pixel[2] = clamp_uc((y256 - (48 * cb) - (120 * cr)) >> 8, 0, 255); /* G */
	pixel[1] = clamp_uc((y256 + (403 * cr)) >> 8, 0, 255);              /* R */
}
|
||||
|
||||
/**
 * Per-pixel YUV 4:2:0 -> XRGB conversion. Each work-item converts the pixel
 * located at (get_global_id(0), get_global_id(1)).
 *
 * The chroma planes are indexed with (x / 2, y / 2), i.e. one U/V sample is
 * shared by a 2x2 group of luma samples. Byte 0 of the output pixel is set to
 * 0xff, followed by R, G and B.
 *
 * | R |   ( | 256     0    403 | |    Y    | )
 * | G | = ( | 256   -48   -120 | | U - 128 | ) >> 8
 * | B |   ( | 256   475      0 | | V - 128 | )
 */
__kernel void yuv420_to_xrgb_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);

    const short luma = bufY[row * strideY + col];
    const short cb = bufU[(row / 2) * strideU + (col / 2)] - 128;
    const short cr = bufV[(row / 2) * strideV + (col / 2)] - 128;
    const int luma256 = 256 * luma;

    __global uchar* px = dest + (strideDest * row) + (col * 4);

    px[0] = 0xff;                                                       /* A */
    px[1] = clamp_uc((luma256 + (403 * cr)) >> 8, 0, 255);              /* R */
    px[2] = clamp_uc((luma256 - (48 * cb) - (120 * cr)) >> 8, 0, 255);  /* G */
    px[3] = clamp_uc((luma256 + (475 * cb)) >> 8, 0, 255);              /* B */
}
|
||||
|
||||
/**
 * Per-pixel YUV 4:2:0 -> BGRX conversion. Each work-item converts the pixel
 * located at (get_global_id(0), get_global_id(1)).
 *
 * The chroma planes are indexed with (x / 2, y / 2), i.e. one U/V sample is
 * shared by a 2x2 group of luma samples. The output pixel is B, G, R followed
 * by a byte set to 0xff.
 *
 * | R |   ( | 256     0    403 | |    Y    | )
 * | G | = ( | 256   -48   -120 | | U - 128 | ) >> 8
 * | B |   ( | 256   475      0 | | V - 128 | )
 */
__kernel void yuv420_to_bgrx_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);

    const short luma = bufY[row * strideY + col];
    const short cb = bufU[(row / 2) * strideU + (col / 2)] - 128;
    const short cr = bufV[(row / 2) * strideV + (col / 2)] - 128;
    const int luma256 = 256 * luma;

    __global uchar* px = dest + (strideDest * row) + (col * 4);

    px[0] = clamp_uc((luma256 + (475 * cb)) >> 8, 0, 255);              /* B */
    px[1] = clamp_uc((luma256 - (48 * cb) - (120 * cr)) >> 8, 0, 255);  /* G */
    px[2] = clamp_uc((luma256 + (403 * cr)) >> 8, 0, 255);              /* R */
    px[3] = 0xff;                                                       /* A */
}
|
||||
|
||||
/**
 * Per-pixel YUV -> BGRX conversion. Each work-item converts the pixel located
 * at (get_global_id(0), get_global_id(1)).
 *
 * Chroma samples are fetched through avgUV() (defined elsewhere in this file).
 * The output pixel is B, G, R followed by a byte set to 0xff.
 *
 * | R |   ( | 256     0    403 | |    Y    | )
 * | G | = ( | 256   -48   -120 | | U - 128 | ) >> 8
 * | B |   ( | 256   475      0 | | V - 128 | )
 */
__kernel void yuv444_to_bgrx_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);

    const short luma = bufY[row * strideY + col];
    const short u = avgUV(bufU, strideU, col, row);
    const short v = avgUV(bufV, strideV, col, row);
    const short cb = u - 128; /* chroma re-centred around zero */
    const short cr = v - 128;
    const int luma256 = 256 * luma;

    __global uchar* px = dest + (strideDest * row) + (col * 4);

    px[0] = clamp_uc((luma256 + (475 * cb)) >> 8, 0, 255);              /* B */
    px[1] = clamp_uc((luma256 - (48 * cb) - (120 * cr)) >> 8, 0, 255);  /* G */
    px[2] = clamp_uc((luma256 + (403 * cr)) >> 8, 0, 255);              /* R */
    px[3] = 0xff;                                                       /* A */
}
|
||||
|
||||
/**
 * Per-pixel YUV -> XRGB conversion. Each work-item converts the pixel located
 * at (get_global_id(0), get_global_id(1)).
 *
 * Chroma samples are fetched through avgUV() (defined elsewhere in this file).
 * Byte 0 of the output pixel is set to 0xff, followed by R, G and B.
 *
 * | R |   ( | 256     0    403 | |    Y    | )
 * | G | = ( | 256   -48   -120 | | U - 128 | ) >> 8
 * | B |   ( | 256   475      0 | | V - 128 | )
 */
__kernel void yuv444_to_xrgb_1b(__global const uchar* bufY, unsigned strideY,
                                __global const uchar* bufU, unsigned strideU,
                                __global const uchar* bufV, unsigned strideV, __global uchar* dest,
                                unsigned strideDest)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);

    const short luma = bufY[row * strideY + col];
    const short u = avgUV(bufU, strideU, col, row);
    const short v = avgUV(bufV, strideV, col, row);
    const short cb = u - 128; /* chroma re-centred around zero */
    const short cr = v - 128;
    const int luma256 = 256 * luma;

    __global uchar* px = dest + (strideDest * row) + (col * 4);

    px[3] = clamp_uc((luma256 + (475 * cb)) >> 8, 0, 255);              /* B */
    px[2] = clamp_uc((luma256 - (48 * cb) - (120 * cr)) >> 8, 0, 255);  /* G */
    px[1] = clamp_uc((luma256 + (403 * cr)) >> 8, 0, 255);              /* R */
    px[0] = 0xff;                                                       /* A */
}
|
||||
11
third_party/FreeRDP/libfreerdp/primitives/opencl/primitives.h.in
vendored
Normal file
11
third_party/FreeRDP/libfreerdp/primitives/opencl/primitives.h.in
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
/* AUTOGENERATED file, do not edit
|
||||
*
|
||||
* part of @PROJECT_NAME@
|
||||
* generated from libfreerdp/primitives/opencl/primitives.h.in
|
||||
*
|
||||
* with file contents of @FILENAME@
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
/* Template placeholder: @FILEDATA@ is replaced when this file is generated.
 * The build script fills FILEDATA via file_to_hex_array() from @FILENAME@
 * (presumably substituted through configure_file - confirm in CMakeLists.txt). */
static const char openclProgram[] = { @FILEDATA@ };
|
||||
|
||||
82
third_party/FreeRDP/libfreerdp/primitives/prim_YCoCg.c
vendored
Normal file
82
third_party/FreeRDP/libfreerdp/primitives/prim_YCoCg.c
vendored
Normal file
@@ -0,0 +1,82 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* YCoCg<->RGB Color conversion operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_YCoCg.h"
|
||||
|
||||
/* helper function to convert raw 8 bit values to signed 16bit values.
|
||||
*/
|
||||
static INT16 convert(UINT8 raw, int shift)
|
||||
{
|
||||
const int cll = shift - 1; /* -1 builds in the /2's */
|
||||
return (INT16)((INT8)(raw << cll));
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Portable YCoCg -> RGB conversion.
 *
 * Every source pixel is 4 bytes, consumed in the order Cg, Co, Y, A. The two
 * chroma bytes go through convert() with the given shift; Y is used as is.
 * The result is emitted through the pixel writer selected for DstFormat.
 * When withAlpha is FALSE the alpha passed to the writer is forced to 0xFF.
 *
 * srcStep / dstStep are the byte distances between consecutive rows.
 * Always returns PRIMITIVES_SUCCESS.
 */
static pstatus_t general_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
                                            BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                            INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
                                            BOOL withAlpha)
{
    const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
    fkt_writePixel writePixel = getPixelWriteFunction(DstFormat, TRUE);

    for (size_t row = 0; row < height; row++)
    {
        const BYTE* src = &pSrc[row * WINPR_ASSERTING_INT_CAST(uint32_t, srcStep)];
        BYTE* dst = &pDst[row * WINPR_ASSERTING_INT_CAST(uint32_t, dstStep)];

        for (size_t col = 0; col < width; col++, src += 4)
        {
            /* Note: shifts must be done before sign-conversion. */
            const INT16 Cg = convert(src[0], shift);
            const INT16 Co = convert(src[1], shift);
            const INT16 Y = src[2]; /* UINT8->INT16 */
            const BYTE A = withAlpha ? src[3] : 0xFFU;

            const INT16 T = (INT16)(Y - Cg);
            const INT16 R = (INT16)(T - Co);
            const INT16 G = (INT16)(Y + Cg);
            const INT16 B = (INT16)(T + Co);

            dst = writePixel(dst, formatSize, DstFormat, CLIP(R), CLIP(G), CLIP(B), A);
        }
    }

    return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Store the portable C implementation in prims->YCoCgToRGB_8u_AC4R. */
void primitives_init_YCoCg(primitives_t* WINPR_RESTRICT prims)
{
    prims->YCoCgToRGB_8u_AC4R = general_YCoCgToRGB_8u_AC4R;
}
|
||||
|
||||
/* Set up the YCoCg entry: first the portable default, then the SSSE3 and NEON
 * initializers, each of which checks CPU support itself before doing anything.
 * The call order is significant: later initializers run after earlier ones.
 */
void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims)
{
    /* baseline */
    primitives_init_YCoCg(prims);

    /* CPU specific variants */
    primitives_init_YCoCg_ssse3(prims);
    primitives_init_YCoCg_neon(prims);
}
|
||||
53
third_party/FreeRDP/libfreerdp/primitives/prim_YCoCg.h
vendored
Normal file
53
third_party/FreeRDP/libfreerdp/primitives/prim_YCoCg.h
vendored
Normal file
@@ -0,0 +1,53 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
 * Primitives YCoCg
|
||||
*
|
||||
* Copyright 2024 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2024 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_YCoCg_H
|
||||
#define FREERDP_LIB_PRIM_YCoCg_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
|
||||
/* Implemented elsewhere (not in this header). */
FREERDP_LOCAL void primitives_init_YCoCg_ssse3_int(primitives_t* WINPR_RESTRICT prims);

/* Call primitives_init_YCoCg_ssse3_int() only when the CPU reports SSSE3, SSE2
 * and SSE3 support; otherwise leave prims untouched. */
static inline void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prims)
{
    if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
        IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
        IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
    {
        primitives_init_YCoCg_ssse3_int(prims);
    }
}
|
||||
|
||||
/* Implemented elsewhere (not in this header). */
FREERDP_LOCAL void primitives_init_YCoCg_neon_int(primitives_t* WINPR_RESTRICT prims);

/* Call primitives_init_YCoCg_neon_int() only when the CPU reports ARM NEON
 * support; otherwise leave prims untouched. */
static inline void primitives_init_YCoCg_neon(primitives_t* WINPR_RESTRICT prims)
{
    if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
        primitives_init_YCoCg_neon_int(prims);
}
|
||||
|
||||
#endif
|
||||
2323
third_party/FreeRDP/libfreerdp/primitives/prim_YUV.c
vendored
Normal file
2323
third_party/FreeRDP/libfreerdp/primitives/prim_YUV.c
vendored
Normal file
File diff suppressed because it is too large
Load Diff
51
third_party/FreeRDP/libfreerdp/primitives/prim_YUV.h
vendored
Normal file
51
third_party/FreeRDP/libfreerdp/primitives/prim_YUV.h
vendored
Normal file
@@ -0,0 +1,51 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
 * Primitives YUV
|
||||
*
|
||||
* Copyright 2024 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2024 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_YUV_H
|
||||
#define FREERDP_LIB_PRIM_YUV_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
|
||||
/* Implemented elsewhere (not in this header). */
FREERDP_LOCAL void primitives_init_YUV_sse41_int(primitives_t* WINPR_RESTRICT prims);

/* Call primitives_init_YUV_sse41_int() only when both SSE4.1 feature queries
 * succeed; otherwise leave prims untouched. */
static inline void primitives_init_YUV_sse41(primitives_t* WINPR_RESTRICT prims)
{
    if (IsProcessorFeaturePresentEx(PF_EX_SSE41) &&
        IsProcessorFeaturePresent(PF_SSE4_1_INSTRUCTIONS_AVAILABLE))
    {
        primitives_init_YUV_sse41_int(prims);
    }
}
|
||||
|
||||
/* Implemented elsewhere (not in this header). */
FREERDP_LOCAL void primitives_init_YUV_neon_int(primitives_t* WINPR_RESTRICT prims);

/* Call primitives_init_YUV_neon_int() only when the CPU reports ARM NEON
 * support; otherwise leave prims untouched. */
static inline void primitives_init_YUV_neon(primitives_t* WINPR_RESTRICT prims)
{
    if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
        primitives_init_YUV_neon_int(prims);
}
|
||||
|
||||
#endif
|
||||
83
third_party/FreeRDP/libfreerdp/primitives/prim_add.c
vendored
Normal file
83
third_party/FreeRDP/libfreerdp/primitives/prim_add.c
vendored
Normal file
@@ -0,0 +1,83 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Add operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_add.h"
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* 16-bit signed add with saturation (under and over).
|
||||
*/
|
||||
static inline INT16 add(INT16 a, INT16 b)
|
||||
{
|
||||
INT32 k = (INT32)a + (INT32)b;
|
||||
|
||||
if (k > INT16_MAX)
|
||||
return INT16_MAX;
|
||||
|
||||
if (k < INT16_MIN)
|
||||
return INT16_MIN;
|
||||
|
||||
return (INT16)k;
|
||||
}
|
||||
|
||||
/* Element-wise saturating addition: pDst[i] = add(pSrc1[i], pSrc2[i]) for
 * i in [0, len). Always returns PRIMITIVES_SUCCESS.
 */
static pstatus_t general_add_16s(const INT16* WINPR_RESTRICT pSrc1,
                                 const INT16* WINPR_RESTRICT pSrc2, INT16* WINPR_RESTRICT pDst,
                                 UINT32 len)
{
    /* One pass over all len elements, in ascending order. */
    for (UINT32 i = 0; i < len; i++)
        pDst[i] = add(pSrc1[i], pSrc2[i]);

    return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* In-place saturating addition: for every i in [0, len) the saturated sum of
 * pSrcDst1[i] and pSrcDst2[i] is written back to BOTH buffers.
 * Always returns PRIMITIVES_SUCCESS.
 */
static pstatus_t general_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
                                         INT16* WINPR_RESTRICT pSrcDst2, UINT32 len)
{
    INT16* lhs = pSrcDst1;
    INT16* rhs = pSrcDst2;

    for (UINT32 remaining = len; remaining > 0; remaining--, lhs++, rhs++)
    {
        const INT16 sum = add(*lhs, *rhs);
        *lhs = sum;
        *rhs = sum;
    }

    return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Store the portable C add routines in prims (add_16s and add_16s_inplace). */
void primitives_init_add(primitives_t* WINPR_RESTRICT prims)
{
    prims->add_16s_inplace = general_add_16s_inplace;
    prims->add_16s = general_add_16s;
}
|
||||
|
||||
/* Set up the add entries: portable defaults first, then the SSE3 initializer,
 * which checks CPU support itself before doing anything. */
void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims)
{
    /* baseline */
    primitives_init_add(prims);

    /* CPU specific variant */
    primitives_init_add_sse3(prims);
}
|
||||
42
third_party/FreeRDP/libfreerdp/primitives/prim_add.h
vendored
Normal file
42
third_party/FreeRDP/libfreerdp/primitives/prim_add.h
vendored
Normal file
@@ -0,0 +1,42 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
 * Primitives add
|
||||
*
|
||||
* Copyright 2024 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2024 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_ADD_H
|
||||
#define FREERDP_LIB_PRIM_ADD_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
|
||||
/* Implemented elsewhere (not in this header). */
FREERDP_LOCAL void primitives_init_add_sse3_int(primitives_t* WINPR_RESTRICT prims);

/* Call primitives_init_add_sse3_int() only when the CPU reports SSE2 and SSE3
 * (SSE3 is needed for LDDQU); otherwise leave prims untouched. */
static inline void primitives_init_add_sse3(primitives_t* WINPR_RESTRICT prims)
{
    if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
        IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
    {
        primitives_init_add_sse3_int(prims);
    }
}
|
||||
|
||||
#endif
|
||||
98
third_party/FreeRDP/libfreerdp/primitives/prim_alphaComp.c
vendored
Normal file
98
third_party/FreeRDP/libfreerdp/primitives/prim_alphaComp.c
vendored
Normal file
@@ -0,0 +1,98 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Alpha blending routines.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*
|
||||
* Note: this code assumes the second operand is fully opaque,
|
||||
* e.g.
|
||||
* newval = alpha1*val1 + (1-alpha1)*val2
|
||||
* rather than
|
||||
* newval = alpha1*val1 + (1-alpha1)*alpha2*val2
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_alphaComp.h"
|
||||
|
||||
#define ALPHA(_k_) (((_k_)&0xFF000000U) >> 24)
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Blend 32-bit pixels of pSrc1 over pSrc2 into pDst, row by row.
 *
 * The blend factor is the top byte of the src1 pixel (see ALPHA) plus one:
 *   - 256 (alpha 255): the src1 pixel is copied,
 *   - 1   (alpha 0)  : the src2 pixel is copied,
 *   - otherwise      : each channel becomes src2 + ((src1 - src2) * alpha >> 8).
 *
 * The *Step arguments are byte distances between rows.
 * Always returns PRIMITIVES_SUCCESS.
 */
static pstatus_t general_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
                                        const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
                                        BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width,
                                        UINT32 height)
{
    for (size_t row = 0; row < height; row++)
    {
        const UINT32* top = (const UINT32*)(pSrc1 + row * src1Step);
        const UINT32* bottom = (const UINT32*)(pSrc2 + row * src2Step);
        UINT32* out = (UINT32*)(pDst + row * dstStep);

        for (size_t col = 0; col < width; col++)
        {
            const UINT32 src1 = top[col];
            const UINT32 src2 = bottom[col];
            const UINT32 alpha = ALPHA(src1) + 1;

            if (alpha == 256)
            {
                /* If alpha is 255+1, just copy src1. */
                out[col] = src1;
                continue;
            }

            if (alpha <= 1)
            {
                /* If alpha is 0+1, just copy src2. */
                out[col] = src2;
                continue;
            }

            /* A perfectly accurate blend would do (a*src + (255-a)*dst)/255
             * rather than adding one to alpha and dividing by 256, but this
             * is much faster and only differs by one 16% of the time.
             * Two channels are processed per multiplication: the pixel is
             * split into a red/blue half and an alpha/green half, each
             * holding its two channels in separate 16-bit lanes.
             */
            const UINT32 s1rb = src1 & 0x00FF00FFU;
            const UINT32 s1ag = (src1 >> 8) & 0x00FF00FFU;
            const UINT32 s2rb = src2 & 0x00FF00FFU;
            const UINT32 s2ag = (src2 >> 8) & 0x00FF00FFU;

            const UINT32 drb = (s1rb - s2rb) * alpha;
            const UINT32 dag = (s1ag - s2ag) * alpha;

            const UINT32 rb = ((drb >> 8) + s2rb) & 0x00FF00FFU;
            const UINT32 ag = (((dag >> 8) + s2ag) << 8) & 0xFF00FF00U;

            out[col] = rb | ag;
        }
    }

    return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Store the portable C implementation in prims->alphaComp_argb. */
void primitives_init_alphaComp(primitives_t* WINPR_RESTRICT prims)
{
    prims->alphaComp_argb = general_alphaComp_argb;
}
|
||||
|
||||
/* Set up the alpha composition entry: portable default first, then the SSE3
 * initializer, which checks CPU support itself before doing anything. */
void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims)
{
    /* baseline */
    primitives_init_alphaComp(prims);

    /* CPU specific variant */
    primitives_init_alphaComp_sse3(prims);
}
|
||||
42
third_party/FreeRDP/libfreerdp/primitives/prim_alphaComp.h
vendored
Normal file
42
third_party/FreeRDP/libfreerdp/primitives/prim_alphaComp.h
vendored
Normal file
@@ -0,0 +1,42 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
 * Primitives alpha composition
|
||||
*
|
||||
* Copyright 2024 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2024 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_ALPHA_COMP_H
|
||||
#define FREERDP_LIB_PRIM_ALPHA_COMP_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
|
||||
/* Implemented elsewhere (not in this header). */
FREERDP_LOCAL void primitives_init_alphaComp_sse3_int(primitives_t* WINPR_RESTRICT prims);

/* Call primitives_init_alphaComp_sse3_int() only when the CPU reports SSE2 and
 * SSE3 (SSE3 is needed for LDDQU); otherwise leave prims untouched. */
static inline void primitives_init_alphaComp_sse3(primitives_t* WINPR_RESTRICT prims)
{
    if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
        IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
    {
        primitives_init_alphaComp_sse3_int(prims);
    }
}
|
||||
|
||||
#endif
|
||||
66
third_party/FreeRDP/libfreerdp/primitives/prim_andor.c
vendored
Normal file
66
third_party/FreeRDP/libfreerdp/primitives/prim_andor.c
vendored
Normal file
@@ -0,0 +1,66 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Logical operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_andor.h"
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* 32-bit AND with a constant.
|
||||
*/
|
||||
static pstatus_t general_andC_32u(const UINT32* WINPR_RESTRICT pSrc, UINT32 val,
|
||||
UINT32* WINPR_RESTRICT pDst, INT32 len)
|
||||
{
|
||||
if (val == 0)
|
||||
return PRIMITIVES_SUCCESS;
|
||||
|
||||
while (len--)
|
||||
*pDst++ = *pSrc++ & val;
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* 32-bit OR with a constant.
|
||||
*/
|
||||
static pstatus_t general_orC_32u(const UINT32* WINPR_RESTRICT pSrc, UINT32 val,
|
||||
UINT32* WINPR_RESTRICT pDst, INT32 len)
|
||||
{
|
||||
if (val == 0)
|
||||
return PRIMITIVES_SUCCESS;
|
||||
|
||||
while (len--)
|
||||
*pDst++ = *pSrc++ | val;
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Store the portable C implementations in prims->andC_32u and prims->orC_32u. */
void primitives_init_andor(primitives_t* WINPR_RESTRICT prims)
{
    /* Defaults; may be replaced afterwards by an optimized initializer. */
    prims->orC_32u = general_orC_32u;
    prims->andC_32u = general_andC_32u;
}
|
||||
|
||||
/* Set up the and/or entries: portable defaults first, then the SSE3
 * initializer, which checks CPU support itself before doing anything. */
void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims)
{
    /* baseline */
    primitives_init_andor(prims);

    /* CPU specific variant */
    primitives_init_andor_sse3(prims);
}
|
||||
42
third_party/FreeRDP/libfreerdp/primitives/prim_andor.h
vendored
Normal file
42
third_party/FreeRDP/libfreerdp/primitives/prim_andor.h
vendored
Normal file
@@ -0,0 +1,42 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
 * Primitives and/or
|
||||
*
|
||||
* Copyright 2024 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2024 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_ANDOR_H
|
||||
#define FREERDP_LIB_PRIM_ANDOR_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
|
||||
/* Implemented elsewhere (not in this header). */
FREERDP_LOCAL void primitives_init_andor_sse3_int(primitives_t* WINPR_RESTRICT prims);

/* Call primitives_init_andor_sse3_int() only when the CPU reports SSE2 and
 * SSE3 support; otherwise leave prims untouched. */
static inline void primitives_init_andor_sse3(primitives_t* WINPR_RESTRICT prims)
{
    if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
        IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
    {
        primitives_init_andor_sse3_int(prims);
    }
}
|
||||
|
||||
#endif
|
||||
576
third_party/FreeRDP/libfreerdp/primitives/prim_colors.c
vendored
Normal file
576
third_party/FreeRDP/libfreerdp/primitives/prim_colors.c
vendored
Normal file
@@ -0,0 +1,576 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Color conversion operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* Copyright 2011 Stephen Erisman
|
||||
* Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
|
||||
* Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#include <freerdp/config.h>
|
||||
#include <winpr/assert.h>
|
||||
#include <winpr/cast.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <freerdp/codec/color.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_colors.h"
|
||||
|
||||
#ifndef MINMAX
|
||||
#define MINMAX(_v_, _l_, _h_) ((_v_) < (_l_) ? (_l_) : ((_v_) > (_h_) ? (_h_) : (_v_)))
|
||||
#endif /* !MINMAX */
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
/* pregenerated table for ycbcr constants: [0,30]
|
||||
*
|
||||
* rounded integer values derived from the following formula:
|
||||
*
|
||||
* { (1.402525f * 2^divisor), (0.714401f * 2^divisor), (0.343730f * 2^divisor), (1.769905f *
|
||||
* 2^divisor) }
|
||||
*/
|
||||
|
||||
/* Row n holds the four conversion factors scaled by 2^n (n = 0..30), in the
 * column order { 1.402525, 0.714401, 0.343730, 1.769905 }.
 */
static const INT32 ycbcr_constants[][4] = {
    { 1, 1, 0, 2 },                                 /*  0 */
    { 3, 1, 1, 4 },                                 /*  1 */
    { 6, 3, 1, 7 },                                 /*  2 */
    { 11, 6, 3, 14 },                               /*  3 */
    { 22, 11, 5, 28 },                              /*  4 */
    { 45, 23, 11, 57 },                             /*  5 */
    { 90, 46, 22, 113 },                            /*  6 */
    { 180, 91, 44, 227 },                           /*  7 */
    { 359, 183, 88, 453 },                          /*  8 */
    { 718, 366, 176, 906 },                         /*  9 */
    { 1436, 732, 352, 1812 },                       /* 10 */
    { 2872, 1463, 704, 3625 },                      /* 11 */
    { 5745, 2926, 1408, 7250 },                     /* 12 */
    { 11489, 5852, 2816, 14499 },                   /* 13 */
    { 22979, 11705, 5632, 28998 },                  /* 14 */
    { 45958, 23409, 11263, 57996 },                 /* 15 */
    { 91916, 46819, 22527, 115992 },                /* 16 */
    { 183832, 93638, 45053, 231985 },               /* 17 */
    { 367664, 187276, 90107, 463970 },              /* 18 */
    { 735327, 374552, 180214, 927940 },             /* 19 */
    { 1470654, 749104, 360427, 1855880 },           /* 20 */
    { 2941308, 1498207, 720854, 3711760 },          /* 21 */
    { 5882616, 2996415, 1441708, 7423520 },         /* 22 */
    { 11765232, 5992830, 2883416, 14847039 },       /* 23 */
    { 23530465, 11985660, 5766832, 29694078 },      /* 24 */
    { 47060930, 23971320, 11533665, 59388157 },     /* 25 */
    { 94121859, 47942640, 23067330, 118776314 },    /* 26 */
    { 188243719, 95885279, 46134660, 237552628 },   /* 27 */
    { 376487438, 191770558, 92269319, 475105256 },  /* 28 */
    { 752974876, 383541116, 184538639, 950210512 }, /* 29 */
    { 1505949752, 767082233, 369077277, 1900421023 } /* 30 */
};
|
||||
|
||||
/* Convert three planes of 16-bit Y/Cb/Cr samples (pSrc[0..2]) into pixels
 * written through writePixelBGRX().
 *
 * Fixed-point math with 16 fractional bits: Y is biased by 4096 and scaled
 * up, the chroma terms use row 16 of ycbcr_constants, and the sums are shifted
 * back down by 16 and then by a further 5 bits before clipping.
 *
 * srcStep / dstStep are row strides in bytes. The row padding is derived from
 * them assuming 2 bytes per source sample and 4 bytes per destination pixel.
 * The alpha argument handed to the pixel writer is 0.
 * Always returns PRIMITIVES_SUCCESS.
 */
static pstatus_t general_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3],
                                                      UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                      UINT32 dstStep, UINT32 DstFormat,
                                                      const prim_size_t* WINPR_RESTRICT roi)
{
    const INT32 divisor = 16;
    const size_t srcPad = (srcStep - (roi->width * 2)) / 2;
    const size_t dstPad = (dstStep - (roi->width * 4));
    const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);

    const INT16* yRow = pSrc[0];
    const INT16* cbRow = pSrc[1];
    const INT16* crRow = pSrc[2];
    BYTE* out = pDst;

    for (UINT32 row = 0; row < roi->height; row++)
    {
        for (UINT32 col = 0; col < roi->width; col++)
        {
            const INT32 Y = (INT32)((UINT32)(yRow[col] + 4096) << divisor);
            const INT32 Cb = cbRow[col];
            const INT32 Cr = crRow[col];

            /* chroma contributions, each factor pre-scaled by 2^divisor */
            const INT32 CrR = WINPR_ASSERTING_INT_CAST(
                int32_t, Cr * ycbcr_constants[divisor][0]); /* 1.402525f * 2^divisor */
            const INT32 CrG = WINPR_ASSERTING_INT_CAST(
                int32_t, Cr * ycbcr_constants[divisor][1]); /* 0.714401f * 2^divisor */
            const INT32 CbG = WINPR_ASSERTING_INT_CAST(
                int32_t, Cb * ycbcr_constants[divisor][2]); /* 0.343730f * 2^divisor */
            const INT32 CbB = WINPR_ASSERTING_INT_CAST(
                int32_t, Cb * ycbcr_constants[divisor][3]); /* 1.769905f * 2^divisor */

            const INT16 R = WINPR_ASSERTING_INT_CAST(int16_t, ((CrR + Y) >> divisor) >> 5);
            const INT16 G = WINPR_ASSERTING_INT_CAST(int16_t, ((Y - CbG - CrG) >> divisor) >> 5);
            const INT16 B = WINPR_ASSERTING_INT_CAST(int16_t, ((CbB + Y) >> divisor) >> 5);

            out = writePixelBGRX(out, formatSize, DstFormat, CLIP(R), CLIP(G), CLIP(B), 0);
        }

        /* advance past the pixels just read plus the per-row padding */
        yRow += roi->width + srcPad;
        cbRow += roi->width + srcPad;
        crRow += roi->width + srcPad;
        out += dstPad;
    }

    return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* Convert planar 16-bit Y/Cb/Cr input to interleaved pixels of an arbitrary
 * destination format, using the pixel writer selected by
 * getPixelWriteFunction(DstFormat, FALSE).
 *
 * pSrc     three planes: Y, Cb, Cr
 * srcStep  bytes between source rows
 * pDst     destination buffer
 * dstStep  bytes between destination rows
 * roi      width/height of the region to convert
 *
 * Always returns PRIMITIVES_SUCCESS.
 */
static pstatus_t general_yCbCrToRGB_16s8u_P3AC4R_general(const INT16* WINPR_RESTRICT pSrc[3],
                                                         UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                         UINT32 dstStep, UINT32 DstFormat,
                                                         const prim_size_t* WINPR_RESTRICT roi)
{
	const INT16* pY = pSrc[0];
	const INT16* pCb = pSrc[1];
	const INT16* pCr = pSrc[2];
	/* INT16 elements to skip at the end of every source row. */
	const size_t srcPad = (srcStep - (roi->width * 2)) / 2;
	const fkt_writePixel writePixel = getPixelWriteFunction(DstFormat, FALSE);
	const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
	/* Fixed-point precision; also selects the coefficient row. */
	const INT32 divisor = 16;

	for (UINT32 y = 0; y < roi->height; y++)
	{
		/* Address every destination row from dstStep instead of adding a
		 * padding computed with a hard-coded 4 bytes per pixel: the writer is
		 * chosen per format and receives formatSize, so the number of bytes it
		 * advances per pixel is not necessarily 4. */
		BYTE* pRGB = &pDst[1ULL * y * dstStep];

		for (UINT32 x = 0; x < roi->width; x++)
		{
			const INT32 Y = (INT32)((UINT32)((*pY++) + 4096) << divisor);
			const INT32 Cb = (*pCb++);
			const INT32 Cr = (*pCr++);
			const INT32 CrR = Cr * ycbcr_constants[divisor][0];
			const INT32 CrG = Cr * ycbcr_constants[divisor][1];
			const INT32 CbG = Cb * ycbcr_constants[divisor][2];
			const INT32 CbB = Cb * ycbcr_constants[divisor][3];
			/* Drop the fixed-point scale and the << 5 input scale in one shift. */
			const INT32 R = (CrR + Y) >> (divisor + 5);
			const INT32 G = (Y - CbG - CrG) >> (divisor + 5);
			const INT32 B = (CbB + Y) >> (divisor + 5);
			pRGB = writePixel(pRGB, formatSize, DstFormat, CLIP(R), CLIP(G), CLIP(B), 0);
		}

		pY += srcPad;
		pCb += srcPad;
		pCr += srcPad;
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* Entry point for YCbCr -> interleaved conversion: BGRA32/BGRX32 destinations
 * take the dedicated BGRX routine, every other format the generic one.
 * Returns whatever the selected routine returns.
 */
static pstatus_t general_yCbCrToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3],
                                                 UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                 UINT32 dstStep, UINT32 DstFormat,
                                                 const prim_size_t* WINPR_RESTRICT roi)
{
	if ((DstFormat == PIXEL_FORMAT_BGRA32) || (DstFormat == PIXEL_FORMAT_BGRX32))
		return general_yCbCrToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, DstFormat,
		                                            roi);

	return general_yCbCrToRGB_16s8u_P3AC4R_general(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
static pstatus_t
|
||||
general_yCbCrToRGB_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], INT32 srcStep,
|
||||
INT16* WINPR_RESTRICT pDst[3], INT32 dstStep,
|
||||
const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
|
||||
{
|
||||
/**
|
||||
* The decoded YCbCr coeffectients are represented as 11.5 fixed-point
|
||||
* numbers:
|
||||
*
|
||||
* 1 sign bit + 10 integer bits + 5 fractional bits
|
||||
*
|
||||
* However only 7 integer bits will be actually used since the value range
|
||||
* is [-128.0, 127.0]. In other words, the decoded coefficients are scaled
|
||||
* by << 5 when interpreted as INT16.
|
||||
* It was scaled in the quantization phase, so we must scale it back here.
|
||||
*/
|
||||
const INT16* yptr = pSrc[0];
|
||||
const INT16* cbptr = pSrc[1];
|
||||
const INT16* crptr = pSrc[2];
|
||||
INT16* rptr = pDst[0];
|
||||
INT16* gptr = pDst[1];
|
||||
INT16* bptr = pDst[2];
|
||||
UINT32 srcbump = (WINPR_ASSERTING_INT_CAST(uint32_t, srcStep) - (roi->width * sizeof(UINT16))) /
|
||||
sizeof(UINT16);
|
||||
UINT32 dstbump = (WINPR_ASSERTING_INT_CAST(uint32_t, dstStep) - (roi->width * sizeof(UINT16))) /
|
||||
sizeof(UINT16);
|
||||
|
||||
for (UINT32 y = 0; y < roi->height; y++)
|
||||
{
|
||||
for (UINT32 x = 0; x < roi->width; ++x)
|
||||
{
|
||||
/* INT32 is used intentionally because we calculate
|
||||
* with shifted factors!
|
||||
*/
|
||||
INT32 cy = (INT32)(*yptr++);
|
||||
INT32 cb = (INT32)(*cbptr++);
|
||||
INT32 cr = (INT32)(*crptr++);
|
||||
INT64 r = 0;
|
||||
INT64 g = 0;
|
||||
INT64 b = 0;
|
||||
/*
|
||||
* This is the slow floating point version kept here for reference.
|
||||
* y = y + 4096; // 128<<5=4096 so that we can scale the sum by>>5
|
||||
* r = y + cr*1.403f;
|
||||
* g = y - cb*0.344f - cr*0.714f;
|
||||
* b = y + cb*1.770f;
|
||||
* y_r_buf[i] = CLIP(r>>5);
|
||||
* cb_g_buf[i] = CLIP(g>>5);
|
||||
* cr_b_buf[i] = CLIP(b>>5);
|
||||
*/
|
||||
/*
|
||||
* We scale the factors by << 16 into 32-bit integers in order to
|
||||
* avoid slower floating point multiplications. Since the final
|
||||
* result needs to be scaled by >> 5 we will extract only the
|
||||
* upper 11 bits (>> 21) from the final sum.
|
||||
* Hence we also have to scale the other terms of the sum by << 16.
|
||||
* R: 1.403 << 16 = 91947
|
||||
* G: 0.344 << 16 = 22544, 0.714 << 16 = 46792
|
||||
* B: 1.770 << 16 = 115998
|
||||
*/
|
||||
cy = (INT32)((UINT32)(cy + 4096) << 16);
|
||||
|
||||
r = 1LL * cy + 1LL * cr * ycbcr_constants[16][0];
|
||||
g = 1LL * cy - 1LL * cb * ycbcr_constants[16][1] - 1LL * cr * ycbcr_constants[16][2];
|
||||
b = 1LL * cy + 1LL * cb * ycbcr_constants[16][3];
|
||||
*rptr++ = CLIP(r >> 21);
|
||||
*gptr++ = CLIP(g >> 21);
|
||||
*bptr++ = CLIP(b >> 21);
|
||||
}
|
||||
|
||||
yptr += srcbump;
|
||||
cbptr += srcbump;
|
||||
crptr += srcbump;
|
||||
rptr += dstbump;
|
||||
gptr += dstbump;
|
||||
bptr += dstbump;
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static pstatus_t
|
||||
general_RGBToYCbCr_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], INT32 srcStep,
|
||||
INT16* WINPR_RESTRICT pDst[3], INT32 dstStep,
|
||||
const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
|
||||
{
|
||||
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
|
||||
* numbers:
|
||||
*
|
||||
* 1 sign bit + 10 integer bits + 5 fractional bits
|
||||
*
|
||||
* However only 7 integer bits will be actually used since the value
|
||||
* range is [-128.0, 127.0]. In other words, the encoded coefficients
|
||||
* is scaled by << 5 when interpreted as INT16.
|
||||
* It will be scaled down to original during the quantization phase.
|
||||
*/
|
||||
const INT16* rptr = pSrc[0];
|
||||
const INT16* gptr = pSrc[1];
|
||||
const INT16* bptr = pSrc[2];
|
||||
INT16* yptr = pDst[0];
|
||||
INT16* cbptr = pDst[1];
|
||||
INT16* crptr = pDst[2];
|
||||
UINT32 srcbump = (WINPR_ASSERTING_INT_CAST(uint32_t, srcStep) - (roi->width * sizeof(UINT16))) /
|
||||
sizeof(UINT16);
|
||||
UINT32 dstbump = (WINPR_ASSERTING_INT_CAST(uint32_t, dstStep) - (roi->width * sizeof(UINT16))) /
|
||||
sizeof(UINT16);
|
||||
|
||||
for (UINT32 y = 0; y < roi->height; y++)
|
||||
{
|
||||
for (UINT32 x = 0; x < roi->width; ++x)
|
||||
{
|
||||
/* INT32 is used intentionally because we calculate with
|
||||
* shifted factors!
|
||||
*/
|
||||
INT32 r = (INT32)(*rptr++);
|
||||
INT32 g = (INT32)(*gptr++);
|
||||
INT32 b = (INT32)(*bptr++);
|
||||
/* We scale the factors by << 15 into 32-bit integers in order
|
||||
* to avoid slower floating point multiplications. Since the
|
||||
* terms need to be scaled by << 5 we simply scale the final
|
||||
* sum by >> 10
|
||||
*
|
||||
* Y: 0.299000 << 15 = 9798, 0.587000 << 15 = 19235,
|
||||
* 0.114000 << 15 = 3735
|
||||
* Cb: 0.168935 << 15 = 5535, 0.331665 << 15 = 10868,
|
||||
* 0.500590 << 15 = 16403
|
||||
* Cr: 0.499813 << 15 = 16377, 0.418531 << 15 = 13714,
|
||||
* 0.081282 << 15 = 2663
|
||||
*/
|
||||
INT32 cy = (r * 9798 + g * 19235 + b * 3735) >> 10;
|
||||
INT32 cb = (r * -5535 + g * -10868 + b * 16403) >> 10;
|
||||
INT32 cr = (r * 16377 + g * -13714 + b * -2663) >> 10;
|
||||
*yptr++ = (INT16)MINMAX(cy - 4096, -4096, 4095);
|
||||
*cbptr++ = (INT16)MINMAX(cb, -4096, 4095);
|
||||
*crptr++ = (INT16)MINMAX(cr, -4096, 4095);
|
||||
}
|
||||
|
||||
yptr += srcbump;
|
||||
cbptr += srcbump;
|
||||
crptr += srcbump;
|
||||
rptr += dstbump;
|
||||
gptr += dstbump;
|
||||
bptr += dstbump;
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* Write one scanline of planar 16-bit R/G/B samples to dst using the pixel
 * writer selected by getPixelWriteFunction(DstFormat, FALSE).
 *
 * dst         first destination byte of the scanline
 * formatSize  forwarded unchanged to the pixel writer
 * DstFormat   destination pixel format
 * r, g, b     source planes, at least width samples each
 * width       number of pixels to write
 */
static inline void writeScanlineGeneric(BYTE* dst, DWORD formatSize, UINT32 DstFormat,
                                        const INT16* r, const INT16* g, const INT16* b, DWORD width)
{
	fkt_writePixel writePixel = getPixelWriteFunction(DstFormat, FALSE);

	for (UINT32 x = 0; x < width; x++)
	{
		/* Saturate with CLIP like every format-specific scanline writer in
		 * this file, instead of an asserting narrowing cast that rejects
		 * samples outside the 8-bit range. */
		const BYTE R = CLIP(*r++);
		const BYTE G = CLIP(*g++);
		const BYTE B = CLIP(*b++);

		dst = writePixel(dst, formatSize, DstFormat, R, G, B, 0);
	}
}
|
||||
|
||||
/* Write one scanline as packed 3-byte pixels in R, G, B byte order.
 * Each 16-bit sample is passed through CLIP before it is stored.
 * formatSize and DstFormat are unused; they exist to match fkt_writeScanline.
 */
static inline void writeScanlineRGB(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
                                    const INT16* g, const INT16* b, DWORD width)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(DstFormat);

	for (UINT32 i = 0; i < width; i++)
	{
		BYTE* px = &dst[3ULL * i];
		px[0] = CLIP(r[i]);
		px[1] = CLIP(g[i]);
		px[2] = CLIP(b[i]);
	}
}
|
||||
|
||||
/* Write one scanline as packed 3-byte pixels in B, G, R byte order.
 * Each 16-bit sample is passed through CLIP before it is stored.
 * formatSize and DstFormat are unused; they exist to match fkt_writeScanline.
 */
static inline void writeScanlineBGR(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
                                    const INT16* g, const INT16* b, DWORD width)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(DstFormat);

	for (UINT32 i = 0; i < width; i++)
	{
		BYTE* px = &dst[3ULL * i];
		const BYTE red = CLIP(r[i]);
		const BYTE green = CLIP(g[i]);
		const BYTE blue = CLIP(b[i]);
		px[0] = blue;
		px[1] = green;
		px[2] = red;
	}
}
|
||||
|
||||
/* Write one scanline as 4-byte pixels in B, G, R, 0xFF byte order.
 * Each 16-bit sample is passed through CLIP before it is stored.
 * formatSize and DstFormat are unused; they exist to match fkt_writeScanline.
 */
static inline void writeScanlineBGRX(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
                                     const INT16* g, const INT16* b, DWORD width)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(DstFormat);

	for (UINT32 i = 0; i < width; i++)
	{
		BYTE* px = &dst[4ULL * i];
		const BYTE red = CLIP(r[i]);
		const BYTE green = CLIP(g[i]);
		const BYTE blue = CLIP(b[i]);
		px[0] = blue;
		px[1] = green;
		px[2] = red;
		px[3] = 0xFF;
	}
}
|
||||
|
||||
/* Write one scanline as 4-byte pixels in R, G, B, 0xFF byte order.
 * Each 16-bit sample is passed through CLIP before it is stored.
 * formatSize and DstFormat are unused; they exist to match fkt_writeScanline.
 */
static inline void writeScanlineRGBX(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
                                     const INT16* g, const INT16* b, DWORD width)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(DstFormat);

	for (UINT32 i = 0; i < width; i++)
	{
		BYTE* px = &dst[4ULL * i];
		px[0] = CLIP(r[i]);
		px[1] = CLIP(g[i]);
		px[2] = CLIP(b[i]);
		px[3] = 0xFF;
	}
}
|
||||
|
||||
/* Write one scanline as 4-byte pixels in 0xFF, B, G, R byte order.
 * Each 16-bit sample is passed through CLIP before it is stored.
 * formatSize and DstFormat are unused; they exist to match fkt_writeScanline.
 */
static inline void writeScanlineXBGR(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
                                     const INT16* g, const INT16* b, DWORD width)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(DstFormat);

	for (UINT32 i = 0; i < width; i++)
	{
		BYTE* px = &dst[4ULL * i];
		const BYTE red = CLIP(r[i]);
		const BYTE green = CLIP(g[i]);
		const BYTE blue = CLIP(b[i]);
		px[0] = 0xFF;
		px[1] = blue;
		px[2] = green;
		px[3] = red;
	}
}
|
||||
|
||||
/* Write one scanline as 4-byte pixels in 0xFF, R, G, B byte order.
 * Each 16-bit sample is passed through CLIP before it is stored.
 * formatSize and DstFormat are unused; they exist to match fkt_writeScanline.
 */
static inline void writeScanlineXRGB(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
                                     const INT16* g, const INT16* b, DWORD width)
{
	WINPR_UNUSED(formatSize);
	WINPR_UNUSED(DstFormat);

	for (UINT32 i = 0; i < width; i++)
	{
		BYTE* px = &dst[4ULL * i];
		const BYTE red = CLIP(r[i]);
		const BYTE green = CLIP(g[i]);
		const BYTE blue = CLIP(b[i]);
		px[0] = 0xFF;
		px[1] = red;
		px[2] = green;
		px[3] = blue;
	}
}
|
||||
|
||||
typedef void (*fkt_writeScanline)(BYTE*, DWORD, UINT32, const INT16*, const INT16*, const INT16*,
|
||||
DWORD);
|
||||
|
||||
static inline fkt_writeScanline getScanlineWriteFunction(DWORD format)
|
||||
{
|
||||
switch (format)
|
||||
{
|
||||
case PIXEL_FORMAT_ARGB32:
|
||||
case PIXEL_FORMAT_XRGB32:
|
||||
return writeScanlineXRGB;
|
||||
|
||||
case PIXEL_FORMAT_ABGR32:
|
||||
case PIXEL_FORMAT_XBGR32:
|
||||
return writeScanlineXBGR;
|
||||
|
||||
case PIXEL_FORMAT_RGBA32:
|
||||
case PIXEL_FORMAT_RGBX32:
|
||||
return writeScanlineRGBX;
|
||||
|
||||
case PIXEL_FORMAT_BGRA32:
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
return writeScanlineBGRX;
|
||||
|
||||
case PIXEL_FORMAT_BGR24:
|
||||
return writeScanlineBGR;
|
||||
|
||||
case PIXEL_FORMAT_RGB24:
|
||||
return writeScanlineRGB;
|
||||
|
||||
default:
|
||||
return writeScanlineGeneric;
|
||||
}
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static pstatus_t general_RGBToRGB_16s8u_P3AC4R_general(
|
||||
const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
|
||||
UINT32 srcStep, /* bytes between rows in source data */
|
||||
BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
||||
UINT32 dstStep, /* bytes between rows in dest data */
|
||||
UINT32 DstFormat, const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
|
||||
{
|
||||
const INT16* r = pSrc[0];
|
||||
const INT16* g = pSrc[1];
|
||||
const INT16* b = pSrc[2];
|
||||
const DWORD srcAdd = srcStep / sizeof(INT16);
|
||||
fkt_writeScanline writeScanline = getScanlineWriteFunction(DstFormat);
|
||||
const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
|
||||
|
||||
for (UINT32 y = 0; y < roi->height; ++y)
|
||||
{
|
||||
(*writeScanline)(pDst, formatSize, DstFormat, r, g, b, roi->width);
|
||||
pDst += dstStep;
|
||||
r += srcAdd;
|
||||
g += srcAdd;
|
||||
b += srcAdd;
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static pstatus_t general_RGBToRGB_16s8u_P3AC4R_BGRX(
|
||||
const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
|
||||
UINT32 srcStep, /* bytes between rows in source data */
|
||||
BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
||||
UINT32 dstStep, /* bytes between rows in dest data */
|
||||
UINT32 DstFormat, const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
|
||||
{
|
||||
const INT16* r = pSrc[0];
|
||||
const INT16* g = pSrc[1];
|
||||
const INT16* b = pSrc[2];
|
||||
const DWORD srcAdd = srcStep / sizeof(INT16);
|
||||
const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
|
||||
|
||||
for (UINT32 y = 0; y < roi->height; ++y)
|
||||
{
|
||||
writeScanlineBGRX(pDst, formatSize, DstFormat, r, g, b, roi->width);
|
||||
pDst += dstStep;
|
||||
r += srcAdd;
|
||||
g += srcAdd;
|
||||
b += srcAdd;
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static pstatus_t
|
||||
general_RGBToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
|
||||
UINT32 srcStep, /* bytes between rows in source data */
|
||||
BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
||||
UINT32 dstStep, /* bytes between rows in dest data */
|
||||
UINT32 DstFormat,
|
||||
const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
|
||||
{
|
||||
switch (DstFormat)
|
||||
{
|
||||
case PIXEL_FORMAT_BGRA32:
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
return general_RGBToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
|
||||
|
||||
default:
|
||||
return general_RGBToRGB_16s8u_P3AC4R_general(pSrc, srcStep, pDst, dstStep, DstFormat,
|
||||
roi);
|
||||
}
|
||||
}
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Install the portable C implementations of the four color-conversion
 * primitives into prims. */
void primitives_init_colors(primitives_t* WINPR_RESTRICT prims)
{
	/* RGB <-> YCbCr on planar 16-bit data */
	prims->RGBToYCbCr_16s16s_P3P3 = general_RGBToYCbCr_16s16s_P3P3;
	prims->yCbCrToRGB_16s16s_P3P3 = general_yCbCrToRGB_16s16s_P3P3;
	/* planar 16-bit -> interleaved 8-bit output */
	prims->yCbCrToRGB_16s8u_P3AC4R = general_yCbCrToRGB_16s8u_P3AC4R;
	prims->RGBToRGB_16s8u_P3AC4R = general_RGBToRGB_16s8u_P3AC4R;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Initialise the color primitives: the generic implementations are installed
 * first, then the SSE2 and NEON initialisers are invoked on the same table. */
void primitives_init_colors_opt(primitives_t* WINPR_RESTRICT prims)
{
	/* Generic baseline must run before the architecture-specific hooks. */
	primitives_init_colors(prims);

	primitives_init_colors_sse2(prims);
	primitives_init_colors_neon(prims);
}
|
||||
51
third_party/FreeRDP/libfreerdp/primitives/prim_colors.h
vendored
Normal file
51
third_party/FreeRDP/libfreerdp/primitives/prim_colors.h
vendored
Normal file
@@ -0,0 +1,51 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* Primitives colors
|
||||
*
|
||||
* Copyright 2024 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2024 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_COLORS_H
|
||||
#define FREERDP_LIB_PRIM_COLORS_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
|
||||
FREERDP_LOCAL void primitives_init_colors_sse2_int(primitives_t* WINPR_RESTRICT prims);

/* Call primitives_init_colors_sse2_int() only when the processor reports
 * both SSE2 and SSE3 support; otherwise leave prims untouched. */
static inline void primitives_init_colors_sse2(primitives_t* WINPR_RESTRICT prims)
{
	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
	{
		primitives_init_colors_sse2_int(prims);
	}
}
|
||||
|
||||
FREERDP_LOCAL void primitives_init_colors_neon_int(primitives_t* WINPR_RESTRICT prims);

/* Call primitives_init_colors_neon_int() only when the processor reports
 * ARM NEON support; otherwise leave prims untouched. */
static inline void primitives_init_colors_neon(primitives_t* WINPR_RESTRICT prims)
{
	if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
	{
		primitives_init_colors_neon_int(prims);
	}
}
|
||||
|
||||
#endif
|
||||
439
third_party/FreeRDP/libfreerdp/primitives/prim_copy.c
vendored
Normal file
439
third_party/FreeRDP/libfreerdp/primitives/prim_copy.c
vendored
Normal file
@@ -0,0 +1,439 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Copy operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <freerdp/log.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_copy.h"
|
||||
#include "../codec/color.h"
|
||||
|
||||
#include <freerdp/codec/color.h>
|
||||
|
||||
static primitives_t* generic = nullptr;
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/*static inline BOOL memory_regions_overlap_1d(*/
|
||||
static BOOL memory_regions_overlap_1d(const BYTE* p1, const BYTE* p2, size_t bytes)
|
||||
{
|
||||
const ULONG_PTR p1m = (const ULONG_PTR)p1;
|
||||
const ULONG_PTR p2m = (const ULONG_PTR)p2;
|
||||
|
||||
if (p1m <= p2m)
|
||||
{
|
||||
if (p1m + bytes > p2m)
|
||||
return TRUE;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (p2m + bytes > p1m)
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* else */
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/*static inline BOOL memory_regions_overlap_2d( */
|
||||
static BOOL memory_regions_overlap_2d(const BYTE* p1, int p1Step, int p1Size, const BYTE* p2,
|
||||
int p2Step, int p2Size, int width, int height)
|
||||
{
|
||||
ULONG_PTR p1m = (ULONG_PTR)p1;
|
||||
ULONG_PTR p2m = (ULONG_PTR)p2;
|
||||
|
||||
if (p1m <= p2m)
|
||||
{
|
||||
ULONG_PTR p1mEnd = p1m +
|
||||
1ull * (WINPR_ASSERTING_INT_CAST(uint32_t, height - 1)) *
|
||||
WINPR_ASSERTING_INT_CAST(uint32_t, p1Step) +
|
||||
1ull * WINPR_ASSERTING_INT_CAST(uint32_t, width* p1Size);
|
||||
|
||||
if (p1mEnd > p2m)
|
||||
return TRUE;
|
||||
}
|
||||
else
|
||||
{
|
||||
ULONG_PTR p2mEnd = p2m +
|
||||
1ull * (WINPR_ASSERTING_INT_CAST(uintptr_t, height - 1)) *
|
||||
WINPR_ASSERTING_INT_CAST(uintptr_t, p2Step) +
|
||||
1ull * WINPR_ASSERTING_INT_CAST(uintptr_t, width* p2Size);
|
||||
|
||||
if (p2mEnd > p1m)
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* else */
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Copy len bytes from pSrc to pDst, using memmove when the two ranges
 * overlap and memcpy otherwise. Always returns PRIMITIVES_SUCCESS. */
static pstatus_t general_copy_8u(const BYTE* WINPR_RESTRICT pSrc, BYTE* WINPR_RESTRICT pDst,
                                 INT32 len)
{
	const size_t count = (size_t)len;

	if (!memory_regions_overlap_1d(pSrc, pDst, count))
		memcpy((void*)pDst, (const void*)pSrc, count);
	else
		memmove((void*)pDst, (const void*)pSrc, count);

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Copy a block of pixels from one buffer to another.
|
||||
* The addresses are assumed to have been already offset to the upper-left
|
||||
* corners of the source and destination region of interest.
|
||||
*/
|
||||
/* Copy a width x height block of 4-byte pixels row by row.
 *
 * When the source and destination rectangles may overlap, every row goes
 * through generic->copy() and the first failing status is returned;
 * otherwise every row is copied with memcpy. An empty block (width or height
 * of 0) succeeds without copying anything.
 */
static pstatus_t general_copy_8u_AC4r(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
                                      BYTE* WINPR_RESTRICT pDst, INT32 dstStep, INT32 width,
                                      INT32 height)
{
	const size_t lineBytes = WINPR_ASSERTING_INT_CAST(size_t, width) * sizeof(UINT32);
	const BYTE* from = pSrc;
	BYTE* to = pDst;

	if ((width == 0) || (height == 0))
		return PRIMITIVES_SUCCESS;

	const BOOL overlapping = memory_regions_overlap_2d(pSrc, srcStep, sizeof(UINT32), pDst,
	                                                   dstStep, sizeof(UINT32), width, height);

	do
	{
		if (overlapping)
		{
			const pstatus_t status =
			    generic->copy(from, to, WINPR_ASSERTING_INT_CAST(int32_t, lineBytes));
			if (status != PRIMITIVES_SUCCESS)
				return status;
		}
		else
		{
			/* TODO: do it in one operation when the rowdata is adjacent.
			 * If we find a replacement for memcpy that is consistently
			 * faster, this could be replaced with that.
			 */
			memcpy(to, from, lineBytes);
		}

		from += srcStep;
		to += dstStep;
	} while (--height);

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
static inline pstatus_t generic_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData,
|
||||
UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
|
||||
UINT32 nWidth, UINT32 nHeight,
|
||||
const BYTE* WINPR_RESTRICT pSrcData,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
|
||||
int64_t srcVMultiplier, int64_t srcVOffset,
|
||||
int64_t dstVMultiplier, int64_t dstVOffset)
|
||||
{
|
||||
|
||||
const int64_t srcByte = 3;
|
||||
const int64_t dstByte = 4;
|
||||
|
||||
const UINT32 width = nWidth - nWidth % 8;
|
||||
|
||||
for (int64_t y = 0; y < nHeight; y++)
|
||||
{
|
||||
const BYTE* WINPR_RESTRICT srcLine =
|
||||
&pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
|
||||
BYTE* WINPR_RESTRICT dstLine =
|
||||
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
|
||||
|
||||
int64_t x = 0;
|
||||
WINPR_PRAGMA_UNROLL_LOOP
|
||||
for (; x < width; x++)
|
||||
{
|
||||
dstLine[(x + nXDst) * dstByte + 0] = srcLine[(x + nXSrc) * srcByte + 0];
|
||||
dstLine[(x + nXDst) * dstByte + 1] = srcLine[(x + nXSrc) * srcByte + 1];
|
||||
dstLine[(x + nXDst) * dstByte + 2] = srcLine[(x + nXSrc) * srcByte + 2];
|
||||
}
|
||||
|
||||
for (; x < nWidth; x++)
|
||||
{
|
||||
dstLine[(x + nXDst) * dstByte + 0] = srcLine[(x + nXSrc) * srcByte + 0];
|
||||
dstLine[(x + nXDst) * dstByte + 1] = srcLine[(x + nXSrc) * srcByte + 1];
|
||||
dstLine[(x + nXDst) * dstByte + 2] = srcLine[(x + nXSrc) * srcByte + 2];
|
||||
}
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static inline pstatus_t
|
||||
generic_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep, UINT32 nXDst,
|
||||
UINT32 nYDst, UINT32 nWidth, UINT32 nHeight,
|
||||
const BYTE* WINPR_RESTRICT pSrcData, UINT32 nSrcStep, UINT32 nXSrc,
|
||||
UINT32 nYSrc, int64_t srcVMultiplier, int64_t srcVOffset,
|
||||
int64_t dstVMultiplier, int64_t dstVOffset)
|
||||
{
|
||||
|
||||
const int64_t srcByte = 4;
|
||||
const int64_t dstByte = 4;
|
||||
|
||||
const UINT32 width = nWidth - nWidth % 8;
|
||||
|
||||
for (int64_t y = 0; y < nHeight; y++)
|
||||
{
|
||||
const BYTE* WINPR_RESTRICT srcLine =
|
||||
&pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
|
||||
BYTE* WINPR_RESTRICT dstLine =
|
||||
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
|
||||
|
||||
int64_t x = 0;
|
||||
WINPR_PRAGMA_UNROLL_LOOP
|
||||
for (; x < width; x++)
|
||||
{
|
||||
dstLine[(x + nXDst) * dstByte + 0] = srcLine[(x + nXSrc) * srcByte + 0];
|
||||
dstLine[(x + nXDst) * dstByte + 1] = srcLine[(x + nXSrc) * srcByte + 1];
|
||||
dstLine[(x + nXDst) * dstByte + 2] = srcLine[(x + nXSrc) * srcByte + 2];
|
||||
}
|
||||
for (; x < nWidth; x++)
|
||||
{
|
||||
dstLine[(x + nXDst) * dstByte + 0] = srcLine[(x + nXSrc) * srcByte + 0];
|
||||
dstLine[(x + nXDst) * dstByte + 1] = srcLine[(x + nXSrc) * srcByte + 1];
|
||||
dstLine[(x + nXDst) * dstByte + 2] = srcLine[(x + nXSrc) * srcByte + 2];
|
||||
}
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* Copy a rectangle pixel by pixel, converting every pixel from SrcFormat to
 * DstFormat: read with FreeRDPReadColor_int, convert with
 * FreeRDPConvertColor (using palette), write with FreeRDPWriteColor_int.
 * Returns -1 as soon as a write fails, PRIMITIVES_SUCCESS otherwise.
 */
pstatus_t generic_image_copy_no_overlap_convert(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier, int64_t dstVOffset)
{
	const int64_t srcBpp = FreeRDPGetBytesPerPixel(SrcFormat);
	const int64_t dstBpp = FreeRDPGetBytesPerPixel(DstFormat);
	/* Largest multiple of 8 not exceeding nWidth: bound of the first loop. */
	const UINT32 blockWidth = nWidth - nWidth % 8;

	for (int64_t row = 0; row < nHeight; row++)
	{
		const BYTE* WINPR_RESTRICT srcRow =
		    &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstRow =
		    &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];
		int64_t col = 0;

		// WINPR_PRAGMA_UNROLL_LOOP
		for (; col < blockWidth; col++)
		{
			const UINT32 pixel = FreeRDPReadColor_int(&srcRow[(col + nXSrc) * srcBpp], SrcFormat);
			const UINT32 converted = FreeRDPConvertColor(pixel, SrcFormat, DstFormat, palette);
			if (!FreeRDPWriteColor_int(&dstRow[(col + nXDst) * dstBpp], DstFormat, converted))
				return -1;
		}

		/* Remaining (nWidth % 8) pixels. */
		for (; col < nWidth; col++)
		{
			const UINT32 pixel = FreeRDPReadColor_int(&srcRow[(col + nXSrc) * srcBpp], SrcFormat);
			const UINT32 converted = FreeRDPConvertColor(pixel, SrcFormat, DstFormat, palette);
			if (!FreeRDPWriteColor_int(&dstRow[(col + nXDst) * dstBpp], DstFormat, converted))
				return -1;
		}
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* Copy a rectangle row by row with memcpy, without any pixel conversion.
 * Each row copies nWidth * bytes-per-pixel(DstFormat) bytes; the source
 * column offset uses the bytes-per-pixel of SrcFormat. palette and flags are
 * unused. Always returns PRIMITIVES_SUCCESS.
 */
pstatus_t generic_image_copy_no_overlap_memcpy(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
    WINPR_ATTR_UNUSED const gdiPalette* WINPR_RESTRICT palette, int64_t srcVMultiplier,
    int64_t srcVOffset, int64_t dstVMultiplier, int64_t dstVOffset, WINPR_ATTR_UNUSED UINT32 flags)
{
	const int64_t dstBpp = FreeRDPGetBytesPerPixel(DstFormat);
	const int64_t srcBpp = FreeRDPGetBytesPerPixel(SrcFormat);
	const int64_t rowBytes = nWidth * dstBpp;
	const int64_t srcColumn = nXSrc * srcBpp;
	const int64_t dstColumn = nXDst * dstBpp;

	for (int64_t row = 0; row < nHeight; row++)
	{
		const int64_t srcRowStart = srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset;
		const int64_t dstRowStart = dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset;
		const BYTE* WINPR_RESTRICT from = &pSrcData[srcRowStart];
		BYTE* WINPR_RESTRICT to = &pDstData[dstRowStart];

		memcpy(&to[dstColumn], &from[srcColumn], WINPR_ASSERTING_INT_CAST(size_t, rowBytes));
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* Copy a rectangle, routing format pairs with a dedicated byte-copy routine
 * to that routine:
 *   BGR24         -> BGRX32/BGRA32 : generic_image_copy_bgr24_bgrx32
 *   BGRX32/BGRA32 -> BGRX32/BGRA32 : generic_image_copy_bgrx32_bgrx32
 *   RGBX32/RGBA32 -> RGBX32/RGBA32 : generic_image_copy_bgrx32_bgrx32
 * Every other pair goes through generic_image_copy_no_overlap_convert.
 *
 * pDstData and pSrcData must not be null (asserted).
 * Returns the status of the routine that performed the copy.
 */
static inline pstatus_t generic_image_copy_no_overlap_dst_alpha(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier, int64_t dstVOffset)
{
	WINPR_ASSERT(pDstData);
	WINPR_ASSERT(pSrcData);

	switch (SrcFormat)
	{
		case PIXEL_FORMAT_BGR24:
			switch (DstFormat)
			{
				case PIXEL_FORMAT_BGRX32:
				case PIXEL_FORMAT_BGRA32:
					return generic_image_copy_bgr24_bgrx32(
					    pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
					    nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
				default:
					break;
			}
			break;
		case PIXEL_FORMAT_BGRX32:
		case PIXEL_FORMAT_BGRA32:
			switch (DstFormat)
			{
				case PIXEL_FORMAT_BGRX32:
				case PIXEL_FORMAT_BGRA32:
					return generic_image_copy_bgrx32_bgrx32(
					    pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
					    nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
				default:
					break;
			}
			break;
		case PIXEL_FORMAT_RGBX32:
		case PIXEL_FORMAT_RGBA32:
			switch (DstFormat)
			{
				case PIXEL_FORMAT_RGBX32:
				case PIXEL_FORMAT_RGBA32:
					return generic_image_copy_bgrx32_bgrx32(
					    pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
					    nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
				/* A 32-bit source with an RGB24 destination must NOT use
				 * generic_image_copy_bgr24_bgrx32: that routine reads 3 bytes
				 * per source pixel and writes at a 4 byte destination pitch,
				 * i.e. the opposite layout. The pair is handled by the generic
				 * converter below instead. */
				default:
					break;
			}
			break;
		default:
			break;
	}

	return generic_image_copy_no_overlap_convert(
	    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
	    nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
}
|
||||
|
||||
/**
 * Copy path used when the destination alpha does not need to be preserved.
 *
 * If source and destination formats are equal apart from alpha, the memcpy based
 * routine is used; otherwise the per-pixel converting routine is used.
 * All arguments are forwarded unchanged (flags only reach the memcpy variant).
 */
static inline pstatus_t generic_image_copy_no_overlap_no_alpha(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier, int64_t dstVOffset,
    UINT32 flags)
{
	/* Formats differ in more than alpha: every pixel has to be converted. */
	if (!FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
	{
		return generic_image_copy_no_overlap_convert(
		    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat,
		    nSrcStep, nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier,
		    dstVOffset);
	}

	/* Same layout (ignoring alpha): rows can be copied directly. */
	return generic_image_copy_no_overlap_memcpy(
	    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat,
	    nSrcStep, nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset,
	    flags);
}
|
||||
|
||||
static pstatus_t generic_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
|
||||
UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
|
||||
UINT32 nWidth, UINT32 nHeight,
|
||||
const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
|
||||
const gdiPalette* WINPR_RESTRICT palette,
|
||||
UINT32 flags)
|
||||
{
|
||||
const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) != 0;
|
||||
int64_t srcVOffset = 0;
|
||||
int64_t srcVMultiplier = 1;
|
||||
int64_t dstVOffset = 0;
|
||||
int64_t dstVMultiplier = 1;
|
||||
|
||||
if ((nWidth == 0) || (nHeight == 0))
|
||||
return PRIMITIVES_SUCCESS;
|
||||
|
||||
if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
|
||||
return -1;
|
||||
|
||||
if (!pDstData || !pSrcData)
|
||||
return -1;
|
||||
|
||||
if (nDstStep == 0)
|
||||
nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
|
||||
|
||||
if (nSrcStep == 0)
|
||||
nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
|
||||
|
||||
if (vSrcVFlip)
|
||||
{
|
||||
srcVOffset = (nHeight - 1ll) * nSrcStep;
|
||||
srcVMultiplier = -1;
|
||||
}
|
||||
|
||||
if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
|
||||
return generic_image_copy_no_overlap_dst_alpha(
|
||||
pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat,
|
||||
nSrcStep, nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier,
|
||||
dstVOffset);
|
||||
else
|
||||
return generic_image_copy_no_overlap_no_alpha(
|
||||
pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat,
|
||||
nSrcStep, nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset,
|
||||
flags);
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Install the generic (portable C) copy primitives into prims. */
void primitives_init_copy(primitives_t* WINPR_RESTRICT prims)
{
	/* Image and alpha-channel copies do not depend on the other entries. */
	prims->copy_no_overlap = generic_image_copy_no_overlap;
	prims->copy_8u_AC4r = general_copy_8u_AC4r;

	/* copy is the byte copy routine viewed through the generic fn_copy_t type,
	 * so copy_8u has to be assigned first. */
	prims->copy_8u = general_copy_8u;
	prims->copy = WINPR_FUNC_PTR_CAST(prims->copy_8u, fn_copy_t);
}
|
||||
|
||||
/*
 * Install the copy primitives: generic versions first, then let the SSE4.1
 * and (when built with WITH_AVX2) AVX2 initializers run in that order.
 */
void primitives_init_copy_opt(primitives_t* WINPR_RESTRICT prims)
{
	/* Baseline implementation. */
	primitives_init_copy(prims);

	/* CPU specific initializers, each checks the processor feature itself. */
	primitives_init_copy_sse41(prims);
#if defined(WITH_AVX2)
	primitives_init_copy_avx2(prims);
#endif
}
|
||||
63
third_party/FreeRDP/libfreerdp/primitives/prim_copy.h
vendored
Normal file
63
third_party/FreeRDP/libfreerdp/primitives/prim_copy.h
vendored
Normal file
@@ -0,0 +1,63 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* Primitives copy
|
||||
*
|
||||
* Copyright 2024 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2024 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_COPY_H
|
||||
#define FREERDP_LIB_PRIM_COPY_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
WINPR_ATTR_NODISCARD FREERDP_LOCAL pstatus_t generic_image_copy_no_overlap_convert(
|
||||
BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
|
||||
UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
|
||||
int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier, int64_t dstVOffset);
|
||||
|
||||
WINPR_ATTR_NODISCARD FREERDP_LOCAL pstatus_t generic_image_copy_no_overlap_memcpy(
|
||||
BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
|
||||
UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
|
||||
int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier, int64_t dstVOffset,
|
||||
UINT32 flags);
|
||||
|
||||
FREERDP_LOCAL void primitives_init_copy_sse41_int(primitives_t* WINPR_RESTRICT prims);
|
||||
/* Run the SSE4.1 copy initializer only when the CPU reports SSE4.1 support. */
static inline void primitives_init_copy_sse41(primitives_t* WINPR_RESTRICT prims)
{
	if (IsProcessorFeaturePresent(PF_SSE4_1_INSTRUCTIONS_AVAILABLE))
		primitives_init_copy_sse41_int(prims);
}
|
||||
|
||||
#if defined(WITH_AVX2)
|
||||
FREERDP_LOCAL void primitives_init_copy_avx2_int(primitives_t* WINPR_RESTRICT prims);
|
||||
/* Run the AVX2 copy initializer only when the CPU reports AVX2 support. */
static inline void primitives_init_copy_avx2(primitives_t* WINPR_RESTRICT prims)
{
	if (IsProcessorFeaturePresent(PF_AVX2_INSTRUCTIONS_AVAILABLE))
		primitives_init_copy_avx2_int(prims);
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
352
third_party/FreeRDP/libfreerdp/primitives/prim_internal.h
vendored
Normal file
352
third_party/FreeRDP/libfreerdp/primitives/prim_internal.h
vendored
Normal file
@@ -0,0 +1,352 @@
|
||||
/* prim_internal.h
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License. Algorithms used by
|
||||
* this code may be covered by patents by HP, Microsoft, or other parties.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_INTERNAL_H
|
||||
#define FREERDP_LIB_PRIM_INTERNAL_H
|
||||
|
||||
#include <winpr/platform.h>
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/primitives.h>
|
||||
#include <freerdp/api.h>
|
||||
|
||||
#include "../core/simd.h"
|
||||
|
||||
#define PRIM_ALIGN_128 DECLSPEC_ALIGN(16)
|
||||
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED) || defined(NEON_INTRINSICS_ENABLED) || defined(WITH_OPENCL)
|
||||
#define HAVE_OPTIMIZED_PRIMITIVES 1
|
||||
#endif
|
||||
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED) || defined(NEON_INTRINSICS_ENABLED)
|
||||
#define HAVE_CPU_OPTIMIZED_PRIMITIVES 1
|
||||
#endif
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline BYTE* writePixelBGRA(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
|
||||
BYTE B, BYTE A)
|
||||
{
|
||||
WINPR_UNUSED(formatSize);
|
||||
WINPR_UNUSED(format);
|
||||
|
||||
*dst++ = B;
|
||||
*dst++ = G;
|
||||
*dst++ = R;
|
||||
*dst++ = A;
|
||||
return dst;
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline BYTE* writePixelBGRX(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
|
||||
BYTE B, BYTE A)
|
||||
{
|
||||
WINPR_UNUSED(formatSize);
|
||||
WINPR_UNUSED(format);
|
||||
WINPR_UNUSED(A);
|
||||
|
||||
*dst++ = B;
|
||||
*dst++ = G;
|
||||
*dst++ = R;
|
||||
dst++; /* Do not touch alpha */
|
||||
|
||||
return dst;
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline BYTE* writePixelRGBA(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
|
||||
BYTE B, BYTE A)
|
||||
{
|
||||
WINPR_UNUSED(formatSize);
|
||||
WINPR_UNUSED(format);
|
||||
|
||||
*dst++ = R;
|
||||
*dst++ = G;
|
||||
*dst++ = B;
|
||||
*dst++ = A;
|
||||
return dst;
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline BYTE* writePixelRGBX(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
|
||||
BYTE B, BYTE A)
|
||||
{
|
||||
WINPR_UNUSED(formatSize);
|
||||
WINPR_UNUSED(format);
|
||||
WINPR_UNUSED(A);
|
||||
|
||||
*dst++ = R;
|
||||
*dst++ = G;
|
||||
*dst++ = B;
|
||||
dst++; /* Do not touch alpha */
|
||||
|
||||
return dst;
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline BYTE* writePixelABGR(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
|
||||
BYTE B, BYTE A)
|
||||
{
|
||||
WINPR_UNUSED(formatSize);
|
||||
WINPR_UNUSED(format);
|
||||
|
||||
*dst++ = A;
|
||||
*dst++ = B;
|
||||
*dst++ = G;
|
||||
*dst++ = R;
|
||||
return dst;
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline BYTE* writePixelXBGR(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
|
||||
BYTE B, BYTE A)
|
||||
{
|
||||
WINPR_UNUSED(formatSize);
|
||||
WINPR_UNUSED(format);
|
||||
WINPR_UNUSED(A);
|
||||
|
||||
dst++; /* Do not touch alpha */
|
||||
*dst++ = B;
|
||||
*dst++ = G;
|
||||
*dst++ = R;
|
||||
return dst;
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline BYTE* writePixelARGB(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
|
||||
BYTE B, BYTE A)
|
||||
{
|
||||
WINPR_UNUSED(formatSize);
|
||||
WINPR_UNUSED(format);
|
||||
|
||||
*dst++ = A;
|
||||
*dst++ = R;
|
||||
*dst++ = G;
|
||||
*dst++ = B;
|
||||
return dst;
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline BYTE* writePixelXRGB(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
|
||||
BYTE B, BYTE A)
|
||||
{
|
||||
WINPR_UNUSED(formatSize);
|
||||
WINPR_UNUSED(format);
|
||||
WINPR_UNUSED(A);
|
||||
|
||||
dst++; /* Do not touch alpha */
|
||||
*dst++ = R;
|
||||
*dst++ = G;
|
||||
*dst++ = B;
|
||||
return dst;
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline BYTE* writePixelGenericAlpha(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R,
|
||||
BYTE G, BYTE B, BYTE A)
|
||||
{
|
||||
UINT32 color = FreeRDPGetColor(format, R, G, B, A);
|
||||
FreeRDPWriteColor(dst, format, color);
|
||||
return dst + formatSize;
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline BYTE* writePixelGeneric(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
|
||||
BYTE B, BYTE A)
|
||||
{
|
||||
UINT32 color = FreeRDPGetColor(format, R, G, B, A);
|
||||
FreeRDPWriteColorIgnoreAlpha(dst, format, color);
|
||||
return dst + formatSize;
|
||||
}
|
||||
|
||||
typedef BYTE* (*fkt_writePixel)(BYTE*, DWORD, UINT32, BYTE, BYTE, BYTE, BYTE);
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline fkt_writePixel getPixelWriteFunction(DWORD format, BOOL useAlpha)
|
||||
{
|
||||
switch (format)
|
||||
{
|
||||
case PIXEL_FORMAT_ARGB32:
|
||||
case PIXEL_FORMAT_XRGB32:
|
||||
return useAlpha ? writePixelARGB : writePixelXRGB;
|
||||
|
||||
case PIXEL_FORMAT_ABGR32:
|
||||
case PIXEL_FORMAT_XBGR32:
|
||||
return useAlpha ? writePixelABGR : writePixelXBGR;
|
||||
|
||||
case PIXEL_FORMAT_RGBA32:
|
||||
case PIXEL_FORMAT_RGBX32:
|
||||
return useAlpha ? writePixelRGBA : writePixelRGBX;
|
||||
|
||||
case PIXEL_FORMAT_BGRA32:
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
return useAlpha ? writePixelBGRA : writePixelBGRX;
|
||||
|
||||
default:
|
||||
return useAlpha ? writePixelGenericAlpha : writePixelGeneric;
|
||||
}
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline BYTE CLIP(INT64 X)
|
||||
{
|
||||
if (X > 255L)
|
||||
return 255L;
|
||||
|
||||
if (X < 0L)
|
||||
return 0L;
|
||||
|
||||
return (BYTE)X;
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline BYTE CONDITIONAL_CLIP(INT32 in, BYTE original)
|
||||
{
|
||||
BYTE out = CLIP(in);
|
||||
BYTE diff = 0;
|
||||
if (out > original)
|
||||
diff = out - original;
|
||||
else
|
||||
diff = original - out;
|
||||
if (diff < 30)
|
||||
return original;
|
||||
return out;
|
||||
}
|
||||
|
||||
/**
|
||||
* | R | ( | 256 0 403 | | Y | )
|
||||
* | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
|
||||
* | B | ( | 256 475 0 | | V - 128 | )
|
||||
*/
|
||||
/* Y term of the YUV to RGB matrix: the luma offset is 0, so Y is returned as is. */
static inline INT32 C(INT32 Y)
{
	const INT32 offset = 0;
	return Y - offset;
}
|
||||
|
||||
/* U term of the YUV to RGB matrix: U with the offset of 128 removed. */
static inline INT32 D(INT32 U)
{
	const INT32 offset = 128;
	return U - offset;
}
|
||||
|
||||
/* V term of the YUV to RGB matrix: V with the offset of 128 removed. */
static inline INT32 E(INT32 V)
{
	const INT32 offset = 128;
	return V - offset;
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline BYTE YUV2R(INT32 Y, INT32 U, INT32 V)
|
||||
{
|
||||
const INT32 r = (256 * C(Y) + 0 * D(U) + 403 * E(V));
|
||||
const INT32 r8 = r >> 8;
|
||||
return CLIP(r8);
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline BYTE YUV2G(INT32 Y, INT32 U, INT32 V)
|
||||
{
|
||||
const INT32 g = (256 * C(Y) - 48 * D(U) - 120 * E(V));
|
||||
const INT32 g8 = g >> 8;
|
||||
return CLIP(g8);
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline BYTE YUV2B(INT32 Y, INT32 U, INT32 V)
|
||||
{
|
||||
const INT32 b = (256 * C(Y) + 475 * D(U) + 0 * E(V));
|
||||
const INT32 b8 = b >> 8;
|
||||
return CLIP(b8);
|
||||
}
|
||||
|
||||
/**
|
||||
* | Y | ( | 54 183 18 | | R | ) | 0 |
|
||||
* | U | = ( | -29 -99 128 | | G | ) >> 8 + | 128 |
|
||||
* | V | ( | 128 -116 -12 | | B | ) | 128 |
|
||||
*/
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline BYTE RGB2Y(INT32 R, INT32 G, INT32 B)
|
||||
{
|
||||
const INT32 val = ((54 * R + 183 * G + 18 * B) >> 8);
|
||||
return WINPR_ASSERTING_INT_CAST(BYTE, val);
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline BYTE RGB2U(INT32 R, INT32 G, INT32 B)
|
||||
{
|
||||
const INT32 val = (((-29 * R - 99 * G + 128 * B) >> 8) + 128);
|
||||
return WINPR_ASSERTING_INT_CAST(BYTE, val);
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline BYTE RGB2V(INT32 R, INT32 G, INT32 B)
|
||||
{
|
||||
const INT32 val = (((128 * R - 116 * G - 12 * B) >> 8) + 128);
|
||||
return WINPR_ASSERTING_INT_CAST(BYTE, val);
|
||||
}
|
||||
|
||||
static inline BYTE* writeYUVPixel(BYTE* dst, UINT32 DstFormat, INT32 y, INT32 u, INT32 v,
|
||||
fkt_writePixel fkt)
|
||||
{
|
||||
WINPR_ASSERT(fkt);
|
||||
const BYTE r = YUV2R(y, u, v);
|
||||
const BYTE g = YUV2G(y, u, v);
|
||||
const BYTE b = YUV2B(y, u, v);
|
||||
const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
|
||||
return fkt(dst, formatSize, DstFormat, r, g, b, 0);
|
||||
}
|
||||
|
||||
FREERDP_LOCAL void general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
|
||||
size_t offset, const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
|
||||
BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
|
||||
BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
|
||||
BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width);
|
||||
|
||||
FREERDP_LOCAL void general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
|
||||
size_t offset, const BYTE* WINPR_RESTRICT pSrcEven, const BYTE* WINPR_RESTRICT pSrcOdd,
|
||||
BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
|
||||
BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
|
||||
BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
|
||||
BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2,
|
||||
BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
|
||||
BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width);
|
||||
|
||||
/* Function prototypes for all the init/deinit routines. */
|
||||
FREERDP_LOCAL void primitives_init_copy(primitives_t* WINPR_RESTRICT prims);
|
||||
FREERDP_LOCAL void primitives_init_set(primitives_t* WINPR_RESTRICT prims);
|
||||
FREERDP_LOCAL void primitives_init_add(primitives_t* WINPR_RESTRICT prims);
|
||||
FREERDP_LOCAL void primitives_init_andor(primitives_t* WINPR_RESTRICT prims);
|
||||
FREERDP_LOCAL void primitives_init_shift(primitives_t* WINPR_RESTRICT prims);
|
||||
FREERDP_LOCAL void primitives_init_sign(primitives_t* WINPR_RESTRICT prims);
|
||||
FREERDP_LOCAL void primitives_init_alphaComp(primitives_t* WINPR_RESTRICT prims);
|
||||
FREERDP_LOCAL void primitives_init_colors(primitives_t* WINPR_RESTRICT prims);
|
||||
FREERDP_LOCAL void primitives_init_YCoCg(primitives_t* WINPR_RESTRICT prims);
|
||||
FREERDP_LOCAL void primitives_init_YUV(primitives_t* WINPR_RESTRICT prims);
|
||||
|
||||
FREERDP_LOCAL void primitives_init_copy_opt(primitives_t* WINPR_RESTRICT prims);
|
||||
FREERDP_LOCAL void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims);
|
||||
FREERDP_LOCAL void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims);
|
||||
FREERDP_LOCAL void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims);
|
||||
FREERDP_LOCAL void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims);
|
||||
FREERDP_LOCAL void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims);
|
||||
FREERDP_LOCAL void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims);
|
||||
FREERDP_LOCAL void primitives_init_colors_opt(primitives_t* WINPR_RESTRICT prims);
|
||||
FREERDP_LOCAL void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims);
|
||||
FREERDP_LOCAL void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims);
|
||||
|
||||
#if defined(WITH_OPENCL)
|
||||
WINPR_ATTR_NODISCARD
|
||||
FREERDP_LOCAL BOOL primitives_init_opencl(primitives_t* WINPR_RESTRICT prims);
|
||||
#endif
|
||||
|
||||
#endif /* FREERDP_LIB_PRIM_INTERNAL_H */
|
||||
137
third_party/FreeRDP/libfreerdp/primitives/prim_set.c
vendored
Normal file
137
third_party/FreeRDP/libfreerdp/primitives/prim_set.c
vendored
Normal file
@@ -0,0 +1,137 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Routines to set a chunk of memory to a constant.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_set.h"
|
||||
|
||||
/* ========================================================================= */
|
||||
/* Fill len bytes at pDst with val; always reports PRIMITIVES_SUCCESS. */
static pstatus_t general_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 len)
{
	const size_t count = (size_t)len;
	const int fill = (int)val;

	memset((void*)pDst, fill, count);
	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Clear len bytes at pDst to zero; always reports PRIMITIVES_SUCCESS. */
static pstatus_t general_zero(void* WINPR_RESTRICT pDst, size_t len)
{
	const int zero = 0;

	memset(pDst, zero, len);
	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* ========================================================================= */
|
||||
/**
 * Fill len signed 32 bit values at pDst with val.
 *
 * Runs shorter than 256 elements use a plain store loop. Longer runs write
 * the first element and then repeatedly copy the already filled prefix right
 * behind itself through the copy_8u primitive, doubling the filled area with
 * each pass.
 *
 * @return PRIMITIVES_SUCCESS, or the first failure status returned by copy_8u
 */
static pstatus_t general_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len)
{
	if (len < 256)
	{
		for (UINT32 x = 0; x < len; x++)
			pDst[x] = val;

		return PRIMITIVES_SUCCESS;
	}

	/* copy_8u takes its length as INT32 bytes: limit every pass so that the
	 * byte count is never truncated by the cast below. */
	const size_t maxChunk = (size_t)INT32_MAX / sizeof(*pDst);
	primitives_t* prims = primitives_get();

	/* else doubling memcpy algorithm */
	pDst[0] = val;
	size_t filled = 1;

	while (filled < len)
	{
		/* Never copy more than is already filled (source and destination must
		 * not overlap) nor more than is still missing. */
		size_t chunk = len - filled;

		if (chunk > filled)
			chunk = filled;

		if (chunk > maxChunk)
			chunk = maxChunk;

		const size_t bytes = chunk * sizeof(*pDst);
		const pstatus_t rc = prims->copy_8u((const BYTE*)pDst, (BYTE*)(pDst + filled), (INT32)bytes);
		if (rc != PRIMITIVES_SUCCESS)
			return rc;

		filled += chunk;
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/**
 * Fill len unsigned 32 bit values at pDst with val.
 *
 * Runs shorter than 256 elements use a plain store loop. Longer runs write
 * the first element and then repeatedly copy the already filled prefix right
 * behind itself through the copy_8u primitive, doubling the filled area with
 * each pass.
 *
 * @return PRIMITIVES_SUCCESS, or the first failure status returned by copy_8u
 */
static pstatus_t general_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 len)
{
	if (len < 256)
	{
		for (UINT32 x = 0; x < len; x++)
			pDst[x] = val;

		return PRIMITIVES_SUCCESS;
	}

	/* copy_8u takes its length as INT32 bytes: limit every pass so that the
	 * byte count is never truncated by the cast below. */
	const size_t maxChunk = (size_t)INT32_MAX / sizeof(*pDst);
	primitives_t* prims = primitives_get();

	/* else doubling memcpy algorithm */
	pDst[0] = val;
	size_t filled = 1;

	while (filled < len)
	{
		/* Never copy more than is already filled (source and destination must
		 * not overlap) nor more than is still missing. */
		size_t chunk = len - filled;

		if (chunk > filled)
			chunk = filled;

		if (chunk > maxChunk)
			chunk = maxChunk;

		const size_t bytes = chunk * sizeof(*pDst);
		const pstatus_t rc = prims->copy_8u((const BYTE*)pDst, (BYTE*)(pDst + filled), (INT32)bytes);
		if (rc != PRIMITIVES_SUCCESS)
			return rc;

		filled += chunk;
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Install the generic (portable C) set/zero primitives into prims. */
void primitives_init_set(primitives_t* WINPR_RESTRICT prims)
{
	/* Byte oriented fills. */
	prims->zero = general_zero;
	prims->set_8u = general_set_8u;

	/* 32 bit fills. */
	prims->set_32u = general_set_32u;
	prims->set_32s = general_set_32s;
}
|
||||
|
||||
/* Install the generic set primitives, then run the SSE2 initializer (which checks CPU support itself). */
void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims)
{
	/* Baseline first so every entry is populated. */
	primitives_init_set(prims);

	primitives_init_set_sse2(prims);
}
|
||||
42
third_party/FreeRDP/libfreerdp/primitives/prim_set.h
vendored
Normal file
42
third_party/FreeRDP/libfreerdp/primitives/prim_set.h
vendored
Normal file
@@ -0,0 +1,42 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* Primitives set
|
||||
*
|
||||
* Copyright 2024 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2024 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_SET_H
|
||||
#define FREERDP_LIB_PRIM_SET_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
|
||||
FREERDP_LOCAL void primitives_init_set_sse2_int(primitives_t* WINPR_RESTRICT prims);
|
||||
/* Run the SSE2 set initializer only when the CPU reports both SSE2 and SSE3 support. */
static inline void primitives_init_set_sse2(primitives_t* WINPR_RESTRICT prims)
{
	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
	{
		primitives_init_set_sse2_int(prims);
	}
}
|
||||
|
||||
#endif
|
||||
150
third_party/FreeRDP/libfreerdp/primitives/prim_shift.c
vendored
Normal file
150
third_party/FreeRDP/libfreerdp/primitives/prim_shift.c
vendored
Normal file
@@ -0,0 +1,150 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Shift operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
#include <winpr/assert.h>
|
||||
#include <winpr/cast.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_shift.h"
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/*
 * Left shift a 16 bit value by sh bits. The shift is done on the value
 * converted to UINT32 and only the low 16 bits of the result are kept.
 */
static inline INT16 shift(INT16 val, UINT32 sh)
{
	const UINT32 widened = (UINT32)val;
	const UINT32 low16 = (widened << sh) & 0xFFFF;
	const INT16 rc = (int16_t)low16;

	return WINPR_ASSERTING_INT_CAST(INT16, rc);
}
|
||||
|
||||
/*
 * Left shift len signed 16 bit values in place by val bits.
 * Returns -1 for val >= 16; val == 0 leaves the buffer as it is.
 */
static inline pstatus_t general_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val,
                                                    UINT32 len)
{
	if (val >= 16)
		return -1;

	if (val != 0)
	{
		UINT32 idx = 0;

		while (idx < len)
		{
			pSrcDst[idx] = shift(pSrcDst[idx], val);
			idx++;
		}
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/*
 * Left shift len signed 16 bit values from pSrc by val bits into pDst.
 * Returns -1 for val >= 16.
 * NOTE(review): for val == 0 this returns success without writing pDst
 * (pSrc is not copied) - confirm callers rely on that.
 */
static inline pstatus_t general_lShiftC_16s(const INT16* WINPR_RESTRICT pSrc, UINT32 val,
                                            INT16* WINPR_RESTRICT pDst, UINT32 len)
{
	if (val >= 16)
		return -1;

	if (val != 0)
	{
		UINT32 idx = 0;

		while (idx < len)
		{
			pDst[idx] = shift(pSrc[idx], val);
			idx++;
		}
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/*
 * Right shift len signed 16 bit values from pSrc by val bits into pDst.
 * Returns -1 for val >= 16.
 * NOTE(review): for val == 0 this returns success without writing pDst
 * (pSrc is not copied) - confirm callers rely on that.
 */
static inline pstatus_t general_rShiftC_16s(const INT16* WINPR_RESTRICT pSrc, UINT32 val,
                                            INT16* WINPR_RESTRICT pDst, UINT32 len)
{
	if (val >= 16)
		return -1;

	if (val != 0)
	{
		UINT32 idx = 0;

		while (idx < len)
		{
			pDst[idx] = WINPR_ASSERTING_INT_CAST(int16_t, pSrc[idx] >> val);
			idx++;
		}
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/*
 * Left shift len unsigned 16 bit values from pSrc by val bits into pDst,
 * keeping only the low 16 bits of each result. Returns -1 for val >= 16.
 * NOTE(review): for val == 0 this returns success without writing pDst
 * (pSrc is not copied) - confirm callers rely on that.
 */
static inline pstatus_t general_lShiftC_16u(const UINT16* WINPR_RESTRICT pSrc, UINT32 val,
                                            UINT16* WINPR_RESTRICT pDst, UINT32 len)
{
	if (val >= 16)
		return -1;

	if (val != 0)
	{
		UINT32 idx = 0;

		while (idx < len)
		{
			pDst[idx] = WINPR_ASSERTING_INT_CAST(UINT16, ((pSrc[idx] << val) & 0xFFFF));
			idx++;
		}
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/*
 * Right shift len unsigned 16 bit values from pSrc by val bits into pDst.
 * Returns -1 for val >= 16.
 * NOTE(review): for val == 0 this returns success without writing pDst
 * (pSrc is not copied) - confirm callers rely on that.
 */
static inline pstatus_t general_rShiftC_16u(const UINT16* WINPR_RESTRICT pSrc, UINT32 val,
                                            UINT16* WINPR_RESTRICT pDst, UINT32 len)
{
	if (val >= 16)
		return -1;

	if (val != 0)
	{
		UINT32 idx = 0;

		while (idx < len)
		{
			pDst[idx] = pSrc[idx] >> val;
			idx++;
		}
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/**
 * Shift len signed 16 bit values from pSrc into pDst; the sign of val selects
 * the direction (negative: right shift by -val, positive: left shift by val).
 *
 * @return PRIMITIVES_SUCCESS for val == 0 (pDst is not written in that case),
 *         otherwise the result of the directional shift routine, which is -1
 *         for shift amounts of 16 or more.
 */
static inline pstatus_t general_shiftC_16s(const INT16* WINPR_RESTRICT pSrc, INT32 val,
                                           INT16* WINPR_RESTRICT pDst, UINT32 len)
{
	if (val == 0)
		return PRIMITIVES_SUCCESS;

	if (val < 0)
	{
		/* Negate in the unsigned domain: -val overflows for INT32_MIN. The
		 * resulting magnitude (2^31) is rejected by the callee as >= 16. */
		const UINT32 amount = 0u - (UINT32)val;
		return general_rShiftC_16s(pSrc, amount, pDst, len);
	}

	return general_lShiftC_16s(pSrc, (UINT32)val, pDst, len);
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/**
 * Shift len unsigned 16 bit values from pSrc into pDst; the sign of val selects
 * the direction (negative: right shift by -val, positive: left shift by val).
 *
 * @return PRIMITIVES_SUCCESS for val == 0 (pDst is not written in that case),
 *         otherwise the result of the directional shift routine, which is -1
 *         for shift amounts of 16 or more.
 */
static inline pstatus_t general_shiftC_16u(const UINT16* WINPR_RESTRICT pSrc, INT32 val,
                                           UINT16* WINPR_RESTRICT pDst, UINT32 len)
{
	if (val == 0)
		return PRIMITIVES_SUCCESS;

	if (val < 0)
	{
		/* Negate in the unsigned domain: -val overflows for INT32_MIN. The
		 * resulting magnitude (2^31) is rejected by the callee as >= 16. */
		const UINT32 amount = 0u - (UINT32)val;
		return general_rShiftC_16u(pSrc, amount, pDst, len);
	}

	return general_lShiftC_16u(pSrc, (UINT32)val, pDst, len);
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Install the generic (portable C) shift primitives into prims. */
void primitives_init_shift(primitives_t* WINPR_RESTRICT prims)
{
	/* Signed 16 bit variants. */
	prims->lShiftC_16s_inplace = general_lShiftC_16s_inplace;
	prims->lShiftC_16s = general_lShiftC_16s;
	prims->rShiftC_16s = general_rShiftC_16s;
	prims->shiftC_16s = general_shiftC_16s; /* direction chosen by sign */

	/* Unsigned 16 bit variants. */
	prims->lShiftC_16u = general_lShiftC_16u;
	prims->rShiftC_16u = general_rShiftC_16u;
	prims->shiftC_16u = general_shiftC_16u; /* direction chosen by sign */
}
|
||||
|
||||
/* Install the generic shift primitives, then run the SSE3 initializer (which checks CPU support itself). */
void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims)
{
	/* Baseline first so every entry is populated. */
	primitives_init_shift(prims);

	primitives_init_shift_sse3(prims);
}
|
||||
41
third_party/FreeRDP/libfreerdp/primitives/prim_shift.h
vendored
Normal file
41
third_party/FreeRDP/libfreerdp/primitives/prim_shift.h
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* Primitives shift
|
||||
*
|
||||
* Copyright 2024 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2024 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_SHIFT_H
|
||||
#define FREERDP_LIB_PRIM_SHIFT_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include "prim_internal.h"
|
||||
|
||||
FREERDP_LOCAL void primitives_init_shift_sse3_int(primitives_t* WINPR_RESTRICT prims);
|
||||
/* Run the SSE3 shift initializer only when the CPU reports both SSE2 and
 * SSE3 support; otherwise leave prims untouched. */
static inline void primitives_init_shift_sse3(primitives_t* WINPR_RESTRICT prims)
{
	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
	{
		primitives_init_shift_sse3_int(prims);
	}
}
|
||||
|
||||
#endif
|
||||
50
third_party/FreeRDP/libfreerdp/primitives/prim_sign.c
vendored
Normal file
50
third_party/FreeRDP/libfreerdp/primitives/prim_sign.c
vendored
Normal file
@@ -0,0 +1,50 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Sign operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_sign.h"
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* Set pDst to the sign-value of the 16-bit values in pSrc (-1, 0, or 1).
|
||||
*/
|
||||
static pstatus_t general_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
                                  UINT32 len)
{
	/* Portable reference implementation: one element at a time. */
	for (UINT32 idx = 0; idx < len; idx++)
	{
		const INT16 value = pSrc[idx];
		INT16 sign = 0;

		if (value < 0)
			sign = -1;
		else if (value > 0)
			sign = 1;

		pDst[idx] = sign;
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Point prims->sign_16s at the portable C implementation. */
void primitives_init_sign(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
/* Start with the default. */
|
||||
prims->sign_16s = general_sign_16s;
|
||||
}
|
||||
|
||||
/* Install the portable sign primitive, then give primitives_init_sign_ssse3()
 * a chance to run on the same table. */
void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
primitives_init_sign(prims);
|
||||
primitives_init_sign_ssse3(prims);
|
||||
}
|
||||
42
third_party/FreeRDP/libfreerdp/primitives/prim_sign.h
vendored
Normal file
42
third_party/FreeRDP/libfreerdp/primitives/prim_sign.h
vendored
Normal file
@@ -0,0 +1,42 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
 * Primitives sign
|
||||
*
|
||||
* Copyright 2024 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2024 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_SIGN_H
|
||||
#define FREERDP_LIB_PRIM_SIGN_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
|
||||
FREERDP_LOCAL void primitives_init_sign_ssse3_int(primitives_t* WINPR_RESTRICT prims);
|
||||
/* Run the SSSE3 sign initializer only when the CPU reports both SSSE3 and
 * SSE3 support; otherwise leave prims untouched. */
static inline void primitives_init_sign_ssse3(primitives_t* WINPR_RESTRICT prims)
{
	if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
	{
		primitives_init_sign_ssse3_int(prims);
	}
}
|
||||
|
||||
#endif
|
||||
455
third_party/FreeRDP/libfreerdp/primitives/primitives.c
vendored
Normal file
455
third_party/FreeRDP/libfreerdp/primitives/primitives.c
vendored
Normal file
@@ -0,0 +1,455 @@
|
||||
/* primitives.c
|
||||
* This code queries processor features and calls the init/deinit routines.
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Copyright 2019 David Fort <contact@hardening-consulting.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <winpr/synch.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
#include <winpr/crypto.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
|
||||
#include <freerdp/log.h>
|
||||
#define TAG FREERDP_TAG("primitives")
|
||||
|
||||
/* hints to know which kind of primitives to use */
|
||||
static primitive_hints primitivesHints = PRIMITIVES_AUTODETECT;
|
||||
static BOOL primitives_init_optimized(primitives_t* prims);
|
||||
|
||||
/* Store the backend selection hint in the file-level primitivesHints
 * variable; primitives_auto_init_cb() later passes it to primitives_init(). */
void primitives_set_hints(primitive_hints hints)
|
||||
{
|
||||
primitivesHints = hints;
|
||||
}
|
||||
|
||||
/* Return the stored backend hint (PRIMITIVES_AUTODETECT unless
 * primitives_set_hints() changed it). */
primitive_hints primitives_get_hints(void)
|
||||
{
|
||||
return primitivesHints;
|
||||
}
|
||||
|
||||
/* Singleton pointer used throughout the program when requested. */
|
||||
static primitives_t pPrimitivesGeneric = WINPR_C_ARRAY_INIT;
|
||||
static INIT_ONCE generic_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
|
||||
|
||||
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
|
||||
static primitives_t pPrimitivesCpu = WINPR_C_ARRAY_INIT;
|
||||
static INIT_ONCE cpu_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
|
||||
|
||||
#endif
|
||||
#if defined(WITH_OPENCL)
|
||||
static primitives_t pPrimitivesGpu = WINPR_C_ARRAY_INIT;
|
||||
static INIT_ONCE gpu_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
|
||||
|
||||
#endif
|
||||
|
||||
static INIT_ONCE auto_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
|
||||
|
||||
static primitives_t pPrimitives = WINPR_C_ARRAY_INIT;
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Fill prims by calling the non-"_opt" initializer of every primitive family
 * (add, andor, alphaComp, copy, set, shift, sign, colors, YCoCg, YUV) and
 * clear the uninit hook. Always returns TRUE. */
static BOOL primitives_init_generic(primitives_t* prims)
|
||||
{
|
||||
primitives_init_add(prims);
|
||||
primitives_init_andor(prims);
|
||||
primitives_init_alphaComp(prims);
|
||||
primitives_init_copy(prims);
|
||||
primitives_init_set(prims);
|
||||
primitives_init_shift(prims);
|
||||
primitives_init_sign(prims);
|
||||
primitives_init_colors(prims);
|
||||
primitives_init_YCoCg(prims);
|
||||
primitives_init_YUV(prims);
|
||||
prims->uninit = nullptr;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* InitOnce callback: initialises the shared pPrimitivesGeneric table.
 * All three callback arguments are unused. */
static BOOL CALLBACK primitives_init_generic_cb(PINIT_ONCE once, PVOID param, PVOID* context)
|
||||
{
|
||||
WINPR_UNUSED(once);
|
||||
WINPR_UNUSED(param);
|
||||
WINPR_UNUSED(context);
|
||||
return primitives_init_generic(&pPrimitivesGeneric);
|
||||
}
|
||||
|
||||
/* Start from the generic table; when HAVE_CPU_OPTIMIZED_PRIMITIVES is defined
 * additionally run every family's "_opt" initializer and set
 * PRIM_FLAGS_HAVE_EXTCPU in prims->flags. Always returns TRUE. */
static BOOL primitives_init_optimized(primitives_t* prims)
|
||||
{
|
||||
primitives_init_generic(prims);
|
||||
|
||||
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
|
||||
primitives_init_add_opt(prims);
|
||||
primitives_init_andor_opt(prims);
|
||||
primitives_init_alphaComp_opt(prims);
|
||||
primitives_init_copy_opt(prims);
|
||||
primitives_init_set_opt(prims);
|
||||
primitives_init_shift_opt(prims);
|
||||
primitives_init_sign_opt(prims);
|
||||
primitives_init_colors_opt(prims);
|
||||
primitives_init_YCoCg_opt(prims);
|
||||
primitives_init_YUV_opt(prims);
|
||||
prims->flags |= PRIM_FLAGS_HAVE_EXTCPU;
|
||||
#endif
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) && defined(WITH_OPENCL)
|
||||
/* Scratch state for the YUV420 -> RGB benchmark: three source planes
 * (channels) with their per-plane strides (steps), the region size (roi),
 * and the destination buffer with its stride and pixel format. */
typedef struct
|
||||
{
|
||||
BYTE* channels[3];
|
||||
UINT32 steps[3];
|
||||
prim_size_t roi;
|
||||
BYTE* outputBuffer;
|
||||
UINT32 outputStride;
|
||||
UINT32 testedFormat;
|
||||
} primitives_YUV_benchmark;
|
||||
|
||||
/* Release every buffer owned by the benchmark state and zero the structure so
 * it can be freed again or re-initialised safely. Accepts a null pointer. */
static void primitives_YUV_benchmark_free(primitives_YUV_benchmark* bench)
{
	if (bench == nullptr)
		return;

	for (size_t plane = 0; plane < 3; plane++)
		free(bench->channels[plane]);

	free(bench->outputBuffer);
	memset(bench, 0, sizeof(*bench));
}
|
||||
|
||||
/* Prepare a benchmark state in caller-provided storage: a 1024x768 region,
 * a BGRA32 destination buffer and three source planes filled with random
 * bytes.
 *
 * Returns ret on success. Returns nullptr when ret is null or when an
 * allocation or the random fill fails; in the failure case every buffer that
 * was already allocated has been released and *ret is zeroed.
 */
static primitives_YUV_benchmark* primitives_YUV_benchmark_init(primitives_YUV_benchmark* ret)
{
	prim_size_t* roi = nullptr;
	if (!ret)
		return nullptr;

	memset(ret, 0, sizeof(primitives_YUV_benchmark));
	roi = &ret->roi;
	roi->width = 1024;
	roi->height = 768;
	ret->outputStride = roi->width * 4;
	ret->testedFormat = PIXEL_FORMAT_BGRA32;

	ret->outputBuffer = calloc(ret->outputStride, roi->height);
	if (!ret->outputBuffer)
		goto fail;

	for (int i = 0; i < 3; i++)
	{
		BYTE* buf = ret->channels[i] = calloc(roi->width, roi->height);
		if (!buf)
			goto fail;

		/* 1ull forces the size computation into 64 bits. */
		if (winpr_RAND(buf, 1ull * roi->width * roi->height) < 0)
			goto fail;
		ret->steps[i] = roi->width;
	}

	return ret;

fail:
	primitives_YUV_benchmark_free(ret);
	/* Report the failure: the caller tests the result against null, and the
	 * state has just been zeroed so it must not be benchmarked. */
	return nullptr;
}
|
||||
|
||||
static BOOL primitives_YUV_benchmark_run(primitives_YUV_benchmark* bench, primitives_t* prims,
|
||||
UINT64 runTime, UINT32* computations)
|
||||
{
|
||||
ULONGLONG dueDate = 0;
|
||||
const BYTE* channels[3] = WINPR_C_ARRAY_INIT;
|
||||
pstatus_t status = 0;
|
||||
|
||||
*computations = 0;
|
||||
|
||||
for (size_t i = 0; i < 3; i++)
|
||||
channels[i] = bench->channels[i];
|
||||
|
||||
/* do a first dry run to initialize cache and such */
|
||||
status = prims->YUV420ToRGB_8u_P3AC4R(channels, bench->steps, bench->outputBuffer,
|
||||
bench->outputStride, bench->testedFormat, &bench->roi);
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
/* let's run the benchmark */
|
||||
dueDate = GetTickCount64() + runTime;
|
||||
while (GetTickCount64() < dueDate)
|
||||
{
|
||||
pstatus_t cstatus =
|
||||
prims->YUV420ToRGB_8u_P3AC4R(channels, bench->steps, bench->outputBuffer,
|
||||
bench->outputStride, bench->testedFormat, &bench->roi);
|
||||
if (cstatus != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
*computations = *computations + 1;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Pick the backend to use for PRIMITIVES_AUTODETECT and copy its table into
 * *prims.
 *
 * When fewer than two accelerated backends are compiled in, the single
 * candidate is taken without benchmarking. When both
 * HAVE_CPU_OPTIMIZED_PRIMITIVES and WITH_OPENCL are defined, every candidate
 * runs the YUV benchmark for 150 ms and the one with the highest conversion
 * count wins.
 *
 * Returns TRUE when a backend was selected. On every failure path FALSE is
 * returned and *prims receives the generic table (if that table can be
 * initialised), so the caller is never left with an unfilled structure.
 */
static BOOL primitives_autodetect_best(primitives_t* prims)
{
	BOOL ret = FALSE;
	struct prim_benchmark
	{
		const char* name;
		primitives_t* prims;
		primitive_hints flags;
		UINT32 count;
	};

	struct prim_benchmark testcases[] = {
		{ "generic", nullptr, PRIMITIVES_PURE_SOFT, 0 },
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
		{ "optimized", nullptr, PRIMITIVES_ONLY_CPU, 0 },
#endif
#if defined(WITH_OPENCL)
		{ "opencl", nullptr, PRIMITIVES_ONLY_GPU, 0 },
#endif
	};
	const struct prim_benchmark* best = nullptr;

#if !defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) || !defined(WITH_OPENCL)
	{
		/* At most one accelerated backend exists: index 1 when there is one,
		 * otherwise the generic entry at index 0. */
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) || defined(WITH_OPENCL)
		struct prim_benchmark* cur = &testcases[1];
#else
		struct prim_benchmark* cur = &testcases[0];
#endif
		cur->prims = primitives_get_by_type(cur->flags);
		if (!cur->prims)
		{
			WLog_WARN(TAG, "Failed to initialize %s primitives", cur->name);
			/* Go through the common exit so the generic fallback is applied. */
			goto out;
		}
		WLog_DBG(TAG, "primitives benchmark: only one backend, skipping...");
		best = cur;
	}
#else
	{
		UINT64 benchDuration = 150; /* 150 ms */
		primitives_YUV_benchmark bench = WINPR_C_ARRAY_INIT;
		primitives_YUV_benchmark* yuvBench = primitives_YUV_benchmark_init(&bench);
		if (!yuvBench)
			goto out; /* common exit applies the generic fallback */

		WLog_DBG(TAG, "primitives benchmark result:");
		for (size_t x = 0; x < ARRAYSIZE(testcases); x++)
		{
			struct prim_benchmark* cur = &testcases[x];
			cur->prims = primitives_get_by_type(cur->flags);
			if (!cur->prims)
			{
				WLog_WARN(TAG, "Failed to initialize %s primitives", cur->name);
				continue;
			}
			if (!primitives_YUV_benchmark_run(yuvBench, cur->prims, benchDuration, &cur->count))
			{
				WLog_WARN(TAG, "error running %s YUV bench", cur->name);
				continue;
			}

			WLog_DBG(TAG, " * %s= %" PRIu32, cur->name, cur->count);
			if (!best || (best->count < cur->count))
				best = cur;
		}
		primitives_YUV_benchmark_free(yuvBench);
	}
#endif

	if (!best)
	{
		WLog_ERR(TAG, "No primitives to test, aborting.");
		goto out;
	}
	/* finally compute the results */
	*prims = *best->prims;

	WLog_DBG(TAG, "primitives autodetect, using %s", best->name);
	ret = TRUE;
out:
	if (!ret)
	{
		/* primitives_get_generic() runs the one-time generic initialisation
		 * if it has not happened yet, so the copy is never taken from an
		 * unfilled table. */
		const primitives_t* fallback = primitives_get_generic();
		if (fallback)
			*prims = *fallback;
	}

	return ret;
}
|
||||
|
||||
#if defined(WITH_OPENCL)
|
||||
/* InitOnce callback: initialises the shared pPrimitivesGpu table through
 * primitives_init_opencl(). All three callback arguments are unused. */
static BOOL CALLBACK primitives_init_gpu_cb(PINIT_ONCE once, PVOID param, PVOID* context)
|
||||
{
|
||||
WINPR_UNUSED(once);
|
||||
WINPR_UNUSED(param);
|
||||
WINPR_UNUSED(context);
|
||||
|
||||
return primitives_init_opencl(&pPrimitivesGpu);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
|
||||
/* InitOnce callback: initialises the shared pPrimitivesCpu table through
 * primitives_init_optimized(). All three callback arguments are unused. */
static BOOL CALLBACK primitives_init_cpu_cb(PINIT_ONCE once, PVOID param, PVOID* context)
|
||||
{
|
||||
WINPR_UNUSED(once);
|
||||
WINPR_UNUSED(param);
|
||||
WINPR_UNUSED(context);
|
||||
|
||||
return (primitives_init_optimized(&pPrimitivesCpu));
|
||||
}
|
||||
#endif
|
||||
|
||||
/* InitOnce callback: fills the shared pPrimitives table by calling
 * primitives_init() with the currently stored hint. All three callback
 * arguments are unused. */
static BOOL CALLBACK primitives_auto_init_cb(PINIT_ONCE once, PVOID param, PVOID* context)
|
||||
{
|
||||
WINPR_UNUSED(once);
|
||||
WINPR_UNUSED(param);
|
||||
WINPR_UNUSED(context);
|
||||
|
||||
return primitives_init(&pPrimitives, primitivesHints);
|
||||
}
|
||||
|
||||
/* Copy the primitives table selected by hints into *p.
 *
 * PRIMITIVES_AUTODETECT delegates to primitives_autodetect_best(). The other
 * hints copy the matching shared table, obtained through the lazy getters so
 * its one-time initialisation has run even when this function is called
 * before primitives_get().
 *
 * A backend that is not compiled in has an empty case body, so control falls
 * through to the following label exactly as before; if no compiled backend is
 * reached the hint is reported as unknown.
 *
 * Returns TRUE when *p was filled, FALSE for an unknown/unavailable hint or
 * when the selected table could not be initialised.
 */
BOOL primitives_init(primitives_t* p, primitive_hints hints)
{
	const primitives_t* selected = nullptr;

	switch (hints)
	{
		case PRIMITIVES_AUTODETECT:
			return primitives_autodetect_best(p);
		case PRIMITIVES_PURE_SOFT:
			selected = primitives_get_generic();
			break;
		case PRIMITIVES_ONLY_CPU:
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
			selected = primitives_get_by_type(PRIMITIVES_ONLY_CPU);
			break;
#endif
		case PRIMITIVES_ONLY_GPU:
#if defined(WITH_OPENCL)
			selected = primitives_get_by_type(PRIMITIVES_ONLY_GPU);
			break;
#endif
		default:
			WLog_ERR(TAG, "unknown hint %u", hints);
			return FALSE;
	}

	/* The getters return nullptr when the one-time initialisation failed. */
	if (!selected)
		return FALSE;

	*p = *selected;
	return TRUE;
}
|
||||
|
||||
/* Call the uninit hook of each compiled-in shared table (GPU, then CPU, then
 * generic) when that hook is set. */
void primitives_uninit(void)
|
||||
{
|
||||
#if defined(WITH_OPENCL)
|
||||
if (pPrimitivesGpu.uninit)
|
||||
pPrimitivesGpu.uninit();
|
||||
#endif
|
||||
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
|
||||
if (pPrimitivesCpu.uninit)
|
||||
pPrimitivesCpu.uninit();
|
||||
#endif
|
||||
if (pPrimitivesGeneric.uninit)
|
||||
pPrimitivesGeneric.uninit();
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static void setup(void)
|
||||
{
|
||||
if (!InitOnceExecuteOnce(&generic_primitives_InitOnce, primitives_init_generic_cb, nullptr,
|
||||
nullptr))
|
||||
return;
|
||||
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
|
||||
if (!InitOnceExecuteOnce(&cpu_primitives_InitOnce, primitives_init_cpu_cb, nullptr, nullptr))
|
||||
return;
|
||||
#endif
|
||||
#if defined(WITH_OPENCL)
|
||||
if (!InitOnceExecuteOnce(&gpu_primitives_InitOnce, primitives_init_gpu_cb, nullptr, nullptr))
|
||||
return;
|
||||
#endif
|
||||
if (!InitOnceExecuteOnce(&auto_primitives_InitOnce, primitives_auto_init_cb, nullptr, nullptr))
|
||||
return;
|
||||
}
|
||||
|
||||
/* Run the one-time setup and return the shared auto-selected table.
 * NOTE(review): the pointer is returned even when setup() stopped early, so
 * callers get no failure signal from this function. */
primitives_t* primitives_get(void)
|
||||
{
|
||||
setup();
|
||||
return &pPrimitives;
|
||||
}
|
||||
|
||||
/* Return the shared generic table, initialising it on first use.
 * Returns nullptr when the one-time initialisation reports failure. */
primitives_t* primitives_get_generic(void)
|
||||
{
|
||||
if (!InitOnceExecuteOnce(&generic_primitives_InitOnce, primitives_init_generic_cb, nullptr,
|
||||
nullptr))
|
||||
return nullptr;
|
||||
return &pPrimitivesGeneric;
|
||||
}
|
||||
|
||||
/* Return the shared table for the requested backend, initialising it on
 * first use. The generic table is always initialised first.
 *
 * A backend that is not compiled in has an empty case body, so control falls
 * through to the next label (GPU -> CPU -> generic). Any other value of type
 * yields the generic table. Returns nullptr when a one-time initialisation
 * reports failure. */
primitives_t* primitives_get_by_type(primitive_hints type)
|
||||
{
|
||||
if (!InitOnceExecuteOnce(&generic_primitives_InitOnce, primitives_init_generic_cb, nullptr,
|
||||
nullptr))
|
||||
return nullptr;
|
||||
|
||||
switch (type)
|
||||
{
|
||||
case PRIMITIVES_ONLY_GPU:
|
||||
#if defined(WITH_OPENCL)
|
||||
if (!InitOnceExecuteOnce(&gpu_primitives_InitOnce, primitives_init_gpu_cb, nullptr,
|
||||
nullptr))
|
||||
return nullptr;
|
||||
return &pPrimitivesGpu;
|
||||
#endif
|
||||
case PRIMITIVES_ONLY_CPU:
|
||||
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
|
||||
if (!InitOnceExecuteOnce(&cpu_primitives_InitOnce, primitives_init_cpu_cb, nullptr,
|
||||
nullptr))
|
||||
return nullptr;
|
||||
return &pPrimitivesCpu;
|
||||
#endif
|
||||
case PRIMITIVES_PURE_SOFT:
|
||||
default:
|
||||
return &pPrimitivesGeneric;
|
||||
}
|
||||
}
|
||||
|
||||
/* Return the flags field of p. p is dereferenced without a null check. */
DWORD primitives_flags(primitives_t* p)
|
||||
{
|
||||
return p->flags;
|
||||
}
|
||||
|
||||
/* Map an avc444_frame_type value to its enumerator name as a string literal;
 * unrecognised values yield "INVALID_FRAME_TYPE". */
const char* primitives_avc444_frame_type_str(avc444_frame_type type)
|
||||
{
|
||||
switch (type)
|
||||
{
|
||||
case AVC444_LUMA:
|
||||
return "AVC444_LUMA";
|
||||
case AVC444_CHROMAv1:
|
||||
return "AVC444_CHROMAv1";
|
||||
case AVC444_CHROMAv2:
|
||||
return "AVC444_CHROMAv2";
|
||||
default:
|
||||
return "INVALID_FRAME_TYPE";
|
||||
}
|
||||
}
|
||||
|
||||
/* Map a primitive_hints value to its enumerator name as a string literal;
 * unrecognised values yield "PRIMITIVES_UNKNOWN".
 * Note: the function name really is spelled "primtives" (sic). */
const char* primtives_hint_str(primitive_hints hint)
|
||||
{
|
||||
switch (hint)
|
||||
{
|
||||
case PRIMITIVES_PURE_SOFT:
|
||||
return "PRIMITIVES_PURE_SOFT";
|
||||
case PRIMITIVES_ONLY_CPU:
|
||||
return "PRIMITIVES_ONLY_CPU";
|
||||
case PRIMITIVES_ONLY_GPU:
|
||||
return "PRIMITIVES_ONLY_GPU";
|
||||
case PRIMITIVES_AUTODETECT:
|
||||
return "PRIMITIVES_AUTODETECT";
|
||||
default:
|
||||
return "PRIMITIVES_UNKNOWN";
|
||||
}
|
||||
}
|
||||
383
third_party/FreeRDP/libfreerdp/primitives/sse/prim_YCoCg_ssse3.c
vendored
Normal file
383
third_party/FreeRDP/libfreerdp/primitives/sse/prim_YCoCg_ssse3.c
vendored
Normal file
@@ -0,0 +1,383 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Optimized YCoCg<->RGB conversion operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_YCoCg.h"
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
#include <tmmintrin.h>
|
||||
|
||||
static primitives_t* generic = nullptr;
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
                                                  BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                                  UINT32 dstStep, UINT32 width, UINT32 height,
                                                  UINT8 shift, BOOL withAlpha)
{
	const BYTE* src = pSrc;
	BYTE* dst = pDst;

	WINPR_ASSERT(srcStep / sizeof(UINT32) >= width);
	WINPR_ASSERT(dstStep / sizeof(UINT32) >= width);
	/* Bytes between the end of one row's pixels and the start of the next row. */
	const size_t srcPad = srcStep - width * sizeof(UINT32);
	const size_t dstPad = dstStep - width * sizeof(UINT32);

	/* Shifting left by "shift" and then halving equals shifting left by
	 * "shift - 1". */
	const int dataShift = shift - 1;
	const BYTE mask = (BYTE)(0xFFU << dataShift);

	/* Source pixels are 4 bytes each; per the original notes byte 0 is Cg,
	 * byte 1 is Co, byte 2 is Y and byte 3 is alpha (a0y0o0g0 a1y1o1g1 ...).
	 * Apply:
	 * |R|   | 1  1/2 -1/2 |   |y|
	 * |G| = | 1   0   1/2 | * |o|
	 * |B|   | 1 -1/2 -1/2 |   |g|
	 * where Y is 8-bit unsigned and o & g are 8-bit signed.
	 */

	if ((width < 8) || ((ULONG_PTR)dst & 0x03))
	{
		/* Fewer than 8 pixels per row, or a destination that is not 4-byte
		 * aligned: hand the whole image to the generic implementation. */
		return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
		                                   DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
		                                   width, height, shift, withAlpha);
	}

	/* Loop-invariant vectors. */
	/* Byte shuffle collecting byte k of each of the four pixels into 32-bit lane k. */
	const __m128i gather = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
	const __m128i zero = mm_set1_epu32(0);
	const __m128i byteMask = mm_set1_epu8(mask);
	const __m128i opaque = mm_set1_epu32(0xFFFFFFFFU);

	for (UINT32 row = 0; row < height; row++)
	{
		UINT32 remaining = width;

		for (; remaining >= 8; remaining -= 8)
		{
			/* Two loads fetch eight source pixels. */
			const __m128i px0 = LOAD_SI128(src);
			src += (128 / 8);
			const __m128i px1 = LOAD_SI128(src);
			src += (128 / 8);

			/* lanes (low to high) of each: Cg x4, Co x4, Y x4, alpha x4 */
			const __m128i planar0 = _mm_shuffle_epi8(px0, gather);
			const __m128i planar1 = _mm_shuffle_epi8(px1, gather);

			/* alphaY:  low 64 bits = Y0..Y7,   high 64 bits = a0..a7
			 * chroma:  low 64 bits = Cg0..Cg7, high 64 bits = Co0..Co7 */
			const __m128i alphaY = _mm_unpackhi_epi32(planar0, planar1);
			__m128i chroma = _mm_unpacklo_epi32(planar0, planar1);

			/* Keep the source alphas (duplicated into both halves) or force 0xFF. */
			__m128i alpha;
			if (withAlpha)
				alpha = _mm_unpackhi_epi64(alphaY, alphaY);
			else
				alpha = opaque;

			/* Widen the Y's from 8-bit unsigned to 16-bit. */
			const __m128i y16 = _mm_unpacklo_epi8(alphaY, zero);

			/* Shift Co/Cg by (shift - 1); this must happen before the sign
			 * extension. There is no 8-bit shift, so shift 16-bit lanes and
			 * mask off the bits that crossed a byte boundary. */
			chroma = _mm_slli_epi16(chroma, dataShift);
			chroma = _mm_and_si128(chroma, byteMask);

			/* Sign-extend Co and Cg from 8 to 16 bits: duplicate each byte
			 * into a 16-bit lane, then arithmetic-shift right by 8. */
			const __m128i co16 = _mm_srai_epi16(_mm_unpackhi_epi8(chroma, chroma), 8);
			const __m128i cg16 = _mm_srai_epi16(_mm_unpacklo_epi8(chroma, chroma), 8);

			/* t = Y - Cg/2;  R = t + Co/2;  G = Y + Cg/2;  B = t - Co/2
			 * (the halving is already folded into dataShift). */
			const __m128i yMinusCg = _mm_subs_epi16(y16, cg16);
			const __m128i sumCo = _mm_adds_epi16(yMinusCg, co16);  /* "R" above */
			const __m128i green = _mm_adds_epi16(y16, cg16);       /* "G" above */
			const __m128i diffCo = _mm_subs_epi16(yMinusCg, co16); /* "B" above */

			/* Saturate back to bytes: low 8 bytes = sumCo, high 8 bytes = diffCo. */
			const __m128i outer = _mm_packus_epi16(sumCo, diffCo);
			/* low 8 bytes = green, high 8 bytes = alpha */
			const __m128i inner =
			    _mm_unpackhi_epi64(_mm_packus_epi16(green, green), alpha);

			/* Re-interleave: every output pixel becomes the byte sequence
			 * (sumCo, green, diffCo, alpha). */
			const __m128i pairLo = _mm_unpacklo_epi8(outer, inner); /* (sumCo, green) x8 */
			const __m128i pairHi = _mm_unpackhi_epi8(outer, inner); /* (diffCo, alpha) x8 */
			const __m128i out0 = _mm_unpacklo_epi16(pairLo, pairHi); /* pixels 0..3 */
			const __m128i out1 = _mm_unpackhi_epi16(pairLo, pairHi); /* pixels 4..7 */

			STORE_SI128(dst, out0);
			dst += (128 / 8);
			STORE_SI128(dst, out1);
			dst += (128 / 8);
		}

		/* Fewer than 8 pixels left in this row: convert them with the
		 * generic implementation as a one-row image. */
		if (remaining > 0)
		{
			const pstatus_t status = generic->YCoCgToRGB_8u_AC4R(
			    src, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dst, DstFormat,
			    WINPR_ASSERTING_INT_CAST(INT32, dstStep), remaining, 1, shift, withAlpha);

			if (status != PRIMITIVES_SUCCESS)
				return status;

			src += remaining * sizeof(UINT32);
			dst += remaining * sizeof(UINT32);
		}

		src += srcPad;
		dst += dstPad;
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc,
                                                     UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                     UINT32 DstFormat, UINT32 dstStep, UINT32 width,
                                                     UINT32 height, UINT8 shift, BOOL withAlpha)
{
	const BYTE* src = pSrc;
	BYTE* dst = pDst;

	/* Same precondition as the _invert variant: without it the unsigned
	 * subtractions below would wrap around for a stride shorter than a row. */
	WINPR_ASSERT(srcStep / sizeof(UINT32) >= width);
	WINPR_ASSERT(dstStep / sizeof(UINT32) >= width);
	/* Bytes between the end of one row's pixels and the start of the next row. */
	const size_t srcPad = srcStep - width * sizeof(UINT32);
	const size_t dstPad = dstStep - width * sizeof(UINT32);

	/* Shifting left by "shift" and then halving equals shifting left by
	 * "shift - 1". */
	const int dataShift = shift - 1;
	const BYTE mask = (BYTE)(0xFFU << dataShift);

	/* Source pixels are 4 bytes each; per the original notes byte 0 is Cg,
	 * byte 1 is Co, byte 2 is Y and byte 3 is alpha (a0y0o0g0 a1y1o1g1 ...).
	 * Apply:
	 * |R|   | 1  1/2 -1/2 |   |y|
	 * |G| = | 1   0   1/2 | * |o|
	 * |B|   | 1 -1/2 -1/2 |   |g|
	 * where Y is 8-bit unsigned and o & g are 8-bit signed.
	 */

	if ((width < 8) || ((ULONG_PTR)dst & 0x03))
	{
		/* Fewer than 8 pixels per row, or a destination that is not 4-byte
		 * aligned: hand the whole image to the generic implementation. */
		return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
		                                   DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
		                                   width, height, shift, withAlpha);
	}

	/* Loop-invariant vectors. */
	/* Byte shuffle collecting byte k of each of the four pixels into 32-bit lane k. */
	const __m128i gather = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
	const __m128i zero = mm_set1_epu32(0);
	const __m128i byteMask = mm_set1_epu8(mask);
	const __m128i opaque = mm_set1_epu32(0xFFFFFFFFU);

	for (UINT32 row = 0; row < height; row++)
	{
		UINT32 remaining = width;

		for (; remaining >= 8; remaining -= 8)
		{
			/* Two loads fetch eight source pixels. */
			const __m128i px0 = LOAD_SI128(src);
			src += (128 / 8);
			const __m128i px1 = LOAD_SI128(src);
			src += (128 / 8);

			/* lanes (low to high) of each: Cg x4, Co x4, Y x4, alpha x4 */
			const __m128i planar0 = _mm_shuffle_epi8(px0, gather);
			const __m128i planar1 = _mm_shuffle_epi8(px1, gather);

			/* alphaY:  low 64 bits = Y0..Y7,   high 64 bits = a0..a7
			 * chroma:  low 64 bits = Cg0..Cg7, high 64 bits = Co0..Co7 */
			const __m128i alphaY = _mm_unpackhi_epi32(planar0, planar1);
			__m128i chroma = _mm_unpacklo_epi32(planar0, planar1);

			/* Keep the source alphas (duplicated into both halves) or force 0xFF. */
			__m128i alpha;
			if (withAlpha)
				alpha = _mm_unpackhi_epi64(alphaY, alphaY);
			else
				alpha = opaque;

			/* Widen the Y's from 8-bit unsigned to 16-bit. */
			const __m128i y16 = _mm_unpacklo_epi8(alphaY, zero);

			/* Shift Co/Cg by (shift - 1); this must happen before the sign
			 * extension. There is no 8-bit shift, so shift 16-bit lanes and
			 * mask off the bits that crossed a byte boundary. */
			chroma = _mm_slli_epi16(chroma, dataShift);
			chroma = _mm_and_si128(chroma, byteMask);

			/* Sign-extend Co and Cg from 8 to 16 bits: duplicate each byte
			 * into a 16-bit lane, then arithmetic-shift right by 8. */
			const __m128i co16 = _mm_srai_epi16(_mm_unpackhi_epi8(chroma, chroma), 8);
			const __m128i cg16 = _mm_srai_epi16(_mm_unpacklo_epi8(chroma, chroma), 8);

			/* t = Y - Cg/2;  R = t + Co/2;  G = Y + Cg/2;  B = t - Co/2
			 * (the halving is already folded into dataShift). */
			const __m128i yMinusCg = _mm_subs_epi16(y16, cg16);
			const __m128i sumCo = _mm_adds_epi16(yMinusCg, co16);  /* "R" above */
			const __m128i green = _mm_adds_epi16(y16, cg16);       /* "G" above */
			const __m128i diffCo = _mm_subs_epi16(yMinusCg, co16); /* "B" above */

			/* The operand order of this pack is the only difference from the
			 * _invert variant: low 8 bytes = diffCo, high 8 bytes = sumCo.
			 * Two separate functions avoid testing a flag inside the loop. */
			const __m128i outer = _mm_packus_epi16(diffCo, sumCo);
			/* low 8 bytes = green, high 8 bytes = alpha */
			const __m128i inner =
			    _mm_unpackhi_epi64(_mm_packus_epi16(green, green), alpha);

			/* Re-interleave: every output pixel becomes the byte sequence
			 * (diffCo, green, sumCo, alpha). */
			const __m128i pairLo = _mm_unpacklo_epi8(outer, inner); /* (diffCo, green) x8 */
			const __m128i pairHi = _mm_unpackhi_epi8(outer, inner); /* (sumCo, alpha) x8 */
			const __m128i out0 = _mm_unpacklo_epi16(pairLo, pairHi); /* pixels 0..3 */
			const __m128i out1 = _mm_unpackhi_epi16(pairLo, pairHi); /* pixels 4..7 */

			STORE_SI128(dst, out0);
			dst += (128 / 8);
			STORE_SI128(dst, out1);
			dst += (128 / 8);
		}

		/* Fewer than 8 pixels left in this row: convert them with the
		 * generic implementation as a one-row image. */
		if (remaining > 0)
		{
			const pstatus_t status = generic->YCoCgToRGB_8u_AC4R(
			    src, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dst, DstFormat,
			    WINPR_ASSERTING_INT_CAST(INT32, dstStep), remaining, 1, shift, withAlpha);

			if (status != PRIMITIVES_SUCCESS)
				return status;

			src += remaining * sizeof(UINT32);
			dst += remaining * sizeof(UINT32);
		}

		src += srcPad;
		dst += dstPad;
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Destination-format dispatcher for the SSSE3 YCoCg-R to RGB conversion.
 *
 * BGRX32/BGRA32 destinations go to the "invert" kernel, RGBX32/RGBA32
 * destinations go to the "no_invert" kernel, and every other destination
 * format is forwarded untouched to the generic implementation.
 * The signed step arguments are converted with WINPR_ASSERTING_INT_CAST
 * only on the SIMD paths.
 */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
                                           BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                           INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
                                           BOOL withAlpha)
{
	const BOOL wantsBgr =
	    (DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32);
	const BOOL wantsRgb =
	    (DstFormat == PIXEL_FORMAT_RGBX32) || (DstFormat == PIXEL_FORMAT_RGBA32);

	/* No SIMD kernel for this layout: let the generic code handle it. */
	if (!wantsBgr && !wantsRgb)
		return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
		                                   height, shift, withAlpha);

	if (wantsBgr)
		return ssse3_YCoCgRToRGB_8u_AC4R_invert(
		    pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat,
		    WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha);

	return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
	    pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat,
	    WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha);
}
|
||||
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Install the SSSE3 YCoCg-R to RGB converter into the primitives table.
 *
 * With SSE/AVX intrinsics enabled, the file-level `generic` table is fetched
 * (the SIMD routines in this file call into it) and
 * prims->YCoCgToRGB_8u_AC4R is replaced by the SSSE3 dispatcher.
 * Without intrinsics the table is left untouched and only a verbose log line
 * is emitted.
 */
void primitives_init_YCoCg_ssse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
	generic = primitives_get_generic();

	WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
	prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
#else
	/* This file provides the SSSE3 variant, so name SSSE3 in the message
	 * (it previously said SSE2, unlike the sibling init functions). */
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSSE3 intrinsics not available");
	WINPR_UNUSED(prims);
#endif
}
|
||||
1742
third_party/FreeRDP/libfreerdp/primitives/sse/prim_YUV_sse4.1.c
vendored
Normal file
1742
third_party/FreeRDP/libfreerdp/primitives/sse/prim_YUV_sse4.1.c
vendored
Normal file
File diff suppressed because it is too large
Load Diff
187
third_party/FreeRDP/libfreerdp/primitives/sse/prim_add_sse3.c
vendored
Normal file
187
third_party/FreeRDP/libfreerdp/primitives/sse/prim_add_sse3.c
vendored
Normal file
@@ -0,0 +1,187 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Optimized add operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_add.h"
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
#include <pmmintrin.h>
|
||||
|
||||
static primitives_t* generic = nullptr;
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Instantiate sse3_add_16s (installed as prims->add_16s by the init function
 * of this file) from the SSE3_SSD_ROUTINE template. The template receives the
 * element type (INT16), the generic fallback, the vector operation
 * (_mm_adds_epi16) and a one-element slow-path expression.
 * NOTE(review): the template body is not visible here; presumably it lives in
 * prim_templates.h - confirm before relying on its alignment handling. */
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
|
||||
generic->add_16s(sptr1++, sptr2++, dptr++, 1))
|
||||
|
||||
/* In-place saturating add of two INT16 buffers.
 *
 * Every element pair is summed with _mm_adds_epi16 and the sum is written
 * back to BOTH pSrcDst1 and pSrcDst2.
 *
 * pSrcDst1 / pSrcDst2: the two buffers, each holding ulen elements.
 * ulen:                number of INT16 elements to process.
 *
 * Returns PRIMITIVES_SUCCESS, or the status of the generic fallback if it
 * reports an error.
 *
 * Buffers shorter than 16 elements, or a first buffer that is not 2-byte
 * aligned, are handled entirely by the generic implementation.
 */
static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
                                      INT16* WINPR_RESTRICT pSrcDst2, UINT32 ulen)
{
	const int shifts = 2;
	INT16* dptr1 = pSrcDst1;
	INT16* dptr2 = pSrcDst2;

	if (ulen < 16) /* pointless if too small */
		return generic->add_16s_inplace(pSrcDst1, pSrcDst2, ulen);

	const UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
	if ((ULONG_PTR)pSrcDst1 & offBeatMask)
	{
		/* Incrementing the pointer skips over 16-byte boundary. */
		return generic->add_16s_inplace(pSrcDst1, pSrcDst2, ulen);
	}

	/* Elements still to be processed. */
	size_t len = ulen;

	/* Get to the 16-byte boundary now. */
	const size_t rem = ((UINT_PTR)dptr1 & 0xf) / sizeof(INT16);
	if (rem != 0)
	{
		/* (rem + add) * sizeof(INT16) == 32, so dptr1 ends up 16-byte aligned.
		 * add is at most 15 and ulen is at least 16 here, so len cannot
		 * underflow below. */
		const UINT32 add = 16 - (UINT32)rem;
		pstatus_t status = generic->add_16s_inplace(dptr1, dptr2, add);
		if (status != PRIMITIVES_SUCCESS)
			return status;
		dptr1 += add;
		dptr2 += add;
		/* The lead-in elements are done; without this the vector loops
		 * would run `add` elements past the end of both buffers. */
		len -= add;
	}

	/* Use 4 128-bit SSE registers per buffer (32 elements per iteration).
	 * LOAD_SI128/STORE_SI128 are used for both aligned and unaligned
	 * pointers, so a single loop covers both cases. */
	size_t count = len >> (7 - shifts);
	len -= count << (7 - shifts);
	while (count--)
	{
		const __m128i* vsptr1 = (const __m128i*)dptr1;
		const __m128i* vsptr2 = (const __m128i*)dptr2;
		__m128i* vdptr1 = (__m128i*)dptr1;
		__m128i* vdptr2 = (__m128i*)dptr2;

		__m128i xmm0 = LOAD_SI128(vsptr1++);
		__m128i xmm1 = LOAD_SI128(vsptr1++);
		__m128i xmm2 = LOAD_SI128(vsptr1++);
		__m128i xmm3 = LOAD_SI128(vsptr1++);
		__m128i xmm4 = LOAD_SI128(vsptr2++);
		__m128i xmm5 = LOAD_SI128(vsptr2++);
		__m128i xmm6 = LOAD_SI128(vsptr2++);
		__m128i xmm7 = LOAD_SI128(vsptr2++);

		xmm0 = _mm_adds_epi16(xmm0, xmm4);
		xmm1 = _mm_adds_epi16(xmm1, xmm5);
		xmm2 = _mm_adds_epi16(xmm2, xmm6);
		xmm3 = _mm_adds_epi16(xmm3, xmm7);

		STORE_SI128(vdptr1++, xmm0);
		STORE_SI128(vdptr1++, xmm1);
		STORE_SI128(vdptr1++, xmm2);
		STORE_SI128(vdptr1++, xmm3);

		STORE_SI128(vdptr2++, xmm0);
		STORE_SI128(vdptr2++, xmm1);
		STORE_SI128(vdptr2++, xmm2);
		STORE_SI128(vdptr2++, xmm3);

		dptr1 = (INT16*)vdptr1;
		dptr2 = (INT16*)vdptr2;
	}

	/* Use a single 128-bit SSE register (8 elements per iteration). */
	count = len >> (5 - shifts);
	len -= count << (5 - shifts);
	while (count--)
	{
		const __m128i* vsptr1 = (const __m128i*)dptr1;
		const __m128i* vsptr2 = (const __m128i*)dptr2;
		__m128i* vdptr1 = (__m128i*)dptr1;
		__m128i* vdptr2 = (__m128i*)dptr2;

		__m128i xmm0 = LOAD_SI128(vsptr1);
		__m128i xmm1 = LOAD_SI128(vsptr2);

		xmm0 = _mm_adds_epi16(xmm0, xmm1);

		STORE_SI128(vdptr1++, xmm0);
		STORE_SI128(vdptr2++, xmm0);

		dptr1 = (INT16*)vdptr1;
		dptr2 = (INT16*)vdptr2;
	}

	/* Finish off the remainder. */
	if (len > 0)
		return generic->add_16s_inplace(dptr1, dptr2, WINPR_ASSERTING_INT_CAST(uint32_t, len));

	return PRIMITIVES_SUCCESS;
}
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Hook the SSE3 add routines into the primitives table.
 *
 * Without SSE/AVX intrinsics nothing is installed and only a verbose log
 * line is written. Otherwise the file-level `generic` table is fetched first
 * (the SIMD routines fall back to it) and both add_16s entries are replaced.
 */
void primitives_init_add_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(SSE_AVX_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
	WINPR_UNUSED(prims);
#else
	generic = primitives_get_generic();

	WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
	prims->add_16s_inplace = sse3_add_16s_inplace;
	prims->add_16s = sse3_add_16s;
#endif
}
|
||||
215
third_party/FreeRDP/libfreerdp/primitives/sse/prim_alphaComp_sse3.c
vendored
Normal file
215
third_party/FreeRDP/libfreerdp/primitives/sse/prim_alphaComp_sse3.c
vendored
Normal file
@@ -0,0 +1,215 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Optimized alpha blending routines.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*
|
||||
* Note: this code assumes the second operand is fully opaque,
|
||||
* e.g.
|
||||
* newval = alpha1*val1 + (1-alpha1)*val2
|
||||
* rather than
|
||||
* newval = alpha1*val1 + (1-alpha1)*alpha2*val2
|
||||
* The IPP gives other options.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_alphaComp.h"
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_avxsse.h"
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
#include <pmmintrin.h>
|
||||
|
||||
static primitives_t* generic = nullptr;
|
||||
|
||||
/* Alpha-blend image 1 over image 2, four 32-bit pixels per iteration.
 *
 * For every 8-bit channel the result is
 *     (((alpha1 + 1) * (val1 - val2)) >> 8) + val2
 * where alpha1 is taken from the highest byte of the source-1 pixel.
 *
 * pSrc1/src1Step: first image and its row stride in bytes.
 * pSrc2/src2Step: second image and its row stride in bytes.
 * pDst/dstStep:   output image and its row stride in bytes.
 * width/height:   size of the region in pixels.
 *
 * Rows narrower than 4 pixels, the pixels in front of the first 16-byte
 * aligned destination address, and the 0-3 pixels left after the vector loop
 * are delegated to the generic implementation.
 */
static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
                                     const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
                                     BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width,
                                     UINT32 height)
{
	if ((width == 0) || (height == 0))
		return PRIMITIVES_SUCCESS;

	/* Less than one vector per row: not worth the setup. */
	if (width < 4)
		return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width,
		                               height);

	const UINT32* fg = (const UINT32*)pSrc1;
	const UINT32* bg = (const UINT32*)pSrc2;
	UINT32* out = (UINT32*)pDst;

	/* Distance, in pixels, from the end of one row to the start of the next. */
	const size_t rowBytes = width * sizeof(UINT32);
	const size_t fgSkip = (src1Step - rowBytes) / sizeof(UINT32);
	const size_t bgSkip = (src2Step - rowBytes) / sizeof(UINT32);
	const size_t outSkip = (dstStep - rowBytes) / sizeof(UINT32);

	const __m128i zero = mm_set1_epu32(0);
	const __m128i one = _mm_set1_epi16(1);
	/* Keeps only the low byte of each 16-bit lane before packing. */
	const __m128i lowByte = _mm_set1_epi16(0x00ffU);

	for (UINT32 row = 0; row < height; ++row)
	{
		uint32_t remaining = width;

		/* Pixels to process the slow way until `out` is 16-byte aligned.
		 * If `out` is not even 4-byte aligned it never will be, so the
		 * whole row goes the slow way. */
		const ULONG_PTR misalign = (ULONG_PTR)out & 0x0f;
		uint32_t leadIn = width;
		if ((misalign % 4) == 0)
			leadIn = (uint32_t)(((16 - misalign) / 4) % 4);

		if (leadIn != 0)
		{
			const pstatus_t status =
			    generic->alphaComp_argb((const BYTE*)fg, src1Step, (const BYTE*)bg, src2Step,
			                            (BYTE*)out, dstStep, leadIn, 1);
			if (status != PRIMITIVES_SUCCESS)
				return status;

			fg += leadIn;
			bg += leadIn;
			out += leadIn;
			remaining -= leadIn;
		}

		/* Vector part: 4 pixels per iteration. */
		uint32_t quads = remaining >> 2;
		remaining -= quads << 2;

		for (; quads > 0; quads--)
		{
			const __m128i fgPix = LOAD_SI128(fg);
			fg += 4;
			const __m128i bgPix = LOAD_SI128(bg);
			bg += 4;

			/* Upper two pixels, widened to 16 bits per channel. */
			const __m128i fgHi = _mm_unpackhi_epi8(fgPix, zero);
			const __m128i bgHi = _mm_unpackhi_epi8(bgPix, zero);
			const __m128i diffHi = _mm_subs_epi16(fgHi, bgHi);
			/* Broadcast word 3 (alpha) of each pixel to all four words. */
			__m128i alphaHi = _mm_shufflelo_epi16(fgHi, 0xff);
			alphaHi = _mm_shufflehi_epi16(alphaHi, 0xff);
			alphaHi = _mm_adds_epi16(alphaHi, one);
			__m128i blendHi = _mm_mullo_epi16(alphaHi, diffHi);
			blendHi = _mm_srai_epi16(blendHi, 8);
			blendHi = _mm_adds_epi16(blendHi, bgHi);

			/* Lower two pixels, same sequence. */
			const __m128i fgLo = _mm_unpacklo_epi8(fgPix, zero);
			const __m128i bgLo = _mm_unpacklo_epi8(bgPix, zero);
			const __m128i diffLo = _mm_subs_epi16(fgLo, bgLo);
			__m128i alphaLo = _mm_shufflelo_epi16(fgLo, 0xff);
			alphaLo = _mm_shufflehi_epi16(alphaLo, 0xff);
			alphaLo = _mm_adds_epi16(alphaLo, one);
			__m128i blendLo = _mm_mullo_epi16(alphaLo, diffLo);
			blendLo = _mm_srai_epi16(blendLo, 8);
			blendLo = _mm_adds_epi16(blendLo, bgLo);

			/* Must mask off remainders or pack gets confused. */
			blendHi = _mm_and_si128(blendHi, lowByte);
			blendLo = _mm_and_si128(blendLo, lowByte);

			/* Narrow back to 8 bits per channel, low pixels first. */
			STORE_SI128(out, _mm_packus_epi16(blendLo, blendHi));
			out += 4;
		}

		/* Whatever is left (fewer than 4 pixels) goes the slow way. */
		if (remaining != 0)
		{
			const pstatus_t status =
			    generic->alphaComp_argb((const BYTE*)fg, src1Step, (const BYTE*)bg, src2Step,
			                            (BYTE*)out, dstStep, remaining, 1);
			if (status != PRIMITIVES_SUCCESS)
				return status;

			fg += remaining;
			bg += remaining;
			out += remaining;
		}

		/* Advance to the next row. */
		fg += fgSkip;
		bg += bgSkip;
		out += outSkip;
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Hook the SSE alpha-compositing routine into the primitives table.
 *
 * Without SSE/AVX intrinsics nothing is installed and only a verbose log
 * line is written. Otherwise the file-level `generic` table is fetched and
 * prims->alphaComp_argb is replaced by the SIMD implementation.
 */
void primitives_init_alphaComp_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(SSE_AVX_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
	WINPR_UNUSED(prims);
#else
	generic = primitives_get_generic();
	WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
	prims->alphaComp_argb = sse2_alphaComp_argb;
#endif
}
|
||||
54
third_party/FreeRDP/libfreerdp/primitives/sse/prim_andor_sse3.c
vendored
Normal file
54
third_party/FreeRDP/libfreerdp/primitives/sse/prim_andor_sse3.c
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Optimized Logical operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_andor.h"
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
#include <pmmintrin.h>
|
||||
|
||||
static primitives_t* generic = nullptr;
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Instantiate sse3_andC_32u and sse3_orC_32u (installed as prims->andC_32u and
 * prims->orC_32u by the init function of this file) from the
 * SSE3_SCD_PRE_ROUTINE template. Each instantiation passes the element type
 * (UINT32), the generic fallback, the vector operation (_mm_and_si128 or
 * _mm_or_si128) and a one-element scalar expression combining *sptr with val.
 * NOTE(review): the template body is not visible here; presumably it lives in
 * prim_templates.h - confirm there how `val`, `sptr` and `dptr` are bound. */
SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, generic->andC_32u, _mm_and_si128,
|
||||
*dptr++ = *sptr++ & val)
|
||||
SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, generic->orC_32u, _mm_or_si128, *dptr++ = *sptr++ | val)
|
||||
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Hook the SSE3 and/or-with-constant routines into the primitives table.
 *
 * Without SSE/AVX intrinsics nothing is installed and only a verbose log
 * line is written. Otherwise the file-level `generic` table is fetched and
 * both andC_32u and orC_32u are replaced by their SIMD versions.
 */
void primitives_init_andor_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(SSE_AVX_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
	WINPR_UNUSED(prims);
#else
	generic = primitives_get_generic();

	WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
	prims->orC_32u = sse3_orC_32u;
	prims->andC_32u = sse3_andC_32u;
#endif
}
|
||||
79
third_party/FreeRDP/libfreerdp/primitives/sse/prim_avxsse.h
vendored
Normal file
79
third_party/FreeRDP/libfreerdp/primitives/sse/prim_avxsse.h
vendored
Normal file
@@ -0,0 +1,79 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* FreeRDP primitives SSE implementation
|
||||
*
|
||||
* Copyright 2025 Armin Novak <armin.novak@thincast.com>
|
||||
* Copyright 2025 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <winpr/cast.h>
|
||||
|
||||
#include "../../core/simd.h"
|
||||
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
#include <pmmintrin.h>
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline __m128i mm_set_epu32(uint32_t val1, uint32_t val2, uint32_t val3, uint32_t val4)
|
||||
{
|
||||
return _mm_set_epi32(WINPR_CXX_COMPAT_CAST(int32_t, val1), WINPR_CXX_COMPAT_CAST(int32_t, val2),
|
||||
WINPR_CXX_COMPAT_CAST(int32_t, val3),
|
||||
WINPR_CXX_COMPAT_CAST(int32_t, val4));
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline __m128i mm_set_epu8(uint8_t val1, uint8_t val2, uint8_t val3, uint8_t val4,
|
||||
uint8_t val5, uint8_t val6, uint8_t val7, uint8_t val8,
|
||||
uint8_t val9, uint8_t val10, uint8_t val11, uint8_t val12,
|
||||
uint8_t val13, uint8_t val14, uint8_t val15, uint8_t val16)
|
||||
{
|
||||
return _mm_set_epi8(WINPR_CXX_COMPAT_CAST(int8_t, val1), WINPR_CXX_COMPAT_CAST(int8_t, val2),
|
||||
WINPR_CXX_COMPAT_CAST(int8_t, val3), WINPR_CXX_COMPAT_CAST(int8_t, val4),
|
||||
WINPR_CXX_COMPAT_CAST(int8_t, val5), WINPR_CXX_COMPAT_CAST(int8_t, val6),
|
||||
WINPR_CXX_COMPAT_CAST(int8_t, val7), WINPR_CXX_COMPAT_CAST(int8_t, val8),
|
||||
WINPR_CXX_COMPAT_CAST(int8_t, val9), WINPR_CXX_COMPAT_CAST(int8_t, val10),
|
||||
WINPR_CXX_COMPAT_CAST(int8_t, val11), WINPR_CXX_COMPAT_CAST(int8_t, val12),
|
||||
WINPR_CXX_COMPAT_CAST(int8_t, val13), WINPR_CXX_COMPAT_CAST(int8_t, val14),
|
||||
WINPR_CXX_COMPAT_CAST(int8_t, val15), WINPR_CXX_COMPAT_CAST(int8_t, val16));
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline __m128i mm_set1_epu32(uint32_t val)
|
||||
{
|
||||
return _mm_set1_epi32(WINPR_CXX_COMPAT_CAST(int32_t, val));
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline __m128i mm_set1_epu8(uint8_t val)
|
||||
{
|
||||
return _mm_set1_epi8(WINPR_CXX_COMPAT_CAST(int8_t, val));
|
||||
}
|
||||
|
||||
WINPR_ATTR_NODISCARD
|
||||
static inline __m128i LOAD_SI128(const void* ptr)
|
||||
{
|
||||
const __m128i* mptr = WINPR_CXX_COMPAT_CAST(const __m128i*, ptr);
|
||||
return _mm_lddqu_si128(mptr);
|
||||
}
|
||||
|
||||
static inline void STORE_SI128(void* ptr, __m128i val)
|
||||
{
|
||||
__m128i* mptr = WINPR_CXX_COMPAT_CAST(__m128i*, ptr);
|
||||
_mm_storeu_si128(mptr, val);
|
||||
}
|
||||
|
||||
#endif
|
||||
1056
third_party/FreeRDP/libfreerdp/primitives/sse/prim_colors_sse2.c
vendored
Normal file
1056
third_party/FreeRDP/libfreerdp/primitives/sse/prim_colors_sse2.c
vendored
Normal file
File diff suppressed because it is too large
Load Diff
278
third_party/FreeRDP/libfreerdp/primitives/sse/prim_copy_avx2.c
vendored
Normal file
278
third_party/FreeRDP/libfreerdp/primitives/sse/prim_copy_avx2.c
vendored
Normal file
@@ -0,0 +1,278 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Copy operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <freerdp/log.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_copy.h"
|
||||
#include "../codec/color.h"
|
||||
|
||||
#include <freerdp/codec/color.h>
|
||||
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
/* Unsigned front end for _mm256_set_epi32: each uint32_t argument is cast to
 * int32_t and forwarded in the same argument position. */
static inline __m256i mm256_set_epu32(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3,
                                      uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7)
{
	const int32_t s0 = (int32_t)i0;
	const int32_t s1 = (int32_t)i1;
	const int32_t s2 = (int32_t)i2;
	const int32_t s3 = (int32_t)i3;
	const int32_t s4 = (int32_t)i4;
	const int32_t s5 = (int32_t)i5;
	const int32_t s6 = (int32_t)i6;
	const int32_t s7 = (int32_t)i7;
	return _mm256_set_epi32(s0, s1, s2, s3, s4, s5, s6, s7);
}
|
||||
|
||||
static inline pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
|
||||
UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
|
||||
UINT32 nHeight,
|
||||
const BYTE* WINPR_RESTRICT pSrcData,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
|
||||
int64_t srcVMultiplier, int64_t srcVOffset,
|
||||
int64_t dstVMultiplier, int64_t dstVOffset)
|
||||
{
|
||||
|
||||
const int64_t srcByte = 3;
|
||||
const int64_t dstByte = 4;
|
||||
|
||||
const __m256i mask = mm256_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
|
||||
0xFF000000, 0xFF000000, 0xFF000000);
|
||||
const __m256i smask = mm256_set_epu32(0xff171615, 0xff141312, 0xff1110ff, 0xffffffff,
|
||||
0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
|
||||
const __m256i shelpmask = mm256_set_epu32(0xffffffff, 0xffffffff, 0xffffff1f, 0xff1e1d1c,
|
||||
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
|
||||
const UINT32 rem = nWidth % 8;
|
||||
const int64_t width = nWidth - rem;
|
||||
|
||||
for (int64_t y = 0; y < nHeight; y++)
|
||||
{
|
||||
const BYTE* WINPR_RESTRICT srcLine =
|
||||
&pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
|
||||
BYTE* WINPR_RESTRICT dstLine =
|
||||
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
|
||||
|
||||
int64_t x = 0;
|
||||
|
||||
/* Ensure alignment requirements can be met */
|
||||
for (; x < width; x += 8)
|
||||
{
|
||||
const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
|
||||
__m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
|
||||
const __m256i s0 = _mm256_loadu_si256(src);
|
||||
__m256i s1 = _mm256_shuffle_epi8(s0, smask);
|
||||
|
||||
/* _mm256_shuffle_epi8 can not cross 128bit lanes.
|
||||
* manually copy these bytes with extract/insert */
|
||||
const __m256i sx = _mm256_broadcastsi128_si256(_mm256_extractf128_si256(s0, 0));
|
||||
const __m256i sxx = _mm256_shuffle_epi8(sx, shelpmask);
|
||||
const __m256i bmask = _mm256_set_epi32(0x00000000, 0x00000000, 0x000000FF, 0x00FFFFFF,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000);
|
||||
const __m256i merged = _mm256_blendv_epi8(s1, sxx, bmask);
|
||||
|
||||
const __m256i s2 = _mm256_loadu_si256(dst);
|
||||
__m256i d0 = _mm256_blendv_epi8(merged, s2, mask);
|
||||
_mm256_storeu_si256(dst, d0);
|
||||
}
|
||||
|
||||
for (; x < nWidth; x++)
|
||||
{
|
||||
const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
|
||||
BYTE* dst = &dstLine[(x + nXDst) * dstByte];
|
||||
*dst++ = *src++;
|
||||
*dst++ = *src++;
|
||||
*dst++ = *src++;
|
||||
}
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static inline pstatus_t avx2_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData,
|
||||
UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
|
||||
UINT32 nWidth, UINT32 nHeight,
|
||||
const BYTE* WINPR_RESTRICT pSrcData,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
|
||||
int64_t srcVMultiplier, int64_t srcVOffset,
|
||||
int64_t dstVMultiplier, int64_t dstVOffset)
|
||||
{
|
||||
|
||||
const int64_t srcByte = 4;
|
||||
const int64_t dstByte = 4;
|
||||
|
||||
const __m256i mask = _mm256_setr_epi8(
|
||||
(char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
|
||||
(char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
|
||||
(char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
|
||||
(char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00);
|
||||
const UINT32 rem = nWidth % 8;
|
||||
const int64_t width = nWidth - rem;
|
||||
for (int64_t y = 0; y < nHeight; y++)
|
||||
{
|
||||
const BYTE* WINPR_RESTRICT srcLine =
|
||||
&pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
|
||||
BYTE* WINPR_RESTRICT dstLine =
|
||||
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
|
||||
|
||||
int64_t x = 0;
|
||||
for (; x < width; x += 8)
|
||||
{
|
||||
const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
|
||||
__m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
|
||||
const __m256i s0 = _mm256_loadu_si256(src);
|
||||
const __m256i s1 = _mm256_loadu_si256(dst);
|
||||
__m256i d0 = _mm256_blendv_epi8(s1, s0, mask);
|
||||
_mm256_storeu_si256(dst, d0);
|
||||
}
|
||||
|
||||
for (; x < nWidth; x++)
|
||||
{
|
||||
const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
|
||||
BYTE* dst = &dstLine[(x + nXDst) * dstByte];
|
||||
*dst++ = *src++;
|
||||
*dst++ = *src++;
|
||||
*dst++ = *src++;
|
||||
}
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* Pick the AVX2 copy kernel for a source/destination format pair when the
 * destination alpha has to be preserved.
 *
 *  - BGR24 source into BGRX32/BGRA32 destination: 24 to 32 bit kernel.
 *  - BGRX32/BGRA32 into BGRX32/BGRA32, or RGBX32/RGBA32 into RGBX32/RGBA32:
 *    32 to 32 bit kernel.
 *  - anything else: the generic copy_no_overlap, called with the original
 *    flags.
 */
static pstatus_t avx2_image_copy_no_overlap_dst_alpha(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier,
    int64_t dstVOffset)
{
	WINPR_ASSERT(pDstData);
	WINPR_ASSERT(pSrcData);

	const BOOL dstBgr32 =
	    (DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32);
	const BOOL dstRgb32 =
	    (DstFormat == PIXEL_FORMAT_RGBX32) || (DstFormat == PIXEL_FORMAT_RGBA32);
	const BOOL srcBgr32 =
	    (SrcFormat == PIXEL_FORMAT_BGRX32) || (SrcFormat == PIXEL_FORMAT_BGRA32);
	const BOOL srcRgb32 =
	    (SrcFormat == PIXEL_FORMAT_RGBX32) || (SrcFormat == PIXEL_FORMAT_RGBA32);

	if ((SrcFormat == PIXEL_FORMAT_BGR24) && dstBgr32)
		return avx2_image_copy_bgr24_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
		                                    pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
		                                    srcVOffset, dstVMultiplier, dstVOffset);

	/* Same channel order on both sides: one kernel serves BGR and RGB. */
	if ((srcBgr32 && dstBgr32) || (srcRgb32 && dstRgb32))
		return avx2_image_copy_bgrx32_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
		                                     pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
		                                     srcVOffset, dstVMultiplier, dstVOffset);

	primitives_t* fallback = primitives_get_generic();
	return fallback->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth,
	                                 nHeight, pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc,
	                                 palette, flags);
}
|
||||
|
||||
static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
|
||||
UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
|
||||
UINT32 nWidth, UINT32 nHeight,
|
||||
const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
|
||||
const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
|
||||
{
|
||||
const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) != 0;
|
||||
int64_t srcVOffset = 0;
|
||||
int64_t srcVMultiplier = 1;
|
||||
int64_t dstVOffset = 0;
|
||||
int64_t dstVMultiplier = 1;
|
||||
|
||||
if ((nWidth == 0) || (nHeight == 0))
|
||||
return PRIMITIVES_SUCCESS;
|
||||
|
||||
if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
|
||||
return -1;
|
||||
|
||||
if (!pDstData || !pSrcData)
|
||||
return -1;
|
||||
|
||||
if (nDstStep == 0)
|
||||
nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
|
||||
|
||||
if (nSrcStep == 0)
|
||||
nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
|
||||
|
||||
if (vSrcVFlip)
|
||||
{
|
||||
srcVOffset = (nHeight - 1ll) * nSrcStep;
|
||||
srcVMultiplier = -1;
|
||||
}
|
||||
|
||||
if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
|
||||
return avx2_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
|
||||
nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
|
||||
nXSrc, nYSrc, palette, flags, srcVMultiplier,
|
||||
srcVOffset, dstVMultiplier, dstVOffset);
|
||||
else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
|
||||
return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
|
||||
nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
|
||||
nXSrc, nYSrc, palette, srcVMultiplier,
|
||||
srcVOffset, dstVMultiplier, dstVOffset, flags);
|
||||
else
|
||||
{
|
||||
primitives_t* gen = primitives_get_generic();
|
||||
return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
|
||||
pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/**
 * Install the AVX2 copy_no_overlap implementation into @p prims.
 * When the intrinsics are not compiled in, @p prims is left unchanged and
 * only a verbose log line is emitted.
 */
void primitives_init_copy_avx2_int(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(SSE_AVX_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or WITH_AVX2 or AVX2 intrinsics not available");
	WINPR_UNUSED(prims);
#else
	WLog_VRB(PRIM_TAG, "AVX2 optimizations");
	prims->copy_no_overlap = avx2_image_copy_no_overlap;
#endif
}
|
||||
257
third_party/FreeRDP/libfreerdp/primitives/sse/prim_copy_sse4_1.c
vendored
Normal file
257
third_party/FreeRDP/libfreerdp/primitives/sse/prim_copy_sse4_1.c
vendored
Normal file
@@ -0,0 +1,257 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Copy operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <freerdp/log.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_avxsse.h"
|
||||
#include "prim_copy.h"
|
||||
#include "../codec/color.h"
|
||||
|
||||
#include <freerdp/codec/color.h>
|
||||
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline pstatus_t sse_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
|
||||
UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
|
||||
UINT32 nHeight,
|
||||
const BYTE* WINPR_RESTRICT pSrcData,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
|
||||
int64_t srcVMultiplier, int64_t srcVOffset,
|
||||
int64_t dstVMultiplier, int64_t dstVOffset)
|
||||
{
|
||||
|
||||
const int64_t srcByte = 3;
|
||||
const int64_t dstByte = 4;
|
||||
|
||||
const __m128i mask = mm_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000);
|
||||
const __m128i smask = mm_set_epu32(0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
|
||||
const UINT32 rem = nWidth % 4;
|
||||
|
||||
const int64_t width = nWidth - rem;
|
||||
for (int64_t y = 0; y < nHeight; y++)
|
||||
{
|
||||
const BYTE* WINPR_RESTRICT srcLine =
|
||||
&pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
|
||||
BYTE* WINPR_RESTRICT dstLine =
|
||||
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
|
||||
|
||||
int64_t x = 0;
|
||||
/* Ensure alignment requirements can be met */
|
||||
for (; x < width; x += 4)
|
||||
{
|
||||
const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
|
||||
__m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
|
||||
const __m128i s0 = LOAD_SI128(src);
|
||||
const __m128i s1 = _mm_shuffle_epi8(s0, smask);
|
||||
const __m128i s2 = LOAD_SI128(dst);
|
||||
|
||||
__m128i d0 = _mm_blendv_epi8(s1, s2, mask);
|
||||
STORE_SI128(dst, d0);
|
||||
}
|
||||
|
||||
for (; x < nWidth; x++)
|
||||
{
|
||||
const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
|
||||
BYTE* dst = &dstLine[(x + nXDst) * dstByte];
|
||||
*dst++ = *src++;
|
||||
*dst++ = *src++;
|
||||
*dst++ = *src++;
|
||||
}
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static inline pstatus_t sse_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
|
||||
UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
|
||||
UINT32 nHeight,
|
||||
const BYTE* WINPR_RESTRICT pSrcData,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
|
||||
int64_t srcVMultiplier, int64_t srcVOffset,
|
||||
int64_t dstVMultiplier, int64_t dstVOffset)
|
||||
{
|
||||
|
||||
const int64_t srcByte = 4;
|
||||
const int64_t dstByte = 4;
|
||||
|
||||
const __m128i mask = _mm_setr_epi8((char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF,
|
||||
(char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF,
|
||||
(char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00);
|
||||
const UINT32 rem = nWidth % 4;
|
||||
const int64_t width = nWidth - rem;
|
||||
for (int64_t y = 0; y < nHeight; y++)
|
||||
{
|
||||
const BYTE* WINPR_RESTRICT srcLine =
|
||||
&pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
|
||||
BYTE* WINPR_RESTRICT dstLine =
|
||||
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
|
||||
|
||||
int64_t x = 0;
|
||||
for (; x < width; x += 4)
|
||||
{
|
||||
const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
|
||||
__m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
|
||||
const __m128i s0 = LOAD_SI128(src);
|
||||
const __m128i s1 = LOAD_SI128(dst);
|
||||
__m128i d0 = _mm_blendv_epi8(s1, s0, mask);
|
||||
STORE_SI128(dst, d0);
|
||||
}
|
||||
|
||||
for (; x < nWidth; x++)
|
||||
{
|
||||
const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
|
||||
BYTE* dst = &dstLine[(x + nXDst) * dstByte];
|
||||
*dst++ = *src++;
|
||||
*dst++ = *src++;
|
||||
*dst++ = *src++;
|
||||
}
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Destination-alpha preserving copy: select an SSE helper by format pair.
 *
 *  - BGR24            -> BGRX32/BGRA32 : sse_image_copy_bgr24_bgrx32()
 *  - BGRX32/BGRA32    -> BGRX32/BGRA32 : sse_image_copy_bgrx32_bgrx32()
 *  - RGBX32/RGBA32    -> RGBX32/RGBA32 : sse_image_copy_bgrx32_bgrx32()
 *
 * Every other combination is forwarded to the generic copy_no_overlap
 * primitive with the original @p flags.
 *
 * @return status of the selected implementation
 */
static pstatus_t sse_image_copy_no_overlap_dst_alpha(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier,
    int64_t dstVOffset)
{
	WINPR_ASSERT(pDstData);
	WINPR_ASSERT(pSrcData);

	const BOOL dstIsBgr32 =
	    (DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32);
	const BOOL dstIsRgb32 =
	    (DstFormat == PIXEL_FORMAT_RGBX32) || (DstFormat == PIXEL_FORMAT_RGBA32);
	const BOOL srcIsBgr32 =
	    (SrcFormat == PIXEL_FORMAT_BGRX32) || (SrcFormat == PIXEL_FORMAT_BGRA32);
	const BOOL srcIsRgb32 =
	    (SrcFormat == PIXEL_FORMAT_RGBX32) || (SrcFormat == PIXEL_FORMAT_RGBA32);

	/* 24 bit source widened into a 32 bit destination. */
	if ((SrcFormat == PIXEL_FORMAT_BGR24) && dstIsBgr32)
	{
		return sse_image_copy_bgr24_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
		                                   pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
		                                   srcVOffset, dstVMultiplier, dstVOffset);
	}

	/* Same 32 bit channel order on both sides. */
	if ((srcIsBgr32 && dstIsBgr32) || (srcIsRgb32 && dstIsRgb32))
	{
		return sse_image_copy_bgrx32_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
		                                    pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
		                                    srcVOffset, dstVMultiplier, dstVOffset);
	}

	/* Unsupported format pair: let the generic implementation handle it. */
	primitives_t* fallback = primitives_get_generic();
	return fallback->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
	                                 pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
}
|
||||
|
||||
static pstatus_t sse_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
|
||||
UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
|
||||
UINT32 nWidth, UINT32 nHeight,
|
||||
const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
|
||||
const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
|
||||
{
|
||||
const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) != 0;
|
||||
int64_t srcVOffset = 0;
|
||||
int64_t srcVMultiplier = 1;
|
||||
int64_t dstVOffset = 0;
|
||||
int64_t dstVMultiplier = 1;
|
||||
|
||||
if ((nWidth == 0) || (nHeight == 0))
|
||||
return PRIMITIVES_SUCCESS;
|
||||
|
||||
if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
|
||||
return -1;
|
||||
|
||||
if (!pDstData || !pSrcData)
|
||||
return -1;
|
||||
|
||||
if (nDstStep == 0)
|
||||
nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
|
||||
|
||||
if (nSrcStep == 0)
|
||||
nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
|
||||
|
||||
if (vSrcVFlip)
|
||||
{
|
||||
srcVOffset = (nHeight - 1ll) * nSrcStep;
|
||||
srcVMultiplier = -1;
|
||||
}
|
||||
|
||||
if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
|
||||
return sse_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
|
||||
nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
|
||||
nXSrc, nYSrc, palette, flags, srcVMultiplier,
|
||||
srcVOffset, dstVMultiplier, dstVOffset);
|
||||
else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
|
||||
return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
|
||||
nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
|
||||
nXSrc, nYSrc, palette, srcVMultiplier,
|
||||
srcVOffset, dstVMultiplier, dstVOffset, flags);
|
||||
else
|
||||
{
|
||||
primitives_t* gen = primitives_get_generic();
|
||||
return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
|
||||
pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/**
 * Install the SSE4.1 copy_no_overlap implementation into @p prims.
 * When the intrinsics are not compiled in, @p prims is left unchanged and
 * only a verbose log line is emitted.
 */
void primitives_init_copy_sse41_int(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(SSE_AVX_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE4.1 intrinsics not available");
	WINPR_UNUSED(prims);
#else
	WLog_VRB(PRIM_TAG, "SSE4.1 optimizations");
	prims->copy_no_overlap = sse_image_copy_no_overlap;
#endif
}
|
||||
235
third_party/FreeRDP/libfreerdp/primitives/sse/prim_set_sse2.c
vendored
Normal file
235
third_party/FreeRDP/libfreerdp/primitives/sse/prim_set_sse2.c
vendored
Normal file
@@ -0,0 +1,235 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Optimized routines to set a chunk of memory to a constant.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_avxsse.h"
|
||||
#include "prim_set.h"
|
||||
|
||||
/* ========================================================================= */
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
|
||||
static primitives_t* generic = nullptr;
|
||||
|
||||
/**
 * Fill @p ulen bytes at @p pDst with @p val.
 *
 * Buffers shorter than 16 bytes are delegated to the generic set_8u. Longer
 * buffers are written bytewise up to the next 16 byte boundary, then in
 * 256 byte blocks, then in 16 byte vectors, and finally bytewise again.
 *
 * @return PRIMITIVES_SUCCESS, or the generic routine's status for short input
 */
static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 ulen)
{
	if (ulen < 16)
		return generic->set_8u(val, pDst, ulen);

	size_t remaining = ulen;
	BYTE* out = pDst;

	/* Bytewise until the write pointer sits on a 16 byte boundary. */
	while (((ULONG_PTR)out & 0x0f) != 0)
	{
		*out++ = val;
		remaining--;

		if (remaining == 0)
			return PRIMITIVES_SUCCESS;
	}

	const __m128i fill = mm_set1_epu8(val);

	/* 256 byte blocks: sixteen 16 byte stores each. The inner trip count is
	 * a constant, so the compiler is free to unroll it. */
	for (size_t blocks = remaining >> 8; blocks > 0; blocks--)
	{
		for (size_t i = 0; i < 16; i++)
		{
			STORE_SI128(out, fill);
			out += 16;
		}
	}
	remaining &= 0xff;

	/* Single 16 byte vectors. */
	for (size_t vectors = remaining >> 4; vectors > 0; vectors--)
	{
		STORE_SI128(out, fill);
		out += 16;
	}
	remaining &= 0x0f;

	/* Trailing 0..15 bytes. */
	for (; remaining > 0; remaining--)
		*out++ = val;

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/**
 * Fill @p ulen 32 bit words at @p pDst with @p val.
 *
 * Fewer than 32 words are written with a plain loop. A destination that is
 * not 4 byte aligned can never reach 16 byte alignment in word steps and is
 * delegated to the generic set_32u. Otherwise words are written one by one
 * up to the next 16 byte boundary, then in blocks of 64 words (256 bytes),
 * then in vectors of 4 words, and finally word by word.
 *
 * @return PRIMITIVES_SUCCESS, or the generic routine's status when delegated
 */
static pstatus_t sse2_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 ulen)
{
	const primitives_t* prim = primitives_get_generic();
	size_t remaining = ulen;
	UINT32* out = pDst;

	/* If really short, just do it here. */
	if (remaining < 32)
	{
		for (size_t i = 0; i < remaining; i++)
			out[i] = val;

		return PRIMITIVES_SUCCESS;
	}

	/* Word stores cannot fix a misalignment below 4 bytes. */
	if (((ULONG_PTR)out & 0x03) != 0)
		return prim->set_32u(val, pDst, ulen);

	/* Word by word until the write pointer sits on a 16 byte boundary. */
	while (((ULONG_PTR)out & 0x0f) != 0)
	{
		*out++ = val;
		remaining--;

		if (remaining == 0)
			return PRIMITIVES_SUCCESS;
	}

	const __m128i fill = mm_set1_epu32(val);

	/* 64 word (256 byte) blocks: sixteen 16 byte stores each. The inner trip
	 * count is a constant, so the compiler is free to unroll it. */
	for (size_t blocks = remaining >> 6; blocks > 0; blocks--)
	{
		for (size_t i = 0; i < 16; i++)
		{
			STORE_SI128(out, fill);
			out += 4;
		}
	}
	remaining &= 0x3f;

	/* Single vectors of 4 words. */
	for (size_t vectors = remaining >> 2; vectors > 0; vectors--)
	{
		STORE_SI128(out, fill);
		out += 4;
	}
	remaining &= 0x03;

	/* Trailing 0..3 words. */
	for (; remaining > 0; remaining--)
		*out++ = val;

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/**
 * Signed variant of sse2_set_32u(): fill @p len 32 bit words at @p pDst with
 * the bit pattern of @p val.
 */
static pstatus_t sse2_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len)
{
	/* Converting to unsigned keeps the two's complement bit pattern. */
	const UINT32 pattern = (UINT32)val;
	UINT32* words = (UINT32*)pDst;
	return sse2_set_32u(pattern, words, len);
}
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/**
 * Install the SSE2 set_8u / set_32u / set_32s implementations into @p prims
 * and remember the generic primitives used as fallback by them.
 * When the intrinsics are not compiled in, @p prims is left unchanged and
 * only a verbose log line is emitted.
 */
void primitives_init_set_sse2_int(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(SSE_AVX_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available");
	WINPR_UNUSED(prims);
#else
	/* Fallback table consulted by the optimized routines. */
	generic = primitives_get_generic();

	WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");

	/* Pick tuned versions if possible. */
	prims->set_8u = sse2_set_8u;
	prims->set_32s = sse2_set_32s;
	prims->set_32u = sse2_set_32u;
#endif
}
|
||||
160
third_party/FreeRDP/libfreerdp/primitives/sse/prim_shift_sse3.c
vendored
Normal file
160
third_party/FreeRDP/libfreerdp/primitives/sse/prim_shift_sse3.c
vendored
Normal file
@@ -0,0 +1,160 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Shift operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_shift.h"
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
#include <pmmintrin.h>
|
||||
|
||||
static primitives_t* generic = nullptr;
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Out-of-place constant shifts generated from the SSE3_SCD_ROUTINE template
 * (prim_templates.h). Macro arguments, in order: name of the generated
 * function, element type, fallback routine, SSE shift intrinsic, type the
 * shift count is cast to for the intrinsic, and the scalar "slow way"
 * statement for single elements.
 * Each generated function has the signature
 *   static pstatus_t name(const T* pSrc, UINT32 val, T* pDst, UINT32 ulen)
 */
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16, int16_t,
|
||||
*dptr++ = (INT16)(((UINT16)*sptr++ << val) & 0xFFFF))
|
||||
/* ------------------------------------------------------------------------- */
|
||||
SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s, _mm_srai_epi16, int16_t,
|
||||
*dptr++ = *sptr++ >> val)
|
||||
/* ------------------------------------------------------------------------- */
|
||||
SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16, int16_t,
|
||||
*dptr++ = (((UINT16)*sptr++ << val) & 0xFFFF))
|
||||
/* ------------------------------------------------------------------------- */
|
||||
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16, int16_t,
|
||||
*dptr++ = *sptr++ >> val)
|
||||
|
||||
/**
 * Shift @p ulen signed 16 bit values left by @p val bits, in place.
 *
 * @param pSrcDst buffer that is read and overwritten
 * @param val     shift count; 0 is a no-op, values >= 16 are rejected
 * @param ulen    number of INT16 elements
 *
 * Short buffers (< 16 elements) and buffers that are not 2 byte aligned are
 * delegated to the generic implementation. Otherwise the leading elements up
 * to the next 16 byte boundary and the trailing remainder are handled by the
 * generic implementation and everything in between with SSE shifts.
 *
 * @return PRIMITIVES_SUCCESS, -1 for an invalid shift count, or the status of
 *         the generic routine
 */
static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 ulen)
{
	size_t len = ulen;
	const INT32 shifts = 2; /* template shift class for 2 byte elements */
	if (val == 0)
		return PRIMITIVES_SUCCESS;
	if (val >= 16)
		return -1;
	if (len < 16) /* pointless if too small */
		return generic->lShiftC_16s_inplace(pSrcDst, val, ulen);

	UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
	if ((ULONG_PTR)pSrcDst & offBeatMask)
	{
		/* Incrementing the pointer skips over 16-byte boundary. */
		return generic->lShiftC_16s_inplace(pSrcDst, val, ulen);
	}

	/* Get to the 16-byte boundary now. rem is the number of INT16 elements
	 * the pointer lies past the previous boundary; a 16 byte block holds
	 * 16 / sizeof(INT16) = 8 elements, so 8 - rem elements are missing. */
	const UINT32 perVector = (UINT32)(16 / sizeof(INT16));
	const UINT32 rem = ((UINT_PTR)pSrcDst & 0x0f) / sizeof(INT16);
	if (rem > 0)
	{
		const UINT32 add = perVector - rem;
		pstatus_t status = generic->lShiftC_16s_inplace(pSrcDst, val, add);
		if (status != PRIMITIVES_SUCCESS)
			return status;
		pSrcDst += add;
		len -= add;
	}

	/* Blocks of eight 128-bit vectors (64 elements). */
	size_t count = len >> (8 - shifts);
	len -= count << (8 - shifts);

	while (count--)
	{
		__m128i* vec = (__m128i*)pSrcDst;

		for (size_t i = 0; i < 8; i++)
		{
			__m128i xmm = LOAD_SI128(vec);
			xmm = _mm_slli_epi16(xmm, (int16_t)val);
			STORE_SI128(vec, xmm);
			vec++;
		}

		pSrcDst = (INT16*)vec;
	}

	/* Single 128-bit vectors (8 elements). */
	count = len >> (5 - shifts);
	len -= count << (5 - shifts);
	while (count--)
	{
		__m128i* vec = (__m128i*)pSrcDst;
		__m128i xmm = LOAD_SI128(vec);

		xmm = _mm_slli_epi16(xmm, (int16_t)val);

		STORE_SI128(vec, xmm);
		vec++;
		pSrcDst = (INT16*)vec;
	}

	/* Finish off the remainder. */
	if (len > 0)
		return generic->lShiftC_16s_inplace(pSrcDst, val, WINPR_ASSERTING_INT_CAST(uint32_t, len));

	return PRIMITIVES_SUCCESS;
}
|
||||
#endif
|
||||
|
||||
/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
|
||||
* depending on the sign of val. To avoid using the deprecated inplace
|
||||
* routines, a wrapper can use the src for the dest.
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/**
 * Install the SSE shift implementations (in-place and out-of-place, 16 bit
 * signed and unsigned) into @p prims and remember the generic primitives
 * used as fallback by them.
 * When the intrinsics are not compiled in, @p prims is left unchanged and
 * only a verbose log line is emitted.
 */
void primitives_init_shift_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(SSE_AVX_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
	WINPR_UNUSED(prims);
#else
	/* Fallback table consulted by the optimized routines. */
	generic = primitives_get_generic();

	WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");

	/* left shifts */
	prims->lShiftC_16s_inplace = sse2_lShiftC_16s_inplace;
	prims->lShiftC_16s = sse2_lShiftC_16s;
	prims->lShiftC_16u = sse2_lShiftC_16u;

	/* right shifts */
	prims->rShiftC_16s = sse2_rShiftC_16s;
	prims->rShiftC_16u = sse2_rShiftC_16u;
#endif
}
|
||||
188
third_party/FreeRDP/libfreerdp/primitives/sse/prim_sign_ssse3.c
vendored
Normal file
188
third_party/FreeRDP/libfreerdp/primitives/sse/prim_sign_ssse3.c
vendored
Normal file
@@ -0,0 +1,188 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Optimized sign operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_sign.h"
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_avxsse.h"
|
||||
|
||||
#if defined(SSE_AVX_INTRINSICS_ENABLED)
|
||||
#include <emmintrin.h>
|
||||
#include <tmmintrin.h>
|
||||
|
||||
static primitives_t* generic = nullptr;
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/**
 * Write the sign of every source element (-1, 0 or 1) to the destination.
 *
 * @param pSrc source values
 * @param pDst receives -1 for negative, 0 for zero and 1 for positive input
 * @param ulen number of INT16 elements
 *
 * Buffers shorter than 16 elements, or with a destination that is not 2 byte
 * aligned, are delegated to the generic implementation. Otherwise elements
 * are processed one by one until the destination reaches a 16 byte boundary,
 * then 32 and 8 elements at a time with _mm_sign_epi16, then one by one.
 *
 * @return PRIMITIVES_SUCCESS, or the status of the generic routine
 */
static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
                                UINT32 ulen)
{
	size_t len = ulen;
	const INT16* sptr = pSrc;
	INT16* dptr = pDst;
	size_t count = 0;

	if (len < 16)
	{
		return generic->sign_16s(pSrc, pDst, ulen);
	}

	/* An odd destination address can never reach 16-byte alignment. */
	if ((ULONG_PTR)pDst & 0x01)
	{
		return generic->sign_16s(pSrc, pDst, ulen);
	}

	/* Seek 16-byte alignment of the destination. */
	while ((ULONG_PTR)dptr & 0x0f)
	{
		INT16 src = *sptr++;
		*dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? (-1) : ((src > 0) ? 1 : 0));

		if (--len == 0)
			return PRIMITIVES_SUCCESS;
	}

	/* _mm_sign_epi16(one, x) yields 1, 0 or -1 per lane depending on the
	 * sign of x. */
	const __m128i one = _mm_set1_epi16(0x0001U);

	/* Do 32-short chunks. The source alignment needs no special casing: the
	 * same LOAD_SI128 is used for aligned and unaligned source pointers. */
	count = len >> 5;  /* / 32 */
	len -= count << 5; /* * 32 */

	while (count--)
	{
		const __m128i in0 = LOAD_SI128(sptr);
		sptr += 8;
		const __m128i in1 = LOAD_SI128(sptr);
		sptr += 8;
		const __m128i in2 = LOAD_SI128(sptr);
		sptr += 8;
		const __m128i in3 = LOAD_SI128(sptr);
		sptr += 8;

		const __m128i out0 = _mm_sign_epi16(one, in0);
		const __m128i out1 = _mm_sign_epi16(one, in1);
		const __m128i out2 = _mm_sign_epi16(one, in2);
		const __m128i out3 = _mm_sign_epi16(one, in3);

		STORE_SI128(dptr, out0);
		dptr += 8;
		STORE_SI128(dptr, out1);
		dptr += 8;
		STORE_SI128(dptr, out2);
		dptr += 8;
		STORE_SI128(dptr, out3);
		dptr += 8;
	}

	/* Do 8-short chunks. */
	count = len >> 3;
	len -= count << 3;

	while (count--)
	{
		const __m128i in = LOAD_SI128(sptr);
		sptr += 8;
		const __m128i out = _mm_sign_epi16(one, in);
		STORE_SI128(dptr, out);
		dptr += 8;
	}

	/* Do leftovers. */
	while (len--)
	{
		INT16 src = *sptr++;
		*dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? -1 : ((src > 0) ? 1 : 0));
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
#endif /* SSE_AVX_INTRINSICS_ENABLED */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/**
 * Install the SSSE3 sign_16s implementation into @p prims and remember the
 * generic primitives used as fallback by it.
 * When the intrinsics are not compiled in, @p prims is left unchanged and
 * only a verbose log line is emitted.
 */
void primitives_init_sign_ssse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(SSE_AVX_INTRINSICS_ENABLED)
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSSE3/SSE3 intrinsics not available");
	WINPR_UNUSED(prims);
#else
	/* Fallback table consulted by the optimized routine. */
	generic = primitives_get_generic();

	/* Pick tuned versions if possible. */
	/* I didn't spot an IPP version of this. */
	WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
	prims->sign_16s = ssse3_sign_16s;
#endif
}
|
||||
278
third_party/FreeRDP/libfreerdp/primitives/sse/prim_templates.h
vendored
Normal file
278
third_party/FreeRDP/libfreerdp/primitives/sse/prim_templates.h
vendored
Normal file
@@ -0,0 +1,278 @@
|
||||
/* prim_templates.h
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License. Algorithms used by
|
||||
* this code may be covered by patents by HP, Microsoft, or other parties.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "prim_avxsse.h"
|
||||
|
||||
/* These are prototypes for SSE (potentially NEON) routines that do a
|
||||
* simple SSE operation over an array of data. Since so much of this
|
||||
* code is shared except for the operation itself, these prototypes are
|
||||
* used rather than duplicating code. The naming convention depends on
|
||||
* the parameters: S=Source param; C=Constant; D=Destination.
|
||||
* All the macros have parameters for a fallback procedure if the data
|
||||
* is too small and an operation "the slow way" for use at 16-byte edges.
|
||||
*/
|
||||
|
||||
/* SSE3 note: If someone needs to support an SSE2 version of these without
|
||||
* SSE3 support, an alternative version could be added that merely checks
|
||||
* that 16-byte alignment on both destination and source(s) can be
|
||||
* achieved, rather than use LDDQU for unaligned reads.
|
||||
*/
|
||||
|
||||
/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
|
||||
* It cannot easily do that if the value is stored in a variable.
|
||||
* So don't save it as an intermediate value.
|
||||
*/
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* SCD = Source, Constant, Destination
|
||||
*/
|
||||
/* Expands to a static function `_name_` that applies `_op_(x, (_op_type_)val)`
 * to `ulen` elements of `_type_` read from pSrc and stores them to pDst.
 * The work is split into three passes: groups of eight 128-bit registers,
 * then single registers, then `_slowWay_` once per remaining element.
 * NOTE(review): `_slowWay_` is expanded inside the function body, so it
 * presumably uses the locals `sptr`, `dptr` and `val` and advances the
 * pointers itself - confirm against the instantiations.
 * `_fallback_` is accepted but not referenced by this expansion.
 */
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _op_type_, _slowWay_) \
	WINPR_ATTR_NODISCARD \
	static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, UINT32 val, \
	                        _type_* WINPR_RESTRICT pDst, UINT32 ulen) \
	{ \
		const _type_* sptr = pSrc; \
		_type_* dptr = pDst; \
		size_t len = ulen; \
		/* val == 0 returns success without writing pDst; val >= 16 is an error. */ \
		if (val == 0) \
			return PRIMITIVES_SUCCESS; \
		if (val >= 16) \
			return -1; \
		/* Element size -> amount subtracted from the shift counts used below. */ \
		const INT32 shifts = (sizeof(_type_) == 1)   ? 1 \
		                     : (sizeof(_type_) == 2) ? 2 \
		                     : (sizeof(_type_) == 4) ? 3 \
		                     : (sizeof(_type_) == 8) ? 4 \
		                                             : 0; \
		/* Pass 1: eight 128-bit registers per iteration. */ \
		size_t count = len >> (8 - shifts); \
		len -= count << (8 - shifts); \
		for (; count > 0; count--) \
		{ \
			__m128i xmm0 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm1 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm2 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm3 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm4 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm5 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm6 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm7 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			xmm0 = _op_(xmm0, (_op_type_)val); \
			xmm1 = _op_(xmm1, (_op_type_)val); \
			xmm2 = _op_(xmm2, (_op_type_)val); \
			xmm3 = _op_(xmm3, (_op_type_)val); \
			xmm4 = _op_(xmm4, (_op_type_)val); \
			xmm5 = _op_(xmm5, (_op_type_)val); \
			xmm6 = _op_(xmm6, (_op_type_)val); \
			xmm7 = _op_(xmm7, (_op_type_)val); \
			STORE_SI128(dptr, xmm0); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm1); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm2); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm3); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm4); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm5); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm6); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm7); \
			dptr += (16 / sizeof(_type_)); \
		} \
		/* Pass 2: one 128-bit register per iteration. */ \
		count = len >> (5 - shifts); \
		len -= count << (5 - shifts); \
		for (; count > 0; count--) \
		{ \
			__m128i xmm0 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			xmm0 = _op_(xmm0, (_op_type_)val); \
			STORE_SI128(dptr, xmm0); \
			dptr += (16 / sizeof(_type_)); \
		} \
		/* Pass 3: whatever is left, one element at a time. */ \
		while (len--) \
		{ \
			_slowWay_; \
		} \
		return PRIMITIVES_SUCCESS; \
	}
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* SCD = Source, Constant, Destination
|
||||
* PRE = preload xmm0 with the constant.
|
||||
*/
|
||||
/* Expands to a static function `_name_` that combines every element of pSrc
 * with a constant via `_op_` and stores the result to pDst. The constant is
 * broadcast once into xmm0 with mm_set1_epu32() and reused by every
 * iteration. Three passes: groups of four 128-bit registers, single
 * registers, then `_slowWay_` once per remaining element.
 * NOTE(review): `_slowWay_` is expanded inside the function body, so it
 * presumably uses the locals `sptr`, `dptr` and `val` - confirm against the
 * instantiations. `_fallback_` is accepted but not referenced here.
 */
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
	WINPR_ATTR_NODISCARD \
	static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, _type_ val, \
	                        _type_* WINPR_RESTRICT pDst, INT32 ilen) \
	{ \
		const _type_* sptr = pSrc; \
		_type_* dptr = pDst; \
		size_t len = WINPR_ASSERTING_INT_CAST(size_t, ilen); \
		/* Element size -> amount subtracted from the shift counts used below. */ \
		const int shifts = (sizeof(_type_) == 1)   ? 1 \
		                   : (sizeof(_type_) == 2) ? 2 \
		                   : (sizeof(_type_) == 4) ? 3 \
		                   : (sizeof(_type_) == 8) ? 4 \
		                                           : 0; \
		/* Pass 1: four 128-bit registers per iteration. */ \
		size_t count = len >> (7 - shifts); \
		len -= count << (7 - shifts); \
		const __m128i xmm0 = mm_set1_epu32(val); \
		while (count--) \
		{ \
			__m128i xmm1 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm2 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm3 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			__m128i xmm4 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			xmm1 = _op_(xmm1, xmm0); \
			xmm2 = _op_(xmm2, xmm0); \
			xmm3 = _op_(xmm3, xmm0); \
			xmm4 = _op_(xmm4, xmm0); \
			STORE_SI128(dptr, xmm1); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm2); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm3); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm4); \
			dptr += (16 / sizeof(_type_)); \
		} \
		/* Pass 2: one 128-bit register per iteration. */ \
		count = len >> (5 - shifts); \
		len -= count << (5 - shifts); \
		while (count--) \
		{ \
			__m128i xmm1 = LOAD_SI128(sptr); \
			sptr += (16 / sizeof(_type_)); \
			xmm1 = _op_(xmm1, xmm0); \
			STORE_SI128(dptr, xmm1); \
			dptr += (16 / sizeof(_type_)); \
		} \
		/* Pass 3: whatever is left, one element at a time. */ \
		for (size_t x = 0; x < len; x++) \
		{ \
			_slowWay_; \
		} \
		return PRIMITIVES_SUCCESS; \
	}
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* SSD = Source1, Source2, Destination
|
||||
*/
|
||||
/* Expands to a static function `_name_` that combines pSrc1[i] and pSrc2[i]
 * via `_op_` and stores the result to pDst[i] for `ulen` elements. Three
 * passes: groups of four 128-bit registers per source, single registers, then
 * `_slowWay_` once per remaining element. `_slowWay_` must evaluate to a
 * pstatus_t; the first non-success value is returned to the caller.
 * NOTE(review): `_slowWay_` is expanded inside the function body, so it
 * presumably uses the locals `sptr1`, `sptr2` and `dptr` - confirm against
 * the instantiations. `_fallback_` is accepted but not referenced here.
 */
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
	WINPR_ATTR_NODISCARD \
	static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc1, \
	                        const _type_* WINPR_RESTRICT pSrc2, _type_* WINPR_RESTRICT pDst, \
	                        UINT32 ulen) \
	{ \
		const _type_* sptr1 = pSrc1; \
		const _type_* sptr2 = pSrc2; \
		_type_* dptr = pDst; \
		size_t len = ulen; \
		/* Element size -> amount subtracted from the shift counts used below. */ \
		const int shifts = (sizeof(_type_) == 1)   ? 1 \
		                   : (sizeof(_type_) == 2) ? 2 \
		                   : (sizeof(_type_) == 4) ? 3 \
		                   : (sizeof(_type_) == 8) ? 4 \
		                                           : 0; \
		/* Pass 1: four 128-bit registers from each source per iteration. */ \
		size_t count = len >> (7 - shifts); \
		len -= count << (7 - shifts); \
		for (; count > 0; count--) \
		{ \
			__m128i xmm0 = LOAD_SI128(sptr1); \
			sptr1 += (16 / sizeof(_type_)); \
			__m128i xmm1 = LOAD_SI128(sptr1); \
			sptr1 += (16 / sizeof(_type_)); \
			__m128i xmm2 = LOAD_SI128(sptr1); \
			sptr1 += (16 / sizeof(_type_)); \
			__m128i xmm3 = LOAD_SI128(sptr1); \
			sptr1 += (16 / sizeof(_type_)); \
			__m128i xmm4 = LOAD_SI128(sptr2); \
			sptr2 += (16 / sizeof(_type_)); \
			__m128i xmm5 = LOAD_SI128(sptr2); \
			sptr2 += (16 / sizeof(_type_)); \
			__m128i xmm6 = LOAD_SI128(sptr2); \
			sptr2 += (16 / sizeof(_type_)); \
			__m128i xmm7 = LOAD_SI128(sptr2); \
			sptr2 += (16 / sizeof(_type_)); \
			xmm0 = _op_(xmm0, xmm4); \
			xmm1 = _op_(xmm1, xmm5); \
			xmm2 = _op_(xmm2, xmm6); \
			xmm3 = _op_(xmm3, xmm7); \
			STORE_SI128(dptr, xmm0); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm1); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm2); \
			dptr += (16 / sizeof(_type_)); \
			STORE_SI128(dptr, xmm3); \
			dptr += (16 / sizeof(_type_)); \
		} \
		/* Pass 2: one 128-bit register from each source per iteration. */ \
		count = len >> (5 - shifts); \
		len -= count << (5 - shifts); \
		for (; count > 0; count--) \
		{ \
			__m128i xmm0 = LOAD_SI128(sptr1); \
			sptr1 += (16 / sizeof(_type_)); \
			__m128i xmm1 = LOAD_SI128(sptr2); \
			sptr2 += (16 / sizeof(_type_)); \
			xmm0 = _op_(xmm0, xmm1); \
			STORE_SI128(dptr, xmm0); \
			dptr += (16 / sizeof(_type_)); \
		} \
		/* Pass 3: whatever is left, one element at a time; stop on the first error. */ \
		while (len--) \
		{ \
			const pstatus_t rc = _slowWay_; \
			if (rc != PRIMITIVES_SUCCESS) \
				return rc; \
		} \
		return PRIMITIVES_SUCCESS; \
	}
|
||||
39
third_party/FreeRDP/libfreerdp/primitives/test/CMakeLists.txt
vendored
Normal file
39
third_party/FreeRDP/libfreerdp/primitives/test/CMakeLists.txt
vendored
Normal file
@@ -0,0 +1,39 @@
|
||||
# Test driver for the primitives library: a single executable (TestPrimitives)
# generated by create_test_sourcelist() that dispatches to one entry point per
# source file listed in ${MODULE_PREFIX}_TESTS.
set(MODULE_NAME "TestPrimitives")
set(MODULE_PREFIX "TEST_FREERDP_PRIMITIVES")

disable_warnings_for_directory(${CMAKE_CURRENT_BINARY_DIR})

set(${MODULE_PREFIX}_DRIVER "${MODULE_NAME}.c")

# One CTest entry is registered per file below (named after the file).
set(${MODULE_PREFIX}_TESTS
    TestPrimitivesAdd.c
    TestPrimitivesAlphaComp.c
    TestPrimitivesAndOr.c
    TestPrimitivesColors.c
    TestPrimitivesCopy.c
    TestPrimitivesSet.c
    TestPrimitivesShift.c
    TestPrimitivesSign.c
    TestPrimitivesYUV.c
    TestPrimitivesYCbCr.c
    TestPrimitivesYCoCg.c
)

create_test_sourcelist(${MODULE_PREFIX}_SRCS ${${MODULE_PREFIX}_DRIVER} ${${MODULE_PREFIX}_TESTS})

# Helpers shared by all tests; they are not test entry points themselves.
set(${MODULE_PREFIX}_EXTRA_SRCS prim_test.c prim_test.h measure.h)

add_executable(${MODULE_NAME} ${${MODULE_PREFIX}_SRCS} ${${MODULE_PREFIX}_EXTRA_SRCS})

list(APPEND ${MODULE_PREFIX}_LIBS winpr freerdp)

target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS})

set_target_properties(${MODULE_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${TESTING_OUTPUT_DIRECTORY}")

# Each test is run as "<driver> <TestName>".
foreach(test ${${MODULE_PREFIX}_TESTS})
  get_filename_component(TestName "${test}" NAME_WE)
  add_test(${TestName} "${TESTING_OUTPUT_DIRECTORY}/${MODULE_NAME}" ${TestName})
endforeach()

set_property(TARGET ${MODULE_NAME} PROPERTY FOLDER "FreeRDP/Test")
|
||||
80
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesAdd.c
vendored
Normal file
80
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesAdd.c
vendored
Normal file
@@ -0,0 +1,80 @@
|
||||
/* test_add.c
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <winpr/sysinfo.h>
|
||||
#include "prim_test.h"
|
||||
|
||||
#define FUNC_TEST_SIZE 65536
|
||||
/* ========================================================================= */
|
||||
static BOOL test_add16s_func(void)
|
||||
{
|
||||
pstatus_t status = 0;
|
||||
|
||||
INT16 src1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
INT16 src2[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
INT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
INT16 d2[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
|
||||
if (winpr_RAND(src1, sizeof(src1)) < 0)
|
||||
return FALSE;
|
||||
if (winpr_RAND(src2, sizeof(src2)) < 0)
|
||||
return FALSE;
|
||||
status = generic->add_16s(src1 + 1, src2 + 1, d1 + 1, FUNC_TEST_SIZE);
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
/* Unaligned */
|
||||
status = optimized->add_16s(src1 + 1, src2 + 1, d2 + 2, FUNC_TEST_SIZE);
|
||||
return (status == PRIMITIVES_SUCCESS);
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static BOOL test_add16s_speed(void)
|
||||
{
|
||||
BYTE src1[MAX_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
BYTE src2[MAX_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
BYTE dst[MAX_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
|
||||
if (!g_TestPrimitivesPerformance)
|
||||
return TRUE;
|
||||
|
||||
if (winpr_RAND(src1, sizeof(src1)) < 0)
|
||||
return FALSE;
|
||||
if (winpr_RAND(src2, sizeof(src2)) < 0)
|
||||
return FALSE;
|
||||
|
||||
return (speed_test("add16s", "aligned", g_Iterations, (speed_test_fkt)generic->add_16s,
|
||||
(speed_test_fkt)optimized->add_16s, src1, src2, dst, FUNC_TEST_SIZE));
|
||||
}
|
||||
|
||||
int TestPrimitivesAdd(int argc, char* argv[])
|
||||
{
|
||||
|
||||
WINPR_UNUSED(argc);
|
||||
WINPR_UNUSED(argv);
|
||||
|
||||
prim_test_setup(FALSE);
|
||||
if (!test_add16s_func())
|
||||
return -1;
|
||||
|
||||
if (g_TestPrimitivesPerformance)
|
||||
{
|
||||
if (!test_add16s_speed())
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
203
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesAlphaComp.c
vendored
Normal file
203
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesAlphaComp.c
vendored
Normal file
@@ -0,0 +1,203 @@
|
||||
/* test_alphaComp.c
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_test.h"
|
||||
|
||||
#define MAX_BLOCK_SIZE 256
|
||||
#define SIZE_SQUARED (MAX_BLOCK_SIZE * MAX_BLOCK_SIZE)
|
||||
|
||||
/* ========================================================================= */
|
||||
/* Channel extractors for a 32-bit colour value: alpha is the top byte,
 * followed by red, green and blue. */
#define ALF(_c_) (((_c_) & 0xFF000000U) >> 24)
#define RED(_c_) (((_c_) & 0x00FF0000U) >> 16)
#define GRN(_c_) (((_c_) & 0x0000FF00U) >> 8)
#define BLU(_c_) ((_c_) & 0x000000FFU)
/* Largest per-channel difference still accepted by check(). */
#define TOLERANCE 1
|
||||
/* Returns a pointer to the 32-bit pixel at column _x_, row _y_ of a buffer
 * starting at _addr_ whose rows are _bytes_ bytes apart. Offsets are computed
 * in 64-bit arithmetic (the 1ULL factors).
 */
static inline const UINT32* PIXEL(const BYTE* _addr_, UINT32 _bytes_, UINT32 _x_, UINT32 _y_)
{
	const unsigned long long columnOffset = 1ULL * _x_ * sizeof(UINT32);
	const unsigned long long rowOffset = 1ULL * _y_ * _bytes_;
	return (const UINT32*)(_addr_ + columnOffset + rowOffset);
}
|
||||
|
||||
#define SRC1_WIDTH 6
|
||||
#define SRC1_HEIGHT 6
|
||||
#define SRC2_WIDTH 7
|
||||
#define SRC2_HEIGHT 7
|
||||
#define DST_WIDTH 9
|
||||
#define DST_HEIGHT 9
|
||||
#define TEST_WIDTH 4
|
||||
#define TEST_HEIGHT 5
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Reference blend used by check(): every channel (alpha included) of the
 * result is (a1 * channel(c1) + (255 - a1) * channel(c2)) / 255, where a1 is
 * the alpha of c1. The four 8-bit results are packed back into one value.
 */
static UINT32 alpha_add(UINT32 c1, UINT32 c2)
{
	const UINT32 weight = ALF(c1);
	const UINT32 rest = 255 - weight;

	const UINT32 alpha = ((weight * ALF(c1) + rest * ALF(c2)) / 255) & 0xff;
	const UINT32 red = ((weight * RED(c1) + rest * RED(c2)) / 255) & 0xff;
	const UINT32 green = ((weight * GRN(c1) + rest * GRN(c2)) / 255) & 0xff;
	const UINT32 blue = ((weight * BLU(c1) + rest * BLU(c2)) / 255) & 0xff;

	return (alpha << 24) | (red << 16) | (green << 8) | blue;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Returns the largest absolute per-channel difference (alpha, red, green,
 * blue) between the two colours. */
static UINT32 colordist(UINT32 c1, UINT32 c2)
{
	const int deltas[4] = { ABS((INT32)(ALF(c1) - ALF(c2))), ABS((INT32)(RED(c1) - RED(c2))),
		                    ABS((INT32)(GRN(c1) - GRN(c2))), ABS((INT32)(BLU(c1) - BLU(c2))) };
	int worst = 0;

	for (size_t channel = 0; channel < 4; ++channel)
	{
		if (deltas[channel] > worst)
			worst = deltas[channel];
	}

	return (UINT32)worst;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Verifies a width x height region of pDst against the reference blend
 * alpha_add(src1, src2), pixel by pixel. A pixel passes when no channel is
 * off by more than TOLERANCE. Prints the first offending pixel and returns
 * FALSE on mismatch, TRUE otherwise.
 */
static BOOL check(const BYTE* pSrc1, UINT32 src1Step, const BYTE* pSrc2, UINT32 src2Step,
                  BYTE* pDst, UINT32 dstStep, UINT32 width, UINT32 height)
{
	for (UINT32 row = 0; row < height; ++row)
	{
		for (UINT32 col = 0; col < width; ++col)
		{
			const UINT32 first = *PIXEL(pSrc1, src1Step, col, row);
			const UINT32 second = *PIXEL(pSrc2, src2Step, col, row);
			const UINT32 expected = alpha_add(first, second);
			const UINT32 actual = *PIXEL(pDst, dstStep, col, row);

			if (colordist(expected, actual) <= TOLERANCE)
				continue;

			printf("alphaComp-general: [%" PRIu32 ",%" PRIu32 "] 0x%08" PRIx32 "+0x%08" PRIx32
			       "=0x%08" PRIx32 ", got 0x%08" PRIx32 "\n",
			       col, row, first, second, expected, actual);
			return FALSE;
		}
	}

	return TRUE;
}
|
||||
|
||||
static BOOL test_alphaComp_func(void)
|
||||
{
|
||||
pstatus_t status = 0;
|
||||
BYTE src1[SRC1_WIDTH * SRC1_HEIGHT * 4] = WINPR_C_ARRAY_INIT;
|
||||
BYTE src2[SRC2_WIDTH * SRC2_HEIGHT * 4] = WINPR_C_ARRAY_INIT;
|
||||
BYTE dst1[DST_WIDTH * DST_HEIGHT * 4] = WINPR_C_ARRAY_INIT;
|
||||
UINT32* ptr = nullptr;
|
||||
if (winpr_RAND(src1, sizeof(src1)) < 0)
|
||||
return FALSE;
|
||||
/* Special-case the first two values */
|
||||
src1[0] &= 0x00FFFFFFU;
|
||||
src1[1] |= 0xFF000000U;
|
||||
if (winpr_RAND(src2, sizeof(src2)) < 0)
|
||||
return FALSE;
|
||||
/* Set the second operand to fully-opaque. */
|
||||
ptr = (UINT32*)src2;
|
||||
|
||||
for (UINT32 i = 0; i < sizeof(src2) / 4; ++i)
|
||||
*ptr++ |= 0xFF000000U;
|
||||
|
||||
status = generic->alphaComp_argb(src1, 4 * SRC1_WIDTH, src2, 4 * SRC2_WIDTH, dst1,
|
||||
4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
if (!check(src1, 4 * SRC1_WIDTH, src2, 4 * SRC2_WIDTH, dst1, 4 * DST_WIDTH, TEST_WIDTH,
|
||||
TEST_HEIGHT))
|
||||
return FALSE;
|
||||
|
||||
status = optimized->alphaComp_argb((const BYTE*)src1, 4 * SRC1_WIDTH, (const BYTE*)src2,
|
||||
4 * SRC2_WIDTH, (BYTE*)dst1, 4 * DST_WIDTH, TEST_WIDTH,
|
||||
TEST_HEIGHT);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
if (!check(src1, 4 * SRC1_WIDTH, src2, 4 * SRC2_WIDTH, dst1, 4 * DST_WIDTH, TEST_WIDTH,
|
||||
TEST_HEIGHT))
|
||||
return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
static int test_alphaComp_speed(void)
|
||||
{
|
||||
BYTE src1[SRC1_WIDTH * SRC1_HEIGHT] = WINPR_C_ARRAY_INIT;
|
||||
BYTE src2[SRC2_WIDTH * SRC2_HEIGHT] = WINPR_C_ARRAY_INIT;
|
||||
BYTE dst1[DST_WIDTH * DST_HEIGHT] = WINPR_C_ARRAY_INIT;
|
||||
UINT32* ptr = nullptr;
|
||||
|
||||
if (winpr_RAND(src1, sizeof(src1)) < 0)
|
||||
return -1;
|
||||
/* Special-case the first two values */
|
||||
src1[0] &= 0x00FFFFFFU;
|
||||
src1[1] |= 0xFF000000U;
|
||||
if (winpr_RAND(src2, sizeof(src2)) < 0)
|
||||
return -1;
|
||||
/* Set the second operand to fully-opaque. */
|
||||
ptr = (UINT32*)src2;
|
||||
|
||||
for (UINT32 i = 0; i < sizeof(src2) / 4; ++i)
|
||||
*ptr++ |= 0xFF000000U;
|
||||
|
||||
return (speed_test("add16s", "aligned", g_Iterations, (speed_test_fkt)generic->alphaComp_argb,
|
||||
(speed_test_fkt)optimized->alphaComp_argb, src1, 4 * SRC1_WIDTH, src2,
|
||||
4 * SRC2_WIDTH, dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT));
|
||||
}
|
||||
|
||||
int TestPrimitivesAlphaComp(int argc, char* argv[])
|
||||
{
|
||||
WINPR_UNUSED(argc);
|
||||
WINPR_UNUSED(argv);
|
||||
|
||||
prim_test_setup(FALSE);
|
||||
|
||||
if (!test_alphaComp_func())
|
||||
return -1;
|
||||
|
||||
if (g_TestPrimitivesPerformance)
|
||||
{
|
||||
if (!test_alphaComp_speed())
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
171
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesAndOr.c
vendored
Normal file
171
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesAndOr.c
vendored
Normal file
@@ -0,0 +1,171 @@
|
||||
/* test_andor.c
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_test.h"
|
||||
|
||||
#define FUNC_TEST_SIZE 65536
|
||||
|
||||
#define VALUE (0xA5A5A5A5U)
|
||||
|
||||
/* ========================================================================= */
|
||||
/* Runs one andC_32u implementation (fkt) over `size` elements and verifies
 * that every output equals src[i] & val. `name` only labels the failure
 * message. Returns TRUE when the call succeeds and all elements match.
 */
static BOOL test_and_32u_impl(const char* name, fn_andC_32u_t fkt, const UINT32* src,
                              const UINT32 val, UINT32* dst, size_t size)
{
	if (fkt(src, val, dst, WINPR_ASSERTING_INT_CAST(int32_t, size)) != PRIMITIVES_SUCCESS)
		return FALSE;

	for (size_t idx = 0; idx < size; ++idx)
	{
		const UINT32 expected = src[idx] & val;

		if (dst[idx] == expected)
			continue;

		printf("AND %s FAIL[%" PRIuz "] 0x%08" PRIx32 "&0x%08" PRIx32 "=0x%08" PRIx32
		       ", got 0x%08" PRIx32 "\n",
		       name, idx, src[idx], val, expected, dst[idx]);
		return FALSE;
	}

	return TRUE;
}
|
||||
|
||||
static BOOL test_and_32u_func(void)
|
||||
{
|
||||
UINT32 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
UINT32 dst[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
|
||||
if (winpr_RAND(src, sizeof(src)) < 0)
|
||||
return FALSE;
|
||||
|
||||
if (!test_and_32u_impl("generic->andC_32u aligned", generic->andC_32u, src + 1, VALUE, dst + 1,
|
||||
FUNC_TEST_SIZE))
|
||||
return FALSE;
|
||||
if (!test_and_32u_impl("generic->andC_32u unaligned", generic->andC_32u, src + 1, VALUE,
|
||||
dst + 2, FUNC_TEST_SIZE))
|
||||
return FALSE;
|
||||
if (!test_and_32u_impl("optimized->andC_32u aligned", optimized->andC_32u, src + 1, VALUE,
|
||||
dst + 1, FUNC_TEST_SIZE))
|
||||
return FALSE;
|
||||
if (!test_and_32u_impl("optimized->andC_32u unaligned", optimized->andC_32u, src + 1, VALUE,
|
||||
dst + 2, FUNC_TEST_SIZE))
|
||||
return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static BOOL test_and_32u_speed(void)
|
||||
{
|
||||
UINT32 src[MAX_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
UINT32 dst[MAX_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
|
||||
if (winpr_RAND(src, sizeof(src)) < 0)
|
||||
return FALSE;
|
||||
|
||||
if (!speed_test("andC_32u", "aligned", g_Iterations, (speed_test_fkt)generic->andC_32u,
|
||||
(speed_test_fkt)optimized->andC_32u, src + 1, VALUE, dst + 1, MAX_TEST_SIZE))
|
||||
return FALSE;
|
||||
if (!speed_test("andC_32u", "unaligned", g_Iterations, (speed_test_fkt)generic->andC_32u,
|
||||
(speed_test_fkt)optimized->andC_32u, src + 1, VALUE, dst + 2, MAX_TEST_SIZE))
|
||||
return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* ========================================================================= */
|
||||
/* Verifies that dst[i] == (src[i] | value) for the first `size` elements.
 * Prints the first mismatch and returns FALSE; returns TRUE when all match.
 */
static BOOL check(const UINT32* src, const UINT32* dst, UINT32 size, UINT32 value)
{
	for (UINT32 i = 0; i < size; ++i)
	{
		if (dst[i] != (src[i] | value))
		{
			/* The operation under test is OR, so the message shows '|'. */
			printf("OR-general general FAIL[%" PRIu32 "] 0x%08" PRIx32 "|0x%08" PRIx32
			       "=0x%08" PRIx32 ", got 0x%08" PRIx32 "\n",
			       i, src[i], value, src[i] | value, dst[i]);
			return FALSE;
		}
	}

	return TRUE;
}
|
||||
|
||||
static BOOL test_or_32u_func(void)
|
||||
{
|
||||
pstatus_t status = 0;
|
||||
UINT32 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
UINT32 dst[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
|
||||
if (winpr_RAND(src, sizeof(src)) < 0)
|
||||
return FALSE;
|
||||
|
||||
status = generic->orC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE);
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
if (!check(src + 1, dst + 1, FUNC_TEST_SIZE, VALUE))
|
||||
return FALSE;
|
||||
|
||||
status = optimized->orC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE);
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
if (!check(src + 1, dst + 1, FUNC_TEST_SIZE, VALUE))
|
||||
return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static BOOL test_or_32u_speed(void)
|
||||
{
|
||||
UINT32 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
UINT32 dst[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
|
||||
if (winpr_RAND(src, sizeof(src)) < 0)
|
||||
return FALSE;
|
||||
|
||||
return (speed_test("add16s", "aligned", g_Iterations, (speed_test_fkt)generic->orC_32u,
|
||||
(speed_test_fkt)optimized->orC_32u, src + 1, VALUE, dst + 1,
|
||||
FUNC_TEST_SIZE));
|
||||
}
|
||||
|
||||
int TestPrimitivesAndOr(int argc, char* argv[])
|
||||
{
|
||||
WINPR_UNUSED(argc);
|
||||
WINPR_UNUSED(argv);
|
||||
|
||||
prim_test_setup(FALSE);
|
||||
|
||||
if (!test_and_32u_func())
|
||||
return -1;
|
||||
|
||||
if (!test_or_32u_func())
|
||||
return -1;
|
||||
|
||||
if (g_TestPrimitivesPerformance)
|
||||
{
|
||||
if (!test_and_32u_speed())
|
||||
return -1;
|
||||
if (!test_or_32u_speed())
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
291
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesColors.c
vendored
Normal file
291
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesColors.c
vendored
Normal file
@@ -0,0 +1,291 @@
|
||||
/* test_colors.c
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <winpr/sysinfo.h>
|
||||
#include <freerdp/utils/profiler.h>
|
||||
|
||||
#include "prim_test.h"
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Functional test for RGBToRGB_16s8u_P3AC4R: converts three random planar
 * INT16 channels of size roi.width x roi.height to DstFormat with the
 * generic and the optimized implementation and requires identical output.
 * Returns TRUE only when allocation, input generation and both conversions
 * succeed and every output pixel matches; any earlier error is a failure.
 */
static BOOL test_RGBToRGB_16s8u_P3AC4R_func(prim_size_t roi, DWORD DstFormat)
{
	INT16* r = nullptr;
	INT16* g = nullptr;
	INT16* b = nullptr;
	BYTE* out1 = nullptr;
	BYTE* out2 = nullptr;
	/* Stays FALSE on every `goto fail` path; set only after the comparison. */
	BOOL rc = FALSE;
	BOOL failed = FALSE;
	const INT16* ptrs[3];
	const UINT32 rgbStride = roi.width * 2;
	const UINT32 dstStride = roi.width * 4;
	PROFILER_DEFINE(genericProf)
	PROFILER_DEFINE(optProf)
	PROFILER_CREATE(genericProf, "RGBToRGB_16s8u_P3AC4R-GENERIC")
	PROFILER_CREATE(optProf, "RGBToRGB_16s8u_P3AC4R-OPTIMIZED")
	r = winpr_aligned_calloc(1, 1ULL * rgbStride * roi.height, 16);
	g = winpr_aligned_calloc(1, 1ULL * rgbStride * roi.height, 16);
	b = winpr_aligned_calloc(1, 1ULL * rgbStride * roi.height, 16);
	out1 = winpr_aligned_calloc(1, 1ULL * dstStride * roi.height, 16);
	out2 = winpr_aligned_calloc(1, 1ULL * dstStride * roi.height, 16);

	if (!r || !g || !b || !out1 || !out2)
		goto fail;

	if (winpr_RAND(r, 1ULL * rgbStride * roi.height) < 0)
		goto fail;
	if (winpr_RAND(g, 1ULL * rgbStride * roi.height) < 0)
		goto fail;
	if (winpr_RAND(b, 1ULL * rgbStride * roi.height) < 0)
		goto fail;

	ptrs[0] = r;
	ptrs[1] = g;
	ptrs[2] = b;
	PROFILER_ENTER(genericProf)

	if (generic->RGBToRGB_16s8u_P3AC4R(ptrs, rgbStride, out1, dstStride, DstFormat, &roi) !=
	    PRIMITIVES_SUCCESS)
		goto fail;

	PROFILER_EXIT(genericProf)
	PROFILER_ENTER(optProf)

	if (optimized->RGBToRGB_16s8u_P3AC4R(ptrs, rgbStride, out2, dstStride, DstFormat, &roi) !=
	    PRIMITIVES_SUCCESS)
		goto fail;

	PROFILER_EXIT(optProf)

	/* Fast path: identical buffers need no per-pixel inspection. */
	if (memcmp(out1, out2, 1ULL * dstStride * roi.height) != 0)
	{
		for (UINT64 i = 0; i < 1ull * roi.width * roi.height; ++i)
		{
			const UINT32 o1 = FreeRDPReadColor(out1 + 4 * i, DstFormat);
			const UINT32 o2 = FreeRDPReadColor(out2 + 4 * i, DstFormat);

			if (o1 != o2)
			{
				/* Report the two pixel values that were actually compared. */
				printf("RGBToRGB_16s8u_P3AC4R FAIL: out1[%" PRIu64 "]=0x%08" PRIx32
				       " out2[%" PRIu64 "]=0x%08" PRIx32 "\n",
				       i, o1, i, o2);
				failed = TRUE;
			}
		}
	}

	printf("Results for %" PRIu32 "x%" PRIu32 " [%s]\n", roi.width, roi.height,
	       FreeRDPGetColorFormatName(DstFormat));
	PROFILER_PRINT_HEADER
	PROFILER_PRINT(genericProf)
	PROFILER_PRINT(optProf)
	PROFILER_PRINT_FOOTER
	rc = !failed;
fail:
	PROFILER_FREE(genericProf)
	PROFILER_FREE(optProf)
	winpr_aligned_free(r);
	winpr_aligned_free(g);
	winpr_aligned_free(b);
	winpr_aligned_free(out1);
	winpr_aligned_free(out2);
	return rc;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static BOOL test_RGBToRGB_16s8u_P3AC4R_speed(void)
|
||||
{
|
||||
union
|
||||
{
|
||||
const INT16** cpv;
|
||||
INT16** pv;
|
||||
} cnv;
|
||||
const prim_size_t roi64x64 = { 64, 64 };
|
||||
INT16 r[4096 + 1] = WINPR_C_ARRAY_INIT;
|
||||
INT16 g[4096 + 1] = WINPR_C_ARRAY_INIT;
|
||||
INT16 b[4096 + 1] = WINPR_C_ARRAY_INIT;
|
||||
UINT32 dst[4096 + 1] = WINPR_C_ARRAY_INIT;
|
||||
INT16* ptrs[3] = WINPR_C_ARRAY_INIT;
|
||||
if (winpr_RAND(r, sizeof(r)) < 0)
|
||||
return FALSE;
|
||||
if (winpr_RAND(g, sizeof(g)) < 0)
|
||||
return FALSE;
|
||||
if (winpr_RAND(b, sizeof(b)) < 0)
|
||||
return FALSE;
|
||||
|
||||
/* clear upper bytes */
|
||||
for (int i = 0; i < 4096; ++i)
|
||||
{
|
||||
r[i] &= 0x00FFU;
|
||||
g[i] &= 0x00FFU;
|
||||
b[i] &= 0x00FFU;
|
||||
}
|
||||
|
||||
ptrs[0] = r + 1;
|
||||
ptrs[1] = g + 1;
|
||||
ptrs[2] = b + 1;
|
||||
|
||||
cnv.pv = ptrs;
|
||||
if (!speed_test("RGBToRGB_16s8u_P3AC4R", "aligned", g_Iterations,
|
||||
(speed_test_fkt)generic->RGBToRGB_16s8u_P3AC4R,
|
||||
(speed_test_fkt)optimized->RGBToRGB_16s8u_P3AC4R, cnv.cpv, 64 * 2, (BYTE*)dst,
|
||||
64 * 4, &roi64x64))
|
||||
return FALSE;
|
||||
|
||||
if (!speed_test("RGBToRGB_16s8u_P3AC4R", "unaligned", g_Iterations,
|
||||
(speed_test_fkt)generic->RGBToRGB_16s8u_P3AC4R,
|
||||
(speed_test_fkt)optimized->RGBToRGB_16s8u_P3AC4R, cnv.cpv, 64 * 2,
|
||||
((BYTE*)dst) + 1, 64 * 4, &roi64x64))
|
||||
return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* ========================================================================= */
|
||||
static BOOL test_yCbCrToRGB_16s16s_P3P3_func(void)
|
||||
{
|
||||
pstatus_t status = 0;
|
||||
INT16 y[4096] = WINPR_C_ARRAY_INIT;
|
||||
INT16 cb[4096] = WINPR_C_ARRAY_INIT;
|
||||
INT16 cr[4096] = WINPR_C_ARRAY_INIT;
|
||||
INT16 r1[4096] = WINPR_C_ARRAY_INIT;
|
||||
INT16 g1[4096] = WINPR_C_ARRAY_INIT;
|
||||
INT16 b1[4096] = WINPR_C_ARRAY_INIT;
|
||||
INT16 r2[4096] = WINPR_C_ARRAY_INIT;
|
||||
INT16 g2[4096] = WINPR_C_ARRAY_INIT;
|
||||
INT16 b2[4096] = WINPR_C_ARRAY_INIT;
|
||||
const INT16* in[3];
|
||||
INT16* out1[3];
|
||||
INT16* out2[3];
|
||||
prim_size_t roi = { 64, 64 };
|
||||
if (winpr_RAND(y, sizeof(y)) < 0)
|
||||
return FALSE;
|
||||
if (winpr_RAND(cb, sizeof(cb)) < 0)
|
||||
return FALSE;
|
||||
if (winpr_RAND(cr, sizeof(cr)) < 0)
|
||||
return FALSE;
|
||||
|
||||
/* Normalize to 11.5 fixed radix */
|
||||
for (int i = 0; i < 4096; ++i)
|
||||
{
|
||||
y[i] &= 0x1FE0U;
|
||||
cb[i] &= 0x1FE0U;
|
||||
cr[i] &= 0x1FE0U;
|
||||
}
|
||||
|
||||
in[0] = y;
|
||||
in[1] = cb;
|
||||
in[2] = cr;
|
||||
out1[0] = r1;
|
||||
out1[1] = g1;
|
||||
out1[2] = b1;
|
||||
out2[0] = r2;
|
||||
out2[1] = g2;
|
||||
out2[2] = b2;
|
||||
status = generic->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out1, 64 * 2, &roi);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out2, 64 * 2, &roi);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
for (int i = 0; i < 4096; ++i)
|
||||
{
|
||||
if ((ABS(r1[i] - r2[i]) > 1) || (ABS(g1[i] - g2[i]) > 1) || (ABS(b1[i] - b2[i]) > 1))
|
||||
{
|
||||
printf("YCbCrToRGB-SSE FAIL[%d]: %" PRId16 ",%" PRId16 ",%" PRId16 " vs %" PRId16
|
||||
",%" PRId16 ",%" PRId16 "\n",
|
||||
i, r1[i], g1[i], b1[i], r2[i], g2[i], b2[i]);
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static int test_yCbCrToRGB_16s16s_P3P3_speed(void)
|
||||
{
|
||||
prim_size_t roi = { 64, 64 };
|
||||
INT16 y[4096] = WINPR_C_ARRAY_INIT;
|
||||
INT16 cb[4096] = WINPR_C_ARRAY_INIT;
|
||||
INT16 cr[4096] = WINPR_C_ARRAY_INIT;
|
||||
INT16 r[4096] = WINPR_C_ARRAY_INIT;
|
||||
INT16 g[4096] = WINPR_C_ARRAY_INIT;
|
||||
INT16 b[4096] = WINPR_C_ARRAY_INIT;
|
||||
const INT16* input[3] = WINPR_C_ARRAY_INIT;
|
||||
INT16* output[3] = WINPR_C_ARRAY_INIT;
|
||||
if (winpr_RAND(y, sizeof(y)) < 0)
|
||||
return FALSE;
|
||||
if (winpr_RAND(cb, sizeof(cb)) < 0)
|
||||
return FALSE;
|
||||
if (winpr_RAND(cr, sizeof(cr)) < 0)
|
||||
return FALSE;
|
||||
|
||||
/* Normalize to 11.5 fixed radix */
|
||||
for (int i = 0; i < 4096; ++i)
|
||||
{
|
||||
y[i] &= 0x1FE0U;
|
||||
cb[i] &= 0x1FE0U;
|
||||
cr[i] &= 0x1FE0U;
|
||||
}
|
||||
|
||||
input[0] = y;
|
||||
input[1] = cb;
|
||||
input[2] = cr;
|
||||
output[0] = r;
|
||||
output[1] = g;
|
||||
output[2] = b;
|
||||
|
||||
return (speed_test("yCbCrToRGB_16s16s_P3P3", "aligned", g_Iterations,
|
||||
(speed_test_fkt)generic->yCbCrToRGB_16s16s_P3P3,
|
||||
(speed_test_fkt)optimized->yCbCrToRGB_16s16s_P3P3, input, 64 * 2, output,
|
||||
64 * 2, &roi));
|
||||
}
|
||||
|
||||
int TestPrimitivesColors(int argc, char* argv[])
|
||||
{
|
||||
const DWORD formats[] = { PIXEL_FORMAT_ARGB32, PIXEL_FORMAT_XRGB32, PIXEL_FORMAT_ABGR32,
|
||||
PIXEL_FORMAT_XBGR32, PIXEL_FORMAT_RGBA32, PIXEL_FORMAT_RGBX32,
|
||||
PIXEL_FORMAT_BGRA32, PIXEL_FORMAT_BGRX32 };
|
||||
prim_size_t roi = { 1920 / 4, 1080 / 4 };
|
||||
WINPR_UNUSED(argc);
|
||||
WINPR_UNUSED(argv);
|
||||
prim_test_setup(FALSE);
|
||||
|
||||
for (UINT32 x = 0; x < sizeof(formats) / sizeof(formats[0]); x++)
|
||||
{
|
||||
if (!test_RGBToRGB_16s8u_P3AC4R_func(roi, formats[x]))
|
||||
return 1;
|
||||
|
||||
if (g_TestPrimitivesPerformance)
|
||||
{
|
||||
if (!test_RGBToRGB_16s8u_P3AC4R_speed())
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!test_yCbCrToRGB_16s16s_P3P3_func())
|
||||
return 1;
|
||||
|
||||
if (g_TestPrimitivesPerformance)
|
||||
{
|
||||
if (!test_yCbCrToRGB_16s16s_P3P3_speed())
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
296
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesCopy.c
vendored
Normal file
296
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesCopy.c
vendored
Normal file
@@ -0,0 +1,296 @@
|
||||
/* test_copy.c
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include <freerdp/config.h>
|
||||
#include <winpr/crypto.h>
|
||||
|
||||
#include <winpr/sysinfo.h>
|
||||
#include "prim_test.h"
|
||||
|
||||
#define COPY_TESTSIZE (256 * 2 + 16 * 2 + 15 + 15)
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static BOOL test_copy8u_func(void)
|
||||
{
|
||||
primitives_t* prims = primitives_get();
|
||||
BYTE data[COPY_TESTSIZE + 15] = WINPR_C_ARRAY_INIT;
|
||||
if (winpr_RAND(data, sizeof(data)) < 0)
|
||||
return FALSE;
|
||||
|
||||
for (int soff = 0; soff < 16; ++soff)
|
||||
{
|
||||
for (int doff = 0; doff < 16; ++doff)
|
||||
{
|
||||
for (int length = 1; length <= COPY_TESTSIZE - doff; ++length)
|
||||
{
|
||||
BYTE dest[COPY_TESTSIZE + 15] = WINPR_C_ARRAY_INIT;
|
||||
|
||||
if (prims->copy_8u(data + soff, dest + doff, length) != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
for (int i = 0; i < length; ++i)
|
||||
{
|
||||
if (dest[i + doff] != data[i + soff])
|
||||
{
|
||||
printf("COPY8U FAIL: off=%d len=%d, dest[%d]=0x%02" PRIx8 ""
|
||||
"data[%d]=0x%02" PRIx8 "\n",
|
||||
doff, length, i + doff, dest[i + doff], i + soff, data[i + soff]);
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static BOOL test_copy8u_speed(void)
|
||||
{
|
||||
BYTE src[MAX_TEST_SIZE + 4] = WINPR_C_ARRAY_INIT;
|
||||
BYTE dst[MAX_TEST_SIZE + 4] = WINPR_C_ARRAY_INIT;
|
||||
|
||||
if (!speed_test("copy_8u", "aligned", g_Iterations, (speed_test_fkt)generic->copy_8u,
|
||||
(speed_test_fkt)optimized->copy_8u, src, dst, MAX_TEST_SIZE))
|
||||
return FALSE;
|
||||
|
||||
if (!speed_test("copy_8u", "unaligned", g_Iterations, (speed_test_fkt)generic->copy_8u,
|
||||
(speed_test_fkt)optimized->copy_8u, src + 1, dst + 1, MAX_TEST_SIZE))
|
||||
return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/**
 * Allocates an image buffer of h rows with (w * bpp + pad) bytes each and
 * fills it with random data.
 *
 * @param w    row width in pixels
 * @param h    number of rows
 * @param bpp  bytes per pixel
 * @param pad  extra bytes appended to every row
 * @param copy optional; when not null it receives a second allocation holding
 *             an identical copy of the random data. It is left untouched on
 *             failure.
 *
 * @return the random buffer (to be released with free()), or nullptr when an
 *         allocation or the random generator failed.
 */
static BYTE* rand_alloc(size_t w, size_t h, size_t bpp, size_t pad, BYTE** copy)
{
	const size_t stride = w * bpp + pad;
	const size_t total = stride * h;

	BYTE* buffer = calloc(stride, h);
	if (!buffer)
		return nullptr;

	if (winpr_RAND(buffer, total) < 0)
	{
		free(buffer);
		return nullptr;
	}

	/* No duplicate requested, the random buffer alone is the result */
	if (!copy)
		return buffer;

	BYTE* twin = calloc(stride, h);
	if (!twin)
	{
		free(buffer);
		return nullptr;
	}

	memcpy(twin, buffer, total);
	*copy = twin;
	return buffer;
}
|
||||
|
||||
static size_t runcount = 0;
|
||||
|
||||
/**
 * Runs copy_no_overlap once with the generic and once with the active
 * primitives on identically seeded buffers and verifies that
 *  - the source buffer is not modified by either call,
 *  - both destination buffers are identical afterwards.
 *
 * Source and destination buffers are allocated with (w + xoff) pixels per
 * row plus pad bytes and (h + yoff) rows, so the comparisons below cover
 * the complete allocations, including the rows reached through the y offset.
 *
 * @param verbose   print the parameters of this run to stderr
 * @param srcFormat source pixel format
 * @param dstFormat destination pixel format
 * @param flags     FREERDP_FLIP_* / FREERDP_KEEP_DST_ALPHA flags passed through
 * @param pad       extra bytes per row
 * @param w         width of the copied area in pixels
 * @param h         height of the copied area in rows
 * @param dxoff     destination x offset
 * @param dyoff     destination y offset
 * @param sxoff     source x offset
 * @param syoff     source y offset
 *
 * @return TRUE when both implementations agree, FALSE otherwise.
 */
static BOOL test_copy_no_overlap_off(BOOL verbose, UINT32 srcFormat, UINT32 dstFormat, UINT32 flags,
                                     UINT32 pad, UINT32 w, UINT32 h, UINT32 dxoff, UINT32 dyoff,
                                     UINT32 sxoff, UINT32 syoff)
{
	BOOL rc = FALSE;
	primitives_t* gen = primitives_get_generic();
	primitives_t* prims = primitives_get();
	if (!gen || !prims)
		return FALSE;

	runcount++;

	WINPR_ASSERT(dxoff < w);
	WINPR_ASSERT(sxoff < w);
	WINPR_ASSERT(dyoff < h);
	WINPR_ASSERT(syoff < h);

	const UINT32 sbpp = FreeRDPGetBytesPerPixel(srcFormat);
	const UINT32 dbpp = FreeRDPGetBytesPerPixel(dstFormat);

	if (verbose)
	{
		(void)fprintf(stderr,
		              "run src: %s, dst: %s [flags 0x%08" PRIx32 "] %" PRIu32 "x%" PRIu32
		              ", soff=%" PRIu32 "x%" PRIu32 ", doff=%" PRIu32 "x%" PRIu32 ", pad=%" PRIu32
		              "\n",
		              FreeRDPGetColorFormatName(srcFormat), FreeRDPGetColorFormatName(dstFormat),
		              flags, w, h, sxoff, syoff, dxoff, dyoff, pad);
	}

	const UINT32 sstride = (w + sxoff) * sbpp + pad;
	const UINT32 dstride = (w + dxoff) * dbpp + pad;
	/* Sizes of the complete allocations (see the rand_alloc calls below) */
	const size_t ssize = (size_t)sstride * (h + syoff);
	const size_t dsize = (size_t)dstride * (h + dyoff);
	BYTE* dst2 = nullptr;
	BYTE* src2 = nullptr;
	BYTE* dst1 = rand_alloc(w + dxoff, h + dyoff, dbpp, pad, &dst2);
	BYTE* src1 = rand_alloc(w + sxoff, h + syoff, sbpp, pad, &src2);
	if (!dst1 || !dst2 || !src1 || !src2)
		goto fail;

	if (gen->copy_no_overlap(dst1, dstFormat, dstride, dxoff, dyoff, w, h, src1, srcFormat, sstride,
	                         sxoff, syoff, nullptr, flags) != PRIMITIVES_SUCCESS)
		goto fail;

	/* The source must be unchanged after the generic copy */
	if (memcmp(src1, src2, ssize) != 0)
		goto fail;

	if (prims->copy_no_overlap(dst2, dstFormat, dstride, dxoff, dyoff, w, h, src1, srcFormat,
	                           sstride, sxoff, syoff, nullptr, flags) != PRIMITIVES_SUCCESS)
		goto fail;

	/* ... and still unchanged after the second copy */
	if (memcmp(src1, src2, ssize) != 0)
		goto fail;

	/* Both destinations started identical, so they must match over the whole buffer */
	if (memcmp(dst1, dst2, dsize) != 0)
		goto fail;

	/* Test the flag bit so combinations with flip flags are covered as well */
	if ((flags & FREERDP_KEEP_DST_ALPHA) != 0)
	{
		for (size_t y = 0; y < h; y++)
		{
			const BYTE* d1 = &dst1[(y + dyoff) * dstride];
			const BYTE* d2 = &dst2[(y + dyoff) * dstride];
			for (size_t x = 0; x < w; x++)
			{
				const UINT32 c1 = FreeRDPReadColor(&d1[(x + dxoff) * dbpp], dstFormat);
				const UINT32 c2 = FreeRDPReadColor(&d2[(x + dxoff) * dbpp], dstFormat);
				BYTE a1 = 0;
				BYTE a2 = 0;
				FreeRDPSplitColor(c1, dstFormat, nullptr, nullptr, nullptr, &a1, nullptr);
				FreeRDPSplitColor(c2, dstFormat, nullptr, nullptr, nullptr, &a2, nullptr);
				if (a1 != a2)
					goto fail;
			}
		}
	}
	rc = TRUE;

fail:
	if (!rc)
	{
		(void)fprintf(stderr, "failed to compare copy_no_overlap(%s -> %s [0x%08" PRIx32 "])\n",
		              FreeRDPGetColorFormatName(srcFormat), FreeRDPGetColorFormatName(dstFormat),
		              flags);
	}
	free(dst1);
	free(dst2);
	free(src1);
	free(src2);
	return rc;
}
|
||||
|
||||
/**
 * Sweeps test_copy_no_overlap_off() over a grid of destination/source
 * offsets and row paddings for one format/flag combination.
 *
 * Every combination is executed even after a failure so that all failing
 * runs get reported.
 *
 * @return TRUE when every run passed, FALSE when at least one failed.
 */
static BOOL test_copy_no_overlap(BOOL verbose, UINT32 srcFormat, UINT32 dstFormat, UINT32 flags,
                                 UINT32 width, UINT32 height)
{
	const UINT32 maxXOff = 4;
	const UINT32 maxYOff = 4;
	BOOL allPassed = TRUE;

	/* NOTE(review): the destination x offset stops below maxXOff while the
	 * other three offsets include their maximum - confirm this is intended */
	for (UINT32 dx = 0; dx < maxXOff; dx++)
	{
		for (UINT32 dy = 0; dy <= maxYOff; dy++)
		{
			for (UINT32 sx = 0; sx <= maxXOff; sx++)
			{
				for (UINT32 sy = 0; sy <= maxYOff; sy++)
				{
					/* We need minimum alignment of 8 bytes.
					 * AVX2 can read 8 pixels (at most 8x4=32 bytes) per step
					 * if we have 24bpp input that is 24 bytes with 8 bytes read
					 * out of bound */
					for (UINT32 padding = 8; padding <= 12; padding++)
					{
						const BOOL ok =
						    test_copy_no_overlap_off(verbose, srcFormat, dstFormat, flags, padding,
						                             width, height, dx, dy, sx, sy);
						if (!ok)
							allPassed = FALSE;
					}
				}
			}
		}
	}

	return allPassed;
}
|
||||
|
||||
int TestPrimitivesCopy(int argc, char* argv[])
|
||||
{
|
||||
WINPR_UNUSED(argc);
|
||||
WINPR_UNUSED(argv);
|
||||
|
||||
const BOOL verbose = argc > 1;
|
||||
|
||||
prim_test_setup(FALSE);
|
||||
|
||||
if (!test_copy8u_func())
|
||||
return 1;
|
||||
|
||||
if (g_TestPrimitivesPerformance)
|
||||
{
|
||||
if (!test_copy8u_speed())
|
||||
return 1;
|
||||
}
|
||||
|
||||
const UINT32 flags[] = {
|
||||
FREERDP_FLIP_NONE,
|
||||
FREERDP_KEEP_DST_ALPHA,
|
||||
FREERDP_FLIP_HORIZONTAL,
|
||||
FREERDP_KEEP_DST_ALPHA | FREERDP_FLIP_HORIZONTAL,
|
||||
#if defined(TEST_ALL_FLAGS)
|
||||
FREERDP_FLIP_VERTICAL,
|
||||
FREERDP_FLIP_VERTICAL | FREERDP_FLIP_HORIZONTAL,
|
||||
FREERDP_KEEP_DST_ALPHA | FREERDP_FLIP_VERTICAL,
|
||||
FREERDP_KEEP_DST_ALPHA | FREERDP_FLIP_VERTICAL | FREERDP_FLIP_HORIZONTAL
|
||||
#endif
|
||||
};
|
||||
const UINT32 formats[] = { PIXEL_FORMAT_BGRA32,
|
||||
PIXEL_FORMAT_BGRX32,
|
||||
PIXEL_FORMAT_BGR24
|
||||
#if defined(TEST_ALL_FLAGS) /* Only the previous 3 have SIMD optimizations, so skip the rest */
|
||||
,
|
||||
PIXEL_FORMAT_RGB24,
|
||||
PIXEL_FORMAT_ABGR32,
|
||||
PIXEL_FORMAT_ARGB32,
|
||||
PIXEL_FORMAT_XBGR32,
|
||||
PIXEL_FORMAT_XRGB32,
|
||||
PIXEL_FORMAT_RGBA32,
|
||||
PIXEL_FORMAT_RGBX32
|
||||
#endif
|
||||
};
|
||||
|
||||
int rc = 0;
|
||||
for (size_t z = 0; z < ARRAYSIZE(flags); z++)
|
||||
{
|
||||
const UINT32 flag = flags[z];
|
||||
for (size_t x = 0; x < ARRAYSIZE(formats); x++)
|
||||
{
|
||||
const UINT32 sformat = formats[x];
|
||||
for (size_t y = 0; y < ARRAYSIZE(formats); y++)
|
||||
{
|
||||
const UINT32 dformat = formats[y];
|
||||
|
||||
if (!test_copy_no_overlap(verbose, sformat, dformat, flag, 21, 17))
|
||||
rc = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (verbose)
|
||||
(void)fprintf(stderr, "runcount=%" PRIuz "\n", runcount);
|
||||
|
||||
return rc;
|
||||
}
|
||||
277
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesSet.c
vendored
Normal file
277
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesSet.c
vendored
Normal file
@@ -0,0 +1,277 @@
|
||||
/* test_set.c
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <winpr/sysinfo.h>
|
||||
#include "prim_test.h"
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/**
 * Verifies that length bytes of src, starting at offset, all equal value.
 * The first mismatch is printed to stdout.
 *
 * @return TRUE when all bytes match, FALSE on the first mismatch.
 */
static BOOL check8(const BYTE* src, UINT32 length, UINT32 offset, BYTE value)
{
	UINT32 i = 0;

	while (i < length)
	{
		const UINT32 pos = offset + i;

		if (src[pos] != value)
		{
			printf("SET8U FAILED: off=%" PRIu32 " len=%" PRIu32 " dest[%" PRIu32 "]=0x%02" PRIx8
			       "\n",
			       offset, length, pos, src[pos]);
			return FALSE;
		}

		++i;
	}

	return TRUE;
}
|
||||
|
||||
static BOOL test_set8u_func(void)
|
||||
{
|
||||
pstatus_t status = 0;
|
||||
|
||||
for (UINT32 off = 0; off < 16; ++off)
|
||||
{
|
||||
BYTE dest[1024];
|
||||
|
||||
memset(dest, 3, sizeof(dest));
|
||||
for (UINT32 len = 1; len < 48 - off; ++len)
|
||||
{
|
||||
status = generic->set_8u(0xa5, dest + off, len);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
if (!check8(dest, len, off, 0xa5))
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
for (UINT32 off = 0; off < 16; ++off)
|
||||
{
|
||||
BYTE dest[1024];
|
||||
|
||||
memset(dest, 3, sizeof(dest));
|
||||
for (UINT32 len = 1; len < 48 - off; ++len)
|
||||
{
|
||||
status = optimized->set_8u(0xa5, dest + off, len);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
if (!check8(dest, len, off, 0xa5))
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static BOOL test_set8u_speed(void)
|
||||
{
|
||||
BYTE dest[1024];
|
||||
BYTE value = 0;
|
||||
|
||||
for (UINT32 x = 0; x < 16; x++)
|
||||
{
|
||||
if (winpr_RAND(&value, sizeof(value)) < 0)
|
||||
return FALSE;
|
||||
|
||||
if (!speed_test("set_8u", "", g_Iterations, (speed_test_fkt)generic->set_8u,
|
||||
(speed_test_fkt)optimized->set_8u, value, dest + x, x))
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/**
 * Verifies that length INT32 elements of src, starting at element offset,
 * all equal value. The first mismatch is printed to stdout.
 *
 * @return TRUE when all elements match, FALSE on the first mismatch.
 */
static BOOL check32s(const INT32* src, UINT32 length, UINT32 offset, INT32 value)
{
	for (UINT32 i = 0; i < length; ++i)
	{
		if (src[offset + i] != value)
		{
			/* Tag names the primitive actually under test; the value is cast
			 * because PRIx32 expects an unsigned argument */
			printf("SET32S FAILED: off=%" PRIu32 " len=%" PRIu32 " dest[%" PRIu32 "]=0x%08" PRIx32
			       "\n",
			       offset, length, i + offset, (UINT32)src[i + offset]);
			return FALSE;
		}
	}

	return TRUE;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static BOOL test_set32s_func(void)
|
||||
{
|
||||
pstatus_t status = 0;
|
||||
const INT32 value = -0x12345678;
|
||||
|
||||
for (UINT32 off = 0; off < 16; ++off)
|
||||
{
|
||||
INT32 dest[1024] = WINPR_C_ARRAY_INIT;
|
||||
|
||||
for (UINT32 len = 1; len < 48 - off; ++len)
|
||||
{
|
||||
status = generic->set_32s(value, dest + off, len);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
if (!check32s(dest, len, off, value))
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
for (UINT32 off = 0; off < 16; ++off)
|
||||
{
|
||||
INT32 dest[1024] = WINPR_C_ARRAY_INIT;
|
||||
|
||||
for (UINT32 len = 1; len < 48 - off; ++len)
|
||||
{
|
||||
status = optimized->set_32s(value, dest + off, len);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
if (!check32s(dest, len, off, value))
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/**
 * Verifies that length UINT32 elements of src, starting at element offset,
 * all equal value. The first mismatch is printed to stdout.
 *
 * @return TRUE when all elements match, FALSE on the first mismatch.
 */
static BOOL check32u(const UINT32* src, UINT32 length, UINT32 offset, UINT32 value)
{
	for (UINT32 i = 0; i < length; ++i)
	{
		if (src[offset + i] != value)
		{
			/* Tag names the primitive actually under test */
			printf("SET32U FAILED: off=%" PRIu32 " len=%" PRIu32 " dest[%" PRIu32 "]=0x%08" PRIx32
			       "\n",
			       offset, length, i + offset, src[i + offset]);
			return FALSE;
		}
	}

	return TRUE;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static BOOL test_set32u_func(void)
|
||||
{
|
||||
pstatus_t status = 0;
|
||||
const UINT32 value = 0xABCDEF12;
|
||||
|
||||
for (UINT32 off = 0; off < 16; ++off)
|
||||
{
|
||||
UINT32 dest[1024] = WINPR_C_ARRAY_INIT;
|
||||
|
||||
for (UINT32 len = 1; len < 48 - off; ++len)
|
||||
{
|
||||
status = generic->set_32u(value, dest + off, len);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
if (!check32u(dest, len, off, value))
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
for (UINT32 off = 0; off < 16; ++off)
|
||||
{
|
||||
UINT32 dest[1024] = WINPR_C_ARRAY_INIT;
|
||||
|
||||
for (UINT32 len = 1; len < 48 - off; ++len)
|
||||
{
|
||||
status = optimized->set_32u(value, dest + off, len);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
if (!check32u(dest, len, off, value))
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static BOOL test_set32u_speed(void)
|
||||
{
|
||||
UINT32 dest[1024];
|
||||
BYTE value = 0;
|
||||
|
||||
for (UINT32 x = 0; x < 16; x++)
|
||||
{
|
||||
if (winpr_RAND(&value, sizeof(value)) < 0)
|
||||
return FALSE;
|
||||
|
||||
if (!speed_test("set_32u", "", g_Iterations, (speed_test_fkt)generic->set_32u,
|
||||
(speed_test_fkt)optimized->set_32u, value, dest + x, x))
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static BOOL test_set32s_speed(void)
|
||||
{
|
||||
INT32 dest[1024];
|
||||
BYTE value = 0;
|
||||
|
||||
for (UINT32 x = 0; x < 16; x++)
|
||||
{
|
||||
if (winpr_RAND(&value, sizeof(value)) < 0)
|
||||
return FALSE;
|
||||
|
||||
if (!speed_test("set_32s", "", g_Iterations, (speed_test_fkt)generic->set_32s,
|
||||
(speed_test_fkt)optimized->set_32s, value, dest + x, x))
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
int TestPrimitivesSet(int argc, char* argv[])
|
||||
{
|
||||
WINPR_UNUSED(argc);
|
||||
WINPR_UNUSED(argv);
|
||||
prim_test_setup(FALSE);
|
||||
|
||||
if (!test_set8u_func())
|
||||
return -1;
|
||||
|
||||
if (!test_set32s_func())
|
||||
return -1;
|
||||
|
||||
if (!test_set32u_func())
|
||||
return -1;
|
||||
|
||||
if (g_TestPrimitivesPerformance)
|
||||
{
|
||||
if (!test_set8u_speed())
|
||||
return -1;
|
||||
|
||||
if (!test_set32s_speed())
|
||||
return -1;
|
||||
|
||||
if (!test_set32u_speed())
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
470
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesShift.c
vendored
Normal file
470
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesShift.c
vendored
Normal file
@@ -0,0 +1,470 @@
|
||||
/* test_shift.c
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <winpr/sysinfo.h>
|
||||
#include "prim_test.h"
|
||||
|
||||
#define FUNC_TEST_SIZE 65536
|
||||
|
||||
static BOOL test_lShift_16s_func(void)
|
||||
{
|
||||
pstatus_t status = 0;
|
||||
INT16 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
INT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
UINT32 val = 0;
|
||||
if (winpr_RAND(&val, sizeof(val)) < 0)
|
||||
return FALSE;
|
||||
if (winpr_RAND(src, sizeof(src)) < 0)
|
||||
return FALSE;
|
||||
val = val % 16;
|
||||
/* Negative tests */
|
||||
status = generic->lShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status == PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->lShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status == PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
/* Aligned */
|
||||
status = generic->lShiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->lShiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
/* Unaligned */
|
||||
status = generic->lShiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->lShiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
|
||||
|
||||
return (status == PRIMITIVES_SUCCESS);
|
||||
}
|
||||
|
||||
static BOOL test_lShift_16u_func(void)
|
||||
{
|
||||
pstatus_t status = 0;
|
||||
UINT16 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
UINT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
UINT32 val = 0;
|
||||
if (winpr_RAND(&val, sizeof(val)) < 0)
|
||||
return FALSE;
|
||||
if (winpr_RAND(src, sizeof(src)) < 0)
|
||||
return FALSE;
|
||||
val = val % 16;
|
||||
|
||||
/* Negative tests */
|
||||
status = generic->lShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status == PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->lShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status == PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
/* Aligned */
|
||||
status = generic->lShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->lShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
/* Unaligned */
|
||||
status = generic->lShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->lShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
|
||||
|
||||
return (status == PRIMITIVES_SUCCESS);
|
||||
}
|
||||
|
||||
static BOOL test_rShift_16s_func(void)
|
||||
{
|
||||
pstatus_t status = 0;
|
||||
INT16 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
INT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
UINT32 val = 0;
|
||||
if (winpr_RAND(&val, sizeof(val)) < 0)
|
||||
return FALSE;
|
||||
if (winpr_RAND(src, sizeof(src)) < 0)
|
||||
return FALSE;
|
||||
val = val % 16;
|
||||
|
||||
/* Negative Tests */
|
||||
status = generic->rShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status == PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->rShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status == PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
/* Aligned */
|
||||
status = generic->rShiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->rShiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
/* Unaligned */
|
||||
status = generic->rShiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->rShiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
|
||||
|
||||
return (status == PRIMITIVES_SUCCESS);
|
||||
}
|
||||
|
||||
static BOOL test_rShift_16u_func(void)
|
||||
{
|
||||
pstatus_t status = 0;
|
||||
UINT16 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
UINT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
UINT32 val = 0;
|
||||
if (winpr_RAND(&val, sizeof(val)) < 0)
|
||||
return FALSE;
|
||||
if (winpr_RAND(src, sizeof(src)) < 0)
|
||||
return FALSE;
|
||||
val = val % 16;
|
||||
/* Negative tests */
|
||||
status = generic->rShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status == PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->rShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status == PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
/* Aligned */
|
||||
status = generic->rShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->rShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
/* Unaligned */
|
||||
status = generic->rShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->rShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
|
||||
|
||||
return (status == PRIMITIVES_SUCCESS);
|
||||
}
|
||||
|
||||
static BOOL test_ShiftWrapper_16s_func(void)
|
||||
{
|
||||
pstatus_t status = 0;
|
||||
INT16 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
INT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
UINT32 tmp = 0;
|
||||
if (winpr_RAND(&tmp, sizeof(tmp)) < 0)
|
||||
return FALSE;
|
||||
if (winpr_RAND(src, sizeof(src)) < 0)
|
||||
return FALSE;
|
||||
INT32 val = WINPR_ASSERTING_INT_CAST(int32_t, tmp % 16);
|
||||
|
||||
/* Negative tests */
|
||||
status = generic->shiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status == PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->shiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status == PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
/* Aligned */
|
||||
status = generic->shiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->shiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = generic->shiftC_16s(src + 1, -val, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->shiftC_16s(src + 1, -val, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
/* Unaligned */
|
||||
status = generic->shiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->shiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = generic->shiftC_16s(src + 1, -val, d1 + 2, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->shiftC_16s(src + 1, -val, d1 + 2, FUNC_TEST_SIZE);
|
||||
|
||||
return (status == PRIMITIVES_SUCCESS);
|
||||
}
|
||||
|
||||
static BOOL test_ShiftWrapper_16u_func(void)
|
||||
{
|
||||
pstatus_t status = 0;
|
||||
UINT16 src[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
UINT16 d1[FUNC_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
UINT32 tmp = 0;
|
||||
if (winpr_RAND(&tmp, sizeof(tmp)) < 0)
|
||||
return FALSE;
|
||||
if (winpr_RAND(src, sizeof(src)) < 0)
|
||||
return FALSE;
|
||||
INT32 val = WINPR_ASSERTING_INT_CAST(int32_t, tmp % 16);
|
||||
|
||||
/* Negative */
|
||||
status = generic->shiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
|
||||
if (status == PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->shiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status == PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
/* Aligned */
|
||||
status = generic->shiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->shiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = generic->shiftC_16u(src + 1, -val, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->shiftC_16u(src + 1, -val, d1 + 1, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
/* Unaligned */
|
||||
status = generic->shiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->shiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = generic->shiftC_16u(src + 1, -val, d1 + 2, FUNC_TEST_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->shiftC_16u(src + 1, -val, d1 + 2, FUNC_TEST_SIZE);
|
||||
|
||||
return (status == PRIMITIVES_SUCCESS);
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static BOOL test_lShift_16s_speed(void)
|
||||
{
|
||||
UINT32 val = 0;
|
||||
INT16 src[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
|
||||
INT16 dst[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
|
||||
if (winpr_RAND(src, sizeof(src)) < 0)
|
||||
return FALSE;
|
||||
if (winpr_RAND(&val, sizeof(val)))
|
||||
return FALSE;
|
||||
|
||||
val = val % 16;
|
||||
if (!speed_test("lShift_16s", "aligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16s,
|
||||
(speed_test_fkt)optimized->lShiftC_16s, src, val, dst, MAX_TEST_SIZE))
|
||||
return FALSE;
|
||||
|
||||
if (!speed_test("lShift_16s", "unaligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16s,
|
||||
(speed_test_fkt)optimized->lShiftC_16s, src + 1, val, dst, MAX_TEST_SIZE))
|
||||
return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static BOOL test_lShift_16u_speed(void)
|
||||
{
|
||||
UINT32 val = 0;
|
||||
UINT16 src[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
|
||||
UINT16 dst[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
|
||||
if (winpr_RAND(&val, sizeof(val)) < 0)
|
||||
return FALSE;
|
||||
if (winpr_RAND(src, sizeof(src)) < 0)
|
||||
return FALSE;
|
||||
|
||||
val = val % 16;
|
||||
if (!speed_test("lShift_16u", "aligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16u,
|
||||
(speed_test_fkt)optimized->lShiftC_16u, src, val, dst, MAX_TEST_SIZE))
|
||||
return FALSE;
|
||||
|
||||
if (!speed_test("lShift_16u", "unaligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16u,
|
||||
(speed_test_fkt)optimized->lShiftC_16u, src + 1, val, dst, MAX_TEST_SIZE))
|
||||
return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static BOOL test_rShift_16s_speed(void)
|
||||
{
|
||||
UINT32 val = 0;
|
||||
INT16 src[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
|
||||
INT16 dst[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
|
||||
if (winpr_RAND(src, sizeof(src)) < 0)
|
||||
return FALSE;
|
||||
if (winpr_RAND(&val, sizeof(val)) < 0)
|
||||
return FALSE;
|
||||
|
||||
val = val % 16;
|
||||
if (!speed_test("rShift_16s", "aligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16s,
|
||||
(speed_test_fkt)optimized->rShiftC_16s, src, val, dst, MAX_TEST_SIZE))
|
||||
return FALSE;
|
||||
|
||||
if (!speed_test("rShift_16s", "unaligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16s,
|
||||
(speed_test_fkt)optimized->rShiftC_16s, src + 1, val, dst, MAX_TEST_SIZE))
|
||||
return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static BOOL test_rShift_16u_speed(void)
|
||||
{
|
||||
UINT32 val = 0;
|
||||
UINT16 src[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
|
||||
UINT16 dst[MAX_TEST_SIZE + 1] = WINPR_C_ARRAY_INIT;
|
||||
if (winpr_RAND(&val, sizeof(val)) < 0)
|
||||
return FALSE;
|
||||
if (winpr_RAND(src, sizeof(src)) < 0)
|
||||
return FALSE;
|
||||
|
||||
val = val % 16;
|
||||
if (!speed_test("rShift_16u", "aligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16u,
|
||||
(speed_test_fkt)optimized->rShiftC_16u, src, val, dst, MAX_TEST_SIZE))
|
||||
return FALSE;
|
||||
|
||||
if (!speed_test("rShift_16u", "unaligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16u,
|
||||
(speed_test_fkt)optimized->rShiftC_16u, src + 1, val, dst, MAX_TEST_SIZE))
|
||||
return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
int TestPrimitivesShift(int argc, char* argv[])
|
||||
{
|
||||
WINPR_UNUSED(argc);
|
||||
WINPR_UNUSED(argv);
|
||||
prim_test_setup(FALSE);
|
||||
|
||||
if (!test_lShift_16s_func())
|
||||
return 1;
|
||||
|
||||
if (g_TestPrimitivesPerformance)
|
||||
{
|
||||
if (!test_lShift_16s_speed())
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!test_lShift_16u_func())
|
||||
return 1;
|
||||
|
||||
if (g_TestPrimitivesPerformance)
|
||||
{
|
||||
if (!test_lShift_16u_speed())
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!test_rShift_16s_func())
|
||||
return 1;
|
||||
|
||||
if (g_TestPrimitivesPerformance)
|
||||
{
|
||||
if (!test_rShift_16s_speed())
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!test_rShift_16u_func())
|
||||
return 1;
|
||||
|
||||
if (g_TestPrimitivesPerformance)
|
||||
{
|
||||
if (!test_rShift_16u_speed())
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!test_ShiftWrapper_16s_func())
|
||||
return 1;
|
||||
|
||||
if (!test_ShiftWrapper_16u_func())
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
95
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesSign.c
vendored
Normal file
95
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesSign.c
vendored
Normal file
@@ -0,0 +1,95 @@
|
||||
/* test_sign.c
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <winpr/sysinfo.h>
|
||||
#include "prim_test.h"
|
||||
|
||||
#define TEST_BUFFER_SIZE 65535
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static BOOL test_sign16s_func(void)
|
||||
{
|
||||
pstatus_t status = 0;
|
||||
INT16 src[TEST_BUFFER_SIZE + 16] = WINPR_C_ARRAY_INIT;
|
||||
INT16 d1[TEST_BUFFER_SIZE + 16] = WINPR_C_ARRAY_INIT;
|
||||
INT16 d2[TEST_BUFFER_SIZE + 16] = WINPR_C_ARRAY_INIT;
|
||||
if (winpr_RAND(src, sizeof(src)) < 0)
|
||||
return FALSE;
|
||||
status = generic->sign_16s(src + 1, d1 + 1, TEST_BUFFER_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->sign_16s(src + 1, d2 + 1, TEST_BUFFER_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
if (memcmp(d1, d2, sizeof(d1)) != 0)
|
||||
return FALSE;
|
||||
|
||||
status = generic->sign_16s(src + 1, d1 + 2, TEST_BUFFER_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
status = optimized->sign_16s(src + 1, d2 + 2, TEST_BUFFER_SIZE);
|
||||
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return FALSE;
|
||||
|
||||
if (memcmp(d1, d2, sizeof(d1)) != 0)
|
||||
return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
static int test_sign16s_speed(void)
|
||||
{
|
||||
INT16 src[MAX_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
INT16 dst[MAX_TEST_SIZE + 3] = WINPR_C_ARRAY_INIT;
|
||||
if (winpr_RAND(src, sizeof(src)) < 0)
|
||||
return FALSE;
|
||||
|
||||
if (!speed_test("sign16s", "aligned", g_Iterations, (speed_test_fkt)generic->sign_16s,
|
||||
(speed_test_fkt)optimized->sign_16s, src + 1, dst + 1, MAX_TEST_SIZE))
|
||||
return FALSE;
|
||||
|
||||
if (!speed_test("sign16s", "unaligned", g_Iterations, (speed_test_fkt)generic->sign_16s,
|
||||
(speed_test_fkt)optimized->sign_16s, src + 1, dst + 2, MAX_TEST_SIZE))
|
||||
return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
int TestPrimitivesSign(int argc, char* argv[])
|
||||
{
|
||||
WINPR_UNUSED(argc);
|
||||
WINPR_UNUSED(argv);
|
||||
|
||||
prim_test_setup(FALSE);
|
||||
|
||||
if (!test_sign16s_func())
|
||||
return 1;
|
||||
|
||||
if (g_TestPrimitivesPerformance)
|
||||
{
|
||||
if (!test_sign16s_speed())
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
1843
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesYCbCr.c
vendored
Normal file
1843
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesYCbCr.c
vendored
Normal file
File diff suppressed because it is too large
Load Diff
150
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesYCoCg.c
vendored
Normal file
150
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesYCoCg.c
vendored
Normal file
@@ -0,0 +1,150 @@
|
||||
/* test_YCoCg.c
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <winpr/sysinfo.h>
|
||||
#include "prim_test.h"
|
||||
#include <freerdp/utils/profiler.h>
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/**
 * Compare the generic and the optimized YCoCgToRGB_8u_AC4R primitive.
 *
 * A random input image with 4 bytes per pixel is converted by both
 * implementations for every destination format in the local format list. The
 * two outputs are compared with memcmp(); on a mismatch every differing pixel
 * is reported and the test fails. Profiler results are printed per format.
 *
 * @param width  image width in pixels
 * @param height image height in pixels
 * @return TRUE if all conversions succeeded and no pixel differed, FALSE
 *         otherwise (including allocation or random generator failure).
 */
static BOOL test_YCoCgRToRGB_8u_AC4R_func(UINT32 width, UINT32 height)
{
	pstatus_t status = -1;
	BYTE* out_sse = nullptr;
	BYTE* in = nullptr;
	BYTE* out_c = nullptr;
	const UINT32 srcStride = width * 4;
	const UINT32 size = srcStride * height;
	const UINT32 formats[] = { PIXEL_FORMAT_ARGB32, PIXEL_FORMAT_ABGR32, PIXEL_FORMAT_RGBA32,
		                       PIXEL_FORMAT_RGBX32, PIXEL_FORMAT_BGRA32, PIXEL_FORMAT_BGRX32 };
	PROFILER_DEFINE(genericProf)
	PROFILER_DEFINE(optProf)
	in = winpr_aligned_calloc(1, size, 16);
	out_c = winpr_aligned_calloc(1, size, 16);
	out_sse = winpr_aligned_calloc(1, size, 16);

	if (!in || !out_c || !out_sse)
		goto fail;

	if (winpr_RAND(in, size) < 0)
		goto fail;

	for (size_t x = 0; x < sizeof(formats) / sizeof(formats[0]); x++)
	{
		const UINT32 format = formats[x];
		const UINT32 dstStride = width * FreeRDPGetBytesPerPixel(format);
		const char* formatName = FreeRDPGetColorFormatName(format);
		PROFILER_CREATE(genericProf, "YCoCgRToRGB_8u_AC4R-GENERIC")
		PROFILER_CREATE(optProf, "YCoCgRToRGB_8u_AC4R-OPT")
		PROFILER_ENTER(genericProf)
		status = generic->YCoCgToRGB_8u_AC4R(in, WINPR_ASSERTING_INT_CAST(int, srcStride), out_c,
		                                     format, WINPR_ASSERTING_INT_CAST(int, dstStride),
		                                     width, height, 2, TRUE);
		PROFILER_EXIT(genericProf)

		if (status != PRIMITIVES_SUCCESS)
			goto loop_fail;

		PROFILER_ENTER(optProf)
		status = optimized->YCoCgToRGB_8u_AC4R(
		    in, WINPR_ASSERTING_INT_CAST(int, srcStride), out_sse, format,
		    WINPR_ASSERTING_INT_CAST(int, dstStride), width, height, 2, TRUE);
		PROFILER_EXIT(optProf)

		if (status != PRIMITIVES_SUCCESS)
			goto loop_fail;

		if (memcmp(out_c, out_sse, 1ULL * dstStride * height) != 0)
		{
			for (size_t i = 0; i < 1ull * width * height; ++i)
			{
				const UINT32 c = FreeRDPReadColor(out_c + 4 * i, format);
				const UINT32 sse = FreeRDPReadColor(out_sse + 4 * i, format);

				if (c != sse)
				{
					/* Report the four input bytes of pixel i (memory order, first
					 * byte most significant) instead of one unrelated byte. */
					const BYTE* px = in + 4 * i;
					const UINT32 srcPixel = ((UINT32)px[0] << 24) | ((UINT32)px[1] << 16) |
					                        ((UINT32)px[2] << 8) | (UINT32)px[3];
					printf("optimized->YCoCgRToRGB FAIL[%s] [%" PRIuz "]: 0x%08" PRIx32
					       " -> C 0x%08" PRIx32 " vs optimized 0x%08" PRIx32 "\n",
					       formatName, i, srcPixel, c, sse);
					status = -1;
				}
			}
		}

		printf("--------------------------- [%s] [%" PRIu32 "x%" PRIu32
		       "] ---------------------------\n",
		       formatName, width, height);
		PROFILER_PRINT_HEADER
		PROFILER_PRINT(genericProf)
		PROFILER_PRINT(optProf)
		PROFILER_PRINT_FOOTER
	loop_fail:
		PROFILER_FREE(genericProf)
		PROFILER_FREE(optProf)

		if (status != PRIMITIVES_SUCCESS)
			goto fail;
	}

fail:
	winpr_aligned_free(in);
	winpr_aligned_free(out_c);
	winpr_aligned_free(out_sse);
	return status == PRIMITIVES_SUCCESS;
}
|
||||
|
||||
int TestPrimitivesYCoCg(int argc, char* argv[])
|
||||
{
|
||||
WINPR_UNUSED(argc);
|
||||
WINPR_UNUSED(argv);
|
||||
prim_test_setup(FALSE);
|
||||
|
||||
/* Random resolution tests */
|
||||
if (argc < 2)
|
||||
{
|
||||
for (UINT32 x = 0; x < 10; x++)
|
||||
{
|
||||
UINT32 w = 0;
|
||||
UINT32 h = 0;
|
||||
|
||||
do
|
||||
{
|
||||
if (winpr_RAND(&w, sizeof(w)) < 0)
|
||||
return -1;
|
||||
w %= 2048 / 4;
|
||||
} while (w < 16);
|
||||
|
||||
do
|
||||
{
|
||||
if (winpr_RAND(&h, sizeof(h)) < 0)
|
||||
return -1;
|
||||
h %= 2048 / 4;
|
||||
} while (h < 16);
|
||||
|
||||
if (!test_YCoCgRToRGB_8u_AC4R_func(w, h))
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Test once with full HD/4 */
|
||||
if (!test_YCoCgRToRGB_8u_AC4R_func(1920 / 4, 1080 / 4))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
1474
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesYUV.c
vendored
Normal file
1474
third_party/FreeRDP/libfreerdp/primitives/test/TestPrimitivesYUV.c
vendored
Normal file
File diff suppressed because it is too large
Load Diff
138
third_party/FreeRDP/libfreerdp/primitives/test/measure.h
vendored
Normal file
138
third_party/FreeRDP/libfreerdp/primitives/test/measure.h
vendored
Normal file
@@ -0,0 +1,138 @@
|
||||
/* measure.h
|
||||
* Macros to help with performance measurement.
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License. Algorithms used by
|
||||
* this code may be covered by patents by HP, Microsoft, or other parties.
|
||||
*
|
||||
* MEASURE_LOOP_START("measurement", 2000)
|
||||
* code to be measured
|
||||
* MEASURE_LOOP_STOP
|
||||
* buffer flush and such
|
||||
* MEASURE_SHOW_RESULTS
|
||||
*
|
||||
* Define GOOGLE_PROFILER if you want gperftools included.
|
||||
*/
|
||||
|
||||
#ifndef TEST_MEASURE_H_INCLUDED
|
||||
#define TEST_MEASURE_H_INCLUDED
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <time.h>
|
||||
#include <winpr/string.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#ifndef _WIN32
|
||||
#include <sys/param.h>
|
||||
#endif
|
||||
|
||||
#include <winpr/crt.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
#define PROFILER_START(_prefix_)
|
||||
#define PROFILER_STOP
|
||||
|
||||
#define MEASURE_LOOP_START(_prefix_, _count_)
|
||||
#define MEASURE_LOOP_STOP
|
||||
#define MEASURE_GET_RESULTS(_result_)
|
||||
#define MEASURE_SHOW_RESULTS(_result_)
|
||||
#define MEASURE_SHOW_RESULTS_SCALED(_scale_, _label_)
|
||||
#define MEASURE_TIMED(_label_, _init_iter_, _test_time_, _result_, _call_)
|
||||
|
||||
#else
|
||||
|
||||
#ifdef GOOGLE_PROFILER
|
||||
#include <gperftools/profiler.h>
|
||||
#define PROFILER_START(_prefix_) \
|
||||
do \
|
||||
{ \
|
||||
char _path[PATH_MAX]; \
|
||||
sprintf_s(_path, sizeof(_path), "./%s.prof", (_prefix_)); \
|
||||
ProfilerStart(_path); \
|
||||
} while (0);
|
||||
#define PROFILER_STOP \
|
||||
do \
|
||||
{ \
|
||||
ProfilerStop(); \
|
||||
} while (0);
|
||||
#else
|
||||
#define PROFILER_START(_prefix_)
|
||||
#define PROFILER_STOP
|
||||
#endif // GOOGLE_PROFILER
|
||||
|
||||
extern float measure_delta_time(UINT64 t0, UINT64 t1);
|
||||
extern void measure_floatprint(float t, char* output, size_t len);
|
||||
|
||||
/*
 * Measurement macro family.
 *
 * MEASURE_LOOP_START opens a scope and a do-loop, MEASURE_LOOP_STOP closes the
 * loop and one of MEASURE_GET_RESULTS / MEASURE_SHOW_RESULTS /
 * MEASURE_SHOW_RESULTS_SCALED closes the scope again. They must therefore
 * always be used together and in this order.
 */

/* Open the measurement scope: copy the label, take the start time stamp and
 * begin a loop that runs the following code _count_ times. */
#define MEASURE_LOOP_START(_prefix_, _count_)          \
	{                                                  \
		int _count = (_count_);                        \
		int _loop;                                     \
		char str1[32] = WINPR_C_ARRAY_INIT;            \
		char str2[32] = WINPR_C_ARRAY_INIT;            \
		char* _prefix = _strdup(_prefix_);             \
		const UINT64 start = winpr_GetTickCount64NS(); \
		PROFILER_START(_prefix);                       \
		_loop = (_count);                              \
		do                                             \
		{

/* Close the measured loop. The '> 0' test makes a count of zero or less run the
 * body exactly once instead of counting down through negative values. */
#define MEASURE_LOOP_STOP \
	}                     \
	while (--_loop > 0)   \
		;

/* Close the measurement scope and store iterations per second in _result_. */
#define MEASURE_GET_RESULTS(_result_)                    \
	PROFILER_STOP;                                       \
	const UINT64 stop = winpr_GetTickCount64NS();        \
	const float delta = measure_delta_time(start, stop); \
	(_result_) = (float)_count / delta;                  \
	free(_prefix);                                       \
	}

/* Close the measurement scope, store iterations per second in _result_ and
 * print a summary line. measure_floatprint() takes the buffer size as third
 * argument. */
#define MEASURE_SHOW_RESULTS(_result_)                                                         \
	PROFILER_STOP;                                                                             \
	const UINT64 stop = winpr_GetTickCount64NS();                                              \
	const float delta = measure_delta_time(start, stop);                                       \
	(_result_) = (float)_count / delta;                                                        \
	measure_floatprint((float)_count / delta, str1, sizeof(str1));                             \
	printf("%s: %9d iterations in %5.1f seconds = %s/s \n", _prefix, _count, delta, str1);     \
	free(_prefix);                                                                             \
	}

/* Close the measurement scope and print a summary line that additionally shows
 * the rate multiplied by _scale_, followed by _label_. */
#define MEASURE_SHOW_RESULTS_SCALED(_scale_, _label_)                                             \
	PROFILER_STOP;                                                                                \
	const UINT64 stop = winpr_GetTickCount64NS();                                                 \
	const float delta = measure_delta_time(start, stop);                                          \
	measure_floatprint((float)_count / delta, str1, sizeof(str1));                                \
	measure_floatprint((float)_count / delta * (_scale_), str2, sizeof(str2));                    \
	printf("%s: %9d iterations in %5.1f seconds = %s/s = %s%s \n", _prefix, _count, delta, str1,  \
	       str2, _label_);                                                                        \
	free(_prefix);                                                                                \
	}

/* Run _call_ for _init_iter_ iterations to estimate its rate, then run it again
 * for (rate * _test_time_) iterations and print the result. */
#define MEASURE_TIMED(_label_, _init_iter_, _test_time_, _result_, _call_) \
	{                                                                      \
		float _r;                                                          \
		MEASURE_LOOP_START(_label_, _init_iter_);                          \
		_call_;                                                            \
		MEASURE_LOOP_STOP;                                                 \
		MEASURE_GET_RESULTS(_r);                                           \
		MEASURE_LOOP_START(_label_, _r* _test_time_);                      \
		_call_;                                                            \
		MEASURE_LOOP_STOP;                                                 \
		MEASURE_SHOW_RESULTS(_result_);                                    \
	}
|
||||
|
||||
#endif
|
||||
|
||||
#endif // TEST_MEASURE_H_INCLUDED
|
||||
94
third_party/FreeRDP/libfreerdp/primitives/test/prim_test.c
vendored
Normal file
94
third_party/FreeRDP/libfreerdp/primitives/test/prim_test.c
vendored
Normal file
@@ -0,0 +1,94 @@
|
||||
/* prim_test.c
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include "prim_test.h"
|
||||
|
||||
#ifndef _WIN32
|
||||
#include <fcntl.h>
|
||||
#include <math.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#endif
|
||||
|
||||
#include <winpr/sysinfo.h>
|
||||
#include <winpr/platform.h>
|
||||
#include <winpr/crypto.h>
|
||||
|
||||
primitives_t* generic = nullptr;
|
||||
primitives_t* optimized = nullptr;
|
||||
BOOL g_TestPrimitivesPerformance = FALSE;
|
||||
UINT32 g_Iterations = 1000;
|
||||
|
||||
int test_sizes[] = { 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096 };
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
/**
 * Convert the distance between two nanosecond time stamps to seconds.
 *
 * @param t0 start time stamp in nanoseconds
 * @param t1 end time stamp in nanoseconds
 * @return (t1 - t0) in seconds; 0 if the signed difference is negative.
 */
float measure_delta_time(UINT64 t0, UINT64 t1)
{
	const INT64 elapsedNs = (INT64)(t1 - t0);
	const double seconds = (double)elapsedNs / 1000000000.0;

	if (seconds < 0.0)
		return 0.0f;

	return (float)seconds;
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void measure_floatprint(float t, char* output, size_t len)
|
||||
{
|
||||
/* I don't want to link against -lm, so avoid log,exp,... */
|
||||
float f = 10.0f;
|
||||
int i = 0;
|
||||
|
||||
while (t > f)
|
||||
f *= 10.0f;
|
||||
|
||||
f /= 1000.0f;
|
||||
i = ((int)(t / f + 0.5f)) * (int)f;
|
||||
|
||||
if (t < 0.0f)
|
||||
(void)_snprintf(output, len, "%f", t);
|
||||
else if (i == 0)
|
||||
(void)_snprintf(output, len, "%d", (int)(t + 0.5f));
|
||||
else if (t < 1e+3f)
|
||||
(void)_snprintf(output, len, "%3d", i);
|
||||
else if (t < 1e+6f)
|
||||
(void)_snprintf(output, len, "%3d,%03d", i / 1000, i % 1000);
|
||||
else if (t < 1e+9f)
|
||||
(void)_snprintf(output, len, "%3d,%03d,000", i / 1000000, (i % 1000000) / 1000);
|
||||
else if (t < 1e+12f)
|
||||
(void)_snprintf(output, len, "%3d,%03d,000,000", i / 1000000000,
|
||||
(i % 1000000000) / 1000000);
|
||||
else
|
||||
(void)_snprintf(output, len, "%f", t);
|
||||
}
|
||||
|
||||
/**
 * Initialise the globals shared by all primitives tests.
 *
 * @param performance value stored in g_TestPrimitivesPerformance; the test
 *                    entry points run their speed tests only when it is set.
 */
void prim_test_setup(BOOL performance)
{
	/* Reference implementation */
	generic = primitives_get_generic();

	/* Implementation returned by primitives_get(), compared against generic */
	optimized = primitives_get();

	g_TestPrimitivesPerformance = performance;
}
|
||||
|
||||
/**
 * Validate the arguments of a speed comparison.
 *
 * In this version the iteration loop has an empty body, so neither function is
 * actually called and the variadic arguments and dsc are not used.
 *
 * @param name       test name, must not be null
 * @param dsc        test description (unused)
 * @param iterations number of iterations, must not be 0
 * @param generic    reference implementation, must not be null
 * @param optimized  optimized implementation, must not be null
 * @return FALSE if one of the required arguments is missing, TRUE otherwise.
 */
BOOL speed_test(const char* name, const char* dsc, UINT32 iterations, speed_test_fkt generic,
                speed_test_fkt optimized, ...)
{
	const BOOL usable = name && generic && optimized && (iterations != 0);

	if (!usable)
		return FALSE;

	for (UINT32 pass = 0; pass < iterations; pass++)
	{
		/* intentionally empty */
	}

	return TRUE;
}
|
||||
48
third_party/FreeRDP/libfreerdp/primitives/test/prim_test.h
vendored
Normal file
48
third_party/FreeRDP/libfreerdp/primitives/test/prim_test.h
vendored
Normal file
@@ -0,0 +1,48 @@
|
||||
/* primtest.h
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License. Algorithms used by
|
||||
* this code may be covered by patents by HP, Microsoft, or other parties.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIMTEST_H
|
||||
#define FREERDP_LIB_PRIMTEST_H
|
||||
|
||||
#include <winpr/crt.h>
|
||||
#include <winpr/spec.h>
|
||||
#include <winpr/wtypes.h>
|
||||
#include <winpr/platform.h>
|
||||
#include <winpr/crypto.h>
|
||||
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "measure.h"
|
||||
|
||||
#define ABS(_x_) ((_x_) < 0 ? (-(_x_)) : (_x_))
|
||||
#define MAX_TEST_SIZE 4096
|
||||
|
||||
extern int test_sizes[];
|
||||
#define NUM_TEST_SIZES 10
|
||||
|
||||
extern BOOL g_TestPrimitivesPerformance;
|
||||
extern UINT32 g_Iterations;
|
||||
|
||||
extern primitives_t* generic;
|
||||
extern primitives_t* optimized;
|
||||
|
||||
void prim_test_setup(BOOL performance);
|
||||
|
||||
typedef pstatus_t (*speed_test_fkt)();
|
||||
|
||||
BOOL speed_test(const char* name, const char* dsc, UINT32 iterations, speed_test_fkt generic,
|
||||
speed_test_fkt optimized, ...);
|
||||
|
||||
#endif /* FREERDP_LIB_PRIMTEST_H */
|
||||
Reference in New Issue
Block a user